In [63]:
import pandas as pd
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from collections import Counter
import wordcloud
import numpy as np
import re
from transformers import BertTokenizer

In [4]:
# Dataset locations. Every path is built from DATA_DIR, so pointing the
# notebook at another copy of the data only requires changing that value.
DATA_DIR = '../data'
TRAIN_DATA_DIR = f'{DATA_DIR}/train'

# Task 1 splits (JSON Lines files).
T1_TRAIN = f'{TRAIN_DATA_DIR}/Task_1_train.jsonl'
T1_DEV = f'{TRAIN_DATA_DIR}/Task_1_dev.jsonl'

# Task 2 splits (JSON Lines files).
T2_TRAIN = f'{TRAIN_DATA_DIR}/Task_2_train.jsonl'
T2_DEV = f'{TRAIN_DATA_DIR}/Task_2_dev.jsonl'

In [5]:
# Files summarised by every statistics cell below; the list order fixes the
# column order of the resulting tables.
all_files = [
    T1_TRAIN,
    T1_DEV,
    T2_TRAIN,
    T2_DEV,
]

## Statistics

In [25]:
# Per-file descriptive statistics: sample and duplicate counts, plus
# mean/std/max/min of character and whitespace-separated word counts for the
# `article` and `question` columns. Produces `stats_df_words` (one row per
# statistic, one column per file) and prints it as a GitHub markdown table.


def _append_length_stats(records, unit, field, lengths):
    """Append mean/std/max/min of `lengths` to the matching rows of `records`.

    records: dict mapping row label -> list of per-file values (mutated in place).
    unit:    label fragment such as "Char" or "Word".
    field:   label fragment such as "Articles" or "Questions".
    lengths: pandas Series holding one length per example.
    """
    records[f"Mean {unit} Count ({field})"].append(lengths.mean())
    records[f"Std {unit} Count ({field})"].append(lengths.std())
    records[f"Max {unit} Count ({field})"].append(lengths.max())
    records[f"Min {unit} Count ({field})"].append(lengths.min())


columns = []

# Rows are created up front so the table keeps a fixed row order (dict
# insertion order) regardless of how many files are processed.
records = {"Samples": [], "Duplicated Examples": []}
for field in ("Articles", "Questions"):
    records[f"Duplicated {field}"] = []
    for unit in ("Char", "Word"):
        for stat in ("Mean", "Std", "Max", "Min"):
            records[f"{stat} {unit} Count ({field})"] = []

for fil in all_files:

    # Column label: file name without directory or extension.
    columns.append(fil.split('/')[-1].split('.')[0])

    df = pd.read_json(fil,lines=True) ## Can also use jsonlines to read
    records["Samples"].append(len(df))
    # duplicated() flags every repeat after the first occurrence; summing the
    # boolean mask counts those repeats without building a filtered frame.
    records["Duplicated Examples"].append(int(df.duplicated().sum()))

    records["Duplicated Articles"].append(int(df['article'].duplicated().sum()))
    article_char_length = df['article'].str.len()
    _append_length_stats(records, "Char", "Articles", article_char_length)
    # Words are whitespace-separated chunks (str.split with no separator).
    article_word_length = df['article'].str.split().str.len()
    _append_length_stats(records, "Word", "Articles", article_word_length)

    records["Duplicated Questions"].append(int(df['question'].duplicated().sum()))
    question_char_length = df['question'].str.len()
    _append_length_stats(records, "Char", "Questions", question_char_length)
    question_word_length = df['question'].str.split().str.len()
    _append_length_stats(records, "Word", "Questions", question_word_length)

stats_df_words = pd.DataFrame.from_dict(records,columns=columns, orient="index")
print(stats_df_words.to_markdown(tablefmt="github"))

## Most Frequent Tokens

In [28]:
# English stopwords from NLTK, held in a set for fast membership tests when
# filtering tokens in the frequency cells.
sw = {word for word in stopwords.words('english')}

In [50]:
# Token-frequency summary of the `article` column for every file, tokenised
# with NLTK's word_tokenize after lower-casing, punctuation stripping and
# stopword removal.
# Results used by later cells:
#   counts_df   - unique / total token counts, one column per file
#   all_freq_df - top-20 tokens of each file side by side (None if no files)

# Raw string so `\s` is not treated as an (invalid) string escape. The class
# uses `A-Z`; the earlier `A-z` range also matched `[`, `\`, `]`, `^`, `_` and
# the backtick, so those characters were never stripped.
non_alnum = re.compile(r'[^a-zA-Z0-9\s]')
top_k = 20

counts = {"Unique Token Count":[],"Total Count":[]}
columns = []
freq_frames = []
for fil in all_files:
    # Column label: file name without directory or extension.
    columns.append(fil.split('/')[-1].split('.')[0])

    df = pd.read_json(fil,lines=True)
    df['article'] = df['article'].str.lower()
    text = df['article'].apply(lambda doc: non_alnum.sub('', doc))
    text = text.apply(word_tokenize)
    # The text is already lower-cased, so tokens are compared to the stopword
    # set as they are.
    text = text.apply(lambda tokens: [tok for tok in tokens if tok not in sw])

    # Counter keeps first-seen order, so ties in most_common() resolve the
    # same way as with a hand-filled dict.
    token_counter = Counter(tok for tokens in text for tok in tokens)
    tokencount = dict(token_counter)
    total_count = sum(token_counter.values())

    counts["Unique Token Count"].append(len(token_counter))
    counts["Total Count"].append(total_count)

    # top_tokens is empty whenever total_count is 0, so the division below
    # never sees a zero denominator.
    top_tokens = token_counter.most_common(top_k)
    rows = {
        "Word": [tok for tok, _ in top_tokens],
        "Count": [cnt for _, cnt in top_tokens],
        "Frequency": [cnt / total_count for _, cnt in top_tokens],
    }
    freq_words_df = pd.DataFrame.from_dict(rows)
    freq_frames.append(freq_words_df)

# One concat after the loop instead of re-copying the growing frame per file.
all_freq_df = pd.concat(freq_frames, axis=1) if freq_frames else None
counts_df = pd.DataFrame.from_dict(counts,orient="index",columns=columns)

In [51]:
# Render the current `counts_df` as a GitHub-flavoured markdown table.
counts_markdown = counts_df.to_markdown(tablefmt="github")
print(counts_markdown)

|                    |   Task_1_train |   Task_1_dev |   Task_2_train |   Task_2_dev |
|--------------------|----------------|--------------|----------------|--------------|
| Unique Token Count |          41087 |        20447 |          53285 |        25779 |
| Total Count        |         482713 |       128258 |         775342 |       203877 |


In [56]:
# Render the current `all_freq_df` as a GitHub-flavoured markdown table.
freq_markdown = all_freq_df.to_markdown(tablefmt="github")
print(freq_markdown)

|    | Word       |   Count |   Frequency | Word   |   Count |   Frequency | Word   |   Count |   Frequency | Word       |   Count |   Frequency |
|----|------------|---------|-------------|--------|---------|-------------|--------|---------|-------------|------------|---------|-------------|
|  0 | said       |    7978 |  0.0165274  | said   |    1940 |  0.0151258  | said   |    9594 |  0.0123739  | said       |    2524 |  0.01238    |
|  1 | mr         |    2465 |  0.00510655 | mr     |     701 |  0.00546555 | mr     |    3549 |  0.00457733 | mr         |     993 |  0.00487058 |
|  2 | would      |    2194 |  0.00454514 | would  |     593 |  0.00462349 | would  |    3539 |  0.00456444 | would      |     954 |  0.00467929 |
|  3 | also       |    2019 |  0.00418261 | also   |     506 |  0.00394517 | people |    3335 |  0.00430133 | people     |     872 |  0.00427709 |
|  4 | people     |    1897 |  0.00392987 | one    |     486 |  0.00378924 | one    |    3182 |  0.004104   | one     

In [57]:
# Token-frequency summary of the `question` column for every file, tokenised
# with NLTK's word_tokenize after lower-casing, punctuation stripping and
# stopword removal.
# Results used by later cells:
#   counts_df   - unique / total token counts, one column per file
#   all_freq_df - top-20 tokens of each file side by side (None if no files)

# Raw string so `\s` is not treated as an (invalid) string escape. The class
# uses `A-Z`; the earlier `A-z` range also matched `[`, `\`, `]`, `^`, `_` and
# the backtick, so those characters were never stripped.
non_alnum = re.compile(r'[^a-zA-Z0-9\s]')
top_k = 20

counts = {"Unique Token Count":[],"Total Count":[]}
columns = []
freq_frames = []
for fil in all_files:
    # Column label: file name without directory or extension.
    columns.append(fil.split('/')[-1].split('.')[0])

    df = pd.read_json(fil,lines=True)
    df['question'] = df['question'].str.lower()
    text = df['question'].apply(lambda doc: non_alnum.sub('', doc))
    text = text.apply(word_tokenize)
    # The text is already lower-cased, so tokens are compared to the stopword
    # set as they are.
    text = text.apply(lambda tokens: [tok for tok in tokens if tok not in sw])

    # Counter keeps first-seen order, so ties in most_common() resolve the
    # same way as with a hand-filled dict.
    token_counter = Counter(tok for tokens in text for tok in tokens)
    tokencount = dict(token_counter)
    total_count = sum(token_counter.values())

    counts["Unique Token Count"].append(len(token_counter))
    counts["Total Count"].append(total_count)

    # top_tokens is empty whenever total_count is 0, so the division below
    # never sees a zero denominator.
    top_tokens = token_counter.most_common(top_k)
    rows = {
        "Word": [tok for tok, _ in top_tokens],
        "Count": [cnt for _, cnt in top_tokens],
        "Frequency": [cnt / total_count for _, cnt in top_tokens],
    }
    freq_words_df = pd.DataFrame.from_dict(rows)
    freq_frames.append(freq_words_df)

# One concat after the loop instead of re-copying the growing frame per file.
all_freq_df = pd.concat(freq_frames, axis=1) if freq_frames else None
counts_df = pd.DataFrame.from_dict(counts,orient="index",columns=columns)

In [58]:
# Render the current `counts_df` as a GitHub-flavoured markdown table.
counts_markdown = counts_df.to_markdown(tablefmt="github")
print(counts_markdown)

|                    |   Task_1_train |   Task_1_dev |   Task_2_train |   Task_2_dev |
|--------------------|----------------|--------------|----------------|--------------|
| Unique Token Count |          10032 |         4602 |          11098 |         4966 |
| Total Count        |          43683 |        11289 |          47297 |        12255 |


In [61]:
# Render the current `all_freq_df` as a GitHub-flavoured markdown table.
freq_markdown = all_freq_df.to_markdown(tablefmt="github")
print(freq_markdown)

|    | Word        |   Count |   Frequency | Word        |   Count |   Frequency | Word        |   Count |   Frequency | Word        |   Count |   Frequency |
|----|-------------|---------|-------------|-------------|---------|-------------|-------------|---------|-------------|-------------|---------|-------------|
|  0 | placeholder |    3227 |  0.0738731  | placeholder |     837 |  0.074143   | placeholder |    3318 |  0.0701524  | placeholder |     851 |  0.069441   |
|  1 | said        |     268 |  0.00613511 | new         |      69 |  0.00611214 | new         |     229 |  0.00484174 | new         |      71 |  0.00579355 |
|  2 | year        |     230 |  0.00526521 | says        |      62 |  0.00549207 | year        |     218 |  0.00460917 | two         |      60 |  0.00489596 |
|  3 | new         |     222 |  0.00508207 | year        |      56 |  0.00496058 | two         |     209 |  0.00441888 | says        |      57 |  0.00465116 |
|  4 | says        |     197 |  0.00450976 | u

### BertTokenizer-Based

In [67]:
# Load the pretrained `bert-base-uncased` tokenizer used by the cells in
# this section.
bert_checkpoint = 'bert-base-uncased'
tokenizer = BertTokenizer.from_pretrained(bert_checkpoint)

In [82]:
# Per-file statistics of BERT token counts (mean/std/max/min) for the
# `article` and `question` columns. Produces `stats_df_words` (one row per
# statistic, one column per file) and prints it as a GitHub markdown table.


def _append_token_stats(records, field, lengths):
    """Append mean/std/max/min of `lengths` to the matching rows of `records`.

    records: dict mapping row label -> list of per-file values (mutated in place).
    field:   label fragment such as "Articles" or "Questions".
    lengths: pandas Series holding one token count per example.
    """
    records[f"Mean Token Count ({field})"].append(lengths.mean())
    records[f"Std Token Count ({field})"].append(lengths.std())
    records[f"Max Token Count ({field})"].append(lengths.max())
    records[f"Min Token Count ({field})"].append(lengths.min())


columns = []

# Rows are created up front so the table keeps a fixed row order (dict
# insertion order) regardless of how many files are processed.
records = {}
for field in ("Articles", "Questions"):
    for stat in ("Mean", "Std", "Max", "Min"):
        records[f"{stat} Token Count ({field})"] = []

for fil in all_files:

    # Column label: file name without directory or extension.
    columns.append(fil.split('/')[-1].split('.')[0])

    df = pd.read_json(fil,lines=True) ## Can also use jsonlines to read

    # The *_word_length names are kept from the word-level cell; here they
    # hold the number of tokens returned by tokenizer.tokenize.
    article_word_length = df['article'].apply(lambda x : len(tokenizer.tokenize(x)))
    _append_token_stats(records, "Articles", article_word_length)

    question_word_length = df['question'].apply(lambda x : len(tokenizer.tokenize(x)))
    _append_token_stats(records, "Questions", question_word_length)

stats_df_words = pd.DataFrame.from_dict(records,columns=columns, orient="index")
print(stats_df_words.to_markdown(tablefmt="github"))

|                              |   Task_1_train |   Task_1_dev |   Task_2_train |   Task_2_dev |
|------------------------------|----------------|--------------|----------------|--------------|
| Mean Token Count (Articles)  |      332.603   |    341.452   |       528.58   |    539.303   |
| Std Token Count (Articles)   |      193.729   |    217.673   |       387.756  |    396.286   |
| Max Token Count (Articles)   |     2206       |   2043       |      2188      |   2272       |
| Min Token Count (Articles)   |       38       |     52       |        46      |     52       |
| Mean Token Count (Questions) |       28.6319  |     28.3441  |        30.9629 |     31.1657  |
| Std Token Count (Questions)  |        6.83765 |      6.75112 |        10.0225 |      9.98314 |
| Max Token Count (Questions)  |       84       |     84       |        91      |     89       |
| Min Token Count (Questions)  |        9       |     11       |         9      |      8       |


In [72]:
# Token-frequency summary of the `article` column for every file, tokenised
# with the BERT tokenizer after lower-casing, punctuation stripping and
# stopword removal.
# Results used by later cells:
#   counts_df   - unique / total token counts, one column per file
#   all_freq_df - top-20 tokens of each file side by side (None if no files)

# Raw string so `\s` is not treated as an (invalid) string escape. The class
# uses `A-Z`; the earlier `A-z` range also matched `[`, `\`, `]`, `^`, `_` and
# the backtick, so those characters were never stripped.
non_alnum = re.compile(r'[^a-zA-Z0-9\s]')
top_k = 20

counts = {"Unique Token Count":[],"Total Count":[]}
columns = []
freq_frames = []
for fil in all_files:
    # Column label: file name without directory or extension.
    columns.append(fil.split('/')[-1].split('.')[0])

    df = pd.read_json(fil,lines=True)
    df['article'] = df['article'].str.lower()
    text = df['article'].apply(lambda doc: non_alnum.sub('', doc))
    text = text.apply(tokenizer.tokenize)
    # Tokens are compared to the stopword set as produced by the tokenizer;
    # the input text was lower-cased above.
    text = text.apply(lambda tokens: [tok for tok in tokens if tok not in sw])

    # Counter keeps first-seen order, so ties in most_common() resolve the
    # same way as with a hand-filled dict.
    token_counter = Counter(tok for tokens in text for tok in tokens)
    tokencount = dict(token_counter)
    total_count = sum(token_counter.values())

    counts["Unique Token Count"].append(len(token_counter))
    counts["Total Count"].append(total_count)

    # top_tokens is empty whenever total_count is 0, so the division below
    # never sees a zero denominator.
    top_tokens = token_counter.most_common(top_k)
    rows = {
        "Word": [tok for tok, _ in top_tokens],
        "Count": [cnt for _, cnt in top_tokens],
        "Frequency": [cnt / total_count for _, cnt in top_tokens],
    }
    freq_words_df = pd.DataFrame.from_dict(rows)
    freq_frames.append(freq_words_df)

# One concat after the loop instead of re-copying the growing frame per file.
all_freq_df = pd.concat(freq_frames, axis=1) if freq_frames else None
counts_df = pd.DataFrame.from_dict(counts,orient="index",columns=columns)

In [74]:
# Render the current `counts_df` as a GitHub-flavoured markdown table.
counts_markdown = counts_df.to_markdown(tablefmt="github")
print(counts_markdown)

|                    |   Task_1_train |   Task_1_dev |   Task_2_train |   Task_2_dev |
|--------------------|----------------|--------------|----------------|--------------|
| Unique Token Count |          21952 |        15858 |          23842 |        18157 |
| Total Count        |         559481 |       148611 |         898160 |       235177 |


In [75]:
# Render the current `all_freq_df` as a GitHub-flavoured markdown table.
freq_markdown = all_freq_df.to_markdown(tablefmt="github")
print(freq_markdown)

|    | Word   |   Count |   Frequency | Word   |   Count |   Frequency | Word   |   Count |   Frequency | Word   |   Count |   Frequency |
|----|--------|---------|-------------|--------|---------|-------------|--------|---------|-------------|--------|---------|-------------|
|  0 | ##s    |    8094 |  0.014467   | ##s    |    2393 |  0.0161024  | ##s    |   13511 |  0.015043   | ##s    |    3477 |  0.0147846  |
|  1 | said   |    7994 |  0.0142882  | said   |    1941 |  0.0130609  | said   |    9597 |  0.0106852  | said   |    2525 |  0.0107366  |
|  2 | mr     |    2466 |  0.00440766 | mr     |     702 |  0.00472374 | ##t    |    4282 |  0.00476752 | ##t    |    1068 |  0.00454126 |
|  3 | ##t    |    2266 |  0.00405018 | ##t    |     674 |  0.00453533 | mr     |    3550 |  0.00395253 | mr     |     993 |  0.00422235 |
|  4 | would  |    2204 |  0.00393937 | would  |     595 |  0.00400374 | would  |    3549 |  0.00395141 | would  |     964 |  0.00409904 |
|  5 | also   |    2020 |  

In [76]:
# Token-frequency summary of the `question` column for every file, tokenised
# with the BERT tokenizer after lower-casing, punctuation stripping and
# stopword removal.
# Results used by later cells:
#   counts_df   - unique / total token counts, one column per file
#   all_freq_df - top-20 tokens of each file side by side (None if no files)

# Raw string so `\s` is not treated as an (invalid) string escape. The class
# uses `A-Z`; the earlier `A-z` range also matched `[`, `\`, `]`, `^`, `_` and
# the backtick, so those characters were never stripped.
non_alnum = re.compile(r'[^a-zA-Z0-9\s]')
top_k = 20

counts = {"Unique Token Count":[],"Total Count":[]}
columns = []
freq_frames = []
for fil in all_files:
    # Column label: file name without directory or extension.
    columns.append(fil.split('/')[-1].split('.')[0])

    df = pd.read_json(fil,lines=True)
    df['question'] = df['question'].str.lower()
    text = df['question'].apply(lambda doc: non_alnum.sub('', doc))
    text = text.apply(tokenizer.tokenize)
    # Tokens are compared to the stopword set as produced by the tokenizer;
    # the input text was lower-cased above.
    text = text.apply(lambda tokens: [tok for tok in tokens if tok not in sw])

    # Counter keeps first-seen order, so ties in most_common() resolve the
    # same way as with a hand-filled dict.
    token_counter = Counter(tok for tokens in text for tok in tokens)
    tokencount = dict(token_counter)
    total_count = sum(token_counter.values())

    counts["Unique Token Count"].append(len(token_counter))
    counts["Total Count"].append(total_count)

    # top_tokens is empty whenever total_count is 0, so the division below
    # never sees a zero denominator.
    top_tokens = token_counter.most_common(top_k)
    rows = {
        "Word": [tok for tok, _ in top_tokens],
        "Count": [cnt for _, cnt in top_tokens],
        "Frequency": [cnt / total_count for _, cnt in top_tokens],
    }
    freq_words_df = pd.DataFrame.from_dict(rows)
    freq_frames.append(freq_words_df)

# One concat after the loop instead of re-copying the growing frame per file.
all_freq_df = pd.concat(freq_frames, axis=1) if freq_frames else None
counts_df = pd.DataFrame.from_dict(counts,orient="index",columns=columns)

In [77]:
# Render the current `counts_df` as a GitHub-flavoured markdown table.
counts_markdown = counts_df.to_markdown(tablefmt="github")
print(counts_markdown)

|                    |   Task_1_train |   Task_1_dev |   Task_2_train |   Task_2_dev |
|--------------------|----------------|--------------|----------------|--------------|
| Unique Token Count |           9771 |         4876 |          10702 |         5251 |
| Total Count        |          51629 |        13284 |          55858 |        14331 |


In [78]:
# Render the current `all_freq_df` as a GitHub-flavoured markdown table.
freq_markdown = all_freq_df.to_markdown(tablefmt="github")
print(freq_markdown)

|    | Word       |   Count |   Frequency | Word     |   Count |   Frequency | Word     |   Count |   Frequency | Word     |   Count |   Frequency |
|----|------------|---------|-------------|----------|---------|-------------|----------|---------|-------------|----------|---------|-------------|
|  0 | place      |    3256 |  0.0630653  | place    |     844 |  0.0635351  | place    |    3349 |  0.0599556  | place    |     859 |  0.05994    |
|  1 | ##holder   |    3227 |  0.0625036  | ##holder |     837 |  0.0630081  | ##holder |    3318 |  0.0594006  | ##holder |     851 |  0.0593818  |
|  2 | said       |     272 |  0.00526836 | new      |      69 |  0.00519422 | ##s      |     333 |  0.00596155 | new      |      72 |  0.00502407 |
|  3 | year       |     230 |  0.00445486 | ##s      |      66 |  0.00496838 | new      |     230 |  0.00411758 | ##s      |      72 |  0.00502407 |
|  4 | ##s        |     223 |  0.00431928 | says     |      62 |  0.00466727 | year     |     218 |  0.003

### WordCloud