In [109]:
import pandas as pd
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from collections import Counter
import wordcloud
import numpy as np
import re

In [110]:
# Dataset locations, expressed as relative paths built up from DATA_DIR.
DATA_DIR = '../data'
TRAIN_DATA_DIR = f'{DATA_DIR}/train'

# One .jsonl file per task (1, 2) and split (train, dev).
T1_TRAIN = f'{TRAIN_DATA_DIR}/Task_1_train.jsonl'
T1_DEV = f'{TRAIN_DATA_DIR}/Task_1_dev.jsonl'
T2_TRAIN = f'{TRAIN_DATA_DIR}/Task_2_train.jsonl'
T2_DEV = f'{TRAIN_DATA_DIR}/Task_2_dev.jsonl'

In [111]:
# Every dataset file to be summarised, in reporting order.
all_files = [
    T1_TRAIN,
    T1_DEV,
    T2_TRAIN,
    T2_DEV,
]

## Statistics

In [112]:
def _print_length_stats(unit, lengths):
    """Print the mean, std, max and min of a Series of lengths.

    Parameters
    ----------
    unit : str
        Label inserted into each printed line (e.g. "Char" or "Word").
    lengths : pandas.Series
        One numeric length per row.
    """
    print(f"Mean {unit} Count: ", lengths.mean())
    print(f"Std {unit} Count: ", lengths.std())
    print(f"Max {unit} Count: ", lengths.max())
    print(f"Min {unit} Count: ", lengths.min())


def _print_column_report(frame, column, title):
    """Print duplicate count and char/word length statistics for one text column.

    Parameters
    ----------
    frame : pandas.DataFrame
        Frame holding the column to summarise.
    column : str
        Name of a column whose values are strings.
    title : str
        Heading printed between two separator rules.
    """
    print("="*30)
    print(title)
    print("="*30)

    # Rows whose value in this column already appeared in an earlier row.
    print("Duplicates: ", int(frame.duplicated(subset=[column]).sum()))

    # Character length of each entry.
    _print_length_stats("Char", frame[column].map(len))

    # Whitespace-delimited word count of each entry.
    _print_length_stats("Word", frame[column].map(lambda value: len(value.split())))


# Summarise every dataset file: size, fully duplicated rows, then per-column
# statistics for the 'article' and 'question' text columns.
for fil in all_files:
    print("="*68)
    print("Name: ",fil)
    df = pd.read_json(fil,lines=True) ## Can also use jsonlines to read
    print("Samples: ",df.shape[0])
    # Rows that are identical to an earlier row across all columns.
    print("Duplicates: ",int(df.duplicated().sum()))

    _print_column_report(df, 'article', "Article Related")
    _print_column_report(df, 'question', "Question Related")

Name:  ../data/train/Task_1_train.jsonl
Samples:  3227
Duplicates:  0
Article Related
Duplicates:  57
Mean Char Count:  1548.4697861791137
Std Char Count:  905.3006128172633
Max Char Count:  10177
Min Char Count:  162
Mean Word Count:  262.4400371862411
Std Word Count:  154.53378793692275
Max Word Count:  1754
Min Word Count:  31
Question Related
Duplicates:  0
Mean Char Count:  137.82646420824295
Std Char Count:  31.117066022982975
Max Char Count:  389
Min Char Count:  32
Mean Word Count:  24.677099473194918
Std Word Count:  6.1845340781764495
Max Word Count:  73
Min Word Count:  6
Name:  ../data/train/Task_1_dev.jsonl
Samples:  837
Duplicates:  0
Article Related
Duplicates:  4
Mean Char Count:  1582.7909199522103
Std Char Count:  992.3331301339654
Max Char Count:  9563
Min Char Count:  255
Mean Word Count:  268.60812425328555
Std Word Count:  171.25288900142337
Max Word Count:  1641
Min Word Count:  41
Question Related
Duplicates:  0
Mean Char Count:  137.4551971326165
Std Char Count

## Most Frequent Tokens

In [113]:
# English stop words from the NLTK corpus, held in a set for fast membership tests.
english_stop_words = stopwords.words('english')
sw = set(english_stop_words)

In [121]:
# Count the most frequent non-stop-word tokens in the Task 1 training articles.
df = pd.read_json(T1_TRAIN,lines=True)
df['article'] = df['article'].apply(lambda article: article.lower())

# Keep only lowercase letters, digits and whitespace. The text is already
# lowercased, so an explicit a-z range suffices. (The previous 'A-z' range
# also covered the ASCII characters between 'Z' and 'a' -- [ \ ] ^ _ ` --
# which therefore leaked into the tokens.)
non_alnum_ws = re.compile(r'[^a-z0-9\s]')
text = df['article'].apply(lambda article: non_alnum_ws.sub('', article))

# Tokenise each article and drop stop words; tokens are already lowercase.
text = text.apply(
    lambda article: np.array([tok for tok in word_tokenize(article) if tok not in sw])
).values

# Accumulate token frequencies over all articles.
token_counter = Counter()
for article_tokens in text:
    token_counter.update(article_tokens)

# Plain-dict view of the counts, kept under its original name.
tokencount = dict(token_counter)
total_count = sum(token_counter.values())

print("Unique Token Count: ",len(tokencount))
print("Most Common Words")
print("="*30)
# Each line shows: token, absolute count, share of all counted tokens.
for token,count in token_counter.most_common(20):
    print(f"{token}: {count}, {count/total_count}")

Unique Token Count:  41087
Most Common Words
said: 7978, 0.016527418983951127
mr: 2465, 0.005106553997924233
would: 2194, 0.0045451438018035565
also: 2019, 0.004182609542316034
people: 1897, 0.003929871372844734
one: 1705, 0.0035321194995784244
last: 1461, 0.003026643160635823
two: 1349, 0.002794621234563809
first: 1344, 0.002784263112864166
new: 1341, 0.0027780482398443796
year: 1262, 0.0026143899169900125
years: 1248, 0.002585387176231011
could: 1222, 0.002531524943392865
told: 1203, 0.0024921640809342197
us: 1187, 0.0024590180914953606
time: 1159, 0.002401012609977357
government: 1036, 0.0021462028161661276
made: 940, 0.001947326879532973
bbc: 894, 0.0018520321598962531
police: 892, 0.0018478889112163957


### WordCloud

Collecting wordcloud
  Downloading wordcloud-1.8.0-cp38-cp38-manylinux1_x86_64.whl (372 kB)
[K     |████████████████████████████████| 372 kB 125 kB/s eta 0:00:01
Installing collected packages: wordcloud
Successfully installed wordcloud-1.8.0
