In [14]:
# Importing necessary libraries
import os
import io
import re
from datetime import datetime
import numpy as np
import pandas as pd
import matplotlib
import matplotlib.pyplot as plt
import seaborn as sns
from zipfile import ZipFile
from sklearn.model_selection import train_test_split
import spacy
nlp = spacy.load("en_core_web_sm")

# Importing Data

In [3]:
path = "../data"

# Location of the competition archive inside the data directory.
file_name = os.path.join(path, "commonlitreadabilityprize.zip")

# Open the archive read-only; the context manager closes it when the block ends.
# The handle is named `archive` rather than `zip` so the built-in zip() is not
# shadowed in the notebook's global namespace for every later cell.
with ZipFile(file_name, 'r') as archive:
    # Print a table of the archive's contents.
    archive.printdir()

    # Unpack every member into the data directory.
    print('Extracting all the files now...')
    archive.extractall(path)
    print('Done!')


File Name                                             Modified             Size
sample_submission.csv                          2021-05-02 22:19:12          108
test.csv                                       2021-05-02 22:19:12         6957
train.csv                                      2021-05-02 22:19:12      2927187
Extracting all the files now...
Done!


In [4]:
# Read the training CSV, keeping only the four columns used in this notebook
# and using the excerpt id as the row index.
data_path = os.path.join(path, 'train.csv')
data = pd.read_csv(
    data_path,
    index_col='id',
    usecols=['id', 'excerpt', 'target', 'standard_error'],
)

In [5]:
# Preview the first rows of the loaded training frame.
data.head()

Unnamed: 0_level_0,excerpt,target,standard_error
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
c12129c31,When the young people returned to the ballroom...,-0.340259,0.464009
85aa80a4c,"All through dinner time, Mrs. Fayre was somewh...",-0.315372,0.480805
b69ac6792,"As Roger had predicted, the snow departed as q...",-0.580118,0.476676
dd1000b26,And outside before the palace a great garden w...,-1.054013,0.450007
37c1b32fb,Once upon a time there were Three Bears who li...,0.247197,0.510845


In [6]:
# Show one complete excerpt (positional index 1, i.e. the second row) to
# inspect the raw text as stored in the frame.
data.excerpt.tolist()[1]

'All through dinner time, Mrs. Fayre was somewhat silent, her eyes resting on Dolly with a wistful, uncertain expression. She wanted to give the child the pleasure she craved, but she had hard work to bring herself to the point of overcoming her own objections.\nAt last, however, when the meal was nearly over, she smiled at her little daughter, and said, "All right, Dolly, you may go."\n"Oh, mother!" Dolly cried, overwhelmed with sudden delight. "Really?\nOh, I am so glad! Are you sure you\'re willing?"\n"I\'ve persuaded myself to be willing, against my will," returned Mrs. Fayre, whimsically. "I confess I just hate to have you go, but I can\'t bear to deprive you of the pleasure trip. And, as you say, it would also keep Dotty at home, and so, altogether, I think I shall have to give in."\n"Oh, you angel mother! You blessed lady! How good you are!" And Dolly flew around the table and gave her mother a hug that nearly suffocated her.'

### EDA

In [11]:
# excerpt length
# sentence length
# average word length
# tf-idf threshold least common words
# unique vocab 
# excerpt similarity for model eval
# POS 
# number of stopwords
# entity recognition

# compare results after pre-processing text
# tf-idf threshold least common words

# pre-processing steps:
# stemming
# lemmatization
# punctuation
# stopwords
# lowercase

In [15]:
def pos(text):
    """Return the part-of-speech tag (``pos_``) of every token in ``text``.

    The text is parsed with the module-level ``nlp`` pipeline and the tags are
    returned as a list, in token order.
    """
    tags = []
    for token in nlp(text):
        tags.append(token.pos_)
    return tags
def is_stop(text):
    """Return the fraction of tokens in ``text`` that are stop words.

    Despite the name, the result is a ratio (stop-word tokens divided by all
    tokens), not a boolean. The text is parsed with the module-level ``nlp``
    pipeline.

    Returns:
        A float between 0 and 1, or ``0.0`` when the parsed document contains
        no tokens (previously this case raised ``ZeroDivisionError``).
    """
    doc = nlp(text)
    n_tokens = len(doc)
    if n_tokens == 0:
        # Nothing to count: avoid dividing by zero on empty input.
        return 0.0
    return sum(token.is_stop for token in doc) / n_tokens
def avg_token_length(text):
    """Return the mean character length of the non-stop-word tokens in ``text``.

    The text is parsed with the module-level ``nlp`` pipeline. Stop words are
    the only tokens excluded; every other token the pipeline produces counts
    toward the mean.

    Returns:
        The mean of ``len(token.text)`` over the kept tokens, or NaN when no
        token is kept (empty text, or text made up only of stop words).
    """
    doc = nlp(text)
    lengths = [len(token.text) for token in doc if not token.is_stop]
    if not lengths:
        # np.mean([]) also yields NaN but emits a "Mean of empty slice"
        # RuntimeWarning; return the same value without the warning.
        return np.nan
    return np.mean(lengths)
def n_token_sents(text):
    """Return the mean sentence length of ``text``.

    The text is parsed with the module-level ``nlp`` pipeline and the length
    of each sentence is taken as ``len(sent)`` for every ``sent`` in
    ``doc.sents``.

    Returns:
        The mean of the sentence lengths, or NaN when the document yields no
        sentences (e.g. empty text).
    """
    doc = nlp(text)
    sentence_lengths = [len(sent) for sent in doc.sents]
    if not sentence_lengths:
        # np.mean([]) also yields NaN but emits a "Mean of empty slice"
        # RuntimeWarning; return the same value without the warning.
        return np.nan
    return np.mean(sentence_lengths)
def n_sents(text):
    """Return the number of sentences found in ``text``.

    The text is parsed with the module-level ``nlp`` pipeline and the items of
    ``doc.sents`` are counted.
    """
    sentence_count = 0
    for _ in nlp(text).sents:
        sentence_count += 1
    return sentence_count

In [16]:
# Add one engineered column per feature, in the order listed here.
# Each spaCy-based helper parses the excerpt itself, so every excerpt is run
# through the pipeline once per helper. 'n_stop_words' holds the value
# returned by is_stop(), which is a ratio rather than a count.
feature_extractors = {
    'excerpt_length': len,
    'n_sents': n_sents,
    'pos': pos,
    'n_stop_words': is_stop,
    'avg_token_length': avg_token_length,
    'n_token_sents': n_token_sents,
}
for column_name, extractor in feature_extractors.items():
    data[column_name] = data.excerpt.apply(extractor)

In [17]:
# NOTE(review): this import sits mid-notebook while the other imports are
# grouped in the first cell; consider moving it there.
from sklearn.feature_extraction.text import TfidfVectorizer
# Small hand-written example corpus. The visible cells below fit the
# vectorizer on data.excerpt instead, so this list looks like leftover
# scaffolding — TODO confirm it is unused before removing.
corpus = [
    'This is the first document.',
    'This document is the second document.',
    'And this is the third one.',
    'Is this the first document?',
]
# Default-configured TF-IDF vectorizer (not fitted in this cell).
vectorizer = TfidfVectorizer()

In [18]:
# Fit a fresh TF-IDF vectorizer on every excerpt; X is the resulting sparse
# document-term matrix (one row per excerpt, one column per vocabulary term).
vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(data.excerpt.tolist())

# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# prefer get_feature_names_out() and fall back only on older releases.
if hasattr(vectorizer, "get_feature_names_out"):
    feature_names = list(vectorizer.get_feature_names_out())
else:
    feature_names = vectorizer.get_feature_names()

# Display the matrix as a frame without densifying it: X.toarray() would
# allocate a full (n_excerpts x n_terms) float array just for a preview.
pd.DataFrame.sparse.from_spmatrix(X, columns=feature_names)

Unnamed: 0,00,000,000th,001,02,03,034,04,049,06,...,µv,½d,ædui,ægidus,æmilius,æneas,æolian,æquians,æschylus,ça
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2829,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2830,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2831,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2832,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [47]:
# Vocabulary terms, in column order of X. get_feature_names() was deprecated in
# scikit-learn 1.0 and removed in 1.2, so prefer get_feature_names_out() and
# fall back only on older releases. Kept as a plain list, as before.
if hasattr(vectorizer, "get_feature_names_out"):
    terms = list(vectorizer.get_feature_names_out())
else:
    terms = vectorizer.get_feature_names()

# sum tf-idf weight of each term across all documents
sums = X.sum(axis=0)

# Pair each term with its summed weight. The per-term sums are flattened to
# 1-D so the frame is built in one step instead of a Python loop over columns.
ranking = pd.DataFrame({'term': terms, 'rank': np.asarray(sums).ravel()})
print(ranking.sort_values('rank', ascending=True).head(50))

                    term      rank
23160                 su  0.049764
1113           airplanes  0.049764
23830              tease  0.052762
14999              mbisa  0.053113
16930                oto  0.053113
14831              margy  0.053113
1723             arbiter  0.058505
17112                 p6  0.058505
12360                inc  0.058505
15255  microarchitecture  0.058505
15040          mebibytes  0.058505
8843             exocarp  0.058739
17621            peppers  0.058739
8339            endocarp  0.058739
18032               pits  0.058739
2214            avocados  0.058739
13317            juicier  0.058739
6469                ddwg  0.058891
15395          minimized  0.058891
25060       uncompressed  0.058891
5428           connector  0.058891
7045        differential  0.058891
26126            website  0.059914
333                 2005  0.059914
1488            animator  0.059914
22256            solaris  0.059914
10154       futuresplash  0.059914
25364            upd

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

# Split into train and test datasets

In [19]:
# Hold out 30% of the rows for testing: the raw excerpt text is the input and
# `target` is the label. The fixed random_state makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    data['excerpt'],
    data['target'],
    test_size=0.3,
    random_state=42,
)