In [1]:
#importing necessery libraries
import os
import io
import re
from datetime import datetime
import numpy as np
import pandas as pd
import matplotlib
import matplotlib.pyplot as plt
import seaborn as sns
from zipfile import ZipFile
from sklearn.model_selection import train_test_split
import spacy
nlp = spacy.load("en_core_web_sm")

# Importing Data

In [2]:
path = "../data"

# specifying the zip file name
file_name = os.path.join(path,"commonlitreadabilityprize.zip")

# opening the zip file in READ mode
with ZipFile(file_name, 'r') as zip:
    # printing all the contents of the zip file
    zip.printdir()

    # extracting all the files
    print('Extracting all the files now...')
    zip.extractall(path)
    print('Done!')


File Name                                             Modified             Size
sample_submission.csv                          2021-05-02 22:19:12          108
test.csv                                       2021-05-02 22:19:12         6957
train.csv                                      2021-05-02 22:19:12      2927187
Extracting all the files now...
Done!


In [3]:
data_path = os.path.join(path,'train.csv')
data = pd.read_csv(data_path, usecols=['id','excerpt','target','standard_error'],  index_col = 'id')

In [4]:
data.head()

Unnamed: 0_level_0,excerpt,target,standard_error
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
c12129c31,When the young people returned to the ballroom...,-0.340259,0.464009
85aa80a4c,"All through dinner time, Mrs. Fayre was somewh...",-0.315372,0.480805
b69ac6792,"As Roger had predicted, the snow departed as q...",-0.580118,0.476676
dd1000b26,And outside before the palace a great garden w...,-1.054013,0.450007
37c1b32fb,Once upon a time there were Three Bears who li...,0.247197,0.510845


In [5]:
data.excerpt.tolist()[1]

'All through dinner time, Mrs. Fayre was somewhat silent, her eyes resting on Dolly with a wistful, uncertain expression. She wanted to give the child the pleasure she craved, but she had hard work to bring herself to the point of overcoming her own objections.\nAt last, however, when the meal was nearly over, she smiled at her little daughter, and said, "All right, Dolly, you may go."\n"Oh, mother!" Dolly cried, overwhelmed with sudden delight. "Really?\nOh, I am so glad! Are you sure you\'re willing?"\n"I\'ve persuaded myself to be willing, against my will," returned Mrs. Fayre, whimsically. "I confess I just hate to have you go, but I can\'t bear to deprive you of the pleasure trip. And, as you say, it would also keep Dotty at home, and so, altogether, I think I shall have to give in."\n"Oh, you angel mother! You blessed lady! How good you are!" And Dolly flew around the table and gave her mother a hug that nearly suffocated her.'

### EDA

In [6]:
# excerpt length
# sentence length
# average word length
# tf-idf threshold least common words
# unique vocab  - see what to do with the word_counts dict
# excerpt similarity for model eval - word2vec (doc to pic  - the ones with low SD/an alternative would be to
# take some difficult and some easy excerpts)
# POS 
# number of stopwords
# entity recognition

# compare results after pre-processing text
# tf-idf threshold least common words

# pre-processing steps:
# stemming
# lemmatization
# punctuation
# stopwords
# lowercase

In [7]:
def pos(text):
    doc = nlp(text)
    return [token.pos_ for token in doc]
def is_stop(text):
    doc = nlp(text)
    return sum([token.is_stop for token in doc]) / len(doc)
def avg_token_length(text):
    doc = nlp(text)
    return np.mean([len(token.text) for token in doc if not token.is_stop])
def n_token_sents(text):
    doc = nlp(text)
    return np.mean([len(sent) for sent in doc.sents])
def n_sents(text):
    doc = nlp(text)
    return len(list(doc.sents))

In [8]:
# data['excerpt_length'] = data.excerpt.apply(lambda x: len(x))
# data['n_sents'] = data.excerpt.apply(lambda x: n_sents(x))
# data['pos'] = data.excerpt.apply(lambda x: pos(x))
# data['n_stop_words'] = data.excerpt.apply(lambda x: is_stop(x))
# data['avg_token_length'] = data.excerpt.apply(lambda x: avg_token_length(x))
# data['n_token_sents'] = data.excerpt.apply(lambda x: n_token_sents(x))

# TFiDF feature engineering

In [9]:
from sklearn.feature_extraction.text import TfidfVectorizer
corpus = [
    'This is the first document.',
    'This document is the second document.',
    'And this is the third one.',
    'Is this the first document?',
]
vectorizer = TfidfVectorizer()

In [10]:
vectorizer= TfidfVectorizer()
X = vectorizer.fit_transform(data.excerpt.tolist())
# pd.DataFrame(X.toarray(), columns=vectorizer.get_feature_names())

In [11]:
# sum tfidf frequency of each term through documents
sums = X.sum(axis=1)

# visualize supposedly easiest and hardest excerpts based on their tfidf scores
data['tfidf_doc_score'] = np.array(sums).flatten()
print(data.sort_values('tfidf_doc_score').excerpt[-1])

The cross-Atlantic flight of Charles "Lucky Lindy" Lindbergh in 1927 made him an instant global hero, but even lesser daredevils earned brief fame. A Texan won a $500 bet by pushing, in 22 days, a peanut with his nose up the 14,400 feet-high Pikes Peak. A Louisville housewife won a $200 prize for listening to a radio station for 106 hours without falling asleep (She had to be hospitalized for a combination of delirium and exhaustion).
In no mood to worry whether the good times would last, Americans were happily spending money they had had to save because of wartime shortages, and there was a plethora of new marvelous products to buy. Automobiles, an expensive prestige symbol before the start of WWI, became mass-produced, cheaper and a necessity for taking the new roads to America's thriving cities. By 1927, Ford discontinued the Model T after selling 15 million of them.
Industries switched from coal power to electricity, the production of which almost quadrupled; telephone lines began 

## Creating a dictionary of rarest terms

In [12]:
terms = vectorizer.get_feature_names()

# sum tfidf frequency of each term through documents
sums_1 = X.sum(axis=0)

# connecting term to its sums frequency
temp = []
for col, term in enumerate(terms):
    temp.append( (term, sums_1[0,col] ))

ranking = pd.DataFrame(temp, columns=['term','rank'])
ranking = ranking.sort_values('rank')

top_terms = ranking['term'][0:50].to_numpy()

In [13]:
temp_df = pd.DataFrame([vectorizer.get_feature_names(),vectorizer.idf_]).T.rename(columns={0:'word',1:'idf_score'}).sort_values('idf_score', ascending=True)

In [14]:
# temp_df[temp_df['idf_score']>=8.25665].shape
# temp_df
# 
# top_t_temp = ['immeasurable', 'immediacy', 'immobile']


In [15]:
# data['top_t_idf'] = data.word_counts.apply(lambda x: count_word_in_dict(x, top_idf_score))

In [16]:
# data[data['top_t_test']>0].shape

## Counting occurrences of words in dictionary in excerpt

In [17]:
# Tokenizing excerpts
def tokenize(text):
    doc = nlp(text.lower())
    return [token.text for token in doc]

data['tokens'] = data.excerpt.apply(lambda x: tokenize(x))

In [18]:
from collections import Counter

# Count words occurence in excerpt
def count(tokens):
    dict_word_count = Counter(tokens)
    return dict_word_count
    
data['word_counts'] = data.tokens.apply(lambda x: count(x))

In [19]:
top_idf_score = temp_df[temp_df['idf_score']>=8.25665]

def count_word_in_dict(word_count,top_terms):
    counter=0
    for key, value in word_count.items():
        if key in top_terms:
            counter+=value
    return counter

In [20]:
# def count_word_in_dict(word_count):
#     counter=0
#     for key, value in word_count.items():
#         if key in top_terms:
#             counter+=value
#     return counter

In [21]:
data['w_idf_count'] = data.word_counts.apply(lambda x: count_word_in_dict(x, top_idf_score))

In [24]:
# data.head()
data[data['w_idf_count']>1]

Unnamed: 0_level_0,excerpt,target,standard_error,tfidf_doc_score,tokens,word_counts,w_idf_count
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
7f8863c77,A crystal or crystalline solid is a solid mate...,-2.14045,0.482121,7.756534,"[a, crystal, or, crystalline, solid, is, a, so...","{'a': 6, 'crystal': 7, 'or': 3, 'crystalline':...",2
a15ef0c1c,"Digital data, in information theory and inform...",-2.241483,0.478949,7.636448,"[digital, data, ,, in, information, theory, an...","{'digital': 6, 'data': 1, ',': 14, 'in': 6, 'i...",3
70f07093e,Environment is living things and what is aroun...,-0.539375,0.472558,6.6361,"[environment, is, living, things, and, what, i...","{'environment': 9, 'is': 5, 'living': 4, 'thin...",2
fff594b50,Ancient Egyptians used pictures to make a phon...,-0.705826,0.462108,7.401842,"[ancient, egyptians, used, pictures, to, make,...","{'ancient': 1, 'egyptians': 2, 'used': 2, 'pic...",5
3fbefb41a,"The word ""information"" is used in many differe...",-0.286443,0.472696,6.377611,"[the, word, "", information, "", is, used, in, m...","{'the': 7, 'word': 2, '""': 12, 'information': ...",2
5d139e7ab,The atoms of a chemical element can exist in d...,-1.176278,0.448208,5.076955,"[the, atoms, of, a, chemical, element, can, ex...","{'the': 16, 'atoms': 1, 'of': 9, 'a': 1, 'chem...",2
c6c5c7b1d,A nation is a group of people who share the sa...,-0.049815,0.482689,8.858319,"[a, nation, is, a, group, of, people, who, sha...","{'a': 10, 'nation': 3, 'is': 4, 'group': 2, 'o...",2
f7c205371,"Radiosurgery is surgery using radiation, that ...",-1.961219,0.473896,8.030628,"[radiosurgery, is, surgery, using, radiation, ...","{'radiosurgery': 4, 'is': 3, 'surgery': 1, 'us...",2
82f92f8a5,Computer software (often called just software)...,-0.29726,0.465326,6.818912,"[computer, software, (, often, called, just, s...","{'computer': 6, 'software': 7, '(': 3, 'often'...",2
d2f71c57b,A species is a kind of organism. It is a basic...,-0.095004,0.492757,7.409833,"[a, species, is, a, kind, of, organism, ., it,...","{'a': 10, 'species': 7, 'is': 7, 'kind': 3, 'o...",2


# Split train and test dataset

In [None]:
X_train, X_test, y_train, y_test = train_test_split(data['excerpt'], data['target'], 
                                                    test_size=0.3, random_state=42)