In [6]:
#importing necessery libraries
import os
import io
import re
from datetime import datetime
import numpy as np
import pandas as pd
import matplotlib
import matplotlib.pyplot as plt
import seaborn as sns
from zipfile import ZipFile
from sklearn.model_selection import train_test_split
import spacy

# Importing Data

In [2]:
path = "../data"

# specifying the zip file name
file_name = os.path.join(path,"commonlitreadabilityprize.zip")

# opening the zip file in READ mode
with ZipFile(file_name, 'r') as zip:
    # printing all the contents of the zip file
    zip.printdir()

    # extracting all the files
    print('Extracting all the files now...')
    zip.extractall(path)
    print('Done!')


File Name                                             Modified             Size
sample_submission.csv                          2021-05-02 22:19:12          108
test.csv                                       2021-05-02 22:19:12         6957
train.csv                                      2021-05-02 22:19:12      2927187
Extracting all the files now...
Done!


In [3]:
data_path = os.path.join(path,'train.csv')
data = pd.read_csv(data_path, usecols=['id','excerpt','target','standard_error'],  index_col = 'id')

In [4]:
data.head()

Unnamed: 0_level_0,excerpt,target,standard_error
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
c12129c31,When the young people returned to the ballroom...,-0.340259,0.464009
85aa80a4c,"All through dinner time, Mrs. Fayre was somewh...",-0.315372,0.480805
b69ac6792,"As Roger had predicted, the snow departed as q...",-0.580118,0.476676
dd1000b26,And outside before the palace a great garden w...,-1.054013,0.450007
37c1b32fb,Once upon a time there were Three Bears who li...,0.247197,0.510845


In [8]:
data.excerpt.tolist()[1]

'All through dinner time, Mrs. Fayre was somewhat silent, her eyes resting on Dolly with a wistful, uncertain expression. She wanted to give the child the pleasure she craved, but she had hard work to bring herself to the point of overcoming her own objections.\nAt last, however, when the meal was nearly over, she smiled at her little daughter, and said, "All right, Dolly, you may go."\n"Oh, mother!" Dolly cried, overwhelmed with sudden delight. "Really?\nOh, I am so glad! Are you sure you\'re willing?"\n"I\'ve persuaded myself to be willing, against my will," returned Mrs. Fayre, whimsically. "I confess I just hate to have you go, but I can\'t bear to deprive you of the pleasure trip. And, as you say, it would also keep Dotty at home, and so, altogether, I think I shall have to give in."\n"Oh, you angel mother! You blessed lady! How good you are!" And Dolly flew around the table and gave her mother a hug that nearly suffocated her.'

### EDA

In [None]:
# excerpt length
# sentence length
# average word length
# tf-idf threshold least common words
# unique vocab 
# excerpt similarity for model eval
# POS 
# number of stopwords
# entity recognition

# compare results after pre-processing text
# tf-idf threshold least common words

# pre-processing steps:
# stemming
# lemmatization
# punctuation
# stopwords
# lowercase

In [61]:
def pos(text):
    doc = nlp(text)
    return [token.pos_ for token in doc]
def is_stop(text):
    doc = nlp(text)
    return sum([token.is_stop for token in doc]) / len(doc)
def avg_token_length(text):
    doc = nlp(text)
    return np.mean([len(token.text) for token in doc if not token.is_stop])
def n_token_sents(text):
    doc = nlp(text)
    return np.mean([len(sent) for sent in doc.sents])
def n_sents(text):
    doc = nlp(text)
    return len(list(doc.sents))

In [62]:
data['excerpt_length'] = data.excerpt.apply(lambda x: len(x))
data['n_sents'] = data.excerpt.apply(lambda x: n_sents(x))
data['pos'] = data.excerpt.apply(lambda x: pos(x))
data['n_stop_words'] = data.excerpt.apply(lambda x: is_stop(x))
data['avg_token_length'] = data.excerpt.apply(lambda x: avg_token_length(x))
data['n_token_sents'] = data.excerpt.apply(lambda x: n_token_sents(x))

In [44]:
from sklearn.feature_extraction.text import TfidfVectorizer
corpus = [
    'This is the first document.',
    'This document is the second document.',
    'And this is the third one.',
    'Is this the first document?',
]
vectorizer = TfidfVectorizer()

In [59]:
vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(data.excerpt.tolist())
pd.DataFrame(X.toarray(), columns=vectorizer.get_feature_names())

Unnamed: 0,absorbed,accepted,affected,after,again,against,all,also,altogether,am,...,will,willing,winter,wistful,with,work,would,yet,you,young
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.039764,0.0,0.0,...,0.0,0.0,0.052284,0.0,0.09264,0.0,0.039764,0.0,0.0,0.052284
1,0.0,0.0,0.0,0.0,0.0,0.046956,0.093912,0.046956,0.061741,0.061741,...,0.061741,0.123482,0.0,0.061741,0.072931,0.061741,0.046956,0.0,0.422602,0.0
2,0.065719,0.065719,0.065719,0.065719,0.065719,0.099962,0.049981,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.038815,0.0,0.0,0.065719,0.199923,0.0


# Split train and test dataset

In [17]:
X_train, X_test, y_train, y_test = train_test_split(data['excerpt'], data['target'], 
                                                    test_size=0.3, random_state=42)