In [None]:
# importing required packages
from pathlib import Path

import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import seaborn as sns

import spacy
nlp = spacy.load('en')

from spacy import displacy
#from spacy.lang.en import English
#parser = English()

#from tqdm import tqdm


from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.base import BaseEstimator, TransformerMixin 

from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score, mean_squared_error, r2_score, cohen_kappa_score

np.random.seed(42)
%matplotlib inline

In [None]:
# set up display area to show dataframe in jupyter qtconsole

#pd.set_option('display.height', 1000)
#pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)
#pd.set_option('display.width', 1000)

pd.set_option('display.max_colwidth', -1)

In [None]:
myDir = Path.cwd().parents[0]
dataFolder = myDir / 'data/asap-sas'
ratingsFolder = myDir / 'data/ratings'

print(dataFolder)

gradeMap = {1: 10,
                2: 10,
                3: 10,
                4:10,
                5: 10,
                6: 10,
                7:10,
                8:10,
                9:10,
                10:8}

subjectMap = {1: 'Science',
            2: 'Science',
            3: 'English Language Arts',
            4: 'English Language Arts',
            5: 'Biology',
            6: 'Biology',
            7:'English',
            8:'English',
            9:'English',
            10:'Science'}

df = pd.read_csv(dataFolder/'train.tsv', sep='\t', header=0)  #read data into dataframe
df.drop('Score2', inplace=True, axis=1) #Score 2 is for inter-rate reliability only

df['subject'] = df['EssaySet'].map(subjectMap)
df['studentGrade'] = df['EssaySet'].map(gradeMap)

df = df[['Id','EssaySet','subject','studentGrade','EssayText','Score1']] #rearrange columns
df.head()

In [4]:
# Take only essay set 1
set_1 = df[(df['EssaySet'] == 1)].copy()
set_1.shape

(1672, 6)

In [5]:
del df

In [6]:
X = set_1[['EssayText','Score1']]
y = X.pop('Score1')


train, test, y_train, y_test = train_test_split(X, y, test_size=0.30, random_state=42)

#Adding the is_copy to False otherwise we get SettingWithCopyWarning
train = train.copy()
test = test.copy()

train.head()

Unnamed: 0,EssayText
1145,"You would need to know how much vinegar was put in to each sample, find the size and shape of the container so the same amount of vinegar was actually covering the sample. You would need to know the shape or volume of sample because the surface are has to be the same so the same amount is affected by the vinegar."
842,"In order to replicate experiment I would need to know exactly how much vinegar to pour in each container, how much of each sample to put in the container, and"
1554,The additional information you would need in order to replicate the experiment form a hypothesis. Draw a conclusion. Say what you are experimenting.
1526,To replicate this experiment you would need to state you problem. What is the person for this lab also you need to indicate your independent and dependent variables.
497,1) I would need to know how much vinegar is being put into the sample. ^p 2) What are they trying to find out through pass is to amount. ^p 3) Was there a control group in this experiment.


In [7]:
new_df = train[['EssayText']][0:5]
new_df = new_df.copy() 

new_df.head()

Unnamed: 0,EssayText
1145,"You would need to know how much vinegar was put in to each sample, find the size and shape of the container so the same amount of vinegar was actually covering the sample. You would need to know the shape or volume of sample because the surface are has to be the same so the same amount is affected by the vinegar."
842,"In order to replicate experiment I would need to know exactly how much vinegar to pour in each container, how much of each sample to put in the container, and"
1554,The additional information you would need in order to replicate the experiment form a hypothesis. Draw a conclusion. Say what you are experimenting.
1526,To replicate this experiment you would need to state you problem. What is the person for this lab also you need to indicate your independent and dependent variables.
497,1) I would need to know how much vinegar is being put into the sample. ^p 2) What are they trying to find out through pass is to amount. ^p 3) Was there a control group in this experiment.


## Feature Extraction

In [39]:
content_words = ['NOUN', 'PROPN', 'VERB', 'ADJ', 'ADV']
tokens = []
word_count = []

avg_word_len = []
X_train = pd.DataFrame()

aoa_ratings_df = pd.read_csv(ratingsFolder/'AoA_Ratings.csv')
aoa_ratings = dict(zip(aoa_ratings_df.Word, aoa_ratings_df.AoA))
maturity_tokens = []
maturity = []

conc_ratings_df = pd.read_csv(ratingsFolder/'Concreteness_Ratings.csv')
conc_ratings = dict(zip(conc_ratings_df.Word, conc_ratings_df.Concreteness))
concreteness = []
conc_tokens = []    


for doc in nlp.pipe(new_df['EssayText'], batch_size=50, n_threads=4):
        
    if doc.is_parsed:
        
        
        
        #Add placeholders for CONTENT words, else parse as usual
        tokens.append(['__{}__'.format(w.pos_) if w.pos_ in content_words else w.lemma_.lower() for w in doc])
        
        #Count words which are not punctuation
        tmp_word_count = []
        word_count.append(len([w for w in doc if not w.is_punct]))
        tmp_word_count.append([len(w.shape_) for w in doc if not w.is_punct])
        
        for i in tmp_word_count:
            avg_word_len.append(sum(i)/len(i))
            
        
        #-------------------Maturity
        mat_tmp = []
        maturity_tokens.append([w.text.lower() if w.lemma_ == '-PRON-' else w.lemma_.lower() for w in doc])

        #maturity.append([value.get('name') for value in d.values()])
        mat_tmp.append([aoa_ratings[t] for a in maturity_tokens for t in a if t in aoa_ratings])
        
         #Now get avg maturity per doc
        for j in mat_tmp:
            maturity.append(sum(j)/len(j))
            
        #----------Concreteness
        conc_tmp = []
        conc_tokens.append([w.text.lower() for w in doc])
        conc_tmp.append([conc_ratings[t] for a in conc_tokens for t in a if t in conc_ratings])
        
        #Now get avg concreteness per doc
        for k in conc_tmp:
            concreteness.append(sum(k)/len(k))
    
    else:
        # We want to make sure that the lists of parsed results have the
        # same number of entries of the original Dataframe, so add some blanks in case the parse fails

        tokens.append(None)
        word_count.append(None)
        avg_word_len.append(None)
        maturity.append(None)
        concreteness.append(None)



new_df['total_words'] = word_count
new_df['avg_word_length'] = avg_word_len
new_df['maturity'] = maturity
new_df['concreteness'] = concreteness


new_df.head()

3.2580645161290325
3.2
3.3043478260869565
3.4642857142857144
3.051282051282051


Unnamed: 0,EssayText,total_words,avg_word_length,maturity,concreteness
1145,"You would need to know how much vinegar was put in to each sample, find the size and shape of the container so the same amount of vinegar was actually covering the sample. You would need to know the shape or volume of sample because the surface are has to be the same so the same amount is affected by the vinegar.",62,3.258065,5.125484,2.293443
842,"In order to replicate experiment I would need to know exactly how much vinegar to pour in each container, how much of each sample to put in the container, and",30,3.2,5.17033,2.344222
1554,The additional information you would need in order to replicate the experiment form a hypothesis. Draw a conclusion. Say what you are experimenting.,23,3.304348,5.313158,2.366161
1526,To replicate this experiment you would need to state you problem. What is the person for this lab also you need to indicate your independent and dependent variables.,28,3.464286,5.422465,2.39
497,1) I would need to know how much vinegar is being put into the sample. ^p 2) What are they trying to find out through pass is to amount. ^p 3) Was there a control group in this experiment.,39,3.051282,5.353314,2.366512


In [40]:
testing = new_df[['total_words','avg_word_length','maturity','concreteness']]
testing

Unnamed: 0,total_words,avg_word_length,maturity,concreteness
1145,62,3.258065,5.125484,2.293443
842,30,3.2,5.17033,2.344222
1554,23,3.304348,5.313158,2.366161
1526,28,3.464286,5.422465,2.39
497,39,3.051282,5.353314,2.366512


In [74]:
from sklearn.preprocessing import StandardScaler
scaled_manual_features = StandardScaler().fit_transform(new_df[['total_words','avg_word_length','maturity','concreteness']])
man_features_df = pd.DataFrame(scaled_manual_features, index=new_df.index, columns=['total_words','avg_word_length','maturity','concreteness'])
man_features_df.head()

Unnamed: 0,total_words,avg_word_length,maturity,concreteness
1145,1.854099,0.018321,-1.353381,-1.793126
842,-0.463525,-0.412628,-0.952675,-0.239957
1554,-0.970505,0.361831,0.323524,0.431064
1526,-0.608376,1.548875,1.300203,1.160223
497,0.188307,-1.516399,0.682329,0.441797


## BoW, N-grams (Tokens + POS)    

In [70]:
def spacy_tokenizer(doc):
    content_words = ['NOUN', 'PROPN', 'VERB', 'ADJ', 'ADV']
    
    doc = nlp(doc)
    
    #remove ^p (bullet points)
    
    return ['__{}__'.format(w.pos_) if w.pos_ in content_words else w.lemma_.lower() for w in doc]  
    


vectorizer = CountVectorizer(tokenizer=spacy_tokenizer,  ngram_range=(2,3), max_features=2000)
text_ngrams = vectorizer.fit_transform(new_df['EssayText'])

print(text_ngrams.shape)

#print(count_vect.vocabulary_)
#pd.DataFrame(X_train_counts.toarray(), columns=vectorizer.get_feature_names()).head(10)

scaled_text_ngrams = StandardScaler(with_mean=False).fit_transform(text_ngrams)
text_ngrams_df = pd.DataFrame(scaled_text_ngrams.toarray(), columns=vectorizer.get_feature_names(), index=new_df.index)
text_ngrams_df.head()
#DataFrame.values

(5, 218)




Unnamed: 0,) __NOUN__,) __NOUN__ __VERB__,) __VERB__,) __VERB__ __ADV__,) i,) i __VERB__,", __ADV__",", __ADV__ __ADJ__",", __VERB__",", __VERB__ the",", and",-pron- __NOUN__,-pron- __NOUN__ .,-pron- __VERB__,-pron- __VERB__ __VERB__,-pron- __VERB__ to,. -pron-,. -pron- __VERB__,. ^p,. ^p 2,. ^p 3,. __NOUN__,. __NOUN__ __VERB__,. __VERB__,. __VERB__ __NOUN__,. __VERB__ a,1 ),1 ) i,2 ),2 ) __NOUN__,3 ),3 ) __VERB__,^p 2,^p 2 ),^p 3,^p 3 ),__ADJ__ __ADJ__,__ADJ__ __ADJ__ and,__ADJ__ __ADV__,__ADJ__ __ADV__ the,__ADJ__ __NOUN__,__ADJ__ __NOUN__ -pron-,__ADJ__ __NOUN__ .,__ADJ__ __NOUN__ __VERB__,__ADJ__ __NOUN__ of,__ADJ__ __NOUN__ to,__ADJ__ and,__ADJ__ and __ADJ__,__ADJ__ of,__ADJ__ of each,__ADV__ -pron-,__ADV__ -pron- __VERB__,__ADV__ __ADJ__,__ADV__ __ADJ__ __NOUN__,__ADV__ __ADJ__ of,__ADV__ __ADV__,__ADV__ __ADV__ __ADJ__,__ADV__ __VERB__,__ADV__ __VERB__ the,__ADV__ a,__ADV__ a __NOUN__,__ADV__ the,__ADV__ the __ADJ__,"__NOUN__ ,","__NOUN__ , __ADV__","__NOUN__ , __VERB__","__NOUN__ , and",__NOUN__ -pron-,__NOUN__ -pron- __VERB__,__NOUN__ .,__NOUN__ . -pron-,__NOUN__ . ^p,__NOUN__ . __NOUN__,__NOUN__ . __VERB__,__NOUN__ __ADV__,__NOUN__ __ADV__ -pron-,__NOUN__ __NOUN__,__NOUN__ __NOUN__ in,__NOUN__ __VERB__,__NOUN__ __VERB__ -pron-,__NOUN__ __VERB__ __ADV__,__NOUN__ __VERB__ __VERB__,__NOUN__ __VERB__ a,__NOUN__ __VERB__ the,__NOUN__ __VERB__ to,__NOUN__ and,__NOUN__ and __NOUN__,__NOUN__ because,__NOUN__ because the,__NOUN__ for,__NOUN__ for this,__NOUN__ i,__NOUN__ i __VERB__,__NOUN__ in,__NOUN__ in this,__NOUN__ of,__NOUN__ of __NOUN__,__NOUN__ of the,__NOUN__ or,__NOUN__ or __NOUN__,__NOUN__ so,__NOUN__ so the,__NOUN__ to,__NOUN__ to __VERB__,__VERB__ -pron-,__VERB__ -pron- __NOUN__,__VERB__ -pron- __VERB__,__VERB__ .,__VERB__ . ^p,__VERB__ __ADJ__,__VERB__ __ADJ__ __ADJ__,__VERB__ __ADV__,__VERB__ __ADV__ __ADJ__,__VERB__ __ADV__ __ADV__,__VERB__ __ADV__ __VERB__,__VERB__ __ADV__ a,__VERB__ __NOUN__,__VERB__ __NOUN__ -pron-,__VERB__ __NOUN__ i,__VERB__ __VERB__,__VERB__ __VERB__ .,__VERB__ __VERB__ __VERB__,__VERB__ __VERB__ by,__VERB__ __VERB__ in,__VERB__ __VERB__ into,__VERB__ __VERB__ to,__VERB__ a,__VERB__ a __NOUN__,__VERB__ by,__VERB__ by the,__VERB__ in,__VERB__ in __NOUN__,__VERB__ in each,__VERB__ in the,__VERB__ in to,__VERB__ into,__VERB__ into the,__VERB__ out,__VERB__ out through,__VERB__ the,__VERB__ the __ADJ__,__VERB__ the __NOUN__,__VERB__ this,__VERB__ this __NOUN__,__VERB__ to,__VERB__ to __VERB__,a __NOUN__,a __NOUN__ .,a __NOUN__ __NOUN__,and __ADJ__,and __ADJ__ __NOUN__,and __NOUN__,and __NOUN__ of,because the,because the __NOUN__,by the,by the __NOUN__,each __NOUN__,"each __NOUN__ ,",each __NOUN__ to,for this,for this __NOUN__,i __VERB__,i __VERB__ __VERB__,in __NOUN__,in __NOUN__ to,in each,in each __NOUN__,in the,in the __NOUN__,in this,in this __NOUN__,in to,in to each,into the,into the __NOUN__,of __NOUN__,of __NOUN__ __VERB__,of __NOUN__ because,of each,of each __NOUN__,of the,of the __NOUN__,or __NOUN__,or __NOUN__ of,out through,out through __NOUN__,so the,so the __ADJ__,the __ADJ__,the __ADJ__ __ADV__,the __ADJ__ __NOUN__,the __NOUN__,"the __NOUN__ ,",the __NOUN__ .,the __NOUN__ __VERB__,the __NOUN__ and,the __NOUN__ for,the __NOUN__ or,the __NOUN__ so,this __NOUN__,this __NOUN__ -pron-,this __NOUN__ .,this __NOUN__ __ADV__,through __NOUN__,through __NOUN__ __VERB__,to __VERB__,to __VERB__ -pron-,to __VERB__ .,to __VERB__ __ADJ__,to __VERB__ __ADV__,to __VERB__ __NOUN__,to __VERB__ in,to __VERB__ out,to __VERB__ the,to __VERB__ this,to each,to each __NOUN__
1145,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.5,2.5,0.0,0.0,0.0,2.5,2.236068,0.0,2.5,2.5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.5,2.5,3.75,0.0,0.0,2.5,2.5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.336306,2.041241,0.0,0.0,0.0,2.5,2.5,0.0,0.0,2.5,2.5,1.25,0.0,2.5,0.0,0.0,0.0,2.5,2.5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.721655,0.0,2.5,2.572479,0.0,0.0,0.0,2.5,2.5,2.5,2.5,0.0,0.0,0.0,0.0,0.0,0.0,2.5,2.5,2.5,2.5,2.5,2.5,2.5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.236068,2.041241,0.0,2.5,0.0,0.0,0.0,0.0,3.340766,0.0,0.0,2.5,2.041241,0.0,3.061862,0.0,0.0,2.5,2.5,1.336306,0.0,0.0,0.0,2.5,0.0,0.0,0.0,0.0,2.721655,2.5,2.738613,0.0,0.0,2.572479,2.572479,0.0,0.0,0.0,0.0,0.0,2.5,2.5,2.5,2.5,2.5,2.5,1.25,2.041241,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.5,2.5,0.0,0.0,2.5,2.5,2.5,0.0,0.0,2.5,2.5,2.5,2.5,0.0,0.0,2.5,2.5,2.572479,2.5,2.5,3.0,0.0,2.5,2.041241,2.5,0.0,2.5,2.5,0.0,0.0,0.0,0.0,0.0,0.0,3.061862,0.0,0.0,0.0,2.041241,0.0,0.0,0.0,2.5,0.0,2.5,2.5
842,0.0,0.0,0.0,0.0,0.0,0.0,2.5,2.5,0.0,0.0,2.5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.25,0.0,0.0,0.0,0.0,2.5,0.0,0.0,2.5,2.5,0.0,0.0,2.672612,2.041241,2.5,2.5,2.5,0.0,0.0,0.0,0.0,0.0,0.0,2.5,2.5,0.0,2.5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.5,2.5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.572479,2.572479,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.118034,0.0,2.5,0.0,0.0,2.041241,0.0,2.5,0.668153,0.0,0.0,0.0,0.0,0.0,1.020621,0.0,0.0,0.0,0.0,2.672612,0.0,2.5,2.5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.857493,0.857493,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.5,2.041241,2.5,0.0,0.0,2.041241,2.041241,2.041241,2.041241,2.5,2.5,2.5,2.5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.5,2.5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.5,2.5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.082483,0.0,0.0,0.0,2.041241,2.5,2.5,0.0,0.0,0.0,0.0,0.0
1554,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.5,2.236068,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.5,2.5,2.5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.25,2.5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.5,2.5,2.5,0.0,0.0,0.0,2.5,0.0,0.0,0.0,0.0,0.680414,0.0,0.0,0.0,2.5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.857493,0.857493,0.0,0.0,0.0,2.041241,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.041241,2.5,0.0,1.336306,2.5,0.0,0.0,2.041241,0.0,0.0,2.5,2.5,0.0,0.0,1.336306,2.5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.680414,0.0,0.912871,0.0,0.0,0.0,0.0,2.5,2.5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.041241,2.041241,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.857493,0.0,1.25,0.5,0.0,0.0,2.041241,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.020621,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.25,0.0,0.0,0.0
1526,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.5,2.5,2.5,1.118034,2.041241,0.0,0.0,0.0,0.0,0.0,2.5,2.5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.5,2.5,0.0,0.0,1.25,0.0,2.5,0.0,0.0,0.0,2.5,2.5,0.0,0.0,2.5,2.5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.25,1.25,2.5,0.0,0.0,2.5,0.0,2.5,2.5,0.0,0.0,0.680414,0.0,0.0,0.0,0.0,2.5,0.0,0.0,0.0,0.0,0.0,2.5,2.5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.041241,2.5,0.0,0.0,0.0,2.5,2.5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.668153,0.0,0.0,0.0,0.0,0.0,1.020621,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.680414,0.0,0.912871,2.5,2.5,1.714986,1.714986,0.0,0.0,0.0,2.5,2.5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.5,2.5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.5,0.0,0.0,0.0,0.0,2.5,0.0,0.0,2.5,2.5,0.0,2.5,0.0,0.0,3.061862,2.5,0.0,2.5,0.0,0.0,0.0,0.0,0.0,2.5,0.0,0.0
497,2.5,2.5,2.5,2.5,2.5,2.5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.25,0.0,2.041241,0.0,0.0,2.5,2.5,2.5,0.0,0.0,0.0,0.0,0.0,2.5,2.5,2.5,2.5,2.5,2.5,2.5,2.5,2.5,2.5,0.0,0.0,0.0,0.0,1.25,0.0,0.0,1.25,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.336306,2.041241,0.0,0.0,0.0,0.0,0.0,2.5,2.5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.5,0.0,2.5,0.0,0.0,0.0,0.0,2.5,2.5,2.041241,2.5,0.0,0.857493,0.0,0.0,2.5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.5,2.5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.041241,0.0,2.5,2.041241,2.5,0.0,0.0,2.236068,2.041241,0.0,0.0,2.5,0.0,0.0,0.0,2.004459,0.0,2.5,0.0,0.0,2.5,1.020621,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.5,2.5,2.5,2.5,0.0,0.0,0.0,0.0,0.0,2.572479,2.572479,1.25,0.0,2.5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.041241,2.041241,0.0,0.0,0.0,0.0,0.0,0.0,2.5,2.5,0.0,0.0,2.5,2.5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.5,2.5,0.0,0.0,0.0,0.0,0.0,0.5,0.0,1.25,0.0,0.0,0.0,0.0,0.0,1.25,0.0,2.5,0.0,2.5,2.5,3.061862,0.0,2.5,0.0,2.041241,0.0,0.0,2.5,0.0,0.0,0.0,0.0


In [75]:
def spacy_pos_tagger(doc):
    content_words = ['NOUN', 'PROPN', 'VERB', 'ADJ', 'ADV']
    
    doc = nlp(doc)
    
    #remove ^p (bullet points)
    
    return ['__{}__'.format(w.pos_) if w.pos_ in content_words else w.pos_ for w in doc]  
    


vectorizer = CountVectorizer(ngram_range=(1,1), max_features=200, tokenizer=spacy_pos_tagger)
pos_ngrams = vectorizer.fit_transform(new_df['EssayText'])
print(pos_ngrams.shape)

#pd.DataFrame(X_train_counts.toarray(), columns=vectorizer.get_feature_names()).head()

scaled_pos_ngrams = StandardScaler(with_mean=False).fit_transform(pos_ngrams)
pos_ngrams_df = pd.DataFrame(scaled_pos_ngrams.toarray(), columns=vectorizer.get_feature_names(), index=new_df.index)
pos_ngrams_df.head()



(5, 11)




Unnamed: 0,ADP,CCONJ,DET,NUM,PART,PRON,PUNCT,__ADJ__,__ADV__,__NOUN__,__VERB__
1145,3.143093,2.672612,3.666178,0.0,3.429972,3.162278,1.336306,3.429972,2.572479,4.481291,4.313311
842,1.796053,1.336306,1.099853,0.0,3.429972,1.581139,0.890871,1.714986,2.572479,1.920553,1.617492
1554,0.449013,0.0,1.466471,0.0,0.857493,3.162278,1.336306,0.857493,0.0,1.920553,2.156655
1526,0.449013,1.336306,1.099853,0.0,2.572479,4.743416,0.890871,2.572479,0.857493,1.920553,1.887074
497,1.34704,0.0,1.099853,2.5,3.429972,3.162278,3.563483,0.857493,1.714986,2.240645,3.234983


# Concatenate all the features

In [80]:
merge_1 = pd.merge(man_features_df, pos_ngrams_df, how='inner',left_index=True, right_index=True, copy=True)
merge_2 = pd.merge(merge_1, text_ngrams_df, how='inner',left_index=True, right_index=True, copy=True)

In [81]:
merge_2

Unnamed: 0,total_words,avg_word_length,maturity,concreteness,ADP,CCONJ,DET,NUM,PART,PRON,PUNCT,__ADJ__,__ADV__,__NOUN__,__VERB__,) __NOUN__,) __NOUN__ __VERB__,) __VERB__,) __VERB__ __ADV__,) i,) i __VERB__,", __ADV__",", __ADV__ __ADJ__",", __VERB__",", __VERB__ the",", and",-pron- __NOUN__,-pron- __NOUN__ .,-pron- __VERB__,-pron- __VERB__ __VERB__,-pron- __VERB__ to,. -pron-,. -pron- __VERB__,. ^p,. ^p 2,. ^p 3,. __NOUN__,. __NOUN__ __VERB__,. __VERB__,. __VERB__ __NOUN__,. __VERB__ a,1 ),1 ) i,2 ),2 ) __NOUN__,3 ),3 ) __VERB__,^p 2,^p 2 ),^p 3,^p 3 ),__ADJ__ __ADJ__,__ADJ__ __ADJ__ and,__ADJ__ __ADV__,__ADJ__ __ADV__ the,__ADJ__ __NOUN__,__ADJ__ __NOUN__ -pron-,__ADJ__ __NOUN__ .,__ADJ__ __NOUN__ __VERB__,__ADJ__ __NOUN__ of,__ADJ__ __NOUN__ to,__ADJ__ and,__ADJ__ and __ADJ__,__ADJ__ of,__ADJ__ of each,__ADV__ -pron-,__ADV__ -pron- __VERB__,__ADV__ __ADJ__,__ADV__ __ADJ__ __NOUN__,__ADV__ __ADJ__ of,__ADV__ __ADV__,__ADV__ __ADV__ __ADJ__,__ADV__ __VERB__,__ADV__ __VERB__ the,__ADV__ a,__ADV__ a __NOUN__,__ADV__ the,__ADV__ the __ADJ__,"__NOUN__ ,","__NOUN__ , __ADV__","__NOUN__ , __VERB__","__NOUN__ , and",__NOUN__ -pron-,__NOUN__ -pron- __VERB__,__NOUN__ .,__NOUN__ . -pron-,__NOUN__ . ^p,__NOUN__ . __NOUN__,__NOUN__ . __VERB__,__NOUN__ __ADV__,__NOUN__ __ADV__ -pron-,__NOUN__ __NOUN__,__NOUN__ __NOUN__ in,__NOUN__ __VERB__,__NOUN__ __VERB__ -pron-,__NOUN__ __VERB__ __ADV__,__NOUN__ __VERB__ __VERB__,__NOUN__ __VERB__ a,__NOUN__ __VERB__ the,__NOUN__ __VERB__ to,__NOUN__ and,__NOUN__ and __NOUN__,__NOUN__ because,__NOUN__ because the,__NOUN__ for,__NOUN__ for this,__NOUN__ i,__NOUN__ i __VERB__,__NOUN__ in,__NOUN__ in this,__NOUN__ of,__NOUN__ of __NOUN__,__NOUN__ of the,__NOUN__ or,__NOUN__ or __NOUN__,__NOUN__ so,__NOUN__ so the,__NOUN__ to,__NOUN__ to __VERB__,__VERB__ -pron-,__VERB__ -pron- __NOUN__,__VERB__ -pron- __VERB__,__VERB__ .,__VERB__ . ^p,__VERB__ __ADJ__,__VERB__ __ADJ__ __ADJ__,__VERB__ __ADV__,__VERB__ __ADV__ __ADJ__,__VERB__ __ADV__ __ADV__,__VERB__ __ADV__ __VERB__,__VERB__ __ADV__ a,__VERB__ __NOUN__,__VERB__ __NOUN__ -pron-,__VERB__ __NOUN__ i,__VERB__ __VERB__,__VERB__ __VERB__ .,__VERB__ __VERB__ __VERB__,__VERB__ __VERB__ by,__VERB__ __VERB__ in,__VERB__ __VERB__ into,__VERB__ __VERB__ to,__VERB__ a,__VERB__ a __NOUN__,__VERB__ by,__VERB__ by the,__VERB__ in,__VERB__ in __NOUN__,__VERB__ in each,__VERB__ in the,__VERB__ in to,__VERB__ into,__VERB__ into the,__VERB__ out,__VERB__ out through,__VERB__ the,__VERB__ the __ADJ__,__VERB__ the __NOUN__,__VERB__ this,__VERB__ this __NOUN__,__VERB__ to,__VERB__ to __VERB__,a __NOUN__,a __NOUN__ .,a __NOUN__ __NOUN__,and __ADJ__,and __ADJ__ __NOUN__,and __NOUN__,and __NOUN__ of,because the,because the __NOUN__,by the,by the __NOUN__,each __NOUN__,"each __NOUN__ ,",each __NOUN__ to,for this,for this __NOUN__,i __VERB__,i __VERB__ __VERB__,in __NOUN__,in __NOUN__ to,in each,in each __NOUN__,in the,in the __NOUN__,in this,in this __NOUN__,in to,in to each,into the,into the __NOUN__,of __NOUN__,of __NOUN__ __VERB__,of __NOUN__ because,of each,of each __NOUN__,of the,of the __NOUN__,or __NOUN__,or __NOUN__ of,out through,out through __NOUN__,so the,so the __ADJ__,the __ADJ__,the __ADJ__ __ADV__,the __ADJ__ __NOUN__,the __NOUN__,"the __NOUN__ ,",the __NOUN__ .,the __NOUN__ __VERB__,the __NOUN__ and,the __NOUN__ for,the __NOUN__ or,the __NOUN__ so,this __NOUN__,this __NOUN__ -pron-,this __NOUN__ .,this __NOUN__ __ADV__,through __NOUN__,through __NOUN__ __VERB__,to __VERB__,to __VERB__ -pron-,to __VERB__ .,to __VERB__ __ADJ__,to __VERB__ __ADV__,to __VERB__ __NOUN__,to __VERB__ in,to __VERB__ out,to __VERB__ the,to __VERB__ this,to each,to each __NOUN__
1145,1.854099,0.018321,-1.353381,-1.793126,3.143093,2.672612,3.666178,0.0,3.429972,3.162278,1.336306,3.429972,2.572479,4.481291,4.313311,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.5,2.5,0.0,0.0,0.0,2.5,2.236068,0.0,2.5,2.5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.5,2.5,3.75,0.0,0.0,2.5,2.5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.336306,2.041241,0.0,0.0,0.0,2.5,2.5,0.0,0.0,2.5,2.5,1.25,0.0,2.5,0.0,0.0,0.0,2.5,2.5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.721655,0.0,2.5,2.572479,0.0,0.0,0.0,2.5,2.5,2.5,2.5,0.0,0.0,0.0,0.0,0.0,0.0,2.5,2.5,2.5,2.5,2.5,2.5,2.5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.236068,2.041241,0.0,2.5,0.0,0.0,0.0,0.0,3.340766,0.0,0.0,2.5,2.041241,0.0,3.061862,0.0,0.0,2.5,2.5,1.336306,0.0,0.0,0.0,2.5,0.0,0.0,0.0,0.0,2.721655,2.5,2.738613,0.0,0.0,2.572479,2.572479,0.0,0.0,0.0,0.0,0.0,2.5,2.5,2.5,2.5,2.5,2.5,1.25,2.041241,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.5,2.5,0.0,0.0,2.5,2.5,2.5,0.0,0.0,2.5,2.5,2.5,2.5,0.0,0.0,2.5,2.5,2.572479,2.5,2.5,3.0,0.0,2.5,2.041241,2.5,0.0,2.5,2.5,0.0,0.0,0.0,0.0,0.0,0.0,3.061862,0.0,0.0,0.0,2.041241,0.0,0.0,0.0,2.5,0.0,2.5,2.5
842,-0.463525,-0.412628,-0.952675,-0.239957,1.796053,1.336306,1.099853,0.0,3.429972,1.581139,0.890871,1.714986,2.572479,1.920553,1.617492,0.0,0.0,0.0,0.0,0.0,0.0,2.5,2.5,0.0,0.0,2.5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.25,0.0,0.0,0.0,0.0,2.5,0.0,0.0,2.5,2.5,0.0,0.0,2.672612,2.041241,2.5,2.5,2.5,0.0,0.0,0.0,0.0,0.0,0.0,2.5,2.5,0.0,2.5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.5,2.5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.572479,2.572479,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.118034,0.0,2.5,0.0,0.0,2.041241,0.0,2.5,0.668153,0.0,0.0,0.0,0.0,0.0,1.020621,0.0,0.0,0.0,0.0,2.672612,0.0,2.5,2.5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.857493,0.857493,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.5,2.041241,2.5,0.0,0.0,2.041241,2.041241,2.041241,2.041241,2.5,2.5,2.5,2.5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.5,2.5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.5,2.5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.082483,0.0,0.0,0.0,2.041241,2.5,2.5,0.0,0.0,0.0,0.0,0.0
1554,-0.970505,0.361831,0.323524,0.431064,0.449013,0.0,1.466471,0.0,0.857493,3.162278,1.336306,0.857493,0.0,1.920553,2.156655,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.5,2.236068,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.5,2.5,2.5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.25,2.5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.5,2.5,2.5,0.0,0.0,0.0,2.5,0.0,0.0,0.0,0.0,0.680414,0.0,0.0,0.0,2.5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.857493,0.857493,0.0,0.0,0.0,2.041241,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.041241,2.5,0.0,1.336306,2.5,0.0,0.0,2.041241,0.0,0.0,2.5,2.5,0.0,0.0,1.336306,2.5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.680414,0.0,0.912871,0.0,0.0,0.0,0.0,2.5,2.5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.041241,2.041241,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.857493,0.0,1.25,0.5,0.0,0.0,2.041241,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.020621,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.25,0.0,0.0,0.0
1526,-0.608376,1.548875,1.300203,1.160223,0.449013,1.336306,1.099853,0.0,2.572479,4.743416,0.890871,2.572479,0.857493,1.920553,1.887074,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.5,2.5,2.5,1.118034,2.041241,0.0,0.0,0.0,0.0,0.0,2.5,2.5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.5,2.5,0.0,0.0,1.25,0.0,2.5,0.0,0.0,0.0,2.5,2.5,0.0,0.0,2.5,2.5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.25,1.25,2.5,0.0,0.0,2.5,0.0,2.5,2.5,0.0,0.0,0.680414,0.0,0.0,0.0,0.0,2.5,0.0,0.0,0.0,0.0,0.0,2.5,2.5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.041241,2.5,0.0,0.0,0.0,2.5,2.5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.668153,0.0,0.0,0.0,0.0,0.0,1.020621,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.680414,0.0,0.912871,2.5,2.5,1.714986,1.714986,0.0,0.0,0.0,2.5,2.5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.5,2.5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.5,0.0,0.0,0.0,0.0,2.5,0.0,0.0,2.5,2.5,0.0,2.5,0.0,0.0,3.061862,2.5,0.0,2.5,0.0,0.0,0.0,0.0,0.0,2.5,0.0,0.0
497,0.188307,-1.516399,0.682329,0.441797,1.34704,0.0,1.099853,2.5,3.429972,3.162278,3.563483,0.857493,1.714986,2.240645,3.234983,2.5,2.5,2.5,2.5,2.5,2.5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.25,0.0,2.041241,0.0,0.0,2.5,2.5,2.5,0.0,0.0,0.0,0.0,0.0,2.5,2.5,2.5,2.5,2.5,2.5,2.5,2.5,2.5,2.5,0.0,0.0,0.0,0.0,1.25,0.0,0.0,1.25,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.336306,2.041241,0.0,0.0,0.0,0.0,0.0,2.5,2.5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.5,0.0,2.5,0.0,0.0,0.0,0.0,2.5,2.5,2.041241,2.5,0.0,0.857493,0.0,0.0,2.5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.5,2.5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.041241,0.0,2.5,2.041241,2.5,0.0,0.0,2.236068,2.041241,0.0,0.0,2.5,0.0,0.0,0.0,2.004459,0.0,2.5,0.0,0.0,2.5,1.020621,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.5,2.5,2.5,2.5,0.0,0.0,0.0,0.0,0.0,2.572479,2.572479,1.25,0.0,2.5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.041241,2.041241,0.0,0.0,0.0,0.0,0.0,0.0,2.5,2.5,0.0,0.0,2.5,2.5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.5,2.5,0.0,0.0,0.0,0.0,0.0,0.5,0.0,1.25,0.0,0.0,0.0,0.0,0.0,1.25,0.0,2.5,0.0,2.5,2.5,3.061862,0.0,2.5,0.0,2.041241,0.0,0.0,2.5,0.0,0.0,0.0,0.0


In [None]:
pd.merge(left, right, how='inner', on=None, left_on=None, right_on=None,
         left_index=False, right_index=False, sort=True,
         suffixes=('_x', '_y'), copy=True, indicator=False,
         validate=None)

In [None]:
def spacy_pos_tagger(doc):
    content_words = ['NOUN', 'PROPN', 'VERB', 'ADJ', 'ADV']
    
    doc = nlp(doc)
    
    #remove ^p (bullet points)
    
    return ['__{}__'.format(w.pos_) if w.pos_ in content_words else w.pos_ for w in doc]  
    


vectorizer = CountVectorizer(ngram_range=(1,1), max_features=200, tokenizer=spacy_pos_tagger)
X_train_counts = vectorizer.fit_transform(train.clean_text)
print(X_train_counts.shape)

pd.DataFrame(X_train_counts.toarray(), columns=vectorizer.get_feature_names()).head()


In [None]:
content_words = ['NOUN', 'PROPN', 'VERB', 'ADJ', 'ADV']
tokens = []
sentences = []
word_count = []
word_len = []
tf_text = []

for doc in nlp.pipe(new_df['EssayText'], batch_size=50, n_threads=4):
        
    if doc.is_parsed:
        #Add placeholders for CONTENT words, else parse as usual
        tokens.append(['__{}__'.format(w.pos_) if w.pos_ in content_words else w.lemma_.lower() for w in doc])
        #tf_text.append((' '.join(t for t in tokens)))
         
        
#         sentences.append([sent.text for sent in doc.sents])
        #word_count.append(len([w for w in doc if not w.is_punct]))
        word_len.append(np.sum([len(w) for w in doc if not w.is_punct]))
    
    else:
        # We want to make sure that the lists of parsed results have the
        # same number of entries of the original Dataframe, so add some blanks in case the parse fails
        tokens.append(None)
        #pos.append(None)        
#         sentences.append(None)

    
#train['tokens'] = tokens
#train['pos'] = pos
#train['total_words'] = word_count
#train['avg_word_length'] = word_len
# X_train['lemmas'] = lemmas
# X_train['sentences'] = sentences

train.head()

In [None]:
train['clean_text'] = tf_text
train.head()

In [None]:
def get_numeric_features(df, col):
    
    content_words = ['NOUN', 'PROPN', 'VERB', 'ADJ', 'ADV']
    tokens = []
    pos = []
    #sentences = []
    word_count = []
    word_len = []
    #tf_text = []
    
    for doc in nlp.pipe(df[col], batch_size=50, n_threads=4, disable=['ner']):
        
        if doc.is_parsed:
            #Add placeholders for CONTENT words, else parse as usual
            tokens.append(['__{}__'.format(w.pos_) if w.pos_ in content_words else w.lemma_.lower() for w in doc])
            
            #tf_text.append('__{}__'.format(w.pos_) if w.pos_ in content_words else w.lemma_.lower() for w in doc)
        
            #pos.append([n.pos_ for n in doc])
            #sentences.append([sent.text for sent in doc.sents])
            
            word_count.append(len([w for w in doc if not w.is_punct]))
            word_len.append(np.sum([len(w) for w in doc if not w.is_punct]))
    
        else:
            
            # We want to make sure that the lists of parsed results have the
            # same number of entries of the original Dataframe, so add some blanks in case the parse fails

            tokens.append(None)
            pos.append(None)        
            #sentences.append(None)
            word_count.append(None)
            word_len.append(None)
    
     
    #df['tokens'] = tokens
    #df['pos'] = pos
    df['total_words'] = word_count
    df['avg_word_length'] = word_len
    #df['lemmas'] = lemmas
    #df['sentences'] = sentences

    return df
    

In [None]:
test = get_numeric_features(train, "EssayText")
test.head()

In [None]:
def spacy_tokenizer(doc):
    content_words = ['NOUN', 'PROPN', 'VERB', 'ADJ', 'ADV']
    
    doc = nlp(doc)
    
    #remove ^p (bullet points)
    
    return ['__{}__'.format(w.pos_) if w.pos_ in content_words else w.lemma_.lower() for w in doc]  
    


vectorizer = CountVectorizer(tokenizer=spacy_tokenizer, ngram_range=(2,3), max_features=2000)
X_train_counts = vectorizer.fit_transform(text)

print(X_train_counts.shape)

#print(count_vect.vocabulary_)
pd.DataFrame(X_train_counts.toarray(), columns=vectorizer.get_feature_names()).head(10)

In [None]:
def spacy_pos_tagger(doc):
    content_words = ['NOUN', 'PROPN', 'VERB', 'ADJ', 'ADV']
    
    doc = nlp(doc)
    
    #remove ^p (bullet points)
    
    return ['__{}__'.format(w.pos_) if w.pos_ in content_words else w.pos_ for w in doc]  
    


vectorizer = CountVectorizer(ngram_range=(1,1), max_features=200, tokenizer=spacy_pos_tagger)
X_train_counts = vectorizer.fit_transform(train.clean_text)
print(X_train_counts.shape)

pd.DataFrame(X_train_counts.toarray(), columns=vectorizer.get_feature_names()).head()

In [None]:
vectorizer = CountVectorizer(ngram_range=(2,3), max_features=200, tokenizer=spacy_pos_tagger)
X_train_counts = vectorizer.fit_transform(train.clean_text)
X_train_counts.shape

#print(count_vect.vocabulary_)
pd.DataFrame(X_train_counts.toarray(), columns=vectorizer.get_feature_names()).head(10)

################################################
vectorizer = CountVectorizer(tokenizer=spacy_tokenizer, ngram_range=(2,3), max_features=2000)
X_train_counts = vectorizer.fit_transform(text)

print(X_train_counts.shape)

#print(count_vect.vocabulary_)
pd.DataFrame(X_train_counts.toarray(), columns=vectorizer.get_feature_names()).head(10)

## Maturity, Concreteness

In [None]:
#content_words = ['NOUN', 'PROPN', 'VERB', 'ADJ', 'ADV']
# new_df = train[['EssayText']][0:5]
# new_df = new_df.copy() 

def get_maturity(col):
    
    aoa_ratings_df = pd.read_csv(ratingsFolder/'AoA_Ratings.csv')
    aoa_ratings = dict(zip(aoa_ratings_df.Word, aoa_ratings_df.AoA))
    
    tokens = []
    maturity = []
    mat_tmp = []
    
    
    for doc in nlp.pipe(col, batch_size=50, n_threads=4, disable=['ner']):

        if doc.is_parsed:
            #Add placeholders for CONTENT words, else parse as usual. If -PRON- then add actual word else lemma.
            tokens.append([w.text.lower() if w.lemma_ == '-PRON-' else w.lemma_.lower() for w in doc])

            #maturity.append([value.get('name') for value in d.values()])

            mat_tmp.append([aoa_ratings[t] for a in tokens for t in a if t in aoa_ratings])
    
    #Now get avg maturity per doc
    for i in mat_tmp:
            avg = sum(i)/len(i)
            maturity.append(avg)

    return maturity

In [None]:
get_maturity(new_df['EssayText'])

In [None]:
def get_concreteness(col):
    
    conc_ratings_df = pd.read_csv(ratingsFolder/'Concreteness_Ratings.csv')
    conc_ratings = dict(zip(conc_ratings_df.Word, conc_ratings_df.Concreteness))
    
    tokens = []
    concreteness = []
    conc_tmp = []
    
    
    for doc in nlp.pipe(col, batch_size=50, n_threads=4, disable=['ner']):

        if doc.is_parsed:
            
            tokens.append([w.text.lower() for w in doc])
            conc_tmp.append([conc_ratings[t] for a in tokens for t in a if t in conc_ratings])
    
    #Now get avg concreteness per doc
    for i in conc_tmp:
        avg = sum(i)/len(i)
        concreteness.append(avg)

    return concreteness

In [None]:
get_concreteness(new_df['EssayText'])

## Words per T-unit

In [None]:
mydoc = (u'Apple is looking at buying U.K. startup for $1 billion. This is another sentence.')
mydoc

words_per_t_unit = []

for doc in nlp.pipe(new_df['EssayText']):
    tokens = []
    words = []
    
    for sent in doc.sents:
        #print(sent)
        tokens.append([w.text.lower() for w in sent if w.pos_ not in ['PUNCT','SYM','X','SPACE']])
        
    
    #Get number of words in a sentence
    for i in tokens:
        words.append(len(i))
    
    #Get avg words per sentence for the doc
    words_per_t_unit.append(sum(words)/len(words))

In [None]:
word_count