In [None]:
# importing required packages
from pathlib import Path

import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import seaborn as sns

import spacy
nlp = spacy.load('en')

from spacy import displacy
#from spacy.lang.en import English
#parser = English()

#from tqdm import tqdm


from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.base import BaseEstimator, TransformerMixin 

from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score, mean_squared_error, r2_score, cohen_kappa_score

np.random.seed(42)
%matplotlib inline

In [None]:
# set up display area to show dataframe in jupyter qtconsole

#pd.set_option('display.height', 1000)
#pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)
#pd.set_option('display.width', 1000)

pd.set_option('display.max_colwidth', -1)

In [None]:
myDir = Path.cwd().parents[0]
dataFolder = myDir / 'data/asap-sas'
ratingsFolder = myDir / 'data/ratings'

print(dataFolder)

gradeMap = {1: 10,
                2: 10,
                3: 10,
                4:10,
                5: 10,
                6: 10,
                7:10,
                8:10,
                9:10,
                10:8}

subjectMap = {1: 'Science',
            2: 'Science',
            3: 'English Language Arts',
            4: 'English Language Arts',
            5: 'Biology',
            6: 'Biology',
            7:'English',
            8:'English',
            9:'English',
            10:'Science'}

df = pd.read_csv(dataFolder/'train.tsv', sep='\t', header=0)  #read data into dataframe
df.drop('Score2', inplace=True, axis=1) #Score 2 is for inter-rate reliability only

df['subject'] = df['EssaySet'].map(subjectMap)
df['studentGrade'] = df['EssaySet'].map(gradeMap)

df = df[['Id','EssaySet','subject','studentGrade','EssayText','Score1']] #rearrange columns
df.head()

In [4]:
# Take only essay set 1
set_1 = df[(df['EssaySet'] == 1)].copy()
set_1.shape

(1672, 6)

In [5]:
del df

In [6]:
X = set_1[['EssayText','Score1']]
y = X.pop('Score1')


train, test, y_train, y_test = train_test_split(X, y, test_size=0.30, random_state=42)

#Adding the is_copy to False otherwise we get SettingWithCopyWarning
train = train.copy()
test = test.copy()

train.head()

Unnamed: 0,EssayText
1145,"You would need to know how much vinegar was put in to each sample, find the size and shape of the container so the same amount of vinegar was actually covering the sample. You would need to know the shape or volume of sample because the surface are has to be the same so the same amount is affected by the vinegar."
842,"In order to replicate experiment I would need to know exactly how much vinegar to pour in each container, how much of each sample to put in the container, and"
1554,The additional information you would need in order to replicate the experiment form a hypothesis. Draw a conclusion. Say what you are experimenting.
1526,To replicate this experiment you would need to state you problem. What is the person for this lab also you need to indicate your independent and dependent variables.
497,1) I would need to know how much vinegar is being put into the sample. ^p 2) What are they trying to find out through pass is to amount. ^p 3) Was there a control group in this experiment.


In [7]:
new_df = train[['EssayText']][0:5]
new_df = new_df.copy() 

new_df.head()

Unnamed: 0,EssayText
1145,"You would need to know how much vinegar was put in to each sample, find the size and shape of the container so the same amount of vinegar was actually covering the sample. You would need to know the shape or volume of sample because the surface are has to be the same so the same amount is affected by the vinegar."
842,"In order to replicate experiment I would need to know exactly how much vinegar to pour in each container, how much of each sample to put in the container, and"
1554,The additional information you would need in order to replicate the experiment form a hypothesis. Draw a conclusion. Say what you are experimenting.
1526,To replicate this experiment you would need to state you problem. What is the person for this lab also you need to indicate your independent and dependent variables.
497,1) I would need to know how much vinegar is being put into the sample. ^p 2) What are they trying to find out through pass is to amount. ^p 3) Was there a control group in this experiment.


## Feature Extraction

In [12]:
content_words = ['NOUN', 'PROPN', 'VERB', 'ADJ', 'ADV']
tokens = []
word_count = []
tmp_word_len = []
avg_word_len = []
X_train = pd.DataFrame()


for doc in nlp.pipe(new_df['EssayText'], batch_size=50, n_threads=4):
        
    if doc.is_parsed:
        
        #Add placeholders for CONTENT words, else parse as usual
        tokens.append(['__{}__'.format(w.pos_) if w.pos_ in content_words else w.lemma_.lower() for w in doc])
        
        #Count words which are not punctuation
        word_count.append(len([w for w in doc if not w.is_punct]))
        
        
        #word_len.append(np.sum([len(w) for w in doc if not w.is_punct]))
    
    else:
        # We want to make sure that the lists of parsed results have the
        # same number of entries of the original Dataframe, so add some blanks in case the parse fails

        tokens.append(None)
        word_count.append(None)
        word_len.append(None)
   
    



new_df['total_words'] = word_count
new_df['avg_word_length'] = word_len
# X_train['lemmas'] = lemmas
# X_train['sentences'] = sentences

new_df.head()

Unnamed: 0,EssayText,total_words,avg_word_length
1145,"You would need to know how much vinegar was put in to each sample, find the size and shape of the container so the same amount of vinegar was actually covering the sample. You would need to know the shape or volume of sample because the surface are has to be the same so the same amount is affected by the vinegar.",62,250
842,"In order to replicate experiment I would need to know exactly how much vinegar to pour in each container, how much of each sample to put in the container, and",30,127
1554,The additional information you would need in order to replicate the experiment form a hypothesis. Draw a conclusion. Say what you are experimenting.,23,123
1526,To replicate this experiment you would need to state you problem. What is the person for this lab also you need to indicate your independent and dependent variables.,28,136
497,1) I would need to know how much vinegar is being put into the sample. ^p 2) What are they trying to find out through pass is to amount. ^p 3) Was there a control group in this experiment.,39,144


In [19]:
for doc in nlp.pipe(new_df['EssayText'], batch_size=50, n_threads=4):
    word_count =[]    
    if doc.is_parsed:
        
        #Add placeholders for CONTENT words, else parse as usual
        #tokens.append(['__{}__'.format(w.pos_) if w.pos_ in content_words else w.lemma_.lower() for w in doc])
        
        #Count words which are not punctuation
        word_count.append([len(w.shape_) for w in doc if not w.is_punct])
        
        #print(word_count)
        
        for i in word_count:
            print(sum(i)/len(i))
        #word_len.append(np.sum([len(w) for w in doc if not w.is_punct]))
    

3.2580645161290325
3.2
3.3043478260869565
3.4642857142857144
3.051282051282051


In [None]:
#content_words = ['NOUN', 'PROPN', 'VERB', 'ADJ', 'ADV']
# new_df = train[['EssayText']][0:5]
# new_df = new_df.copy() 

def get_maturity(col):
    
    aoa_ratings_df = pd.read_csv(ratingsFolder/'AoA_Ratings.csv')
    aoa_ratings = dict(zip(aoa_ratings_df.Word, aoa_ratings_df.AoA))
    
    tokens = []
    maturity = []
    mat_tmp = []
    
    
    for doc in nlp.pipe(col, batch_size=50, n_threads=4, disable=['ner']):

        if doc.is_parsed:
            #Add placeholders for CONTENT words, else parse as usual. If -PRON- then add actual word else lemma.
            tokens.append([w.text.lower() if w.lemma_ == '-PRON-' else w.lemma_.lower() for w in doc])

            #maturity.append([value.get('name') for value in d.values()])

            mat_tmp.append([aoa_ratings[t] for a in tokens for t in a if t in aoa_ratings])
    
    #Now get avg maturity per doc
    for i in mat_tmp:
            avg = sum(i)/len(i)
            maturity.append(avg)

    return maturity

In [None]:
def get_concreteness(col):
    
    conc_ratings_df = pd.read_csv(ratingsFolder/'Concreteness_Ratings.csv')
    conc_ratings = dict(zip(conc_ratings_df.Word, conc_ratings_df.Concreteness))
    
    tokens = []
    concreteness = []
    conc_tmp = []
    
    
    for doc in nlp.pipe(col, batch_size=50, n_threads=4, disable=['ner']):

        if doc.is_parsed:
            
            tokens.append([w.text.lower() for w in doc])
            conc_tmp.append([conc_ratings[t] for a in tokens for t in a if t in conc_ratings])
    
    #Now get avg concreteness per doc
    for i in conc_tmp:
        avg = sum(i)/len(i)
        concreteness.append(avg)

    return concreteness

In [None]:
words_per_t_unit = []

for doc in nlp.pipe(new_df['EssayText']):
    tokens = []
    words = []
    
    for sent in doc.sents:
        #print(sent)
        tokens.append([w.text.lower() for w in sent if w.pos_ not in ['PUNCT','SYM','X','SPACE']])
        
    
    #Get number of words in a sentence
    for i in tokens:
        words.append(len(i))
    
    #Get avg words per sentence for the doc
    words_per_t_unit.append(sum(words)/len(words))

## BoW, N-grams (Tokens + POS)    

In [None]:
def spacy_tokenizer(doc):
    content_words = ['NOUN', 'PROPN', 'VERB', 'ADJ', 'ADV']
    
    doc = nlp(doc)
    
    #remove ^p (bullet points)
    
    return ['__{}__'.format(w.pos_) if w.pos_ in content_words else w.lemma_.lower() for w in doc]  
    


vectorizer = CountVectorizer(tokenizer=spacy_tokenizer, ngram_range=(2,3), max_features=2000)
X_train_counts = vectorizer.fit_transform(text)

print(X_train_counts.shape)

#print(count_vect.vocabulary_)
pd.DataFrame(X_train_counts.toarray(), columns=vectorizer.get_feature_names()).head(10)

In [None]:
def spacy_pos_tagger(doc):
    content_words = ['NOUN', 'PROPN', 'VERB', 'ADJ', 'ADV']
    
    doc = nlp(doc)
    
    #remove ^p (bullet points)
    
    return ['__{}__'.format(w.pos_) if w.pos_ in content_words else w.pos_ for w in doc]  
    


vectorizer = CountVectorizer(ngram_range=(1,1), max_features=200, tokenizer=spacy_pos_tagger)
X_train_counts = vectorizer.fit_transform(train.clean_text)
print(X_train_counts.shape)

pd.DataFrame(X_train_counts.toarray(), columns=vectorizer.get_feature_names()).head()


In [None]:
content_words = ['NOUN', 'PROPN', 'VERB', 'ADJ', 'ADV']
tokens = []
sentences = []
word_count = []
word_len = []
tf_text = []

for doc in nlp.pipe(new_df['EssayText'], batch_size=50, n_threads=4):
        
    if doc.is_parsed:
        #Add placeholders for CONTENT words, else parse as usual
        tokens.append(['__{}__'.format(w.pos_) if w.pos_ in content_words else w.lemma_.lower() for w in doc])
        #tf_text.append((' '.join(t for t in tokens)))
         
        
#         sentences.append([sent.text for sent in doc.sents])
        #word_count.append(len([w for w in doc if not w.is_punct]))
        word_len.append(np.sum([len(w) for w in doc if not w.is_punct]))
    
    else:
        # We want to make sure that the lists of parsed results have the
        # same number of entries of the original Dataframe, so add some blanks in case the parse fails
        tokens.append(None)
        #pos.append(None)        
#         sentences.append(None)

    
#train['tokens'] = tokens
#train['pos'] = pos
#train['total_words'] = word_count
#train['avg_word_length'] = word_len
# X_train['lemmas'] = lemmas
# X_train['sentences'] = sentences

train.head()

In [None]:
train['clean_text'] = tf_text
train.head()

In [None]:
def get_numeric_features(df, col):
    
    content_words = ['NOUN', 'PROPN', 'VERB', 'ADJ', 'ADV']
    tokens = []
    pos = []
    #sentences = []
    word_count = []
    word_len = []
    #tf_text = []
    
    for doc in nlp.pipe(df[col], batch_size=50, n_threads=4, disable=['ner']):
        
        if doc.is_parsed:
            #Add placeholders for CONTENT words, else parse as usual
            tokens.append(['__{}__'.format(w.pos_) if w.pos_ in content_words else w.lemma_.lower() for w in doc])
            
            #tf_text.append('__{}__'.format(w.pos_) if w.pos_ in content_words else w.lemma_.lower() for w in doc)
        
            #pos.append([n.pos_ for n in doc])
            #sentences.append([sent.text for sent in doc.sents])
            
            word_count.append(len([w for w in doc if not w.is_punct]))
            word_len.append(np.sum([len(w) for w in doc if not w.is_punct]))
    
        else:
            
            # We want to make sure that the lists of parsed results have the
            # same number of entries of the original Dataframe, so add some blanks in case the parse fails

            tokens.append(None)
            pos.append(None)        
            #sentences.append(None)
            word_count.append(None)
            word_len.append(None)
    
     
    #df['tokens'] = tokens
    #df['pos'] = pos
    df['total_words'] = word_count
    df['avg_word_length'] = word_len
    #df['lemmas'] = lemmas
    #df['sentences'] = sentences

    return df
    

In [None]:
test = get_numeric_features(train, "EssayText")
test.head()

In [None]:
def spacy_tokenizer(doc):
    content_words = ['NOUN', 'PROPN', 'VERB', 'ADJ', 'ADV']
    
    doc = nlp(doc)
    
    #remove ^p (bullet points)
    
    return ['__{}__'.format(w.pos_) if w.pos_ in content_words else w.lemma_.lower() for w in doc]  
    


vectorizer = CountVectorizer(tokenizer=spacy_tokenizer, ngram_range=(2,3), max_features=2000)
X_train_counts = vectorizer.fit_transform(text)

print(X_train_counts.shape)

#print(count_vect.vocabulary_)
pd.DataFrame(X_train_counts.toarray(), columns=vectorizer.get_feature_names()).head(10)

In [None]:
def spacy_pos_tagger(doc):
    content_words = ['NOUN', 'PROPN', 'VERB', 'ADJ', 'ADV']
    
    doc = nlp(doc)
    
    #remove ^p (bullet points)
    
    return ['__{}__'.format(w.pos_) if w.pos_ in content_words else w.pos_ for w in doc]  
    


vectorizer = CountVectorizer(ngram_range=(1,1), max_features=200, tokenizer=spacy_pos_tagger)
X_train_counts = vectorizer.fit_transform(train.clean_text)
print(X_train_counts.shape)

pd.DataFrame(X_train_counts.toarray(), columns=vectorizer.get_feature_names()).head()

In [None]:
vectorizer = CountVectorizer(ngram_range=(2,3), max_features=200, tokenizer=spacy_pos_tagger)
X_train_counts = vectorizer.fit_transform(train.clean_text)
X_train_counts.shape

#print(count_vect.vocabulary_)
pd.DataFrame(X_train_counts.toarray(), columns=vectorizer.get_feature_names()).head(10)

################################################
vectorizer = CountVectorizer(tokenizer=spacy_tokenizer, ngram_range=(2,3), max_features=2000)
X_train_counts = vectorizer.fit_transform(text)

print(X_train_counts.shape)

#print(count_vect.vocabulary_)
pd.DataFrame(X_train_counts.toarray(), columns=vectorizer.get_feature_names()).head(10)

## Maturity, Concreteness

In [None]:
#content_words = ['NOUN', 'PROPN', 'VERB', 'ADJ', 'ADV']
# new_df = train[['EssayText']][0:5]
# new_df = new_df.copy() 

def get_maturity(col):
    
    aoa_ratings_df = pd.read_csv(ratingsFolder/'AoA_Ratings.csv')
    aoa_ratings = dict(zip(aoa_ratings_df.Word, aoa_ratings_df.AoA))
    
    tokens = []
    maturity = []
    mat_tmp = []
    
    
    for doc in nlp.pipe(col, batch_size=50, n_threads=4, disable=['ner']):

        if doc.is_parsed:
            #Add placeholders for CONTENT words, else parse as usual. If -PRON- then add actual word else lemma.
            tokens.append([w.text.lower() if w.lemma_ == '-PRON-' else w.lemma_.lower() for w in doc])

            #maturity.append([value.get('name') for value in d.values()])

            mat_tmp.append([aoa_ratings[t] for a in tokens for t in a if t in aoa_ratings])
    
    #Now get avg maturity per doc
    for i in mat_tmp:
            avg = sum(i)/len(i)
            maturity.append(avg)

    return maturity

In [None]:
get_maturity(new_df['EssayText'])

In [None]:
def get_concreteness(col):
    
    conc_ratings_df = pd.read_csv(ratingsFolder/'Concreteness_Ratings.csv')
    conc_ratings = dict(zip(conc_ratings_df.Word, conc_ratings_df.Concreteness))
    
    tokens = []
    concreteness = []
    conc_tmp = []
    
    
    for doc in nlp.pipe(col, batch_size=50, n_threads=4, disable=['ner']):

        if doc.is_parsed:
            
            tokens.append([w.text.lower() for w in doc])
            conc_tmp.append([conc_ratings[t] for a in tokens for t in a if t in conc_ratings])
    
    #Now get avg concreteness per doc
    for i in conc_tmp:
        avg = sum(i)/len(i)
        concreteness.append(avg)

    return concreteness

In [None]:
get_concreteness(new_df['EssayText'])

## Words per T-unit

In [None]:
words = []
word_count = []

In [None]:
mydoc = (u'Apple is looking at buying U.K. startup for $1 billion. This is another sentence.')
mydoc

In [None]:
pd.set_option('display.max_colwidth', -1)

In [None]:

 
word_count = []

for doc in nlp.pipe(new_df['EssayText']):
    tokens = []
    words = []
    
    for sent in doc.sents:
        #print(sent)
        tokens.append([w.text.lower() for w in sent if w.pos_ not in ['PUNCT','SYM','X','SPACE']])
        
    
    #Get number of words in a sentence
    for i in tokens:
        words.append(len(i))
    
    #Get avg words per sentence for the doc
    word_count.append(sum(words)/len(words))

In [None]:
word_count