In [None]:
def spacy_tokenizer(doc):
    """Tokenize raw text into content-independent tokens.

    The text is run through the module-level ``nlp`` pipeline with the
    'ner' and 'textcat' components disabled. Each token is then mapped to:

    * ``__<POS>__`` when its ``pos_`` is NOUN, PROPN, VERB, ADJ or ADV,
    * its lower-cased, stripped surface text when its ``pos_`` is PRON,
    * its lower-cased, stripped lemma otherwise.

    Returns a list of strings, one per token.
    """
    placeholder_tags = ['NOUN', 'PROPN', 'VERB', 'ADJ', 'ADV']
    parsed = nlp(doc, disable=['ner', 'textcat'])

    tokens = []
    for word in parsed:
        tag = word.pos_
        if tag in placeholder_tags:
            # Hide the actual content word behind its part-of-speech tag
            tokens.append('__{}__'.format(tag))
        elif tag == 'PRON':
            tokens.append(word.text.lower().strip())
        else:
            tokens.append(word.lemma_.lower().strip())
    return tokens


def spacy_pos_tagger(doc):
    """Return the ``pos_`` tag of every token in ``doc`` as a list of strings.

    The text is parsed with the module-level ``nlp`` pipeline, with the
    'ner' and 'textcat' components disabled.
    """
    parsed = nlp(doc, disable=['ner', 'textcat'])

    tags = []
    for token in parsed:
        tags.append(token.pos_)
    return tags

def get_text_ngrams(col, tokenizer, ngram_range, max_features, scale_values):
    """Build bag-of-words n-gram count features for a collection of texts.

    Args:
        col: Iterable of raw documents, passed to
            ``CountVectorizer.fit_transform``.
        tokenizer: Callable turning one document into a list of tokens.
        ngram_range: ``(min_n, max_n)`` tuple forwarded to CountVectorizer.
        max_features: Vocabulary size cap forwarded to CountVectorizer.
        scale_values: When truthy, scale every feature to unit variance.

    Returns:
        Tuple ``(text_ngrams, ngram_columns)``: the (optionally scaled)
        document-term matrix and the list of feature names, in column order.
    """
    # Initialise vectoriser
    count_vectorizer = CountVectorizer(tokenizer=tokenizer,
                                       ngram_range=ngram_range,
                                       max_features=max_features)

    unscaled_text_ngrams = count_vectorizer.fit_transform(col)

    if scale_values:
        print('Scaling values')
        # CountVectorizer returns a sparse matrix and StandardScaler raises a
        # ValueError when asked to centre sparse input, so only scale to unit
        # variance (with_mean=False). This also keeps the result sparse, so
        # callers can still use .toarray() on it.
        scaler = StandardScaler(with_mean=False)
        text_ngrams = scaler.fit_transform(unscaled_text_ngrams.astype(float))
    else:
        text_ngrams = unscaled_text_ngrams

    # get_feature_names() was removed from recent scikit-learn releases in
    # favour of get_feature_names_out(); support both and always return a list.
    if hasattr(count_vectorizer, 'get_feature_names_out'):
        ngram_columns = list(count_vectorizer.get_feature_names_out())
    else:
        ngram_columns = count_vectorizer.get_feature_names()

    print(text_ngrams.shape)

    return text_ngrams, ngram_columns

def get_pos_ngrams(col, tokenizer, ngram_range, max_features, scale_values):
    """Build part-of-speech n-gram count features for a collection of texts.

    Args:
        col: Iterable of raw documents, passed to
            ``CountVectorizer.fit_transform``.
        tokenizer: Callable turning one document into a list of POS tags.
        ngram_range: ``(min_n, max_n)`` tuple forwarded to CountVectorizer.
        max_features: Vocabulary size cap forwarded to CountVectorizer.
        scale_values: When truthy, scale every feature to unit variance.

    Returns:
        Tuple ``(pos_ngrams, pos_ngram_columns)``: the (optionally scaled)
        document-term matrix and the list of feature names, in column order.
    """
    pos_vectorizer = CountVectorizer(tokenizer=tokenizer,
                                     ngram_range=ngram_range,
                                     max_features=max_features)

    unscaled_pos_ngrams = pos_vectorizer.fit_transform(col)

    if scale_values:
        print('Scaling values')
        # Scale the *unscaled* matrix (the previous code referenced
        # pos_ngrams before it was assigned). The matrix is sparse and
        # StandardScaler cannot centre sparse input, hence with_mean=False.
        scaler = StandardScaler(with_mean=False)
        pos_ngrams = scaler.fit_transform(unscaled_pos_ngrams.astype(float))
    else:
        pos_ngrams = unscaled_pos_ngrams

    # get_feature_names() was removed from recent scikit-learn releases in
    # favour of get_feature_names_out(); support both and always return a list.
    if hasattr(pos_vectorizer, 'get_feature_names_out'):
        pos_ngram_columns = list(pos_vectorizer.get_feature_names_out())
    else:
        pos_ngram_columns = pos_vectorizer.get_feature_names()

    print(pos_ngrams.shape)

    return pos_ngrams, pos_ngram_columns



In [None]:
def _mean_or_none(values):
    """Return the arithmetic mean of ``values``, or None when it is empty."""
    if not values:
        return None
    return sum(values) / len(values)


def get_custom_features(col, scale_values):
    """Compute handcrafted per-document features for a Series of texts.

    For every document the following columns are produced:
    ``total_words``, ``words_no_punct``, ``words_no_punct_no_stop``,
    ``count_content_words``, ``count_stopwords``, ``avg_word_len``,
    ``maturity`` (mean AoA rating of the document's rated tokens) and
    ``concreteness`` (mean concreteness rating of its rated tokens).

    Args:
        col: pandas Series of raw texts; its index becomes the result index.
        scale_values: When truthy, standardise all columns with
            StandardScaler before returning.

    Returns:
        DataFrame indexed like ``col`` with one column per feature. A value
        is None/NaN when the document was not parsed or when the feature is
        undefined for it (e.g. no token has a rating).
    """
    content_words = ['NOUN', 'PROPN', 'VERB', 'ADJ', 'ADV']
    col_names = ['total_words', 'words_no_punct', 'words_no_punct_no_stop',
                 'count_content_words', 'count_stopwords', 'avg_word_len',
                 'maturity', 'concreteness']

    # Word -> rating lookup tables
    aoa_ratings_df = pd.read_csv(ratingsFolder/'AoA_Ratings.csv')
    aoa_ratings = dict(zip(aoa_ratings_df.Word, aoa_ratings_df.AoA))

    conc_ratings_df = pd.read_csv(ratingsFolder/'Concreteness_Ratings.csv')
    conc_ratings = dict(zip(conc_ratings_df.Word, conc_ratings_df.Concreteness))

    # One list per output column, filled in document order
    features = {name: [] for name in col_names}

    # NOTE(review): n_threads and Doc.is_parsed look like spaCy 2.x APIs;
    # confirm the installed spaCy version still accepts them.
    for doc in nlp.pipe(col, batch_size=50, n_threads=4):

        if not doc.is_parsed:
            # Keep every column the same length as the input Series by
            # adding a blank entry when the parse fails.
            for values in features.values():
                values.append(None)
            continue

        words = [w for w in doc if not w.is_punct]

        features['total_words'].append(len(doc))
        features['words_no_punct'].append(len(words))
        features['words_no_punct_no_stop'].append(
            sum(1 for w in words if not w.is_stop))
        features['count_content_words'].append(
            sum(1 for w in doc if w.pos_ in content_words))
        features['count_stopwords'].append(
            sum(1 for w in doc if w.is_stop))
        # None (instead of ZeroDivisionError) for punctuation-only documents
        features['avg_word_len'].append(
            _mean_or_none([len(w) for w in words]))

        # Maturity: mean AoA rating over THIS document's tokens only.
        # Pronouns keep their surface form because their lemma is '-PRON-'.
        maturity_tokens = [
            w.text.lower().strip() if w.lemma_ == '-PRON-'
            else w.lemma_.lower().strip()
            for w in doc
        ]
        features['maturity'].append(_mean_or_none(
            [aoa_ratings[t] for t in maturity_tokens if t in aoa_ratings]))

        # Concreteness: mean rating over THIS document's surface forms only.
        conc_tokens = [w.text.lower().strip() for w in doc]
        features['concreteness'].append(_mean_or_none(
            [conc_ratings[t] for t in conc_tokens if t in conc_ratings]))

    features_df = pd.DataFrame(features, index=col.index, columns=col_names)

    if scale_values:
        # Scale the features computed here (not a global DataFrame) and keep
        # flat column names rather than a MultiIndex.
        scaled_features = StandardScaler().fit_transform(
            features_df[col_names].astype(float))
        features_df = pd.DataFrame(scaled_features, index=col.index,
                                   columns=col_names)

    return features_df

In [None]:
# Build the (unscaled) handcrafted feature table for the training essays
custom_features_df = get_custom_features(col=train['EssayText'], scale_values=False)
custom_features_df.head(n=5)

In [None]:
# Unigram counts over the tokens produced by spacy_tokenizer
# (vocabulary capped at 10000 features, no scaling)
text_ngrams, text_ngrams_columns = get_text_ngrams(
    col=train['EssayText'],
    tokenizer=spacy_tokenizer,
    ngram_range=(1, 1),
    max_features=10000,
    scale_values=False,
)

# Convert the count matrix to a dense DataFrame aligned with the training index
text_ngrams_df = pd.DataFrame(
    data=text_ngrams.toarray(),
    index=train.index,
    columns=text_ngrams_columns,
)
text_ngrams_df.head(n=5)

In [None]:
# Unigram counts over the POS tags produced by spacy_pos_tagger
# (vocabulary capped at 10000 features, no scaling)
pos_ngrams, pos_ngram_columns = get_pos_ngrams(
    col=train['EssayText'],
    tokenizer=spacy_pos_tagger,
    ngram_range=(1, 1),
    max_features=10000,
    scale_values=False,
)

# Convert the count matrix to a dense DataFrame aligned with the training index
pos_ngrams_df = pd.DataFrame(
    data=pos_ngrams.toarray(),
    index=train.index,
    columns=pos_ngram_columns,
)
pos_ngrams_df.head(n=5)

In [None]:
# Combine the three feature tables column-wise by inner-joining on the index:
# handcrafted features + POS n-grams first, then the text n-grams.
merged_1 = custom_features_df.merge(
    pos_ngrams_df,
    how='inner',
    left_index=True,
    right_index=True,
    copy=True,
)
merged_2 = merged_1.merge(
    text_ngrams_df,
    how='inner',
    left_index=True,
    right_index=True,
    copy=True,
)

# Column labels of the combined feature matrix
feature_names = merged_2.columns

print(merged_2.shape)
merged_2.head(n=5)

In [None]:
# Frequency-distribution plot of the text n-gram features
# (n=30 features shown, orient='v', drawn in green).
# Optional manual figure setup, currently disabled:
# fig = plt.figure(figsize=(10,8))
# ax = fig.add_subplot(111)
from yellowbrick.text import FreqDistVisualizer
visualizer = FreqDistVisualizer(features=text_ngrams_columns, orient='v',n=30, color='green')
visualizer.fit(text_ngrams)

# Call finalize to draw the final yellowbrick-specific elements
visualizer.finalize()
#visualizer.poof()
# Get access to the axes object and modify labels.
# NOTE(review): the title is set after finalize(), presumably so that it
# replaces the title finalize() draws — confirm.
visualizer.ax.set_title("Frequency Distribution of Content Independent uni grams")
#visualizer.ax.set_xlabel("Xlabel")
#visualizer.ax.set_ylabel("yLabel")

#visualizer.ax.legend(bbox_to_anchor=(1.05, 1), loc=2, borderaxespad=0.)


In [None]:
# Frequency-distribution plot of the POS-tag features (n=16, orient='v')
#plt.figure(figsize=(12,10))
visualizer = FreqDistVisualizer(
    n=16,
    orient='v',
    features=pos_ngram_columns,
)
visualizer.fit(pos_ngrams)
visualizer.poof()

## Words per T-unit

https://github.com/explosion/spaCy/issues/252

http://mlreference.com/dependency-tree-spacy

http://grammar.yourdictionary.com/grammar-rules-and-tips/grammar-clause.html

https://shirishkadam.com/2016/12/23/dependency-parsing-in-nlp/

https://stackoverflow.com/questions/36610179/how-to-get-the-dependency-tree-with-spacy

In [None]:
# Sample text kept for ad-hoc experiments with the parser
mydoc = (u'Apple is looking at buying U.K. startup for $1 billion. This is another sentence.')

# Average number of words per sentence for every essay.
# NOTE: sentences (doc.sents) are used here as the unit of measurement.
words_per_t_unit = []

for doc in nlp.pipe(new_df['EssayText']):
    # Number of word tokens in each sentence; punctuation, symbols,
    # unknown ('X') and whitespace tokens are not counted as words.
    words = [
        sum(1 for w in sent if w.pos_ not in ['PUNCT', 'SYM', 'X', 'SPACE'])
        for sent in doc.sents
    ]

    # Get avg words per sentence for the doc. A document without any
    # sentence (e.g. empty text) gets a None placeholder instead of raising
    # ZeroDivisionError, so the list stays aligned with the input rows.
    words_per_t_unit.append(sum(words) / len(words) if words else None)