In [79]:
import numpy as np
import pandas as pd
import scipy
import math
from matplotlib import pyplot as plt
%matplotlib inline
import seaborn as sns
import re

from collections import Counter
import time
import sys
from datetime import datetime as dt

from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn import ensemble
from sklearn.linear_model import LogisticRegression

import nltk
from nltk.corpus import gutenberg, stopwords
import spacy

## Project

Data cleaning / processing / language parsing

Create features using two different NLP methods: For example, BoW vs tf-idf.

Use the features to fit supervised learning models for each feature set to predict the 
category outcomes.

Assess your models using cross-validation and determine whether one model performed better.

Pick one of the models and try to increase accuracy by at least 5 percentage points.


In [80]:
def record_start_time(t = dt.now):
    print('start time {}'.format(t))
    return t

In [81]:
def record_end_time(starttime, t = dt.now):
    print('end time {}'.format(t))
    print('Total time passed is {}'.format(t - starttime))
    return t

In [82]:
def word_frequencies(x, include_stop=True):
    words = []
    for token in x:
        if not token.is_punct and (not token.is_stop or include_stop):
            words.append(token.text)
            
    return Counter(words)

In [83]:
def lemma_frequencies(text, include_stop=True):
    lemmas = []
    for token in text:
        if not token.is_punct and (not token.is_stop or include_stop):
            lemmas.append(token.lemma_)
    
    return Counter(lemmas)

In [84]:
def text_cleaner(x):
    x = re.sub(r'--', ' ', x)
    x = re.sub(r'\*+', ' ', x)
    x = re.sub("[\[].*?[\]]", "", x)
    x = ' '.join(x.split())
    return x

In [85]:
def bag_of_words(text, nsize, include_stop):
    container = lemma_frequencies(text, include_stop).most_common(nsize)
    return [item[0] for item in container]  

In [86]:
def bow_of_features(sentences, common_words):
    df = pd.DataFrame(columns=common_words)
    df['text_sentence'] = sentences[0]
    df['text_source'] = sentences[1]
    df.loc[:, common_words] = 0
    
    for i, sentence in enumerate(df['text_sentence']):
        words = [token.lemma_
                 for token in sentence
                 if (
                     not token.is_punct
                     and not token.is_stop
                     and token.lemma_ in common_words
                 )]
    
        for word in words:
            df.loc[i, word] += 1
        
        if i % 500 == 0:
            print('Processing row {}'.format(i))
    
    return df

In [87]:
def stat_words_per_sentence(sentences, df):
    words = []
    for ss in sentences[0]:
        words.append(len(ss)) 
        
    df_local = pd.DataFrame(words, columns=['entity'])
    df['count'] = df_local.describe().loc['count', 'entity']
    df['25perc'] = df_local.describe().loc['25%', 'entity']
    df['50perc'] = df_local.describe().loc['50%', 'entity']
    df['75perc'] = df_local.describe().loc['75%', 'entity']
    df['max'] = df_local.describe().loc['max', 'entity']

In [88]:
def tfidf_of_features(sentences, common_words):
    df = pd.DataFrame(columns=common_words)
    df['text_sentence'] = sentences[0]
    df['text_source'] = sentences[1]
    df.loc[:, common_words] = 0
    # tf-idf
    for w in common_words:
        df[w + '-tfidf'] = 0
    
    for i, sentence in enumerate(df['text_sentence']):
        words = [token.lemma_
                 for token in sentence
                 if (
                     not token.is_punct
                     and not token.is_stop
                     and token.lemma_ in common_words
                 )]
    
        for word in words:
            df.loc[i, word] += 1
        
        for w in words:
            tf = df.loc[i, w]
            idf = math.log2(1/tf) if tf != 0 else 0
            col = w + '-tfidf'
            df.loc[i, col] = tf * idf 
        
        if i % 1000 == 0:
            print('Processing row {}'.format(i))
    
    return df

In [89]:
# Grab and Process the raw data
print(gutenberg.fileids())

chesterton = gutenberg.raw('chesterton-brown.txt')
austen = gutenberg.raw('austen-persuasion.txt')

['austen-emma.txt', 'austen-persuasion.txt', 'austen-sense.txt', 'bible-kjv.txt', 'blake-poems.txt', 'bryant-stories.txt', 'burgess-busterbrown.txt', 'carroll-alice.txt', 'chesterton-ball.txt', 'chesterton-brown.txt', 'chesterton-thursday.txt', 'edgeworth-parents.txt', 'melville-moby_dick.txt', 'milton-paradise.txt', 'shakespeare-caesar.txt', 'shakespeare-hamlet.txt', 'shakespeare-macbeth.txt', 'whitman-leaves.txt']


In [90]:
# cleaning Data
# removing Chapter
chesterton = re.sub(r'[I]*[V]*[X]*\. ', '', chesterton)
austen = re.sub(r'Chapter \d+', '', austen)
chesterton = text_cleaner(chesterton)
austen = text_cleaner(austen)

In [91]:
nlp = spacy.load('en_core_web_sm')
chesterton_doc = nlp(chesterton)
austen_doc = nlp(austen)
print('done')

done


In [92]:
# group into sentences
chesterton_sent = [[sent, 'chesterton'] for sent in chesterton_doc.sents]
austen_sent = [[sent, 'austen'] for sent in austen_doc.sents]
sentences = pd.DataFrame(chesterton_sent + austen_sent)
sentences.head(3)

Unnamed: 0,0,1
0,"(The, Absence, of, Mr, Glass, THE, consulting,...",chesterton
1,"(In, such, a, place, the, sea, had, something,...",chesterton
2,"(It, must, not, be, supposed, that, Dr, Hood, ...",chesterton


## Now use bag_of_words modelling

In [93]:
# constructing common_words
chesterton_common = bag_of_words(chesterton_doc, 40, False)
austen_common = bag_of_words(austen_doc, 40, False)

words_common = [x for x in (set(chesterton_common) - set(austen_common))]
for y in (set(austen_common) - set(chesterton_common)):
    words_common.append(y)

In [98]:
# constructing Dataset
features = bow_of_features(sentences, words_common)
# add words per sentence statistics
stat_words_per_sentence(sentences, features)
features.head(3)

Processing row 0
Processing row 500
Processing row 1000
Processing row 1500
Processing row 2000
Processing row 2500
Processing row 3000
Processing row 3500
Processing row 4000
Processing row 4500
Processing row 5000
Processing row 5500
Processing row 6000
Processing row 6500


Unnamed: 0,priest,face,long,flambeau,brown,tell,old,and,door,be,...,good,musgrove,mary,text_sentence,text_source,count,25perc,50perc,75perc,max
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,"(The, Absence, of, Mr, Glass, THE, consulting,...",chesterton,6590.0,12.0,21.0,36.0,227.0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,"(In, such, a, place, the, sea, had, something,...",chesterton,6590.0,12.0,21.0,36.0,227.0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,"(It, must, not, be, supposed, that, Dr, Hood, ...",chesterton,6590.0,12.0,21.0,36.0,227.0


## start modelling

In [99]:
Y = features['text_source']
X = features.drop(['text_source', 'text_sentence'], 1)

# RandomForestModel
rfc = ensemble.RandomForestClassifier()

X_train, X_test, Y_train, Y_test = train_test_split(X,
                                                    Y,
                                                    test_size=0.55,
                                                    random_state=4)
rfc.fit(X_train, Y_train)

print('Training set score : {}'.format(rfc.score(X_train, Y_train)))
print('Test set score : {}'.format(rfc.score(X_test, Y_test)))



Training set score : 0.7935919055649241
Test set score : 0.7627586206896552


In [100]:
cross_val_score(rfc, X_train, Y_train, cv=4)

array([0.73584906, 0.76518219, 0.76653171, 0.76653171])

## Logical Regression model

In [101]:
lr = LogisticRegression()
lr.fit(X_train, Y_train)

print('Training set score : {}'.format(lr.score(X_train, Y_train)))
print('Test set score : {}'.format(lr.score(X_test, Y_test)))
cross_val_score(lr, X_train, Y_train, cv=4)



Training set score : 0.7672849915682968
Test set score : 0.7671724137931034




array([0.74528302, 0.76518219, 0.76518219, 0.76923077])

## 2nd approach, using tf-idf

In [107]:
# constructing Dataset
#del features
#features = pd.DataFrame()
features_2 = tfidf_of_features(sentences, words_common)
# add words per sentence statistics
stat_words_per_sentence(sentences, features_2)
features_2.head(10)

Processing row 0
Processing row 1000
Processing row 2000
Processing row 3000
Processing row 4000
Processing row 5000
Processing row 6000


Unnamed: 0,priest,face,long,flambeau,brown,tell,old,and,door,be,...,miss-tfidf,wentworth-tfidf,good-tfidf,musgrove-tfidf,mary-tfidf,count,25perc,50perc,75perc,max
0,0,0,0,0,0,0,0,0,0,0,...,0.0,0.0,0.0,0.0,0.0,6590.0,12.0,21.0,36.0,227.0
1,0,0,0,0,0,0,0,0,0,0,...,0.0,0.0,0.0,0.0,0.0,6590.0,12.0,21.0,36.0,227.0
2,0,0,0,0,0,0,0,0,0,0,...,0.0,0.0,0.0,0.0,0.0,6590.0,12.0,21.0,36.0,227.0
3,0,0,0,0,0,0,0,0,0,0,...,0.0,0.0,0.0,0.0,0.0,6590.0,12.0,21.0,36.0,227.0
4,0,0,0,0,0,0,0,0,0,0,...,0.0,0.0,0.0,0.0,0.0,6590.0,12.0,21.0,36.0,227.0
5,0,0,0,0,0,0,0,0,0,0,...,0.0,0.0,0.0,0.0,0.0,6590.0,12.0,21.0,36.0,227.0
6,0,0,0,0,0,0,0,0,0,0,...,0.0,0.0,0.0,0.0,0.0,6590.0,12.0,21.0,36.0,227.0
7,0,0,0,0,0,0,1,0,0,0,...,0.0,0.0,0.0,0.0,0.0,6590.0,12.0,21.0,36.0,227.0
8,0,0,0,0,0,0,0,0,0,0,...,0.0,0.0,0.0,0.0,0.0,6590.0,12.0,21.0,36.0,227.0
9,0,0,0,0,0,0,0,1,0,0,...,0.0,0.0,0.0,0.0,0.0,6590.0,12.0,21.0,36.0,227.0


## start modeling

In [108]:
Y = features_2['text_source']
X = features_2.drop(['text_source', 'text_sentence'], 1)

# RandomForestModel
rfc = ensemble.RandomForestClassifier()

X_train, X_test, Y_train, Y_test = train_test_split(X,
                                                    Y,
                                                    test_size=0.55,
                                                    random_state=4)
rfc.fit(X_train, Y_train)

print('Training set score : {}'.format(rfc.score(X_train, Y_train)))
print('Test set score : {}'.format(rfc.score(X_test, Y_test)))



Training set score : 0.7925801011804384
Test set score : 0.7638620689655172


In [109]:
cross_val_score(rfc, X_train, Y_train, cv=4)

array([0.74663073, 0.77192982, 0.76923077, 0.76653171])

## Logistic Regression

In [110]:
lr = LogisticRegression()
lr.fit(X_train, Y_train)

print('Training set score : {}'.format(lr.score(X_train, Y_train)))
print('Test set score : {}'.format(lr.score(X_test, Y_test)))
cross_val_score(lr, X_train, Y_train, cv=4)



Training set score : 0.7672849915682968
Test set score : 0.7671724137931034




array([0.74393531, 0.76248313, 0.76518219, 0.76923077])

## Conclusion:   Both models, Random forest or Logistic Regression,

    give comparable results. Changing features method, from bag of word to tf idf, don't 
    make much differences.