In [96]:
import numpy as np
import pandas as pd
import scipy
import seaborn as sns
from matplotlib import pyplot as plt
%matplotlib inline

from datetime import datetime as dt
from collections import Counter
import re
import spacy
import nltk
from nltk.corpus import gutenberg, stopwords

import sklearn
from sklearn import ensemble
from sklearn.linear_model import LogisticRegression

from sklearn.model_selection import train_test_split

In [97]:
# utility function
def text_cleaner(x):
    """Remove simple formatting artefacts from raw text and collapse whitespace.

    Parameters
    ----------
    x : str
        Raw text to clean.

    Returns
    -------
    str
        ``x`` with double dashes and runs of asterisks replaced by a space,
        single-line ``[...]`` spans removed, and every run of whitespace
        (including newlines) collapsed to a single space.
    """
    x = re.sub(r'--', ' ', x)
    x = re.sub(r'\*+', ' ', x)
    # Raw string on purpose: in a plain string literal "\[" and "\]" are
    # invalid escape sequences that newer Python versions warn about.
    # Note that `.` does not match newlines, so a bracketed span that crosses
    # a line break is left untouched.
    x = re.sub(r"[\[].*?[\]]", "", x)
    x = ' '.join(x.split())
    return x

In [98]:
def record_start_time(t = dt.now):
    """Print and return a start timestamp.

    Parameters
    ----------
    t : datetime or callable, optional
        Either a ready-made timestamp or a zero-argument callable producing
        one. Defaults to ``dt.now``, which is called at invocation time.

    Returns
    -------
    datetime
        The timestamp that was printed.
    """
    # The default is the *function* dt.now; it has to be called here,
    # otherwise the bound method itself is printed and returned.
    if callable(t):
        t = t()
    print('start time {}'.format(t))
    return t

In [99]:
def record_end_time(starttime, t = dt.now):
    """Print the end timestamp and the time elapsed since ``starttime``.

    Parameters
    ----------
    starttime : datetime
        Timestamp to measure from.
    t : datetime or callable, optional
        Either a ready-made end timestamp or a zero-argument callable
        producing one. Defaults to ``dt.now``, called at invocation time.

    Returns
    -------
    datetime
        The end timestamp that was printed.
    """
    # The default is the *function* dt.now; without calling it the
    # subtraction below fails with a TypeError.
    if callable(t):
        t = t()
    print('end time {}'.format(t))
    print('Total time passed is {}'.format(t - starttime))
    return t

In [100]:
def word_frequencies(x, include_stop=True):
    """Count the surface text of the tokens in ``x``.

    Punctuation tokens are always skipped; stop-word tokens are skipped
    only when ``include_stop`` is false.

    Parameters
    ----------
    x : iterable
        Tokens exposing ``is_punct``, ``is_stop`` and ``text``.
    include_stop : bool, optional
        Whether stop words are counted. Defaults to True.

    Returns
    -------
    collections.Counter
        Mapping of token text to number of occurrences.
    """
    return Counter(
        tok.text
        for tok in x
        if not tok.is_punct and (include_stop or not tok.is_stop)
    )

In [101]:
def lemma_frequencies(text, include_stop=True):
    """Count the lemmas of the tokens in ``text``.

    Punctuation tokens are always skipped; stop-word tokens are skipped
    only when ``include_stop`` is false.

    Parameters
    ----------
    text : iterable
        Tokens exposing ``is_punct``, ``is_stop`` and ``lemma_``.
    include_stop : bool, optional
        Whether stop words are counted. Defaults to True.

    Returns
    -------
    collections.Counter
        Mapping of lemma to number of occurrences.
    """
    counts = Counter()
    for tok in text:
        if tok.is_punct:
            continue
        if tok.is_stop and not include_stop:
            continue
        counts[tok.lemma_] += 1
    return counts

In [102]:
def stat_words_per_sentence(sentences, df):
    """Add summary statistics of sentence length to ``df`` in place.

    The length of every entry in ``sentences[0]`` is measured with ``len``
    and the distribution of those lengths is summarised. Each statistic is a
    single scalar, so every row of ``df`` receives the same value in the
    columns ``count``, ``25perc``, ``50perc``, ``75perc`` and ``max``.

    Parameters
    ----------
    sentences : pandas.DataFrame
        Frame whose column ``0`` holds the sentences (anything with ``len``).
    df : pandas.DataFrame
        Frame that is mutated: the five statistic columns are added or
        overwritten.

    Returns
    -------
    None
    """
    lengths = [len(ss) for ss in sentences[0]]

    # describe() is computed once instead of once per statistic. The explicit
    # float dtype keeps describe() on its numeric path (percentile rows
    # present) even when there are no sentences at all.
    stats = pd.Series(lengths, dtype='float64').describe()

    df['count'] = stats['count']
    df['25perc'] = stats['25%']
    df['50perc'] = stats['50%']
    df['75perc'] = stats['75%']
    df['max'] = stats['max']

In [103]:
def polysemy_frequencies(x, include_stop=True):
    """Count how often each part-of-speech tag occurs among the tokens in ``x``.

    Punctuation tokens are always skipped; stop-word tokens are skipped
    only when ``include_stop`` is false.

    Parameters
    ----------
    x : iterable
        Tokens exposing ``is_punct``, ``is_stop`` and ``pos_``.
    include_stop : bool, optional
        Whether stop words are counted. Defaults to True.

    Returns
    -------
    collections.Counter
        Mapping of POS tag (``token.pos_``) to number of occurrences.
    """
    # The previous version wrapped a {token: pos_string} dict in a Counter, so
    # the "counts" were tag strings rather than numbers. Counting the tags
    # themselves matches the other *_frequencies helpers.
    # NOTE(review): confirm that per-tag counts are the intended output.
    return Counter(
        token.pos_
        for token in x
        if not token.is_punct and (not token.is_stop or include_stop)
    )

In [104]:
def bow_of_polysemy(x, include_stop=True):
    """Unfinished helper: gathers inputs for a POS-based bag of words.

    NOTE(review): this function is a stub. It computes ``container`` and
    ``phrase_types`` but never uses them and implicitly returns ``None``.
    Either finish the implementation or remove it.

    Parameters
    ----------
    x : iterable
        Tokens, forwarded unchanged to ``polysemy_frequencies``.
    include_stop : bool, optional
        Forwarded unchanged to ``polysemy_frequencies``. Defaults to True.
    """
    container = polysemy_frequencies(x, include_stop)
    # POS tags presumably meant to become feature columns -- TODO confirm.
    phrase_types = ['NOUN', 'VERB', 'ADV', 'ADJ', 'ADP']


In [105]:
def bag_of_words(text, nsize, include_stop):
    """Return the ``nsize`` most frequent lemmas of ``text``.

    Parameters
    ----------
    text : iterable
        Tokens, forwarded to ``lemma_frequencies``.
    nsize : int
        Maximum number of lemmas to return.
    include_stop : bool
        Whether stop words take part in the ranking.

    Returns
    -------
    list
        Lemmas only (counts dropped), most frequent first.
    """
    ranked = lemma_frequencies(text, include_stop).most_common(nsize)
    top_lemmas = []
    for lemma, _count in ranked:
        top_lemmas.append(lemma)
    return top_lemmas

In [106]:
def bow_of_features(sentences, common_words):
    """Build a bag-of-words feature frame with one row per sentence.

    Parameters
    ----------
    sentences : pandas.DataFrame
        Column ``0`` holds the sentences (iterables of tokens exposing
        ``is_punct``, ``is_stop`` and ``lemma_``); column ``1`` holds the
        source label of each sentence.
    common_words : iterable of str
        Lemmas to count; each becomes one integer column.
        Assumed to contain no duplicates.

    Returns
    -------
    pandas.DataFrame
        One column per entry of ``common_words`` (occurrence counts of that
        lemma among the sentence's non-punctuation, non-stop-word tokens),
        followed by ``text_sentence`` and ``text_source``. The frame shares
        the index of ``sentences``.
    """
    vocab = list(common_words)
    # Dict lookup instead of a list scan for every token.
    column_of = {word: j for j, word in enumerate(vocab)}

    # Counts are accumulated in a plain integer matrix and wrapped in a
    # DataFrame once at the end. Updating single cells with df.loc inside the
    # loop is far slower, leaves object-dtype columns, and only works when
    # the index happens to be 0..n-1.
    counts = np.zeros((len(sentences), len(vocab)), dtype=int)

    for i, sentence in enumerate(sentences[0]):
        for token in sentence:
            if token.is_punct or token.is_stop:
                continue
            j = column_of.get(token.lemma_)
            if j is not None:
                counts[i, j] += 1

        if i % 100 == 0:
            print('Processing row {}'.format(i))

    df = pd.DataFrame(counts, columns=vocab, index=sentences.index)
    df['text_sentence'] = sentences[0]
    df['text_source'] = sentences[1]
    return df

In [107]:
# load dataset
# Raw texts come from the NLTK Gutenberg corpus reader imported at the top of
# the notebook (the corpus data must already be available to NLTK).
persuasion_raw = gutenberg.raw('austen-persuasion.txt')
alice_raw = gutenberg.raw('carroll-alice.txt')

In [108]:
# cleanup dataset
# remove CHAPTER
# Alice: drop 'CHAPTER ' and everything after it on the same line.
# Persuasion: drop only the literal 'Chapter <digits>' marker.
# NOTE: both *_raw names are overwritten in place by this cell.
alice_raw = re.sub(r'CHAPTER .*', '', alice_raw)
persuasion_raw = re.sub(r'Chapter \d+', '', persuasion_raw)
print('Done CHAPTER')
# make them shorter
# Keep only the first tenth (by character count) of each text before cleaning.
alice = text_cleaner(alice_raw[:int(len(alice_raw)/10)])
persuasion = text_cleaner(persuasion_raw[:int(len(persuasion_raw)/10)])
print('Done Cleaner')

Done CHAPTER
Done Cleaner


In [109]:
# generate features
# Load the spaCy pipeline once; `nlp` is reused by later cells.
nlp = spacy.load('en_core_web_sm')
# Parse the cleaned excerpts into spaCy documents.
alice_doc = nlp(alice)
persuasion_doc = nlp(persuasion)

## Challenge 0:

In [110]:
# group into sentences
# Each entry is [sentence, author label]; in the resulting frame column 0
# holds the sentence and column 1 the label.
alice_sents = [[sent, "Carroll"] for sent in alice_doc.sents]
persuasion_sents = [[sent, "Austen"] for sent in persuasion_doc.sents]

# Carroll rows come first, followed by the Austen rows.
sentences = pd.DataFrame(alice_sents + persuasion_sents)
sentences.head(3)

Unnamed: 0,0,1
0,"(Alice, was, beginning, to, get, very, tired, ...",Carroll
1,"(So, she, was, considering, in, her, own, mind...",Carroll
2,"(There, was, nothing, so, VERY, remarkable, in...",Carroll


In [111]:
# constructing common_words
# Top-30 lemmas (stop words excluded) of each text, most frequent first.
alice_common = bag_of_words(alice_doc, 30, False)
persuasion_common = bag_of_words(persuasion_doc, 30, False)

# Keep only the lemmas that appear in exactly one of the two lists, i.e. the
# words that distinguish the authors. The list is built in frequency order
# (Carroll-only words first, then Austen-only words) rather than by iterating
# over sets, so the feature column order is identical on every kernel run.
words_common = (
    [w for w in alice_common if w not in persuasion_common]
    + [w for w in persuasion_common if w not in alice_common]
)

In [112]:
# constructing Dataset
# One row per sentence, one count column per entry of words_common.
features = bow_of_features(sentences, words_common)
# add words per sentence statistics
# Mutates `features` in place; every row receives the same corpus-level
# values, so these columns are constant across rows.
stat_words_per_sentence(sentences, features)

Processing row 0
Processing row 100
Processing row 200
Processing row 300
Processing row 400


In [113]:
# Preview the first rows of the feature frame.
features.head(3)

Unnamed: 0,alice,like,wonder,try,begin,oh,foot,door,rabbit,not,...,elliot,sir,'s,text_sentence,text_source,count,25perc,50perc,75perc,max
0,2,0,0,0,1,0,0,0,0,0,...,0,0,0,"(Alice, was, beginning, to, get, very, tired, ...",Carroll,412.0,12.0,23.0,42.25,204.0
1,0,0,0,0,0,0,0,0,1,0,...,0,0,0,"(So, she, was, considering, in, her, own, mind...",Carroll,412.0,12.0,23.0,42.25,204.0
2,1,0,0,0,0,1,0,0,1,0,...,0,0,0,"(There, was, nothing, so, VERY, remarkable, in...",Carroll,412.0,12.0,23.0,42.25,204.0


In [88]:
# Target: the author label. Predictors: every remaining column except the
# sentence itself.
Y = features['text_source']
# `columns=` replaces the positional axis argument (`drop(labels, 1)`), which
# is deprecated and no longer accepted by pandas 2.
X = features.drop(columns=['text_source', 'text_sentence'])

In [93]:
# RandomForestModel
# Seeded so the reported scores can be reproduced on a re-run.
rfc = ensemble.RandomForestClassifier(random_state=4)

# 45% of the rows are used for training, 55% held out for testing.
# The split is reused by the model cells that follow.
X_train, X_test, Y_train, Y_test = train_test_split(X,
                                                    Y,
                                                    test_size=0.55,
                                                    random_state=4)
rfc.fit(X_train, Y_train)

print('Training set score : {}'.format(rfc.score(X_train, Y_train)))
print('Test set score : {}'.format(rfc.score(X_test, Y_test)))




Training set score : 0.9081081081081082
Test set score : 0.8986784140969163


In [94]:
# Logistic Regression
# Uses the train/test split created in the random-forest cell, so that cell
# must have been run first.
lr = LogisticRegression()
lr.fit(X_train, Y_train)

print('Training set score : {}'.format(lr.score(X_train, Y_train)))
print('Test set score : {}'.format(lr.score(X_test, Y_test)))



Training set score : 0.8918918918918919
Test set score : 0.9074889867841409


In [95]:
# Gradient Boosting Model
# Uses the train/test split created in the random-forest cell.
# Seeded so the reported scores can be reproduced on a re-run.
gbc = ensemble.GradientBoostingClassifier(random_state=4)
gbc.fit(X_train, Y_train)

print('Training set score : {}'.format(gbc.score(X_train, Y_train)))
print('Test set score : {}'.format(gbc.score(X_test, Y_test)))

Training set score : 0.9081081081081082
Test set score : 0.8942731277533039


## Challenge 1:

In [115]:
# load 'milton-paradise.txt' from the NLTK Gutenberg corpus reader
paradise_raw = gutenberg.raw('milton-paradise.txt')

In [124]:
# cleanup dataset
# remove Book
# Drops the literal 'Book ' followed by any run of capital letters.
# NOTE: paradise_raw is overwritten in place by this cell.
paradise_raw = re.sub(r'Book [A-Z]*', '', paradise_raw)
print('Done Book')
# make them shorter
# NOTE(review): the slice length is taken from alice_raw (half of its
# character count), not from paradise_raw -- confirm this is intentional and
# not a copy-paste slip from the earlier clean-up cell.
paradise = text_cleaner(paradise_raw[:int(len(alice_raw)/2)])
print('Done Cleaner')

Done Book
Done Cleaner


In [125]:
# Reuses the `nlp` pipeline loaded earlier in the notebook.
paradise_doc = nlp(paradise)

paradise_sents = [[sent, "Milton"] for sent in paradise_doc.sents]

# NOTE: `sentences` is rebound here; from this cell on it holds the
# Carroll + Milton sentences instead of Carroll + Austen.
sentences = pd.DataFrame(alice_sents + paradise_sents)
sentences.head(3)

Unnamed: 0,0,1
0,"(Alice, was, beginning, to, get, very, tired, ...",Carroll
1,"(So, she, was, considering, in, her, own, mind...",Carroll
2,"(There, was, nothing, so, VERY, remarkable, in...",Carroll


In [127]:
# constructing common_words
# Top-30 lemmas (stop words excluded) of each text, most frequent first.
alice_common = bag_of_words(alice_doc, 30, False)
paradise_common = bag_of_words(paradise_doc, 30, False)

# Keep only the lemmas that appear in exactly one of the two lists, i.e. the
# words that distinguish the authors. The list is built in frequency order
# (Carroll-only words first, then Milton-only words) rather than by iterating
# over sets, so the feature column order is identical on every kernel run.
words_common = (
    [w for w in alice_common if w not in paradise_common]
    + [w for w in paradise_common if w not in alice_common]
)

In [128]:
# constructing Dataset
# NOTE: `features` is rebound here to the Carroll + Milton data.
features = bow_of_features(sentences, words_common)
# add words per sentence statistics
# Mutates `features` in place; every row receives the same corpus-level
# values, so these columns are constant across rows.
stat_words_per_sentence(sentences, features)

Processing row 0
Processing row 100
Processing row 200
Processing row 300
Processing row 400
Processing row 500


In [129]:
# Target: the author label. Predictors: every remaining column except the
# sentence itself.
Y = features['text_source']
# `columns=` replaces the positional axis argument (`drop(labels, 1)`), which
# is deprecated and no longer accepted by pandas 2.
X = features.drop(columns=['text_source', 'text_sentence'])

In [130]:
# RandomForestModel
# Seeded so the reported scores can be reproduced on a re-run.
rfc = ensemble.RandomForestClassifier(random_state=4)

# 45% of the rows are used for training, 55% held out for testing.
# The split is reused by the model cells that follow.
X_train, X_test, Y_train, Y_test = train_test_split(X,
                                                    Y,
                                                    test_size=0.55,
                                                    random_state=4)
rfc.fit(X_train, Y_train)

print('Training set score : {}'.format(rfc.score(X_train, Y_train)))
print('Test set score : {}'.format(rfc.score(X_test, Y_test)))




Training set score : 0.9488188976377953
Test set score : 0.907051282051282


In [131]:
# Logistic Regression
# Uses the train/test split created in the random-forest cell, so that cell
# must have been run first.
lr = LogisticRegression()
lr.fit(X_train, Y_train)

print('Training set score : {}'.format(lr.score(X_train, Y_train)))
print('Test set score : {}'.format(lr.score(X_test, Y_test)))



Training set score : 0.9094488188976378
Test set score : 0.8974358974358975


In [132]:
# Gradient Boosting Model
# Uses the train/test split created in the random-forest cell.
# Seeded so the reported scores can be reproduced on a re-run.
gbc = ensemble.GradientBoostingClassifier(random_state=4)
gbc.fit(X_train, Y_train)

print('Training set score : {}'.format(gbc.score(X_train, Y_train)))
print('Test set score : {}'.format(gbc.score(X_test, Y_test)))

Training set score : 0.9409448818897638
Test set score : 0.9038461538461539
