In [1]:
%matplotlib inline
import numpy as np
import pandas as pd
import scipy
import sklearn
# import spacy
import matplotlib.pyplot as plt
import seaborn as sns
import re
from nltk.corpus import gutenberg, stopwords
from collections import Counter
from sklearn import ensemble
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
# note we are using nltk to tokenize because the current version of spacy requires too much memory locally
import nltk
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.stem import WordNetLemmatizer
nltk.download('wordnet')
import string
from sklearn.svm import SVC
from sklearn.model_selection import cross_val_score
from sklearn import svm
import gensim
from gensim.models import word2vec

[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/douglaswood/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [2]:
# Utility function to clean text.
def text_cleaner(text):
    
    text = re.sub(r'--',' ',text)
    
    # Get rid of headings in square brackets.
    text = re.sub("[\[].*?[\]]", "", text)
    
    # Get rid of chapter titles.
    text = re.sub(r'Chapter \d+','',text)
    
    # Get rid of extra whitespace.
    text = ' '.join(text.split())
    
    # Remove all punctuation except "."
    x = string.punctuation
    y = x.replace('.','')
    for c in y:     
        text = text.replace(c,"")
    
    return text

In [3]:
# Import all the Chesterton in the Project Gutenberg corpus.
chesterton = ""
for novel in ['ball','brown','thursday']:
    work = gutenberg.raw('chesterton-' + novel + '.txt')
    chesterton = chesterton + work

# Clean the data.
chesterton_clean = text_cleaner(chesterton)

In [4]:
# Parse the data with nltk tokenize. 
chesterton_doc = sent_tokenize(chesterton_clean)

In [5]:
list_of_lists_of_words_Ch = [word_tokenize(t) for t in chesterton_doc]

In [6]:
# Import all the Austen in the Project Gutenberg corpus.
austen = ""
for novel in ['persuasion','emma','sense']:
    work = gutenberg.raw('austen-' + novel + '.txt')
    austen = austen + work

# Clean the data.
austen_clean = text_cleaner(austen)

In [7]:
# Parse the data with nltk tokenize. 
austen_doc = sent_tokenize(austen_clean)

In [8]:
list_of_lists_of_words_Au = [word_tokenize(t) for t in austen_doc]

In [9]:
# Group into sentences.
Chesterton_sents = [[sent, "Chesterton"] for sent in chesterton_doc]
Austen_sents = [[sent, "Austen"] for sent in austen_doc]

# Combine the sentences from the two sets of works into one data frame.
sentences = pd.DataFrame(Chesterton_sents + Austen_sents)
sentences.head()

Unnamed: 0,0,1
0,I.,Chesterton
1,A DISCUSSION SOMEWHAT IN THE AIR The flying sh...,Chesterton
2,That it was far above the earth was no express...,Chesterton
3,The professor had himself invented the flying ...,Chesterton
4,Every sort of tool or apparatus had in consequ...,Chesterton


In [10]:
# For this assignment, we are comparing word2vec v. most common words for generating features to be used in a supervised 
# bag of words model.

# word2vec model for generating Chesterton features:
modelCh = word2vec.Word2Vec(
    list_of_lists_of_words_Ch,
    workers=4,     # Number of threads to run in parallel.
    min_count=10,  # Minimum word count threshold.
    window=6,      # Number of words around target word to consider.
    sg=0,          # Use CBOW rather than skip-gram.
    sample=1e-3 ,  # Penalize frequent words.
    size=300,      # Word vector length.
    hs=1           # Use hierarchical softmax.
)

In [11]:
# Set of feature words from Chesterton in word2vec model.
vocabCh = modelCh.wv.vocab.keys()
word2vec_Features = set(vocabCh)

In [12]:
# For this assignment, we are comparing word2vec v. most common words for generating features to be used in a supervised 
# bag of words model.

# word2vec model for generating Austen features:
modelAu = word2vec.Word2Vec(
    list_of_lists_of_words_Au,
    workers=4,     # Number of threads to run in parallel.
    min_count=10,  # Minimum word count threshold.
    window=6,      # Number of words around target word to consider.
    sg=0,          # Use CBOW rather than skip-gram.
    sample=1e-3 ,  # Penalize frequent words.
    size=300,      # Word vector length.
    hs=1           # Use hierarchical softmax.
)

In [13]:
# Set of feature words from Austen in word2vec model.
vocabAu = modelAu.wv.vocab.keys()
word2vec_Features = word2vec_Features & set(vocabAu) 

In [14]:
# Creates a data frame with features for each word in our feature word set.
def make_frame(sentences_local, feature_words):
    
    # Scaffold the data frame and initialize counts to zero.
    df = pd.DataFrame(columns=feature_words)
    df['text_sentence'] = sentences_local[0]
    df['text_source'] = sentences_local[1]
    df.loc[:, feature_words] = 0
    return df     

# Each value is the count of the times the word appears in each sentence.
def count_words(df,list_of_lists_of_words):
    for sentence in list_of_lists_of_words:
        for word in sentence:
            if word in df.columns:
                df.loc[0, word] += 1
    for column in df.columns:
        if df.loc[0,column] > 0:
            df.loc[df.loc[0,column],column] = 1
            df.loc[0,column] = 0
    return df

In [15]:
word_counts_word2vec = make_frame(sentences,word2vec_Features)

In [16]:
word_counts_word2vec.head()

Unnamed: 0,middle,one,comes,vain,Lady,decent,sort,experience,man,because,...,miles,past,Why,giving,space,yes,free,lost,text_sentence,text_source
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,I.,Chesterton
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,A DISCUSSION SOMEWHAT IN THE AIR The flying sh...,Chesterton
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,That it was far above the earth was no express...,Chesterton
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,The professor had himself invented the flying ...,Chesterton
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,Every sort of tool or apparatus had in consequ...,Chesterton


In [17]:
count_words(word_counts_word2vec,list_of_lists_of_words_Ch)
count_words(word_counts_word2vec,list_of_lists_of_words_Au)

KeyboardInterrupt: 

In [None]:
word_counts_word2vec.head()

In [None]:
# RFC Supervised Model using word2vec features
rfc = ensemble.RandomForestClassifier()
Y = word_counts_word2vec['text_source']
X = np.array(word_counts_word2vec.drop(['text_sentence','text_source'], 1))

X_train, X_test, y_train, y_test = train_test_split(X, 
                                                    Y,
                                                    test_size=0.4,
                                                    random_state=0)
train = rfc.fit(X_train, y_train)

print('Training set score:', rfc.score(X_train, y_train))
print('\nTest set score:', rfc.score(X_test, y_test))

In [None]:
# For this assignment, we are comparing word2vec v. most common words for generating features to be used in a supervised 
# bag of words model.

# most common words technique for generating features:
def bag_of_words(list_of_lists_of_words):
    
    allwords = []
    for sentence in list_of_lists_of_words:
        for word in sentence:
            allwords.append(word)
    # Return the most common words.
    return [item[0] for item in Counter(allwords).most_common(2000)]

In [None]:
# Set up the bags.
chesterton_words = bag_of_words(list_of_lists_of_words_Ch)
austen_words = bag_of_words(list_of_lists_of_words_Au)

# Combine bags to create a set of unique words.
common_words = set(chesterton_words + austen_words)

In [None]:
word_counts_most_common = make_frame(sentences,common_words)

In [None]:
word_counts_most_common.head()

In [None]:
count_words(word_counts_most_common,list_of_lists_of_words_Ch)
count_words(word_counts_most_common,list_of_lists_of_words_Au)

In [None]:
word_counts_most_common.head()

In [None]:
# RFC Supervised Model using most common words features
rfc = ensemble.RandomForestClassifier()
Y = word_counts_most_common['text_source']
X = np.array(word_counts_most_common.drop(['text_sentence','text_source'], 1))

X_train, X_test, y_train, y_test = train_test_split(X, 
                                                    Y,
                                                    test_size=0.4,
                                                    random_state=0)
train = rfc.fit(X_train, y_train)

print('Training set score:', rfc.score(X_train, y_train))
print('\nTest set score:', rfc.score(X_test, y_test))

In [None]:
# The RFC Supervised Model works better with features generated by word2vec, rathern than most common words. 
# using word2vec features in a Logistic Regression model to further improve performance:
lr = LogisticRegression()
Y = word_counts_word2vec['text_source']
X = np.array(word_counts_word2vec.drop(['text_sentence','text_source'], 1))

X_train, X_test, y_train, y_test = train_test_split(X, 
                                                    Y,
                                                    test_size=0.4,
                                                    random_state=0)
train = lr.fit(X_train, y_train)
print(X_train.shape, y_train.shape)
print('Training set score:', lr.score(X_train, y_train))
print('\nTest set score:', lr.score(X_test, y_test))