In [1]:
%matplotlib inline
import numpy as np
import pandas as pd
import scipy 
import sklearn
import spacy
import matplotlib.pyplot as plt
import seaborn as sns
import re
from nltk.corpus import gutenberg, stopwords
from collections import Counter

import time

In [2]:
# Util function for standard text cleaning
def text_cleaner(text):2
    # Visual inspection identifies a form of punctuation spaCy does not recognize: the double dash '--'
    text = re.sub(r'--', ' ', text)
    text = re.sub('[\[].*?[\]]', '', text)
    text = ' '.join(text.split())
    return text

# Util function to create a list of the 2000 most common words
def bag_of_words(text):
    
    # Filter out punctuation and stop words
    allwords = [token.lemma_ for token in text if not token.is_punct and not token.is_stop]
    
    # Return most common words
    return [item[0] for item in Counter(allwords).most_common(2000)]

In [3]:
# Function that creats a data frame with feature for each word in our common word set
# Each value is the count of the time the word appeard in each sentence
def bow_features(sentences, common_words):
    
    # Scaffold the data frame and initialize counts to zero
    df = pd.DataFrame(columns=common_words)
    df['text_sentence'] = sentences[0]
    df['text_source'] = sentences[1]
    df.loc[:, common_words] = 0
    
    # Process each row, counting occurence of words in each sentence
    start = time.time()
    for i, sentence in enumerate(df['text_sentence']):
        
        # Convert sentence to lemmas, then filter out punctuation, stop words, and uncommon words
        words = [token.lemma_ 
                 for token in sentence 
                 if(
                     not token.is_punct
                     and not token.is_stop
                     and token.lemma_ in common_words
                 )]
        
        # Populate the row with word counts
        for word in words:
            df.loc[i, word] += 1
        
        
        # This couner is just to make sure the kernel didnt hang
        if i % 500 == 0:
            end = time.time()
            print('{}:\tProcessing row {}'.format(end-start, i))
            start = time.time()
    
    return df

In [4]:
# Load and clean data
persuasion = gutenberg.raw('austen-persuasion.txt')
alice = gutenberg.raw('carroll-alice.txt')

# Chapter indicator is idiosyncratic
persuasion = re.sub(r'Chapter \d+', '', persuasion)
alice = re.sub(r'CHAPTER .*', '', alice)


alice = alice.lower()
persuasion = persuasion.lower()

alice = text_cleaner(alice)
persuasion = text_cleaner(persuasion)

In [5]:
# Parse cleaned novels. Can take a bit
nlp = spacy.load('en')
alice_doc = nlp(alice)
persuasion_doc = nlp(persuasion)

In [6]:
# Group into sentences
alice_sents = [[sent, 'Carroll'] for sent in alice_doc.sents]
persuasion_sents = [[sent, 'Austen'] for sent in persuasion_doc.sents]

# Combine sentence fromt he two novels into one data frame
sentences = pd.DataFrame(alice_sents + persuasion_sents)
sentences.head()

Unnamed: 0,0,1
0,"(alice, was, beginning, to, get, very, tired, ...",Carroll
1,"(so, she, was, considering, in, her, own, mind...",Carroll
2,"(there, was, nothing, so, very, remarkable, in...",Carroll
3,"(oh, dear, !)",Carroll
4,"(i, shall, be, late, !, ')",Carroll


In [7]:
# Set up bags
alicewords = bag_of_words(alice_doc)
persuasionwords = bag_of_words(persuasion_doc)

# Combine bags to create a set of unique words
common_words = set(alicewords + persuasionwords)

In [9]:
# Create data frame with features.  Can take a while to run (takes 4-5 hours for some reason)
# word_counts = bow_features(sentences, common_words)
# word_counts.head()

In [10]:
# word_counts.head()
# word_counts.to_csv('~/Downloads/word_count.csv')
word_counts = pd.read_csv('word_count.csv')
word_counts = word_counts.drop(['Unnamed: 0'], axis=1)

In [11]:
# Random Forest
from sklearn import ensemble
from sklearn.model_selection import train_test_split

rfc = ensemble.RandomForestClassifier()
Y = word_counts['text_source']
X = np.array(word_counts.drop(['text_sentence', 'text_source'], 1))

X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.4, random_state=0)

train = rfc.fit(X_train, y_train)

print('Random Forest')
print('Training set score:', rfc.score(X_train, y_train))
print('\nTest set score:', rfc.score(X_test, y_test))

  from numpy.core.umath_tests import inner1d


Random Forest
Training set score: 0.9804924848097217

Test set score: 0.8896882494004796


In [12]:
# Ridge Regression (L2 Regularizer)
from sklearn.linear_model import LogisticRegression

lr = LogisticRegression()
train = lr.fit(X_train, y_train)
print('Training set score:', lr.score(X_train, y_train))
print('\nTest set score:', lr.score(X_test, y_test))

Training set score: 0.9491525423728814

Test set score: 0.9122302158273381


In [13]:
# Gradient Boosting (Tree)
clf = ensemble.GradientBoostingClassifier()
train = clf.fit(X_train, y_train)

print('Training set score:', clf.score(X_train, y_train))
print('\nTest set score:', clf.score(X_test, y_test))

Training set score: 0.8899904061400703

Test set score: 0.8815347721822542


In [60]:
# New Data: Emma by Jane Austen

# Clean Emma data
emma = gutenberg.raw('austen-emma.txt')
emma = re.sub(r'VOLUME \w+', '', emma)
emma = re.sub(r'CHAPTER \w+', '', emma)
emma = text_cleaner(emma)

emma = emma.lower()

# Process cleaned Emma data
emma_doc = nlp(emma)

# Group into sentences
persuasion_sents = [[sent, 'Austen'] for sent in persuasion_doc.sents]
emma_sents = [[sent, 'Austen'] for sent in emma_doc.sents]

# Cut down Emma to the same length as Alice
emma_sents = emma_sents[0:len(alice_sents)]

In [14]:
# Build new bag of words data frame for Emma word counts
# Using the same common words from Alice and Persuasion
# emma_sentences = pd.DataFrame(emma_sents)
# emma_bow = bow_features(emma_sentences, common_words)
# emma_bow.to_csv('~/Downloads/emma_bow.csv')
emma_bow = pd.read_csv('emma_bow.csv')
emma_bow = emma_bow.drop(['Unnamed: 0'], axis=1)

In [15]:
# Using logistic regression again
X_Emma_test = np.concatenate((
    X_train[y_train[y_train=='Carroll'].index],
    emma_bow.drop(['text_sentence', 'text_source'], 1)
), axis=0)
y_Emma_test = pd.concat([y_train[y_train=='Carroll'],
                         pd.Series(['Austen'] * emma_bow.shape[0])])

# Model
print('\nTest set score:', lr.score(X_Emma_test, y_Emma_test))
lr_Emma_predicted = lr.predict(X_Emma_test)
pd.crosstab(y_Emma_test, lr_Emma_predicted)


Test set score: 0.6953278366705928


col_0,Austen,Carroll
row_0,Unnamed: 1_level_1,Unnamed: 2_level_1
Austen,1528,58
Carroll,718,243


In [16]:
from sklearn.svm import SVC

svc = SVC()
train = svc.fit(X_train, y_train)
print('\nTest set score:', svc.score(X_Emma_test, y_Emma_test))


Test set score: 0.6226933647428347


In [17]:
# New Features
# Sentence length, punctiation count per sentence, proper noun count per sentence

for index, row in enumerate(word_counts):
    word_counts.at[index, 'sent_length'] = len(nlp(word_counts.at[index, 'text_sentence']))
    word_counts.at[index, 'puncts_count'] = sum([token.is_punct for token in nlp(word_counts.at[index, 'text_sentence'])])
    word_counts.at[index, 'PROPN_count'] = [token.pos_ for token in nlp(word_counts.at[index, 'text_sentence'])].count('PROPN')

for index in range(len(emma_bow)):
    emma_bow.at[index, 'sent_length'] = len(nlp(emma_bow.at[index, 'text_sentence']))
    emma_bow.at[index, 'puncts_count'] = sum([token.is_punct for token in nlp(emma_bow.at[index, 'text_sentence'])])
    emma_bow.at[index, 'PROPN_count'] = [token.pos_ for token in nlp(emma_bow.at[index, 'text_sentence'])].count('PROPN')
        

In [18]:
Y = word_counts['text_source']
X = np.array(word_counts.drop(['text_sentence', 'text_source'], 1))

X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.4, random_state=0)

In [19]:
X_Emma_test = np.concatenate((
    X_train[y_train[y_train=='Carroll'].index],
    emma_bow.drop(['text_sentence', 'text_source'], 1)
), axis=0)
y_Emma_test = pd.concat([y_train[y_train=='Carroll'],
                         pd.Series(['Austen'] * emma_bow.shape[0])])

In [20]:
X_train[np.isnan(X_train)] = 0
X_Emma_test[np.isnan(X_Emma_test)] = 0

In [21]:
svc = SVC()
train = svc.fit(X_train, y_train)
print('\nTest set score:', svc.score(X_Emma_test, y_Emma_test))
svc_predicted = svc.predict(X_Emma_test)
pd.crosstab(y_Emma_test, svc_predicted)


Test set score: 0.5508441303494307


col_0,Austen,Carroll
row_0,Unnamed: 1_level_1,Unnamed: 2_level_1
Austen,1172,414
Carroll,730,231


In [22]:
lr = LogisticRegression()
train = lr.fit(X_train, y_train)
print('\nTest set score:', lr.score(X_Emma_test, y_Emma_test))


Test set score: 0.6474283470749902
