In [1]:
%matplotlib inline
import numpy as np
import pandas as pd
import scipy
import sklearn
# import spacy
import matplotlib.pyplot as plt
import seaborn as sns
import re
from nltk.corpus import gutenberg, stopwords
from collections import Counter
from sklearn import ensemble
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
# note we are using nltk to tokenize because the current version of spacy requires too much memory locally
import nltk
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.stem import WordNetLemmatizer
nltk.download('wordnet')
import string
from sklearn.svm import SVC
from sklearn.model_selection import cross_val_score
from sklearn import svm

[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/douglaswood/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [2]:
# Utility function for standard text cleaning.
def text_cleaner(text):
    """Normalize raw corpus text before sentence tokenization.

    The steps run in this order:
      1. every double dash (``--``) becomes a single space;
      2. bracketed spans (``[...]``) are removed -- the match is non-greedy
         and ``.`` does not cross newlines, so only same-line spans are hit;
      3. all runs of whitespace collapse to one space and the ends are trimmed;
      4. every character of ``string.punctuation`` except ``.`` is deleted.

    Because punctuation is deleted after whitespace is collapsed, removing a
    free-standing punctuation mark can leave two adjacent spaces, and
    hyphenated words are fused ("rabbit-hole" -> "rabbithole").

    Args:
        text (str): raw text to clean.

    Returns:
        str: the cleaned text.
    """
    text = re.sub(r'--', ' ', text)
    # Raw string literal: the same pattern written as a plain string relies on
    # the invalid escapes "\[" and "\]", which newer Python versions warn about.
    text = re.sub(r"[\[].*?[\]]", "", text)
    text = ' '.join(text.split())
    # Periods are kept so that sentence boundaries survive the cleaning.
    removable = string.punctuation.replace('.', '')
    return text.translate(str.maketrans('', '', removable))
    
# Read the raw text of both novels from the NLTK Gutenberg corpus.
alice = gutenberg.raw('carroll-alice.txt')
persuasion = gutenberg.raw('austen-persuasion.txt')

# Chapter headings are written differently in each book, so each gets its
# own pattern; headings are stripped first, then the text is cleaned.
alice = text_cleaner(re.sub(r'CHAPTER .*', '', alice))
persuasion = text_cleaner(re.sub(r'Chapter \d+', '', persuasion))

In [3]:
# Split each cleaned novel into a list of sentence strings with nltk.
alice_doc, persuasion_doc = (
    sent_tokenize(novel) for novel in (alice, persuasion)
)

In [4]:
# Pair every sentence with the name of its author.
alice_sents = [[text, "Carroll"] for text in alice_doc]
persuasion_sents = [[text, "Austen"] for text in persuasion_doc]

# One frame for both novels: column 0 is the sentence, column 1 the author.
sentences = pd.DataFrame([*alice_sents, *persuasion_sents])
sentences.head()

Unnamed: 0,0,1
0,Alice was beginning to get very tired of sitti...,Carroll
1,There was nothing so VERY remarkable in that n...,Carroll
2,In another moment down went Alice after it nev...,Carroll
3,The rabbithole went straight on like a tunnel ...,Carroll
4,Either the well was very deep or she fell very...,Carroll


In [20]:
# Utility function to create a list of the 2000 most common words.
def bag_of_words(list_of_strings, n_words=2000):
    """Return the most frequent lemmas found in a collection of strings.

    Each string is split with ``word_tokenize`` and every token is passed
    through ``WordNetLemmatizer.lemmatize`` before counting. Tokens are
    counted exactly as produced (no lower-casing or filtering).

    Args:
        list_of_strings: iterable of strings (for example, sentences).
        n_words (int): maximum number of lemmas to return. Defaults to 2000,
            the cut-off that was previously hard-coded.

    Returns:
        list: up to ``n_words`` lemmas ordered from most to least frequent;
        ties keep first-seen order (``Counter.most_common`` semantics).
    """
    wnl = WordNetLemmatizer()
    # Count in a single pass instead of materialising every token first.
    lemma_counts = Counter(
        wnl.lemmatize(word)
        for text in list_of_strings
        for word in word_tokenize(text)
    )
    return [lemma for lemma, _ in lemma_counts.most_common(n_words)]
    

# Creates a data frame with features for each word in our common word set.
# Each value is the count of the times the word appears in each sentence.
def bow_features(sentences, common_words):
    """Build a bag-of-words count frame, one row per sentence.

    Args:
        sentences: DataFrame whose column ``0`` holds the sentence text and
            whose column ``1`` holds the source label.
        common_words: collection of lemmas to use as feature columns. A set
            is sorted so that the column order is deterministic; any other
            iterable keeps its order, with duplicates dropped.

    Returns:
        pandas.DataFrame: one integer column per lemma (how many times that
        lemma occurs in the sentence), followed by ``text_sentence`` and
        ``text_source``. The index is the index of ``sentences[0]``.
    """
    wnl2 = WordNetLemmatizer()

    # Fix the column order up front. A set is never handed to pandas:
    # recent pandas releases reject sets both as `columns=` and as `.loc`
    # indexers.
    if isinstance(common_words, (set, frozenset)):
        vocabulary = sorted(common_words)
    else:
        vocabulary = list(dict.fromkeys(common_words))
    column_of = {word: col for col, word in enumerate(vocabulary)}

    texts = sentences[0]

    # Accumulate counts in a plain integer matrix addressed by position.
    # Incrementing individual cells through `df.loc[i, lemma] += 1` is
    # extremely slow and silently assumes the index is 0..n-1.
    # (`np` is numpy, imported at the top of the notebook.)
    counts = np.zeros((len(texts), len(vocabulary)), dtype=np.int64)

    for i, sentence in enumerate(texts):
        # Lemmatize each token and count it if it belongs to the vocabulary.
        for word in word_tokenize(sentence):
            col = column_of.get(wnl2.lemmatize(word))
            if col is not None:
                counts[i, col] += 1

        # Progress indicator so a long run is distinguishable from a hang.
        if i % 500 == 0:
            print("Processing row {}".format(i))

    df = pd.DataFrame(counts, columns=vocabulary, index=texts.index)
    df['text_sentence'] = texts
    df['text_source'] = sentences[1]
    return df

# Most common lemmas of each novel.
alicewords = bag_of_words(alice_doc)
persuasionwords = bag_of_words(persuasion_doc)

# Union of both vocabularies, so each lemma appears only once.
common_words = set(alicewords).union(persuasionwords)

In [21]:
# Build the per-sentence word-count features. This can take a while to run.
word_counts = bow_features(sentences=sentences, common_words=common_words)
word_counts.head()

Processing row 0


KeyboardInterrupt: 

In [None]:
# Random forest on the bag-of-words counts.
# The forest is seeded so the fitted model and its scores are reproducible,
# matching the seeded train/test split below.
rfc = ensemble.RandomForestClassifier(random_state=0)
Y = word_counts['text_source']
# Use the `columns=` keyword: pandas 2.0 no longer accepts `axis` as a
# positional argument, so `drop([...], 1)` fails there.
X = np.array(word_counts.drop(columns=['text_sentence', 'text_source']))

# Hold out 40% of the sentences for testing.
X_train, X_test, y_train, y_test = train_test_split(X,
                                                    Y,
                                                    test_size=0.4,
                                                    random_state=0)
train = rfc.fit(X_train, y_train)

print('Training set score:', rfc.score(X_train, y_train))
print('\nTest set score:', rfc.score(X_test, y_test))

In [None]:
# Logistic regression on the same train/test split.
lr = LogisticRegression()
train = lr.fit(X_train, y_train)
print(X_train.shape, y_train.shape)

# Report accuracy on each partition in turn.
for caption, X_part, y_part in (
    ('Training set score:', X_train, y_train),
    ('\nTest set score:', X_test, y_test),
):
    print(caption, lr.score(X_part, y_part))

In [None]:
# Gradient boosting on the same train/test split.
clf = ensemble.GradientBoostingClassifier()
train = clf.fit(X_train, y_train)

# Report accuracy on each partition in turn.
for caption, X_part, y_part in (
    ('Training set score:', X_train, y_train),
    ('\nTest set score:', X_test, y_test),
):
    print(caption, clf.score(X_part, y_part))

In [None]:
# Challenge 0
# Support vector classifier (default settings) on the same train/test split.
svc = SVC()
train = svc.fit(X_train, y_train)

# Report accuracy on each partition in turn.
for caption, X_part, y_part in (
    ('Training set score:', X_train, y_train),
    ('\nTest set score:', X_test, y_test),
):
    print(caption, svc.score(X_part, y_part))

In [None]:
# Challenge 1
# Print the file ids exposed by the NLTK Gutenberg corpus reader.
print(gutenberg.fileids())