In [11]:
%matplotlib inline
import numpy as np
import pandas as pd
import scipy
import sklearn
import matplotlib.pyplot as plt
import seaborn as sns
import re

In [9]:
import spacy as spacy

In [15]:
import nltk
from nltk.corpus import gutenberg, stopwords
from collections import Counter
nltk.download('gutenberg')

[nltk_data] Downloading package gutenberg to
[nltk_data]     C:\Users\Kristine\AppData\Roaming\nltk_data...
[nltk_data]   Package gutenberg is already up-to-date!


True

In [16]:
# Import the data we just downloaded and installed.
from nltk.corpus import stopwords

# Grab and process the raw data.
print(gutenberg.fileids())

persuasion = gutenberg.raw('austen-persuasion.txt')
alice = gutenberg.raw('carroll-alice.txt')

# Print the first 100 characters of Alice in Wonderland.
print('\nRaw:\n', alice[0:100])

['austen-emma.txt', 'austen-persuasion.txt', 'austen-sense.txt', 'bible-kjv.txt', 'blake-poems.txt', 'bryant-stories.txt', 'burgess-busterbrown.txt', 'carroll-alice.txt', 'chesterton-ball.txt', 'chesterton-brown.txt', 'chesterton-thursday.txt', 'edgeworth-parents.txt', 'melville-moby_dick.txt', 'milton-paradise.txt', 'shakespeare-caesar.txt', 'shakespeare-hamlet.txt', 'shakespeare-macbeth.txt', 'whitman-leaves.txt']

Raw:
 [Alice's Adventures in Wonderland by Lewis Carroll 1865]

CHAPTER I. Down the Rabbit-Hole

Alice was


In [17]:
# This pattern matches all text between square brackets.
pattern = "[\[].*?[\]]"
persuasion = re.sub(pattern, "", persuasion)
alice = re.sub(pattern, "", alice)

# Print the first 100 characters of Alice again.
print('Title removed:\n', alice[0:100])

Title removed:
 

CHAPTER I. Down the Rabbit-Hole

Alice was beginning to get very tired of sitting by her sister on


In [18]:
# Now we'll match and remove chapter headings.
persuasion = re.sub(r'Chapter \d+', '', persuasion)
alice = re.sub(r'CHAPTER .*', '', alice)

# Ok, what's it look like now?
print('Chapter headings removed:\n', alice[0:100])

Chapter headings removed:
 



Alice was beginning to get very tired of sitting by her sister on the
bank, and of having nothin


In [19]:
# Remove newlines and other extra whitespace by splitting and rejoining.
persuasion = ' '.join(persuasion.split())
alice = ' '.join(alice.split())

# All done with cleanup? Let's see how it looks.
print('Extra whitespace removed:\n', alice[0:100])

Extra whitespace removed:
 Alice was beginning to get very tired of sitting by her sister on the bank, and of having nothing to


In [20]:
import nltk
nltk.download('stopwords')
print(stopwords.words('english'))

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Kristine\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such', 'no', 'nor', 'not', 'only', 'own', 'same', 'so', 'than', '

In [30]:
nltk.download('gutenberg')
!python -m spacy download en

[nltk_data] Downloading package gutenberg to
[nltk_data]     C:\Users\Kristine\AppData\Roaming\nltk_data...
[nltk_data]   Package gutenberg is already up-to-date!


[+] Download and installation successful
You can now load the model via spacy.load('en_core_web_sm')


In [34]:
import spacy
nlp = spacy.load('en')

# All the processing work is done here, so it may take a while.
alice_doc = nlp(alice)
persuasion_doc = nlp(persuasion)

In [35]:
# Let's explore the objects we've built.
print("The alice_doc object is a {} object.".format(type(alice_doc)))
print("It is {} tokens long".format(len(alice_doc)))
print("The first three tokens are '{}'".format(alice_doc[:3]))
print("The type of each token is {}".format(type(alice_doc[0])))

The alice_doc object is a <class 'spacy.tokens.doc.Doc'> object.
It is 34408 tokens long
The first three tokens are 'Alice was beginning'
The type of each token is <class 'spacy.tokens.token.Token'>


In [36]:
from collections import Counter

# Utility function to calculate how frequently words appear in the text.
def word_frequencies(text, include_stop=True):
    
    # Build a list of words.
    # Strip out punctuation and, optionally, stop words.
    words = []
    for token in text:
        if not token.is_punct and (not token.is_stop or include_stop):
            words.append(token.text)
            
    # Build and return a Counter object containing word counts.
    return Counter(words)
    
# The most frequent words:
alice_freq = word_frequencies(alice_doc).most_common(10)
persuasion_freq = word_frequencies(persuasion_doc).most_common(10)
print('Alice:', alice_freq)
print('Persuasion:', persuasion_freq)

Alice: [('the', 1524), ('and', 796), ('to', 724), ('a', 611), ('I', 533), ('it', 524), ('she', 508), ('of', 499), ('said', 453), ('Alice', 394)]
Persuasion: [('the', 3120), ('to', 2775), ('and', 2738), ('of', 2563), ('a', 1529), ('in', 1346), ('was', 1329), ('had', 1177), ('her', 1159), ('I', 1118)]


In [37]:
# Use our optional keyword argument to remove stop words.
alice_freq = word_frequencies(alice_doc, include_stop=False).most_common(10)
persuasion_freq = word_frequencies(persuasion_doc, include_stop=False).most_common(10)
print('Alice:', alice_freq)
print('Persuasion:', persuasion_freq)

Alice: [('said', 453), ('Alice', 394), ('little', 124), ('like', 84), ('went', 83), ('know', 83), ('thought', 74), ('Queen', 73), ('time', 68), ('King', 61)]
Persuasion: [('Anne', 496), ('Captain', 297), ('Mrs', 291), ('Elliot', 288), ('Mr', 254), ('Wentworth', 217), ('Lady', 191), ('good', 181), ('little', 175), ('Charles', 166)]


In [38]:
# Pull out just the text from our frequency lists.
alice_common = [pair[0] for pair in alice_freq]
persuasion_common = [pair[0] for pair in persuasion_freq]

# Use sets to find the unique values in each top ten.
print('Unique to Alice:', set(alice_common) - set(persuasion_common))
print('Unique to Persuasion:', set(persuasion_common) - set(alice_common))

Unique to Alice: {'time', 'went', 'King', 'know', 'said', 'like', 'thought', 'Queen', 'Alice'}
Unique to Persuasion: {'Elliot', 'Charles', 'Mrs', 'Mr', 'Captain', 'Wentworth', 'good', 'Lady', 'Anne'}


In [39]:
# Utility function to calculate how frequently lemas appear in the text.
def lemma_frequencies(text, include_stop=True):
    
    # Build a list of lemas.
    # Strip out punctuation and, optionally, stop words.
    lemmas = []
    for token in text:
        if not token.is_punct and (not token.is_stop or include_stop):
            lemmas.append(token.lemma_)
            
    # Build and return a Counter object containing word counts.
    return Counter(lemmas)

# Instantiate our list of most common lemmas.
alice_lemma_freq = lemma_frequencies(alice_doc, include_stop=False).most_common(10)
persuasion_lemma_freq = lemma_frequencies(persuasion_doc, include_stop=False).most_common(10)
print('\nAlice:', alice_lemma_freq)
print('Persuasion:', persuasion_lemma_freq)

# Again, identify the lemmas common to one text but not the other.
alice_lemma_common = [pair[0] for pair in alice_lemma_freq]
persuasion_lemma_common = [pair[0] for pair in persuasion_lemma_freq]
print('Unique to Alice:', set(alice_lemma_common) - set(persuasion_lemma_common))
print('Unique to Persuasion:', set(persuasion_lemma_common) - set(alice_lemma_common))


Alice: [('say', 477), ('Alice', 394), ('think', 130), ('go', 130), ('little', 125), ('look', 106), ('know', 103), ('come', 96), ('like', 92), ('begin', 91)]
Persuasion: [('Anne', 493), ('Captain', 294), ('Mrs', 291), ('Elliot', 288), ('think', 256), ('know', 255), ('Mr', 254), ('good', 224), ('Wentworth', 215), ('Lady', 191)]
Unique to Alice: {'begin', 'go', 'come', 'look', 'like', 'say', 'Alice', 'little'}
Unique to Persuasion: {'Elliot', 'Mrs', 'Mr', 'Captain', 'good', 'Wentworth', 'Lady', 'Anne'}


In [40]:
# Initial exploration of sentences.
sentences = list(alice_doc.sents)
print("Alice in Wonderland has {} sentences.".format(len(sentences)))

example_sentence = sentences[2]
print("Here is an example: \n{}\n".format(example_sentence))

Alice in Wonderland has 1727 sentences.
Here is an example: 
There was nothing so VERY remarkable in that; nor did Alice think it so VERY much out of the way to hear the Rabbit say to itself, '



In [41]:
# Look at some metrics around this sentence.
example_words = [token for token in example_sentence if not token.is_punct]
unique_words = set([token.text for token in example_words])

print(("There are {} words in this sentence, and {} of them are"
       " unique.").format(len(example_words), len(unique_words)))

There are 27 words in this sentence, and 23 of them are unique.


In [42]:
print(nlp("I need a break")[3].pos_)
print(nlp("I need to break the glass")[3].pos_)

NOUN
VERB


In [43]:
# View the part of speech for some tokens in our sentence.
print('\nParts of speech:')
for token in example_sentence[:9]:
    print(token.orth_, token.pos_)


Parts of speech:
There ADV
was VERB
nothing NOUN
so ADV
VERY ADV
remarkable ADJ
in ADP
that DET
; PUNCT


In [44]:
# View the dependencies for some tokens.
print('\nDependencies:')
for token in example_sentence[:9]:
    print(token.orth_, token.dep_, token.head.orth_)


Dependencies:
There expl was
was ROOT was
nothing attr was
so advmod VERY
VERY advmod remarkable
remarkable amod nothing
in prep remarkable
that pobj in
; punct was


In [45]:
# Extract the first ten entities.
entities = list(alice_doc.ents)[0:10]
for entity in entities:
    print(entity.label_, ' '.join(t.orth_ for t in entity))

PERSON Alice
DATE the hot day
PERSON Alice
PERSON Rabbit
PERSON Rabbit
PERSON Alice
PERSON Alice
PERSON Alice
ORDINAL First
CARDINAL one


In [46]:
# All of the uniqe entities spaCy thinks are people.
people = [entity.text for entity in list(alice_doc.ents) if entity.label_ == "PERSON"]
print(set(people))

{'INSIDE', 'Mercia', 'Majesty', 'Alice', 'Rule Forty-two', 'ALICE', 'Shy', 'Longitude', 'Tis', 'Behead', 'Boots', 'Edwin', 'Panther', 'the Knave of Hearts', 'Magpie', 'Down', 'Tillie', "I'M", 'Mabel', 'Latin Grammar', 'Pat', 'Mouse', 'Ada', 'Stretching', 'Cheshire', 'Duchess', 'Tortoise--', 'Swim', 'Shakespeare', 'Lacie', 'a Cheshire Cat,', 'FATHER WILLIAM', 'Dinn', 'Mary Ann', 'Said', 'Turtle', 'Footman', 'Cheshire Puss', 'HER', 'Lory', 'a Lobster Quadrille', 'William the Conqueror', 'Bill', 'Begin', 'William the Conqueror.', 'Knave', 'Normans--', 'Ma', 'Beau', 'Gryphon', 'Rabbit', 'Queen', 'Run', 'Game', 'VERY', 'Elsie', 'King', 'Curiouser', 'Edgar Atheling', 'Soup', 'Dinah', 'Lobster', "W. RABBIT'", 'Ou', 'Treacle', 'Off', 'Soo', 'FOOT', 'Hush', 'Kings', "Alice)--'and", 'Owl', 'Mine', 'The Knave of Hearts', 'Fury', 'Hare', 'Latitude', 'Brandy', 'Jack', 'Tortoise', 'Soles', 'William', 'Sing', 'Canary', 'Duck', "the King: '", 'Off--', 'Seaography', 'Twinkle', 'Lizard'}


Part 2

In [47]:
# Utility function for standard text cleaning.
def text_cleaner(text):
    # Visual inspection identifies a form of punctuation spaCy does not
    # recognize: the double dash '--'.  Better get rid of it now!
    text = re.sub(r'--',' ',text)
    text = re.sub("[\[].*?[\]]", "", text)
    text = ' '.join(text.split())
    return text
    
# Load and clean the data.
persuasion = gutenberg.raw('austen-persuasion.txt')
alice = gutenberg.raw('carroll-alice.txt')

# The Chapter indicator is idiosyncratic
persuasion = re.sub(r'Chapter \d+', '', persuasion)
alice = re.sub(r'CHAPTER .*', '', alice)
    
alice = text_cleaner(alice[:int(len(alice)/10)])
persuasion = text_cleaner(persuasion[:int(len(persuasion)/10)])

In [48]:
# Parse the cleaned novels. This can take a bit.
nlp = spacy.load('en')
alice_doc = nlp(alice)
persuasion_doc = nlp(persuasion)

In [49]:
# Group into sentences.
alice_sents = [[sent, "Carroll"] for sent in alice_doc.sents]
persuasion_sents = [[sent, "Austen"] for sent in persuasion_doc.sents]

# Combine the sentences from the two novels into one data frame.
sentences = pd.DataFrame(alice_sents + persuasion_sents)
sentences.head()

Unnamed: 0,0,1
0,"(Alice, was, beginning, to, get, very, tired, ...",Carroll
1,"(So, she, was, considering, in, her, own, mind...",Carroll
2,"(There, was, nothing, so, VERY, remarkable, in...",Carroll
3,"(Oh, dear, !)",Carroll
4,"(Oh, dear, !)",Carroll


In [50]:
# Utility function to create a list of the 2000 most common words.
def bag_of_words(text):
    
    # Filter out punctuation and stop words.
    allwords = [token.lemma_
                for token in text
                if not token.is_punct
                and not token.is_stop]
    
    # Return the most common words.
    return [item[0] for item in Counter(allwords).most_common(2000)]
    

# Creates a data frame with features for each word in our common word set.
# Each value is the count of the times the word appears in each sentence.
def bow_features(sentences, common_words):
    
    # Scaffold the data frame and initialize counts to zero.
    df = pd.DataFrame(columns=common_words)
    df['text_sentence'] = sentences[0]
    df['text_source'] = sentences[1]
    df.loc[:, common_words] = 0
    
    # Process each row, counting the occurrence of words in each sentence.
    for i, sentence in enumerate(df['text_sentence']):
        
        # Convert the sentence to lemmas, then filter out punctuation,
        # stop words, and uncommon words.
        words = [token.lemma_
                 for token in sentence
                 if (
                     not token.is_punct
                     and not token.is_stop
                     and token.lemma_ in common_words
                 )]
        
        # Populate the row with word counts.
        for word in words:
            df.loc[i, word] += 1
        
        # This counter is just to make sure the kernel didn't hang.
        if i % 50 == 0:
            print("Processing row {}".format(i))
            
    return df

# Set up the bags.
alicewords = bag_of_words(alice_doc)
persuasionwords = bag_of_words(persuasion_doc)

# Combine bags to create a set of unique words.
common_words = set(alicewords + persuasionwords)

In [51]:
# Create our data frame with features. This can take a while to run.
word_counts = bow_features(sentences, common_words)
word_counts.head()

Processing row 0
Processing row 50
Processing row 100
Processing row 150
Processing row 200
Processing row 250
Processing row 300
Processing row 350
Processing row 400


Unnamed: 0,will,Gloucester,toffee,pace,scruple,important,time,spite,steal,lawn,...,funny,wise,planning,totally,Somerset,alternative,prescribe,case,text_sentence,text_source
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,"(Alice, was, beginning, to, get, very, tired, ...",Carroll
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,"(So, she, was, considering, in, her, own, mind...",Carroll
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,"(There, was, nothing, so, VERY, remarkable, in...",Carroll
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,"(Oh, dear, !)",Carroll
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,"(Oh, dear, !)",Carroll


In [89]:
word_counts

Unnamed: 0,will,Gloucester,toffee,pace,scruple,important,time,spite,steal,lawn,...,funny,wise,planning,totally,Somerset,alternative,prescribe,case,text_sentence,text_source
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,"(Alice, was, beginning, to, get, very, tired, ...",Carroll
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,"(So, she, was, considering, in, her, own, mind...",Carroll
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,"(There, was, nothing, so, VERY, remarkable, in...",Carroll
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,"(Oh, dear, !)",Carroll
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,"(Oh, dear, !)",Carroll
5,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,"(I, shall, be, late, !, ')",Carroll
6,0,0,0,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,"((, when, she, thought, it, over, afterwards, ...",Carroll
7,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,"(A, WATCH, OUT, OF, ITS, WAISTCOAT, -, POCKET,...",Carroll
8,0,0,0,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,"(,, Alice, started, to, her, feet, ,, for, it,...",Carroll
9,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,"(In, another, moment, down, went, Alice, after...",Carroll


In [52]:
from sklearn import ensemble
from sklearn.model_selection import train_test_split

rfc = ensemble.RandomForestClassifier()
Y = word_counts['text_source']
X = np.array(word_counts.drop(['text_sentence','text_source'], 1))

X_train, X_test, y_train, y_test = train_test_split(X, 
                                                    Y,
                                                    test_size=0.4,
                                                    random_state=0)
train = rfc.fit(X_train, y_train)

print('Training set score:', rfc.score(X_train, y_train))
print('\nTest set score:', rfc.score(X_test, y_test))



Training set score: 0.981203007518797

Test set score: 0.8370786516853933


In [53]:
from sklearn.linear_model import LogisticRegression

lr = LogisticRegression(penalty='l2') # No need to specify l2 as it's the default. But we put it for demonstration.
train = lr.fit(X_train, y_train)
print(X_train.shape, y_train.shape)
print('Training set score:', lr.score(X_train, y_train))
print('\nTest set score:', lr.score(X_test, y_test))



(266, 1612) (266,)
Training set score: 0.9699248120300752

Test set score: 0.8764044943820225


In [54]:
clf = ensemble.GradientBoostingClassifier()
train = clf.fit(X_train, y_train)

print('Training set score:', clf.score(X_train, y_train))
print('\nTest set score:', clf.score(X_test, y_test))

Training set score: 0.9661654135338346

Test set score: 0.8146067415730337


In [55]:
# Clean the Emma data.
emma = gutenberg.raw('austen-emma.txt')
emma = re.sub(r'VOLUME \w+', '', emma)
emma = re.sub(r'CHAPTER \w+', '', emma)
emma = text_cleaner(emma[:int(len(emma)/60)])
print(emma[:100])

Emma Woodhouse, handsome, clever, and rich, with a comfortable home and happy disposition, seemed to


In [56]:
# Parse our cleaned data.
emma_doc = nlp(emma)

In [57]:
# Group into sentences.
persuasion_sents = [[sent, "Austen"] for sent in persuasion_doc.sents]
emma_sents = [[sent, "Austen"] for sent in emma_doc.sents]

In [58]:
# Build a new Bag of Words data frame for Emma word counts.
# We'll use the same common words from Alice and Persuasion.
emma_sentences = pd.DataFrame(emma_sents)
emma_bow = bow_features(emma_sentences, common_words)

print('done')

Processing row 0
Processing row 50
Processing row 100
Processing row 150
done


In [59]:
# Now we can model it!
# Let's use logistic regression again.

# Combine the Emma sentence data with the Alice data from the test set.
X_Emma_test = np.concatenate((
    X_train[y_train[y_train=='Carroll'].index],
    emma_bow.drop(['text_sentence','text_source'], 1)
), axis=0)
y_Emma_test = pd.concat([y_train[y_train=='Carroll'],
                         pd.Series(['Austen'] * emma_bow.shape[0])])

# Model.
print('\nTest set score:', lr.score(X_Emma_test, y_Emma_test))
lr_Emma_predicted = lr.predict(X_Emma_test)
pd.crosstab(y_Emma_test, lr_Emma_predicted)


Test set score: 0.7235772357723578


col_0,Austen,Carroll
row_0,Unnamed: 1_level_1,Unnamed: 2_level_1
Austen,158,12
Carroll,56,20


challenge

In [60]:
from sklearn import svm
clf = svm.SVC()
clf.fit(X_train, y_train)



SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma='auto_deprecated',
    kernel='rbf', max_iter=-1, probability=False, random_state=None,
    shrinking=True, tol=0.001, verbose=False)

In [61]:
print('Training set score:', clf.score(X_train, y_train))
print('\nTest set score:', clf.score(X_test, y_test))

Training set score: 0.7142857142857143

Test set score: 0.702247191011236


In [62]:
alice_sent_len = [[len(sent), 'Carroll'] for sent in alice_doc.sents]
persuasion_sent_len = [[len(sent), 'Austen'] for sent in persuasion_doc.sents]

In [63]:
sent_len = pd.DataFrame(alice_sent_len + persuasion_sent_len)
sent_len.head(5)

Unnamed: 0,0,1
0,67,Carroll
1,63,Carroll
2,30,Carroll
3,3,Carroll
4,3,Carroll


In [64]:
sent_len.describe()

Unnamed: 0,0
count,444.0
mean,29.225225
std,26.815963
min,1.0
25%,10.75
50%,21.5
75%,39.0
max,187.0


In [65]:
sent_len.columns = ['sent_len', 'author']

all_features = word_counts.join(sent_len)

In [66]:
for sent in alice_doc.sents:
    for token in sent:
        print([token.pos_, token.tag_, 'Carroll'])

['PROPN', 'NNP', 'Carroll']
['VERB', 'VBD', 'Carroll']
['VERB', 'VBG', 'Carroll']
['PART', 'TO', 'Carroll']
['VERB', 'VB', 'Carroll']
['ADV', 'RB', 'Carroll']
['ADJ', 'JJ', 'Carroll']
['ADP', 'IN', 'Carroll']
['VERB', 'VBG', 'Carroll']
['ADP', 'IN', 'Carroll']
['DET', 'PRP$', 'Carroll']
['NOUN', 'NN', 'Carroll']
['ADP', 'IN', 'Carroll']
['DET', 'DT', 'Carroll']
['NOUN', 'NN', 'Carroll']
['PUNCT', ',', 'Carroll']
['CCONJ', 'CC', 'Carroll']
['ADP', 'IN', 'Carroll']
['VERB', 'VBG', 'Carroll']
['NOUN', 'NN', 'Carroll']
['PART', 'TO', 'Carroll']
['VERB', 'VB', 'Carroll']
['PUNCT', ':', 'Carroll']
['ADV', 'RB', 'Carroll']
['CCONJ', 'CC', 'Carroll']
['ADV', 'RB', 'Carroll']
['PRON', 'PRP', 'Carroll']
['VERB', 'VBD', 'Carroll']
['VERB', 'VBN', 'Carroll']
['ADP', 'IN', 'Carroll']
['DET', 'DT', 'Carroll']
['NOUN', 'NN', 'Carroll']
['DET', 'PRP$', 'Carroll']
['NOUN', 'NN', 'Carroll']
['VERB', 'VBD', 'Carroll']
['VERB', 'VBG', 'Carroll']
['PUNCT', ',', 'Carroll']
['CCONJ', 'CC', 'Carroll']
['PRON'

['NOUN', 'NN', 'Carroll']
['PRON', 'PRP', 'Carroll']
['VERB', 'VBD', 'Carroll']
['PART', 'TO', 'Carroll']
['VERB', 'VB', 'Carroll']
['PART', 'RP', 'Carroll']
['ADV', 'RB', 'Carroll']
['PUNCT', '.', 'Carroll']
['DET', 'DT', 'Carroll']
['NOUN', 'NN', 'Carroll']
['PUNCT', 'HYPH', 'Carroll']
['NOUN', 'NN', 'Carroll']
['VERB', 'VBD', 'Carroll']
['ADV', 'RB', 'Carroll']
['ADV', 'RB', 'Carroll']
['ADP', 'IN', 'Carroll']
['DET', 'DT', 'Carroll']
['NOUN', 'NN', 'Carroll']
['ADP', 'IN', 'Carroll']
['DET', 'DT', 'Carroll']
['NOUN', 'NN', 'Carroll']
['PUNCT', ',', 'Carroll']
['CCONJ', 'CC', 'Carroll']
['ADV', 'RB', 'Carroll']
['VERB', 'VBD', 'Carroll']
['ADV', 'RB', 'Carroll']
['ADV', 'RB', 'Carroll']
['PUNCT', ',', 'Carroll']
['ADV', 'RB', 'Carroll']
['ADV', 'RB', 'Carroll']
['ADP', 'IN', 'Carroll']
['PROPN', 'NNP', 'Carroll']
['VERB', 'VBD', 'Carroll']
['ADV', 'RB', 'Carroll']
['DET', 'DT', 'Carroll']
['NOUN', 'NN', 'Carroll']
['PART', 'TO', 'Carroll']
['VERB', 'VB', 'Carroll']
['ADP', 'IN', 'Ca

['ADV', 'RB', 'Carroll']
['ADP', 'IN', 'Carroll']
['DET', 'DT', 'Carroll']
['NOUN', 'NN', 'Carroll']
['ADP', 'IN', 'Carroll']
['DET', 'DT', 'Carroll']
['NOUN', 'NN', 'Carroll']
['PUNCT', '.', 'Carroll']
['VERB', 'VB', 'Carroll']
['PRON', 'PRP', 'Carroll']
['VERB', 'VB', 'Carroll']
['PUNCT', ':', 'Carroll']
['DET', 'DT', 'Carroll']
['VERB', 'MD', 'Carroll']
['VERB', 'VB', 'Carroll']
['NUM', 'CD', 'Carroll']
['NUM', 'CD', 'Carroll']
['NOUN', 'NNS', 'Carroll']
['PART', 'RP', 'Carroll']
['PUNCT', ',', 'Carroll']
['PRON', 'PRP', 'Carroll']
['VERB', 'VBP', 'Carroll']
['PUNCT', "''", 'Carroll']
['PUNCT', '-LRB-', 'Carroll']
['ADP', 'IN', 'Carroll']
['PUNCT', ',', 'Carroll']
['PRON', 'PRP', 'Carroll']
['VERB', 'VBP', 'Carroll']
['PUNCT', ',', 'Carroll']
['PROPN', 'NNP', 'Carroll']
['VERB', 'VBD', 'Carroll']
['VERB', 'VBN', 'Carroll']
['ADJ', 'JJ', 'Carroll']
['NOUN', 'NNS', 'Carroll']
['ADP', 'IN', 'Carroll']
['DET', 'DT', 'Carroll']
['NOUN', 'NN', 'Carroll']
['ADP', 'IN', 'Carroll']
['DET', '

['NOUN', 'NN', 'Carroll']
['PUNCT', ',', 'Carroll']
['PRON', 'PRP', 'Carroll']
['VERB', 'MD', 'Carroll']
['VERB', 'VB', 'Carroll']
['PUNCT', '.', 'Carroll']
['PUNCT', "''", 'Carroll']
['PUNCT', '-LRB-', 'Carroll']
['PROPN', 'NNP', 'Carroll']
['VERB', 'VBD', 'Carroll']
['DET', 'DT', 'Carroll']
['NOUN', 'NN', 'Carroll']
['PUNCT', '.', 'Carroll']
['PUNCT', '-RRB-', 'Carroll']
['PUNCT', "''", 'Carroll']
['PRON', 'PRP', 'Carroll']
['VERB', 'VBP', 'Carroll']
['PRON', 'PRP', 'Carroll']
['AUX', 'MD', 'Carroll']
['VERB', 'VB', 'Carroll']
['DET', 'PRP$', 'Carroll']
['NOUN', 'NN', 'Carroll']
['ADP', 'IN', 'Carroll']
['NOUN', 'NN', 'Carroll']
['ADP', 'IN', 'Carroll']
['NOUN', 'NN', 'Carroll']
['PUNCT', 'HYPH', 'Carroll']
['NOUN', 'NN', 'Carroll']
['PUNCT', '.', 'Carroll']
['PROPN', 'NNP', 'Carroll']
['DET', 'PRP$', 'Carroll']
['NOUN', 'NN', 'Carroll']
['PUNCT', '.', 'Carroll']
['PRON', 'PRP', 'Carroll']
['VERB', 'VBP', 'Carroll']
['PRON', 'PRP', 'Carroll']
['VERB', 'VBD', 'Carroll']
['ADV', 'RB', 

['NOUN', 'NNS', 'Carroll']
['PUNCT', ',', 'Carroll']
['ADV', 'WRB', 'Carroll']
['ADV', 'RB', 'Carroll']
['PRON', 'PRP', 'Carroll']
['VERB', 'VBZ', 'Carroll']
['VERB', 'VBG', 'Carroll']
['PUNCT', '.', 'Carroll']
['PUNCT', "''", 'Carroll']
['PRON', 'PRP', 'Carroll']
['VERB', 'VBD', 'Carroll']
['ADJ', 'JJ', 'Carroll']
['ADP', 'IN', 'Carroll']
['PRON', 'PRP', 'Carroll']
['ADV', 'WRB', 'Carroll']
['PRON', 'PRP', 'Carroll']
['VERB', 'VBD', 'Carroll']
['DET', 'DT', 'Carroll']
['NOUN', 'NN', 'Carroll']
['PUNCT', ',', 'Carroll']
['CCONJ', 'CC', 'Carroll']
['DET', 'DT', 'Carroll']
['PROPN', 'NNP', 'Carroll']
['VERB', 'VBD', 'Carroll']
['ADV', 'RB', 'Carroll']
['ADV', 'RBR', 'Carroll']
['PART', 'TO', 'Carroll']
['VERB', 'VB', 'Carroll']
['VERB', 'VBN', 'Carroll']
['PUNCT', ':', 'Carroll']
['PRON', 'PRP', 'Carroll']
['VERB', 'VBD', 'Carroll']
['PRON', 'PRP', 'Carroll']
['ADP', 'IN', 'Carroll']
['DET', 'DT', 'Carroll']
['ADJ', 'JJ', 'Carroll']
['PUNCT', ',', 'Carroll']
['ADJ', 'JJ', 'Carroll']
['NO

['VERB', 'VB', 'Carroll']
['ADV', 'RB', 'Carroll']
['PUNCT', ',', 'Carroll']
['PUNCT', "''", 'Carroll']
['VERB', 'VBD', 'Carroll']
['ADJ', 'JJ', 'Carroll']
['PROPN', 'NNP', 'Carroll']
['PUNCT', ',', 'Carroll']
['PUNCT', "''", 'Carroll']
['PRON', 'PRP', 'Carroll']
['VERB', 'MD', 'Carroll']
['VERB', 'VB', 'Carroll']
['ADP', 'IN', 'Carroll']
['ADV', 'RB', 'Carroll']
['ADJ', 'JJ', 'Carroll']
['NOUN', 'NN', 'Carroll']
['ADP', 'IN', 'Carroll']
['DET', 'PRP$', 'Carroll']
['NOUN', 'NNS', 'Carroll']
['PUNCT', '.', 'Carroll']
['INTJ', 'UH', 'Carroll']
['PUNCT', ',', 'Carroll']
['ADV', 'WRB', 'Carroll']
['PRON', 'PRP', 'Carroll']
['VERB', 'VBP', 'Carroll']
['PRON', 'PRP', 'Carroll']
['VERB', 'MD', 'Carroll']
['VERB', 'VB', 'Carroll']
['PART', 'RP', 'Carroll']
['ADP', 'IN', 'Carroll']
['DET', 'DT', 'Carroll']
['NOUN', 'NN', 'Carroll']
['PUNCT', '.', 'Carroll']
['PRON', 'PRP', 'Carroll']
['VERB', 'VBP', 'Carroll']
['PRON', 'PRP', 'Carroll']
['VERB', 'MD', 'Carroll']
['PUNCT', ',', 'Carroll']
['ADP'

['ADP', 'IN', 'Carroll']
['PUNCT', ',', 'Carroll']
['ADP', 'IN', 'Carroll']
['PRON', 'PRP', 'Carroll']
['VERB', 'VBP', 'Carroll']
['ADV', 'RB', 'Carroll']
['ADP', 'IN', 'Carroll']
['DET', 'DT', 'Carroll']
['NOUN', 'NN', 'Carroll']
['VERB', 'VBN', 'Carroll']
['PUNCT', "''", 'Carroll']
['NOUN', 'NN', 'Carroll']
['PUNCT', ',', 'Carroll']
['PUNCT', "''", 'Carroll']
['PRON', 'PRP', 'Carroll']
['VERB', 'VBZ', 'Carroll']
['ADV', 'RB', 'Carroll']
['ADJ', 'JJ', 'Carroll']
['PART', 'TO', 'Carroll']
['VERB', 'VB', 'Carroll']
['ADP', 'IN', 'Carroll']
['PRON', 'PRP', 'Carroll']
['PUNCT', ',', 'Carroll']
['ADV', 'RBR', 'Carroll']
['CCONJ', 'CC', 'Carroll']
['ADV', 'RBR', 'Carroll']
['PUNCT', '.', 'Carroll']
['ADV', 'RB', 'Carroll']
['PUNCT', ',', 'Carroll']
['DET', 'DT', 'Carroll']
['NOUN', 'NN', 'Carroll']
['VERB', 'VBD', 'Carroll']
['ADV', 'RB', 'Carroll']
['VERB', 'VBN', 'Carroll']
['PUNCT', "''", 'Carroll']
['NOUN', 'NN', 'Carroll']
['PUNCT', ',', 'Carroll']
['PUNCT', "''", 'Carroll']
['ADV', 'R

['NOUN', 'NN', 'Carroll']
['PUNCT', ',', 'Carroll']
['CCONJ', 'CC', 'Carroll']
['ADV', 'WRB', 'Carroll']
['PRON', 'PRP', 'Carroll']
['VERB', 'VBD', 'Carroll']
['ADV', 'RB', 'Carroll']
['ADP', 'IN', 'Carroll']
['DET', 'DT', 'Carroll']
['NOUN', 'NN', 'Carroll']
['ADP', 'IN', 'Carroll']
['PRON', 'PRP', 'Carroll']
['PUNCT', ',', 'Carroll']
['PRON', 'PRP', 'Carroll']
['VERB', 'VBD', 'Carroll']
['PRON', 'PRP', 'Carroll']
['VERB', 'MD', 'Carroll']
['ADV', 'RB', 'Carroll']
['ADV', 'RB', 'Carroll']
['VERB', 'VB', 'Carroll']
['PRON', 'PRP', 'Carroll']
['PUNCT', ':', 'Carroll']
['PRON', 'PRP', 'Carroll']
['VERB', 'MD', 'Carroll']
['VERB', 'VB', 'Carroll']
['PRON', 'PRP', 'Carroll']
['ADV', 'RB', 'Carroll']
['ADV', 'RB', 'Carroll']
['ADP', 'IN', 'Carroll']
['DET', 'DT', 'Carroll']
['NOUN', 'NN', 'Carroll']
['PUNCT', ',', 'Carroll']
['CCONJ', 'CC', 'Carroll']
['PRON', 'PRP', 'Carroll']
['VERB', 'VBD', 'Carroll']
['PRON', 'PRP', 'Carroll']
['ADJ', 'JJS', 'Carroll']
['PART', 'TO', 'Carroll']
['VERB',

['PRON', 'PRP', 'Carroll']
['VERB', 'VBP', 'Carroll']
['PRON', 'WP', 'Carroll']
['VERB', 'MD', 'Carroll']
['VERB', 'VB', 'Carroll']
['ADP', 'IN', 'Carroll']
['DET', 'PRP$', 'Carroll']
['NOUN', 'NNS', 'Carroll']
['CCONJ', 'CC', 'Carroll']
['NOUN', 'NNS', 'Carroll']
['ADP', 'IN', 'Carroll']
['PRON', 'PRP', 'Carroll']
['ADV', 'RB', 'Carroll']
['PUNCT', ',', 'Carroll']
['NOUN', 'NNS', 'Carroll']
['PUNCT', '.', 'Carroll']
['PRON', 'PRP', 'Carroll']
['VERB', 'VBP', 'Carroll']
['ADJ', 'JJ', 'Carroll']
['ADP', 'IN', 'Carroll']
['PRON', 'PRP', 'Carroll']
['VERB', 'VBP', 'Carroll']
['AUX', 'MD', 'Carroll']
['ADV', 'RB', 'Carroll']
['VERB', 'VB', 'Carroll']
['ADJ', 'JJ', 'Carroll']
['PUNCT', '.', 'Carroll']
['PRON', 'PRP', 'Carroll']
['VERB', 'MD', 'Carroll']
['VERB', 'VB', 'Carroll']
['DET', 'DT', 'Carroll']
['ADJ', 'JJ', 'Carroll']
['NOUN', 'NN', 'Carroll']
['ADV', 'RB', 'Carroll']
['ADV', 'RB', 'Carroll']
['ADV', 'RB', 'Carroll']
['PART', 'TO', 'Carroll']
['VERB', 'VB', 'Carroll']
['PRON', 'PR

['DET', 'DT', 'Carroll']
['ADJ', 'JJ', 'Carroll']
['PUNCT', ',', 'Carroll']
['VERB', 'VBG', 'Carroll']
['NOUN', 'NNS', 'Carroll']


IOPub message rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_msg_rate_limit`.

Current values:
NotebookApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
NotebookApp.rate_limit_window=3.0 (secs)



In [67]:
all_features.shape

(444, 1616)

In [68]:
all_features.text_sentence.head(5)

0    (Alice, was, beginning, to, get, very, tired, ...
1    (So, she, was, considering, in, her, own, mind...
2    (There, was, nothing, so, VERY, remarkable, in...
3                                        (Oh, dear, !)
4                                        (Oh, dear, !)
Name: text_sentence, dtype: object

In [69]:
from sklearn import ensemble
from sklearn.model_selection import train_test_split

rfc = ensemble.RandomForestClassifier()
Y = all_features['text_source']
X = np.array(all_features.drop(['text_sentence','text_source', 'author'], 1))

X_train, X_test, y_train, y_test = train_test_split(X, 
                                                    Y,
                                                    test_size=0.4,
                                                    random_state=0)
train = rfc.fit(X_train, y_train)

print('Training set score:', rfc.score(X_train, y_train))
print('\nTest set score:', rfc.score(X_test, y_test))



Training set score: 0.9849624060150376

Test set score: 0.8370786516853933


In [70]:
from sklearn.linear_model import LogisticRegression

lr = LogisticRegression()
train = lr.fit(X_train, y_train)
print(X_train.shape, y_train.shape)
print('Training set score:', lr.score(X_train, y_train))
print('\nTest set score:', lr.score(X_test, y_test))



(266, 1613) (266,)
Training set score: 0.9699248120300752

Test set score: 0.8651685393258427


In [71]:
clf = ensemble.GradientBoostingClassifier()
train = clf.fit(X_train, y_train)

print('Training set score:', clf.score(X_train, y_train))
print('\nTest set score:', clf.score(X_test, y_test))

Training set score: 0.9699248120300752

Test set score: 0.8033707865168539


In [72]:
from sklearn import svm
clf = svm.SVC()
clf.fit(X_train, y_train)



SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma='auto_deprecated',
    kernel='rbf', max_iter=-1, probability=False, random_state=None,
    shrinking=True, tol=0.001, verbose=False)

In [73]:
print('Training set score:', clf.score(X_train, y_train))
print('\nTest set score:', clf.score(X_test, y_test))

Training set score: 0.7142857142857143

Test set score: 0.702247191011236


In [74]:
# Utility function to calculate how frequently words appear in the text.
def word_freq(text, include_stop=True):
    
    # Build a list of words.
    # Strip out punctuation and, optionally, stop words.
    words = []
    for token in text:
        if not token.is_punct and (not token.is_stop or include_stop):
            words.append(token.text)
            
    # Build and return a Counter object containing word counts.
    return Counter(words)
    
# The most frequent words:
alice_freq = word_frequencies(alice_doc).most_common(10)
persuasion_freq = word_frequencies(persuasion_doc).most_common(10)
print('Alice:', alice_freq)
print('Persuasion:', persuasion_freq)

Alice: [('the', 119), ('she', 94), ('to', 92), ('and', 74), ('it', 68), ('was', 63), ('a', 62), ('I', 54), ('of', 50), ('Alice', 34)]
Persuasion: [('the', 329), ('of', 324), ('and', 276), ('to', 249), ('a', 180), ('in', 153), ('had', 137), ('was', 117), ('her', 95), ('be', 92)]


In [75]:
sentences.head(5)

Unnamed: 0,0,1
0,"(Alice, was, beginning, to, get, very, tired, ...",Carroll
1,"(So, she, was, considering, in, her, own, mind...",Carroll
2,"(There, was, nothing, so, VERY, remarkable, in...",Carroll
3,"(Oh, dear, !)",Carroll
4,"(Oh, dear, !)",Carroll


In [76]:
list(sentences.columns)

[0, 1]

In [77]:
sentences.columns = ['sent_text', 'author']
list(sentences.columns)

['sent_text', 'author']

Curriculum:
Random Forest: 0.987 // 0.890
Logistic Regression: 0.958 // 0.916
Gradient Boost: 0.887 // 0.874

SVM: 0.682 // 0.692

W/ Added Features:
Random Forest: 0.988 // 0.890
Logistic Regression: 0.958 // 0.918
Gradient Boost: 0.887 // 0.871
SVM: 0.682 // 0.692

With one added feature, "sentence length", the accuracy of each of the models did not change in any significant way, which is what would be expected. The largest change among these was the test set of the gradient boost model, which actually decreased by 0.3%. Training accuracy improved very slightly for the random forest model but was unchanged for the other three models. Logistic Regression is currently the model that performs the best, at nearly 92% accuracy.

I will continue to work on adding a part-of-speech feature, which I think will be improve accuracy more considerably. It would seem that the ways in which Jane Austen and Lewis Carroll use different parts of speech would potentially correlate much more strongly to the author than would general sentence length.

challenge 2

In [78]:
# Clean the Emma data.
paradise = gutenberg.raw('milton-paradise.txt')
paradise = re.sub(r'VOLUME \w+', '', paradise)
paradise = re.sub(r'CHAPTER \w+', '', paradise)
paradise = re.sub(r'Book \w+', '', paradise)
paradise = text_cleaner(paradise)
print(paradise[:100])

Of Man's first disobedience, and the fruit Of that forbidden tree whose mortal taste Brought death i


In [79]:
# Parse our cleaned data.
paradise_doc = nlp(paradise)

In [80]:
# Group into sentences.
paradise_sents = [[sent, "Milton"] for sent in paradise_doc.sents]

# Paradise is a little longer than Alice, let's cut it down to the same length as Alice.
paradise_sents = paradise_sents[0:len(alice_sents)]

In [81]:
# Build a new Bag of Words data frame for Persuasian word counts.
# We'll use the same common words from Alice and Persuasion.
paradise_sentences = pd.DataFrame(paradise_sents)
paradise_bow = bow_features(paradise_sentences, common_words)

print('done')

Processing row 0
Processing row 50
Processing row 100
done


In [82]:
# Creating function to add counts to bow features
def bow_counts(bow_df):
    counts = []
    # Counting number of each feature per sentence
    for sent in bow_df.text_sentence:
        len_sent = len(sent)
        advs = 0
        verbs = 0
        nouns = 0
        punc = 0
        for token in sent:
            if token.pos_ == 'ADV':
                advs +=1
            elif token.pos_ == 'VERB':
                verbs +=1
            elif token.pos_ == 'NOUN':
                nouns +=1
            elif token.pos_ == 'PUNCT':
                punc +=1
        # Appending counts to list
        counts.append([len_sent, advs, verbs, nouns, punc])
    # Combining original bow_features df with counts
    df = pd.concat([bow_df, pd.DataFrame(counts, 
                    columns=['sent_length', 'adv_count', 'verb_count', 'noun_count', 'punc_count'])],
                     axis=1)
    return df

In [83]:
paradise_bow.head()

Unnamed: 0,will,Gloucester,toffee,pace,scruple,important,time,spite,steal,lawn,...,funny,wise,planning,totally,Somerset,alternative,prescribe,case,text_sentence,text_source
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,"(Of, Man, 's, first, disobedience, ,, and, the...",Milton
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,"(In, the, beginning, how, the, heavens, and, e...",Milton
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,"(Invoke, thy, aid, to, my, adventurous, song, ...",Milton
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,"(And, chiefly, thou, ,, O, Spirit, ,, that, do...",Milton
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,"(th, ', upright, heart, and, pure, ,)",Milton


Milton vs Carrol

In [84]:
# Identifying variables
X_paradise = paradise_bow.drop(['text_sentence','text_source'], 1)
y_paradise = paradise_bow.text_source

alice_wc = word_counts[word_counts.text_source == 'Carroll']
X_alice = alice_wc.drop(['text_sentence','text_source'], 1)
y_alice = alice_wc.text_source

# Combine the Paradise sentence data with the Alice data from the test set.
X_pa = pd.concat([X_paradise, X_alice], 0)
y_pa = pd.concat([y_paradise, y_alice], 0)

# Split into train and test sets
X_train_pa, X_test_pa, y_train_pa, y_test_pa = train_test_split(X_pa, y_pa, test_size=0.4, random_state=0)

# Model
lr2 = LogisticRegression()
train = lr2.fit(X_train_pa, y_train_pa)
print('Train set score:', lr2.score(X_train_pa, y_train_pa))
print('\nTest set score:', lr2.score(X_test_pa, y_test_pa))
lr_pa_predicted = lr2.predict(X_test_pa)
pd.crosstab(y_test_pa, lr_pa_predicted)




Train set score: 0.9805194805194806

Test set score: 0.7788461538461539


col_0,Carroll,Milton
text_source,Unnamed: 1_level_1,Unnamed: 2_level_1
Carroll,30,21
Milton,2,51


Milton vs austen

In [85]:
# Identifying variables
X_paradise = paradise_bow.drop(['text_sentence','text_source'], 1)
y_paradise = paradise_bow.text_source

persuasion_wc = word_counts[word_counts.text_source == 'Austen']
X_persuasion = persuasion_wc.drop(['text_sentence','text_source'], 1)
y_persuasion = persuasion_wc.text_source

# Combine the Paradise sentence data with the Alice data from the test set.
X_pp = pd.concat([X_paradise, X_persuasion], 0)
y_pp = pd.concat([y_paradise, y_persuasion], 0)

# Split into train and test sets
X_train_pp, X_test_pp, y_train_pp, y_test_pp = train_test_split(X_pp, y_pp, test_size=0.4, random_state=0)

# Model.
lr3 = LogisticRegression()
train = lr3.fit(X_train_pp, y_train_pp)
print('Train set score:', lr3.score(X_train_pp, y_train_pp))
print('\nTest set score:', lr3.score(X_test_pp, y_test_pp))
lr_pp_predicted = lr3.predict(X_test_pp)
pd.crosstab(y_test_pp, lr_pp_predicted)



Train set score: 0.9511278195488722

Test set score: 0.7808988764044944


col_0,Austen,Milton
text_source,Unnamed: 1_level_1,Unnamed: 2_level_1
Austen,120,5
Milton,34,19
