In [1]:
%matplotlib inline
import numpy as np
import pandas as pd
import scipy
import sklearn

import spacy
import matplotlib.pyplot as plt
import seaborn as sns
import re
from nltk.corpus import gutenberg, stopwords
from collections import Counter

from sklearn import ensemble
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression

from sklearn.svm import SVR
from sklearn.model_selection import cross_val_score
from sklearn.svm import SVC

from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import VarianceThreshold

import nltk

  from numpy.core.umath_tests import inner1d


Supervised NLP requires a pre-labelled dataset for training and testing, and is generally interested in categorizing text in various ways. In this case, we are going to try to predict whether a sentence comes from _Alice in Wonderland_ by Lewis Carroll or _Persuasion_ by Jane Austen. We can use any of the supervised models we've covered previously, as long as they allow categorical outcomes. In this case, we'll try Random Forests, SVM, and KNN.

Our feature-generation approach will be something called _BoW_, or _Bag of Words_. BoW is quite simple: For each sentence, we count how many times each word appears. We will then use those counts as features.  

In [2]:
text = 'abc def dghs'
text.split()

['abc', 'def', 'dghs']

In [3]:
text = ' '.join(text.split())
text

'abc def dghs'

In [4]:
# Utility function for standard text cleaning.
def text_cleaner(text):
    # Visual inspection identifies a form of punctuation spaCy does not
    # recognize: the double dash '--'.  Better get rid of it now!
    
    # 'r' = raw string notation for regular expression patterns; backslashes are not handled in any special way 
    # in a string literal prefixed with 'r':
    text = re.sub(r'--',' ',text)
    text = re.sub("[\[].*?[\]]", "", text)
    #text = re.sub("\b[A-Z]{2,}\b","",text)
    text = ' '.join(text.split())
    return text



In [5]:
    
# Load and clean the data.
persuasion = gutenberg.raw('austen-persuasion.txt')
alice = gutenberg.raw('carroll-alice.txt')

In [6]:
persuasion



In [7]:
# The Chapter indicator is idiosyncratic
persuasion = re.sub(r'Chapter \d+', '', persuasion)
alice = re.sub(r'CHAPTER .*', '', alice)


In [8]:
persuasion



In [9]:

alice = text_cleaner(alice)
persuasion = text_cleaner(persuasion)

In [10]:
persuasion



In [11]:
# Parse the cleaned novels. This can take a bit.
nlp = spacy.load('en')
alice_doc = nlp(alice)
persuasion_doc = nlp(persuasion)

In [12]:
# Group into sentences.
alice_sents = [[sent, "Carroll"] for sent in alice_doc.sents]
persuasion_sents = [[sent, "Austen"] for sent in persuasion_doc.sents]

In [13]:
persuasion_sents

[[Sir Walter Elliot, of Kellynch Hall, in Somersetshire, was a man who, for his own amusement, never took up any book but the Baronetage; there he found occupation for an idle hour, and consolation in a distressed one; there his faculties were roused into admiration and respect, by contemplating the limited remnant of the earliest patents; there any unwelcome sensations, arising from domestic affairs changed naturally into pity and contempt as he turned over the almost endless creations of the last century; and there, if every other leaf were powerless, he could read his own history with an interest which never failed.,
  'Austen'],
 [This was the page at which the favourite volume always opened: "ELLIOT OF KELLYNCH HALL. ",
  'Austen'],
 [Walter Elliot, born March 1, 1760, married, July 15, 1784, Elizabeth, daughter of James Stevenson, Esq.,
  'Austen'],
 [of South Park, in the county of Gloucester, by which lady (who died 1800) he has issue Elizabeth, born June 1, 1785; Anne, born Au

In [14]:


# Combine the sentences from the two novels into one data frame.
sentences = pd.DataFrame(alice_sents + persuasion_sents)
sentences#.head()

Unnamed: 0,0,1
0,"(Alice, was, beginning, to, get, very, tired, ...",Carroll
1,"(So, she, was, considering, in, her, own, mind...",Carroll
2,"(There, was, nothing, so, VERY, remarkable, in...",Carroll
3,"(Oh, dear, !)",Carroll
4,"(I, shall, be, late, !, ')",Carroll
5,"((, when, she, thought, it, over, afterwards, ...",Carroll
6,"(In, another, moment, down, went, Alice, after...",Carroll
7,"(The, rabbit, -, hole, went, straight, on, lik...",Carroll
8,"(Either, the, well, was, very, deep, ,, or, sh...",Carroll
9,"(First, ,, she, tried, to, look, down, and, ma...",Carroll


Time to bag some words!  Since spaCy has already tokenized and labelled our data, we can move directly to recording how often various words occur.  We will exclude stopwords and punctuation.  In addition, in an attempt to keep our feature space from exploding, we will work with lemmas (root words) rather than the raw text terms, and we'll only use the 2000 most common words for each text.

In [15]:
# Utility function to create a list of the 2000 most common words.
def bag_of_words(text):
    
    # Filter out punctuation and stop words.
    allwords = [token.lemma_
                for token in text
                if not token.is_punct
                and not token.is_stop]
    
    # Return the most common words.
    return [item[0] for item in Counter(allwords).most_common(2000)]
    

In [16]:
doc = nlp(u'Apple is looking at buying U.K. startup for $1 billion')

In [17]:
bag_of_words(doc)

['look', '$', '1', 'u.k.', 'billion', 'apple', 'startup', 'buy']

In [18]:
# Set up the bags.
alicewords = bag_of_words(alice_doc)
persuasionwords = bag_of_words(persuasion_doc)

In [19]:
persuasionwords

['-PRON-',
 'anne',
 "'s",
 'captain',
 'elliot',
 'mrs',
 'good',
 'know',
 'think',
 'mr',
 'lady',
 'wentworth',
 'the',
 'say',
 'come',
 'little',
 'time',
 'charles',
 'man',
 'look',
 'great',
 'see',
 'go',
 'sir',
 'russell',
 'walter',
 'mary',
 'feel',
 'musgrove',
 'miss',
 'find',
 'soon',
 'father',
 'hear',
 'friend',
 'louisa',
 'but',
 'leave',
 'wish',
 'have',
 'place',
 'long',
 'bath',
 'speak',
 'day',
 'like',
 'room',
 'house',
 'feeling',
 'sister',
 'young',
 'woman',
 'elizabeth',
 'family',
 'talk',
 'walk',
 'year',
 'give',
 'moment',
 'way',
 'home',
 'want',
 'away',
 'uppercross',
 'manner',
 'harville',
 'happy',
 'henrietta',
 'kellynch',
 'believe',
 'take',
 'sure',
 'begin',
 'present',
 'party',
 'there',
 'benwick',
 'admiral',
 'tell',
 'smith',
 'mind',
 'mean',
 'lyme',
 'clay',
 'return',
 'thing',
 'a',
 'hour',
 'and',
 'acquaintance',
 'love',
 'hope',
 'half',
 'child',
 'bring',
 'morning',
 'meet',
 'pass',
 'croft',
 'dear',
 'eye',
 '

In [20]:
df = pd.DataFrame(columns=persuasionwords)
df['text_sentence'] = sentences[0]
df['text_source'] = sentences[1]

In [21]:
df

Unnamed: 0,-PRON-,anne,'s,captain,elliot,mrs,good,know,think,mr,...,apologize,ladyship,hazard,roof,inevitable,smoke,compatible,heighten,text_sentence,text_source
0,,,,,,,,,,,...,,,,,,,,,"(Alice, was, beginning, to, get, very, tired, ...",Carroll
1,,,,,,,,,,,...,,,,,,,,,"(So, she, was, considering, in, her, own, mind...",Carroll
2,,,,,,,,,,,...,,,,,,,,,"(There, was, nothing, so, VERY, remarkable, in...",Carroll
3,,,,,,,,,,,...,,,,,,,,,"(Oh, dear, !)",Carroll
4,,,,,,,,,,,...,,,,,,,,,"(I, shall, be, late, !, ')",Carroll
5,,,,,,,,,,,...,,,,,,,,,"((, when, she, thought, it, over, afterwards, ...",Carroll
6,,,,,,,,,,,...,,,,,,,,,"(In, another, moment, down, went, Alice, after...",Carroll
7,,,,,,,,,,,...,,,,,,,,,"(The, rabbit, -, hole, went, straight, on, lik...",Carroll
8,,,,,,,,,,,...,,,,,,,,,"(Either, the, well, was, very, deep, ,, or, sh...",Carroll
9,,,,,,,,,,,...,,,,,,,,,"(First, ,, she, tried, to, look, down, and, ma...",Carroll


In [22]:
df.loc[:, persuasionwords]=0

In [23]:
df

Unnamed: 0,-PRON-,anne,'s,captain,elliot,mrs,good,know,think,mr,...,apologize,ladyship,hazard,roof,inevitable,smoke,compatible,heighten,text_sentence,text_source
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,"(Alice, was, beginning, to, get, very, tired, ...",Carroll
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,"(So, she, was, considering, in, her, own, mind...",Carroll
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,"(There, was, nothing, so, VERY, remarkable, in...",Carroll
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,"(Oh, dear, !)",Carroll
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,"(I, shall, be, late, !, ')",Carroll
5,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,"((, when, she, thought, it, over, afterwards, ...",Carroll
6,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,"(In, another, moment, down, went, Alice, after...",Carroll
7,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,"(The, rabbit, -, hole, went, straight, on, lik...",Carroll
8,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,"(Either, the, well, was, very, deep, ,, or, sh...",Carroll
9,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,"(First, ,, she, tried, to, look, down, and, ma...",Carroll


In [24]:
list(enumerate(df['text_sentence']))

[(0,
  Alice was beginning to get very tired of sitting by her sister on the bank, and of having nothing to do: once or twice she had peeped into the book her sister was reading, but it had no pictures or conversations in it, 'and what is the use of a book,' thought Alice 'without pictures or conversation?'),
 (1,
  So she was considering in her own mind (as well as she could, for the hot day made her feel very sleepy and stupid), whether the pleasure of making a daisy-chain would be worth the trouble of getting up and picking the daisies, when suddenly a White Rabbit with pink eyes ran close by her.),
 (2,
  There was nothing so VERY remarkable in that; nor did Alice think it so VERY much out of the way to hear the Rabbit say to itself, 'Oh dear!),
 (3, Oh dear!),
 (4, I shall be late!'),
 (5,
  (when she thought it over afterwards, it occurred to her that she ought to have wondered at this, but at the time it all seemed quite natural); but when the Rabbit actually TOOK A WATCH OUT OF

In [25]:


# Creates a data frame with features for each word in our common word set.
# Each value is the count of the times the word appears in each sentence.
def bow_features(sentences, common_words):
    
    # Scaffold the data frame and initialize counts to zero.
    df = pd.DataFrame(columns=common_words)
    df['text_sentence'] = sentences[0]
    df['text_source'] = sentences[1]
    # fill values with '0'
    df.loc[:, common_words] = 0
    
    # Process each row, counting the occurrence of words in each sentence.
    for i, sentence in enumerate(df['text_sentence']):
        
        # Convert the sentence to lemmas, then filter out punctuation,
        # stop words, and uncommon words.
        words = [token.lemma_
                 for token in sentence
                 if (
                     not token.is_punct
                     and not token.is_stop
                     and token.lemma_ in common_words
                 )]
        
        # Populate the row with word counts.
        for word in words:
            df.loc[i, word] += 1 # .loc[rows,columns] > columns have been tokenized w/ prev function!
        
        # This counter is just to make sure the kernel didn't hang.
        if i % 500 == 0:
            print("Processing row {}".format(i))
            
    return df



# Combine bags to create a set of unique words.
common_words = set(alicewords + persuasionwords)

In [26]:
# Create our data frame with features. This can take a while to run.
import time
start_time = time.time()

word_counts = bow_features(sentences, common_words)
word_counts.head()

print("\n--- %s seconds ---" % (time.time() - start_time))

Processing row 0
Processing row 500
Processing row 1000
Processing row 1500
Processing row 2000
Processing row 2500
Processing row 3000
Processing row 3500
Processing row 4000
Processing row 4500
Processing row 5000

--- 91923.11640262604 seconds ---


## Trying out BoW

Now let's give the bag of words features a whirl by trying a random forest.

In [27]:
from sklearn import ensemble
from sklearn.model_selection import train_test_split

rfc = ensemble.RandomForestClassifier()
Y = word_counts['text_source']
X = word_counts.drop(['text_sentence','text_source'], 1)

X_train, X_test, y_train, y_test = train_test_split(X, 
                                                    Y,
                                                    test_size=0.4,
                                                    random_state=0)
train = rfc.fit(X_train, y_train)

print('Training set score:', rfc.score(X_train, y_train))
print('\nTest set score:', rfc.score(X_test, y_test))

Training set score: 0.9890282131661442

Test set score: 0.8829887218045113


Holy overfitting, Batman! Overfitting is a known problem when using bag of words, since it basically involves throwing a massive number of features at a model – some of those features (in this case, word frequencies) will capture noise in the training set. Since overfitting is also a known problem with Random Forests, the divergence between training score and test score is expected.


## BoW with Logistic Regression

Let's try a technique with some protection against overfitting due to extraneous features – logistic regression with ridge regularization (from ridge regression, also called L2 regularization).

In [28]:
from sklearn.linear_model import LogisticRegression

lr = LogisticRegression()
train = lr.fit(X_train, y_train)
print(X_train.shape, y_train.shape)
print('Training set score:', lr.score(X_train, y_train))
print('\nTest set score:', lr.score(X_test, y_test))

(3190, 3057) (3190,)
Training set score: 0.957680250783699

Test set score: 0.9163533834586466


Logistic regression performs a bit better than the random forest.  

# BoW with Gradient Boosting

And finally, let's see what gradient boosting can do:

In [29]:
clf = ensemble.GradientBoostingClassifier()
train = clf.fit(X_train, y_train)

print('Training set score:', clf.score(X_train, y_train))
print('\nTest set score:', clf.score(X_test, y_test))

Training set score: 0.886833855799373

Test set score: 0.8735902255639098


Looks like logistic regression is the winner, but there's room for improvement.

# Same model, new inputs

What if we feed the model a different novel by Jane Austen, like _Emma_?  Will it be able to distinguish Austen from Carroll with the same level of accuracy if we insert a different sample of Austen's writing?

First, we need to process _Emma_ the same way we processed the other data, and combine it with the Alice data:

In [30]:
# Clean the Emma data.
emma = gutenberg.raw('austen-emma.txt')
emma = re.sub(r'VOLUME \w+', '', emma)
emma = re.sub(r'CHAPTER \w+', '', emma)
emma = text_cleaner(emma)
print(emma[:100])

Emma Woodhouse, handsome, clever, and rich, with a comfortable home and happy disposition, seemed to


In [31]:
import time
start_time = time.time()

# Parse our cleaned data.
emma_doc = nlp(emma)

print("\n--- %s seconds ---" % (time.time() - start_time))


--- 107.7531247138977 seconds ---


In [32]:
# Group into sentences.
persuasion_sents = [[sent, "Austen"] for sent in persuasion_doc.sents]
emma_sents = [[sent, "Austen"] for sent in emma_doc.sents]

# Emma is quite long, let's cut it down to the same length as Alice.
emma_sents = emma_sents[0:len(alice_sents)]

In [33]:
# Build a new Bag of Words data frame for Emma word counts.
# We'll use the same common words from Alice and Persuasion.

emma_sentences = pd.DataFrame(emma_sents) #1 second

import time
start_time = time.time()

emma_bow = bow_features(emma_sentences, common_words)

print("\n--- %s seconds ---" % (time.time() - start_time))
print('done')

Processing row 0
Processing row 500
Processing row 1000
Processing row 1500

--- 1456.6673557758331 seconds ---
done


In [34]:
print(emma_bow.shape)
emma_bow.head()

(1669, 3059)


Unnamed: 0,pleasant,instinctively,shore,baldwin,skurri,blind,apparently,safety,gowland,ann,...,glad,longer,approach,eye,rude,be,uppercross,sleep,text_sentence,text_source
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,"(Emma, Woodhouse, ,, handsome, ,, clever, ,, a...",Austen
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,"(She, was, the, youngest, of, the, two, daught...",Austen
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,"(Her, mother, had, died, too, long, ago, for, ...",Austen
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,"(Sixteen, years, had, Miss, Taylor, been, in, ...",Austen
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,"(Between, _, them)",Austen


In [35]:
# Now we can model it!
# Let's use logistic regression again.

# Combine the Emma sentence data with the Alice data from the test set.
X_Emma_test = np.concatenate((
    X_train.loc[y_train[y_train=='Carroll'].index],
    emma_bow.drop(['text_sentence','text_source'], 1)
), axis=0)
y_Emma_test = pd.concat([y_train[y_train=='Carroll'],
                         pd.Series(['Austen'] * emma_bow.shape[0])]
                       , axis = 0
                       )



In [36]:
# Model.
print('\nTest set score:', lr.score(X_Emma_test, y_Emma_test))
lr_Emma_predicted = lr.predict(X_Emma_test)
pd.crosstab(y_Emma_test, lr_Emma_predicted)


Test set score: 0.9161073825503355


col_0,Austen,Carroll
row_0,Unnamed: 1_level_1,Unnamed: 2_level_1
Austen,1563,106
Carroll,119,894


In [37]:
lr_Emma_predicted

array(['Carroll', 'Carroll', 'Austen', ..., 'Austen', 'Austen', 'Austen'],
      dtype=object)

Well look at that!  NLP approaches are generally effective on the same type of material as they were trained on. It looks like this model is actually able to differentiate multiple works by Austen from Alice in Wonderland.  Now the question is whether the model is very good at identifying Austen, or very good at identifying Alice in Wonderland, or both...

# Challenge 0:

Recall that the logistic regression model's best performance on the test set was 93%.  See what you can do to improve performance.  Suggested avenues of investigation include: 
- Other modeling techniques (SVM?), 
- making more features that take advantage of the spaCy information (include grammar, phrases, POS, etc), 
- making sentence-level features (number of words, amount of punctuation), or 
- including contextual information (length of previous and next sentences, 
- words repeated from one sentence to the next, etc), 
- and anything else your heart desires.  

Make sure to design your models on the test set, or use cross_validation with multiple folds, and see if you can get accuracy above 90%.  

# Challenge 1:
Find out whether your new model is good at identifying Alice in Wonderland vs any other work, Persuasion vs any other work, or Austen vs any other work.  This will involve pulling a new book from the Project Gutenberg corpus (print(gutenberg.fileids()) for a list) and processing it.

Record your work for each challenge in a notebook and submit it below.

-----------

Using list comprehensions, I will add features for the number of words in the sentence, the average word length, the number of stop words, and the number of quotation marks. These are all features that could potentially differentiate one author's writing style from another.

In [38]:
def create_spacy_features(df):

    df['sentence_length'] = df['text_sentence'].apply(lambda sentence: len(sentence))
    df['avg_word_length'] = df['text_sentence'].apply(
        lambda sentence: sum(len(word) for word in sentence)/len(sentence))
    
    def sum_stops (sentence):
        sum_stops = []
        sum_stops = [sum_stops.append(token) for token in sentence if token.is_stop]
        return len(sum_stops)
    
    def sum_quote (sentence):
        sum_quote = []
        sum_quote = [sum_quote.append(token) for token in sentence if token.is_quote]
        return len(sum_quote)
        

    df['num_stop'] = df['text_sentence'].apply(lambda sentence: sum_stops(sentence))
    df['num_quotes'] = df['text_sentence'].apply(lambda sentence: sum_quote(sentence))
        
    print(df.head())
    
    return(df)

alice_persuasion_spacy = create_spacy_features(word_counts)

  pleasant instinctively shore baldwin skurri blind apparently safety gowland  \
0        0             0     0       0      0     0          0      0       0   
1        0             0     0       0      0     0          0      0       0   
2        0             0     0       0      0     0          0      0       0   
3        0             0     0       0      0     0          0      0       0   
4        0             0     0       0      0     0          0      0       0   

  ann    ...     rude be uppercross sleep  \
0   0    ...        0  0          0     0   
1   0    ...        0  0          0     0   
2   0    ...        0  0          0     0   
3   0    ...        0  0          0     0   
4   0    ...        0  0          0     0   

                                       text_sentence text_source  \
0  (Alice, was, beginning, to, get, very, tired, ...     Carroll   
1  (So, she, was, considering, in, her, own, mind...     Carroll   
2  (There, was, nothing, so, VERY, rem

In [39]:
alice_persuasion_spacy

Unnamed: 0,pleasant,instinctively,shore,baldwin,skurri,blind,apparently,safety,gowland,ann,...,rude,be,uppercross,sleep,text_sentence,text_source,sentence_length,avg_word_length,num_stop,num_quotes
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,"(Alice, was, beginning, to, get, very, tired, ...",Carroll,67,3.656716,37,4
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,"(So, she, was, considering, in, her, own, mind...",Carroll,63,3.730159,32,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,"(There, was, nothing, so, VERY, remarkable, in...",Carroll,33,3.393939,18,1
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,"(Oh, dear, !)",Carroll,3,2.333333,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,"(I, shall, be, late, !, ')",Carroll,6,2.333333,1,1
5,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,"((, when, she, thought, it, over, afterwards, ...",Carroll,126,3.650794,70,0
6,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,"(In, another, moment, down, went, Alice, after...",Carroll,23,3.869565,15,0
7,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,"(The, rabbit, -, hole, went, straight, on, lik...",Carroll,44,4.113636,22,0
8,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,"(Either, the, well, was, very, deep, ,, or, sh...",Carroll,37,3.486486,23,0
9,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,"(First, ,, she, tried, to, look, down, and, ma...",Carroll,49,3.591837,31,0




Let's try iterating through multiple C values for the logistic regressor (`l2`).


In [40]:
def run_logistic(df, c):
    
    Y = df['text_source']
    X = df.drop(['text_sentence','text_source'], 1)
    
    X_dummies = pd.get_dummies(X)
    X_train, X_test, y_train, y_test = train_test_split(X_dummies, 
                                                        Y,
                                                        test_size=0.4,
                                                        random_state=0)
    print('C = {c}'.format(c=c))
    lr = LogisticRegression(C=c, penalty='l2', random_state = 1)
    lr.fit(X_train, y_train)
    print('Training set score:', lr.score(X_train, y_train))
    print('Test set score:', lr.score(X_test, y_test))

    y_pred = lr.predict(X_test)
    cm = pd.crosstab(y_test, y_pred)
    print('\n')
    print(cm)
    print('\n')

In [41]:
import time
start_time = time.time()

c_values = [1e-1, 1, 3, 5, 10, 100]
# Inverse of regularization strength; must be a positive float. 
# Like in support vector machines, smaller values specify stronger regularization.
for c in c_values:
    run_logistic(alice_persuasion_spacy, c)
    
print("\n--- %s seconds ---" % (time.time() - start_time))    

C = 0.1
Training set score: 0.9238244514106583
Test set score: 0.9055451127819549


col_0        Austen  Carroll
text_source                 
Austen         1417       55
Carroll         146      510


C = 1
Training set score: 0.974294670846395
Test set score: 0.9191729323308271


col_0        Austen  Carroll
text_source                 
Austen         1417       55
Carroll         117      539


C = 3
Training set score: 0.9849529780564263
Test set score: 0.918233082706767


col_0        Austen  Carroll
text_source                 
Austen         1411       61
Carroll         113      543


C = 5
Training set score: 0.9884012539184953
Test set score: 0.9163533834586466


col_0        Austen  Carroll
text_source                 
Austen         1407       65
Carroll         113      543


C = 10
Training set score: 0.990282131661442
Test set score: 0.9135338345864662


col_0        Austen  Carroll
text_source                 
Austen         1403       69
Carroll         115      541




Some improvement. The best score was with C = 1, score = 91.91, but not over 93.

Let's try using lasso regression:

In [42]:


def run_lasso(df, c):
    
    Y = df['text_source']
    X = df.drop(['text_sentence','text_source'], 1)
    
    X_dummies = pd.get_dummies(X)
    X_train, X_test, y_train, y_test = train_test_split(X_dummies, 
                                                        Y,
                                                        test_size=0.4,
                                                        random_state=0)
    print('C = {c}'.format(c=c))
    lr = LogisticRegression(C=c, penalty='l1', random_state = 1)
    lr.fit(X_train, y_train)
    print('Training set score:', lr.score(X_train, y_train))
    print('Test set score:', lr.score(X_test, y_test))

    y_pred = lr.predict(X_test)
    cm = pd.crosstab(y_test, y_pred)
    print('\n')
    print(cm)
    print('\n')
    
for c in c_values:
    run_lasso(alice_persuasion_spacy, c)



C = 0.1
Training set score: 0.8564263322884013
Test set score: 0.8543233082706767


col_0        Austen  Carroll
text_source                 
Austen         1386       86
Carroll         224      432


C = 1
Training set score: 0.9391849529780564
Test set score: 0.9069548872180451


col_0        Austen  Carroll
text_source                 
Austen         1412       60
Carroll         138      518


C = 3
Training set score: 0.9821316614420063
Test set score: 0.9088345864661654


col_0        Austen  Carroll
text_source                 
Austen         1404       68
Carroll         126      530


C = 5
Training set score: 0.986833855799373
Test set score: 0.9093045112781954


col_0        Austen  Carroll
text_source                 
Austen         1403       69
Carroll         124      532


C = 10
Training set score: 0.9893416927899686
Test set score: 0.9050751879699248


col_0        Austen  Carroll
text_source                 
Austen         1397       75
Carroll         127      529




Turns out lasso regression does not perform as well as ridge, the highest testing score was 0.9093 with default C=5. This is slightly lower than the highest ridge score 0.9196.

Next, I will try using a __support vector classifier__ to distinguish between Carroll and Austen's writing styles. I will iterate through several values of the penalty parameter C and pick the best score. I will continue using the spaCy feature set since it produced a better score in logistic regression.


In [43]:
def run_svc(df, c):
    print('C = {c}'.format(c=c))
    
    Y = df['text_source']
    X = df.drop(['text_sentence','text_source'], 1)
    X_dummies = pd.get_dummies(X)

    X_train, X_test, y_train, y_test = train_test_split(X_dummies, 
                                                        Y,
                                                        test_size=0.4,
                                                        random_state=0)
    svc = SVC(C = c)
    svc.fit(X_train, y_train)
    print('Training set score:', svc.score(X_train, y_train))
    print('Test set score:', svc.score(X_test, y_test))

    y_pred = svc.predict(X_test)
    cm = pd.crosstab(y_test, y_pred)
    print(cm)
    print('\n')


import time
start_time = time.time()

c_values = [1e-3, 1e-2, 1e-1, 1, 100, 1000]
for c in c_values:
    run_svc(alice_persuasion_spacy, c)
    
print("\n--- %s seconds ---" % (time.time() - start_time))       

C = 0.001
Training set score: 0.6824451410658308
Test set score: 0.6917293233082706
col_0        Austen
text_source        
Austen         1472
Carroll         656


C = 0.01
Training set score: 0.6824451410658308
Test set score: 0.6917293233082706
col_0        Austen
text_source        
Austen         1472
Carroll         656


C = 0.1
Training set score: 0.6824451410658308
Test set score: 0.6917293233082706
col_0        Austen
text_source        
Austen         1472
Carroll         656


C = 1
Training set score: 0.6893416927899687
Test set score: 0.6997180451127819
col_0        Austen  Carroll
text_source                 
Austen         1472        0
Carroll         639       17


C = 100
Training set score: 0.925705329153605
Test set score: 0.9111842105263158
col_0        Austen  Carroll
text_source                 
Austen         1428       44
Carroll         145      511


C = 1000
Training set score: 0.9746081504702194
Test set score: 0.912124060150376
col_0        Austen  Carro



Up to a certain value of C, all the training/testing scores are the same, and they are not very good. For these values, the testing score is 0.69. Once C goes up to C=100, the score goes up to 0.91, and at C=1000, the score is 0.9129. This isn't bad, but doesn't beat the logistic regression score of 0.9196.

Let's try some forms of __feature selection__ to see if reducing the features down from ~3000 will help the score.


In [44]:


k_values = [500,1000,1500,2000,2500]

Y = alice_persuasion_spacy['text_source']
X = alice_persuasion_spacy.drop(['text_sentence','text_source'], 1)

import time
start_time = time.time()


for k in k_values:
    print('k:', k)
    kb = SelectKBest(k=k)
    k_reduced = kb.fit_transform(X,y=Y)

    X_train_k, X_test_k, y_train_k, y_test_k = train_test_split(k_reduced, 
                                                        Y,
                                                        test_size=0.4,
                                                        random_state=0)

    lr = LogisticRegression()
    lr.fit(X_train_k, y_train_k)
    print('Training set score:', lr.score(X_train_k, y_train_k))
    print('Test set score:', lr.score(X_test_k, y_test_k))

    y_pred = lr.predict(X_test_k)
    cm = pd.crosstab(y_test_k, y_pred)
    print(cm)
    print('\n')


print("\n--- %s seconds ---" % (time.time() - start_time))   

k: 500
Training set score: 0.9285266457680251
Test set score: 0.9215225563909775
col_0        Austen  Carroll
text_source                 
Austen         1427       45
Carroll         122      534


k: 1000
Training set score: 0.9426332288401253
Test set score: 0.9252819548872181
col_0        Austen  Carroll
text_source                 
Austen         1429       43
Carroll         116      540


k: 1500
Training set score: 0.9445141065830721
Test set score: 0.9257518796992481
col_0        Austen  Carroll
text_source                 
Austen         1424       48
Carroll         110      546


k: 2000
Training set score: 0.9482758620689655
Test set score: 0.9243421052631579
col_0        Austen  Carroll
text_source                 
Austen         1423       49
Carroll         112      544


k: 2500
Training set score: 0.9532915360501567
Test set score: 0.9234022556390977
col_0        Austen  Carroll
text_source                 
Austen         1425       47
Carroll         116      540







When using SelectKBest to reduce the feature set to the 1000 best features, we improve the score from 0.9224 to 0.9252 ! This was achieved using the default value of C=1.

Let's try one more method of __feature selection__ before moving on to the next challenge.


In [45]:
var = VarianceThreshold(threshold=(.999 * (1 - .999)))
var_reduced = var.fit_transform(X)
print(var_reduced.shape)

X_train_var, X_test_var, y_train_var, y_test_var = train_test_split(var_reduced, 
                                                    Y,
                                                    test_size=0.4,
                                                    random_state=0)

lr = LogisticRegression()
lr.fit(X_train_var, y_train_var)
print('Training set score:', lr.score(X_train_var, y_train_var))
print('Test set score:', lr.score(X_test_var, y_test_var))

y_pred = lr.predict(X_test_var)
cm = pd.crosstab(y_test_var, y_pred)
print(cm)
print('\n')

(5318, 1429)
Training set score: 0.9523510971786834
Test set score: 0.9219924812030075
col_0        Austen  Carroll
text_source                 
Austen         1422       50
Carroll         116      540




In [46]:
var.fit_transform(X)

array([[ 0.        ,  0.        ,  0.        , ...,  3.65671642,
        37.        ,  4.        ],
       [ 0.        ,  0.        ,  0.        , ...,  3.73015873,
        32.        ,  0.        ],
       [ 0.        ,  0.        ,  0.        , ...,  3.39393939,
        18.        ,  1.        ],
       ...,
       [ 0.        ,  0.        ,  0.        , ...,  4.14285714,
        16.        ,  0.        ],
       [ 0.        ,  0.        ,  0.        , ...,  4.24390244,
        20.        ,  0.        ],
       [ 0.        ,  0.        ,  0.        , ...,  5.        ,
         0.        ,  0.        ]])



With a score of 0.9219, variance threshold produces results that are good, but not better than SelectKBest.

Let's see if using k=1000 with SelectKBest can improve the SVC classifier.


In [47]:
print('k: 1000')
kb = SelectKBest(k=1000)
k_reduced = kb.fit_transform(X,y=Y)


import time
start_time = time.time()

for c in c_values:
    print('C = {c}'.format(c=c))
    
    X_dummies = pd.get_dummies(X)

    X_train, X_test, y_train, y_test = train_test_split(k_reduced, 
                                                        Y,
                                                        test_size=0.4,
                                                        random_state=0)
    svc = SVC(C = c)
    svc.fit(X_train, y_train)
    print('Training set score:', svc.score(X_train, y_train))
    print('Test set score:', svc.score(X_test, y_test))

    y_pred = svc.predict(X_test)
    cm = pd.crosstab(y_test, y_pred)
    print(cm)
    print('\n')
    

print("\n--- %s seconds ---" % (time.time() - start_time))     

k: 1000
C = 0.001
Training set score: 0.6824451410658308
Test set score: 0.6917293233082706
col_0        Austen
text_source        
Austen         1472
Carroll         656


C = 0.01
Training set score: 0.6824451410658308
Test set score: 0.6917293233082706
col_0        Austen
text_source        
Austen         1472
Carroll         656


C = 0.1
Training set score: 0.6824451410658308
Test set score: 0.6917293233082706
col_0        Austen
text_source        
Austen         1472
Carroll         656


C = 1
Training set score: 0.7749216300940439
Test set score: 0.7824248120300752
col_0        Austen  Carroll
text_source                 
Austen         1375       97
Carroll         366      290


C = 100
Training set score: 0.936050156739812
Test set score: 0.9149436090225563
col_0        Austen  Carroll
text_source                 
Austen         1418       54
Carroll         127      529


C = 1000
Training set score: 0.9664576802507837
Test set score: 0.9168233082706767
col_0        Aust

These scores look good! For some unexplicable reason.

Even better! However, this does not beat our score of 0.9297 using k=1500 with logistic regression. Now, let's move on to the next challenge.


## Challenge 1:

Find out whether your new model is good at identifying Alice in Wonderland vs any other work, Persuasion vs any other work, or Austen vs any other work. This will involve pulling a new book from the Project Gutenberg corpus (print(gutenberg.fileids()) for a list) and processing it. Record your work for each challenge in a notebook and submit it below.

Let's see whether we can distinguish sentences from another novel. We will use Chesterton's "The Ballad of the White Horse."

Import the text and get rid of the chapter titles (denoted with roman numerals).


In [48]:


bryant = gutenberg.raw('bryant-stories.txt')
# bryant = re.sub(r'^[IVXLMC]+[.][A-Z ]+$','', chesball, flags=re.MULTILINE)



In [49]:
bryant



In [50]:
def text_cleaner2(text):
    # Visual inspection identifies a form of punctuation spaCy does not
    # recognize: the double dash '--'.  Better get rid of it now!
    
    # 'r' = raw string notation for regular expression patterns; backslashes are not handled in any special way 
    # in a string literal prefixed with 'r':
    text = re.sub(r'--',' ',text)
    text = re.sub("[\[].*?[\]]", "", text)
    text = re.sub(r"\b[A-Z]{2,}\b","",text) #Delete chapter names: At least two words in capital letters!
    text = ' '.join(text.split())
    return text

In [51]:
bryant = text_cleaner2(bryant)
print(bryant[:2000])

There's a garden that I ken, Full of little gentlemen; Little caps of blue they wear, And green ribbons, very fair. (Flax.) From house to house he goes, A messenger small and slight, And whether it rains or snows, He sleeps outside in the night. (The path.) Once there was a little yellow Tulip, and she lived down in a little dark house under the ground. One day she was sitting there, all by herself, and it was very still. Suddenly, she heard a little _tap, tap, tap_, at the door. "Who is that?" she said. "It's the Rain, and I want to come in," said a soft, sad, little voice. "No, you can't come in," the little Tulip said. By and by she heard another little _tap, tap, tap_ on the window-pane. "Who is there?" she said. The same soft little voice answered, "It's the Rain, and I want to come in!" "No, you can't come in," said the little Tulip. Then it was very still for a long time. At last, there came a little rustling, whispering sound, all round the window: _rustle, whisper, whisper_. "

In [52]:
import time
start_time = time.time()

bryant_doc = nlp(bryant)

bryant_sents = [[sent, "Bryant"] for sent in bryant_doc.sents]
bryant_all_sentences = pd.DataFrame(bryant_sents + alice_sents)

bryantwords = bag_of_words(bryant_doc)

bryant_all_words = set(bryantwords + alicewords)

bryant_all_word_counts = bow_features(bryant_all_sentences, bryant_all_words)
print(bryant_all_word_counts.head())


print("\n--- %s seconds ---" % (time.time() - start_time))

Processing row 0
Processing row 500
Processing row 1000
Processing row 1500
Processing row 2000
Processing row 2500
Processing row 3000
Processing row 3500
Processing row 4000
  pleasant latch shore skurri blind enclosure safety swift ann always  \
0        0     0     0      0     0         0      0     0   0      0   
1        0     0     0      0     0         0      0     0   0      0   
2        0     0     0      0     0         0      0     0   0      0   
3        0     0     0      0     0         0      0     0   0      0   
4        0     0     0      0     0         0      0     0   0      0   

      ...     glad longer eye rude be axe shell sleep  \
0     ...        0      0   0    0  1   0     0     0   
1     ...        0      0   0    0  0   0     0     0   
2     ...        0      0   0    0  0   0     0     1   
3     ...        0      0   0    0  0   0     0     0   
4     ...        0      0   0    0  0   0     0     0   

                                       tex

In [53]:

print(len(bryant_doc))
print(len(alice_doc))



55907
34363




Looks like we may have a bit of class imbalance here. Let's proceed anyways and create the spacy features dataframe.


In [54]:
bryant_all_spacy = create_spacy_features(bryant_all_word_counts)



  pleasant latch shore skurri blind enclosure safety swift ann always  \
0        0     0     0      0     0         0      0     0   0      0   
1        0     0     0      0     0         0      0     0   0      0   
2        0     0     0      0     0         0      0     0   0      0   
3        0     0     0      0     0         0      0     0   0      0   
4        0     0     0      0     0         0      0     0   0      0   

     ...     be axe shell sleep  \
0    ...      1   0     0     0   
1    ...      0   0     0     0   
2    ...      0   0     0     1   
3    ...      0   0     0     0   
4    ...      0   0     0     0   

                                       text_sentence text_source  \
0  (There, 's, a, garden, that, I, ken, ,, Full, ...      Bryant   
1                                    ((, Flax, ., ))      Bryant   
2  (From, house, to, house, he, goes, ,, A, messe...      Bryant   
3                               ((, The, path, ., ))      Bryant   
4  (Once, 

In [55]:
for c in c_values:
    run_logistic(bryant_all_spacy, c)

C = 0.001
Training set score: 0.6609977324263039
Test set score: 0.6723356009070295


col_0        Bryant  Carroll
text_source                 
Bryant         1069       45
Carroll         533      117


C = 0.01
Training set score: 0.7826908541194255
Test set score: 0.7704081632653061


col_0        Bryant  Carroll
text_source                 
Bryant         1038       76
Carroll         329      321


C = 0.1
Training set score: 0.8983371126228269
Test set score: 0.8531746031746031


col_0        Bryant  Carroll
text_source                 
Bryant         1046       68
Carroll         191      459


C = 1
Training set score: 0.9727891156462585
Test set score: 0.8752834467120182


col_0        Bryant  Carroll
text_source                 
Bryant         1037       77
Carroll         143      507


C = 100
Training set score: 0.9856386999244142
Test set score: 0.8605442176870748


col_0        Bryant  Carroll
text_source                 
Bryant         1010      104
Carroll         142 

Identifying Lewis Carroll vs. Bryant using logistic regression gives us a testing accuracy score of 0.875 when C=1. 

Let's see what happens when we reduce features using SelectKBest.

In [60]:


Y = bryant_all_spacy['text_source']
X = bryant_all_spacy.drop(['text_sentence','text_source'], 1)

print('k: 1500')
kb = SelectKBest(k=1500)
k_reduced = kb.fit_transform(X,y=Y)

X_train_k, X_test_k, y_train_k, y_test_k = train_test_split(k_reduced, 
                                                    Y,
                                                    test_size=0.4,
                                                    random_state=0)

lr = LogisticRegression()
lr.fit(X_train_k, y_train_k)
print('Training set score:', lr.score(X_train_k, y_train_k))
print('Test set score:', lr.score(X_test_k, y_test_k))

y_pred = lr.predict(X_test_k)
cm = pd.crosstab(y_test_k, y_pred)
print(cm)
print('\n')



k: 1500
Training set score: 0.9414210128495842
Test set score: 0.8758503401360545
col_0        Bryant  Carroll
text_source                 
Bryant         1046       68
Carroll         151      499




Using the 1500 best features did not improve our score. 

Let's run SVC, which performed the best for the last dataset.

In [61]:
for c in c_values:
    run_svc(bryant_all_spacy, c)

C = 0.001
Training set score: 0.6148904006046864
Test set score: 0.6315192743764172
col_0        Bryant
text_source        
Bryant         1114
Carroll         650


C = 0.01
Training set score: 0.6148904006046864
Test set score: 0.6315192743764172
col_0        Bryant
text_source        
Bryant         1114
Carroll         650


C = 0.1
Training set score: 0.6148904006046864
Test set score: 0.6315192743764172
col_0        Bryant
text_source        
Bryant         1114
Carroll         650


C = 1
Training set score: 0.6175359032501889
Test set score: 0.6332199546485261
col_0        Bryant  Carroll
text_source                 
Bryant         1112        2
Carroll         645        5


C = 100
Training set score: 0.8756613756613757
Test set score: 0.8321995464852607
col_0        Bryant  Carroll
text_source                 
Bryant         1062       52
Carroll         244      406


C = 1000
Training set score: 0.973167044595616
Test set score: 0.8781179138321995
col_0        Bryant  Carr

Interestingly, these results are much worse than all of the other methods. This looks like a case of class imbalance, as the classifier is aggressively predicting __Bryant__, which is the longer document. In the interest of time, I will not re-run these. But, in the future, I would randomly sub-sample the larger class (__Bryant__) to create two equally sized classes then re-run this model.


### Conclusion

When classes are comparably sized, logistic regression with feature reduction by SelectKBest likely gives the best results. Although SVC is a very powerful classifier, like anything else it requires some fine-tuning and parameter optimizing, and this takes too long to run for this simple drill.

The class imbalance does not seem to affect the accuracy for logistic regression, which is a testament to this method's robustness and versatility compared to SVC. However, I acknowledge that it is good practice to represent the classes equally going forth.
