# Poe or Austen? 

This notebook scrapes *The Purloined Letter* by Edgar Allen Poe and *Pride and Prejudice* by Jane Austen, then cleans the text up a bit. It then uses a pipeline to test CountVectorizer and TfidfVectorizer, then investigates coefficients.

In [1]:
import re
import requests

import pandas as pd

from bs4 import BeautifulSoup
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer

from sklearn.naive_bayes import MultinomialNB
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.pipeline import Pipeline

In [2]:
# URLs
poe_url = 'http://www.gutenberg.org/files/2148/2148-0.txt'
austen_url = 'http://www.gutenberg.org/files/1342/1342-0.txt'

In [3]:
# get and set up austen
res = requests.get(austen_url)
assert str(res) == '<Response [200]>'
austen = res.content.decode("utf-8-sig")

In [4]:
# clean austen
austen = austen[austen.find('By Jane Austen'):austen.find('End of the Project Gutenberg EBook of Pride and Prejudice')]
austen_list = austen.replace('\r', '').split('\n')
austen_list = [each for each in austen_list if each != '']
austen_df = pd.DataFrame(austen_list, columns=['text'])
austen_df['author'] = 'Austen'
austen_df.head()

Unnamed: 0,text,author
0,By Jane Austen,Austen
1,Chapter 1,Austen
2,"It is a truth universally acknowledged, that a...",Austen
3,"of a good fortune, must be in want of a wife.",Austen
4,However little known the feelings or views of ...,Austen


In [5]:
# get and set up poe
res = requests.get(poe_url)
assert str(res) == '<Response [200]>'
poe = res.content.decode("utf-8-sig") 

In [7]:
# clean poe
poe = poe[poe.find('The Purloined Letter'):poe.find('End of Project Gutenberg')]
poe_list = poe.replace('\r', '').split('\n')
poe_list = [each for each in poe_list if each != '']
poe_df = pd.DataFrame(poe_list, columns=['text'])
poe_df['author'] = 'Poe'
poe_df.head()

Unnamed: 0,text,author
0,The Purloined Letter,Poe
1,The Thousand-and-Second Tale of Scheherazade,Poe
2,A Descent into the Maelström,Poe
3,Von Kempelen and his Discovery,Poe
4,Mesmeric Revelation,Poe


In [8]:
# merge
df = pd.concat([austen_df, poe_df], axis=0)

In [9]:
df

Unnamed: 0,text,author
0,By Jane Austen,Austen
1,Chapter 1,Austen
2,"It is a truth universally acknowledged, that a...",Austen
3,"of a good fortune, must be in want of a wife.",Austen
4,However little known the feelings or views of ...,Austen
5,"first entering a neighbourhood, this truth is ...",Austen
6,"of the surrounding families, that he is consid...",Austen
7,of some one or other of their daughters.,Austen
8,"“My dear Mr. Bennet,” said his lady to him one...",Austen
9,Netherfield Park is let at last?”,Austen


In [31]:
# use .find() to set up indices for the string
austen.find('By Jane Austen')

0

In [32]:
austen.find('End of the Project Gutenberg EBook of Pride and Prejudice')

-1

In [36]:
X = df['text']
y = df['author']

X_train, X_test, y_train, y_test = train_test_split(X, y)

In [37]:
pipe = Pipeline([
    ('cv', CountVectorizer()),
    ('lr', LogisticRegression())
])

pipe.fit(X_train, y_train)
pipe.score(X_train, y_train)

0.9651755181799149

In [38]:
pipe.score(X_test, y_test)

0.8932384341637011

In [42]:
test_report = pd.DataFrame({'sent': X_test, 'actual': y_test, 'preds': pipe.predict(X_test)})

In [43]:
test_report.head()

Unnamed: 0,actual,preds,sent
1103,Austen,Austen,“Oh! yes--I understand you perfectly.”
4368,Austen,Austen,quest of this wonder; It was two ladies stoppi...
4056,Austen,Austen,"She wrote again when the visit was paid, and s..."
4414,Austen,Austen,"they were to expect, that the sight of such ro..."
6109,Austen,Austen,"scheme, of which Lydia had given them a hint a..."


In [48]:
pipe.predict_proba(X_test)

array([[6.90782880e-01, 3.09217120e-01],
       [5.64030668e-01, 4.35969332e-01],
       [9.99977825e-01, 2.21753537e-05],
       ...,
       [6.33776841e-01, 3.66223159e-01],
       [8.60081047e-01, 1.39918953e-01],
       [9.98916520e-01, 1.08348019e-03]])

In [46]:
test_report[test_report['actual'] != test_report['preds']]

Unnamed: 0,actual,preds,sent
682,Poe,Austen,as a man of devout feeling and excellent sense...
6884,Austen,Poe,"it by a simple bridge, in character with the g..."
3922,Poe,Austen,himself felt as such to him who has done the w...
2370,Poe,Austen,You may as well come now. D---- and F---- are ...
4053,Austen,Poe,lost.
2514,Austen,Poe,think it a faithful portrait undoubtedly.”
3001,Austen,Poe,encourage my suit as would be consistent with ...
7861,Poe,Austen,of marriage.
4706,Poe,Austen,"convolute censers, together with multitudinous..."
8442,Austen,Poe,speed through the neighbourhood. It was borne ...


In [14]:
pipe = Pipeline([
    ('tf', TfidfVectorizer()),
    ('lr', LogisticRegression())
])

pipe.fit(X_train, y_train)
pipe.score(X_train, y_train)

0.9278386488938516

In [15]:
pipe.score(X_test, y_test)

0.8903077245132929

In [16]:
coefs = pipe.named_steps['tf'].get_feature_names()
values = pipe.named_steps['lr'].coef_[0]

In [17]:
pd.DataFrame({'coefs': coefs, 'values': values}).sort_values('values').head(20)

Unnamed: 0,coefs,values
9262,she,-8.077755
5065,her,-7.226897
3533,elizabeth,-6.809754
6801,mr,-4.773352
2671,darcy,-4.642335
11547,your,-4.482881
11542,you,-4.447601
5853,jane,-4.276804
10326,though,-4.169499
6802,mrs,-4.135591


In [18]:
pd.DataFrame({'coefs': coefs, 'values': values}).sort_values('values', ascending=False).head(20)

Unnamed: 0,coefs,values
10931,upon,4.708163
10272,the,4.440622
10371,thus,3.033704
5844,its,2.704613
651,although,2.502863
652,altogether,2.340826
2710,death,2.284301
3399,dupin,2.261263
10976,valdemar,2.063224
10897,until,2.033268


In [19]:
ps = PorterStemmer()

In [20]:
ps.stem('cats')

'cat'

In [21]:
def to_words(raw_text):
    
    letters_only = re.sub("[^a-zA-Z]", " ", raw_text)
    
    words = letters_only.lower().split()
    
    stops = set(stopwords.words('english'))
    
    meaningful_words = [w for w in words if not w in stops]
    
    stemmed_words = [ps.stem(w) for w in words if not w in stops]
    #
    # 6. Join the words back into one string separated by space, 
    # and return the result.
    return(" ".join(stemmed_words))

In [22]:
pipe = Pipeline([
    ('cv', CountVectorizer(preprocessor=to_words)),
    ('nb', MultinomialNB())
])

In [23]:
pipe.fit(X_train, y_train)

Pipeline(memory=None,
     steps=[('cv', CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1),
        preprocessor=<function to_words at 0x11642d9d8>, stop_words=None,
        strip_accents=None, token_pattern='(?u)\\b\\w\\w+\\b',
        tokenizer=None, vocabulary=None)), ('nb', MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True))])

In [24]:
pipe.score(X_train, y_train)

0.9265824551608626

In [25]:
pipe.score(X_test, y_test)

0.8890517060916894

In [26]:
pipe = Pipeline([
    ('cv', CountVectorizer(preprocessor=to_words)),
    ('nb', MultinomialNB())
])

params = {
    'cv__preprocessor': [None, to_words]
}

gs = GridSearchCV(pipe, params)
gs.fit(X_train, y_train)

GridSearchCV(cv=None, error_score='raise',
       estimator=Pipeline(memory=None,
     steps=[('cv', CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1),
        prepro...enizer=None, vocabulary=None)), ('nb', MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True))]),
       fit_params=None, iid=True, n_jobs=1,
       param_grid={'cv__preprocessor': [None, <function to_words at 0x11642d9d8>]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=0)

In [27]:
gs.best_score_

0.8870821411124293

In [28]:
gs.score(X_test, y_test)

0.9041239271509316

In [29]:
gs.best_estimator_

Pipeline(memory=None,
     steps=[('cv', CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), preprocessor=None, stop_words=None,
        strip_accents=None, token_pattern='(?u)\\b\\w\\w+\\b',
        tokenizer=None, vocabulary=None)), ('nb', MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True))])