# Poe or Austen? 

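This notebook scrapes *The Purloined Letter* by Edgar Allan Poe and *Pride and Prejudice* by Jane Austen from Project Gutenberg, then cleans the text up a bit. (The Poe file is a multi-story volume, so the Poe rows also include the other tales that follow *The Purloined Letter* in it.) It then uses a pipeline to compare `CountVectorizer` and `TfidfVectorizer` features, and investigates the fitted model's coefficients.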

In [1]:
import re
import requests

import pandas as pd

from bs4 import BeautifulSoup
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer

from sklearn.naive_bayes import MultinomialNB
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.pipeline import Pipeline

In [2]:
# URLs
# Plain-text Project Gutenberg downloads. HTTPS is used so the texts are fetched
# over an encrypted connection instead of relying on an http -> https redirect.
poe_url = 'https://www.gutenberg.org/files/2148/2148-0.txt'
austen_url = 'https://www.gutenberg.org/files/1342/1342-0.txt'

In [3]:
# get and set up austen
# The timeout keeps a stalled connection from hanging the kernel indefinitely.
res = requests.get(austen_url, timeout=30)
# raise_for_status() raises requests.HTTPError on any 4xx/5xx response. Unlike an
# `assert`, it is not stripped under `python -O`, and it does not depend on the
# exact repr of the Response object.
res.raise_for_status()
# "utf-8-sig" decodes UTF-8 and drops a leading byte-order mark if one is present.
austen = res.content.decode("utf-8-sig")

In [4]:
# clean austen
# Keep only the span from the byline up to Project Gutenberg's closing boilerplate.
austen_start = austen.find('By Jane Austen')
if austen_start == -1:
    # Slicing from -1 would keep (at most) the last character of the download.
    raise ValueError("start marker 'By Jane Austen' not found in the Austen text")

austen_end = austen.find('End of the Project Gutenberg EBook of Pride and Prejudice')
if austen_end == -1:
    # str.find() returns -1 when the marker is absent, and a slice end of -1
    # silently drops the final character. That is also what happens whenever
    # this cell is re-run (the first run already sliced the marker away), so
    # fall back to "end of text" to keep the cell idempotent.
    austen_end = len(austen)

austen = austen[austen_start:austen_end]

# One row per non-empty line, with '\r' line-ending characters removed first.
austen_list = [each for each in austen.replace('\r', '').split('\n') if each != '']
austen_df = pd.DataFrame(austen_list, columns=['text'])
austen_df['author'] = 'Austen'
austen_df.head()

Unnamed: 0,text,author
0,By Jane Austen,Austen
1,Chapter 1,Austen
2,"It is a truth universally acknowledged, that a...",Austen
3,"of a good fortune, must be in want of a wife.",Austen
4,However little known the feelings or views of ...,Austen


In [5]:
# get and set up poe
# The timeout keeps a stalled connection from hanging the kernel indefinitely.
res = requests.get(poe_url, timeout=30)
# raise_for_status() raises requests.HTTPError on any 4xx/5xx response. Unlike an
# `assert`, it is not stripped under `python -O`, and it does not depend on the
# exact repr of the Response object.
res.raise_for_status()
# "utf-8-sig" decodes UTF-8 and drops a leading byte-order mark if one is present.
poe = res.content.decode("utf-8-sig")

In [6]:
# clean poe
# Keep only the span from the first mention of the story title up to Project
# Gutenberg's closing boilerplate.
# NOTE(review): find() returns the FIRST occurrence, which looks like the
# table-of-contents entry (head() shows a run of story titles) -- confirm that
# including the contents list and the following tales is intended.
poe_start = poe.find('The Purloined Letter')
if poe_start == -1:
    # Slicing from -1 would keep (at most) the last character of the download.
    raise ValueError("start marker 'The Purloined Letter' not found in the Poe text")

poe_end = poe.find('End of Project Gutenberg')
if poe_end == -1:
    # str.find() returns -1 when the marker is absent, and a slice end of -1
    # silently drops the final character. That is also what happens whenever
    # this cell is re-run (the first run already sliced the marker away), so
    # fall back to "end of text" to keep the cell idempotent.
    poe_end = len(poe)

poe = poe[poe_start:poe_end]

# One row per non-empty line, with '\r' line-ending characters removed first.
poe_list = [each for each in poe.replace('\r', '').split('\n') if each != '']
poe_df = pd.DataFrame(poe_list, columns=['text'])
poe_df['author'] = 'Poe'
poe_df.head()

Unnamed: 0,text,author
0,The Purloined Letter,Poe
1,The Thousand-and-Second Tale of Scheherazade,Poe
2,A Descent into the Maelström,Poe
3,Von Kempelen and his Discovery,Poe
4,Mesmeric Revelation,Poe


In [7]:
# merge
# Stack the two per-author frames into one labelled corpus.
# ignore_index=True gives the result a fresh 0..n-1 index; without it each frame
# keeps its own 0-based labels, so a label lookup such as df.loc[0] would return
# one row from each author.
df = pd.concat([austen_df, poe_df], axis=0, ignore_index=True)

In [8]:
# use .find() to set up indices for the string
# NOTE: by this point `austen` has already been trimmed by the cleaning cell so
# that it begins at this marker, so this reports 0 rather than an offset into
# the raw download.
austen.find('By Jane Austen')

0

In [9]:
# -1 means "not found". The cleaning cell slices `austen` to stop just before
# the first occurrence of this marker, so the trimmed text cannot contain it;
# this result therefore says nothing about whether the raw download had it.
austen.find('End of the Project Gutenberg EBook of Pride and Prejudice')

-1

In [10]:
# Features are the raw text lines; the target is the author label.
X = df['text']
y = df['author']

# random_state pins the shuffle so the scores below are reproducible across
# runs; stratify=y keeps the Austen/Poe proportions equal in train and test.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, random_state=42, stratify=y
)

In [11]:
# Baseline model: raw token counts fed into a logistic regression.
count_vectorizer = CountVectorizer()
logistic = LogisticRegression()
pipe = Pipeline(steps=[('cv', count_vectorizer), ('lr', logistic)])

pipe.fit(X_train, y_train)
# Training-set accuracy (displayed as the cell output).
pipe.score(X_train, y_train)

0.9669202316979552

In [12]:
# Accuracy of the CountVectorizer + LogisticRegression pipeline on the held-out split.
pipe.score(X_test, y_test)

0.8863303328448817

In [13]:
# Same classifier as before, but with TF-IDF weighted features instead of raw counts.
tfidf_vectorizer = TfidfVectorizer()
logistic = LogisticRegression()
pipe = Pipeline(steps=[('tf', tfidf_vectorizer), ('lr', logistic)])

pipe.fit(X_train, y_train)
# Training-set accuracy (displayed as the cell output).
pipe.score(X_train, y_train)

0.9324446925814781

In [14]:
# Accuracy of the TfidfVectorizer + LogisticRegression pipeline on the held-out split.
pipe.score(X_test, y_test)

0.8752355034540507

In [15]:
# Pair each vocabulary token with its logistic-regression weight.
# Despite its name, `coefs` holds the token strings; `values` holds the weights.
tfidf = pipe.named_steps['tf']
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2 in
# favour of get_feature_names_out(); use whichever this installation provides.
if hasattr(tfidf, 'get_feature_names_out'):
    coefs = tfidf.get_feature_names_out()
else:
    coefs = tfidf.get_feature_names()
# With two classes coef_ has a single row; positive weights favour
# lr.classes_[1] (presumably 'Poe', since classes are sorted -- verify).
values = pipe.named_steps['lr'].coef_[0]

In [16]:
# The 20 tokens with the lowest (most negative) coefficients.
coef_table = pd.DataFrame({'coefs': coefs, 'values': values})
coef_table.sort_values(by='values', ascending=True).head(20)

Unnamed: 0,coefs,values
9189,she,-8.509704
5009,her,-7.691493
3497,elizabeth,-6.900182
2632,darcy,-4.74631
6730,mr,-4.688416
11440,your,-4.445771
6731,mrs,-4.37383
5786,jane,-4.366205
11435,you,-4.173485
10255,though,-4.001465


In [17]:
# The 20 tokens with the highest (most positive) coefficients.
coef_table = pd.DataFrame({'coefs': coefs, 'values': values})
coef_table.sort_values(by='values', ascending=False).head(20)

Unnamed: 0,coefs,values
10204,the,4.636783
10831,upon,4.407513
652,although,3.015711
10298,thus,2.628662
5778,its,2.56695
2669,death,2.408402
3363,dupin,2.219989
220,_p,2.139007
10877,valdemar,2.094018
653,altogether,2.07952


In [18]:
# Notebook-level Porter stemmer; to_words() reads this global when stemming.
ps = PorterStemmer()

In [19]:
# Quick sanity check of the stemmer on a simple plural.
ps.stem('cats')

'cat'

In [20]:
# Cache for the NLTK stop-word list so it is read from the corpus only once,
# not once per document.
_stopword_cache = {}


def _english_stopwords():
    """Return NLTK's English stop words as a frozenset, loading them only once."""
    if 'english' not in _stopword_cache:
        _stopword_cache['english'] = frozenset(stopwords.words('english'))
    return _stopword_cache['english']


def to_words(raw_text):
    """Normalise one piece of text into space-separated Porter stems.

    Steps: replace every character that is not an ASCII letter with a space,
    lower-case, split on whitespace, drop NLTK English stop words, and stem
    what remains with the notebook-level ``ps`` stemmer.

    Parameters
    ----------
    raw_text : str
        A single document (in this notebook, one line of a book).

    Returns
    -------
    str
        The surviving stems joined by single spaces; ``""`` if none survive.
    """
    # Everything outside a-z/A-Z (digits, punctuation, accented letters)
    # becomes a space, so e.g. "don't" splits into "don" and "t".
    letters_only = re.sub("[^a-zA-Z]", " ", raw_text)
    words = letters_only.lower().split()

    stops = _english_stopwords()
    return " ".join(ps.stem(w) for w in words if w not in stops)

In [21]:
# Counts over the stemmed, stop-word-free text from to_words, fed to multinomial Naive Bayes.
stemmed_counts = CountVectorizer(preprocessor=to_words)
naive_bayes = MultinomialNB()
pipe = Pipeline(steps=[('cv', stemmed_counts), ('nb', naive_bayes)])

In [22]:
# Fit the stemmed CountVectorizer + MultinomialNB pipeline on the training split.
pipe.fit(X_train, y_train)

Pipeline(memory=None,
     steps=[('cv', CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1),
        preprocessor=<function to_words at 0x1a0cb038c8>, stop_words=None,
        strip_accents=None, token_pattern='(?u)\\b\\w\\w+\\b',
        tokenizer=None, vocabulary=None)), ('nb', MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True))])

In [23]:
# Training-set accuracy of the stemmed Naive Bayes pipeline.
pipe.score(X_train, y_train)

0.9278386488938516

In [None]:
# Held-out accuracy of the stemmed Naive Bayes pipeline.
pipe.score(X_test, y_test)

0.8790035587188612

In [None]:
# Grid-search whether the custom preprocessor actually helps: None means
# CountVectorizer's built-in preprocessing, to_words is the stop-word/stemming one.
params = dict(cv__preprocessor=[None, to_words])

stemmed_counts = CountVectorizer(preprocessor=to_words)
naive_bayes = MultinomialNB()
pipe = Pipeline(steps=[('cv', stemmed_counts), ('nb', naive_bayes)])

gs = GridSearchCV(estimator=pipe, param_grid=params)
gs.fit(X_train, y_train)

In [None]:
# Mean cross-validated score of the best preprocessor setting found by the search.
gs.best_score_

In [None]:
# Score of the search's selected model on the held-out split.
gs.score(X_test, y_test)

In [None]:
# Show the winning pipeline, including which preprocessor was selected.
gs.best_estimator_