In [97]:
import numpy as np
import pandas as pd
import scipy
import sklearn
import matplotlib.pyplot as plt
import seaborn as sns
import re
import nltk
import spacy
from nltk.corpus import stopwords, gutenberg
from collections import Counter
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LogisticRegression
from sklearn import ensemble
%matplotlib inline

In [77]:
gutenberg.fileids()

['austen-emma.txt',
 'austen-persuasion.txt',
 'austen-sense.txt',
 'bible-kjv.txt',
 'blake-poems.txt',
 'bryant-stories.txt',
 'burgess-busterbrown.txt',
 'carroll-alice.txt',
 'chesterton-ball.txt',
 'chesterton-brown.txt',
 'chesterton-thursday.txt',
 'edgeworth-parents.txt',
 'melville-moby_dick.txt',
 'milton-paradise.txt',
 'shakespeare-caesar.txt',
 'shakespeare-hamlet.txt',
 'shakespeare-macbeth.txt',
 'whitman-leaves.txt']

In [78]:
macbeth = gutenberg.raw('shakespeare-macbeth.txt')
paradise = gutenberg.raw('milton-paradise.txt')

In [79]:
nlp = spacy.load('en_core_web_sm')

In [80]:
def text_cleaner(text):
    text = re.sub(r'--',' ',text)
    text = re.sub("[\[].*?[\]]", "", text)
    text = ' '.join(text.split()[:5000])
    return text

macbeth = text_cleaner(macbeth)
paradise = text_cleaner(paradise)

In [81]:
#Parse using SpaCy
macbeth_doc = nlp(macbeth)
paradise_doc = nlp(paradise)

In [82]:
#Group into sentences
macbeth_sents = [[sent, 'Shakespeare'] for sent in macbeth_doc.sents]
paradise_sents = [[sent, 'Milton'] for sent in paradise_doc.sents]

sentences = pd.DataFrame(macbeth_sents + paradise_sents)
sentences.head()

Unnamed: 0,0,1
0,"(Actus, Primus, .)",Shakespeare
1,"(Scoena, Prima, .)",Shakespeare
2,"(Thunder, and, Lightning, .)",Shakespeare
3,"(Enter, three, Witches, .)",Shakespeare
4,"(1, .)",Shakespeare


In [83]:
#Look at excerpts from each 
print(macbeth_doc[:100])
print('\nMacbethlength:', len(macbeth_doc))

print('\n', paradise_doc[:100])
print('\nParadise length:', len(paradise_doc))

Actus Primus. Scoena Prima. Thunder and Lightning. Enter three Witches. 1. When shall we three meet againe? In Thunder, Lightning, or in Raine? 2. When the Hurley-burley's done, When the Battaile's lost, and wonne 3. That will be ere the set of Sunne 1. Where the place? 2. Vpon the Heath 3. There to meet with Macbeth 1. I come, Gray-Malkin All. Padock calls anon: faire is foule, and foule is faire

Macbethlength: 6238

 Book I Of Man's first disobedience, and the fruit Of that forbidden tree whose mortal taste Brought death into the World, and all our woe, With loss of Eden, till one greater Man Restore us, and regain the blissful seat, Sing, Heavenly Muse, that, on the secret top Of Oreb, or of Sinai, didst inspire That shepherd who first taught the chosen seed In the beginning how the heavens and earth Rose out of Chaos: or, if Sion hill Delight thee more, and Siloa's

Paradise length: 5938


## Bag Of Words

In [84]:
def bag_of_words(text):
    
    allwords = [token.lemma_
               for token in text if not token.is_punct
               and not token.is_stop]
    return [item[0] for item in Counter(allwords).most_common(500)]

macbeth_words = bag_of_words(macbeth_doc)
paradise_words = bag_of_words(paradise_doc)
common_words = set(macbeth_words + paradise_words)

In [85]:
#bow data frame
def bow_features(sentences, common_words):
    
    # Build data frame
    df = pd.DataFrame(columns=common_words)
    df['text_sentence'] = sentences[0]
    df['text_source'] = sentences[1]
    df.loc[:, common_words] = 0
    
    # Process each row, counting the occurrence of words in each sentence.
    for i, sentences in enumerate(df['text_sentence']):
        
        # Convert the sentence to lemmas, then filter out punctuation,
        # stop words, and uncommon words.
        words = [token.lemma_
                 for token in sentences
                 if (
                     not token.is_punct
                     and not token.is_stop
                     and token.lemma_ in common_words
                 )]
        
        # Populate the row with word counts.
        for word in words:
            df.loc[i, word] += 1
    
    return df

In [86]:
#bow features
bow = bow_features(sentences, common_words)
bow.head()

Unnamed: 0,bid,stay,guest,prepare,deepe,depth,move,treasons,stature,hearke,...,bow,cover,appear,wast,ruin,one,chamber,man,text_sentence,text_source
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,"(Actus, Primus, .)",Shakespeare
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,"(Scoena, Prima, .)",Shakespeare
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,"(Thunder, and, Lightning, .)",Shakespeare
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,"(Enter, three, Witches, .)",Shakespeare
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,"(1, .)",Shakespeare


## TF-IDF Features

In [87]:
macbeth = gutenberg.sents('shakespeare-macbeth.txt')
paradise = gutenberg.sents('milton-paradise.txt')

In [88]:
#Creating a list of text
macbeth_list = [" ".join(sent) for sent in macbeth]
paradise_list =[" ".join(sent) for sent in paradise]
joined = macbeth_list + paradise_list

In [89]:
# Vectorize

vectorizer = TfidfVectorizer(max_df=0.5, 
                             min_df=2, 
                             stop_words='english',   
                             use_idf=True,
                             norm=u'l2', 
                             smooth_idf=True 
                            )

tfidf = vectorizer.fit_transform(joined).tocsr()

In [90]:
#Evaluating the feature sets using cross validation

X_bow = bow.drop(['text_sentence', 'text_source'],1)
Y_bow = bow['text_source']

X_tfidf = tfidf
Y_tfidf = ['Shakespeare']*len(macbeth_list) +['Milton']*len(paradise_list)

## Logistic Regression

In [95]:
lr = LogisticRegression()
lr_bow = lr.fit(X_bow, Y_bow)
print('BOW Logistic Regression Scores:', cross_val_score(lr_bow, X_bow,Y_bow, cv =5))
print('Average Score:', np.mean(cross_val_score(lr_bow, X_bow, Y_bow, cv=5)))

lr = LogisticRegression()
lr_tfidf = lr.fit(X_tfidf, Y_tfidf)
print('\nTfidf Logistic Regression Scores:', cross_val_score(lr_tfidf, X_tfidf, Y_tfidf, cv=5))
print('Average Score:', np.mean(cross_val_score(lr_tfidf, X_tfidf, Y_tfidf, cv=5)))

BOW Logistic Regression Scores: [0.91111111 0.85925926 0.88148148 0.87407407 0.88059701]
Average Score: 0.8813045881702598

Tfidf Logistic Regression Scores: [0.91766268 0.9481383  0.96138482 0.91611185 0.9241012 ]
Average Score: 0.933479769996517


## Random Forest

In [105]:
rfc = ensemble.RandomForestClassifier()
rfc_bow = rfc.fit(X_bow, Y_bow)
print('BOW Random Forest Scores:', cross_val_score(rfc_bow, X_bow, Y_bow, cv =5))
print('Average Score:', np.mean(cross_val_score(rfc_bow, X_bow, Y_bow, cv=5)))

rfc = ensemble.RandomForestClassifier()
rfc_tfidf = rfc.fit(X_tfidf, Y_tfidf)
print('\nTfidf RFC Scores:', cross_val_score(rfc_tfidf, X_tfidf, Y_tfidf, cv=5))
print('Average Score:', np.mean(cross_val_score(rfc_tfidf,X_tfidf, Y_tfidf, cv=5)))

BOW Random Forest Scores: [0.86666667 0.87407407 0.85925926 0.82962963 0.88059701]
Average Score: 0.8649861802100608

Tfidf RFC Scores: [0.84196547 0.90292553 0.89480692 0.86018642 0.87616511]
Average Score: 0.8717527962584694


## Gradient Boosting

In [110]:
clf = ensemble.GradientBoostingClassifier()
clf_bow = clf.fit(X_bow, Y_bow)
print('BOW Gradient Boosting Scores', cross_val_score(clf_bow, X_bow, Y_bow, cv =5))
print('Average Score:', np.mean(cross_val_score(clf_bow, X_bow, Y_bow, cv =5)))

clf = ensemble.GradientBoostingClassifier()
clf_tfidf = clf.fit(X_tfidf, Y_tfidf)
print('\nTfidf Gradient Boosting Scores:', cross_val_score(clf_tfidf, X_tfidf, Y_tfidf, cv=5))
print('Average Score:', np.mean(cross_val_score(clf_tfidf,X_tfidf, Y_tfidf, cv=5)))

BOW Gradient Boosting Scores [0.8962963  0.84444444 0.87407407 0.82962963 0.88059701]
Average Score: 0.8561194029850746

Tfidf Gradient Boosting Scores: [0.74103586 0.85638298 0.8828229  0.81491345 0.82556591]
Average Score: 0.8241442197891157


## Write up

I got some pretty good scores from my models, but I would choose my Random forest model using TF-IDF since it has a score of 87% which is not too high or too low. I figured the accuracy scores would be high because Shakespeare and Milton have similar writing styles so I can assume they used many similar key words. 

I would try to improve the accuracy score for my TF-IDF gradient boosting model, but I am not sure sure what to modify in order to increase the accuracy score.