In [20]:
import pandas as pd
import numpy as np
import glob
import os
import gutenberg
import nltk
import random
from ast import literal_eval


# This package contains a variety of scripts to make working with 
# the Project Gutenberg body of public domain texts easier.
from gutenberg.acquire import load_etext
from gutenberg.cleanup import strip_headers
from gutenberg.query import get_etexts
from gutenberg.query import get_metadata
from gutenberg.query import list_supported_metadatas


from nltk.stem.snowball import SnowballStemmer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from keras.preprocessing.sequence import pad_sequences
from keras.preprocessing.text import Tokenizer


from sklearn.pipeline import Pipeline
from sklearn.model_selection import cross_val_score
from sklearn.svm import LinearSVC
from sklearn.model_selection import GridSearchCV
from sklearn.cross_validation import KFold
from sklearn.ensemble import (RandomForestClassifier, VotingClassifier)
from sklearn.naive_bayes import MultinomialNB


from keras.layers import Embedding, LSTM, Dense, Dropout, Conv1D, MaxPooling1D
from keras.models import Sequential
from keras.utils import np_utils

In [None]:
# Before use one of the gutenberg.query functions, we must populate the local metadata cache. 
from gutenberg.acquire import get_metadata_cache
cache = get_metadata_cache()
cache.populate()

In this project, I download all the data by using the package gutenberg. This package contains a variety of scripts to make working with the Project Gutenberg body of public domain texts easier. There are 100 observations in training data as for each of the ten famous authors I choose, ten English works are selected. This makes up the corpus that would be further processed.

In [2]:
# 10 authors are selected. They are given a unqiue id from 1 to 10.
authors = ['Shakespeare, William', 'Dickens, Charles', 'Twain, Mark', 'Verne, Jules', 'Austen, Jane', 'Poe, Edgar Allan', 'Henry, O.', 'Melville, Herman', 'Hawthorne, Nathaniel', 'Wharton, Edith']
authors_id = [1,2,3,4,5,6,7,8,9,10]

In [3]:
# Show all english works of different authors.
def Intersection(lst1, lst2):
    return set(lst1).intersection(set(lst2))

ShakespeareWilliam = Intersection(get_etexts('author', 'Shakespeare, William'), get_etexts('language', 'en'))
DickensCharles = Intersection(get_etexts('author', 'Dickens, Charles'), get_etexts('language', 'en'))
TwainMark = Intersection(get_etexts('author', 'Twain, Mark'), get_etexts('language', 'en'))
VerneJules = Intersection(get_etexts('author', 'Verne, Jules'), get_etexts('language', 'en'))
AustenJane = Intersection(get_etexts('author', 'Austen, Jane'), get_etexts('language', 'en'))
PoeEdgarAllan = Intersection(get_etexts('author', 'Poe, Edgar Allan'), get_etexts('language', 'en'))
HenryO = Intersection(get_etexts('author', 'Henry, O.'), get_etexts('language', 'en'))
MelvilleHerman = Intersection(get_etexts('author', 'Melville, Herman'), get_etexts('language', 'en'))
HawthorneNathaniel = Intersection(get_etexts('author', 'Hawthorne, Nathaniel'), get_etexts('language', 'en'))
WhartonEdith = Intersection(get_etexts('author', 'Wharton, Edith'), get_etexts('language', 'en'))


print('Shakespeare, William'+str(ShakespeareWilliam))
print('Dickens, Charles'+str(DickensCharles))
print('Twain, Mark'+str(TwainMark))
print('Verne, Jules'+str(VerneJules))
print('Austen, Jane'+str(AustenJane))
print('Poe, Edgar Allan'+str(PoeEdgarAllan))
print('Henry, O.'+str(HenryO))
print('Melville, Herman'+str(MelvilleHerman))
print('Hawthorne, Nathaniel'+str(HawthorneNathaniel))
print('Wharton, Edith'+str(WhartonEdith))

Shakespeare, William{1536, 1537, 23042, 23043, 1527, 1538, 1539, 1540, 1541, 23041, 1543, 1544, 1545, 1546, 23045, 23046, 1041, 1045, 1126, 10281, 12842, 45128, 1100, 1101, 1102, 1103, 1104, 1105, 1106, 1107, 1108, 1109, 1110, 1111, 1112, 1113, 1114, 1115, 1116, 1117, 1118, 1119, 1120, 1121, 1122, 1123, 1124, 100, 1125, 1127, 1128, 1129, 1130, 1131, 1132, 1133, 1134, 1135, 1136, 27761, 1137, 26224, 38901, 49297, 26268, 28334, 2235, 2236, 2237, 2238, 2239, 2240, 2241, 2242, 2243, 2244, 2245, 2246, 2247, 2248, 2249, 2250, 2251, 2252, 2253, 2254, 2255, 2256, 2257, 2258, 2259, 2260, 2261, 2262, 2263, 2264, 2265, 2266, 2267, 2268, 2269, 2270, 23044, 1765, 1768, 1769, 1771, 1772, 1773, 1774, 1775, 1776, 1777, 1778, 1779, 1780, 1781, 1782, 1783, 1784, 1785, 1786, 1787, 1790, 1791, 1792, 1793, 1794, 1795, 1796, 1797, 1798, 22791, 1799, 1800, 1801, 1802, 12578, 47960, 10606, 49007, 49008, 9077, 50559, 23935, 1430, 49146, 47518, 8609, 23970, 12719, 12720, 12721, 12722, 12723, 12724, 50095, 47715

In [4]:
# Randomly choose 10 works from each author to form the corpus(training data).
print('Shakespeare, William chosen works:'+str(np.random.RandomState(15).choice(list(ShakespeareWilliam), size=10)))
print('Dickens, Charles chosen works:'+str(np.random.RandomState(15).choice(list(DickensCharles), size=10)))
print('Twain, Mark chosen works:'+str(np.random.RandomState(15).choice(list(TwainMark), size=10)))
print('Verne, Jules chosen works:'+str(np.random.RandomState(15).choice(list(VerneJules), size=10)))
print('Austen, Jane chosen works:'+str(np.random.RandomState(15).choice(list(AustenJane), size=10)))
print('Poe, Edgar Allan chosen works:'+str(np.random.RandomState(15).choice(list(PoeEdgarAllan), size=10)))
print('Henry, O. chosen works:'+str(np.random.RandomState(15).choice(list(HenryO), size=10)))
print('Melville, Herman chosen works:'+str(np.random.RandomState(15).choice(list(MelvilleHerman), size=10)))
print('Hawthorne, Nathaniel chosen works:'+str(np.random.RandomState(15).choice(list(HawthorneNathaniel), size=10)))
print('Wharton, Edith chosen works:'+str(np.random.RandomState(15).choice(list(WhartonEdith), size=10)))

Shakespeare, William chosen works:[49007  1799  1784  1795 50095 12724  2254  1783  1516  1045]
Dickens, Charles chosen works:[ 9717  9711  9696  9706  1423  9695 49683   588   810  9738]
Twain, Mark chosen works:[ 7156  9016 33077  8481  9007  9033  9031  7155  7159  3251]
Verne, Jules chosen works:[ 8980 22759  8984 12051 33516 46597 11556  8993  8979  8983]
Austen, Jane chosen works:[22963 31100 26301 42671   141 22962  1212 31100 19839 20686]
Poe, Edgar Allan chosen works:[12714  1062  9516  9511  2147 14082 28908  2151  9513  9515]
Henry, O. chosen works:[ 3707  1583 22440  8962 22442  2776  2777  2141  2295  8962]
Melville, Herman chosen works:[ 9268  1900  2489 34970 15859  2694 12841 21816  1900 28656]
Hawthorne, Nathaniel chosen works:[41368  9226  9219   512  9244  9243  9239  8091  9204  2182]
Wharton, Edith chosen works:[  284 24132  4514   283 55807 39042  9282  9281 24349 24351]


In [5]:
# Generate training data(10 works from each author and some works are replaced by the other works from the same author.)
y_train = [i for i in authors_id for r in range(10)]
x_train = []
x_train.extend([strip_headers(load_etext(49007)).strip(),strip_headers(load_etext(1799)).strip(),strip_headers(load_etext(1784)).strip(),strip_headers(load_etext(1795)).strip(),strip_headers(load_etext(50095)).strip(), strip_headers(load_etext(1536)).strip(),strip_headers(load_etext(2254)).strip(),strip_headers(load_etext(1783)).strip(),strip_headers(load_etext(1516)).strip(),strip_headers(load_etext(1045)).strip()])
x_train.extend([strip_headers(load_etext(1023)).strip(),strip_headers(load_etext(46)).strip(),strip_headers(load_etext(19505)).strip(),strip_headers(load_etext(564)).strip(),strip_headers(load_etext(1423)).strip(),strip_headers(load_etext(580)).strip(),strip_headers(load_etext(49683)).strip(),strip_headers(load_etext(588)).strip(),strip_headers(load_etext(810)).strip(),strip_headers(load_etext(46675)).strip()])
x_train.extend([strip_headers(load_etext(7156)).strip(),strip_headers(load_etext(2572)).strip(),strip_headers(load_etext(33077)).strip(),strip_headers(load_etext(8481)).strip(),strip_headers(load_etext(19987)).strip(),strip_headers(load_etext(1044)).strip(),strip_headers(load_etext(7193)).strip(),strip_headers(load_etext(7155)).strip(),strip_headers(load_etext(7159)).strip(),strip_headers(load_etext(3251)).strip()])
x_train.extend([strip_headers(load_etext(19362)).strip(),strip_headers(load_etext(22759)).strip(),strip_headers(load_etext(9618)).strip(),strip_headers(load_etext(12051)).strip(),strip_headers(load_etext(33516)).strip(),strip_headers(load_etext(46597)).strip(),strip_headers(load_etext(11556)).strip(),strip_headers(load_etext(3091)).strip(),strip_headers(load_etext(26658)).strip(),strip_headers(load_etext(2083)).strip()])
x_train.extend([strip_headers(load_etext(158)).strip(),strip_headers(load_etext(31100)).strip(),strip_headers(load_etext(161)).strip(),strip_headers(load_etext(42671)).strip(),strip_headers(load_etext(141)).strip(),strip_headers(load_etext(946)).strip(),strip_headers(load_etext(1212)).strip(),strip_headers(load_etext(31100)).strip(),strip_headers(load_etext(19839)).strip(),strip_headers(load_etext(37431)).strip()])
x_train.extend([strip_headers(load_etext(932)).strip(),strip_headers(load_etext(1062)).strip(),strip_headers(load_etext(32037)).strip(),strip_headers(load_etext(50852)).strip(),strip_headers(load_etext(2147)).strip(),strip_headers(load_etext(14082)).strip(),strip_headers(load_etext(45484)).strip(),strip_headers(load_etext(2151)).strip(),strip_headers(load_etext(10031)).strip(),strip_headers(load_etext(1063)).strip()])
x_train.extend([strip_headers(load_etext(3707)).strip(),strip_headers(load_etext(1583)).strip(),strip_headers(load_etext(2851)).strip(),strip_headers(load_etext(1444)).strip(),strip_headers(load_etext(13094)).strip(),strip_headers(load_etext(2776)).strip(),strip_headers(load_etext(2777)).strip(),strip_headers(load_etext(2141)).strip(),strip_headers(load_etext(2295)).strip(),strip_headers(load_etext(3815)).strip()])
x_train.extend([strip_headers(load_etext(2701)).strip(),strip_headers(load_etext(1900)).strip(),strip_headers(load_etext(2489)).strip(),strip_headers(load_etext(34970)).strip(),strip_headers(load_etext(15859)).strip(),strip_headers(load_etext(2694)).strip(),strip_headers(load_etext(12841)).strip(),strip_headers(load_etext(21816)).strip(),strip_headers(load_etext(1900)).strip(),strip_headers(load_etext(28656)).strip()])
x_train.extend([strip_headers(load_etext(41368)).strip(),strip_headers(load_etext(9226)).strip(),strip_headers(load_etext(9219)).strip(),strip_headers(load_etext(512)).strip(),strip_headers(load_etext(9244)).strip(),strip_headers(load_etext(9243)).strip(),strip_headers(load_etext(9239)).strip(),strip_headers(load_etext(8091)).strip(),strip_headers(load_etext(9204)).strip(),strip_headers(load_etext(2182)).strip()])
x_train.extend([strip_headers(load_etext(284)).strip(),strip_headers(load_etext(24132)).strip(),strip_headers(load_etext(4514)).strip(),strip_headers(load_etext(283)).strip(),strip_headers(load_etext(55807)).strip(),strip_headers(load_etext(39042)).strip(),strip_headers(load_etext(57347)).strip(),strip_headers(load_etext(267)).strip(),strip_headers(load_etext(29349)).strip(),strip_headers(load_etext(24351)).strip()])
# Randomly shuffle the train data
train = pd.DataFrame(
    {'text': x_train,
     'author': y_train
     })
train = train.sample(frac=1)
x_train = train['text'].values
y_train = train['author'].values

In [6]:
# Generate test data(3 works from each author and some works are replaced by the other works from the same author.)
y_test = [i for i in authors_id for r in range(3)]
x_test = []
x_test.extend([strip_headers(load_etext(1537)).strip(),strip_headers(load_etext(23042)).strip(),strip_headers(load_etext(23043)).strip()])
x_test.extend([strip_headers(load_etext(98)).strip(),strip_headers(load_etext(43111)).strip(),strip_headers(load_etext(644)).strip()])
x_test.extend([strip_headers(load_etext(7194)).strip(),strip_headers(load_etext(7195)).strip(),strip_headers(load_etext(19484)).strip()])
x_test.extend([strip_headers(load_etext(1698)).strip(),strip_headers(load_etext(3748)).strip(),strip_headers(load_etext(164)).strip()])
x_test.extend([strip_headers(load_etext(1342)).strip(),strip_headers(load_etext(42078)).strip(),strip_headers(load_etext(21839)).strip()])
x_test.extend([strip_headers(load_etext(1064)).strip(),strip_headers(load_etext(1065)).strip(),strip_headers(load_etext(25525)).strip()])
x_test.extend([strip_headers(load_etext(1725)).strip(),strip_headers(load_etext(1805)).strip(),strip_headers(load_etext(1646)).strip()])
x_test.extend([strip_headers(load_etext(15)).strip(),strip_headers(load_etext(13720)).strip(),strip_headers(load_etext(13721)).strip()])
x_test.extend([strip_headers(load_etext(512)).strip(),strip_headers(load_etext(513)).strip(),strip_headers(load_etext(9218)).strip()])
x_test.extend([strip_headers(load_etext(54932)).strip(),strip_headers(load_etext(41855)).strip(),strip_headers(load_etext(24348)).strip()])
# Randomly shuffle the test data
test = pd.DataFrame(
    {'text': x_test,
     'author': y_test
     })
test = test.sample(frac=1)
x_test = test['text'].values
y_test = test['author'].values

In [7]:
# Remove stopwords, stem the texts and create tfidf matrix. 
# Each unique word in our dictionary will correspond to a feature (descriptive feature).
stemmer = SnowballStemmer(language="english", ignore_stopwords=True)
analyzer = CountVectorizer().build_analyzer()

def stemmed_words(doc):
    return (stemmer.stem(w) for w in analyzer(doc))

stem_vectorizer = CountVectorizer(analyzer=stemmed_words)
x_train_stemmed = TfidfTransformer().fit_transform(stem_vectorizer.fit_transform(x_train))

In [8]:
# Implement SVM and use 5-fold cross-validation to evaluate the performance.
clf_svm = LinearSVC(loss='hinge', penalty='l2', random_state=69)
scores = cross_val_score(clf_svm, x_train_stemmed, y_train, scoring='accuracy', cv=5) 
scores

array([0.8 , 0.85, 0.8 , 0.65, 0.9 ])

In [9]:
print ('Validation Score for SVM:'+str(scores.mean()))

Validation Score for SVM:0.8


In [10]:
# Implement Random Forest and use 5-fold cross-validation to evaluate the performance.
clf_rf = RandomForestClassifier(n_estimators=1000, 
                                max_features='sqrt', 
                                max_depth=15,
                                verbose=0,
                                random_state=69)
scores = cross_val_score(clf_rf, x_train_stemmed, y_train, scoring='accuracy', cv=5)
scores

array([0.95, 0.9 , 0.9 , 0.9 , 0.95])

In [11]:
print ('Validation Score for Random Forest:'+str(scores.mean()))

Validation Score for Random Forest:0.9199999999999999


In [12]:
# Build a pipeline and evaluate the performance of SVM on the test data.
stemmer = SnowballStemmer("english", ignore_stopwords=True)
class StemmedCountVectorizer(CountVectorizer):
    def build_analyzer(self):
        analyzer = super(StemmedCountVectorizer, self).build_analyzer()
        return lambda doc: ([stemmer.stem(w) for w in analyzer(doc)])
stemmed_count_vect = StemmedCountVectorizer(stop_words='english')

clf_svm = Pipeline([('vect', stemmed_count_vect),
                    ('tfidf', TfidfTransformer()),
                    ('clf-svm', LinearSVC(loss='hinge', penalty='l2', random_state=69))])

_ = clf_svm.fit(x_train, y_train)
predicted_svm = clf_svm.predict(x_test)
accuracy = np.mean(predicted_svm == y_test)

print ('Test Score for SVM:'+str(accuracy))

Test Score for SVM:0.9


In [13]:
# Evaluate the performance of Random Forest on the test data.
clf_rf = Pipeline([('vect', stemmed_count_vect),
                    ('tfidf', TfidfTransformer()),
                    ('clf-rf', RandomForestClassifier(n_estimators=1000, 
                                max_features='sqrt', 
                                max_depth=15,
                                verbose=0,
                                random_state=69))])

_ = clf_rf.fit(x_train, y_train)
predicted_rf = clf_rf.predict(x_test)
accuracy = np.mean(predicted_rf == y_test)

print ('Test Score for Random Forest:'+str(accuracy))

Test Score for Random Forest:0.9


So far we have trained two models (SVM & Random Forest) and evaluate their performance by 1) cross-validation 2) test data. Next let's try $\textbf{Grid Search}$ and train one more model($\textbf{Naive Bayes}$). I use $\textbf{majority vote}$ as our emsemble method.

In [14]:
# Grid search for Random Forest to select best parameters.
rf = RandomForestClassifier(random_state = 69)
rf_params = {
             'n_estimators':[1000, 1500, 3000], 
             'max_depth': [10, 30, 50],
             'max_features': ['log2', 'sqrt']
}
clf = GridSearchCV(rf, rf_params, n_jobs=-1)
clf.fit(x_train_stemmed, y_train)
print (clf.best_params_)

rf_max_depth = clf.best_params_['max_depth']
rf_max_features = clf.best_params_['max_features']
rf_n_estimators = clf.best_params_['n_estimators']

{'max_depth': 10, 'max_features': 'sqrt', 'n_estimators': 1000}


In [15]:
# Grid search for Naive Bayes to select best parameters.
nb = MultinomialNB()
nb_params = {
             'alpha':[0.1, 0.5, 1, 2], 
}
clf = GridSearchCV(nb, nb_params, n_jobs=-1)
clf.fit(x_train_stemmed, y_train)
print (clf.best_params_)

nb_alpha = clf.best_params_['alpha']

{'alpha': 0.1}


In [16]:
# Grid search for Support Vector Classifier to select best parameters.
svc = LinearSVC()
svc_params = {
    'C':[0.025, 0.05, 0.1, 0.5, 1]}
clf = GridSearchCV(svc, svc_params)
clf.fit(x_train_stemmed, y_train)
print (clf.best_params_)

svc_C = clf.best_params_['C']

{'C': 1}


In [24]:
# Use the parameters selected from grid search and use majority as ensemble method
clfs = []
model1 = RandomForestClassifier(n_jobs=-1,
                                n_estimators=rf_n_estimators,
                                max_depth=rf_max_depth,
                                min_samples_leaf=2,
                                max_features=rf_max_features,
                                verbose=0,
                                random_state=69)
clfs.append(('RandomForest', model1))
model2 = MultinomialNB(alpha=0.1)
clfs.append(('NaiveBayes', model2))
model3 = LinearSVC(C=1)
clfs.append(('SVM', model3))
# create the ensemble model
mv = Pipeline([('vect', stemmed_count_vect),
                    ('tfidf', TfidfTransformer()),
                    ('clf-svm', VotingClassifier(clfs, voting='hard'))])

_ = mv.fit(x_train, y_train)
predicted = mv.predict(x_test)
accuracy = np.mean(predicted == y_test)

print ('Test Score for Majority Vote:'+str(accuracy))

Test Score for Majority Vote:0.9333333333333333


  if diff:


By using Majority vote and grid search, my model outperforms the single SVM or Random Forest algorithm. Now let's go deeper by using $\textbf{Recurrent Neural Network (RNN) with Long Short-Term Memory (LSTM).}$. 

In [38]:
# Tokennize the texts and pad the sequences
tokenizer = Tokenizer(num_words=10000) # use the most common 20000 words 
tokenizer.fit_on_texts(x_train) # calculate the word frequencies
sequences = tokenizer.texts_to_sequences(x_train) # transform the text into numerical tokens
data = pad_sequences(sequences, maxlen=300) # "pad” the sequences so that the training examples are the same size

In [39]:
# Build Recurrent Neural Network (RNN) with Long Short-Term Memory (LSTM)
random.seed(69)
model = Sequential()
model.add(Embedding(10000, 256, input_length=300))
model.add(LSTM(256, dropout=0.2, recurrent_dropout=0.2))
model.add(Dense(100, activation='sigmoid'))
model.compile(loss='sparse_categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
model.fit(data, np.array(y_train), validation_split=0.4, epochs=5)

Train on 60 samples, validate on 40 samples
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<keras.callbacks.History at 0x1a27e03e80>

With the accuracy of 0.75, I think we need more data to train this complicated neural network.