In [143]:
from nltk.corpus import gutenberg, stopwords
import nltk

# Keras
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers import Dense, Flatten, LSTM, Conv1D, MaxPooling1D, Dropout, Activation, SpatialDropout1D
from keras.layers.embeddings import Embedding
from keras.utils import np_utils

from sklearn.manifold import TSNE
from sklearn.feature_extraction.text import TfidfTransformer, TfidfVectorizer, CountVectorizer
from sklearn.preprocessing import LabelEncoder
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import SVC
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.linear_model import LogisticRegression

from gensim.models.ldamulticore import LdaMulticore
from gensim.models.word2vec import Word2Vec

import string
import numpy as np
import pandas as pd
import re
import os
# set seed for reproducibility
np.random.seed(0)

In [144]:
all_files = os.listdir("book/new/")

In [145]:
arr = []
one_book = []
content = ''
path = 'book/'
for b in all_files[0:100]:
    one_book = []
    path = 'book/'
    one_book.append(b.split('___')[0])
    one_book.append(b)
    
    path += b
    with open(path, 'r') as myfile:
        content = myfile.read().replace('\n', ' ')
    one_book.append(content)
    arr.append(one_book)

In [146]:
columns = ['author', 'file_name', 'text']
df = pd.DataFrame(arr, columns=columns)

In [147]:
df.shape

(90, 3)

In [148]:
df.head()

Unnamed: 0,author,file_name,text
0,Abraham Lincoln,Abraham Lincoln___State of the Union Addresses...,Fellow-Citizens of the Senate and House of Rep...
1,Abraham Lincoln,Abraham Lincoln___The Emancipation Proclamatio...,By the President of the United States of Ameri...
2,Abraham Lincoln,Abraham Lincoln___The Life and Public Service ...,ï»¿The discovery of an unknown address by Abra...
3,Abraham Lincoln,Abraham Lincoln___The Writings of Abraham Linc...,Immediately after Lincoln's re-election to the...
4,Abraham Lincoln,Abraham Lincoln___The Writings of Abraham Linc...,"FIRST CHILD TO JOSHUA F. SPEED. SPRINGFIELD, ..."


In [149]:
stop_words = set(stopwords.words('english'))

In [150]:
def get_data_of_file(row):
    file_name = row.file_name
    text = ''
    words = row.text.lower().split()
    for r in words:
        if not r in stop_words:
            text += ' ' + r
    
    text = re.sub(r"[^A-Za-z0-9^,!.\/'+-=]", " ", text)
    text = re.sub(r"what's", "what is ", text)
    text = re.sub(r"\'s", " ", text)
    text = re.sub(r"\'ve", " have ", text)
    text = re.sub(r"n't", " not ", text)
    text = re.sub(r"i'm", "i am ", text)
    text = re.sub(r"\'re", " are ", text)
    text = re.sub(r"\'d", " would ", text)
    text = re.sub(r"\'ll", " will ", text)
    text = re.sub(r",", " ", text)
    text = re.sub(r"\.", " ", text)
    text = re.sub(r"!", " ! ", text)
    text = re.sub(r"\/", " ", text)
    text = re.sub(r"\^", " ^ ", text)
    text = re.sub(r"\+", " + ", text)
    text = re.sub(r"\-", " - ", text)
    text = re.sub(r"\=", " = ", text)
    text = re.sub(r"'", " ", text)
    text = re.sub(r"(\d+)(k)", r"\g<1>000", text)
    text = re.sub(r":", " : ", text)
    text = re.sub(r" e g ", " eg ", text)
    text = re.sub(r" b g ", " bg ", text)
    text = re.sub(r" u s ", " american ", text)
    text = re.sub(r"\0s", "0", text)
    text = re.sub(r" 9 11 ", "911", text)
    text = re.sub(r"e - mail", "email", text)
    text = re.sub(r"j k", "jk", text)
    text = re.sub(r"\s{2,}", " ", text)
    
    row['text'] = text
    return row
df = df.apply(get_data_of_file, axis=1)

In [151]:
df.head()

Unnamed: 0,author,file_name,text
0,Abraham Lincoln,Abraham Lincoln___State of the Union Addresses...,fellow - citizens senate house representative...
1,Abraham Lincoln,Abraham Lincoln___The Emancipation Proclamatio...,president united states america : proclamatio...
2,Abraham Lincoln,Abraham Lincoln___The Life and Public Service ...,the discovery unknown address abraham lincoln...
3,Abraham Lincoln,Abraham Lincoln___The Writings of Abraham Linc...,immediately lincoln re - election presidency ...
4,Abraham Lincoln,Abraham Lincoln___The Writings of Abraham Linc...,first child joshua f speed springfield may 18...


In [152]:
df.shape

(90, 3)

### Word2Vec

In [153]:
def get_good_tokens(sentence):
    replaced_punctation = list(map(lambda token: re.sub('[^0-9A-Za-z!?]+', '', token), sentence))
    removed_punctation = list(filter(lambda token: token, replaced_punctation))
    return removed_punctation

In [154]:
def w2v_preprocessing(df):
    """ All the preprocessing steps for word2vec are done in this function.
    All mutations are done on the dataframe itself. So this function returns
    nothing.
    """
    df['document_sentences'] = df.text.str.split('.')  # split texts into individual sentences
    df['tokenized_sentences'] = list(map(lambda sentences:
                                         list(map(nltk.word_tokenize, sentences)),
                                         df.document_sentences))  # tokenize sentences
    df['tokenized_sentences'] = list(map(lambda sentences:
                                         list(map(get_good_tokens, sentences)),
                                         df.tokenized_sentences))  # remove unwanted characters
    df['tokenized_sentences'] = list(map(lambda sentences:
                                         list(filter(lambda lst: lst, sentences)),
                                         df.tokenized_sentences))  # remove empty lists

w2v_preprocessing(df)

In [155]:
df.head()

Unnamed: 0,author,file_name,text,document_sentences,tokenized_sentences
0,Abraham Lincoln,Abraham Lincoln___State of the Union Addresses...,fellow - citizens senate house representative...,[ fellow - citizens senate house representativ...,"[[fellow, citizens, senate, house, representat..."
1,Abraham Lincoln,Abraham Lincoln___The Emancipation Proclamatio...,president united states america : proclamatio...,[ president united states america : proclamati...,"[[president, united, states, america, proclama..."
2,Abraham Lincoln,Abraham Lincoln___The Life and Public Service ...,the discovery unknown address abraham lincoln...,[ the discovery unknown address abraham lincol...,"[[the, discovery, unknown, address, abraham, l..."
3,Abraham Lincoln,Abraham Lincoln___The Writings of Abraham Linc...,immediately lincoln re - election presidency ...,[ immediately lincoln re - election presidency...,"[[immediately, lincoln, re, election, presiden..."
4,Abraham Lincoln,Abraham Lincoln___The Writings of Abraham Linc...,first child joshua f speed springfield may 18...,[ first child joshua f speed springfield may 1...,"[[first, child, joshua, f, speed, springfield,..."


In [156]:
sentences = []
for sentence_group in df.tokenized_sentences:
    sentences.extend(sentence_group)

print("Number of sentences: {}.".format(len(sentences)))
print("Number of texts: {}.".format(len(df)))

Number of sentences: 90.
Number of texts: 90.


In [157]:
# Set values for various parameters
num_features = 200    # Word vector dimensionality
min_word_count = 3    # Minimum word count
num_workers = 4       # Number of threads to run in parallel
context = 6           # Context window size
downsampling = 1e-3   # Downsample setting for frequent words

# Initialize and train the model
W2Vmodel = Word2Vec(sentences=sentences,
                    sg=1,
                    hs=0,
                    workers=num_workers,
                    size=num_features,
                    min_count=min_word_count,
                    window=context,
                    sample=downsampling,
                    negative=5,
                    iter=6)

In [158]:
# W2Vmodel['sentences']

In [159]:
def get_w2v_features(w2v_model, sentence_group):
    """ Transform a sentence_group (containing multiple lists
    of words) into a feature vector. It averages out all the
    word vectors of the sentence_group.
    """
    words = np.concatenate(sentence_group)  # words in text
    index2word_set = set(w2v_model.wv.vocab.keys())  # words known to model
    
    featureVec = np.zeros(w2v_model.vector_size, dtype="float32")
    
    # Initialize a counter for number of words in a review
    nwords = 0
    # Loop over each word in the comment and, if it is in the model's vocabulary, add its feature vector to the total
    for word in words:
        if word in index2word_set: 
            featureVec = np.add(featureVec, w2v_model[word])
            nwords += 1.

    # Divide the result by the number of words to get the average
    if nwords > 0:
        featureVec = np.divide(featureVec, nwords)
    return featureVec

df['w2v_features'] = list(map(lambda sen_group:
                                      get_w2v_features(W2Vmodel, sen_group),
                                      df.tokenized_sentences))

  app.launch_new_instance()


In [160]:
df.head()

Unnamed: 0,author,file_name,text,document_sentences,tokenized_sentences,w2v_features
0,Abraham Lincoln,Abraham Lincoln___State of the Union Addresses...,fellow - citizens senate house representative...,[ fellow - citizens senate house representativ...,"[[fellow, citizens, senate, house, representat...","[-0.08300484, -0.034919422, 0.14951418, 0.0426..."
1,Abraham Lincoln,Abraham Lincoln___The Emancipation Proclamatio...,president united states america : proclamatio...,[ president united states america : proclamati...,"[[president, united, states, america, proclama...","[-0.052613672, -0.097373344, 0.19649191, -0.01..."
2,Abraham Lincoln,Abraham Lincoln___The Life and Public Service ...,the discovery unknown address abraham lincoln...,[ the discovery unknown address abraham lincol...,"[[the, discovery, unknown, address, abraham, l...","[-0.07926907, -0.021158708, 0.13386746, 0.0301..."
3,Abraham Lincoln,Abraham Lincoln___The Writings of Abraham Linc...,immediately lincoln re - election presidency ...,[ immediately lincoln re - election presidency...,"[[immediately, lincoln, re, election, presiden...","[-0.0740697, 0.022560358, 0.14278434, 0.046736..."
4,Abraham Lincoln,Abraham Lincoln___The Writings of Abraham Linc...,first child joshua f speed springfield may 18...,[ first child joshua f speed springfield may 1...,"[[first, child, joshua, f, speed, springfield,...","[-0.07442752, 0.01899455, 0.15480827, 0.037126..."


In [161]:
df.author.nunique()

9

In [162]:
le = LabelEncoder()
df['author_LabelEncoded'] = le.fit_transform(df.author)

In [163]:
df.author_LabelEncoded.value_counts()

3    11
1    11
8    10
6    10
4    10
2    10
0    10
7     9
5     9
Name: author_LabelEncoded, dtype: int64

In [164]:
y_one_hot = np_utils.to_categorical(df['author_LabelEncoded'])

In [165]:
df.head()

Unnamed: 0,author,file_name,text,document_sentences,tokenized_sentences,w2v_features,author_LabelEncoded
0,Abraham Lincoln,Abraham Lincoln___State of the Union Addresses...,fellow - citizens senate house representative...,[ fellow - citizens senate house representativ...,"[[fellow, citizens, senate, house, representat...","[-0.08300484, -0.034919422, 0.14951418, 0.0426...",0
1,Abraham Lincoln,Abraham Lincoln___The Emancipation Proclamatio...,president united states america : proclamatio...,[ president united states america : proclamati...,"[[president, united, states, america, proclama...","[-0.052613672, -0.097373344, 0.19649191, -0.01...",0
2,Abraham Lincoln,Abraham Lincoln___The Life and Public Service ...,the discovery unknown address abraham lincoln...,[ the discovery unknown address abraham lincol...,"[[the, discovery, unknown, address, abraham, l...","[-0.07926907, -0.021158708, 0.13386746, 0.0301...",0
3,Abraham Lincoln,Abraham Lincoln___The Writings of Abraham Linc...,immediately lincoln re - election presidency ...,[ immediately lincoln re - election presidency...,"[[immediately, lincoln, re, election, presiden...","[-0.0740697, 0.022560358, 0.14278434, 0.046736...",0
4,Abraham Lincoln,Abraham Lincoln___The Writings of Abraham Linc...,first child joshua f speed springfield may 18...,[ first child joshua f speed springfield may 1...,"[[first, child, joshua, f, speed, springfield,...","[-0.07442752, 0.01899455, 0.15480827, 0.037126...",0


In [166]:
X_train, X_test, y_train, y_test = train_test_split(df.w2v_features, df['author_LabelEncoded'], random_state = 0)

In [167]:
X_train_w2v = np.array(list(map(np.array, X_train)))
X_test_w2v = np.array(list(map(np.array, X_test)))

In [168]:
clf = SVC(kernel='linear').fit(X_train_w2v, y_train)

In [169]:
y_score = clf.predict(X_test_w2v)

In [170]:
n_right = 0
for i in range(len(y_score)):
    if y_score.tolist()[i] == y_test.tolist()[i]:
        n_right += 1

print("Accuracy: %.2f%%" % ((n_right/float(len(y_test)) * 100)))

Accuracy: 30.43%


In [171]:
print(classification_report(y_test, y_score))

             precision    recall  f1-score   support

          0       1.00      0.75      0.86         4
          1       0.11      0.50      0.18         2
          2       0.00      0.00      0.00         4
          3       0.33      1.00      0.50         2
          4       0.00      0.00      0.00         3
          5       0.00      0.00      0.00         2
          6       0.00      0.00      0.00         3
          7       0.00      0.00      0.00         2
          8       0.20      1.00      0.33         1

avg / total       0.22      0.30      0.22        23



  'precision', 'predicted', average, warn_for)


In [172]:
rfc = RandomForestClassifier(random_state=42)

In [173]:
rfc.fit(X_train_w2v, y_train)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=1,
            oob_score=False, random_state=42, verbose=0, warm_start=False)

In [174]:
y_score = clf.predict(X_test_w2v)

In [175]:
print(classification_report(y_test, y_score))

             precision    recall  f1-score   support

          0       1.00      0.75      0.86         4
          1       0.11      0.50      0.18         2
          2       0.00      0.00      0.00         4
          3       0.33      1.00      0.50         2
          4       0.00      0.00      0.00         3
          5       0.00      0.00      0.00         2
          6       0.00      0.00      0.00         3
          7       0.00      0.00      0.00         2
          8       0.20      1.00      0.33         1

avg / total       0.22      0.30      0.22        23



  'precision', 'predicted', average, warn_for)


In [176]:
df.head()

Unnamed: 0,author,file_name,text,document_sentences,tokenized_sentences,w2v_features,author_LabelEncoded
0,Abraham Lincoln,Abraham Lincoln___State of the Union Addresses...,fellow - citizens senate house representative...,[ fellow - citizens senate house representativ...,"[[fellow, citizens, senate, house, representat...","[-0.08300484, -0.034919422, 0.14951418, 0.0426...",0
1,Abraham Lincoln,Abraham Lincoln___The Emancipation Proclamatio...,president united states america : proclamatio...,[ president united states america : proclamati...,"[[president, united, states, america, proclama...","[-0.052613672, -0.097373344, 0.19649191, -0.01...",0
2,Abraham Lincoln,Abraham Lincoln___The Life and Public Service ...,the discovery unknown address abraham lincoln...,[ the discovery unknown address abraham lincol...,"[[the, discovery, unknown, address, abraham, l...","[-0.07926907, -0.021158708, 0.13386746, 0.0301...",0
3,Abraham Lincoln,Abraham Lincoln___The Writings of Abraham Linc...,immediately lincoln re - election presidency ...,[ immediately lincoln re - election presidency...,"[[immediately, lincoln, re, election, presiden...","[-0.0740697, 0.022560358, 0.14278434, 0.046736...",0
4,Abraham Lincoln,Abraham Lincoln___The Writings of Abraham Linc...,first child joshua f speed springfield may 18...,[ first child joshua f speed springfield may 1...,"[[first, child, joshua, f, speed, springfield,...","[-0.07442752, 0.01899455, 0.15480827, 0.037126...",0


### TF-IDF

In [177]:
vectorizer = TfidfVectorizer(sublinear_tf=True, use_idf =True, lowercase=True, strip_accents='ascii', stop_words = 'english')

In [178]:
XText_tfidf = vectorizer.fit_transform(df['text'])

In [179]:
XText_tfidf.shape

(90, 66412)

In [180]:
X_train, X_test, y_train, y_test = train_test_split(XText_tfidf, df.author_LabelEncoded, random_state = 0)

In [181]:
clf = MultinomialNB().fit(X_train, y_train)

In [182]:
y_score = clf.predict(X_test)

In [183]:
n_right = 0
for i in range(len(y_score)):
    if y_score.tolist()[i] == y_test.tolist()[i]:
        n_right += 1

print("Accuracy: %.2f%%" % ((n_right/float(len(y_test)) * 100)))

Accuracy: 52.17%


In [184]:
print(classification_report(y_test, y_score))

             precision    recall  f1-score   support

          0       1.00      0.75      0.86         4
          1       1.00      1.00      1.00         2
          2       0.00      0.00      0.00         4
          3       0.20      1.00      0.33         2
          4       1.00      0.33      0.50         3
          5       1.00      0.50      0.67         2
          6       1.00      0.67      0.80         3
          7       0.00      0.00      0.00         2
          8       0.25      1.00      0.40         1

avg / total       0.64      0.52      0.51        23



  'precision', 'predicted', average, warn_for)


In [185]:
clf = SVC(kernel='linear').fit(X_train, y_train)

In [186]:
y_score = clf.predict(X_test)

In [187]:
n_right = 0
for i in range(len(y_score)):
    if y_score.tolist()[i] == y_test.tolist()[i]:
        n_right += 1

print("Accuracy: %.2f%%" % ((n_right/float(len(y_test)) * 100)))

Accuracy: 82.61%


In [188]:
print(classification_report(y_test, y_score))

             precision    recall  f1-score   support

          0       1.00      1.00      1.00         4
          1       0.50      1.00      0.67         2
          2       1.00      0.75      0.86         4
          3       0.67      1.00      0.80         2
          4       1.00      1.00      1.00         3
          5       1.00      0.50      0.67         2
          6       1.00      0.67      0.80         3
          7       1.00      0.50      0.67         2
          8       0.50      1.00      0.67         1

avg / total       0.91      0.83      0.83        23



In [189]:
df.head()

Unnamed: 0,author,file_name,text,document_sentences,tokenized_sentences,w2v_features,author_LabelEncoded
0,Abraham Lincoln,Abraham Lincoln___State of the Union Addresses...,fellow - citizens senate house representative...,[ fellow - citizens senate house representativ...,"[[fellow, citizens, senate, house, representat...","[-0.08300484, -0.034919422, 0.14951418, 0.0426...",0
1,Abraham Lincoln,Abraham Lincoln___The Emancipation Proclamatio...,president united states america : proclamatio...,[ president united states america : proclamati...,"[[president, united, states, america, proclama...","[-0.052613672, -0.097373344, 0.19649191, -0.01...",0
2,Abraham Lincoln,Abraham Lincoln___The Life and Public Service ...,the discovery unknown address abraham lincoln...,[ the discovery unknown address abraham lincol...,"[[the, discovery, unknown, address, abraham, l...","[-0.07926907, -0.021158708, 0.13386746, 0.0301...",0
3,Abraham Lincoln,Abraham Lincoln___The Writings of Abraham Linc...,immediately lincoln re - election presidency ...,[ immediately lincoln re - election presidency...,"[[immediately, lincoln, re, election, presiden...","[-0.0740697, 0.022560358, 0.14278434, 0.046736...",0
4,Abraham Lincoln,Abraham Lincoln___The Writings of Abraham Linc...,first child joshua f speed springfield may 18...,[ first child joshua f speed springfield may 1...,"[[first, child, joshua, f, speed, springfield,...","[-0.07442752, 0.01899455, 0.15480827, 0.037126...",0


### Bag Of Word

In [190]:
count_vectorizer = CountVectorizer(stop_words='english')
XText_CountVectorizer = count_vectorizer.fit_transform(df['text'])

In [191]:
XText_CountVectorizer.shape[1]

66412

In [192]:
X_train, X_test, y_train, y_test = train_test_split(XText_CountVectorizer, df.author_LabelEncoded, random_state = 0)

In [193]:
clf = MultinomialNB().fit(X_train, y_train)

In [194]:
y_score = clf.predict(X_test)

In [195]:
n_right = 0
for i in range(len(y_score)):
    if y_score.tolist()[i] == y_test.tolist()[i]:
        n_right += 1

print("Accuracy: %.2f%%" % ((n_right/float(len(y_test)) * 100)))

Accuracy: 95.65%


In [196]:
print(classification_report(y_test, y_score))

             precision    recall  f1-score   support

          0       1.00      1.00      1.00         4
          1       1.00      1.00      1.00         2
          2       1.00      0.75      0.86         4
          3       1.00      1.00      1.00         2
          4       1.00      1.00      1.00         3
          5       1.00      1.00      1.00         2
          6       1.00      1.00      1.00         3
          7       1.00      1.00      1.00         2
          8       0.50      1.00      0.67         1

avg / total       0.98      0.96      0.96        23



In [197]:
clf = SVC(kernel='linear').fit(X_train, y_train)

In [198]:
y_score = clf.predict(X_test)

In [199]:
y_score.shape

(23,)

In [200]:
n_right = 0
for i in range(len(y_score)):
    if y_score.tolist()[i] == y_test.tolist()[i]:
        n_right += 1

print("Accuracy: %.2f%%" % ((n_right/float(len(y_test)) * 100)))

Accuracy: 69.57%


In [201]:
print(classification_report(y_test.tolist(), y_score.tolist()))

             precision    recall  f1-score   support

          0       1.00      1.00      1.00         4
          1       0.50      1.00      0.67         2
          2       1.00      0.75      0.86         4
          3       0.50      1.00      0.67         2
          4       0.50      0.33      0.40         3
          5       1.00      0.50      0.67         2
          6       1.00      0.33      0.50         3
          7       0.50      0.50      0.50         2
          8       0.50      1.00      0.67         1

avg / total       0.78      0.70      0.69        23



### Word Embedding

In [202]:
### Create sequence
vocabulary_size = XText_CountVectorizer.shape[1]
num_of_author = len(df['author_LabelEncoded'].unique())

tokenizer = Tokenizer(num_words= vocabulary_size)
tokenizer.fit_on_texts(df['text'])
sequences = tokenizer.texts_to_sequences(df['text'])
data = pad_sequences(sequences, maxlen=5000)

In [203]:
X_train, X_test, y_train, y_test = train_test_split(data, np_utils.to_categorical(df['author_LabelEncoded']), random_state = 0)

In [204]:
model = Sequential()
model.add(Embedding(vocabulary_size, 256, input_length = data.shape[1]))
model.add(LSTM(128, dropout=0.2, recurrent_dropout=0.2))
model.add(Dense(num_of_author, activation='softmax'))
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

model.fit(X_train, y_train, epochs=10, batch_size=5)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x22cfa5c0>

In [205]:
y_score = model.predict(X_test)
y_score = [[1 if i == max(sc) else 0 for i in sc] for sc in y_score]
n_right = 0
for i in range(len(y_score)):
    if all(y_score[i][j] == np.array(y_test)[i][j] for j in range(len(y_score[i]))):
        n_right += 1

print("Accuracy: %.2f%%" % ((n_right/float(len(y_test)) * 100)))

Accuracy: 17.39%
