In [127]:
from nltk.corpus import gutenberg, stopwords
import nltk
# Keras
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers import Dense, Flatten, LSTM, Conv1D, MaxPooling1D, Dropout, Activation
from keras.layers.embeddings import Embedding
# Others
import re
import string
import numpy as np
import pandas as pd

from sklearn.manifold import TSNE
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.preprocessing import LabelEncoder
from sklearn.svm import SVC
from sklearn.metrics import classification_report

In [128]:
# List the file ids available in the NLTK Gutenberg corpus.
gutenberg.fileids()

['austen-emma.txt',
 'austen-persuasion.txt',
 'austen-sense.txt',
 'bible-kjv.txt',
 'blake-poems.txt',
 'bryant-stories.txt',
 'burgess-busterbrown.txt',
 'carroll-alice.txt',
 'chesterton-ball.txt',
 'chesterton-brown.txt',
 'chesterton-thursday.txt',
 'edgeworth-parents.txt',
 'melville-moby_dick.txt',
 'milton-paradise.txt',
 'shakespeare-caesar.txt',
 'shakespeare-hamlet.txt',
 'shakespeare-macbeth.txt',
 'whitman-leaves.txt']

In [129]:
# Disabled exploration: per-file corpus statistics (kept commented out).
# for fileid in gutenberg.fileids():
#     num_chars = len(gutenberg.raw(fileid)) # Count the number of characters
#     num_words = len(gutenberg.words(fileid)) # Count the number of words
#     num_sents = len(gutenberg.sents(fileid)) # Count the number of sentences
#     num_vocab = len(set([w.lower() for w in gutenberg.words(fileid)])) # Count the distinct (lowercased) words in the text
#     print(int(num_chars / num_words), int(num_words / num_sents), int(num_words / num_vocab), fileid)


In [130]:
# Gutenberg file ids used in this experiment. Row order matters: later cells
# slice rows 0-9 as the training set and rows 10-12 as the test set.
file_ids = [
    'austen-emma.txt',
    'bible-kjv.txt',
    'blake-poems.txt',
    'bryant-stories.txt',
    'burgess-busterbrown.txt',
    'carroll-alice.txt',
    'chesterton-ball.txt',
    'edgeworth-parents.txt',
    'melville-moby_dick.txt',
    'shakespeare-hamlet.txt',
    'austen-persuasion.txt',
    'chesterton-brown.txt',
    'shakespeare-macbeth.txt',
]

# Each record is [author, file_name]; the author is the file id's prefix before the first '-'.
list_of_lists = [[file_id.split('-')[0], file_id] for file_id in file_ids]

# columns
columns = ['author', 'file_name']

# One row per file, with an empty 'text' column to be filled in later.
df = pd.DataFrame(list_of_lists, columns=columns).assign(text='')

In [131]:
# Preview the first rows; the 'text' column is still empty at this point.
df.head()

Unnamed: 0,author,file_name,text
0,austen,austen-emma.txt,
1,bible,bible-kjv.txt,
2,blake,blake-poems.txt,
3,bryant,bryant-stories.txt,
4,burgess,burgess-busterbrown.txt,


In [132]:
# Ordered (pattern, replacement) pairs applied one after another by _clean_text.
# Order is significant: contractions must be expanded before the bare
# apostrophe is stripped, and whitespace is collapsed last.
_TEXT_SUBSTITUTIONS = [
    # Replace every character outside the whitelist with a space. The hyphen
    # is escaped so that "+-=" is not read as a character range (which
    # silently whitelisted ':', ';' and '<'); ':' is listed explicitly because
    # a later rule turns it into its own token.
    (r"[^A-Za-z0-9^,!.\/'+\-=:]", " "),
    # Expand common English contractions.
    (r"what's", "what is "),
    (r"\'s", " "),
    (r"\'ve", " have "),
    (r"n't", " not "),
    (r"i'm", "i am "),
    (r"\'re", " are "),
    (r"\'d", " would "),
    (r"\'ll", " will "),
    # Drop or isolate the punctuation that survived the whitelist.
    (r",", " "),
    (r"\.", " "),
    (r"!", " ! "),
    (r"\/", " "),
    (r"\^", " ^ "),
    (r"\+", " + "),
    (r"\-", " - "),
    (r"\=", " = "),
    (r"'", " "),
    # "5k" -> "5000".
    (r"(\d+)(k)", r"\g<1>000"),
    (r":", " : "),
    # Re-join abbreviations that the punctuation rules split apart.
    (r" e g ", " eg "),
    (r" b g ", " bg "),
    (r" u s ", " american "),
    # "1800s" -> "1800". The previous pattern r"\0s" was a NUL escape followed
    # by "s" and could never match.
    (r"0s\b", "0"),
    (r" 9 11 ", "911"),
    (r"e - mail", "email"),
    (r"j k", "jk"),
    # Collapse runs of whitespace into a single space.
    (r"\s{2,}", " "),
]


def _clean_text(text):
    """Apply every rule in _TEXT_SUBSTITUTIONS to `text`, in order, and return the result."""
    for pattern, replacement in _TEXT_SUBSTITUTIONS:
        text = re.sub(pattern, replacement, text)
    return text


def get_data_of_file(row):
    """Load and normalise the Gutenberg text named by a DataFrame row.

    Reads the raw text for ``row.file_name`` from the NLTK Gutenberg corpus,
    discards everything before the first newline (presumably the title line),
    lowercases the remainder, applies the substitution rules in
    ``_TEXT_SUBSTITUTIONS`` and stores the result in ``row['text']``.

    Parameters
    ----------
    row : a row object exposing ``file_name`` as an attribute and supporting
        item assignment (a pandas Series when used with ``df.apply(axis=1)``).

    Returns
    -------
    The same ``row`` object with its ``'text'`` entry set to the cleaned text.

    Raises
    ------
    ValueError
        If the raw text contains no newline character (from ``str.index``).
    """
    raw_text = gutenberg.raw(row.file_name)
    # Keep the newline itself: it becomes a space during cleaning.
    body = raw_text[raw_text.index('\n'):].lower()
    row['text'] = _clean_text(body)
    return row
# Fill the 'text' column by running get_data_of_file on every row (axis=1 passes one row at a time).
df = df.apply(get_data_of_file, axis=1)

In [133]:
# Preview the first rows now that the 'text' column holds the cleaned corpus text.
df.head()

Unnamed: 0,author,file_name,text
0,austen,austen-emma.txt,volume i chapter i emma woodhouse handsome cl...
1,bible,bible-kjv.txt,the old testament of the king james bible the...
2,blake,blake-poems.txt,songs of innocence and of experience and the ...
3,bryant,bryant-stories.txt,two little riddles in rhyme there a garden th...
4,burgess,burgess-busterbrown.txt,i buster bear goes fishing buster bear yawned...


In [134]:
# Encode each author name as an integer class id for the scikit-learn classifiers.
# LabelEncoder comes from sklearn.preprocessing and must be imported in the
# imports cell, otherwise this cell raises NameError on a fresh kernel.
le = LabelEncoder()
df['author_LabelEncoded'] = le.fit_transform(df['author'])

In [137]:
# Show up to 15 rows (the frame has 13) to inspect the new author_LabelEncoded column.
df.head(15)

Unnamed: 0,author,file_name,text,author_LabelEncoded
0,austen,austen-emma.txt,volume i chapter i emma woodhouse handsome cl...,0
1,bible,bible-kjv.txt,the old testament of the king james bible the...,1
2,blake,blake-poems.txt,songs of innocence and of experience and the ...,2
3,bryant,bryant-stories.txt,two little riddles in rhyme there a garden th...,3
4,burgess,burgess-busterbrown.txt,i buster bear goes fishing buster bear yawned...,4
5,carroll,carroll-alice.txt,chapter i down the rabbit - hole alice was be...,5
6,chesterton,chesterton-ball.txt,i a discussion somewhat in the air the flying...,6
7,edgeworth,edgeworth-parents.txt,the orphans near the ruins of the castle of r...,7
8,melville,melville-moby_dick.txt,etymology supplied by a late consumptive ushe...,8
9,shakespeare,shakespeare-hamlet.txt,actus primus scoena prima enter barnardo and ...,9


In [138]:
# TF-IDF feature extractor: English stop words removed, accents folded to
# ASCII, input lowercased, IDF weighting on, and sublinear term-frequency scaling.
vectorizer = TfidfVectorizer(
    stop_words='english',
    strip_accents='ascii',
    lowercase=True,
    use_idf=True,
    sublinear_tf=True,
)

In [139]:
# Learn the TF-IDF vocabulary and weights from every row of df['text'] and vectorize them in one pass.
# NOTE(review): this includes rows 10-12, which are later sliced off as the test set, so the
# vocabulary and IDF statistics also see the test documents — consider fitting on the training rows only.
XText_tfidf = vectorizer.fit_transform(df['text'])

In [140]:
# Matrix dimensions: (number of documents, number of vocabulary terms).
XText_tfidf.shape

(13, 35113)

In [143]:
# Positional hold-out split: the first ten rows train the model, the last three
# rows (second works by austen, chesterton and shakespeare) are the test set.
train_rows, test_rows = slice(0, 10), slice(10, 13)
X_train, X_test = XText_tfidf[train_rows], XText_tfidf[test_rows]
y_train, y_test = df.author_LabelEncoded[train_rows], df.author_LabelEncoded[test_rows]

In [144]:
# Train a multinomial Naive Bayes classifier on the TF-IDF features of the training rows.
clf = MultinomialNB().fit(X_train, y_train)

In [145]:
# Predict an author label id for each held-out document.
# Despite its name, y_score holds predicted class labels, not scores.
y_score = clf.predict(X_test)

In [147]:
# Count the predictions that equal the true labels with one vectorized comparison
# (the previous loop rebuilt both lists via .tolist() on every iteration).
n_right = int(np.sum(np.asarray(y_score) == np.asarray(y_test)))

print("Accuracy: %.2f%%" % ((n_right/float(len(y_test)) * 100)))

Accuracy: 100.00%


In [148]:
# Per-class precision, recall, F1 and support for the Naive Bayes predictions.
print(classification_report(y_test, y_score))

             precision    recall  f1-score   support

          0       1.00      1.00      1.00         1
          6       1.00      1.00      1.00         1
          9       1.00      1.00      1.00         1

avg / total       1.00      1.00      1.00         3



In [89]:
# Rebind clf to a linear-kernel SVM trained on X_train / y_train.
# NOTE(review): this cell's execution counter (89) is lower than the cells above it (148),
# so its saved output comes from an earlier run — re-run the notebook top to bottom to confirm.
clf = SVC(kernel='linear').fit(X_train, y_train)

In [90]:
# Predict an author label id for each held-out document with the SVM (overwrites y_score).
y_score = clf.predict(X_test)

In [91]:
# Count the predictions that equal the true labels with one vectorized comparison
# (the previous loop rebuilt both lists via .tolist() on every iteration).
n_right = int(np.sum(np.asarray(y_score) == np.asarray(y_test)))

print("Accuracy: %.2f%%" % ((n_right/float(len(y_test)) * 100)))

Accuracy: 100.00%


In [93]:
# Per-class precision, recall, F1 and support for the SVM predictions.
print(classification_report(y_test, y_score))

             precision    recall  f1-score   support

          0       1.00      1.00      1.00         1
          6       1.00      1.00      1.00         1
          9       1.00      1.00      1.00         1

avg / total       1.00      1.00      1.00         3



In [156]:
### Create sequence
vocabulary_size = 50000
tokenizer = Tokenizer(num_words= vocabulary_size)
tokenizer.fit_on_texts(df['text'])
sequences = tokenizer.texts_to_sequences(df['text'])
data = pad_sequences(sequences, maxlen=1000)

In [157]:
# One-hot encode the author column (one indicator column per distinct author),
# then apply the same positional split as before: rows 0-9 train, rows 10-12 test.
s_dummies = pd.get_dummies(df.author)
train_rows, test_rows = slice(0, 10), slice(10, 13)
X_train, X_test = data[train_rows], data[test_rows]
y_train, y_test = s_dummies[train_rows], s_dummies[test_rows]

In [158]:
# Embedding -> LSTM -> softmax classifier over the authors.
# The layer sizes are derived from the tokenizer setting and the prepared
# arrays instead of repeating the literals 50000 / 1000 / 10, so they cannot
# drift out of sync with the preprocessing cells.
model = Sequential()
model.add(Embedding(vocabulary_size, 100, input_length=X_train.shape[1]))
model.add(LSTM(100, dropout=0.2, recurrent_dropout=0.2))
# One output unit per one-hot author column.
model.add(Dense(y_train.shape[1], activation='softmax'))
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

model.fit(X_train, y_train, epochs=10, batch_size=64)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x11afbf60>

In [159]:
# Class probabilities for each held-out document, one row per document.
y_prob = np.asarray(model.predict(X_test))

# Compare the most probable class with the class marked in the one-hot target.
# (The previous version recomputed max(sc) per element and rebuilt
# np.array(y_test) for every cell of the comparison.)
predicted_class = y_prob.argmax(axis=1)
expected_class = np.asarray(y_test).argmax(axis=1)

# Keep y_score as one-hot 0/1 rows, as the original cell left it.
# On an exact probability tie only the first maximal class is marked.
y_score = np.eye(y_prob.shape[1], dtype=int)[predicted_class].tolist()
n_right = int((predicted_class == expected_class).sum())

print("Accuracy: %.2f%%" % ((n_right/float(len(y_test)) * 100)))

Accuracy: 33.33%
