# Natural Language Processing: Classification

In [26]:
def text2paragraphs(filename, min_size=1):
    txt = open(filename, 'r', encoding='utf8')

    txt = txt.read()

    paragraphs = [para for para in txt.split('\n\n') if len(para) > min_size]

    return paragraphs

In [27]:
labels = ['Virginia Woolf', 'Samuel Butler', 'Herman Melville', 'David Herbert Lawrence',
          'Daniel Defoe', 'James Joyce']

files = ['night_and_day_virginia_woolf.txt', 'the_way_of_all_flash_butler.txt',
         'moby_dick_melville.txt', 'sons_and_lovers_lawrence.txt',
         'robinson_crusoe_defoe.txt', 'james_joyce_ulysses.txt']

In [28]:
data = []
targets = []
counter = 0

for fname in files:
    paras = text2paragraphs(fname, min_size=150)
    data.extend(paras)

    targets += [counter] * len(paras)
    counter += 1

In [30]:
import random

data_targets = list(zip(data, targets))
data_targets = random.sample(data_targets, len(data_targets))

data, targets = list(zip(*data_targets))

In [31]:
from sklearn.model_selection import train_test_split

train_data, test_data, train_targets, test_targets = train_test_split(data, targets,
                                                                      train_size=0.8,
                                                                      test_size=0.2,
                                                                      random_state=42)

In [33]:
from sklearn.feature_extraction.text import CountVectorizer, ENGLISH_STOP_WORDS
from sklearn.naive_bayes import MultinomialNB
from sklearn import metrics

vectorizer = CountVectorizer(stop_words=list(ENGLISH_STOP_WORDS))
vectors = vectorizer.fit_transform(train_data)

classifier = MultinomialNB(alpha=0.01)
classifier.fit(vectors, train_targets)

vectors_test = vectorizer.transform(test_data)

predictions = classifier.predict(vectors_test)
accuracy_score = metrics.accuracy_score(test_targets, predictions)

f1_score = metrics.f1_score(test_targets, predictions, average='macro')

print('Accuracy score: ', accuracy_score)
print("F1-score: ", f1_score)

Accuracy score:  0.9079019073569482
F1-score:  0.9040653862199467


In [36]:
paras = text2paragraphs('the_way_of_all_flash_butler.txt', min_size=250)

first_para, last_para = 100, 500
vectors_test = vectorizer.transform(paras[first_para: last_para])

predictions = classifier.predict(vectors_test)
print(predictions)

targets = [0] * (last_para - first_para)

accuracy_score = metrics.accuracy_score(targets, predictions)
f1_score = metrics.f1_score(targets, predictions, average='macro')

print('Accuracy Score: ', accuracy_score)
print('F1-score: ', f1_score)

[1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 5 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 3 1 1 1 1 1 1 1 1 1 1 1
 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 3 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1]
Accuracy Score:  0.0025
F1-score:  0.0012468827930174565


In [37]:
predictions = classifier.predict_proba(vectors_test)
print(predictions)

[[1.14225044e-45 1.00000000e+00 5.75575570e-68 4.54535031e-55
  1.00531347e-59 8.38127129e-58]
 [5.12376439e-11 1.00000000e+00 2.01599535e-29 9.87528690e-31
  1.03842869e-42 3.30261009e-19]
 [1.25300503e-19 1.00000000e+00 2.23665689e-43 7.34845309e-32
  4.68185436e-32 7.67785640e-36]
 ...
 [4.76854571e-17 1.00000000e+00 2.00913469e-36 3.31795283e-29
  9.38027729e-46 6.54210889e-31]
 [3.33282493e-10 1.00000000e+00 1.07749183e-17 1.79841096e-14
  4.59916910e-22 2.44288555e-15]
 [2.87964670e-16 1.00000000e+00 2.46375214e-35 2.52443688e-29
  2.89432339e-32 1.02570110e-40]]


In [38]:
for i in range(0, 10):
    print(predictions[i], paras[i+first_para])

[1.14225044e-45 1.00000000e+00 5.75575570e-68 4.54535031e-55
 1.00531347e-59 8.38127129e-58] The Allabys behaved with great judgement.  They humoured him till his
retreat was practically cut off, though he still flattered himself that
it was open.  One day about six months after Theobald had become an
almost daily visitor at the Rectory the conversation happened to turn
upon long engagements.  "I don't like long engagements, Mr Allaby, do
you?" said Theobald imprudently.  "No," said Mr Allaby in a pointed tone,
"nor long courtships," and he gave Theobald a look which he could not
pretend to misunderstand.  He went back to Cambridge as fast as he could
go, and in dread of the conversation with Mr Allaby which he felt to be
impending, composed the following letter which he despatched that same
afternoon by a private messenger to Crampsford.  The letter was as
follows:--
[5.12376439e-11 1.00000000e+00 2.01599535e-29 9.87528690e-31
 1.03842869e-42 3.30261009e-19]    "Dearest Miss Christina

In [39]:
from sklearn.neural_network import MLPClassifier

vectorizer = CountVectorizer(stop_words=list(ENGLISH_STOP_WORDS))
vectors = vectorizer.fit_transform(train_data)

print("Creating a classifier. This will take some time!")
classifier = MLPClassifier(random_state=1, max_iter=300).fit(vectors, train_targets)

Creating a classifier. This will take some time!


In [None]:
vectors_test = vectorizer.transform(test_data)

predictions = classifier.predict(vectors_test)
accuracy_score = metrics.accuracy_score(test_targets, predictions)

f1_score = metrics.f1_score(test_targets, predictions, average='macro')

print('accuracy score: ', accuracy_score)
print('f1-score: ', f1_score)