In [63]:
import nltk
nltk.download('punkt')
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.util import ngrams
import string
import numpy as np
from sklearn.utils import shuffle
import collections
import re
import math
from sklearn.metrics import confusion_matrix

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\damio\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [64]:
def tokenize(str_data):
    data = str_data.lower()
    data = re.sub(r'\n', '', data)
    data = re.sub(r'[^A-Za-z\s]', '', data)
    data = data.split(' ')
    data = list(filter(lambda item: item != '', data))
    return data
def extract_ngrams(tokens, n = 2):
    ngram_list = ngrams(tokens, n)
    return ngram_list

In [65]:
books = []

for i in range(1, 8):
    with open (f"HP{i}.txt", 'r') as f:
        pages = []
        data = f.read()
        data = data.split('\n')
        for page in data:
            pages.append(tokenize(page))
        books.append(pages)
        f.close()

In [66]:
data = []
labels = []
for i, book in enumerate(books):
    for page in book:
        data.append(page)
        labels.append(i + 1)
books = None

In [67]:
data, labels = shuffle(data, labels, random_state=0)
train_data = data[:int(len(data) * 0.8)]
train_labels = labels[:int(len(labels) * 0.8)]

test_data = data[int(len(data) * 0.8):int(len(data) * 0.9)]
test_labels = labels[int(len(labels) * 0.8):int(len(labels) * 0.9)]

val_data = data[int(len(data) * 0.9):]
val_labels = labels[int(len(labels) * 0.9):]

data = None
labels = None

In [68]:
word_frequencies = collections.defaultdict(lambda: collections.defaultdict(int))
for label, page in zip(train_labels, train_data):
    ngrams_list = list(extract_ngrams(page))
    for ngram in ngrams_list:
        word_frequencies[label][ngram] += 1

In [69]:
def classify(page, delta = 0.01):
    class_counts = collections.Counter(label for label in train_labels)
    class_priors = {cls: count / len(train_data) for cls, count in class_counts.items()}

    page_ngrams = list(extract_ngrams(page))

    probabilities = {}
    for label in class_counts.keys():
        prob = math.log(class_priors[label])

        for ngram in page_ngrams:
            n_count = word_frequencies[label][tuple(ngram)] + delta
            n_total = sum(word_frequencies[label].values()) + delta * len(word_frequencies[label])
            prob += math.log(n_count / n_total)
        probabilities[label] = prob

    return max(probabilities, key=probabilities.get)

In [70]:
deltas = [0.5, 0.1, 0.01, 0.001, 0.0001]
for delta in deltas:
    predicted = []
    for page in test_data:
        predicted.append(classify(page))
    cm = confusion_matrix(test_labels, predicted)
    tot = np.sum(cm)
    correct = np.sum(np.diagonal(cm))
    accuracy = correct/tot * 100
    print(f"Accuracy for delta = {delta} is {accuracy}")

Accuracy for delta = 0.5 is 77.49469214437367
Accuracy for delta = 0.1 is 77.49469214437367
Accuracy for delta = 0.01 is 77.49469214437367
Accuracy for delta = 0.001 is 77.49469214437367


KeyboardInterrupt: 