In [2]:
import nltk
nltk.download('punkt')
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.util import ngrams
import string
import numpy as np
from sklearn.utils import shuffle
import collections
import re
import math

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\damio\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [3]:
def tokenize(str_data):
    data = str_data.lower()
    data = re.sub(r'\n', '', data)
    data = re.sub(r'[^A-Za-z\s]', '', data)
    data = data.split(' ')
    data = list(filter(lambda item: item != '', data))
    return data

def extract_ngrams(tokens, n = 2):
    ngram_list = ngrams(tokens, n)
    return list(ngram_list)

In [4]:
books = []
labels = [1, 2, 3, 4, 5, 6, 7]
for book in labels:
    with open (f"harry_potter/HP{book}.txt", 'r') as f:
        data = f.read()
        books.append(extract_ngrams(tokenize(data), 2))
        f.close()

#### Create the dataset, shuffle and split into train, test, validate

In [5]:
#Merge all the ngrams together and have corresponding labels
data = []
labels = []
for k in range(len(books)):
    data += books[k]
    labels += [k+1 for i in range(len(books[k]))]

books = None
data = np.array(data)
labels = np.array(labels)

data, labels = shuffle(data, labels, random_state=0)

#Split the data into training and testing and validation
train_data = data[:int(len(data)*0.8)]
train_labels = labels[:int(len(labels)*0.8)]

test_data = data[int(len(data)*0.8):int(len(data)*0.9)]
test_labels = labels[int(len(labels)*0.8):int(len(labels)*0.9)]

val_data = data[int(len(data)*0.9):]
val_labels = labels[int(len(labels)*0.9):]

In [6]:
print(train_data.shape, test_data.shape, val_data.shape)
print(train_labels.shape, test_labels.shape, val_labels.shape)

(872433, 2) (109054, 2) (109055, 2)
(872433,) (109054,) (109055,)


In [7]:
#Get the word frequencies for the training set 
word_frequencies = collections.defaultdict(lambda: collections.defaultdict(int))
for label, ngram in zip(train_labels, train_data):
    word_frequencies[label][tuple(ngram)] += 1

In [8]:
class_counts = collections.Counter(label for label in train_labels)
class_priors = {cls: count / len(train_data) for cls, count in class_counts.items()}

In [14]:
def classify(data, delta = 0.1):
    probabilities = {}
    for label in class_counts.keys():
        prob = math.log(class_priors[label])
        # for ngram in data:
        n_count = word_frequencies[label][tuple(data)] + delta
        n_total = sum(word_frequencies[label].values()) + delta * len(word_frequencies[label])
        prob += math.log(n_count / n_total)
        probabilities[label] = prob
    return probabilities

In [10]:
sentence = "But there was still a fortnight to go before he went back to school. He looked hopelessly around his room again, and his eye paused on the birthday cards his two best friends had sent him at the end of July. What would they say if Harry wrote to them and told them about his scar hurting? At once, Hermione Granger’s voice seemed to fill his head, shrill and panicky. “ Your scar hurt? Harry, that’s really serious. . . . Write to Professor Dumbledorel And I’ll go and check Common Magical Ailments and Afflictions. ... Maybe there’s something in there about curse scars. ...” Yes, that would be Hermione’s advice: Go straight to the headmaster of Hogwarts, and in the meantime, consult a book. Harry stared out of the window at the inky blue-black sky. He doubted very much whether a book could help him now. As far as he knew, he was the only living person to have survived a curse like Voldemort’s; it was highly unlikely, therefore, that he would find his symptoms listed in Common Magical Ailments and Afflictions. As for informing the headmaster, Harry had no idea where Dumbledore went during the summer holidays. He amused himself for a moment, picturing Dumbledore, with his long silver beard, full-length wizard’s robes, and pointed hat, stretched out on a beach somewhere, rubbing suntan lotion onto his long crooked nose. Wherever Dumbledore was, though, Harry was sure that Hedwig would be able to find him; Harry’s owl had never yet failed to deliver a letter to anyone, even without an address. But what would he write? Dear Professor Dumbledore, Sorry to bother you, but my scar hurt this morning. Yours sincerely, Harry Potter."
sentence = extract_ngrams(tokenize(sentence), 2)

In [15]:
accuracy = 0
tot = 0
for dp in zip(test_data, test_labels):
    probs = classify(dp[0])
    if max(probs, key=probs.get) == dp[1]:
        accuracy += 1
    
    tot += 1

print(accuracy / tot)

0.25867001668897976


In [12]:
test_data[0]

array(['pain', 'seared'], dtype='<U41')

In [13]:
for d in test_data[0]:
    print(d)

pain
seared
