In [1]:
## Download Datasets
!wget -nc https://raw.githubusercontent.com/lazyprogrammer/machine_learning_examples/master/hmm_class/edgar_allan_poe.txt
!wget -nc https://raw.githubusercontent.com/lazyprogrammer/machine_learning_examples/master/hmm_class/robert_frost.txt

File ‘edgar_allan_poe.txt’ already there; not retrieving.

File ‘robert_frost.txt’ already there; not retrieving.



In [2]:
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import string
from sklearn.model_selection import train_test_split

In [3]:
input_files = [
    "edgar_allan_poe.txt",
    "robert_frost.txt"
]

In [4]:
!head edgar_allan_poe.txt

LO! Death hath rear'd himself a throne
In a strange city, all alone,
Far down within the dim west
Where the good, and the bad, and the worst, and the best,
Have gone to their eternal rest.
 
There shrines, and palaces, and towers
Are not like any thing of ours
Oh no! O no! ours never loom
To heaven with that ungodly gloom!


In [5]:
!head robert_frost.txt

Two roads diverged in a yellow wood,
And sorry I could not travel both
And be one traveler, long I stood
And looked down one as far as I could
To where it bent in the undergrowth; 

Then took the other, as just as fair,
And having perhaps the better claim
Because it was grassy and wanted wear,
Though as for that the passing there


In [6]:
# collect all data into list
input_texts= []
labels = []

#illitrate over input_files
for label, file in enumerate(input_files):
    print(f'{file} correspond to label {label}')

    for line in open(file):
        line = line.strip().lower()
        if line:
           # remove punctuation
           line = line.translate(str.maketrans('', '', string.punctuation))
           input_texts.append(line)
           labels.append(label)

edgar_allan_poe.txt correspond to label 0
robert_frost.txt correspond to label 1


In [7]:
# split dataset into training and test sets
input_train, input_test, label_train, label_test = train_test_split(
    input_texts, labels, shuffle=True, random_state=42)


In [8]:
len(input_train), len(input_test)

(1615, 539)

In [9]:
input_train[:10]

['does the rain seem to you to cool the eyes',
 'and perhaps she will come still unafraid',
 'these cheeks where the worm never dies',
 'the ledges show lines ruled southeastnorthwest',
 'if i could see it or else mow the room',
 'winds blow the open grassy places bleak',
 'tis said that when',
 'such as it is it isnt worth the mortgage',
 'will start which lately slept in apathy',
 'which is wrong']

In [10]:
label_train[:10]

[1, 1, 0, 1, 1, 1, 0, 1, 0, 1]

In [11]:
# create word to index mapping
idx = 1
word2idx = {'<unk>': 0}

for line in input_train:
    tokens = line.split()
    for token in tokens:
        if token not in word2idx:
            word2idx[token] = idx
            idx += 1

In [12]:
word2idx

{'<unk>': 0,
 'does': 1,
 'the': 2,
 'rain': 3,
 'seem': 4,
 'to': 5,
 'you': 6,
 'cool': 7,
 'eyes': 8,
 'and': 9,
 'perhaps': 10,
 'she': 11,
 'will': 12,
 'come': 13,
 'still': 14,
 'unafraid': 15,
 'these': 16,
 'cheeks': 17,
 'where': 18,
 'worm': 19,
 'never': 20,
 'dies': 21,
 'ledges': 22,
 'show': 23,
 'lines': 24,
 'ruled': 25,
 'southeastnorthwest': 26,
 'if': 27,
 'i': 28,
 'could': 29,
 'see': 30,
 'it': 31,
 'or': 32,
 'else': 33,
 'mow': 34,
 'room': 35,
 'winds': 36,
 'blow': 37,
 'open': 38,
 'grassy': 39,
 'places': 40,
 'bleak': 41,
 'tis': 42,
 'said': 43,
 'that': 44,
 'when': 45,
 'such': 46,
 'as': 47,
 'is': 48,
 'isnt': 49,
 'worth': 50,
 'mortgage': 51,
 'start': 52,
 'which': 53,
 'lately': 54,
 'slept': 55,
 'in': 56,
 'apathy': 57,
 'wrong': 58,
 'not': 59,
 'much': 60,
 'concerned': 61,
 'for': 62,
 'those': 63,
 'whom': 64,
 'mad': 65,
 'pride': 66,
 'of': 67,
 'intellectuality': 68,
 'a': 69,
 'bead': 70,
 'silver': 71,
 'water': 72,
 'more': 73,
 'less'

In [13]:
#convert input text into integers
train_text_int = []
test_text_int = []
for sentence in input_train:
    sentence = sentence.split()
    line_to_int = [word2idx[token] for token in sentence]
    train_text_int.append(line_to_int)

for sentence in input_test:
    sentence = sentence.split()
    line_to_int = [word2idx.get(token, 0) for token in sentence]
    test_text_int.append(line_to_int)

In [14]:
train_text_int[:5]

[[1, 2, 3, 4, 5, 6, 5, 7, 2, 8],
 [9, 10, 11, 12, 13, 14, 15],
 [16, 17, 18, 2, 19, 20, 21],
 [2, 22, 23, 24, 25, 26],
 [27, 28, 29, 30, 31, 32, 33, 34, 2, 35]]

In [15]:
test_text_int[:5]

[[303, 18, 173, 262, 123, 69, 0, 0],
 [9, 0, 147],
 [0, 47, 104, 1397, 5, 0],
 [67, 272, 56, 749, 769, 468, 177, 9, 555],
 [5, 2, 180, 18, 147, 151, 282, 0, 789]]

In [16]:
#initialise A and pi for both 
v = len(word2idx)
A0 = np.ones((v, v))
pi0 = np.ones(v)

A1 = np.ones((v, v))
pi1 = np.ones(v)

In [17]:
def compute_counts(text_to_int, A, pi):
  for tokens in text_to_int:
    last_idx = None
    for idx in tokens:
      if last_idx is None:
        # it's the first word in a sentence
        pi[idx] += 1
      else:
        # the last word exists, so count a transition
        A[last_idx, idx] += 1

      # update last idx
      last_idx = idx


compute_counts([t for t, y in zip(train_text_int, label_train) if y == 0], A0, pi0)
compute_counts([t for t, y in zip(train_text_int, label_test) if y == 1], A1, pi1)

In [18]:
# normalize A and pi so they are valid probability matrices
A0 /= A0.sum(axis=1, keepdims=True)
pi0 /= pi0.sum()

A1 /= A1.sum(axis=1, keepdims=True)
pi1 /= pi1.sum()

In [19]:
# log A and pi since we don't need the actual probs
logA0 = np.log(A0)
logpi0 = np.log(pi0)

logA1 = np.log(A1)
logpi1 = np.log(pi1)

In [20]:
# compute priors
count0 = sum(y == 0 for y in label_train)
count1 = sum(y == 1 for y in label_train)
total = len(label_train)
p0 = count0 / total
p1 = count1 / total
logp0 = np.log(p0)
logp1 = np.log(p1)
p0, p1

(0.32941176470588235, 0.6705882352941176)

In [21]:
# build a classifier
class Classifier:
  def __init__(self, logAs, logpis, logpriors):
    self.logAs = logAs
    self.logpis = logpis
    self.logpriors = logpriors
    self.K = len(logpriors) # number of classes

  def _compute_log_likelihood(self, input_, class_):
    logA = self.logAs[class_]
    logpi = self.logpis[class_]

    last_idx = None
    logprob = 0
    for idx in input_:
      if last_idx is None:
        # it's the first token
        logprob += logpi[idx]
      else:
        logprob += logA[last_idx, idx]
      
      # update last_idx
      last_idx = idx
    
    return logprob
  
  def predict(self, inputs):
    predictions = np.zeros(len(inputs))
    for i, input_ in enumerate(inputs):
      posteriors = [self._compute_log_likelihood(input_, c) + self.logpriors[c] \
             for c in range(self.K)]
      pred = np.argmax(posteriors)
      predictions[i] = pred
    return predictions

In [22]:
# each array must be in order since classes are assumed to index these lists
clf = Classifier([logA0, logA1], [logpi0, logpi1], [logp0, logp1])

In [23]:
Ptrain = clf.predict(train_text_int)
print(f"Train acc: {np.mean(Ptrain == label_train)}")

Train acc: 0.8743034055727554


In [24]:
Ptest = clf.predict(test_text_int)
print(f"Test acc: {np.mean(Ptest == label_test)}")

Test acc: 0.75139146567718


In [25]:
from sklearn.metrics import confusion_matrix, f1_score
cm = confusion_matrix(label_train, Ptrain)
cm

array([[472,  60],
       [143, 940]])

In [26]:
cm_test = confusion_matrix(label_test, Ptest)
cm_test

array([[116,  70],
       [ 64, 289]])