<a href="https://colab.research.google.com/github/ereshmittal/Markov-Model/blob/main/Markov_Model_Classifier.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Show the first lines of robert_frost.txt (shell command run via the notebook's `!` escape).
!head robert_frost.txt

Two roads diverged in a yellow wood,
And sorry I could not travel both
And be one traveler, long I stood
And looked down one as far as I could
To where it bent in the undergrowth; 

Then took the other, as just as fair,
And having perhaps the better claim
Because it was grassy and wanted wear,
Though as for that the passing there


In [None]:
import numpy as np
import pandas as pd

import string
from sklearn.model_selection import train_test_split

In [None]:
# One text file per class; a file's position in this list is its class label.
input_files = [
  'robert_frost.txt',
  'edgar_allan_poe.txt',
]

In [None]:
# Read every file line by line, normalise the text, and record which file
# (class label) each line came from.
input_texts = []
labels = []

# Build the punctuation-stripping table once instead of once per line.
punct_table = str.maketrans('', '', string.punctuation)

for label, f in enumerate(input_files):
  print(f'{f} corresponds to label {label}')
  with open(f) as txt:
    for line in txt:
      line = line.rstrip().lower()
      # Skip blank lines: they contain no tokens, so keeping them would add
      # empty samples that only inflate the dataset and the class priors.
      if not line:
        continue
      input_texts.append(line.translate(punct_table))
      labels.append(label)

robert_frost.txt corresponds to label 0
edgar_allan_poe.txt corresponds to label 1


In [None]:
# Fix the shuffle seed so the split (and every number computed from it below)
# is the same on each Restart & Run All.
train, test, ytrain, ytest = train_test_split(input_texts, labels, random_state=123)

In [None]:
# Number of lines in the training and test splits, as a (train, test) tuple.
tuple(map(len, (train, test)))

(1783, 595)

In [None]:
# Index 0 is reserved for unknown words; `idx` is the next free vocabulary index.
word2idx = {'<unk>': 0}
idx = len(word2idx)

In [None]:
# Assign the next free index to every word seen for the first time in the
# training lines; words already in the vocabulary are left untouched.
for text in train:
  tokens = text.split()
  for token in tokens:
    if token in word2idx:
      continue
    word2idx[token] = idx
    idx += 1

In [None]:
# Convert each line into a list of vocabulary indices.
# Training words are all in word2idx by construction, so direct lookup is used.
train_text_int = [
  [word2idx[token] for token in text.split()]
  for text in train
]

# Test lines may contain words never seen in training; those map to index 0 ('<unk>').
test_text_int = [
  [word2idx.get(token, 0) for token in text.split()]
  for text in test
]

In [None]:
# Size of the state space. word2idx values run from 0 to len(word2idx) - 1,
# so the "+ 1" leaves one spare index that no token maps to.
V = 1 + len(word2idx)

# Start every count at one (add-one smoothing) so that no transition or
# initial-word probability ends up exactly zero after normalisation.
A0, pi0 = np.ones((V, V)), np.ones(V)
A1, pi1 = np.ones((V, V)), np.ones(V)

In [None]:
def compute_counts(text_as_int, A, pi):
  """Accumulate first-order Markov counts from tokenised lines, in place.

  Args:
    text_as_int: iterable of lines, each an iterable of integer token ids.
    A: 2-D array; A[i, j] is incremented once for every adjacent pair
      (i, j) found inside a line.
    pi: 1-D array; pi[i] is incremented once for every line whose first
      token is i.

  Returns:
    None. `A` and `pi` are modified in place. Empty lines contribute nothing.
  """
  for tokens in text_as_int:
    last_idx = None
    for token_id in tokens:
      if last_idx is None:
        # First token of the line: count it as an initial state.
        pi[token_id] += 1
      else:
        # Any later token: count the transition previous -> current.
        A[last_idx, token_id] += 1
      # Remember the current token (not the global vocabulary counter `idx`)
      # so the next iteration records the correct transition row.
      last_idx = token_id

In [None]:
# Separate the encoded training lines by class label (0 and 1).
rob_text_int = []
ed_text_int = []
for t, y in zip(train_text_int, ytrain):
  if y == 0:
    rob_text_int.append(t)
  elif y == 1:
    ed_text_int.append(t)

In [None]:
# Accumulate counts for each class into its own (A, pi) pair, in place.
for class_lines, class_A, class_pi in (
  (rob_text_int, A0, pi0),
  (ed_text_int, A1, pi1),
):
  compute_counts(class_lines, class_A, class_pi)

In [None]:
# Turn counts into probabilities in place: each row of a transition matrix
# and each initial-state vector is divided by its own sum.
# Note: this mutates the arrays, so re-running the cell normalises them again.
for class_A, class_pi in ((A0, pi0), (A1, pi1)):
  class_A /= class_A.sum(axis=1, keepdims=True)
  class_pi /= class_pi.sum()

In [None]:
# Work in log space so that scoring a sequence is a sum of log-probabilities
# rather than a product of small probabilities.
logA0, logpi0 = np.log(A0), np.log(pi0)
logA1, logpi1 = np.log(A1), np.log(pi1)

In [None]:
# Class priors: the fraction of training lines carrying each label.
total = len(ytrain)
count0 = sum(1 for y in ytrain if y == 0)
count1 = sum(1 for y in ytrain if y == 1)

p0 = count0/total
p1 = count1/total

# Log priors are what the classifier adds to the log-likelihoods.
logp0, logp1 = np.log(p0), np.log(p1)

(p0, p1)

(0.6634885025238362, 0.33651149747616377)

In [None]:
class Classifier():
  """Classifier that scores token-id sequences under per-class Markov models.

  Each class c has a log transition matrix logA[c], a log initial-state
  vector logpi[c], and a log prior logpriors[c]. A sequence is assigned to
  the class with the largest log-likelihood plus log prior.
  """

  def __init__(self, logA, logpi, logpriors):
    """Store the per-class parameters; the number of classes K is len(logpriors)."""
    self.logA = logA
    self.logpi = logpi
    self.logpriors = logpriors
    self.K = len(logpriors)

  def _compute_log_likelihood(self, input_, class_):
    """Return the log-likelihood of the id sequence `input_` under class `class_`.

    The first id is scored with the initial-state vector and every later id
    with the transition from its predecessor. An empty sequence scores 0.
    """
    transitions = self.logA[class_]
    initial = self.logpi[class_]

    logprob = 0
    previous = None
    for current in input_:
      logprob += initial[current] if previous is None else transitions[previous, current]
      previous = current

    return logprob

  def predict(self, inputs):
    """Return an array holding the predicted class index for each sequence."""
    predictions = np.ones(len(inputs))
    for i, sequence in enumerate(inputs):
      scores = []
      for c in range(self.K):
        scores.append(self._compute_log_likelihood(sequence, c) + self.logpriors[c])
      # argmax returns the first maximum, so ties go to the lower class index.
      predictions[i] = np.argmax(scores)
    return predictions

In [None]:
# Bundle the per-class parameters; list position corresponds to the class label.
clf = Classifier(
  logA=[logA0, logA1],
  logpi=[logpi0, logpi1],
  logpriors=[logp0, logp1],
)

In [None]:
# Accuracy on the training split: fraction of predictions equal to the label.
Ptrain = clf.predict(train_text_int)
train_acc = np.mean(Ptrain==ytrain)
print(f"Train acc: {train_acc}")

Train acc: 0.7453729669097028


In [None]:
# Accuracy on the held-out test split: fraction of predictions equal to the label.
Ptest = clf.predict(test_text_int)
# The label now says "Test" (it previously read "Train"), matching the data evaluated.
print(f"Test acc: {np.mean(Ptest==ytest)}")

Train acc: 0.6957983193277311
