<a href="https://colab.research.google.com/github/chandanareddy-enugala/NLP-SLU/blob/main/NLP_HMM_scratch.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Mount Google Drive at /content/drive; the dataset path read later in this
# notebook points below that mount point, so this cell has to run first.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


### Loading Libraries & Loading Dataset

In [None]:
# Importing libraries
import nltk
import numpy as np
import pandas as pd
import random
from sklearn.model_selection import train_test_split
import pprint, time
import random

### **Load Dataset**

In [None]:
# Load the tab-separated corpus without a header row. Later cells treat
# column 0 as the token (or the "<S>" sentence marker) and column 1 as the tag.
filePath = "/content/drive/MyDrive/NLP/data/train.tsv"

# Rows whose token, rendered with str(), has this many characters or more are dropped.
MAX_TOKEN_CHARS = 18

data = pd.read_csv(
    filePath,
    sep='\t',
    header=None,
    lineterminator='\n',
    # `error_bad_lines=False` was deprecated in pandas 1.3 and removed in 2.0;
    # `on_bad_lines='skip'` is the replacement with the same "drop malformed
    # lines" behaviour.
    on_bad_lines='skip',
)
data = data[data[0].apply(lambda token: len(str(token)) < MAX_TOKEN_CHARS)]
# Renumber rows 0..n-1 so that label-based slicing further down is contiguous.
data = data.reset_index(drop=True)



  exec(code_obj, self.user_global_ns, self.user_ns)


### Split the dataset into Training and Testing

In [None]:
# Sentence-level hold-out: the last N_TEST_SENTENCES sentences form the test
# set and everything before them is used for training. A shuffled row-level
# split (train_test_split) is deliberately not used here: it would break the
# token order the transition counts depend on, and its result was overwritten
# by the positional split anyway.
N_TEST_SENTENCES = 100

# Label of the "<S>" marker row that opens the first held-out sentence.
# NOTE(review): the previously hard-coded boundary 5041631 is presumably this
# same value (the recorded test-set output shows exactly 100 markers, the first
# one at position 0) - confirm against the data.
splitIndex = list(data[data[0]=="<S>"].index)[-N_TEST_SENTENCES]

# .loc slicing is inclusive on both ends, so the boundary marker row is part of
# both frames: it closes the last training sentence and opens the first test one.
train_set = data.loc[0:splitIndex]
test_set = data.loc[splitIndex:]

### Finding Tags, Words, Tagscount

In [None]:
def find_tagsCount(data):
  """Summarise the tags and tokens of a corpus frame.

  Rows whose column 0 equals '<S>' are ignored. Returns a 3-tuple:
    * dict mapping each value of column 1 to its number of occurrences,
      in the order produced by `value_counts()` (most frequent first),
    * the list of those values in the same order,
    * the list of column-0 values of the remaining rows, in row order
      (duplicates are kept).
  """
  token_rows = data.loc[data[0] != '<S>'].reset_index(drop=True)
  per_tag = token_rows[1].value_counts()

  tags_count = {tag: per_tag[tag] for tag in per_tag.keys()}
  tag_names = list(tags_count.keys())
  token_list = list(token_rows[0])

  return tags_count, tag_names, token_list

In [None]:
# Tag frequencies, tag list and token list of the training split.
# `tags` and `words` are module-level names that later cells pass to the decoder.
tags_count, tags, words = find_tagsCount(train_set)
tags_count

{'N': 4125188, 'S': 492555, 'U': 165642, 'H': 40378, 'T': 17707}

### Finding Words Emission Counts and Probability Tables

In [None]:
def find_words_emission_CountProbTables(data):
  """Build the word-by-tag emission tables of a corpus frame.

  Rows whose column 0 equals '<S>' are ignored.

  Returns:
    words_emission_count: crosstab with one row per distinct column-0 value
      and one column per distinct column-1 value; cells are co-occurrence
      counts.
    words_emission_prob: the same table with every column divided by its own
      total, so each tag column sums to 1.
  """
  token_rows = data[data[0] != '<S>'].reset_index(drop=True)
  words_emission_count = pd.crosstab(token_rows[0], token_rows[1])

  # Normalise over the table's own columns rather than a module-level tag
  # list, so the function does not depend on which cells ran before it.
  # Dividing by a Series aligns it with the column labels and returns a new
  # frame, leaving the count table untouched.
  tag_totals = words_emission_count.sum(axis=0)
  words_emission_prob = words_emission_count / tag_totals
  return words_emission_count, words_emission_prob

In [None]:
# Word-by-tag count table and its probability counterpart for the training split.
words_emission_count, words_emission_prob = find_words_emission_CountProbTables(train_set)

### Finding Tags Transition Counts and Probabilities Table

In [None]:
def find_tags_transition_CountProbTables(data, tags=None):
  """Build the tag-to-tag transition tables of a corpus frame.

  The frame is read in index order. Column 0 holds a token or the '<S>'
  sentence marker, column 1 the tag of a token row. Marker rows act as the
  state '<S>'; the row before the first one is also taken to be '<S>'.

  Args:
    data: corpus frame as described above.
    tags: optional list of tag columns to keep, in the desired order. When
      omitted, the tags of the non-marker rows are used, most frequent first
      (the same list `find_tagsCount(data)` yields for this frame).

  Returns:
    tags_transition_count: rows are the previous state ('<S>' and every tag),
      columns are the current tag; cells are pair counts. An extra row '<E>'
      holds, per tag, how often that tag is directly followed by a marker.
    tags_transition_prob: same layout. Every previous-state row is divided by
      the number of times that state was left (transitions into a marker
      included), so cell [prev, tag] is P(tag | prev), which is how the decoder
      indexes the table. Row '<E>' holds P(marker follows | tag).
  """
  ordered = data.sort_index()
  is_marker = ordered[0] == "<S>"

  if tags is None:
    tags = list(ordered.loc[~is_marker, 1].value_counts().index)
  else:
    tags = list(tags)

  # State sequence: the tag on token rows, '<S>' on marker rows.
  current = ordered[1].where(~is_marker, "<S>")
  # Predecessor of every row. fill_value seeds the first row positionally, so
  # frames whose index does not start at label 0 are handled as well.
  previous = current.shift(1, fill_value="<S>").rename(2)

  pair_counts = pd.crosstab(previous, current)
  if "<S>" not in pair_counts.columns:
    # No state is ever followed by a marker: keep an explicit all-zero column.
    pair_counts["<S>"] = 0

  tags_transition_count = pair_counts.copy()
  # The '<S>' column is indexed by previous state; assigning it as a row aligns
  # those labels with the column labels.
  tags_transition_count.loc['<E>', :] = pair_counts['<S>']
  tags_transition_count = tags_transition_count[tags]

  # Row-wise normalisation over every outgoing transition of a state.
  outgoing_totals = pair_counts.sum(axis=1)
  pair_prob = pair_counts.div(outgoing_totals, axis=0)

  tags_transition_prob = pair_prob.copy()
  tags_transition_prob.loc['<E>', :] = pair_prob['<S>']
  tags_transition_prob = tags_transition_prob[tags]
  return tags_transition_count, tags_transition_prob

In [None]:
# Tag transition count table and its probability counterpart for the training split.
tags_transition_count, tags_transition_prob = find_tags_transition_CountProbTables(train_set)

### **Algorithm**

In [None]:
def find_best_tag_prob(input_predd, i, word, bestSeq):
  """Record the highest-scoring tag of one sentence position.

  Args:
    input_predd: score table with one row per position, labelled
      '<position>_<word>', and one column per tag.
    i: position of the word in the sentence.
    word: the word at that position.
    bestSeq: dict to update; it is modified in place and also returned.

  Returns:
    bestSeq with bestSeq[i] = (word, best_tag, best_score). On ties the first
    tag in column order wins.
  """
  # Read the table that was passed in. The body used to reference a name
  # `input_pred` that is not a parameter, so it only worked when a stale
  # global of that name happened to exist.
  tag_scores = input_predd.loc[str(i)+'_'+word, :]
  best_tag = tag_scores.idxmax()
  bestSeq[i] = (word, best_tag, tag_scores.loc[best_tag])
  return bestSeq

In [None]:
def viterbi_Algorithm(sentence, words, tags, words_emission_prob, tags_transition_prob):
  """Tag a sentence left to right with the emission and transition tables.

  NOTE: despite the name this is a greedy decoder. Each position commits to
  the tag with the highest score given the tag already chosen for the
  previous position; no dynamic-programming lattice or back-tracking is used.

  Args:
    sentence: sequence of word strings.
    words: collection of known words; a word outside it is scored with the
      transition probability alone.
    tags: tag labels; columns of both probability tables.
    words_emission_prob: table indexed by word with one column per tag.
    tags_transition_prob: table indexed by previous state ('<S>' for the
      sentence start, otherwise a tag) with one column per tag.

  Returns:
    input_pred: DataFrame of scores, one row per position labelled
      '<position>_<word>', one column per tag.
    bestSeq: dict position -> (word, chosen tag, its score).
    Y_pred: list of chosen tags in sentence order.
    For an empty sentence these are an empty table, {} and [].
  """
  tag_list = list(tags)
  row_labels = [str(i)+'_'+word for i, word in enumerate(sentence)]
  input_pred = pd.DataFrame(np.zeros(shape=(len(row_labels), len(tag_list))),
                            index=row_labels, columns=tag_list)

  # Hash-based membership: `words` is typically a very long list, and a list
  # lookup per word would scan it linearly every time.
  vocabulary = words if isinstance(words, (set, frozenset)) else set(words)

  bestSeq = {}
  prevTag = "<S>"  # state preceding the first word
  for i, word in enumerate(sentence):
    scores = tags_transition_prob.loc[prevTag, tag_list]
    if word in vocabulary:
      scores = scores * words_emission_prob.loc[word, tag_list]
    input_pred.iloc[i, :] = scores.to_numpy(dtype=float)

    # Highest score wins; on ties the first tag in `tags` order is kept.
    best_tag = scores.idxmax()
    bestSeq[i] = (word, best_tag, scores.loc[best_tag])
    prevTag = best_tag

  # Build the result only after every position has been processed. Returning
  # from inside the loop cut decoding short after the second word.
  Y_pred = [bestSeq[i][1] for i in range(len(bestSeq))]
  return input_pred, bestSeq, Y_pred


In [None]:
def get_accuracy(Y, Y_pred):
  """Print the percentage of positions where the predicted tag equals the gold tag.

  Args:
    Y: gold tags, one per token.
    Y_pred: predicted tags, aligned with Y.

  Returns:
    None; the result is printed as 'Accuracy : <percent>%'.

  Raises:
    ValueError: if the two sequences differ in length or are empty. Scoring
      only the common prefix would hide a decoder that stopped early, and an
      empty input has no defined accuracy.
  """
  if len(Y) != len(Y_pred):
    raise ValueError(
        f"Length mismatch: {len(Y)} gold tags vs {len(Y_pred)} predicted tags")
  if len(Y_pred) == 0:
    raise ValueError("Cannot compute accuracy for an empty prediction list")

  count = sum(1 for gold, predicted in zip(Y, Y_pred) if gold == predicted)
  print(f"Accuracy : {(count/len(Y_pred))*100}%")

### Pick a Sentence from the Test Set

In [None]:
# Renumber the held-out rows from 0, then show the index labels of the rows
# holding the "<S>" marker (the cell output).
test_set = test_set.reset_index(drop=True)
test_set[test_set[0]=="<S>"].index

Int64Index([   0,   23,   84,  100,  112,  154,  167,  184,  214,  240,  321,
             336,  363,  377,  388,  419,  429,  456,  467,  487,  508,  526,
             552,  570,  584,  596,  616,  646,  659,  681,  707,  740,  760,
             778,  806,  866,  928,  934,  940,  963,  979, 1013, 1019, 1049,
            1055, 1077, 1081, 1097, 1116, 1124, 1146, 1154, 1162, 1179, 1219,
            1237, 1269, 1278, 1299, 1342, 1386, 1427, 1457, 1470, 1481, 1492,
            1512, 1536, 1543, 1553, 1583, 1611, 1627, 1641, 1663, 1688, 1727,
            1780, 1795, 1823, 1836, 1844, 1877, 1887, 1903, 1917, 1951, 1972,
            1998, 2013, 2023, 2068, 2124, 2133, 2162, 2183, 2200, 2215, 2248,
            2297],
           dtype='int64')

In [None]:
# Select one held-out sentence by its number instead of hand-copying row
# numbers from a printed output. SENTENCE_NO = 0 reproduces the former bounds
# (rows 1..22) for the marker positions shown above.
SENTENCE_NO = 0

# Row positions (not labels) of the "<S>" markers, so they match the
# positional slice below whatever the frame's index looks like.
marker_positions = np.flatnonzero((test_set[0] == "<S>").to_numpy())

startIndx = int(marker_positions[SENTENCE_NO]) + 1   # first token after the opening marker
endIndx = int(marker_positions[SENTENCE_NO + 1])     # next marker, excluded by the slice
test = test_set[startIndx:endIndx]
sentence = list(test[0])   # tokens
Y = list(test[1])          # gold tags

In [None]:
# Tag the selected sentence using the tables built from the training split.
input_pred, bestSeq, Y_pred = viterbi_Algorithm(sentence, words, tags, words_emission_prob, tags_transition_prob)

In [None]:
# Compare the predicted tags with the gold tags of the selected sentence and print the accuracy.
get_accuracy(Y, Y_pred)

Accuracy : 100.0%
