In [None]:
from matplotlib import pyplot as plt
from sklearn.cluster import AgglomerativeClustering
import scipy.cluster.hierarchy as sch

#Basic imports
import numpy as np
import pandas as pd

#sklearn imports
from sklearn.decomposition import PCA #Principal Component Analysis
from sklearn.manifold import TSNE #T-Distributed Stochastic Neighbor Embedding
from sklearn.cluster import KMeans #K-Means Clustering
from sklearn.preprocessing import StandardScaler #used for 'Feature Scaling'

#plotly imports
import plotly as py
import plotly.graph_objs as go
from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot

In [None]:
df = pd.read_csv('./data_TM2/synt_annotated_data.csv', index_col=0)
# df['concat_da_names'] = df.iloc[:, 8:21].apply(lambda row: row.dropna().tolist(), axis=1)
df.head()

In [None]:
df_to_cluster = df.iloc[:, 8:21]
df_to_cluster.columns

In [None]:
DA = ['repair_initiator','greeting','request_summary','confirmation','receipt','disconfirmation',
      'sequence_closer','completion_check','hold_request','partial_request', 'detail_request', 'grant', 'answer'] 

for e,i in enumerate(DA):
    print(e,i, e+8) #(class number, original tag, column number)
    df_to_cluster.iloc[:,e] = df_to_cluster.iloc[:,e].replace(i, 1)
    df_to_cluster.iloc[:,e] = df_to_cluster.iloc[:,e].fillna(0)
    
df_to_cluster

In [None]:
#select only labeled dataset

df.dropna(subset=['DA_rep_init', 'DA_greet', 'DA_req_sum',
       'DA_conf', 'DA_receipt', 'DA_disconf', 'DA_closer', 'DA_comp_check',
       'DA_hold', 'DA_partial_req', 'DA_detail_req', 'DA_grant', 'DA_answer'], how='all', inplace = True)

print(len(df))

In [None]:
DA = ['repair_initiator','greeting','request_summary','confirmation','receipt','disconfirmation',
      'sequence_closer','completion_check','hold_request','partial_request', 'detail_request', 'grant', 'answer'] 

#replace tags per number of class
for e,i in enumerate(DA):
    print(e,i, e+8) #(class number, original tag, column number)
    df.iloc[:,e+8] = df.iloc[:,e+8].replace(i, (e+1))
    df.iloc[:,e+8] = df.iloc[:,e+8].fillna(0)

#     df.iloc[:,e+8] =df.iloc[:,e+8].astype(int)
    
df['concat_col'] = df.iloc[:, 8:21].apply(lambda row: row.dropna().tolist(), axis=1)

In [None]:
concat_col = []
for row in df['concat_col']:
    row = [int(x) for x in row]
    concat_col.append(row)
    
df['concat_col'] = concat_col
# print(df['concat_col'].head())
df.head()

In [None]:
#filled with zeros
X = df['concat_col'].to_list()

In [None]:
#example

X = [[0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
      [0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 11, 0, 0],
      [0, 0, 0, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0],
      [0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
      [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 11, 0, 0],
      [0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 11, 0, 0],
      [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 13],
      [0, 0, 0, 4, 0, 0, 0, 0, 0, 0, 11, 0, 0],
      [0, 0, 0, 4, 0, 0, 7, 0, 0, 0, 0, 0, 0],
      [0, 0, 3, 4, 0, 0, 7, 0, 0, 0, 0, 0, 0],
      [0, 0, 0, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0],
      [0, 0, 0, 4, 0, 0, 7, 0, 0, 0, 0, 0, 0],
      [0, 0, 0, 0, 0, 0, 7, 0, 0, 0, 0, 0, 0],
      [0, 0, 0, 0, 0, 0, 0, 0, 0, 10, 0, 0, 0],
      [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 11, 12, 0],
      [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 13],
      [0, 0, 0, 4, 0, 0, 7, 0, 0, 0, 0, 0, 0],
      [0, 0, 0, 4, 0, 0, 7, 0, 0, 0, 0, 0, 0],
      [0, 0, 0, 0, 0, 0, 7, 0, 0, 0, 0, 0, 0],
      [0, 0, 0, 0, 0, 0, 7, 0, 0, 0, 0, 0, 0]]


# X = [[10,20],[5,15],[13,10],[20,10],[1,13],[2,12]]

dendrogram = sch.dendrogram(sch.linkage(X, method='ward'))

In [None]:
%matplotlib inline
import matplotlib.pylab as plt
import seaborn as sns
import pandas as pd
import numpy as np

sns.clustermap(X)

In [None]:
model = AgglomerativeClustering(n_clusters=3, affinity='euclidean', linkage='ward')
model.fit(X)
labels = model.labels_


In [None]:
# plt.scatter(X[labels==0,0], X[labels==0,1], s=50, marker='o', color='red')
# plt.scatter(X[labels==1, 0], X[labels==1, 1], s=50, marker='o', color='blue')
# plt.scatter(X[labels==2, 0], X[labels==2, 1], s=50, marker='o', color='green')
plt.show()

## Notebook t-sne

In [None]:
# Time to build our clusters.
# In this kernel, we will be visualizing only three different clusters on our data. 
#I chose three because I found it to be a good number of clusters to help us visualize our data


########### en lugar de kmean use HAC ##########


#Initialize our model
kmeans = KMeans(n_clusters=3)
#Fit our model
kmeans.fit(df_to_cluster)
KMeans(n_clusters=3)
#Find which cluster each data-point belongs to
clusters = kmeans.predict(df_to_cluster)
#Add the cluster vector to our DataFrame, X
df_to_cluster["Cluster"] = clusters

In [None]:
#sampling 
#plotX is a DataFrame containing 5000 values sampled randomly from X
plot_df = pd.DataFrame(np.array(df_to_cluster.sample(5000)))

#Rename plotX's columns since it was briefly converted to an np.array above
plot_df.columns = df_to_cluster.columns

# plot_df.head()

In [None]:
#Set our perplexity
perplexity = 50

#T-SNE with two dimensions
tsne_2d = TSNE(n_components=2, perplexity=perplexity)

# #T-SNE with three dimensions
# tsne_3d = TSNE(n_components=3, perplexity=perplexity)

#This DataFrame contains two dimensions, built by T-SNE
TCs_2d = pd.DataFrame(tsne_2d.fit_transform(plot_df.drop(["Cluster"], axis=1)))

# #And this DataFrame contains three dimensions, built by T-SNE
# TCs_3d = pd.DataFrame(tsne_3d.fit_transform(df_to_cluster.drop(["Cluster"], axis=1)))

#"TC1_2d" means: 'The first component of the components created for 2-D visualization, by T-SNE.'
#And "TC2_2d" means: 'The second component of the components created for 2-D visualization, by T-SNE.'
TCs_2d.columns = ["TC1_2d","TC2_2d"]

# TCs_3d.columns = ["TC1_3d","TC2_3d","TC3_3d"]

df_to_cluster = pd.concat([plot_df,TCs_1d,TCs_2d,TCs_3d], axis=1, join='inner')


#Each of these new DataFrames will hold all of the values contained in exacltly one of the clusters. For example, all of the values contained within the DataFrame, cluster0 will belong to 'cluster 0', and all the values contained in DataFrame, cluster1 will belong to 'cluster 1', etc.

cluster0 = plot_df[plot_df["Cluster"] == 0]
cluster1 = plot_df[plot_df["Cluster"] == 1]
cluster2 = plot_df[plot_df["Cluster"] == 2]


In [None]:
#Instructions for building the 2-D plot

#trace1 is for 'Cluster 0'
trace1 = go.Scatter(
                    x = cluster0["TC1_2d"],
                    y = cluster0["TC2_2d"],
                    mode = "markers",
                    name = "Cluster 0",
                    marker = dict(color = 'rgba(255, 128, 255, 0.8)'),
                    text = None)

#trace2 is for 'Cluster 1'
trace2 = go.Scatter(
                    x = cluster1["TC1_2d"],
                    y = cluster1["TC2_2d"],
                    mode = "markers",
                    name = "Cluster 1",
                    marker = dict(color = 'rgba(255, 128, 2, 0.8)'),
                    text = None)

#trace3 is for 'Cluster 2'
trace3 = go.Scatter(
                    x = cluster2["TC1_2d"],
                    y = cluster2["TC2_2d"],
                    mode = "markers",
                    name = "Cluster 2",
                    marker = dict(color = 'rgba(0, 255, 200, 0.8)'),
                    text = None)

data = [trace1, trace2, trace3]

title = "Visualizing Clusters in Two Dimensions Using T-SNE (perplexity=" + str(perplexity) + ")"

layout = dict(title = title,
              xaxis= dict(title= 'TC1',ticklen= 5,zeroline= False),
              yaxis= dict(title= 'TC2',ticklen= 5,zeroline= False)
             )

fig = dict(data = data, layout = layout)

iplot(fig)

# BEGIN HERE
## N-grams from Jurafsky

In [389]:
import pandas as pd
import numpy as np
import re
from collections import Counter
import statistics

In [2]:
df = pd.read_csv('./data_TM2/processed_utterances_sentence_DA_labeling.csv', index_col=0)

In [3]:
####### correct format of column that was string and should be list

import ast

print(type(df['all_DA'][0]))
all_DA = []
for row in range(len(df['all_DA'])):
    inst = ast.literal_eval(df['all_DA'][row])
    inst = [i.strip("[]") for i in inst]
    all_DA.append(inst)
    
df['all_DA'] = all_DA

print(type(df['all_DA'][0]))

<class 'str'>
<class 'list'>


In [4]:
df.drop(df.iloc[:, 8:-1], inplace = True, axis = 1)
df = df.explode('all_DA')
df['all_DA'] = df['all_DA'].fillna('<UNK>')
df['all_DA'].head()

0          U_greeting
1               <UNK>
2          A_greeting
2    A_detail_request
3      A_confirmation
Name: all_DA, dtype: object

In [5]:
# #adding unknown token <UNK> for unlabeled utterances and *(removing more than 1 label for each utterance)

# ALL_DA = []
# for row in range(len(df['all_DA'])):
#     if len(df['all_DA'].iloc[row]) == 0:
#         ALL_DA.append(['<UNK>'])
# #     *uncomment the below to leave only one label per utterance. Otherwise multiple labels may be used 
# #     elif len(df['all_DA'].iloc[row]) >1:
# #         ALL_DA.append([df['all_DA'].iloc[row][0]])
#     else:
#         ALL_DA.append(df['all_DA'].iloc[row])
        
# # ALL_DA

In [394]:
#solution for all dialogues. After This data will be prepared to serve as input

unique_ids = df['conversation_id'].unique()

start = '<s>'
start_dou = '<ss>'
end_dou = '<ee>'
end ='<e>'
full_DA = []  

for dialog in unique_ids:
    dialog_DA = []   
    temp = df.loc[df['conversation_id'] == dialog]
    for x in range(len(temp)):
        dialog_DA.append(temp['all_DA'].iloc[x])

    #insert begin and end tokens for each dialog. This is for bigram only. for tri would need two symbols in the begin and 2 in the end
    dialog_DA.insert(0, start)
    dialog_DA.insert(1, start_dou)
    dialog_DA.append(end_dou)
    dialog_DA.append(end)
    full_DA.append(dialog_DA)

full_DA[0]

['<s>',
 '<ss>',
 'U_greeting',
 '<UNK>',
 'A_greeting',
 'A_detail_request',
 'A_confirmation',
 'U_greeting',
 '<UNK>',
 'A_detail_request',
 'A_greeting',
 'A_detail_request',
 'U_answer',
 '<UNK>',
 '<UNK>',
 '<UNK>',
 'A_confirmation',
 'A_detail_request',
 'A_confirmation',
 'A_sequence_closer',
 '<UNK>',
 '<UNK>',
 '<UNK>',
 '<UNK>',
 'A_request_summary',
 'A_confirmation',
 'A_sequence_closer',
 'U_confirmation',
 'A_confirmation',
 'A_sequence_closer',
 '<UNK>',
 'A_sequence_closer',
 '<UNK>',
 'U_partial_request',
 'A_detail_request',
 'A_grant',
 'U_answer',
 'A_confirmation',
 'A_sequence_closer',
 '<UNK>',
 '<UNK>',
 '<UNK>',
 'A_confirmation',
 'A_sequence_closer',
 'U_sequence_closer',
 'U_sequence_closer',
 'A_confirmation',
 'A_sequence_closer',
 'A_sequence_closer',
 '<UNK>',
 'U_greeting',
 'U_sequence_closer',
 '<ee>',
 '<e>']

In [7]:
# #for one example of dialog
# start = ['<s>']
# end =['<e>']

# dialog_DA = []   
# temp = df.loc[df['conversation_id'] == 'dlg-00100680-00e0-40fe-8321-6d81b21bfc4f']
# [dialog_DA.append(temp['all_DA'][x]) for x in range(len(temp))]

# #insert begin and end tokens for each dialog. This is for bigram only. for tri would need two symbols in the begin and 2 in the end
# dialog_DA.insert(0, start)
# dialog_DA.append(end)

# dialog_DA

In [396]:
#flatten nested lists

def flatten(t):
    # https://stackoverflow.com/questions/952914/how-to-make-a-flat-list-out-of-a-list-of-lists
    return [item for sublist in t for item in sublist]

# df['labels_UNK_and_SINGLE_DA'] = flatten(ALL_DA)

In [397]:
#modeling phase
#from this tutorial: https://medium.com/swlh/language-modelling-with-nltk-20eac7e70853#b9bf

import nltk
from nltk.util import ngrams
# from nltk import word_tokenize

unigram=[]
bigram=[]
trigram=[]
fourgram=[]
tokenized_text = flatten(unigram)

#unigram, bigram, trigram, and fourgram models are created
for sequence in full_DA:
    unigram.append(sequence)

bigram.extend(list(ngrams(sequence, 2)))  
trigram.extend(list(ngrams(sequence, 3)))
fourgram.extend(list(ngrams(sequence, 4)))

freq_uni = Counter(flatten(unigram))
freq_bi = nltk.FreqDist(bigram)
freq_tri = nltk.FreqDist(trigram)
freq_four = nltk.FreqDist(fourgram)


print("Most common n-grams without add-1 smoothing: \n")
print ("Most common unigrams: \n", freq_uni.most_common(5))
print ("Most common bigrams: \n", freq_bi.most_common(5))
print ("\nMost common trigrams: \n", freq_tri.most_common(5))
print ("\nMost common fourgrams: \n", freq_four.most_common(5))

Most common n-grams without add-1 smoothing: 

Most common unigrams: 
 [('<UNK>', 88216), ('A_detail_request', 63851), ('U_answer', 57676), ('U_sequence_closer', 57516), ('A_greeting', 53345)]
Most common bigrams: 
 [(('U_confirmation', 'U_sequence_closer'), 6), (('U_partial_request', 'A_grant'), 3), (('U_greeting', 'U_confirmation'), 3), (('U_sequence_closer', 'U_partial_request'), 3), (('A_grant', 'U_confirmation'), 2)]

Most common trigrams: 
 [(('U_greeting', 'U_confirmation', 'U_sequence_closer'), 3), (('U_confirmation', 'U_sequence_closer', 'U_partial_request'), 3), (('A_grant', 'U_confirmation', 'U_sequence_closer'), 2), (('U_sequence_closer', 'U_partial_request', 'A_grant'), 2), (('<s>', '<ss>', 'A_greeting'), 1)]

Most common fourgrams: 
 [(('U_greeting', 'U_confirmation', 'U_sequence_closer', 'U_partial_request'), 3), (('U_confirmation', 'U_sequence_closer', 'U_partial_request', 'A_grant'), 2), (('<s>', '<ss>', 'A_greeting', 'A_detail_request'), 1), (('<ss>', 'A_greeting', 'A

In [398]:
# bigram

In [399]:
#Add-1 smoothing is performed here. Different value might be better
            
ngrams_all = {1:[], 2:[], 3:[], 4:[]} #from unigram to fourgram in this case
for i in range(4):
    for each in unigram:
        for j in ngrams(each, i+1):
            ngrams_all[i+1].append(j);
ngrams_voc = {1:set([]), 2:set([]), 3:set([]), 4:set([])} #set() method is used to convert any of the iterable to sequence of iterable elements with distinct elements

for i in range(4):
    for gram in ngrams_all[i+1]:
        if gram not in ngrams_voc[i+1]:
            ngrams_voc[i+1].add(gram)
total_ngrams = {1:-1, 2:-1, 3:-1, 4:-1}
total_voc = {1:-1, 2:-1, 3:-1, 4:-1}

for i in range(4):
    total_ngrams[i+1] = len(ngrams_all[i+1])
    total_voc[i+1] = len(ngrams_voc[i+1])                       
    
ngrams_prob = {1:[], 2:[], 3:[], 4:[]}

for i in range(4):
    for ngram in ngrams_voc[i+1]:
        tlist = [ngram]
        tlist.append(ngrams_all[i+1].count(ngram))
        ngrams_prob[i+1].append(tlist)
    
for i in range(4):
    for ngram in ngrams_prob[i+1]:
        ngram[-1] = (ngram[-1])/(total_ngrams[i+1]+total_voc[i+1])             


In [400]:
#Prints top 10 unigram, bigram, trigram, fourgram
print("Most common n-grams without stopword removal and with add-1 smoothing: \n")
for i in range(4):
    ngrams_prob[i+1] = sorted(ngrams_prob[i+1], key = lambda x:x[1], reverse = True)
    
print ("Most common unigrams: ", str(ngrams_prob[1][:10]))
print ("\nMost common bigrams: ", str(ngrams_prob[2][:10]))
print ("\nMost common trigrams: ", str(ngrams_prob[3][:10]))
print ("\nMost common fourgrams: ", str(ngrams_prob[4][:10]))

Most common n-grams without stopword removal and with add-1 smoothing: 

Most common unigrams:  [[('<UNK>',), 0.12918531362028163], [('A_detail_request',), 0.09350470957613814], [('U_answer',), 0.08446191335317135], [('U_sequence_closer',), 0.08422760608261674], [('A_greeting',), 0.07811950842334638], [('A_sequence_closer',), 0.06953361137796106], [('U_confirmation',), 0.06925976475550036], [('A_confirmation',), 0.06890098174746362], [('U_partial_request',), 0.06879847231659598], [('A_grant',), 0.06227155041120926]]

Most common bigrams:  [[('A_confirmation', 'A_sequence_closer'), 0.042966175117006435], [('A_detail_request', 'U_answer'), 0.04189408533774582], [('<UNK>', '<UNK>'), 0.04128897023824718], [('U_confirmation', 'U_sequence_closer'), 0.04007123240774247], [('U_partial_request', 'A_grant'), 0.028276743309574182], [('A_greeting', 'A_detail_request'), 0.026013943173235594], [('<s>', '<ss>'), 0.02595988822638212], [('<ee>', '<e>'), 0.02595988822638212], [('U_answer', 'A_confirmati

### To calculate probability of unseen sequences

In [401]:
def ngram_prediction(tokenized_sent):
    ngram = {1:[], 2:[], 3:[]}#to store n-grams formed from the strings

    for i in range(1, 4):
        ngram[i] = list(ngrams(tokenized_sent, i))[-1]
    
    print("String: ", ngram)
    
    for j in range(4):
        ngrams_prob[j+1] = sorted(ngrams_prob[j+1], key = lambda x:x[1], reverse = True)
    
    pred = {1:[], 2:[], 3:[]}
    for k in range(3):
        count = 0
        for each in ngrams_prob[k+2]:
            if each[0][:-1] == ngram[k+1]:#to find predictions based on highest probability of n-grams                   
                count +=1
                pred[k+1].append(each[0][-1])
                if count ==5:
                    break
        if count<5:
            while(count!=5):
                pred[k+1].append("NA")#if no word prediction is found, replace with NOT FOUND
                count +=1
                
    return pred, ngrams_prob

token_1 = ['<s>','U_greeting','<UNK>','A_greeting','A_confirmation']
token_2 = ['<s>','U_greeting','A_greeting']
pred_1, ngrams_prob_1 = ngram_prediction(token_1)
pred_2, ngrams_prob_2 = ngram_prediction(token_2)
    

String:  {1: ('A_confirmation',), 2: ('A_greeting', 'A_confirmation'), 3: ('<UNK>', 'A_greeting', 'A_confirmation')}
String:  {1: ('A_greeting',), 2: ('U_greeting', 'A_greeting'), 3: ('<s>', 'U_greeting', 'A_greeting')}


In [402]:
print("Next word predictions for the strings using the probability models of bigrams, trigrams, and fourgrams\n")
print("String 1 \n")
print("Bigram model predictions: {}\nTrigram model predictions: {}\nFourgram model predictions: {}\n" .format(pred_1[1], pred_1[2], pred_1[3]))
print("String 2 \n")
print("Bigram model predictions: {}\nTrigram model predictions: {}\nFourgram model predictions: {}" .format(pred_2[1], pred_2[2], pred_2[3]))

Next word predictions for the strings using the probability models of bigrams, trigrams, and fourgrams

String 1 

Bigram model predictions: ['A_sequence_closer', 'A_grant', 'A_detail_request', '<UNK>', 'U_confirmation']
Trigram model predictions: ['A_sequence_closer', 'A_grant', 'A_detail_request', 'A_completion_check', '<UNK>']
Fourgram model predictions: ['A_sequence_closer', 'A_detail_request', 'A_completion_check', '<UNK>', 'U_confirmation']

String 2 

Bigram model predictions: ['A_detail_request', 'A_grant', 'A_confirmation', 'A_greeting', 'A_completion_check']
Trigram model predictions: ['A_detail_request', 'A_confirmation', 'A_greeting', '<UNK>', 'U_greeting']
Fourgram model predictions: ['NA', 'NA', 'NA', 'NA', 'NA']


In [403]:
#Dict of probabilities from unigram to fourgram

ngrams_prob

{1: [[('<UNK>',), 0.12918531362028163],
  [('A_detail_request',), 0.09350470957613814],
  [('U_answer',), 0.08446191335317135],
  [('U_sequence_closer',), 0.08422760608261674],
  [('A_greeting',), 0.07811950842334638],
  [('A_sequence_closer',), 0.06953361137796106],
  [('U_confirmation',), 0.06925976475550036],
  [('A_confirmation',), 0.06890098174746362],
  [('U_partial_request',), 0.06879847231659598],
  [('A_grant',), 0.06227155041120926],
  [('U_greeting',), 0.05573877082405867],
  [('<s>',), 0.02531836500386607],
  [('<ss>',), 0.02531836500386607],
  [('<e>',), 0.02531836500386607],
  [('<ee>',), 0.02531836500386607],
  [('A_completion_check',), 0.009451369525996392],
  [('A_hold_request',), 0.008628365238173341],
  [('A_receipt',), 0.0062647906464537596],
  [('A_request_summary',), 0.002663780782117669],
  [('A_repair_initiator',), 0.002287424728789334],
  [('A_disconfirmation',), 0.00187152932355491],
  [('U_request_summary',), 0.0010880643876379483],
  [('U_disconfirmation',),

### Perplexity
An intrinsic evaluation metric is one that measures the quality of a model independent of any application.

The perplexity of a language model on a test set is the inverse probability of the test set, normalized by the number of words. Thus the higher the conditional probability of the word sequence, the lower the perplexity, and maximizing the perplexity is equivalent to maximizing the test set probability according to the language model.

https://stackoverflow.com/questions/54941966/how-can-i-calculate-perplexity-using-nltk/55043954#55043954?newreg=b97aa34187184c90988f9a75e51898c2

##### Here we calculate perprexity in the whole dataset. without train, dev, test. Why though??  
Maybe because it's unsupervised??
i've read about v/k folding. also seems to need x and y

In [404]:
print(len(unigram))

17289


In [405]:
import nltk
from nltk.lm.preprocessing import padded_everygram_pipeline
from nltk.lm import MLE
from nltk.lm import Vocabulary

unigram_train = unigram[0:17000]

n = 2
train_data = [nltk.bigrams(t,  pad_right=True, pad_left=True, left_pad_symbol="<s>", right_pad_symbol="<e>") for t in unigram_train]
words = [word for sent in unigram_train for word in sent]
words.extend(["<s>", "<e>"])
padded_vocab = Vocabulary(words)
model = MLE(n)
model.fit(train_data, padded_vocab)

unigram_test = unigram[17000:] # if you want to test just small sample
print(len(unigram_test))

test_data = [nltk.bigrams(t,  pad_right=True, pad_left=True, left_pad_symbol="<s>", right_pad_symbol="<e>") for t in unigram_test]

#for each bigram MLE estimate (tuples estimate)
#maximizing a likelihood function so that, under the assumed statistical model, the observed data is most probable.
#if you don't want to see MLE per tuple comment print line

for test in test_data:
#     print ("MLE Estimates:", [((ngram[-1], ngram[:-1]),model.score(ngram[-1], ngram[:-1])) for ngram in test]) # will generate one estimator per input. Here each input contains all DA labels for one dialog sequence.
    pass

#for whole dialog perplexity calculation.
#if print unigram_test[i] you see sentence and perplexity. 
#to make it easier to read perplexities (not caring about the specific sentence) print second line (only i)

test_data = [nltk.bigrams(t,  pad_right=True, pad_left=True, left_pad_symbol="<s>", right_pad_symbol="<e>") for t in unigram_test]

for i, test in enumerate(test_data):
#     print("PP({0}):{1}".format(unigram_test[i], model.perplexity(test)))
    print("PP({0}):{1}".format(i, model.perplexity(test)))


289
PP(0):4.495144982893566
PP(1):6.326616659124132
PP(2):5.497682392981005
PP(3):5.75324874562967
PP(4):5.412144654668064
PP(5):4.529008613318499
PP(6):4.180794405334284
PP(7):5.2581944340617826
PP(8):5.08739754205226
PP(9):6.011258447507772
PP(10):7.6716667135807075
PP(11):6.089039511106873
PP(12):5.7559805622349955
PP(13):4.246438130101796
PP(14):5.372605240203295
PP(15):3.8863468625452278
PP(16):4.180711472490924
PP(17):5.107190378495074
PP(18):5.995346154355288
PP(19):7.417165309837154
PP(20):4.3063066521824895
PP(21):4.31752606891241
PP(22):5.224650106896126
PP(23):3.9139436007820816
PP(24):4.770950276259043
PP(25):4.646804835196973
PP(26):5.130569452790832
PP(27):5.0718929856778185
PP(28):5.350757978164802
PP(29):5.516711860489342
PP(30):4.792476553398838
PP(31):5.093554370823045
PP(32):5.50690398840427
PP(33):4.486775876468721
PP(34):5.655286798517427
PP(35):4.327048160207561
PP(36):3.5013216630540036
PP(37):5.009321705869809
PP(38):5.462687492048554
PP(39):4.937509135793148
PP

In [406]:
uni = {}
bi = {}
tri = {}
four = {}

######## this alternative version makes some parts of the code not to work for bi,tri,four.

for key, value in ngrams_prob.items():
    for lista in value:
#         lista[0] = str(lista[0])
#         lista[0] = re.sub(r"['(),]", "", lista[0])
        
        if key ==1:
            lista[0] = str(lista[0])
            lista[0] = re.sub(r"['(),]", "", lista[0])
            uni[lista[0]] = lista[1]
        if key ==2:
            bi[lista[0]] = lista[1]
        if key ==3:
            tri[lista[0]] = lista[1]
        if key ==4:
            four[lista[0]] = lista[1]

In [407]:
for k, v in bi.items():
    print(k[0], k[1], v) #ks in sequence order i.e: k[0] first DA, k[1] second DA. p(k[1]|k[0])

A_confirmation A_sequence_closer 0.042966175117006435
A_detail_request U_answer 0.04189408533774582
<UNK> <UNK> 0.04128897023824718
U_confirmation U_sequence_closer 0.04007123240774247
U_partial_request A_grant 0.028276743309574182
A_greeting A_detail_request 0.026013943173235594
<s> <ss> 0.02595988822638212
<ee> <e> 0.02595988822638212
U_answer A_confirmation 0.02397487045581834
U_greeting U_partial_request 0.02162348026769211
U_answer <UNK> 0.02069703853967558
A_sequence_closer A_detail_request 0.01702280367994066
A_grant U_confirmation 0.01692069989143965
U_sequence_closer <UNK> 0.016775051840195558
A_detail_request U_confirmation 0.016642917525664838
U_confirmation U_answer 0.016482254211405895
<UNK> U_confirmation 0.016362132107287056
U_partial_request A_greeting 0.01595822153218747
A_detail_request U_greeting 0.015585843009419075
<ss> A_greeting 0.015246498065283361
<UNK> A_confirmation 0.014309545653156433
U_answer A_detail_request 0.014040772445190535
A_greeting A_grant 0.01307

U_disconfirmation A_detail_request 3.303357863268012e-05
U_sequence_closer U_disconfirmation 3.153205233119466e-05
U_confirmation U_repair_initiator 3.00305260297092e-05
U_sequence_closer U_receipt 2.852899972822374e-05
A_detail_request A_disconfirmation 2.852899972822374e-05
U_hold_request U_answer 2.852899972822374e-05
A_repair_initiator U_greeting 2.852899972822374e-05
A_hold_request A_receipt 2.702747342673828e-05
U_partial_request A_receipt 2.702747342673828e-05
A_completion_check U_confirmation 2.702747342673828e-05
U_hold_request U_partial_request 2.702747342673828e-05
U_receipt A_sequence_closer 2.702747342673828e-05
U_greeting A_disconfirmation 2.552594712525282e-05
A_grant A_completion_check 2.552594712525282e-05
U_confirmation A_disconfirmation 2.552594712525282e-05
U_repair_initiator A_confirmation 2.552594712525282e-05
U_request_summary U_sequence_closer 2.25228945222819e-05
<UNK> U_hold_request 2.25228945222819e-05
A_disconfirmation A_confirmation 2.25228945222819e-05
A_r

In [408]:
############################
# ### transform bigram probabilities into dataframe
# tmp = []
# values = []

# for key, value in bi.items():
#     values.append(value)
#     for item in key.split(' '):
#         tmp.append(item)
        

# row = [] # row = itens pares de tmp
# column = [] # columns = itens impares de tmp

# for i in range(len(tmp)):
#     if i % 2 == 0: row.append(tmp[i])
#     else: column.append(tmp[i])

In [409]:
# ############################
# #sanity checks
# print(len(np.unique(row)) == len(np.unique(column)))
# print(row[3],column[3],values[3])

In [410]:
############################
# unique_da = np.unique(row) #24 unique DAs
# mat = np.zeros((len(unique_da),len(unique_da))) #empty matrix shape unique DA. here 20x20

# for e, da1 in enumerate(unique_da):
#     for f, da2 in enumerate(unique_da):
#         for i in range(len(row)):
#             if da1 == row[i] and da2 == column[i]:
#                 mat[e][f] = values[i]
                

In [411]:
############################
# #teste e funcionaaa. comparar com os resultados da lista bi
# print(mat[3][12]) #'A_confirmation A_sequence_closer': 0.045319277995720685

In [24]:
############################
# emition_p_df = pd.DataFrame(mat, index=unique_da, columns=unique_da)
# emition_p_df

Unnamed: 0,<UNK>,<s>,A_completion_check,A_confirmation,A_detail_request,A_disconfirmation,A_grant,A_greeting,A_hold_request,A_receipt,...,U_completion_check,U_confirmation,U_disconfirmation,U_greeting,U_hold_request,U_partial_request,U_receipt,U_repair_initiator,U_request_summary,U_sequence_closer
<UNK>,0.04355,0.0,0.000181,0.015093,0.010058,0.000253,0.0,0.013121,0.001362,0.000618,...,3e-06,0.017258,0.0001,0.007757,2.4e-05,0.008932,1.1e-05,0.000185,0.000154,0.011303
<s>,0.003297,0.0,0.0,4.4e-05,0.000119,0.0,0.0,0.016081,0.0,3e-06,...,0.0,0.000219,0.0,0.006899,0.0,0.000702,0.0,0.0,0.0,6e-06
A_completion_check,4.1e-05,0.0,0.0,0.0,0.010095,0.0,2e-06,2e-06,5e-06,0.0,...,0.0,2.9e-05,0.0,2.2e-05,0.0,3e-06,0.0,2e-06,0.0,1.4e-05
A_confirmation,0.004452,0.0,0.001166,0.000554,0.006704,0.000426,0.009826,0.001516,0.000478,9e-05,...,0.0,0.001516,1.4e-05,0.000561,2e-06,0.000521,2e-06,1e-05,1.6e-05,0.00111
A_detail_request,0.001593,0.0,4e-05,0.002544,0.002509,3e-05,0.005808,0.001264,0.000146,6.5e-05,...,0.0,0.017554,0.000366,0.016439,2.2e-05,0.004735,3e-06,0.000163,0.000532,0.00262
A_disconfirmation,0.000285,0.0,2e-06,2.4e-05,0.000162,1.6e-05,0.000486,6e-05,1.4e-05,2e-06,...,0.0,0.000399,6e-06,6.7e-05,0.0,0.000181,0.0,5e-06,3e-06,2.1e-05
A_grant,0.011661,0.0,2.7e-05,0.000993,0.001758,4.4e-05,0.0,0.003636,0.001044,7.8e-05,...,0.0,0.017847,4e-05,0.009948,1.4e-05,0.011732,0.0,0.000139,5.5e-05,0.005614
A_greeting,0.004884,0.0,0.006443,0.010602,0.027439,0.000245,0.013796,0.009425,0.000493,5.2e-05,...,5e-06,0.003559,1.6e-05,0.003437,2e-06,0.001463,3e-06,4e-05,4.8e-05,0.001048
A_hold_request,0.003125,0.0,0.0,0.000234,0.000684,4.9e-05,0.001669,0.001033,0.00071,2.9e-05,...,0.0,0.001055,0.0,9.2e-05,8e-06,0.000127,2e-06,2e-06,2e-06,0.000341
A_receipt,0.000698,0.0,5.2e-05,1.7e-05,7.8e-05,0.0,1.7e-05,0.000136,8e-06,2.4e-05,...,0.0,9.3e-05,0.0,1e-05,0.0,1.4e-05,3e-06,0.0,0.0,0.000581


## HMM

from wikipedia, code inclusive: https://en.wikipedia.org/wiki/Viterbi_algorithm#Pseudocode

In [533]:
#viterbi working version explained

def viterbi(obs, states, start_p, trans_p, emit_p):
    V = [{}]
    for st in states:
        V[0] [st] = {"prob": start_p[st] * emit_p[st] [obs[0]], "prev": None}
        
#         print(obs[0]) # <s>
#         print(V[0]) #{'New': {'prob': 0.026402178674778336, 'prev': None}}
#         print(V[0][st]) #{'prob': 0.026402178674778336, 'prev': None}
        
#         print(emit_p[st]) #{'<UNK>': 0.13607616415027982, 'A_detail_request': 0.09849232743674069, 'U_answer': 0.08896718105028044, 'U_sequence_closer': 0.08872037563783773, 'A_greeting': 0.08228646704222224, 'A_sequence_closer': 0.07324259121002523, 'U_confirmation': 0.07295413738423284, 'A_confirmation': 0.07257621659642996, 'U_partial_request': 0.07246823922848628, 'A_grant': 0.06559316595812946, 'U_greeting': 0.058711922552461576, '<s>': 0.026668867348260947, '<e>': 0.026668867348260947, 'A_completion_check': 0.009955513324407203, 'A_hold_request': 0.009088609313202238, 'A_receipt': 0.006598959715186554, 'A_request_summary': 0.002805869032707887, 'A_repair_initiator': 0.0024094378389718086, 'A_disconfirmation': 0.0019713582318860254, 'U_request_summary': 0.0011461026340307643, 'U_disconfirmation': 0.001110624355992127, 'U_repair_initiator': 0.0008499361390995304, 'U_receipt': 0.0002406352771316275, 'U_completion_check': 0.0002190398035428917, 'U_hold_request': 0.00013882804449901587}
#         print(emit_p[st] [obs[0]]) #0.026668867348260947, que é o de <s> no caso do primeiro state
        

    # Run Viterbi when t > 0
    for t in range(1, len(obs)):
        V.append({})
        for st in states:
#             print(st) #new and current
            max_tr_prob = V[t - 1] [states[0]] ["prob"] * trans_p[states[0]] [st] #0.026402178674778336*0.1 and same*0.9
            
#             print(V)
#             print(V[t]) #AQUI SOLO SALE NEW EM TODOS OS Ts{'New': {'prob': 0.00021725420053650106, 'prev': 'New'}} OR ON NEXT T: {'New': {'prob': 6.887922369891843e-05, 'prev': 'Current'}}
#             print(V[t - 1]) #AQUI SAI O ANTERIOR TANTO PRA NEW QUANTO CURRENT:{'New': {'prob': 0.026402178674778336, 'prev': None}, 'Current': {'prob': 0.0002666886734826095, 'prev': None}}
#             print(V[t - 1] [states[0]]) #choose higher prob{'prob': 0.026402178674778336, 'prev': None}
#             print(V[t - 1] [states[0]] ["prob"]) # 0.026402178674778336
            
#             print(trans_p) #always the same: {'New': {'New': 0.1, 'Current': 0.9}, 'Current': {'New': 0.6, 'Current': 0.4}}
#             print(trans_p[states[0]]) #from state 0: {'New': 0.1, 'Current': 0.9}
#             print(trans_p[states[0]] [st]) #first iteration: 0.1, next iteration: 0.9 and repeats
            
            
            prev_st_selected = states[0]
#             print(states[1:])
            for prev_st in states[1:]:
                tr_prob = V[t - 1] [prev_st] ["prob"] * trans_p[prev_st] [st] # a diferenca aqui em relacao ao max_tr_prob é que aqui se calcula o previous state
                if tr_prob > max_tr_prob: #if other state is higher than state 0 (i.e if Current > New)
                    max_tr_prob = tr_prob #then new max is Current instead on New. Here we store the actual probability
                    prev_st_selected = prev_st #and previous state is updated. Here we store name of state

            max_prob = max_tr_prob * emit_p[st] [obs[t]] #prob of state* emission prob of the DA being observed
            V[t] [st] = {"prob": max_prob, "prev": prev_st_selected}
#             print(V[t])
#             print(V[t-1])
#             print(emit_p) #whole dictionary is used as input including key 'New': whole dict, 'Current': whole dict
#             print(emit_p[st]) #whole dictionary is used as input, but the key 'New' and 'Current' don't appear anymore
#             print(obs[t]) #Current DA
#             print(obs[t-1]) #previous DA. this works because is from second onwards
#             print(emit_p[st] [obs[t]])


    for line in dptable(V):
        print(line)

    optimal = []
    max_prob = 0.0
    best_st = None
    # Get most probable state and its backtrack
    for st, data in V[-1].items(): #V[-1] is the last dic. ex: {'New': {'prob': 6.983898154776834e-19, 'prev': 'Current'}, 'Current': {'prob': 1.5713770848247866e-18, 'prev': 'New'}
#         print(data) #{'prob': 6.983898154776834e-19, 'prev': 'Current'}
        if data["prob"] > max_prob: #6.983898154776834e-19 > 0.0 (in first step)
            max_prob = data["prob"] #update max prob to compare with second state in next iteration
#             print(max_prob)
            best_st = st
    optimal.append(best_st) #appending all best states
    previous = best_st #updating previous as best state to be used later


    # Follow the backtrack till the first observation
    for t in range(len(V) - 2, -1, -1): # start=len(V)-2, stop=-1, step=-1. Calculates from last step to first
        print(t, len(V))
        optimal.insert(0, V[t + 1] [previous] ["prev"])
        previous = V[t + 1] [previous] ["prev"]

    print ("The steps of states are " + " ".join(optimal) + " with highest probability of %s" % max_prob)
#     print(V) #probability of each state given all possible previous states

def dptable(V):
    # Print a table of steps from dictionary
    yield " " * 5 + "     ".join(("%3d" % i) for i in range(len(V)))
    for state in V[0]:
        yield "%.7s: " % state + " ".join("%.7s" % ("%lf" % v[state] ["prob"]) for v in V)

In [674]:
######## FUNCTIONING CODE FOR BIGRAMS #########
######PS: this would need as input sentences with padding=1


def viterbi(obs, states, start_p, trans_p, emit_p, emit_p_bi, emit_p_tri): 
    V = [{}]
    for st in states:
        V[0] [st] = {"prob": start_p[st] * emit_p[st] [obs[0]], "prev": None}


    # Run Viterbi when t > 0
    for t in range(1, len(obs)): 

        V.append({})
        for st in states: #New and Current
            max_tr_prob = V[t - 1] [states[0]] ["prob"] * trans_p[states[0]] [st] #0.026402178674778336*0.1 and same*0.9 ###### OLD
            prev_st_selected = states[0] 

            for prev_st in states[1:]:
                tr_prob = V[t - 1] [prev_st] ["prob"] * trans_p[prev_st] [st] # a diferenca aqui em relacao ao max_tr_prob é que aqui se calcula o previous state
                if tr_prob > max_tr_prob: #if other state is higher than state 0 (i.e if Current > New)
                    max_tr_prob = tr_prob #then new max is Current instead on New. Here we store the actual probability
                    prev_st_selected = prev_st #and previous state is updated. Here we store name of state

            ############# NEW to include emit_p for bigram
            max_prob = max_tr_prob * statistics.fmean([emit_p[st] [obs[t]],emit_p_bi[state].get((obs[t-1],obs[t]))]) #use average
                      
            V[t] [st] = {"prob": max_prob, "prev": prev_st_selected}

    for line in dptable(V):
        print(line)

    optimal = []
    max_prob = 0.0
    best_st = None
    # Get most probable state and its backtrack
    for st, data in V[-1].items(): # V[-1] is the last dic. ex: {'New': {'prob': 6.983898154776834e-19, 'prev': 'Current'}, 'Current': {'prob': 1.5713770848247866e-18, 'prev': 'New'}
        if data["prob"] > max_prob: #6.983898154776834e-19 > 0.0 (in first step)
            max_prob = data["prob"] #update max prob to compare with second state in next iteration
            best_st = st
    optimal.append(best_st) #appending all best states
    previous = best_st #updating previous as best state to be used later
    
    print(V)
    # Follow the backtrack till the first observation
    for t in range(len(V) - 2, -1, -1): # start=len(V)-2, stop=-1, step=-1. Calculates from last step to first
        optimal.insert(0, V[t + 1] [previous] ["prev"])
        previous = V[t + 1] [previous] ["prev"]
        print(t, previous)

    print ("The steps of states are " + " ".join(optimal) + " with highest probability of %s" % max_prob)

def dptable(V):
    # Print a table of steps from dictionary
    yield " " * 5 + "     ".join(("%3d" % i) for i in range(len(V)))
    for state in V[0]:
        yield "%.7s: " % state + " ".join("%.7s" % ("%lf" % v[state] ["prob"]) for v in V)

In [702]:
###### TRIGRAMS
######PS: this would need as input sentences with padding=2 (each side)

def viterbi(obs, states, start_p, trans_p, emit_p, emit_p_bi, emit_p_tri): 
    V = [{}]

    for st in states:
        V[0] [st] = {"prob": start_p[st] * emit_p[st] [obs[0]], "prev": None} #### OLD
        V.append({})
        V[1] [st] = {"prob": start_p[st] * emit_p[st] [obs[1]], "prev": None} #### NEW


    # Run Viterbi when t > 0
    for t in range(2, len(obs)): #in my example len(obs)=32. i.e: range 2, 31 inclusive
        V.append({})
        for st in states: #New and Current            
            max_tr_prob = V[t - 1] [states[0]] ["prob"] * trans_p[states[0]] [st] #0.026402178674778336*0.1 and same*0.9 ######### NEW
            prev_st_selected = states[0] 
            
            for prev_st in states[1:]: #(states[1:]) >>> Current
                tr_prob = V[t - 1] [prev_st] ["prob"] * trans_p[prev_st] [st] # a diferenca aqui em relacao ao max_tr_prob é que aqui se calcula o previous state
                if tr_prob > max_tr_prob: #if other state is higher than state 0 (i.e if Current > New)
                    max_tr_prob = tr_prob #then new max is Current instead on New. Here we store the actual probability
                    prev_st_selected = prev_st #and previous state is updated. Here we store name of state

            ######################################## NEW to include emit_p for trigram,bigram and unigram
            max_prob = max_tr_prob * statistics.fmean([emit_p[st] [obs[t]],emit_p_bi[state].get((obs[t-1],obs[t])), emit_p_tri[state].get((obs[t-2],obs[t-1],obs[t]))]) 
            # max_prob = max_tr_prob * statistics.fmean([emit_p[st] [obs[t]],emit_p_bi[state].get((obs[t-1],obs[t]))]) ### for bigram
                      
            V[t] [st] = {"prob": max_prob, "prev": prev_st_selected}

    V = V[:-1]
    for line in dptable(V):
        print(line)

    optimal = []
    max_prob = 0.0
    best_st = None
    # Get most probable state and its backtrack
    for st, data in V[-1].items(): # V[-1] is the last dic. ex: {'New': {'prob': 6.983898154776834e-19, 'prev': 'Current'}, 'Current': {'prob': 1.5713770848247866e-18, 'prev': 'New'}
        if data["prob"] > max_prob: #6.983898154776834e-19 > 0.0 (in first step)
            max_prob = data["prob"] #update max prob to compare with second state in next iteration
            best_st = st
    optimal.append(best_st) #appending all best states
    previous = best_st #updating previous as best state to be used later


    # Follow the backtrack till the first observation
    for t in range(len(V) - 2, -1, -1): # start=len(V)-2, stop=-1, step=-1. Calculates from last step to first ###### NEW
        optimal.insert(0, V[t + 1] [previous] ["prev"])
        previous = V[t + 1] [previous] ["prev"]
    optimal = optimal[1:]
    
    print ("The steps of states are " + " ".join(optimal) + " with highest probability of %s" % max_prob)

def dptable(V):
#     V_ = V[:-1]
    # Print a table of steps from dictionary
    yield " " * 5 + "     ".join(("%3d" % i) for i in range(len(V)))
    for state in V[0]:
        yield "%.7s: " % state + " ".join("%.7s" % ("%lf" % v[state] ["prob"]) for v in V)

In [703]:
#somente pra testar. esse codigo está mais abaixo
obs = unigram[5] # just an example
obs_example = ['<s>','<ss>','A_greeting','U_greeting','A_detail_request','U_confirmation','U_partial_request','U_answer','A_detail_request','A_grant', 'U_answer','A_detail_request','U_answer','U_sequence_closer','<ee>','<e>']
states = ['New', 'Current']
start_p = {'New':0.99,'Current':0.01}
trans_p = {'New': {'New':0.1,'Current':0.9},
           'Current': {'New':0.4,'Current':0.6}}
emit_uni = {}
emit_bi = {}
emit_tri = {}

emit_uni['New'] = uni #emition_p_df
emit_uni['Current'] = uni #emition_p_df #uni #####teste = uni +0.8, só pra ver melhor a diferença no algotritmo
emit_bi['New'] = bi #change this later to correct numbers
emit_bi['Current'] = bi

emit_tri['New'] = tri #change this later to correct numbers
emit_tri['Current'] = tri

viterbi(obs, states, start_p, trans_p, emit_uni, emit_bi, emit_tri) #steps are being rounded to zero cause prob is much smaller. Final is around e-18

       0       1       2       3       4       5       6       7       8       9      10      11      12      13      14      15      16      17      18      19      20      21      22      23      24      25      26      27      28      29      30      31
New: 0.02506 0.02506 0.00009 0.00001 0.00000 0.00000 0.00000 0.00000 0.00000 0.00000 0.00000 0.00000 0.00000 0.00000 0.00000 0.00000 0.00000 0.00000 0.00000 0.00000 0.00000 0.00000 0.00000 0.00000 0.00000 0.00000 0.00000 0.00000 0.00000 0.00000 0.00000 0.00000
Current: 0.00025 0.00025 0.00081 0.00002 0.00000 0.00000 0.00000 0.00000 0.00000 0.00000 0.00000 0.00000 0.00000 0.00000 0.00000 0.00000 0.00000 0.00000 0.00000 0.00000 0.00000 0.00000 0.00000 0.00000 0.00000 0.00000 0.00000 0.00000 0.00000 0.00000 0.00000 0.00000
The steps of states are New Current Current New Current New Current New Current New Current Current New Current New Current Current New Current New Current New Current New Current New Current New Current New Current w

In [414]:
#thinking how to add bigram emission probability:

# [[print(st, da[0], da[1], va) for da,va in v.items()] for st, v in emit_bi.items()]
# [[print(v.get((da[0], da[1]))) for da,va in v.items()] for st, v in emit_bi.items()]

#this works:
# emit_bi['New'].get((obs[1], 'A_sequence_closer'))
emit_tri['New'].get((obs[0], obs[1], obs[2])) #como corrigir exemplos como esse para trigram. double padding?

In [475]:
#test dict = unigram + 0.8,  only to see differences when printing lines on algorithm 

teste = {}
for key, value in uni.items():
    value += 0.8
    teste[key] = value

In [237]:
emition_p_df['A_confirmation']['<UNK>'] #column, row
emition_p_df

Unnamed: 0,<UNK>,<s>,A_completion_check,A_confirmation,A_detail_request,A_disconfirmation,A_grant,A_greeting,A_hold_request,A_receipt,...,U_completion_check,U_confirmation,U_disconfirmation,U_greeting,U_hold_request,U_partial_request,U_receipt,U_repair_initiator,U_request_summary,U_sequence_closer
<UNK>,0.04355,0.0,0.000181,0.015093,0.010058,0.000253,0.0,0.013121,0.001362,0.000618,...,3e-06,0.017258,0.0001,0.007757,2.4e-05,0.008932,1.1e-05,0.000185,0.000154,0.011303
<s>,0.003297,0.0,0.0,4.4e-05,0.000119,0.0,0.0,0.016081,0.0,3e-06,...,0.0,0.000219,0.0,0.006899,0.0,0.000702,0.0,0.0,0.0,6e-06
A_completion_check,4.1e-05,0.0,0.0,0.0,0.010095,0.0,2e-06,2e-06,5e-06,0.0,...,0.0,2.9e-05,0.0,2.2e-05,0.0,3e-06,0.0,2e-06,0.0,1.4e-05
A_confirmation,0.004452,0.0,0.001166,0.000554,0.006704,0.000426,0.009826,0.001516,0.000478,9e-05,...,0.0,0.001516,1.4e-05,0.000561,2e-06,0.000521,2e-06,1e-05,1.6e-05,0.00111
A_detail_request,0.001593,0.0,4e-05,0.002544,0.002509,3e-05,0.005808,0.001264,0.000146,6.5e-05,...,0.0,0.017554,0.000366,0.016439,2.2e-05,0.004735,3e-06,0.000163,0.000532,0.00262
A_disconfirmation,0.000285,0.0,2e-06,2.4e-05,0.000162,1.6e-05,0.000486,6e-05,1.4e-05,2e-06,...,0.0,0.000399,6e-06,6.7e-05,0.0,0.000181,0.0,5e-06,3e-06,2.1e-05
A_grant,0.011661,0.0,2.7e-05,0.000993,0.001758,4.4e-05,0.0,0.003636,0.001044,7.8e-05,...,0.0,0.017847,4e-05,0.009948,1.4e-05,0.011732,0.0,0.000139,5.5e-05,0.005614
A_greeting,0.004884,0.0,0.006443,0.010602,0.027439,0.000245,0.013796,0.009425,0.000493,5.2e-05,...,5e-06,0.003559,1.6e-05,0.003437,2e-06,0.001463,3e-06,4e-05,4.8e-05,0.001048
A_hold_request,0.003125,0.0,0.0,0.000234,0.000684,4.9e-05,0.001669,0.001033,0.00071,2.9e-05,...,0.0,0.001055,0.0,9.2e-05,8e-06,0.000127,2e-06,2e-06,2e-06,0.000341
A_receipt,0.000698,0.0,5.2e-05,1.7e-05,7.8e-05,0.0,1.7e-05,0.000136,8e-06,2.4e-05,...,0.0,9.3e-05,0.0,1e-05,0.0,1.4e-05,3e-06,0.0,0.0,0.000581


#### dictionary options atm:
Need to choose best format to input to viterbi
- ngrams_prob
- unigram
- uni
- bi
- tri
- four

The function viterbi takes the following arguments: obs is the sequence of observations, e.g. ['normal', 'cold', 'dizzy']; states is the set of hidden states; start_p is the start probability; trans_p are the transition probabilities; and emit_p are the emission probabilities. For simplicity of code, we assume that the observation sequence obs is non-empty and that trans_p[i] [j] and emit_p[i] [j] is defined for all states i,j.

In the running example, the forward/Viterbi algorithm is used as follows:


##### florian
The language model would be a nice baseline, where you can first calculate probabilities for different values of N (e.g.: 1 to 4 or 5, and potentially also skipgrams), and subsequently identify the pattern combination in each conversation that is most likely in terms of the probabilities. I’d propose to look at this as a hidden markov problem with state probabilities and transition probabilities, where at each segment between two dialog moves there are different options:
 
- The next move is part of an ongoing pattern (high probability for either second move in a bigram, third move in a tri-gram, etc.)
    - Note that there also might be sequence expansion, so the move can continue a pattern that was expanded on
- The next move is the start of a new pattern (high unigram probability compared to bigram or higher)
 
It may be that probabilities for unigrams are higher than those for bi-grams or higher, and by means of transition probabilities (stay in ongoing pattern or transition to new one) you can make sure that the cost of starting a new pattern is higher than continuing a current one.
 
In any case, the first step is to calculate language model probabilities based on the instructions in the work of jurafsky (and there is ample code available for this). Next you can apply these probabilities to the data, so that at each boundary between two moves you know the probabilities for different scenario’s (unigram, bigram, etc.).


##### me
viterbi(obs,
        states,
        start_p,
        trans_p,
        emit_p)
        
- observations: all unique DAs?
- states: DAs
- start probability: most probable first move. i.e: bigrams taking only probability of all combinations that start with \<s> symbol.
    - Should I include here trigrams and fourgrams that also start with \<s> symbol?
- transition probability: boundaries bewteen moves (probabilities generated in the LM: probabilities of tuples bigram for simplification - could also be trigram, 4gram...)
- emition probability: unigram probabilities (prob of each move hapenning at all)

The hidden state would contain info on bigrams, trigrams, fourgrams

##### florian

- observations: all unique DAs?
 
This is the sequence of Dialog acts in the conversation.
 
- states:
 
The hidden states are one of the following:

    - current pattern (e.g.: stay in pattern)
    - new pattern  
Note that the start / transition / emition probabilities are linked to these two states
 
- start probability: most probable first move. i.e.: bigrams taking only probability of all combinations that start with \<s> symbol.
Should I include here trigrams and fourgrams that also start with \<s> symbol?
 
Here you can simply assign a 1.0 probability for new pattern and 0.0 for current pattern – a conversation always starts with a new pattern.
 
- transition probability: boundaries between moves (probabilities generated in the LM: probabilities of tuples bigram for simplification - could also be trigram, 4gram...)
 
This you can also set yourself, and you can play around with different values. There will be four values:
 
    - from current to current (e.g.: stay in current pattern, will by definition be the third DA or higher in a pattern, since the first DA in a pattern is always of the hidden state ‘new’)
    - from current to new (e.g.: transition to new pattern, setting this one to a high value will likely lead to quite some smaller patterns as output)
    - from new to new (just started a new pattern and directly go to new one, not so realistic and you can set this one very low or simply to 0.0)
    - from new to current (just started a new pattern and staying in it)

- emition probability: unigram probabilities (prob of each move happening at all)
 
Here you will make use of all LM probabilities that you have at any point, but it should be linked to the two stages:
 
    - the probability that a DA is a start of a new pattern at any point can be based on the following information:
        - unigram probability of the DA
        - the bigram probability of the DA as start with the next DA (if any) as second move (e.g.: the DA as first move conditioned on the next DA as second move in the bigram, basically the other way around than is typically done in an LM)
        - the trigram probability of the DA as start conditioned on the next two moves
            - These values are probably not part of your currently trained LM, and may also be omitted
    - the probability that a DA is a continuation of the current pattern can be based on the following information:
        - the bigram probability  of the DA conditioned on the previous DA
        - the trigram probability of the DA conditioned on the previous two Das
        - etc.
 
This latter part may be a bit tricky – in essence you want to follow different paths: you want to consider the trigram probability if in a certain path there is a pattern going on for two moves. If it is not possible to include these different paths, you may also take the average of the different values (bigram, trigram, etc.).
 

In [259]:
# generic example to test code:

# obs = ["normal", "cold", "dizzy","dizzy","normal", "dizzy"]
# states = ("Healthy", "Fever")
# start_p = {"Healthy": 0.6, "Fever": 0.4}
# trans_p = {
#     "Healthy": {"Healthy": 0.7, "Fever": 0.3},
#     "Fever": {"Healthy": 0.4, "Fever": 0.6},
# }
# emit_p = {
#     "Healthy": {"normal": 0.5, "cold": 0.4, "dizzy": 0.1},
#     "Fever": {"normal": 0.1, "cold": 0.3, "dizzy": 0.6},
# }

# viterbi(obs, states, start_p, trans_p, emit_p)

In [378]:
# print('unigram prob of DA: '+ str(uni['A_detail_request'])) 
# print('bigram prob of DA: ' +str(bi['A_greeting A_detail_request']))
# #same as taking from matrix below:
# print('bigram prob of DA: ' +str(emition_p_df['A_detail_request'].loc['A_greeting'])) #first is column, second bracket is row
# print('emission prob uni*bi(conditioned on previous DA): ' + str(uni['A_greeting']*bi['A_greeting A_detail_request']))