In [None]:
from matplotlib import pyplot as plt
from sklearn.cluster import AgglomerativeClustering
import scipy.cluster.hierarchy as sch

#Basic imports
import numpy as np
import pandas as pd

#sklearn imports
from sklearn.decomposition import PCA #Principal Component Analysis
from sklearn.manifold import TSNE #T-Distributed Stochastic Neighbor Embedding
from sklearn.cluster import KMeans #K-Means Clustering
from sklearn.preprocessing import StandardScaler #used for 'Feature Scaling'

#plotly imports
import plotly as py
import plotly.graph_objs as go
from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot

In [None]:
df = pd.read_csv('./data_TM2/synt_annotated_data.csv', index_col=0)
# df['concat_da_names'] = df.iloc[:, 8:21].apply(lambda row: row.dropna().tolist(), axis=1)
df.head()

In [None]:
df_to_cluster = df.iloc[:, 8:21]
df_to_cluster.columns

In [None]:
DA = ['repair_initiator','greeting','request_summary','confirmation','receipt','disconfirmation',
      'sequence_closer','completion_check','hold_request','partial_request', 'detail_request', 'grant', 'answer'] 

for e,i in enumerate(DA):
    print(e,i, e+8) #(class number, original tag, column number)
    df_to_cluster.iloc[:,e] = df_to_cluster.iloc[:,e].replace(i, 1)
    df_to_cluster.iloc[:,e] = df_to_cluster.iloc[:,e].fillna(0)
    
df_to_cluster

In [None]:
#select only labeled dataset

df.dropna(subset=['DA_rep_init', 'DA_greet', 'DA_req_sum',
       'DA_conf', 'DA_receipt', 'DA_disconf', 'DA_closer', 'DA_comp_check',
       'DA_hold', 'DA_partial_req', 'DA_detail_req', 'DA_grant', 'DA_answer'], how='all', inplace = True)

print(len(df))

In [None]:
DA = ['repair_initiator','greeting','request_summary','confirmation','receipt','disconfirmation',
      'sequence_closer','completion_check','hold_request','partial_request', 'detail_request', 'grant', 'answer'] 

#replace tags per number of class
for e,i in enumerate(DA):
    print(e,i, e+8) #(class number, original tag, column number)
    df.iloc[:,e+8] = df.iloc[:,e+8].replace(i, (e+1))
    df.iloc[:,e+8] = df.iloc[:,e+8].fillna(0)

#     df.iloc[:,e+8] =df.iloc[:,e+8].astype(int)
    
df['concat_col'] = df.iloc[:, 8:21].apply(lambda row: row.dropna().tolist(), axis=1)

In [None]:
concat_col = []
for row in df['concat_col']:
    row = [int(x) for x in row]
    concat_col.append(row)
    
df['concat_col'] = concat_col
# print(df['concat_col'].head())
df.head()

In [None]:
#filled with zeros
X = df['concat_col'].to_list()

In [None]:
#example

X = [[0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
      [0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 11, 0, 0],
      [0, 0, 0, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0],
      [0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
      [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 11, 0, 0],
      [0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 11, 0, 0],
      [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 13],
      [0, 0, 0, 4, 0, 0, 0, 0, 0, 0, 11, 0, 0],
      [0, 0, 0, 4, 0, 0, 7, 0, 0, 0, 0, 0, 0],
      [0, 0, 3, 4, 0, 0, 7, 0, 0, 0, 0, 0, 0],
      [0, 0, 0, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0],
      [0, 0, 0, 4, 0, 0, 7, 0, 0, 0, 0, 0, 0],
      [0, 0, 0, 0, 0, 0, 7, 0, 0, 0, 0, 0, 0],
      [0, 0, 0, 0, 0, 0, 0, 0, 0, 10, 0, 0, 0],
      [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 11, 12, 0],
      [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 13],
      [0, 0, 0, 4, 0, 0, 7, 0, 0, 0, 0, 0, 0],
      [0, 0, 0, 4, 0, 0, 7, 0, 0, 0, 0, 0, 0],
      [0, 0, 0, 0, 0, 0, 7, 0, 0, 0, 0, 0, 0],
      [0, 0, 0, 0, 0, 0, 7, 0, 0, 0, 0, 0, 0]]


# X = [[10,20],[5,15],[13,10],[20,10],[1,13],[2,12]]

dendrogram = sch.dendrogram(sch.linkage(X, method='ward'))

In [None]:
%matplotlib inline
import matplotlib.pylab as plt
import seaborn as sns
import pandas as pd
import numpy as np

sns.clustermap(X)

In [None]:
model = AgglomerativeClustering(n_clusters=3, affinity='euclidean', linkage='ward')
model.fit(X)
labels = model.labels_


In [None]:
# plt.scatter(X[labels==0,0], X[labels==0,1], s=50, marker='o', color='red')
# plt.scatter(X[labels==1, 0], X[labels==1, 1], s=50, marker='o', color='blue')
# plt.scatter(X[labels==2, 0], X[labels==2, 1], s=50, marker='o', color='green')
plt.show()

## Notebook t-sne

In [None]:
# Time to build our clusters.
# In this kernel, we will be visualizing only three different clusters on our data. 
#I chose three because I found it to be a good number of clusters to help us visualize our data


########### en lugar de kmean use HAC ##########


#Initialize our model
kmeans = KMeans(n_clusters=3)
#Fit our model
kmeans.fit(df_to_cluster)
KMeans(n_clusters=3)
#Find which cluster each data-point belongs to
clusters = kmeans.predict(df_to_cluster)
#Add the cluster vector to our DataFrame, X
df_to_cluster["Cluster"] = clusters

In [None]:
#sampling 
#plotX is a DataFrame containing 5000 values sampled randomly from X
plot_df = pd.DataFrame(np.array(df_to_cluster.sample(5000)))

#Rename plotX's columns since it was briefly converted to an np.array above
plot_df.columns = df_to_cluster.columns

# plot_df.head()

In [None]:
#Set our perplexity
perplexity = 50

#T-SNE with two dimensions
tsne_2d = TSNE(n_components=2, perplexity=perplexity)

# #T-SNE with three dimensions
# tsne_3d = TSNE(n_components=3, perplexity=perplexity)

#This DataFrame contains two dimensions, built by T-SNE
TCs_2d = pd.DataFrame(tsne_2d.fit_transform(plot_df.drop(["Cluster"], axis=1)))

# #And this DataFrame contains three dimensions, built by T-SNE
# TCs_3d = pd.DataFrame(tsne_3d.fit_transform(df_to_cluster.drop(["Cluster"], axis=1)))

#"TC1_2d" means: 'The first component of the components created for 2-D visualization, by T-SNE.'
#And "TC2_2d" means: 'The second component of the components created for 2-D visualization, by T-SNE.'
TCs_2d.columns = ["TC1_2d","TC2_2d"]

# TCs_3d.columns = ["TC1_3d","TC2_3d","TC3_3d"]

df_to_cluster = pd.concat([plot_df,TCs_1d,TCs_2d,TCs_3d], axis=1, join='inner')


#Each of these new DataFrames will hold all of the values contained in exacltly one of the clusters. For example, all of the values contained within the DataFrame, cluster0 will belong to 'cluster 0', and all the values contained in DataFrame, cluster1 will belong to 'cluster 1', etc.

cluster0 = plot_df[plot_df["Cluster"] == 0]
cluster1 = plot_df[plot_df["Cluster"] == 1]
cluster2 = plot_df[plot_df["Cluster"] == 2]


In [None]:
#Instructions for building the 2-D plot

#trace1 is for 'Cluster 0'
trace1 = go.Scatter(
                    x = cluster0["TC1_2d"],
                    y = cluster0["TC2_2d"],
                    mode = "markers",
                    name = "Cluster 0",
                    marker = dict(color = 'rgba(255, 128, 255, 0.8)'),
                    text = None)

#trace2 is for 'Cluster 1'
trace2 = go.Scatter(
                    x = cluster1["TC1_2d"],
                    y = cluster1["TC2_2d"],
                    mode = "markers",
                    name = "Cluster 1",
                    marker = dict(color = 'rgba(255, 128, 2, 0.8)'),
                    text = None)

#trace3 is for 'Cluster 2'
trace3 = go.Scatter(
                    x = cluster2["TC1_2d"],
                    y = cluster2["TC2_2d"],
                    mode = "markers",
                    name = "Cluster 2",
                    marker = dict(color = 'rgba(0, 255, 200, 0.8)'),
                    text = None)

data = [trace1, trace2, trace3]

title = "Visualizing Clusters in Two Dimensions Using T-SNE (perplexity=" + str(perplexity) + ")"

layout = dict(title = title,
              xaxis= dict(title= 'TC1',ticklen= 5,zeroline= False),
              yaxis= dict(title= 'TC2',ticklen= 5,zeroline= False)
             )

fig = dict(data = data, layout = layout)

iplot(fig)

## N-grams from Jurafsky

In [1]:
import pandas as pd
import numpy as np
import re
from collections import Counter

In [2]:
df = pd.read_csv('./data_TM2/processed_utterances_sentence_DA_labeling.csv', index_col=0)

In [3]:
####### correct format of column that was string and should be list

import ast

print(type(df['all_DA'][0]))
all_DA = []
for row in range(len(df['all_DA'])):
    inst = ast.literal_eval(df['all_DA'][row])
    inst = [i.strip("[]") for i in inst]
    all_DA.append(inst)
    
df['all_DA'] = all_DA

print(type(df['all_DA'][0]))

<class 'str'>
<class 'list'>


In [4]:
df.drop(df.iloc[:, 8:-1], inplace = True, axis = 1)
df = df.explode('all_DA')
df['all_DA'] = df['all_DA'].fillna('<UNK>')
df['all_DA'].head()

0        U_greeting
1             <UNK>
2        A_greeting
3    A_confirmation
4        U_greeting
Name: all_DA, dtype: object

In [5]:
# #adding unknown token <UNK> for unlabeled utterances and *(removing more than 1 label for each utterance)

# ALL_DA = []
# for row in range(len(df['all_DA'])):
#     if len(df['all_DA'].iloc[row]) == 0:
#         ALL_DA.append(['<UNK>'])
# #     *uncomment the below to leave only one label per utterance. Otherwise multiple labels may be used 
# #     elif len(df['all_DA'].iloc[row]) >1:
# #         ALL_DA.append([df['all_DA'].iloc[row][0]])
#     else:
#         ALL_DA.append(df['all_DA'].iloc[row])
        
# # ALL_DA

In [6]:
#solution for all dialogues. After This data will be prepared to serve as input

unique_ids = df['conversation_id'].unique()

start = '<s>'
end ='<e>'
full_DA = []  

for dialog in unique_ids:
    dialog_DA = []   
    temp = df.loc[df['conversation_id'] == dialog]
    for x in range(len(temp)):
        dialog_DA.append(temp['all_DA'].iloc[x])

    #insert begin and end tokens for each dialog. This is for bigram only. for tri would need two symbols in the begin and 2 in the end
    dialog_DA.insert(0, start)
    dialog_DA.append(end)
    full_DA.append(dialog_DA)

full_DA[0]

['<s>',
 'U_greeting',
 '<UNK>',
 'A_greeting',
 'A_confirmation',
 'U_greeting',
 '<UNK>',
 '<UNK>',
 'A_greeting',
 '<UNK>',
 '<UNK>',
 '<UNK>',
 '<UNK>',
 'A_confirmation',
 'A_confirmation',
 'A_sequence_closer',
 '<UNK>',
 '<UNK>',
 '<UNK>',
 '<UNK>',
 'A_request_summary',
 'A_confirmation',
 'A_sequence_closer',
 'U_confirmation',
 'A_confirmation',
 'A_sequence_closer',
 '<UNK>',
 'A_sequence_closer',
 '<UNK>',
 '<UNK>',
 '<UNK>',
 '<UNK>',
 'A_confirmation',
 'A_sequence_closer',
 '<UNK>',
 '<UNK>',
 '<UNK>',
 'A_confirmation',
 'A_sequence_closer',
 'U_sequence_closer',
 'U_sequence_closer',
 'A_confirmation',
 'A_sequence_closer',
 'A_sequence_closer',
 '<UNK>',
 'U_greeting',
 'U_sequence_closer',
 '<e>']

In [7]:
# #for one example of dialog
# start = ['<s>']
# end =['<e>']

# dialog_DA = []   
# temp = df.loc[df['conversation_id'] == 'dlg-00100680-00e0-40fe-8321-6d81b21bfc4f']
# [dialog_DA.append(temp['all_DA'][x]) for x in range(len(temp))]

# #insert begin and end tokens for each dialog. This is for bigram only. for tri would need two symbols in the begin and 2 in the end
# dialog_DA.insert(0, start)
# dialog_DA.append(end)

# dialog_DA

In [8]:
#flatten nested lists

def flatten(t):
    # https://stackoverflow.com/questions/952914/how-to-make-a-flat-list-out-of-a-list-of-lists
    return [item for sublist in t for item in sublist]

# df['labels_UNK_and_SINGLE_DA'] = flatten(ALL_DA)

In [9]:
#modeling phase
#from this tutorial: https://medium.com/swlh/language-modelling-with-nltk-20eac7e70853#b9bf

import nltk
from nltk.util import ngrams
# from nltk import word_tokenize

unigram=[]
bigram=[]
trigram=[]
fourgram=[]
tokenized_text = flatten(unigram)

#unigram, bigram, trigram, and fourgram models are created
for sequence in full_DA:
    unigram.append(sequence)

bigram.extend(list(ngrams(sequence, 2)))  
trigram.extend(list(ngrams(sequence, 3)))
fourgram.extend(list(ngrams(sequence, 4)))

freq_uni = Counter(flatten(unigram))
freq_bi = nltk.FreqDist(bigram)
freq_tri = nltk.FreqDist(trigram)
freq_four = nltk.FreqDist(fourgram)


print("Most common n-grams without add-1 smoothing: \n")
print ("Most common unigrams: \n", freq_uni.most_common(5))
print ("Most common bigrams: \n", freq_bi.most_common(5))
print ("\nMost common trigrams: \n", freq_tri.most_common(5))
print ("\nMost common fourgrams: \n", freq_four.most_common(5))

Most common n-grams without add-1 smoothing: 

Most common unigrams: 
 [('<UNK>', 197778), ('U_sequence_closer', 57516), ('A_greeting', 53345), ('A_sequence_closer', 47482), ('U_confirmation', 47295)]
Most common bigrams: 
 [(('U_confirmation', 'U_sequence_closer'), 6), (('<UNK>', 'A_greeting'), 3), (('U_sequence_closer', '<UNK>'), 3), (('U_greeting', 'U_confirmation'), 3), (('A_greeting', 'U_confirmation'), 2)]

Most common trigrams: 
 [(('U_confirmation', 'U_sequence_closer', '<UNK>'), 3), (('U_greeting', 'U_confirmation', 'U_sequence_closer'), 3), (('<UNK>', 'A_greeting', 'U_confirmation'), 2), (('A_greeting', 'U_confirmation', 'U_sequence_closer'), 2), (('<s>', 'A_greeting', '<UNK>'), 1)]

Most common fourgrams: 
 [(('<UNK>', 'A_greeting', 'U_confirmation', 'U_sequence_closer'), 2), (('U_greeting', 'U_confirmation', 'U_sequence_closer', '<UNK>'), 2), (('<s>', 'A_greeting', '<UNK>', 'A_greeting'), 1), (('A_greeting', '<UNK>', 'A_greeting', 'U_confirmation'), 1), (('A_greeting', 'U_c

In [10]:
bigram

[('<s>', 'A_greeting'),
 ('A_greeting', '<UNK>'),
 ('<UNK>', 'A_greeting'),
 ('A_greeting', 'U_confirmation'),
 ('U_confirmation', 'U_sequence_closer'),
 ('U_sequence_closer', '<UNK>'),
 ('<UNK>', 'A_greeting'),
 ('A_greeting', 'U_confirmation'),
 ('U_confirmation', 'U_sequence_closer'),
 ('U_sequence_closer', 'U_greeting'),
 ('U_greeting', '<UNK>'),
 ('<UNK>', 'A_greeting'),
 ('A_greeting', 'A_greeting'),
 ('A_greeting', 'U_greeting'),
 ('U_greeting', 'U_confirmation'),
 ('U_confirmation', 'U_sequence_closer'),
 ('U_sequence_closer', 'A_hold_request'),
 ('A_hold_request', 'A_greeting'),
 ('A_greeting', 'A_confirmation'),
 ('A_confirmation', 'U_greeting'),
 ('U_greeting', 'U_confirmation'),
 ('U_confirmation', 'U_sequence_closer'),
 ('U_sequence_closer', '<UNK>'),
 ('<UNK>', 'U_greeting'),
 ('U_greeting', 'U_confirmation'),
 ('U_confirmation', 'U_sequence_closer'),
 ('U_sequence_closer', '<UNK>'),
 ('<UNK>', 'U_confirmation'),
 ('U_confirmation', 'U_sequence_closer'),
 ('U_sequence_clo

In [27]:
#Add-1 smoothing is performed here. Different value might be better
            
ngrams_all = {1:[], 2:[], 3:[], 4:[]} #from unigram to fourgram in this case
for i in range(4):
    for each in unigram:
        for j in ngrams(each, i+1):
            ngrams_all[i+1].append(j);
ngrams_voc = {1:set([]), 2:set([]), 3:set([]), 4:set([])} #set() method is used to convert any of the iterable to sequence of iterable elements with distinct elements

for i in range(4):
    for gram in ngrams_all[i+1]:
        if gram not in ngrams_voc[i+1]:
            ngrams_voc[i+1].add(gram)
total_ngrams = {1:-1, 2:-1, 3:-1, 4:-1}
total_voc = {1:-1, 2:-1, 3:-1, 4:-1}

for i in range(4):
    total_ngrams[i+1] = len(ngrams_all[i+1])
    total_voc[i+1] = len(ngrams_voc[i+1])                       
    
ngrams_prob = {1:[], 2:[], 3:[], 4:[]}

for i in range(4):
    for ngram in ngrams_voc[i+1]:
        tlist = [ngram]
        tlist.append(ngrams_all[i+1].count(ngram))
        ngrams_prob[i+1].append(tlist)
    
for i in range(4):
    for ngram in ngrams_prob[i+1]:
        ngram[-1] = (ngram[-1])/(total_ngrams[i+1]+total_voc[i+1])             


In [28]:
#Prints top 10 unigram, bigram, trigram, fourgram
print("Most common n-grams without stopword removal and with add-1 smoothing: \n")
for i in range(4):
    ngrams_prob[i+1] = sorted(ngrams_prob[i+1], key = lambda x:x[1], reverse = True)
    
print ("Most common unigrams: ", str(ngrams_prob[1][:10]))
print ("\nMost common bigrams: ", str(ngrams_prob[2][:10]))
print ("\nMost common trigrams: ", str(ngrams_prob[3][:10]))
print ("\nMost common fourgrams: ", str(ngrams_prob[4][:10]))

Most common n-grams without stopword removal and with add-1 smoothing: 

Most common unigrams:  [[('<UNK>',), 0.3616928670182805], [('U_sequence_closer',), 0.10518423150918414], [('A_greeting',), 0.09755638135227464], [('A_sequence_closer',), 0.08683423187494056], [('U_confirmation',), 0.0864922496214421], [('A_confirmation',), 0.086044198005896], [('U_greeting',), 0.06960710445271867], [('<e>',), 0.03161781380072127], [('<s>',), 0.03161781380072127], [('A_completion_check',), 0.01180295970095755]]

Most common bigrams:  [[('<UNK>', '<UNK>'), 0.18398994369754085], [('A_confirmation', 'A_sequence_closer'), 0.05414888536985412], [('U_confirmation', 'U_sequence_closer'), 0.05134980568762422], [('<UNK>', 'U_confirmation'), 0.046080061606177225], [('<UNK>', 'A_confirmation'), 0.04550627858298242], [('U_sequence_closer', '<UNK>'), 0.038437800221585945], [('A_sequence_closer', '<UNK>'), 0.035202720939494204], [('<UNK>', 'A_greeting'), 0.031812871236672285], [('U_greeting', '<UNK>'), 0.0299235

### To calculate probability of unseen sequences

In [34]:
def ngram_prediction(tokenized_sent):
    ngram = {1:[], 2:[], 3:[]}#to store n-grams formed from the strings

    for i in range(1, 4):
        ngram[i] = list(ngrams(tokenized_sent, i))[-1]
    
    print("String: ", ngram)
    
    for j in range(4):
        ngrams_prob[j+1] = sorted(ngrams_prob[j+1], key = lambda x:x[1], reverse = True)
    
    pred = {1:[], 2:[], 3:[]}
    for k in range(3):
        count = 0
        for each in ngrams_prob[k+2]:
            if each[0][:-1] == ngram[k+1]:#to find predictions based on highest probability of n-grams                   
                count +=1
                pred[k+1].append(each[0][-1])
                if count ==5:
                    break
        if count<5:
            while(count!=5):
                pred[k+1].append("NA")#if no word prediction is found, replace with NOT FOUND
                count +=1
                
    return pred, ngrams_prob

token_1 = ['<s>','U_greeting','<UNK>','A_greeting','A_confirmation']
token_2 = ['<s>','U_greeting','A_greeting']
pred_1, ngrams_prob_1 = ngram_prediction(token_1)
pred_2, ngrams_prob_2 = ngram_prediction(token_2)
    

String:  {1: ('A_confirmation',), 2: ('A_greeting', 'A_confirmation'), 3: ('<UNK>', 'A_greeting', 'A_confirmation')}
String:  {1: ('A_greeting',), 2: ('U_greeting', 'A_greeting'), 3: ('<s>', 'U_greeting', 'A_greeting')}


In [35]:
print("Next word predictions for the strings using the probability models of bigrams, trigrams, and fourgrams\n")
print("String 1 \n")
print("Bigram model predictions: {}\nTrigram model predictions: {}\nFourgram model predictions: {}\n" .format(pred_1[1], pred_1[2], pred_1[3]))
print("String 2 \n")
print("Bigram model predictions: {}\nTrigram model predictions: {}\nFourgram model predictions: {}" .format(pred_2[1], pred_2[2], pred_2[3]))

Next word predictions for the strings using the probability models of bigrams, trigrams, and fourgrams

String 1 

Bigram model predictions: ['A_sequence_closer', '<UNK>', 'U_confirmation', 'U_sequence_closer', 'A_greeting']
Trigram model predictions: ['A_sequence_closer', '<UNK>', 'A_completion_check', 'U_confirmation', 'U_greeting']
Fourgram model predictions: ['A_sequence_closer', '<UNK>', 'A_completion_check', 'U_confirmation', 'A_greeting']

String 2 

Bigram model predictions: ['<UNK>', 'U_greeting', 'A_confirmation', 'A_greeting', 'U_confirmation']
Trigram model predictions: ['<UNK>', 'A_confirmation', 'U_greeting', 'U_confirmation', 'A_greeting']
Fourgram model predictions: ['<UNK>', 'A_greeting', 'A_confirmation', 'U_greeting', 'U_confirmation']


In [36]:
#Dict of probabilities from unigram to fourgram

ngrams_prob

{1: [[('<UNK>',), 0.3616928670182805],
  [('U_sequence_closer',), 0.10518423150918414],
  [('A_greeting',), 0.09755638135227464],
  [('A_sequence_closer',), 0.08683423187494056],
  [('U_confirmation',), 0.0864922496214421],
  [('A_confirmation',), 0.086044198005896],
  [('U_greeting',), 0.06960710445271867],
  [('<e>',), 0.03161781380072127],
  [('<s>',), 0.03161781380072127],
  [('A_completion_check',), 0.01180295970095755],
  [('A_hold_request',), 0.0107751841583579],
  [('A_receipt',), 0.007823529842066378],
  [('A_request_summary',), 0.0033265546476668397],
  [('A_repair_initiator',), 0.0028565576468694907],
  [('A_disconfirmation',), 0.0023371835292568563],
  [('U_request_summary',), 0.0013587851034724914],
  [('U_disconfirmation',), 0.001316723115074285],
  [('U_repair_initiator',), 0.0010076589394526821],
  [('U_receipt',), 0.0002852900082660951],
  [('U_completion_check',), 0.00025968705880631733],
  [('U_hold_request',), 0.00016459038938428563]],
 2: [[('<UNK>', '<UNK>'), 0.18

### Perplexity
An intrinsic evaluation metric is one that measures the quality of a model independent of any application.

The perplexity of a language model on a test set is the inverse probability of the test set, normalized by the number of words. Thus the higher the conditional probability of the word sequence, the lower the perplexity, and maximizing the perplexity is equivalent to maximizing the test set probability according to the language model.

https://stackoverflow.com/questions/54941966/how-can-i-calculate-perplexity-using-nltk/55043954#55043954?newreg=b97aa34187184c90988f9a75e51898c2

##### Here we calculate perprexity in the whole dataset. without train, dev, test. Why though?? 

In [117]:
import nltk
from nltk.lm.preprocessing import padded_everygram_pipeline
from nltk.lm import MLE
from nltk.lm import Vocabulary


n = 2
train_data = [nltk.bigrams(t,  pad_right=True, pad_left=True, left_pad_symbol="<s>", right_pad_symbol="<e>") for t in unigram]
words = [word for sent in unigram for word in sent]
words.extend(["<s>", "<e>"])
padded_vocab = Vocabulary(words)
model = MLE(n)
model.fit(train_data, padded_vocab)

# unigram = unigram[0:2] # if you want to test just small sample

test_data = [nltk.bigrams(t,  pad_right=True, pad_left=True, left_pad_symbol="<s>", right_pad_symbol="<e>") for t in unigram]

#for each bigram MLE estimate (tuples estimate)
#if you don't want to see MLE per tuple comment print line
for test in test_data:
#     print ("MLE Estimates:", [((ngram[-1], ngram[:-1]),model.score(ngram[-1], ngram[:-1])) for ngram in test]) # will generate one estimator per input. Here each input contains all DA labels for one dialog sequence.
    pass

#for whole dialog perplexity calculation.
#if print unigram[i] you see sentence and perplexity. 
#to make it easier to read perplexities (not caring about the specific sentence) print second line (only i)

test_data = [nltk.bigrams(t,  pad_right=True, pad_left=True, left_pad_symbol="<s>", right_pad_symbol="<e>") for t in unigram]
for i, test in enumerate(test_data):
#     print("PP({0}):{1}".format(unigram[i], model.perplexity(test)))
    print("PP({0}):{1}".format(i, model.perplexity(test)))



PP(0):4.688240911747697
PP(1):4.995226693886854
PP(2):5.479673856410271
PP(3):4.36005081184385
PP(4):6.015384907919331
PP(5):3.31409328786099
PP(6):5.335815297457647
PP(7):3.7055614342471195
PP(8):4.544332069871523
PP(9):4.793171336663411
PP(10):6.361018326525868
PP(11):4.765981831881959
PP(12):6.749021361436664
PP(13):5.290047936933452
PP(14):3.4363763001693517
PP(15):3.7647745132970676
PP(16):3.946722278799815
PP(17):4.817811682323476
PP(18):5.630302357820929
PP(19):5.10892081514422
PP(20):4.77958533951873
PP(21):5.415090367902473
PP(22):4.6561088571744005
PP(23):4.371059984140332
PP(24):3.659902774959437
PP(25):4.901811266260283
PP(26):5.251275784675293
PP(27):5.23548341539132
PP(28):4.0061932474099
PP(29):3.244409479256656
PP(30):6.225819049992292
PP(31):5.618844291577963
PP(32):6.307207107530651
PP(33):4.634933798585472
PP(34):6.230253117155856
PP(35):5.613660667256988
PP(36):4.464979298766235
PP(37):5.62185732974366
PP(38):6.819430526175091
PP(39):3.863429969441992
PP(40):7.27208

PP(783):4.028178926008791
PP(784):3.748636400052477
PP(785):3.6756272762002684
PP(786):3.7674907850917156
PP(787):6.141161815014958
PP(788):4.813801362825546
PP(789):4.0079294322473435
PP(790):4.163714388816804
PP(791):8.755725203736818
PP(792):4.986541911006334
PP(793):3.9076759417693063
PP(794):3.40362266291688
PP(795):5.305904230370828
PP(796):5.512259321569736
PP(797):3.2741307158497412
PP(798):4.4067865610189605
PP(799):4.306422180280863
PP(800):5.428803947389988
PP(801):5.12695989157374
PP(802):6.318445516409249
PP(803):3.997956772666648
PP(804):5.056322764406685
PP(805):4.227126758382728
PP(806):5.4689412597994815
PP(807):7.805632226091289
PP(808):4.3423264747079715
PP(809):4.137893445797057
PP(810):4.867844446930126
PP(811):3.6748151352540703
PP(812):5.224523029721432
PP(813):4.328137071583569
PP(814):5.846429558660311
PP(815):3.8225414257192507
PP(816):6.177113094083453
PP(817):7.1277296284795115
PP(818):5.31447925504002
PP(819):5.580953570995552
PP(820):3.1874208448099925
PP(

PP(1337):5.812575417726533
PP(1338):4.480867886037742
PP(1339):3.826913110046293
PP(1340):4.342547490195838
PP(1341):4.200584053947334
PP(1342):5.373141900122318
PP(1343):4.0820599386961645
PP(1344):6.202247141105298
PP(1345):6.5066744265081
PP(1346):5.6518623425511905
PP(1347):5.378221334821732
PP(1348):4.200870230920343
PP(1349):4.3474711823135035
PP(1350):4.984353279161712
PP(1351):4.485794305100469
PP(1352):5.735591476508776
PP(1353):5.298153986913975
PP(1354):4.280927959194613
PP(1355):4.458316849769445
PP(1356):5.867813718354766
PP(1357):3.1792064578493346
PP(1358):4.330155509953197
PP(1359):6.648052228993836
PP(1360):4.837218595633109
PP(1361):5.6366061538991445
PP(1362):4.131079848954719
PP(1363):4.413452729017138
PP(1364):4.282476064709022
PP(1365):6.51494248273425
PP(1366):6.246151316519521
PP(1367):6.6461119169975955
PP(1368):3.55258705860646
PP(1369):4.759196763534976
PP(1370):5.331691881882862
PP(1371):4.07587242934805
PP(1372):4.923237296567349
PP(1373):4.310192455253735


PP(1911):4.661291091418644
PP(1912):3.37490282566887
PP(1913):4.024953570854425
PP(1914):7.080724352246109
PP(1915):3.5505111321481366
PP(1916):5.073753523989257
PP(1917):4.446873351293024
PP(1918):3.9740485492008286
PP(1919):4.545796672878002
PP(1920):3.7320005606507105
PP(1921):5.699923321476049
PP(1922):4.720573885029629
PP(1923):5.833463582637263
PP(1924):6.593932381652173
PP(1925):4.001212766405952
PP(1926):3.585611928767227
PP(1927):3.925551469401088
PP(1928):3.665822503710828
PP(1929):4.335604969373379
PP(1930):4.568268733121427
PP(1931):4.929251965388915
PP(1932):6.915476194313621
PP(1933):6.185989724237313
PP(1934):3.572114993023217
PP(1935):4.892727427491991
PP(1936):5.3676581496759725
PP(1937):3.5065743904148765
PP(1938):5.359811903978038
PP(1939):5.957690412174697
PP(1940):4.270704855526864
PP(1941):3.637023366168496
PP(1942):3.2798001507598102
PP(1943):4.853508450746376
PP(1944):3.173460266170404
PP(1945):4.388898411208821
PP(1946):3.556360856409781
PP(1947):6.708921230726

PP(2549):3.2488797830070273
PP(2550):5.576182794913849
PP(2551):5.405893538994027
PP(2552):4.743176535220307
PP(2553):4.369110020135032
PP(2554):4.60069116364473
PP(2555):6.496399853437176
PP(2556):4.23514578819208
PP(2557):3.700996299794738
PP(2558):6.862113973985361
PP(2559):8.15384944939249
PP(2560):6.914051989753736
PP(2561):3.7487707436813356
PP(2562):4.71773931213362
PP(2563):6.141847790411042
PP(2564):4.953273822475617
PP(2565):4.906156123276213
PP(2566):5.312026544721554
PP(2567):3.582401593864358
PP(2568):4.371004041548585
PP(2569):4.118431189167047
PP(2570):4.6796285795501555
PP(2571):4.198683756348586
PP(2572):4.385625866967975
PP(2573):3.906591894997986
PP(2574):4.69152937660747
PP(2575):5.939809627274168
PP(2576):4.60979276529626
PP(2577):5.132547930374114
PP(2578):6.5583799063690025
PP(2579):4.6217214535529205
PP(2580):5.509331633894204
PP(2581):4.010449074224041
PP(2582):6.599424581750468
PP(2583):4.500074733458488
PP(2584):4.500348210637394
PP(2585):4.016799137120081
PP

PP(3336):4.69146204673371
PP(3337):8.233083293211806
PP(3338):5.298531238129392
PP(3339):5.218464824638633
PP(3340):3.913868060080917
PP(3341):3.9134762875758726
PP(3342):4.424131107632032
PP(3343):5.990631182881564
PP(3344):4.289921060969193
PP(3345):6.004868454329408
PP(3346):5.446945148732076
PP(3347):6.951473980165701
PP(3348):4.10996732124974
PP(3349):5.09279383435796
PP(3350):5.161990542449922
PP(3351):5.508978255992066
PP(3352):6.767551384925031
PP(3353):3.878466681294695
PP(3354):6.9393843698140465
PP(3355):4.328727626452871
PP(3356):5.3445012607293165
PP(3357):5.885330562012622
PP(3358):8.595324727573828
PP(3359):4.381804837706197
PP(3360):4.4561918393395805
PP(3361):4.5932326963672985
PP(3362):4.218528690348061
PP(3363):5.495922471795888
PP(3364):4.121430484480393
PP(3365):8.73549448260235
PP(3366):3.7217942317792754
PP(3367):5.5797944033667415
PP(3368):9.575494092206268
PP(3369):3.8377065114837747
PP(3370):5.851359866158665
PP(3371):5.577478792113243
PP(3372):4.6299735743397

PP(3936):4.614639925704424
PP(3937):3.958180943390093
PP(3938):4.5680149881879775
PP(3939):4.73474538157546
PP(3940):5.11950237137169
PP(3941):4.943251563121504
PP(3942):4.434706219169214
PP(3943):4.88657826264945
PP(3944):4.614550269283197
PP(3945):5.140386891411694
PP(3946):5.886692648094003
PP(3947):4.809787804407178
PP(3948):4.62205400175223
PP(3949):5.640117990494845
PP(3950):5.143490544401575
PP(3951):4.777479794613466
PP(3952):5.789649583035821
PP(3953):5.678963653817349
PP(3954):6.406996364617372
PP(3955):3.904669254893711
PP(3956):2.7291046564476016
PP(3957):3.975808142800123
PP(3958):5.145553671744797
PP(3959):4.811604281464307
PP(3960):6.4692410163354355
PP(3961):4.395073278172725
PP(3962):6.714518387519544
PP(3963):5.626330395180194
PP(3964):5.52288538557371
PP(3965):3.9013426378515117
PP(3966):4.705396649785279
PP(3967):5.510871822826448
PP(3968):3.6847492649479054
PP(3969):5.033214282922723
PP(3970):5.265534677635502
PP(3971):5.243740142422294
PP(3972):3.9833998443578884


PP(4502):5.02017216206817
PP(4503):4.1993262340221
PP(4504):6.315374153825262
PP(4505):5.8969096458156915
PP(4506):4.236470825636562
PP(4507):3.616595310539925
PP(4508):5.146298576104134
PP(4509):4.776553153002177
PP(4510):5.539620459072743
PP(4511):4.515042246609912
PP(4512):6.2285985867217954
PP(4513):5.680399710578238
PP(4514):4.103382623788866
PP(4515):5.827946910894637
PP(4516):6.472997162654052
PP(4517):4.313351090181266
PP(4518):3.6544144879727076
PP(4519):3.618024656260675
PP(4520):4.52057690741989
PP(4521):4.332179861769816
PP(4522):4.964448710294359
PP(4523):5.8256527178824635
PP(4524):4.3768908090658964
PP(4525):4.848700051769166
PP(4526):6.677147557823158
PP(4527):4.334979172756922
PP(4528):8.058330795261861
PP(4529):4.742862977646453
PP(4530):6.5840672255083525
PP(4531):3.820668000856503
PP(4532):4.730936026336546
PP(4533):5.3915462515021
PP(4534):4.4624183827662876
PP(4535):5.805293510935988
PP(4536):4.648662250230907
PP(4537):5.676473958196564
PP(4538):3.8077891470175977

PP(5056):4.638206156574721
PP(5057):4.8489305494502775
PP(5058):4.324751269020375
PP(5059):5.55024139339834
PP(5060):3.9825773642797238
PP(5061):6.626720704136313
PP(5062):5.616730403663803
PP(5063):3.8905183281670968
PP(5064):5.8088448031662
PP(5065):6.051245134914311
PP(5066):6.401465592769125
PP(5067):3.32527275562197
PP(5068):6.952167624276156
PP(5069):4.412346283137837
PP(5070):4.753160217117884
PP(5071):4.712826939223138
PP(5072):5.604096572273358
PP(5073):5.126305074565159
PP(5074):4.717573685090278
PP(5075):4.933878482248454
PP(5076):4.772276995212094
PP(5077):4.734809891980448
PP(5078):6.02022996407878
PP(5079):3.639020661843972
PP(5080):4.757829492407366
PP(5081):7.820654574233516
PP(5082):4.3727483008807315
PP(5083):5.662754100872641
PP(5084):7.811644742897389
PP(5085):3.2383377302381904
PP(5086):4.881237556653937
PP(5087):5.053410902288977
PP(5088):5.0489029882241185
PP(5089):4.792491680400206
PP(5090):4.893871305930292
PP(5091):4.972138227557205
PP(5092):6.377122407077981


PP(5585):4.0850932707595975
PP(5586):5.589344845252229
PP(5587):4.327275830458004
PP(5588):4.914210654892246
PP(5589):4.16171219559108
PP(5590):5.784330537678909
PP(5591):3.3387312594508147
PP(5592):6.931924806741543
PP(5593):4.648762679022003
PP(5594):5.173614925750746
PP(5595):4.571478533606333
PP(5596):4.431366403414428
PP(5597):4.28805495610246
PP(5598):4.933867525826778
PP(5599):5.016511993310351
PP(5600):5.774080072295809
PP(5601):5.6008306107484405
PP(5602):3.911503945565136
PP(5603):5.74792195046854
PP(5604):4.933195678329859
PP(5605):4.701425972198221
PP(5606):6.019486431351741
PP(5607):6.879241731174921
PP(5608):5.605963544878157
PP(5609):5.983380976500344
PP(5610):4.513656692171324
PP(5611):4.442165915417779
PP(5612):4.512042314833457
PP(5613):6.6700695875628595
PP(5614):3.857366284273641
PP(5615):4.392658731994036
PP(5616):5.455297387147678
PP(5617):6.108215826990143
PP(5618):4.895905010491429
PP(5619):4.202850211915576
PP(5620):5.008267065054363
PP(5621):5.70234405182607
P

PP(6180):5.563525626025501
PP(6181):3.5634635101535377
PP(6182):5.641737304766915
PP(6183):5.1323423071291465
PP(6184):4.891052957771142
PP(6185):5.873847333733288
PP(6186):6.123746916726088
PP(6187):3.815510444894818
PP(6188):4.622735459558231
PP(6189):4.393945233818335
PP(6190):4.331129962238585
PP(6191):4.945641675902778
PP(6192):5.5454863595108534
PP(6193):4.762924980987531
PP(6194):7.764936002605659
PP(6195):7.481506242123058
PP(6196):5.688003023600485
PP(6197):4.738296401224106
PP(6198):5.524787915019761
PP(6199):5.058141540028334
PP(6200):3.9952325713053356
PP(6201):5.120992346752386
PP(6202):3.484793038675205
PP(6203):5.12085876956285
PP(6204):4.247540959202467
PP(6205):4.204872406098617
PP(6206):5.033239637919714
PP(6207):4.13797130803446
PP(6208):5.550973255001549
PP(6209):5.2414291210533435
PP(6210):6.716151621709825
PP(6211):4.504323918536494
PP(6212):5.209778376561851
PP(6213):4.88056388809599
PP(6214):4.632935411039789
PP(6215):4.986288121324645
PP(6216):5.874502223712411

PP(6758):5.656883908782674
PP(6759):7.056110600547787
PP(6760):4.7241351954344415
PP(6761):3.4855859350984884
PP(6762):4.7642553380851345
PP(6763):4.909266716609383
PP(6764):4.55963422045097
PP(6765):3.0242076629472545
PP(6766):3.95334543226647
PP(6767):6.234022537433295
PP(6768):4.836455236228634
PP(6769):4.926983653041991
PP(6770):6.717314399215956
PP(6771):4.087972187411147
PP(6772):5.118875173529104
PP(6773):6.4252634608054215
PP(6774):3.870980404303035
PP(6775):3.68523064605339
PP(6776):3.9078694477376477
PP(6777):5.818981161024648
PP(6778):5.4796343425169125
PP(6779):4.021743218582919
PP(6780):4.734801044590584
PP(6781):5.572058101252006
PP(6782):6.331276983478368
PP(6783):5.108164093518211
PP(6784):3.720988029151883
PP(6785):3.854654820763025
PP(6786):4.5329297183615385
PP(6787):4.532454191740395
PP(6788):4.397810781854227
PP(6789):5.7574792677032125
PP(6790):5.617234619275208
PP(6791):5.758265915779458
PP(6792):7.757731439845114
PP(6793):4.726111453541999
PP(6794):5.98539575546

PP(7373):4.833493053573574
PP(7374):3.9862437444463246
PP(7375):4.395377411704842
PP(7376):4.646341657249621
PP(7377):4.593254727067042
PP(7378):4.361297127602856
PP(7379):5.674221909406737
PP(7380):4.157555249979927
PP(7381):5.341609069252345
PP(7382):5.813716764900978
PP(7383):4.456971143468228
PP(7384):4.304022615388133
PP(7385):5.555187425767554
PP(7386):4.234197940452966
PP(7387):4.52611588581875
PP(7388):6.834147994175305
PP(7389):5.442373694699739
PP(7390):4.605503620222293
PP(7391):3.5182563267460045
PP(7392):5.770499910375124
PP(7393):6.0335030921872805
PP(7394):4.721743156345176
PP(7395):4.088003267512189
PP(7396):8.444760707100654
PP(7397):5.247811509213126
PP(7398):6.5602223102442165
PP(7399):4.724940799221943
PP(7400):5.304623276331731
PP(7401):3.7586591967311396
PP(7402):4.408032075477634
PP(7403):4.379801210230458
PP(7404):3.4068978219784816
PP(7405):3.7795286873702727
PP(7406):3.3710324779122707
PP(7407):3.433471607653739
PP(7408):5.251035181311436
PP(7409):4.7226312305

PP(8026):3.2040804149522035
PP(8027):5.061339418773271
PP(8028):5.816064551702022
PP(8029):5.629628340145975
PP(8030):4.8253709443708015
PP(8031):6.469861181727559
PP(8032):4.054097780940186
PP(8033):4.832887816590564
PP(8034):8.552466529624388
PP(8035):6.075254483627437
PP(8036):4.548517033289648
PP(8037):5.386896680496954
PP(8038):5.042624642760279
PP(8039):4.115763246244527
PP(8040):5.539441226187525
PP(8041):4.72993588128001
PP(8042):5.234113559310728
PP(8043):4.444347071905471
PP(8044):4.321943838783593
PP(8045):5.704909200290205
PP(8046):4.367229869730913
PP(8047):5.41555018076072
PP(8048):4.2441407957776
PP(8049):4.70262135111926
PP(8050):5.4035476333688495
PP(8051):5.818662738895555
PP(8052):4.444020187006278
PP(8053):4.544517323799471
PP(8054):6.2466176239142746
PP(8055):4.215072995800671
PP(8056):4.650884054309682
PP(8057):5.029794095145445
PP(8058):5.3163930444658325
PP(8059):5.990164612330121
PP(8060):3.594042180983614
PP(8061):5.304904636196308
PP(8062):6.396101636573392
P

PP(8652):4.697283027765048
PP(8653):3.7530258103342216
PP(8654):4.493880256709242
PP(8655):3.5729219592671067
PP(8656):5.019790997543349
PP(8657):5.243729123076787
PP(8658):9.363624325977764
PP(8659):4.882553228818758
PP(8660):5.168096493703388
PP(8661):5.724221750084142
PP(8662):7.620082820598718
PP(8663):4.899431975576634
PP(8664):5.919549621460872
PP(8665):5.466678502582209
PP(8666):4.747428942141936
PP(8667):5.640781561646413
PP(8668):4.10182573311726
PP(8669):5.051262071247755
PP(8670):4.2715108197363385
PP(8671):4.197535430987937
PP(8672):3.5268964913043264
PP(8673):4.7413409056509925
PP(8674):8.60671631147956
PP(8675):6.517742195777363
PP(8676):4.2566192361091835
PP(8677):3.923541900439908
PP(8678):3.634341988946647
PP(8679):4.904957574634826
PP(8680):5.95074189021032
PP(8681):6.267893629302195
PP(8682):4.784586777349993
PP(8683):9.759352120428035
PP(8684):4.967506247207558
PP(8685):4.466571080651818
PP(8686):4.633463569867749
PP(8687):6.73081056960417
PP(8688):6.464535031922723

PP(9328):8.625695509711509
PP(9329):6.5115858551873105
PP(9330):6.724919862333479
PP(9331):7.48063936872918
PP(9332):8.924994812802627
PP(9333):4.4825980642881325
PP(9334):6.01812794633814
PP(9335):5.720620311086257
PP(9336):5.389983786403915
PP(9337):3.5054750479668537
PP(9338):3.958016938933896
PP(9339):8.708222534597242
PP(9340):5.463350165082026
PP(9341):5.774041074727867
PP(9342):4.3795437296536015
PP(9343):4.948882016484337
PP(9344):6.460197228146601
PP(9345):6.650589071436769
PP(9346):7.414790714518383
PP(9347):4.952561121951869
PP(9348):3.7914479343377185
PP(9349):4.535495049446124
PP(9350):3.7147824294468137
PP(9351):6.2859527686255845
PP(9352):9.954389362090266
PP(9353):4.820545314423107
PP(9354):12.426016760834592
PP(9355):4.480781405881321
PP(9356):5.533382827632907
PP(9357):4.628465584364053
PP(9358):4.46441149893062
PP(9359):6.332742460057891
PP(9360):4.778592984775513
PP(9361):4.759580389004902
PP(9362):5.51584572989228
PP(9363):4.632311413946094
PP(9364):3.9443583542652

PP(9889):4.550785252329681
PP(9890):6.382759271121702
PP(9891):3.525791964533108
PP(9892):4.081010485915574
PP(9893):6.671983941975313
PP(9894):6.488751247535232
PP(9895):4.652644820506472
PP(9896):3.3521732198536522
PP(9897):4.903168203023425
PP(9898):4.170573506052139
PP(9899):4.851494782029292
PP(9900):10.183689440113401
PP(9901):3.571856408612173
PP(9902):3.6529545266921177
PP(9903):5.4259415095939145
PP(9904):6.33432136000847
PP(9905):7.664855812504933
PP(9906):5.371181112563654
PP(9907):5.228002739122532
PP(9908):5.413098320276804
PP(9909):6.8185510642808795
PP(9910):5.115423422763656
PP(9911):5.313440172030365
PP(9912):3.5289155441297075
PP(9913):5.452081943829611
PP(9914):5.101718319551334
PP(9915):4.8672641144789335
PP(9916):5.775951265213831
PP(9917):6.664947441222035
PP(9918):5.212104157510478
PP(9919):5.246728542736416
PP(9920):5.503929609662738
PP(9921):9.786796142353309
PP(9922):4.188323178186525
PP(9923):4.939309073167983
PP(9924):5.013399562069576
PP(9925):6.81143534880

PP(10540):4.698898245819935
PP(10541):4.72274784982269
PP(10542):5.685529187226372
PP(10543):5.035317704467652
PP(10544):4.163706262765366
PP(10545):5.687171563133799
PP(10546):4.92712869975747
PP(10547):7.312513490827123
PP(10548):5.241468759121162
PP(10549):6.0188526485654394
PP(10550):6.048013486246447
PP(10551):5.337441883639508
PP(10552):5.941489605029088
PP(10553):7.135421482540475
PP(10554):5.399746688276991
PP(10555):6.500263835191025
PP(10556):4.931136331656054
PP(10557):3.8141060956046378
PP(10558):4.685823019422989
PP(10559):6.581300834382876
PP(10560):5.732770979600159
PP(10561):5.5986859718226505
PP(10562):4.3137382734059715
PP(10563):7.091966973617826
PP(10564):5.80171107911631
PP(10565):3.8931229574831905
PP(10566):6.820530529106425
PP(10567):4.118971341279
PP(10568):4.5940652610811075
PP(10569):2.7311223550467547
PP(10570):5.357821225388626
PP(10571):6.2038376805592375
PP(10572):6.2614693069474825
PP(10573):5.096748871626711
PP(10574):5.302258912527307
PP(10575):6.67769

PP(11138):3.5215381265448795
PP(11139):4.360487298046298
PP(11140):4.671152556998575
PP(11141):5.055100841535363
PP(11142):4.477935427001854
PP(11143):3.3203475535239666
PP(11144):5.700434750048862
PP(11145):5.18649398908706
PP(11146):3.99436748279641
PP(11147):5.65661094028523
PP(11148):4.235542407788565
PP(11149):4.223884844592108
PP(11150):7.8493184205465525
PP(11151):5.125426588184771
PP(11152):3.803070397363028
PP(11153):6.334389782586404
PP(11154):5.2087518104962625
PP(11155):4.792181405323235
PP(11156):4.438040211597002
PP(11157):5.861553614982699
PP(11158):4.789822798968394
PP(11159):5.907635842683154
PP(11160):4.273378077823466
PP(11161):4.443964304333772
PP(11162):6.04228922138162
PP(11163):5.758939829091745
PP(11164):6.201353394541511
PP(11165):3.68387058991709
PP(11166):5.409540252055845
PP(11167):4.724577370616113
PP(11168):5.336444857819561
PP(11169):5.760168214881449
PP(11170):3.966811566550162
PP(11171):5.384408805665606
PP(11172):4.934772513569431
PP(11173):3.693553969

PP(11699):3.061646883788994
PP(11700):4.998451042668045
PP(11701):4.029167069597024
PP(11702):6.250364638829545
PP(11703):6.016395240369269
PP(11704):6.014525109251902
PP(11705):4.905017302213126
PP(11706):5.706549652516913
PP(11707):6.073591386332474
PP(11708):3.8629810614972704
PP(11709):4.831848021392803
PP(11710):5.508312650774745
PP(11711):4.774955493127621
PP(11712):5.939221355144363
PP(11713):7.028398060092687
PP(11714):4.056919777432968
PP(11715):3.9643429696382673
PP(11716):3.2673818558740706
PP(11717):4.681690520311492
PP(11718):4.748342974261544
PP(11719):4.815150803961911
PP(11720):4.90845006923509
PP(11721):5.069324958009503
PP(11722):5.6437133769707
PP(11723):4.816538910775637
PP(11724):3.3168943471014387
PP(11725):5.38625105324752
PP(11726):4.343318416020479
PP(11727):5.521688103129454
PP(11728):5.538797351032283
PP(11729):6.853298338737509
PP(11730):5.476235068680657
PP(11731):7.240650390171172
PP(11732):6.895668749630851
PP(11733):5.067220869798585
PP(11734):3.94296315

PP(12278):3.653706583474982
PP(12279):4.536687534712553
PP(12280):4.830238779545905
PP(12281):4.980262543028868
PP(12282):5.080012484262672
PP(12283):5.078427696184772
PP(12284):6.845845682642013
PP(12285):5.350579608065412
PP(12286):3.763536593080319
PP(12287):4.298579137379762
PP(12288):5.5207488310198585
PP(12289):6.741976682670706
PP(12290):4.634481075374017
PP(12291):5.006382524384391
PP(12292):4.1491774633982175
PP(12293):6.3348188572034525
PP(12294):4.422514805088537
PP(12295):4.592533891347777
PP(12296):4.940936265046358
PP(12297):4.385522791126165
PP(12298):4.8141568456362815
PP(12299):4.756753955501513
PP(12300):5.78850590514588
PP(12301):3.8670705625865045
PP(12302):4.239406651291376
PP(12303):5.209141523300658
PP(12304):7.229026100451182
PP(12305):5.271543311789134
PP(12306):5.939457894472223
PP(12307):6.22304023246498
PP(12308):6.4535349464296745
PP(12309):4.932975598783881
PP(12310):5.818306778696264
PP(12311):2.562271576393142
PP(12312):6.721743200020953
PP(12313):6.3639

PP(13061):4.863713840092919
PP(13062):4.623058056275571
PP(13063):3.975334790697293
PP(13064):5.593973137974675
PP(13065):5.1205402535560545
PP(13066):5.032518406973201
PP(13067):4.137062563538778
PP(13068):4.330619468955659
PP(13069):6.33815065526359
PP(13070):5.871519054298675
PP(13071):4.8800826169543186
PP(13072):3.592530857745407
PP(13073):3.6077148507096988
PP(13074):4.7082915312245674
PP(13075):4.314815155378239
PP(13076):7.6443025022914295
PP(13077):5.33390278876157
PP(13078):3.6032960278185016
PP(13079):3.9270104550209686
PP(13080):4.942098009796107
PP(13081):5.606865431024832
PP(13082):4.375221035072172
PP(13083):5.285058698059502
PP(13084):5.386520726995475
PP(13085):5.064541883693494
PP(13086):5.636710079139599
PP(13087):5.710541117243427
PP(13088):4.39927410515827
PP(13089):5.729867970533201
PP(13090):10.548607683215332
PP(13091):5.718255223525958
PP(13092):7.06099882246349
PP(13093):4.1462038782168085
PP(13094):4.902503950611099
PP(13095):4.825096205774505
PP(13096):5.052

PP(13744):5.465063266863447
PP(13745):5.980402415726409
PP(13746):5.51888318231934
PP(13747):4.327540191003026
PP(13748):4.58945118426345
PP(13749):6.417904675911343
PP(13750):7.8878385521892636
PP(13751):4.5156073527956035
PP(13752):4.73272569786691
PP(13753):5.85809626648375
PP(13754):5.334415911172585
PP(13755):5.260674221095142
PP(13756):6.226860334430748
PP(13757):5.456437170918696
PP(13758):3.160391548743798
PP(13759):4.576932308802425
PP(13760):5.239841602071925
PP(13761):3.840203421524219
PP(13762):5.404400216825501
PP(13763):5.638243982542774
PP(13764):4.21792876247252
PP(13765):3.725691769573745
PP(13766):6.67539940463654
PP(13767):5.507523113535523
PP(13768):4.628163101932907
PP(13769):6.136093573802027
PP(13770):6.315243349581113
PP(13771):4.497869027864609
PP(13772):5.164721018512016
PP(13773):4.686243583238068
PP(13774):5.055190050310873
PP(13775):5.421518178826585
PP(13776):7.16092309038204
PP(13777):5.824421642935129
PP(13778):4.943134044714918
PP(13779):5.5220086396891

PP(14327):4.061650798274062
PP(14328):4.8040259543506005
PP(14329):5.795546601576947
PP(14330):4.906402628682507
PP(14331):2.8317380269612467
PP(14332):5.6369940224616535
PP(14333):3.644734298744882
PP(14334):5.1423377429507315
PP(14335):5.004395762779991
PP(14336):4.443278176680245
PP(14337):5.196724331707419
PP(14338):4.1029128314667975
PP(14339):3.382372474806982
PP(14340):7.810705122058409
PP(14341):3.767203268569935
PP(14342):3.884642556535513
PP(14343):3.9099917588795847
PP(14344):4.102248039788904
PP(14345):4.816094600018315
PP(14346):4.097913589795204
PP(14347):4.5246104165291054
PP(14348):3.1465645671667297
PP(14349):4.825628503812206
PP(14350):5.016695052269423
PP(14351):7.671519085040393
PP(14352):5.690324331609013
PP(14353):6.390993665484724
PP(14354):4.515312143521857
PP(14355):5.395478775424002
PP(14356):6.661533406421746
PP(14357):4.523406742692829
PP(14358):7.1463830924119724
PP(14359):4.2257942512730775
PP(14360):3.8422912026191858
PP(14361):3.1279037311905857
PP(14362

PP(15184):4.790651001872154
PP(15185):4.6830712231471745
PP(15186):4.407340752910983
PP(15187):8.28696017434603
PP(15188):4.041269864754638
PP(15189):5.210816441019039
PP(15190):5.365108848641255
PP(15191):7.312238881495349
PP(15192):4.461806517494628
PP(15193):6.906941837109009
PP(15194):5.859093951710235
PP(15195):4.577365410130772
PP(15196):6.106711417655841
PP(15197):5.118755792298133
PP(15198):4.841313538978754
PP(15199):2.6921286747662307
PP(15200):6.447785986625925
PP(15201):5.173104212365739
PP(15202):3.5251121430077084
PP(15203):4.869672242582496
PP(15204):4.935714141056791
PP(15205):4.572262705812253
PP(15206):4.4845136300821355
PP(15207):4.8030008741540815
PP(15208):4.678925416342724
PP(15209):5.0212825216256185
PP(15210):4.383745135253813
PP(15211):7.481350159854292
PP(15212):5.294932501806158
PP(15213):3.968416506054061
PP(15214):5.120684168720142
PP(15215):2.6746147540953977
PP(15216):7.429720413044147
PP(15217):4.614010595137626
PP(15218):6.302238922280343
PP(15219):5.29

PP(15826):4.579828595020835
PP(15827):6.1878821444978644
PP(15828):5.5998033613799905
PP(15829):3.8410657842016156
PP(15830):4.240447817191743
PP(15831):4.732337641975163
PP(15832):5.361290422983707
PP(15833):6.4482006452005205
PP(15834):4.949044269691777
PP(15835):6.439598519241323
PP(15836):4.425499656572197
PP(15837):6.643140849375846
PP(15838):4.061111498291472
PP(15839):4.811252098314807
PP(15840):6.074424505687933
PP(15841):5.841702881704857
PP(15842):3.533758342497561
PP(15843):6.261929648700907
PP(15844):4.307937738815048
PP(15845):3.8302254744629662
PP(15846):5.206110848134386
PP(15847):6.886139542143425
PP(15848):5.6501156806386765
PP(15849):5.332562459457524
PP(15850):6.343190864939846
PP(15851):5.068992697037634
PP(15852):7.289092784097425
PP(15853):4.972164832841492
PP(15854):4.493905035498119
PP(15855):3.947046636065205
PP(15856):4.007536139470067
PP(15857):9.076296772424387
PP(15858):6.662088032385194
PP(15859):4.545869538315014
PP(15860):5.047394099337314
PP(15861):6.76

PP(16694):4.088592340088438
PP(16695):6.555065845622313
PP(16696):6.2686212196382884
PP(16697):4.3048751629125475
PP(16698):4.214363087303975
PP(16699):4.548840558986494
PP(16700):7.017494389920028
PP(16701):3.64471201258989
PP(16702):5.4072157359626924
PP(16703):5.445075477244584
PP(16704):5.474536072919907
PP(16705):4.775030826939327
PP(16706):6.974723463036372
PP(16707):6.103371836215983
PP(16708):4.452602233525861
PP(16709):7.596186604125937
PP(16710):7.721298160034646
PP(16711):5.684440751138963
PP(16712):5.156525203740982
PP(16713):3.9035803877673265
PP(16714):4.963525410135098
PP(16715):5.806930617121403
PP(16716):7.51524795536892
PP(16717):4.587140802168897
PP(16718):4.526135576872953
PP(16719):5.6824190084141595
PP(16720):5.093340405746318
PP(16721):5.7769325135115075
PP(16722):5.674564550163343
PP(16723):4.9168900162667475
PP(16724):7.166690520919057
PP(16725):4.768312149613403
PP(16726):4.093036928753341
PP(16727):5.296305071319511
PP(16728):7.8413905915120905
PP(16729):4.98

## HMM

In [45]:
for ngrams in ngrams_prob.values():
    for lista in ngrams:
        print(lista[1])


0.3616928670182805
0.10518423150918414
0.09755638135227464
0.08683423187494056
0.0864922496214421
0.086044198005896
0.06960710445271867
0.03161781380072127
0.03161781380072127
0.01180295970095755
0.0107751841583579
0.007823529842066378
0.0033265546476668397
0.0028565576468694907
0.0023371835292568563
0.0013587851034724914
0.001316723115074285
0.0010076589394526821
0.0002852900082660951
0.00025968705880631733
0.00016459038938428563
0.18398994369754085
0.05414888536985412
0.05134980568762422
0.046080061606177225
0.04550627858298242
0.038437800221585945
0.035202720939494204
0.031812871236672285
0.02992353963727098
0.0285268309623889
0.023013606584915167
0.01970491698076883
0.01933309048218536
0.01916510795236846
0.018963151427757133
0.016669906779133173
0.016050825096212466
0.014723951855074488
0.014314376473386094
0.013948212307268358
0.013247970525672071
0.013240420749051088
0.011934309393620817
0.011590794557366033
0.010667834365450712
0.008948372740021554
0.008761515768652195
0.008221

1.165811414459559e-05
1.165811414459559e-05
1.165811414459559e-05
1.165811414459559e-05
1.165811414459559e-05
9.715095120496325e-06
9.715095120496325e-06
9.715095120496325e-06
9.715095120496325e-06
9.715095120496325e-06
9.715095120496325e-06
9.715095120496325e-06
9.715095120496325e-06
9.715095120496325e-06
9.715095120496325e-06
9.715095120496325e-06
9.715095120496325e-06
9.715095120496325e-06
9.715095120496325e-06
9.715095120496325e-06
9.715095120496325e-06
9.715095120496325e-06
9.715095120496325e-06
9.715095120496325e-06
9.715095120496325e-06
9.715095120496325e-06
9.715095120496325e-06
9.715095120496325e-06
9.715095120496325e-06
9.715095120496325e-06
9.715095120496325e-06
9.715095120496325e-06
9.715095120496325e-06
9.715095120496325e-06
9.715095120496325e-06
9.715095120496325e-06
9.715095120496325e-06
9.715095120496325e-06
9.715095120496325e-06
9.715095120496325e-06
9.715095120496325e-06
9.715095120496325e-06
9.715095120496325e-06
9.715095120496325e-06
9.715095120496325e-06
9.71509512

0.0005592531697600152
0.0005592531697600152
0.0005553008505390963
0.0005513485313181775
0.0005513485313181775
0.0005513485313181775
0.0005513485313181775
0.0005493723717077181
0.0005493723717077181
0.0005493723717077181
0.0005473962120972587
0.0005473962120972587
0.0005454200524867992
0.0005434438928763398
0.000539491573655421
0.0005375154140449615
0.0005375154140449615
0.0005375154140449615
0.0005375154140449615
0.0005335630948240427
0.0005335630948240427
0.0005315869352135833
0.000529610775603124
0.000529610775603124
0.0005276346159926645
0.0005256584563822051
0.0005217061371612863
0.0005177538179403674
0.0005177538179403674
0.0005177538179403674
0.000515777658329908
0.000515777658329908
0.0005098491794985297
0.0005078730198880703
0.0005078730198880703
0.0005078730198880703
0.0005058968602776109
0.0005058968602776109
0.0004979922218357732
0.0004940399026148544
0.0004940399026148544
0.0004940399026148544
0.0004940399026148544
0.0004940399026148544
0.0004940399026148544
0.0004920637430

4.742783065102602e-05
4.742783065102602e-05
4.742783065102602e-05
4.742783065102602e-05
4.742783065102602e-05
4.742783065102602e-05
4.742783065102602e-05
4.742783065102602e-05
4.742783065102602e-05
4.742783065102602e-05
4.742783065102602e-05
4.742783065102602e-05
4.742783065102602e-05
4.742783065102602e-05
4.742783065102602e-05
4.742783065102602e-05
4.742783065102602e-05
4.742783065102602e-05
4.742783065102602e-05
4.742783065102602e-05
4.742783065102602e-05
4.742783065102602e-05
4.742783065102602e-05
4.742783065102602e-05
4.742783065102602e-05
4.742783065102602e-05
4.742783065102602e-05
4.742783065102602e-05
4.742783065102602e-05
4.742783065102602e-05
4.742783065102602e-05
4.742783065102602e-05
4.742783065102602e-05
4.742783065102602e-05
4.54516710405666e-05
4.54516710405666e-05
4.54516710405666e-05
4.54516710405666e-05
4.54516710405666e-05
4.54516710405666e-05
4.54516710405666e-05
4.54516710405666e-05
4.54516710405666e-05
4.54516710405666e-05
4.54516710405666e-05
4.54516710405666e-05


1.580927688367534e-05
1.580927688367534e-05
1.580927688367534e-05
1.580927688367534e-05
1.580927688367534e-05
1.580927688367534e-05
1.580927688367534e-05
1.580927688367534e-05
1.580927688367534e-05
1.580927688367534e-05
1.580927688367534e-05
1.580927688367534e-05
1.580927688367534e-05
1.580927688367534e-05
1.580927688367534e-05
1.580927688367534e-05
1.580927688367534e-05
1.580927688367534e-05
1.580927688367534e-05
1.580927688367534e-05
1.580927688367534e-05
1.580927688367534e-05
1.580927688367534e-05
1.580927688367534e-05
1.580927688367534e-05
1.580927688367534e-05
1.580927688367534e-05
1.580927688367534e-05
1.580927688367534e-05
1.580927688367534e-05
1.580927688367534e-05
1.580927688367534e-05
1.580927688367534e-05
1.580927688367534e-05
1.580927688367534e-05
1.580927688367534e-05
1.580927688367534e-05
1.580927688367534e-05
1.580927688367534e-05
1.580927688367534e-05
1.580927688367534e-05
1.580927688367534e-05
1.580927688367534e-05
1.580927688367534e-05
1.580927688367534e-05
1.58092768

7.90463844183767e-06
7.90463844183767e-06
7.90463844183767e-06
7.90463844183767e-06
7.90463844183767e-06
7.90463844183767e-06
7.90463844183767e-06
7.90463844183767e-06
7.90463844183767e-06
7.90463844183767e-06
7.90463844183767e-06
7.90463844183767e-06
7.90463844183767e-06
7.90463844183767e-06
7.90463844183767e-06
7.90463844183767e-06
7.90463844183767e-06
7.90463844183767e-06
7.90463844183767e-06
7.90463844183767e-06
7.90463844183767e-06
7.90463844183767e-06
7.90463844183767e-06
7.90463844183767e-06
7.90463844183767e-06
7.90463844183767e-06
7.90463844183767e-06
7.90463844183767e-06
7.90463844183767e-06
7.90463844183767e-06
7.90463844183767e-06
7.90463844183767e-06
7.90463844183767e-06
7.90463844183767e-06
7.90463844183767e-06
7.90463844183767e-06
7.90463844183767e-06
7.90463844183767e-06
7.90463844183767e-06
7.90463844183767e-06
7.90463844183767e-06
7.90463844183767e-06
7.90463844183767e-06
7.90463844183767e-06
7.90463844183767e-06
7.90463844183767e-06
7.90463844183767e-06
7.90463844183

3.952319220918835e-06
3.952319220918835e-06
3.952319220918835e-06
3.952319220918835e-06
3.952319220918835e-06
3.952319220918835e-06
3.952319220918835e-06
3.952319220918835e-06
3.952319220918835e-06
3.952319220918835e-06
3.952319220918835e-06
3.952319220918835e-06
3.952319220918835e-06
3.952319220918835e-06
3.952319220918835e-06
3.952319220918835e-06
3.952319220918835e-06
3.952319220918835e-06
3.952319220918835e-06
3.952319220918835e-06
3.952319220918835e-06
3.952319220918835e-06
3.952319220918835e-06
3.952319220918835e-06
3.952319220918835e-06
3.952319220918835e-06
3.952319220918835e-06
3.952319220918835e-06
3.952319220918835e-06
3.952319220918835e-06
3.952319220918835e-06
3.952319220918835e-06
3.952319220918835e-06
3.952319220918835e-06
3.952319220918835e-06
3.952319220918835e-06
3.952319220918835e-06
3.952319220918835e-06
3.952319220918835e-06
3.952319220918835e-06
3.952319220918835e-06
3.952319220918835e-06
3.952319220918835e-06
3.952319220918835e-06
3.952319220918835e-06
3.95231922

1.9761596104594176e-06
1.9761596104594176e-06
1.9761596104594176e-06
1.9761596104594176e-06
1.9761596104594176e-06
1.9761596104594176e-06
1.9761596104594176e-06
1.9761596104594176e-06
1.9761596104594176e-06
1.9761596104594176e-06
1.9761596104594176e-06
1.9761596104594176e-06
1.9761596104594176e-06
1.9761596104594176e-06
1.9761596104594176e-06
1.9761596104594176e-06
1.9761596104594176e-06
1.9761596104594176e-06
1.9761596104594176e-06
1.9761596104594176e-06
1.9761596104594176e-06
1.9761596104594176e-06
1.9761596104594176e-06
1.9761596104594176e-06
1.9761596104594176e-06
1.9761596104594176e-06
1.9761596104594176e-06
1.9761596104594176e-06
1.9761596104594176e-06
1.9761596104594176e-06
1.9761596104594176e-06
1.9761596104594176e-06
1.9761596104594176e-06
1.9761596104594176e-06
1.9761596104594176e-06
1.9761596104594176e-06
1.9761596104594176e-06
1.9761596104594176e-06
1.9761596104594176e-06
1.9761596104594176e-06
1.9761596104594176e-06
1.9761596104594176e-06
1.9761596104594176e-06
1.976159610

1.9761596104594176e-06
1.9761596104594176e-06
1.9761596104594176e-06
1.9761596104594176e-06
1.9761596104594176e-06
1.9761596104594176e-06
1.9761596104594176e-06
1.9761596104594176e-06
1.9761596104594176e-06
1.9761596104594176e-06
1.9761596104594176e-06
1.9761596104594176e-06
1.9761596104594176e-06
1.9761596104594176e-06
1.9761596104594176e-06
1.9761596104594176e-06
1.9761596104594176e-06
1.9761596104594176e-06
1.9761596104594176e-06
1.9761596104594176e-06
1.9761596104594176e-06
1.9761596104594176e-06
1.9761596104594176e-06
1.9761596104594176e-06
1.9761596104594176e-06
1.9761596104594176e-06
1.9761596104594176e-06
1.9761596104594176e-06
1.9761596104594176e-06
1.9761596104594176e-06
1.9761596104594176e-06
1.9761596104594176e-06
1.9761596104594176e-06
1.9761596104594176e-06
1.9761596104594176e-06
1.9761596104594176e-06
1.9761596104594176e-06
1.9761596104594176e-06
1.9761596104594176e-06
1.9761596104594176e-06
1.9761596104594176e-06
1.9761596104594176e-06
1.9761596104594176e-06
1.976159610

1.9761596104594176e-06
1.9761596104594176e-06
1.9761596104594176e-06
1.9761596104594176e-06
1.9761596104594176e-06
1.9761596104594176e-06
1.9761596104594176e-06
1.9761596104594176e-06
1.9761596104594176e-06
1.9761596104594176e-06
1.9761596104594176e-06
1.9761596104594176e-06
1.9761596104594176e-06
1.9761596104594176e-06
1.9761596104594176e-06
1.9761596104594176e-06
1.9761596104594176e-06
1.9761596104594176e-06
1.9761596104594176e-06
1.9761596104594176e-06
1.9761596104594176e-06
1.9761596104594176e-06
1.9761596104594176e-06
1.9761596104594176e-06
1.9761596104594176e-06
1.9761596104594176e-06
1.9761596104594176e-06
1.9761596104594176e-06
1.9761596104594176e-06
1.9761596104594176e-06
1.9761596104594176e-06
1.9761596104594176e-06
1.9761596104594176e-06
1.9761596104594176e-06
1.9761596104594176e-06
1.9761596104594176e-06
1.9761596104594176e-06
1.9761596104594176e-06
1.9761596104594176e-06
1.9761596104594176e-06
1.9761596104594176e-06
1.9761596104594176e-06
1.9761596104594176e-06
1.976159610