In [22]:
#Basic imports
import numpy as np
import pandas as pd
import pickle

#sklearn imports
from sklearn.decomposition import PCA #Principal Component Analysis
from sklearn.manifold import TSNE #T-Distributed Stochastic Neighbor Embedding
from sklearn.cluster import KMeans #K-Means Clustering
from sklearn.preprocessing import StandardScaler #used for 'Feature Scaling'

import ast


In [14]:
df = pd.read_csv('./data_TM2/processed/processed_utterances_sentence_DA_labeling.csv', index_col=0)

#### First identify all possible unigrams, bigrams and trigrams

In [15]:
all_DA = []
for row in range(len(df['all_DA'])):
    inst = ast.literal_eval(df['all_DA'][row])
    inst = [i.strip("[]") for i in inst]
    all_DA.append(inst)
    
df['all_DA'] = all_DA

df.drop(df.iloc[:, 8:-1], inplace = True, axis = 1)
df = df.explode('all_DA')
df['all_DA'] = df['all_DA'].fillna('<UNK>')

In [16]:
unique = list(df['all_DA'].unique())
one_hot_ubtf = []

#uni
[one_hot_ubtf.append([x]) for x in unique]
one_hot_ubtf

#bi
for i in range(len(unique)):
    for j in range(len(unique)):
        one_hot_ubtf.append([unique[i], unique[j]])
        
#tri >>>>>> until here 12,719
for i in range(len(unique)):
    for j in range(len(unique)):
        for k in range(len(unique)):
            one_hot_ubtf.append([unique[i], unique[j], unique[k]])
            
# #four >>>>>>> from here 292,560 variables
# for i in range(len(unique)):
#     for j in range(len(unique)):
#         for k in range(len(unique)):
#             for l in range(len(unique)):
#                 one_hot_ubtf.append([unique[i], unique[j], unique[k], unique[l]])

len(one_hot_ubtf)

12719

#### Now get patterns identified by HMM or baseline and create a matrix of zeros in which rows are patterns and columns are all possible uni, bi, trigrams

In [33]:
### now use results from HMM, i.e the detected patterns to make a one hot. 
# list os lists, each sublist is one pattern found in the HMM

a_file = open('./src/generated_files/sorted_hmm_dict.pkl', "rb") #'./src/generated_files/hmm_results.pkl'
hmm_results = pickle.load(a_file)


In [53]:
hmm_patterns = []
for key in hmm_results.keys():
    hmm_patterns.append(key.split())
    
hmm_patterns = hmm_patterns[1:]

#### Now make uni, bi, tri found in each pattern

In [58]:
print(len(hmm_patterns))

#remove unigram patterns found in hmm, since they can't be considered patterns
for pat in hmm_patterns:
    if len(pat) == 1:
        hmm_patterns.remove(pat)
        
print(len(hmm_patterns))

25536
25521


In [193]:
dict_grams = {}

for e, pat in enumerate(hmm_patterns):
    pat = (' ').join(pat)
    
    for uni in ngrams(pat.split(), 1):
        uni = list(uni)
        if pat not in dict_grams:
            dict_grams[pat] = [uni]
        else:
            dict_grams[pat].extend([uni])
        
    for bi in ngrams(pat.split(), 2):
        bi = list(bi)
        if pat not in dict_grams:
            dict_grams[pat] = [bi]
        dict_grams[pat].extend([bi])

    for tri in ngrams(pat.split(), 3):
        tri = list(tri)
        if pat not in dict_grams:
            dict_grams[pat] = [tri]
        dict_grams[pat].extend([tri])

# dict_grams

# #sanity check
# len(hmm_patterns) == len(dict_grams.keys())

{'U_answer A_detail_request': [['U_answer'],
  ['A_detail_request'],
  ['U_answer', 'A_detail_request']],
 'A_completion_check A_detail_request': [['A_completion_check'],
  ['A_detail_request'],
  ['A_completion_check', 'A_detail_request']],
 'A_greeting A_detail_request': [['A_greeting'],
  ['A_detail_request'],
  ['A_greeting', 'A_detail_request']],
 'A_greeting A_greeting A_detail_request': [['A_greeting'],
  ['A_greeting'],
  ['A_detail_request'],
  ['A_greeting', 'A_greeting'],
  ['A_greeting', 'A_detail_request'],
  ['A_greeting', 'A_greeting', 'A_detail_request']],
 'A_grant U_partial_request': [['A_grant'],
  ['U_partial_request'],
  ['A_grant', 'U_partial_request']],
 'A_sequence_closer A_detail_request': [['A_sequence_closer'],
  ['A_detail_request'],
  ['A_sequence_closer', 'A_detail_request']],
 'U_confirmation U_answer A_confirmation': [['U_confirmation'],
  ['U_answer'],
  ['A_confirmation'],
  ['U_confirmation', 'U_answer'],
  ['U_answer', 'A_confirmation'],
  ['U_confir

#### Create matrix of zeros that will have a 1 if one of the values of each key is found (i.e a uni,bi or tri per pattern)

In [216]:
# create matrix size len(one_hot_ubtf) X len(hmm_results)
one_hot_mat = np.zeros((len(dict_grams.keys()), len(one_hot_ubtf))) #format: mat_one_hot[row=dict_grams.keys][column_one_hot]

In [215]:
# if all(x in one_hot_ubtf for x in [['U_greeting', 'U_repair_initiator']]) == True: #here order matters
# all(x in one_hot_ubtf for x in [[unigram/bi/tri from pattern]]) # if sublist here order matters

one_hot_mat_exemplo = np.zeros((len(dict_grams_exemplo.keys()), len(one_hot_ubtf_exemplo)))
one_hot_ubtf_exemplo = [['U_answer'], ['NADA2'],  ['A_detail_request'], ['U_answer', 'A_detail_request'],
                       ['A_completion_check'], ['A_completion_check', 'A_detail_request'], ['NADA1']]
dict_grams_exemplo = {'ex0':[['U_answer'], ['A_detail_request'], ['U_answer', 'A_detail_request']],
                      'ex1':[['A_completion_check'], ['A_detail_request'], ['A_completion_check', 'A_detail_request']]}


for e, pattern in enumerate(dict_grams.values()):
    for g, pat in enumerate(pattern):
        print(g, pat)
        for f, pos_ngr in enumerate(one_hot_ubtf):
            if pos_ngr == pat:
                one_hot_mat[e][f] = 1

one_hot_mat_exemplo

0 ['U_answer']
1 ['A_detail_request']
2 ['U_answer', 'A_detail_request']
0 ['A_completion_check']
1 ['A_detail_request']
2 ['A_completion_check', 'A_detail_request']


array([[1., 0., 1., 1., 0., 0., 0.],
       [0., 0., 1., 0., 1., 1., 0.]])

In [171]:
# for e, k in enumerate(dict_grams.keys()):
#     for f, v in enumerate(dict_grams.values()): #all values have at least size 3 because move1, move2, move1+2
#         print(f, k, v[0], v[1], v[2])
# #         mat_one_hot[e]
    

In [173]:
# for row in pattern
# for columns in len one hot
# all df zeros
# if contains exactly something, then 1

In [218]:
# for pat in hmm_patterns:
#     if len(pat) == 3:
#         print(pat[0], pat[1], pat[2])
#         #marcar numero 1 em cada uni e o numero 1 em bi

# # talvez melhor forma seja df and use if contains "U_xxx" put 1



# for i in one_hot_ubtf:
#     if i == 'A_greeting'+'A_greeting':
#         print(i)
# #     for j in i:
# #         if j == ['U_answer']:
# #             print(j, i)

# all(x in ['b', 'a', 'foo', 'bar', ['a', 'b']] for x in [['a', 'b']]) # if sublist here order matters

In [None]:
# for e, pattern in enumerate(dict_grams_exemplo.values()):
#     print(e, pattern)
#     for f, pos_ngr in enumerate(one_hot_ubtf_exemplo):
#         if pos_ngr == pattern:
#             one_hot_mat_exemplo[e][f] = 1

# one_hot_mat_exemplo