# Preprocessing


## All preprocessing just needs to be done once. 
### If already done go direclty to DA patterns to load csv


In [2]:
import os
import pandas as pd
import numpy as np
from pandas.io.json import json_normalize
import re
from nltk.tokenize import word_tokenize
import sklearn
from sklearn.metrics import f1_score, precision_recall_fscore_support

#### Open all datasets from different tasks

In [None]:
df_t1 = pd.read_json('./data_TM2/flights.json')
df_t2 = pd.read_json('./data_TM2/food-ordering.json')
df_t3 = pd.read_json('./data_TM2/hotels.json')
df_t4 = pd.read_json('./data_TM2/movies.json')
df_t5 = pd.read_json('./data_TM2/music.json')
df_t6 = pd.read_json('./data_TM2/restaurant-search.json')
df_t7 = pd.read_json('./data_TM2/sports.json')

# create extra column in the beginning for type of task before merging
df_t1.insert(0, 'task', 'flights')
df_t2.insert(0, 'task', 'food-ordering')
df_t3.insert(0, 'task', 'hotels')
df_t4.insert(0, 'task', 'movies')
df_t5.insert(0, 'task', 'music')
df_t6.insert(0, 'task', 'restaurant-search')
df_t7.insert(0, 'task', 'sports')


#### Normalize json and change to pandas 

In [None]:
def process_raw(df): #, name):
    dialogs = []

    for e,i in enumerate(df['utterances']):
        for j in i:
            new_df=pd.json_normalize(j)
            new_df.insert(0, 'task', df['task'][e])
            new_df.insert(1, 'conversation_id', df['conversation_id'][e])
            new_df.insert(2, 'instruction_id', df['instruction_id'][e])
        
            dialogs.append(new_df)

    large_df = pd.concat(dialogs, ignore_index=True)
#   large_df.to_csv('./data_TM2/processed_utterances_'+name+'.csv')
    
    return large_df

#### Create unique dataset with normalized tasks

In [None]:
def unique(df_list):
    large_frames = []

    for df_ in df_list:
        large_df = process_raw(df_)
        large_frames.append(large_df)

    all_tasks = pd.concat(large_frames, ignore_index=True)
    all_tasks.to_csv('./data_TM2/concatenated_tasks.csv')
    
    return all_tasks

In [None]:
df_list = [df_t1, df_t2, df_t3, df_t4, df_t5, df_t6, df_t7]
all_tasks = unique(df_list)

In [None]:
# # #essa parte é de antes porem com split incluindo simbolo do split
# new_text = []
# for ut in df['text']:
#     new_ut = re.split('(\.)', ut)
#     new_text.append(new_ut)
# df['new_text'] = new_text

# #to keep delimiter in sentence
# #not very efficient way of doing it, apparently a better way using regex symbols

# new_lista = []
# for row in df['new_text']:
# #     print(row)
#     for e, word in enumerate(row):
#         if word == '.':
#             new_lista.append(row[e-1]+row[e])
#             new_lista.remove(row[e-1])
#         else:
#             new_lista.append(word)

# df = df.explode('new_text')
# df = df.reset_index(drop=True)

In [None]:
from nltk import sent_tokenize

df = pd.read_csv('./data_TM2/concatenated_tasks.csv', index_col=0)

tokenized = []
for ut in df['text']:
    tokenized.append(sent_tokenize(ut))
    
df['new_lista'] = tokenized
df = df.explode('new_lista')
df = df.reset_index(drop=True)
df.head(20)

## Sample a part of the dataset to annotate
### criteria:

- take from all taks
- annotate full dialogues
- how many dialogues are there? what's the percentage of full dialogues i want to annotate?
17304 dialogues (index 0 demarks the begining of a dialogue)

In [None]:
#number of dialogues:
df_count = pd.read_csv('./data_TM2/concatenated_tasks.csv', index_col=0)
print(df_count['conversation_id'].nunique())

#average size of dialog
print(df_count['index'].mean())

#dialogues per task (i.e True column)
pd.crosstab(df_count['task'], df_count['index'] ==0).sort_values(by='task', ascending=False)

In [None]:
#sample randomly 167 dialogues, that will be split into 2 batches:
#selection made using the unique conversation_ids

np.random.seed(0)
unique_conv_id = df['conversation_id'].unique()
sample_ids = np.random.choice(unique_conv_id, 172, replace=False)

sample_b1  = []
sample_b2  = []
for e, row in enumerate(df['conversation_id']):
    for b1 in sample_ids[:int(len(sample_ids)/2)]:
        if row == b1:
            sample_b1.append(df.loc[e,:])
    for b2 in sample_ids[int(len(sample_ids)/2):]:
        if row == b2:
            sample_b2.append(df.loc[e,:])

df_sample_b1 = pd.DataFrame(sample_b1)
df_sample_b2 = pd.DataFrame(sample_b2)

df_sample_b1.to_csv('./data_TM2/sample_b1.csv')
df_sample_b2.to_csv('./data_TM2/sample_b2.csv')

## Load gold label dataset to work with DA patterns

In [3]:
#format golden label to compare
#then process automatic labels using this file

df = pd.read_csv('./data_TM2/sample_b1_goldannotated_noAutomaticLabels.csv', index_col=0)
# df = df.iloc[:2034]
df = df.drop(columns='G_DA_rep')

gold = []
for label in df['gold_labels']:
    gold.append(str(label).split(','))

df['gold_formatted'] = gold
df

Unnamed: 0,task,conversation_id,instruction_id,index,speaker,text,segments,new_text,gold_labels,G_DA_rep_init,...,G_DA_receipt,G_DA_disconf,G_DA_closer,G_DA_comp_check,G_DA_hold,G_DA_partial_req,G_DA_detail_req,G_DA_grant,G_DA_answer,gold_formatted
2441,flights,dlg-07f1ccd9-e109-41c5-8db6-afb4cdd94728,flight-5,0,ASSISTANT,Hello.,,Hello.,greeting,,...,,,,,,,,,,[greeting]
2442,flights,dlg-07f1ccd9-e109-41c5-8db6-afb4cdd94728,flight-5,1,USER,Hi. I need help organizing a flight. I'm looki...,"[{'start_index': 59, 'end_index': 74, 'text': ...",Hi.,greeting,,...,,,,,,,,,,[greeting]
2443,flights,dlg-07f1ccd9-e109-41c5-8db6-afb4cdd94728,flight-5,1,USER,Hi. I need help organizing a flight. I'm looki...,"[{'start_index': 59, 'end_index': 74, 'text': ...",I need help organizing a flight.,partial_request,,...,,,,,,partial_request,,,,[partial_request]
2444,flights,dlg-07f1ccd9-e109-41c5-8db6-afb4cdd94728,flight-5,1,USER,Hi. I need help organizing a flight. I'm looki...,"[{'start_index': 59, 'end_index': 74, 'text': ...","I'm looking to fly to Dublin, Ireland.",partial_request,,...,,,,,,partial_request,,,,[partial_request]
2445,flights,dlg-07f1ccd9-e109-41c5-8db6-afb4cdd94728,flight-5,2,ASSISTANT,Okay. Can you give me some specifications?,,Okay.,confirmation,,...,,,,,,,,,,[confirmation]
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
436152,sports,dlg-fccb9be2-f9ef-4472-80fb-af3fd0efc78b,mlb-7,10,ASSISTANT,I'm sorry can you repeat the question?,,I'm sorry can you repeat the question?,repair_initiator,repair_initiator,...,,,,,,,,,,[repair_initiator]
436153,sports,dlg-fccb9be2-f9ef-4472-80fb-af3fd0efc78b,mlb-7,11,USER,Who's in last place in the American League West?,"[{'start_index': 9, 'end_index': 19, 'text': '...",Who's in last place in the American League West?,repair,,...,,,,,,,,,,[repair]
436154,sports,dlg-fccb9be2-f9ef-4472-80fb-af3fd0efc78b,mlb-7,12,ASSISTANT,The Oakland A's are in last place in the Ameri...,"[{'start_index': 4, 'end_index': 13, 'text': '...",The Oakland A's are in last place in the Ameri...,grant,,...,,,,,,,,grant,,[grant]
436155,sports,dlg-fccb9be2-f9ef-4472-80fb-af3fd0efc78b,mlb-7,13,USER,"Awesome. That's all I needed to know, Thank you.",,Awesome.,sequence_closer,,...,,,sequence_closer,,,,,,,[sequence_closer]


In [4]:
#about the sample:

print('number of dialogues sampled: ' + str(df['conversation_id'].nunique()))
print('number of utterances sampled: ' + str(len(df)))
print('number of tasks sampled: '+ str(df['task'].nunique()))
df.columns

number of dialogues sampled: 70
number of utterances sampled: 2062
number of tasks sampled: 7


Index(['task', 'conversation_id', 'instruction_id', 'index', 'speaker', 'text',
       'segments', 'new_text', 'gold_labels', 'G_DA_rep_init', 'G_DA_greet',
       'G_DA_req_sum', 'G_DA_conf', 'G_DA_receipt', 'G_DA_disconf',
       'G_DA_closer', 'G_DA_comp_check', 'G_DA_hold', 'G_DA_partial_req',
       'G_DA_detail_req', 'G_DA_grant', 'G_DA_answer', 'gold_formatted'],
      dtype='object')

## OR Load whole dataset to work with DA patterns

In [None]:
# df = pd.read_csv('./data_TM2/processed_utterances_all_tasks.csv', index_col=0)

### Find substrings
make list of words and rules for labeling DA
https://towardsdatascience.com/check-for-a-substring-in-a-pandas-dataframe-column-4b949f64852#:~:text=Using%20%E2%80%9Ccontains%E2%80%9D%20to%20Find%20a,substring%20and%20False%20if%20not.

- add new column for subset. since each utterance can only be one thing one column for all DA should be enough. 
- Later I'll have to check the vocabulary used for generic rules (FS_base_greeting or whatever is)
- Also study in the UX book what is repair and other patterns:
    - page 243 to 246: NCF pattern language summary

Tips:
- #NOT_INCLUDE_WORD: 
    - not_fifa = football_soccer_games.loc[~football_soccer_games['Name'].str.contains('FIFA')]
- #to make a new dataframe with slicing subset:
    - new_df = df.loc[df['text'].str.contains('|'.join(end_of_request), case=False)]

In [None]:
# # DA indicators. Obs: the vars bellow are sensitive to spacing !!! 'hi' != ' hi'

# greeting = [' hi','hi.','hello','bye'] 
# repair_initiator = ['sorry','repeat', 'understand', 'mean'] 
# request_summary = ['correct?','confirm'] #when you ask an information to be confirmed
# confirmation = ['yes','correct.','correct ', 'that\'s it', 'indeed'] #confirmation é a resposta que confirma: ex yes | correct | that's it
# sentence_closer = ['awesome','great','perfect','exactly','that\'s all','that is all', 'thanks', 'thank you', 'pleasure', 'excellent'] 
# question = ['\?']
# receipt = ['You are welcome', 'you\'re welcome']
# grant = ['got it', 'sure']

# ## !!!!! order-sensitive > hierarchical!!!! this variable chooses only one label, rules goes on order. once it assigns one rule it won't check next

# temp2=df.new_text.fillna("0")
# df['DA_indicators_sent'] =  np.where(temp2.str.contains('|'.join(repair_initiator), case=False), "repair_initiator",
#                             np.where(temp2.str.contains('|'.join(greeting), case=False), "greeting",
#                             np.where(temp2.str.contains('|'.join(request_summary), case=False),  "request_summary",
#                             np.where(temp2.str.contains('|'.join(confirmation), case=False),  "confirmation",
#                             np.where(temp2.str.contains('|'.join(receipt), case=False),  "receipt",
#                             np.where(temp2.str.contains('|'.join(grant), case=False),  "grant",
#                             np.where(temp2.str.contains('|'.join(question), case=False),  "question",
#                             np.where(temp2.str.contains('|'.join(sentence_closer), case=False), "sentence_closer", "x"))))))))

# df['DA_indicators_sent'].value_counts()

# #it works! #binary variables and in the end gather in one new column

# df['DA_rep'] =  np.where(temp2.str.contains('|'.join(repair_initiator), case=False), "repair_initiator", '')
# df['DA_greet'] =  np.where(temp2.str.contains('|'.join(greeting), case=False), "greeting", '')
# df['DA_req_sum'] =  np.where(temp2.str.contains('|'.join(request_summary), case=False), "request_summary", '')
# df['DA_conf'] =  np.where(temp2.str.contains('|'.join(confirmation), case=False), "confirmation", '')
# df['DA_receipt'] =  np.where(temp2.str.contains('|'.join(receipt), case=False), "receipt", '')
# df['DA_grant'] =  np.where(temp2.str.contains('|'.join(grant), case=False), "grant", '')
# df['DA_quest'] =  np.where(temp2.str.contains('|'.join(question), case=False), "question", '')
# df['DA_closer'] =  np.where(temp2.str.contains('|'.join(sentence_closer), case=False), "sentence_closer", '')
# df['all_DA'] = df[['DA_rep', 'DA_greet','DA_req_sum', 'DA_conf', 'DA_receipt', 'DA_grant', 'DA_quest', 'DA_closer']].agg(' '.join, axis=1)

# df['all_DA'] = [word_tokenize(da) for da in df['all_DA']]

# # df.loc[:, ['speaker', 'new_text', 'all_DA']].to_csv('analysis_multiple_DA_cases.csv')
# df.loc[:, ['speaker', 'new_text', 'all_DA']]
# counts = df['all_DA'].value_counts()

# # counts.to_csv('count_overlaping_DAs.csv')
# data = {'counts':counts}
# df_counts = pd.DataFrame(data)
# df_counts.head(45)

## Second iteration

In [None]:
# # DA indicators. Obs: the vars bellow are sensitive to spacing !!! 'hi' != ' hi'

# greeting = [' hi','hi.','hello','yo ', 'hey'] 
# repair_initiator = ['sorry','repeat', 'understand', 'mean ', 'what?', 'example?']
# repair = ['I meant']
# request_summary = [' correct?','confirm']  
# confirmation = ['yes',' correct.',' correct ', 'that\'s it', 'indeed', 'yeah', 'that\'s right', 'you got it', 'yep', 'sure', 'okay', 'all right'] 
# sequence_closer = ['awesome','great','perfect','exactly','that\'s all','that is all', 'thanks', 'thank you', 'pleasure', 'excellent', 'okay', 'excellent', 'too bad', 'oh well']
# inquiry = ['\?']
# receipt = ['You are welcome', 'you\'re welcome']
# disconfirmation = [' no ', 'wrong', 'incorrect', 'not really']

# ## !!!!! order-sensitive !!!!

# #this variable chooses only one label, rules goes on order. once it assigns one rule it won't check next
# temp2=df.new_text.fillna("0")
# df['DA_indicators_sent'] =  np.where(temp2.str.contains('|'.join(repair_initiator), case=False), "repair_initiator",
#                             np.where(temp2.str.contains('|'.join(greeting), case=False), "greeting",
#                             np.where(temp2.str.contains('|'.join(request_summary), case=False),  "request_summary",
#                             np.where(temp2.str.contains('|'.join(confirmation), case=False),  "confirmation",
#                             np.where(temp2.str.contains('|'.join(receipt), case=False),  "receipt",
#                             np.where(temp2.str.contains('|'.join(disconfirmation), case=False),  "disconfirmation",
#                             np.where(temp2.str.contains('|'.join(inquiry), case=False),  "inquiry",
#                             np.where(temp2.str.contains('|'.join(repair), case=False),  "repair",
#                             np.where(temp2.str.contains('|'.join(sequence_closer), case=False), "sequence_closer", "x")))))))))

# print(df['DA_indicators_sent'].value_counts())

# #it works! #binary variables and in the end gather in one new column
# ##### ainda n sei se repair solo fica ######                                     
                                     
# df['DA_rep_init'] =  np.where(temp2.str.contains('|'.join(repair_initiator), case=False), "repair_initiator", '')
# df['DA_rep'] =  np.where(temp2.str.contains('|'.join(repair), case=False), "repair", '')
# df['DA_greet'] =  np.where(temp2.str.contains('|'.join(greeting), case=False), "greeting", '')
# df['DA_req_sum'] =  np.where(temp2.str.contains('|'.join(request_summary), case=False), "request_summary", '')
# df['DA_conf'] =  np.where(temp2.str.contains('|'.join(confirmation), case=False), "confirmation", '')
# df['DA_receipt'] =  np.where(temp2.str.contains('|'.join(receipt), case=False), "receipt", '')
# df['DA_disconf'] =  np.where(temp2.str.contains('|'.join(disconfirmation), case=False), "disconfirmation", '')
# df['DA_inquiry'] =  np.where(temp2.str.contains('|'.join(inquiry), case=False), "inquiry", '')
# df['DA_closer'] =  np.where(temp2.str.contains('|'.join(sequence_closer), case=False), "sequence_closer", '')
# df['all_DA'] = df[['DA_rep_init', 'DA_rep', 'DA_greet','DA_req_sum', 'DA_conf', 'DA_receipt', 'DA_disconf', 'DA_inquiry', 'DA_closer']].agg(' '.join, axis=1)

# df['all_DA'] = [word_tokenize(da) for da in df['all_DA']]

# df.loc[:, ['speaker', 'new_text', 'all_DA']].to_csv('analysis_multiple_DA_cases_2nditeration.csv')
# df.loc[:, ['speaker', 'new_text', 'all_DA']]
# counts = df['all_DA'].value_counts()

# counts.to_csv('count_overlaping_DAs_2nditeration.csv')
# data = {'counts':counts}
# df_counts = pd.DataFrame(data)
# df_counts.head()

## Third iteration and FINAL

In [5]:
# DA indicators. Obs: the vars bellow are sensitive to spacing !!! 'hi' != ' hi'

greeting = [' hi','hi.','hello','yo ', 'hey', 'How can I help you?'] 
repair_initiator = ['sorry','repeat', 'understand', 'mean?', 'what?', 'example?', 'could you say that again?', 'say again', 'what did you say?', 'I can\'t hear you', 'I didn\'t listen', 'say it again']
# repair = ['I meant']
request_summary = [' correct?','confirm']  
confirmation = ['yes',' correct.',' correct ', 'that\'s it', 'indeed', 'yeah', 'that\'s right', 'you got it', 'yep', 'sure', 'okay', 'all right',  'sure', 'sounds good', 'super.', 'super!', 'alright', 'I don\'t mind', 'I can help you', 'sounds really good', 'got it.'] 
sequence_closer = ['awesome','great','perfect','exactly','that\'s all','that is all', 'thanks', 'thank you', 'pleasure', 'excellent', 'okay', 'excellent', 'too bad', 'oh well', 'have a good day', 'enjoy', 'until next time', 'good day', 'good luck', 'bye', 'til next time']
receipt = ['You are welcome', 'you\'re welcome']
disconfirmation = [' no ', 'wrong', 'incorrect', 'not really']
#NEW
completion_check = ['is that all?','anything else I can help you with?', 'anything else?'] 
partial_request = ['\?']
detail_request = ['\?']
hold_request = ['one moment, please',' hold on', 'hold', 'just a moment', 'let me check for you', 'let me see', 'one sec' ]
grant_answer = ['.']
temp2=df.new_text.fillna("0")

## !!!!! order-sensitive !!!!

# #this variable chooses only one label, rules goes on order. once it assigns one rule it won't check next

# df['DA_indicators_sent'] =  np.where(temp2.str.contains('|'.join(repair_initiator), case=False), "repair_initiator",
#                             np.where(temp2.str.contains('|'.join(greeting), case=False), "greeting",
#                             np.where(temp2.str.contains('|'.join(request_summary), case=False),  "request_summary",
#                             np.where(temp2.str.contains('|'.join(confirmation), case=False),  "confirmation",
#                             np.where(temp2.str.contains('|'.join(receipt), case=False),  "receipt",
#                             np.where(temp2.str.contains('|'.join(disconfirmation), case=False),  "disconfirmation",
#                             np.where(temp2.str.contains('|'.join(repair), case=False),  "repair",
#                             np.where(temp2.str.contains('|'.join(sequence_closer), case=False), "sequence_closer", 
#                             np.where(temp2.str.contains('|'.join(completion_check), case=False),  "completion_check",
#                             np.where(temp2.str.contains('|'.join(hold_request), case=False),  "hold_request",
#                             np.where(temp2.str.contains('|'.join(partial_request), case=False),  "partial_request",
#                             np.where(temp2.str.contains('|'.join(detail_request), case=False),  "detail_request","x"))))))))))))

# print(df['DA_indicators_sent'].value_counts())


#it works! #binary variables and in the end gather in one new column
##### ainda n sei se repair solo fica ######                                     
                                     
df['DA_rep_init'] =  np.where(temp2.str.contains('|'.join(repair_initiator), case=False), "repair_initiator", '')
# df['DA_rep'] =  np.where(temp2.str.contains('|'.join(repair), case=False), "repair", '')
df['DA_greet'] =  np.where(temp2.str.contains('|'.join(greeting), case=False), "greeting", '')
df['DA_req_sum'] =  np.where(temp2.str.contains('|'.join(request_summary), case=False), "request_summary", '')
df['DA_conf'] =  np.where(temp2.str.contains('|'.join(confirmation), case=False), "confirmation", '')
df['DA_receipt'] =  np.where(temp2.str.contains('|'.join(receipt), case=False), "receipt", '')
df['DA_disconf'] =  np.where(temp2.str.contains('|'.join(disconfirmation), case=False), "disconfirmation", '')
df['DA_closer'] =  np.where(temp2.str.contains('|'.join(sequence_closer), case=False), "sequence_closer", '')           
df['DA_comp_check'] =  np.where(temp2.str.contains('|'.join(completion_check), case=False),  "completion_check", '')
df['DA_hold'] =  np.where(temp2.str.contains('|'.join(hold_request), case=False),  "hold_request", '')
df['DA_partial_req'] = np.where(((df['speaker'] == 'USER') & temp2.str.contains('|'.join(partial_request), case=False)), 'partial_request', '')
df['DA_detail_req'] = np.where(((df['speaker'] == 'ASSISTANT') & temp2.str.contains('|'.join(detail_request), case=False)), 'detail_request', '')
df['DA_grant'] = np.where(((df['speaker'] == 'ASSISTANT') & (temp2.str.contains('|'.join(grant_answer), case=False) & (df['DA_partial_req'].shift(1) == 'partial_request') )), 'grant', '')                                     
df['DA_answer'] = np.where(((df['speaker'] == 'USER') & (temp2.str.contains('|'.join(grant_answer), case=False) & (df['DA_detail_req'].shift(1) == 'detail_request') )), 'answer', '')                                     


df['all_DA'] = df[['DA_rep_init', 'DA_greet','DA_req_sum', 'DA_conf', 'DA_receipt', 'DA_disconf', 'DA_closer', 'DA_comp_check', 'DA_hold', 'DA_partial_req', 'DA_detail_req', 'DA_grant', 'DA_answer']].agg(' '.join, axis=1)

df['all_DA'] = [word_tokenize(da) for da in df['all_DA']]

df.loc[:, ['speaker', 'new_text', 'all_DA']].to_csv('analysis_multiple_DA_cases_2nditeration.csv')
df.loc[:, ['speaker', 'new_text', 'all_DA']]
counts = df['all_DA'].value_counts()

counts.to_csv('count_overlaping_DAs_3iteration.csv')
data = {'counts':counts}
df_counts = pd.DataFrame(data)
df_counts.head()

Unnamed: 0,counts
[grant],367
[answer],332
"[sequence_closer, grant]",144
"[partial_request, grant]",109
"[detail_request, answer]",104


In [29]:
df['DA_answer'] = np.where(((df['speaker'] == 'USER') & (temp2.str.contains('|'.join(grant_answer), case=False) & (df['DA_detail_req'].shift(1) == 'detail_request') )), 'answer', '')                                     

# df.loc[df['column name'] condition, 'new column name'] = 'value if condition is met'
# df.loc[(df['speaker'] != df['speaker'].shift(1)) & (df['DA_indicators_sent'].shift(1) == 'repair_initiator'), 'teste_col2'] = 'SECOND'

## Compare binary variable against binary. 
## Kappa score
#### I think it makes no sense to compare the multiple labels together because if only one is different it gives a zero for all

The kappa statistic, which is a number between -1 and 1. The maximum value means complete agreement; zero or lower means chance agreement.

Conclusion: remove request summary, examples are not matching. Also I don't see any easy to implement rules


In [27]:
print(df.columns[20])
print(df.columns[20+14])

G_DA_grant
DA_grant


In [30]:
# df.iloc[:,[9,23]]

DA = ['repair_initiator','greeting','request_summary','confirmation','receipt','disconfirmation',
      'sequence_closer','completion_check','hold_request','partial_request', 'detail_request', 'grant', 'answer'] 
kappa = []
prec_rec_f1 = []
count_g = []
count_s = []

for e,i in enumerate(range(9,22)):
    gold = np.asarray([0 if val != DA[e] else 1 for val in df.iloc[:, i]])
    synt = np.asarray([0 if val != DA[e] else 1 for val in df.iloc[:, (i+14)]])
    kappa.append(sklearn.metrics.cohen_kappa_score(gold, synt))
    prec_rec_f1.append(precision_recall_fscore_support(gold, synt, average='macro', zero_division=0))
    unique_g, counts_g = np.unique(gold, return_counts=True)
    unique_s, counts_s = np.unique(synt, return_counts=True)
    count_g.append(counts_g[1])
    count_s.append(counts_s[1])

# print('kappa: ' + str(np.mean(kappa)))  
# kappa_scores = list(zip(DA,kappa))

kappa_scores = pd.DataFrame(kappa, index = DA, columns = ['Kappa'])
# print(kappa_scores)

metrics = pd.DataFrame(prec_rec_f1, index = DA, columns = ['Precision','Recall','F1_macro','Support'])
metrics = metrics.drop(columns = 'Support')
# metrics = metrics.insert(loc=0,column=count_g, value='Count_Gold')
metrics['Cohen\'s Kappa'] = kappa_scores
metrics['Count Gold'] = count_g
metrics['Count Synt'] = count_s
metrics


Unnamed: 0,Precision,Recall,F1_macro,Cohen's Kappa,Count Gold,Count Synt
repair_initiator,0.544113,0.854398,0.562575,0.148531,18,155
greeting,0.659978,0.929457,0.708778,0.436271,116,355
request_summary,0.496111,0.498778,0.497441,-0.003709,16,5
confirmation,0.879387,0.845984,0.860759,0.721892,515,451
receipt,0.99902,0.92,0.956031,0.91207,25,21
disconfirmation,0.569358,0.527592,0.538329,0.078622,33,13
sequence_closer,0.67896,0.823679,0.712,0.438217,226,472
completion_check,0.801643,0.92224,0.851506,0.703147,27,38
hold_request,0.965319,0.764651,0.835702,0.672071,66,37
partial_request,0.924467,0.771037,0.82292,0.649179,329,195


## Getting insights from k-means

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans
import numpy as np
import pandas as pd

vectorizer = TfidfVectorizer(stop_words='english')
X = vectorizer.fit_transform(df['new_text'])

true_k = 5
model = KMeans(n_clusters=true_k, init='k-means++', max_iter=100, n_init=1)
model.fit(X)
order_centroids = model.cluster_centers_.argsort()[:, ::-1]
terms = vectorizer.get_feature_names_out()

for i in range(true_k):
    print('Cluster %d:' % i),
    for ind in order_centroids[i, :5]:
        print(' %s' % terms[ind])

In [None]:
print('Prediction')
X = vectorizer.transform(['hi, can you help me?'])
predicted = model.predict(X)
print(predicted)