# Chat Intents

## Applying labels

In [2]:
import numpy as np
import pandas as pd

# Widen pandas' display limits so long chat messages and wide tables are not
# truncated in the cell outputs below.
pd.set_option("display.max_rows", 600)
pd.set_option("display.max_columns", 500)
# Fully qualified option name: the bare "max_colwidth" shorthand only works while
# it matches exactly one registered option.
pd.set_option("display.max_colwidth", 400)

import collections

import spacy

In [3]:
# Load the spaCy "en_core_web_sm" pipeline once; the cells and helper functions below
# reuse this `nlp` object to read each token's lemma, part of speech, dependency
# relation and stop-word flag.
nlp = spacy.load("en_core_web_sm")

In [4]:
# Load the clustered sample, keeping only the message text and its cluster label.
data_clustered = pd.read_csv('../data/processed/sample_clustered.csv')[['text', 'label_st1']]
# Fixed seed so the preview shows the same rows on every run of the notebook.
data_clustered.sample(10, random_state=42)

Unnamed: 0,text,label_st1
222,Do I need to verify the top-up?,17
525,How do I activate my new card,47
159,I only got $20 of the $100 that I attempted to withdraw.,51
972,Was there a problem with topping up?,19
136,What are ways to test why my card might not be working?,89
825,I was overcharged an additional pound.,68
515,I need an ATM right now!,70
917,"My phone was stolen, help!",42
856,Is there a certain age I need to be?,2
123,Where can I get your exchange rates from?,54


In [5]:
# Pull out the messages of one cluster (label 6) as a worked example,
# re-indexed from 0 so rows can be addressed by position.
in_example_cluster = data_clustered['label_st1'] == 6
example_category = data_clustered.loc[in_example_cluster].reset_index(drop=True)
example_category

Unnamed: 0,text,label_st1
0,I want to have multiple currencies in my account if possible.,6
1,I need to know which fiat currencies I can use with you.,6
2,Can i add a new currency to my account?,6
3,what currencies do you accept?,6
4,How many currencies can I have?,6
5,Can you tell me the names of the fiat currencies that I can use at your institution?,6
6,What fiat currencies are used with holdings and exchanges?,6
7,Are there limitations to what currency I can hold?,6
8,What currencies can I use?,6
9,Which flat currencies do you support for holding and exchange?,6


In [7]:
# Parse one message of the example cluster and print, per token: text, lemma,
# coarse part of speech, fine-grained tag, dependency relation and stop-word flag.
# NOTE(review): position 11 is hard-coded, so the cluster must hold at least 12 messages.
example_doc = nlp(example_category['text'].iloc[11])

for token in example_doc:
    token_attrs = (token.text, token.lemma_, token.pos_, token.tag_, token.dep_, token.is_stop)
    print(*token_attrs)

Can can AUX MD aux True
you you PRON PRP nsubj True
tell tell VERB VB ROOT False
me I PRON PRP dobj True
what what PRON WP det True
currencies currency NOUN NNS dobj False
I I PRON PRP nsubj True
can can AUX MD aux True
use use VERB VB ccomp False
to to PART TO aux True
add add VERB VB xcomp False
money money NOUN NN dobj False
? ? PUNCT . punct False


In [12]:
from spacy import displacy
from pathlib import Path

In [11]:
# Render the dependency parse (style="dep") of the example sentence parsed above.
displacy.render(example_doc, style="dep")

In [17]:
# jupyter=False makes displacy return the markup as a string instead of displaying it.
fig = displacy.render(example_doc, style="dep", jupyter=False)
# Use just "dependency_plot.svg" to save next to the notebook instead.
output_path = Path("../images/dependency_plot.svg")
# write_text opens, writes and closes the file in one call (no dangling file handle)
# and returns the number of characters written.
output_path.write_text(fig, encoding="utf-8")

9689

## Helper functions

In [7]:
def get_group(df, category_col, category):
    """
    Return the rows of one category, re-indexed from 0.

    Arguments:
    df: dataframe to filter
    category_col: name of the column holding the category / cluster label
    category: value of category_col to keep
    """
    belongs_to_category = df[category_col] == category
    return df.loc[belongs_to_category].reset_index(drop=True)

In [8]:
def most_common(lst, n_words):
    """
    Return the n_words most frequent items of lst as (item, count) tuples,
    ordered from most to least frequent.
    """
    return collections.Counter(lst).most_common(n_words)

In [9]:
def extract_labels(category_docs, print_word_counts=False):
    """
    Build a short text label for a group of documents from their most common words.

    Every document is parsed with the module-level spaCy pipeline `nlp`. Each token
    that is not a stop word is counted in exactly one bucket, checked in this order:
    dependency ROOT (lower-cased token text), direct object 'dobj' (lemma),
    any other NOUN (lemma), any other ADJ (lemma).

    The label joins, with underscores, the most common root, the most common direct
    object and the two most common nouns. Nouns already chosen as root or direct
    object are skipped, and so is any part for which no token was found.
    Adjectives are only reported, never used in the label.

    Arguments:
    category_docs: list of documents, all from the same category or clustering
    print_word_counts: if True, print a Counter of each bucket
                       (roots, direct objects, nouns, adjectives)

    Returns:
    label: underscore-joined string, '' if no usable token was found
    """
    verbs = []
    dobjs = []
    nouns = []
    adjs = []

    # nlp.pipe parses the texts as a batch and yields one Doc per text, in order.
    for doc in nlp.pipe(category_docs):
        for token in doc:
            if token.is_stop:
                continue

            if token.dep_ == 'ROOT':
                verbs.append(token.text.lower())
            elif token.dep_ == 'dobj':
                dobjs.append(token.lemma_.lower())
            elif token.pos_ == 'NOUN':
                nouns.append(token.lemma_.lower())
            elif token.pos_ == 'ADJ':
                adjs.append(token.lemma_.lower())

    if print_word_counts:
        for word_lst in [verbs, dobjs, nouns, adjs]:
            print(collections.Counter(word_lst))

    top_verb = most_common(verbs, 1)
    top_dobj = most_common(dobjs, 1)
    top_nouns = most_common(nouns, 2)

    words = [
        top_verb[0][0] if top_verb else '',
        top_dobj[0][0] if top_dobj else '',
    ]

    for noun, _ in top_nouns:
        if noun not in words:
            words.append(noun)

    # Drop every missing part, not just the first one: with neither a root nor a
    # direct object a single removal would leave a leading underscore in the label.
    label = '_'.join(word for word in words if word)

    return label

In [10]:
def apply_and_summarize_labels(df, category_col):
    """
    Generate a text label for every category and summarize the category sizes.

    Arguments:
    df: dataframe with a 'text' column and the column named by category_col
    category_col: name of the column holding the numerical category / cluster label

    Returns:
    summary_df: one row per category with its 'count' of texts and generated
                'label', sorted by count in descending order
    """
    # numerical category -> label generated from the texts of that category
    label_dict = {
        category: extract_labels(list(get_group(df, category_col, category)['text']))
        for category in df[category_col].unique()
    }

    # number of texts per numerical category, largest first
    texts_per_category = df.groupby(category_col)['text'].count()
    summary_df = texts_per_category.reset_index()
    summary_df = summary_df.rename(columns={'text':'count'})
    summary_df = summary_df.sort_values('count', ascending=False)

    # look up the generated label of each row's category
    summary_df['label'] = summary_df.apply(lambda row: label_dict[row[category_col]], axis = 1)

    return summary_df

In [22]:
def combine_ground_truth(df_clusters, df_ground, key):
    """
    Left-join the ground truth onto the clustered data.

    Arguments:
    df_clusters: dataframe of clustered documents; every row is kept
    df_ground: dataframe with the ground truth columns to attach
    key: column name (or list of names) present in both dataframes to join on

    Returns:
    df_clusters with the columns of df_ground added; rows without a match get NaN.
    NOTE(review): a key value that occurs several times in df_ground repeats the
    matching df_clusters row once per occurrence.
    """
    return df_clusters.merge(df_ground, how='left', on=key)

In [31]:
def get_top_category(df_label, df_summary):
    """
    Add the most common ground truth category of each generated label to the summary.

    Arguments:
    df_label: dataframe with one row per document and the columns 'label'
              (generated label) and 'category' (ground truth category)
    df_summary: dataframe with the columns 'label' and 'count'

    Returns:
    df_result: df_summary with three added columns:
               'top_ground_category' - most frequent category among the label's documents
               'top_cat_count' - number of documents in that category
               'perc_top_cat' - top_cat_count as an integer percentage of 'count'

    NOTE(review): documents are grouped by 'label' only, so rows of df_summary that
    share the same generated label receive the same top category and count.
    """
    df_label_ground = (df_label.groupby('label')
                      .agg(top_ground_category=('category', lambda x: x.value_counts().index[0]),
                           # value_counts() is indexed by category name, so the largest
                           # count has to be taken by position, not with the key 0
                           top_cat_count=('category', lambda x: x.value_counts().iloc[0]))
                      .reset_index())

    df_result = pd.merge(df_summary, df_label_ground, on='label', how='left')

    share = 100 * df_result['top_cat_count'] / df_result['count']
    df_result['perc_top_cat'] = share.round().astype(int)

    return df_result

### Without ground truth labels

In [32]:
# Try the label extraction on a single cluster (65) and print the word counts behind the label.
example_category = get_group(data_clustered, 'label_st1', 65)['text'].tolist()
extract_labels(example_category, print_word_counts=True)

Counter({'tried': 4, 'pending': 3, 'taking': 2, 'ask': 1, 'finish': 1, 'need': 1, 'stays': 1, 'waiting': 1, 'receive': 1, 'fix': 1, 'think': 1, 'received': 1, 'saw': 1, 'arrived': 1, 'completed': 1, 'showing': 1, 'appear': 1, 'notice': 1, 'says': 1, 'saying': 1, 'happens': 1, 'noticing': 1, 'shopping': 1, 'going': 1, 'moved': 1})
Counter({'payment': 9, 'money': 2, 'help': 1, 'frame': 1, 'purchase': 1, 'error': 1, 'message': 1})
Counter({'payment': 16, 'card': 8, 'transaction': 5, 'time': 3, 'account': 3, 'friend': 3, 'couple': 2, 'day': 2, 'yesterday': 2, 'today': 2, 'progress': 1, 'merchant': 1, 'thing': 1, 'purchase': 1, 'morning': 1, 'money': 1, 'company': 1, 'store': 1, 'internet': 1, 'issue': 1})
Counter({'long': 3, 'wrong': 2, 'strange': 1, 'embarrassing': 1})


'tried_payment_card'

In [33]:
# Generate a text label for every cluster and list the clusters by size, largest first.
cluster_summary = apply_and_summarize_labels(data_clustered, 'label_st1')
cluster_summary.head()

Unnamed: 0,label_st1,count,label
0,-1,63,help_card_account
23,22,47,help_refund_account
66,65,32,tried_payment_card
67,66,29,deposited_money_account_cash
53,52,26,charged_fee_withdrawal_cash


In [34]:
# Attach each cluster's generated label to its individual messages.
labeled_clusters = data_clustered.merge(
    cluster_summary[['label_st1', 'label']],
    how='left',
    on='label_st1',
)
labeled_clusters.head()

Unnamed: 0,text,label_st1,label
0,I'm worried my card might be lost in the mail? How long does it usually take to arrive?,48,expect_card_week
1,I got charged a fee that shouldn't be there from my cash,52,charged_fee_withdrawal_cash
2,Do you charge for making a withdrawal? I took some money out of my account earlier and I was charged for this.,52,charged_fee_withdrawal_cash
3,Is there an issue with my account? I don't see a cheque deposit that I made yesterday. Please assist.,66,deposited_money_account_cash
4,Are there ways for other people to send me money?,39,receive_money_charge_fee


### With ground truth labels

In [35]:
# Load the ground truth sample, keeping the message text and its category.
data_ground = pd.read_csv('../data/processed/data_sample.csv').loc[:, ['text', 'category']]
data_ground.head()

Unnamed: 0,text,category
0,I'm worried my card might be lost in the mail? How long does it usually take to arrive?,card_delivery_estimate
1,I got charged a fee that shouldn't be there from my cash,cash_withdrawal_charge
2,Do you charge for making a withdrawal? I took some money out of my account earlier and I was charged for this.,cash_withdrawal_charge
3,Is there an issue with my account? I don't see a cheque deposit that I made yesterday. Please assist.,balance_not_updated_after_cheque_or_cash_deposit
4,Are there ways for other people to send me money?,receiving_money


In [36]:
# Attach the ground truth category to every clustered message, matching on the text.
# A 'category' column left over from an earlier run of this cell is dropped first:
# merging a second time would otherwise yield 'category_x' / 'category_y' and break
# the cells below that expect a single 'category' column.
labeled_clusters = combine_ground_truth(
    labeled_clusters.drop(columns=['category'], errors='ignore'),
    data_ground,
    'text',
)
labeled_clusters.head()

Unnamed: 0,text,label_st1,label,category
0,I'm worried my card might be lost in the mail? How long does it usually take to arrive?,48,expect_card_week,card_delivery_estimate
1,I got charged a fee that shouldn't be there from my cash,52,charged_fee_withdrawal_cash,cash_withdrawal_charge
2,Do you charge for making a withdrawal? I took some money out of my account earlier and I was charged for this.,52,charged_fee_withdrawal_cash,cash_withdrawal_charge
3,Is there an issue with my account? I don't see a cheque deposit that I made yesterday. Please assist.,66,deposited_money_account_cash,balance_not_updated_after_cheque_or_cash_deposit
4,Are there ways for other people to send me money?,39,receive_money_charge_fee,receiving_money


In [37]:
# Compare the generated label of cluster 65 with the ground truth category of its messages.
labeled_clusters.loc[labeled_clusters['label_st1'] == 65]

Unnamed: 0,text,label_st1,label,category
41,There is a payment that i made that i noticed has not gone thru yet. Can i ask why?,65,tried_payment_card,pending_card_payment
106,What is the pending payment on my card about?,65,tried_payment_card,pending_card_payment
112,When will my pending payment finish?,65,tried_payment_card,pending_card_payment
138,My card payment has been pending for a long time now. Why is it taking so long to take. It should have already made it through.,65,tried_payment_card,pending_card_payment
152,I have a payment that hasn't went through yet.,65,tried_payment_card,pending_card_payment
163,I need some help figuring out what this strange payment is on my account. It's stays pending and won't go away.,65,tried_payment_card,extra_charge_on_statement
166,"Is there something wrong with my card payment? It's been pending a really, really long time.",65,tried_payment_card,pending_card_payment
240,How long does it actually take for a card payment to go through? There's one stuck since a couple days already that doesn't seem to move,65,tried_payment_card,pending_card_payment
248,I am waiting for a payment to be processed,65,tried_payment_card,pending_card_payment
259,Why is this payment pending?,65,tried_payment_card,pending_card_payment


#### Name, count and share of the most common ground truth category for each generated label

In [38]:
# For each row of the cluster summary, show the most frequent ground truth category
# among the messages carrying its label, the number of such messages, and that
# number as a percentage of the cluster's message count.
get_top_category(labeled_clusters, cluster_summary)

Unnamed: 0,label_st1,count,label,top_ground_category,top_cat_count,perc_top_cat
0,-1,63,help_card_account,card_about_to_expire,6,10
1,22,47,help_refund_account,Refund_not_showing_up,24,51
2,65,32,tried_payment_card,pending_card_payment,19,59
3,66,29,deposited_money_account_cash,balance_not_updated_after_cheque_or_cash_deposit,17,59
4,52,26,charged_fee_withdrawal_cash,cash_withdrawal_charge,23,88
5,50,22,cancelled_payment_card,reverted_card_payment?,13,59
6,58,20,waiting_time_transfer,pending_transfer,8,40
7,54,20,charged_rate_exchange_currency,exchange_rate,9,45
8,56,19,need_pin_bank,change_pin,11,58
9,20,19,like_account_service,terminate_account,13,68
