# Chat Intents

## Applying labels

In [1]:
import numpy as np
import pandas as pd

# Widen pandas' display limits so long chat messages and wide tables are shown in full.
# Full option names are used throughout; abbreviated patterns such as "max_colwidth"
# only work while they match exactly one registered option.
pd.set_option("display.max_rows", 600)
pd.set_option("display.max_columns", 500)
pd.set_option("display.max_colwidth", 400)

import collections
from sklearn.feature_extraction.text import CountVectorizer

import spacy

In [10]:
# Load the spaCy pipeline once at notebook level; extract_labels() parses every document with `nlp`.
nlp = spacy.load("en_core_web_sm")

In [11]:
# Read the message sample; the label_* columns are presumably cluster ids from
# different clustering runs — TODO confirm against the notebook that wrote this file.
data_sample = pd.read_csv('../data/processed/sample_clustered.csv')
#data_sample = pd.read_csv('../data/processed/data_sample.csv')
# Preview 10 random rows (no random_state is passed, so the rows differ between runs).
data_sample.sample(10)

Unnamed: 0.1,Unnamed: 0,text,category,label_use,label_st1,label_st2,label_st3
540,540,my cash balance less than what I thought,wrong_amount_of_cash_received,-1,66,84,61
783,783,I want a refund for my item,request_refund,23,22,28,17
351,351,I just got my card. It's not linked yet--how do I do that?,card_linking,33,46,-1,54
216,216,Can I use this app for any currency?,supported_cards_and_currencies,45,38,44,15
501,501,I need to cancel a transfer I made.,cancel_transfer,15,15,55,22
815,815,I know I'm getting a new card but would like know when I can expect to receive it.,card_arrival,41,48,50,25
57,57,I got charged a fee for transferring money.,transfer_fee_charged,34,27,67,14
615,615,I wanted to know why there is a transfer of mine pending.,pending_transfer,28,58,59,74
195,195,How old do you need to be to have an account?,age_limit,12,2,5,2
80,80,I might need a new card because it's not working at any of the ATMs. I don't think ive used all my money. Why is it doing this?,declined_cash_withdrawal,69,78,73,52


In [52]:
#example_category = data_sample[data_sample['category']=='cash_withdrawal_charge'].reset_index(drop=True)
# Inspect every message whose label_st1 value is 66, re-indexed from 0.
example_category = data_sample[data_sample['label_st1']==66].reset_index(drop=True)
example_category 

Unnamed: 0.1,Unnamed: 0,text,category,label_use,label_st1,label_st2,label_st3
0,3,Is there an issue with my account? I don't see a cheque deposit that I made yesterday. Please assist.,balance_not_updated_after_cheque_or_cash_deposit,62,66,84,53
1,107,My account says I have money but I withdrew it in cash.,pending_cash_withdrawal,-1,66,-1,-1
2,133,I do not see the money I transferred into this account,balance_not_updated_after_bank_transfer,41,66,84,70
3,154,My balance was not updated with my checque or cash deposit.,balance_not_updated_after_cheque_or_cash_deposit,-1,66,84,53
4,178,I did a cash deposit to my account but it doesn't show up,balance_not_updated_after_cheque_or_cash_deposit,61,66,84,70
5,265,"After I deposited my cash, I still don't see it in my account.",balance_not_updated_after_cheque_or_cash_deposit,61,66,84,70
6,297,There is a withdrawal on my account I didn't make.,cash_withdrawal_not_recognised,-1,66,86,45
7,333,I just deposited cash to my account and can't find it!,balance_not_updated_after_cheque_or_cash_deposit,61,66,84,70
8,437,There is unexpected money in my account.,cash_withdrawal_not_recognised,-1,66,-1,70
9,438,I tried updating my balance by cheque yesterday but it doesn't seem to be working. Shouldn't that be faster? Please check my account something has gone wrong there.,balance_not_updated_after_cheque_or_cash_deposit,62,66,84,53


In [167]:
# Number of rows currently held in `example_category`.
len(example_category)

23

In [160]:
# Distinct values of the `category` column.
data_sample['category'].unique()

array(['card_delivery_estimate', 'cash_withdrawal_charge',
       'balance_not_updated_after_cheque_or_cash_deposit',
       'receiving_money', 'edit_personal_details', 'card_arrival',
       'card_acceptance', 'fiat_currency_support',
       'declined_cash_withdrawal', 'country_support',
       'topping_up_by_card', 'wrong_exchange_rate_for_cash_withdrawal',
       'Refund_not_showing_up', 'getting_virtual_card',
       'visa_or_mastercard', 'balance_not_updated_after_bank_transfer',
       'automatic_top_up', 'age_limit', 'transfer_fee_charged',
       'card_not_working', 'lost_or_stolen_phone',
       'verify_source_of_funds', 'card_about_to_expire', 'verify_top_up',
       'top_up_reverted', 'pending_top_up', 'top_up_failed',
       'card_payment_fee_charged', 'atm_support',
       'pending_cash_withdrawal', 'pending_card_payment',
       'cancel_transfer', 'top_up_by_cash_or_cheque',
       'getting_spare_card', 'request_refund',
       'direct_debit_payment_not_recognised', 'disp

## Helper functions

In [5]:
def get_group(df, category_col, category):
    """
    Select the rows of one category.

    Arguments:
    df: DataFrame to filter
    category_col: name of the column holding the category / cluster value
    category: value that `category_col` must equal

    Returns:
    DataFrame with the matching rows and a fresh 0-based index
    """
    is_in_category = df[category_col] == category
    return df[is_in_category].reset_index(drop=True)

In [3]:
def most_common(lst, n_words):
    """Return the `n_words` most frequent items of `lst` as (item, count) pairs, most frequent first."""
    return collections.Counter(lst).most_common(n_words)

In [94]:
def extract_labels(category_docs, print_word_counts=False):
    """
    Build a short text label for a group of documents from their most common words.

    Every document is parsed with the notebook-level spaCy pipeline `nlp`.
    Each non-stop-word token is counted in at most one bucket, checked in this
    order: dependency ROOT (surface text), direct object (lemma), other nouns
    (lemma), adjectives (lemma). The label is the most common root, the most
    common direct object and the two most common nouns joined by underscores.
    Nouns already present in the label are not repeated, and components that
    could not be found are skipped so the label never contains leading,
    trailing or doubled underscores.

    Arguments:
    category_docs: iterable of document strings, all from the same category or clustering
    print_word_counts: if True, print the Counter of each bucket
        (roots, direct objects, nouns, adjectives) before the label is built

    Returns:
    label: str such as 'tell_currency_fiat'; '' if no usable token was found
    """
    verbs = []
    dobjs = []
    nouns = []
    adjs = []

    # Iterate over the documents themselves rather than by position, so any
    # iterable of strings works (e.g. a pandas Series with a non-default index).
    for text in category_docs:
        for token in nlp(text):
            if token.is_stop:
                continue

            if token.dep_ == 'ROOT':
                # the sentence root is kept as written, not lemmatized
                verbs.append(token.text)
            elif token.dep_ == 'dobj':
                dobjs.append(token.lemma_)
            elif token.pos_ == 'NOUN':
                nouns.append(token.lemma_)
            elif token.pos_ == 'ADJ':
                # adjectives are only reported in the word counts, never used in the label
                adjs.append(token.lemma_)

    verb_counts = collections.Counter(verbs)
    dobj_counts = collections.Counter(dobjs)
    noun_counts = collections.Counter(nouns)
    adj_counts = collections.Counter(adjs)

    if print_word_counts:
        for counter in (verb_counts, dobj_counts, noun_counts, adj_counts):
            print(counter)

    top_verb = [word for word, _ in verb_counts.most_common(1)]
    top_dobj = [word for word, _ in dobj_counts.most_common(1)]
    top_nouns = [word for word, _ in noun_counts.most_common(2)]

    # Root and direct object always lead the label when they exist.
    words = [word for word in top_verb + top_dobj if word]

    # Nouns are appended only if they add a new word.
    for word in top_nouns:
        if word and word not in words:
            words.append(word)

    return '_'.join(words)

In [92]:
def apply_and_summarize_labels(df, category_col):
    """
    Summarize each group of `df` with its size and a generated text label.

    Arguments:
    df: DataFrame with a 'text' column and the grouping column `category_col`
    category_col: name of the column holding the category / cluster value

    Returns:
    DataFrame with one row per value of `category_col` and the columns
    `category_col`, 'count' (non-null 'text' entries in the group) and
    'label' (result of extract_labels on the group's texts), sorted by
    'count' in descending order
    """
    texts_by_category = df.groupby(category_col)['text']

    # create dictionary of the category value to the generated label;
    # a single groupby pass replaces re-filtering the whole frame per value
    label_dict = {
        category: extract_labels(list(texts))
        for category, texts in texts_by_category
    }

    # create summary dataframe of category values and counts
    summary_df = (texts_by_category.count()
                    .reset_index()
                    .rename(columns={'text': 'count'})
                    .sort_values('count', ascending=False))

    # apply generated labels; Series.map is a vectorized lookup and, unlike a
    # row-wise apply, still yields a Series when the summary has no rows
    summary_df['label'] = summary_df[category_col].map(label_dict)

    return summary_df

In [96]:
# Generated labels and group sizes for the label_st1 clustering.
df1 = apply_and_summarize_labels(data_sample, 'label_st1')
# Group sizes joined with the keyword lists.
# NOTE(review): `extract_topic_sizes` and `df_keywords` are only defined further down in
# this notebook, so this cell raises NameError on a fresh top-to-bottom run.
df2 = extract_topic_sizes(data_sample).merge(df_keywords, how='left', on='label_st1')

# Show generated labels next to the keywords for each label_st1 value.
df1.merge(df2, on='label_st1')

Unnamed: 0,label_st1,count,label,size,keywords
0,-1,63,need_card_account,63,"(card, payment, rate, account)"
1,22,47,help_refund_account,47,"(refund, statement, ago, bought)"
2,65,32,pending_payment_card,32,"(payment, pending, long, hasn)"
3,66,29,deposited_money_account_cash,29,"(cash, deposit, deposited, balance)"
4,52,26,charged_fee_withdrawal_cash,26,"(cash, charged, fee, withdrawal)"
5,50,22,cancelled_payment_card,22,"(payment, cancelled, reverted, did)"
6,58,20,waiting_time_transfer,20,"(pending, transfer, time, long)"
7,54,20,charged_rate_exchange_currency,20,"(exchange, rates, rate, currency)"
8,56,19,need_pin_bank,19,"(pin, change, reset, bank)"
9,20,19,like_account_service,19,"(account, delete, service, like)"


In [78]:
# Parse the second message of the label_st1 == 6 group.
example_doc = nlp(list(get_group(data_sample, 'label_st1', 6)['text'])[1])

# Print, per token: text, lemma, coarse POS, fine-grained tag, dependency relation, stop-word flag.
for token in example_doc:
    print(token.text, token.lemma_, token.pos_, token.tag_, token.dep_ , token.is_stop)

I I PRON PRP nsubj True
need need VERB VBP ROOT False
to to PART TO aux True
know know VERB VB xcomp False
which which DET WDT det True
fiat fiat NOUN NN compound False
currencies currency NOUN NNS dobj False
I I PRON PRP nsubj True
can can AUX MD aux True
use use VERB VB ccomp False
with with ADP IN prep True
you you PRON PRP pobj True
. . PUNCT . punct False


In [79]:
# Generate the label for the label_st1 == 6 group and print the word counts it is based on.
example_category = list(get_group(data_sample, 'label_st1', 6)['text'])
extract_labels(example_category, True)

Counter({'tell': 2, 'use': 2, 'want': 1, 'need': 1, 'add': 1, 'accept': 1, 'support': 1})
Counter({'currency': 10, 'money': 2, 'name': 1})
Counter({'fiat': 3, 'currency': 3, 'account': 2, 'holding': 2, 'exchange': 2, 'institution': 1, 'limitation': 1})
Counter({'multiple': 2, 'flat': 2, 'possible': 1, 'new': 1, 'okay': 1})


'tell_currency_fiat'

In [168]:
# NOTE(review): the saved output of this cell differs from the previous cell's, so
# `example_category` was presumably reassigned in a cell that is no longer shown — confirm.
extract_labels(example_category, True)

Counter({'need': 7, 'cancel': 7, 'help': 3, 'realise': 1, 'realize': 1, 'try': 1, 'want': 1, 'let': 1})
Counter({'transfer': 8, 'transaction': 6, 'payment': 4, 'rent': 1, 'help': 1, 'error': 1, 'transcation': 1, 'mistake': 1})
Counter({'account': 9, 'tomorrow': 3, 'number': 3, 'payment': 3, 'yesterday': 3, 'transaction': 2, 'morning': 2, 'rent': 2, 'typo': 1, 'app': 1, 'correct?i': 1})
Counter({'wrong': 6, 'possible': 3, 'right': 2, 'able': 2, 'large': 1, 'important': 1, 'incorrect': 1, 'recent': 1})


'need_transfer_account_tomorrow'

In [100]:
# NOTE(review): the saved output shows word counts although print_word_counts is not
# passed here; it was presumably produced by an earlier version of extract_labels — re-run to confirm.
extract_labels(example_category)

Counter({'need': 2, 'cards': 1, 'tried': 1, 'work': 1})
Counter({'cards': 4, 'card': 2})
Counter({'card': 6})
Counter({'cards': 3, 'limit': 1, 'times': 1, 'person': 1, 'hotel': 1, 'rules': 1, 'limits': 1, 'day': 1, 'downsides': 1})


'need_cards_limit'

In [34]:
# NOTE(review): in the visible notebook `verbs` exists only as a local variable inside
# extract_labels; this cell presumably relied on an earlier scratch cell — confirm or remove.
counter=collections.Counter(verbs)
print(counter)

Counter({'charged': 12, 'is': 4, 'charge': 2, 'took': 1, 'going': 1, 'seen': 1, 'charging': 1, 'went': 1, "'s": 1, 'was': 1, 'came': 1, 'have': 1, 'Is': 1, 'supposed': 1, 'had': 1, 'changed': 1, 'expect': 1, 'realize': 1, 'are': 1, 'cost': 1, 'Explain': 1})


In [38]:
# NOTE(review): in the visible notebook `dobjs` exists only as a local variable inside
# extract_labels; this cell presumably relied on an earlier scratch cell — confirm or remove.
counter=collections.Counter(dobjs)
print(counter)

Counter({'fee': 8, 'cash': 6, 'money': 3, 'ATM': 3, 'withdrawal': 1, 'charge': 1, 'fees': 1, 'groceries': 1, 'transaction': 1, 'look': 1, 'it': 1, 'me': 1, 'anything': 1})


In [51]:
# Two most frequent entries of whichever `counter` was assigned last.
counter.most_common(2)

[('fee', 8), ('cash', 6)]

### Apply labels to each category

In [24]:
#all_messages_per_topic = intents_df.groupby(['topic'], as_index=False).agg({'message': ' '.join})
# Concatenate all messages of each label_st1 value into one space-separated document.
all_messages_per_topic = data_sample.groupby(['label_st1'], as_index=False).agg({'text': ' '.join})

In [28]:
def c_tf_idf(documents, m, ngram_range=(1,1)):
    """
    Score every vocabulary term against every document.

    A CountVectorizer (English stop words removed) is fitted on `documents`.
    The term frequency is a term's count in a document divided by the total
    number of counted terms in that document; it is multiplied by
    log(m / total count of the term across all documents).

    Arguments:
    documents: iterable of text documents
    m: numerator of the log ratio (the notebook passes the number of individual messages)
    ngram_range: (min_n, max_n) n-gram sizes forwarded to CountVectorizer

    Returns:
    tf_idf: array of shape (n_terms, n_documents)
    count: the fitted CountVectorizer
    """
    vectorizer = CountVectorizer(ngram_range=ngram_range, stop_words='english')
    count = vectorizer.fit(documents)

    # rows are documents, columns are vocabulary terms
    term_counts = count.transform(documents).toarray()
    terms_per_document = term_counts.sum(axis=1)
    occurrences_per_term = term_counts.sum(axis=0)

    tf = term_counts.T / terms_per_document
    idf = np.log(m / occurrences_per_term).reshape(-1, 1)

    return tf * idf, count

def extract_top_n_words_per_topic(tf_idf, count, docs_per_topic, n=10, label_col='label_st1'):
    """
    Pick the `n` highest-scoring words for every topic.

    Arguments:
    tf_idf: array of shape (n_terms, n_topics), as returned by c_tf_idf
    count: fitted vectorizer exposing get_feature_names_out() or get_feature_names()
    docs_per_topic: DataFrame with one row per topic, in the same order as the
        columns of `tf_idf`
    n: number of words to keep per topic
    label_col: column of `docs_per_topic` holding the topic labels

    Returns:
    top_n_words: dict mapping topic label -> list of (word, score), best first
    top_n_words_only: dict mapping topic label -> list of words, best first
    """
    # get_feature_names() was removed in scikit-learn 1.2; prefer its
    # replacement and fall back to the old method on older versions.
    if hasattr(count, 'get_feature_names_out'):
        words = list(count.get_feature_names_out())
    else:
        words = count.get_feature_names()

    labels = list(docs_per_topic[label_col])
    tf_idf_transposed = tf_idf.T

    # argsort is ascending, so the last n columns hold the best-scoring terms
    indices = tf_idf_transposed.argsort()[:, -n:]

    top_n_words = {}
    top_n_words_only = {}
    for i, label in enumerate(labels):
        best_first = indices[i][::-1]
        top_n_words[label] = [(words[j], tf_idf_transposed[i][j]) for j in best_first]
        top_n_words_only[label] = [words[j] for j in best_first]

    return top_n_words, top_n_words_only

def extract_topic_sizes(df, category_col='label_st1'):
    """
    Count the documents in every topic.

    Arguments:
    df: DataFrame with a 'text' column and the grouping column `category_col`
    category_col: name of the column holding the topic / cluster value

    Returns:
    DataFrame with the columns `category_col` and 'size' (non-null 'text'
    entries per topic), sorted by 'size' in descending order
    """
    topic_sizes = (df.groupby([category_col])['text'].count()
                   .reset_index()
                   .rename({'text': 'size'}, axis='columns')
                   .sort_values('size', ascending=False))
    return topic_sizes

In [29]:
# Score terms per label_st1 document; m is the number of individual messages in the sample.
tf_idf, count = c_tf_idf(all_messages_per_topic.text.values, m = len(data_sample))

In [30]:
# Keep the 4 highest-scoring words for every label_st1 value.
top_n_words, top_n_words_only = extract_top_n_words_per_topic(tf_idf, count, all_messages_per_topic, n=4)

In [32]:
# (word, score) pairs for label_st1 == 64, best first.
top_n_words[64]

[('help', 0.2627787347371944),
 ('withdrawal', 0.257877478268671),
 ('500', 0.21236388145624024),
 ('account', 0.19543826497894093)]

In [21]:
#top_n_words[-1]

In [35]:
# One row per label_st1 value with its list of top words.
# Both columns come from the same dict, so keys() and values() line up row by row.
df_keywords = pd.DataFrame()
df_keywords['label_st1'] = top_n_words_only.keys()
df_keywords['keywords'] = top_n_words_only.values()

In [36]:
# Group sizes (largest first) with their keywords; the left join keeps every label_st1 value.
extract_topic_sizes(data_sample).merge(df_keywords, how='left', on='label_st1')

Unnamed: 0,label_st1,size,keywords
0,-1,63,"(card, payment, rate, account)"
1,22,47,"(refund, statement, ago, bought)"
2,65,32,"(payment, pending, long, hasn)"
3,66,29,"(cash, deposit, deposited, balance)"
4,52,26,"(cash, charged, fee, withdrawal)"
5,50,22,"(payment, cancelled, reverted, did)"
6,58,20,"(pending, transfer, time, long)"
7,54,20,"(exchange, rates, rate, currency)"
8,56,19,"(pin, change, reset, bank)"
9,20,19,"(account, delete, service, like)"


In [None]:
# NOTE(review): `intents_df` is never assigned in the visible notebook and this cell has no
# execution count; it presumably belongs to an earlier draft — define `intents_df` or remove the cell.
intents_df[intents_df['message'].str.contains('card')]