# Chat Intents

## Applying labels

In [41]:
import numpy as np
import pandas as pd

# Widen pandas' display limits so long chat messages and wide frames are shown
# without truncation.
pd.set_option("display.max_rows", 600)
pd.set_option("display.max_columns", 500)
# Fully qualified option name: abbreviated keys are resolved by pattern matching
# and can become ambiguous if pandas adds similarly named options.
pd.set_option("display.max_colwidth", 400)

import collections

import spacy

In [3]:
# Load the "en_core_web_sm" spaCy pipeline once; `nlp` is reused by later cells
# to parse individual messages.
nlp = spacy.load("en_core_web_sm")

In [56]:
# Load the pre-processed chat messages (path is relative to the notebook's folder).
# NOTE(review): the saved preview shows "Unnamed: 0" / "Unnamed: 0.1" columns, which
# look like index columns written out by earlier to_csv calls — confirm and drop upstream.
data_sample = pd.read_csv('../data/processed/data_sample.csv')
# Seed the preview so the same ten rows are shown on every Restart & Run All.
data_sample.sample(10, random_state=42)

Unnamed: 0.1,Unnamed: 0,text,category,cleaned_text
304,1778,Can you unblock my account? I entered the PIN wrong.,pin_blocked,unblock account entered pin wrong
722,1217,What are the restrictions on auto top-up?,automatic_top_up,restriction auto topup
157,1777,I need a new PIN number.,pin_blocked,need new pin number
705,6454,Is there a reason to why I was charged twice for a transaction?,transaction_charged_twice,reason charged twice transaction
135,4402,How many disposable cards per person?,disposable_card_limits,many disposable card per person
70,9194,It won't let me activate my card.,activate_my_card,wont let activate card
38,4549,Which ATM's can I use to make a withdrawal?,atm_support,atm use make withdrawal
131,5891,My card payment didn't go through.,declined_card_payment,card payment didnt go
858,3223,I refilled my account balance but got an error message.,top_up_reverted,refilled account balance got error message
943,4663,What is this direct debit in my statement?,direct_debit_payment_not_recognised,direct debit statement


In [58]:
# Keep only the messages of a single intent and renumber the rows from 0 so they
# can be addressed by position with .loc.
# (Use 'cash_withdrawal_charge' here to inspect that intent instead.)
is_selected = data_sample['category'] == 'disposable_card_limits'
example_category = data_sample.loc[is_selected].reset_index(drop=True)
example_category

Unnamed: 0.1,Unnamed: 0,text,category,cleaned_text
0,4438,I have a disposable virtual card. Is there a limit to how many times I can use it?,disposable_card_limits,disposable virtual card limit many time use
1,4418,How many disposable cards can you own?,disposable_card_limits,many disposable card
2,4402,How many disposable cards per person?,disposable_card_limits,many disposable card per person
3,4349,What is the most amount of disposable cards allowable?,disposable_card_limits,amount disposable card allowable
4,4342,"i tried using a virtual card for a hotel but it didn't work, why not",disposable_card_limits,tried using virtual card hotel didnt work
5,4424,What are the rules for using disposable virtual cards?,disposable_card_limits,rule using disposable virtual card
6,4448,What are my limits on disposable cards?,disposable_card_limits,limit disposable card
7,4374,I need several cards per day. And I need to be able to throw away all of the cards I make.,disposable_card_limits,need several card per day need able throw away card make
8,4409,Are there downsides to using a disposable virtual cards?,disposable_card_limits,downside using disposable virtual card


In [69]:
# Show spaCy's annotations for every token of the first message in the category:
# text, coarse POS, fine-grained tag, dependency label, and the stop-word flag.
first_message = example_category.loc[0, 'text']
example_doc = nlp(first_message)
for token in example_doc:
    annotations = (token.text, token.pos_, token.tag_, token.dep_, token.is_stop)
    print(*annotations)

I PRON PRP nsubj True
have VERB VBP ROOT True
a DET DT det True
disposable ADJ JJ amod False
virtual ADJ JJ amod False
card NOUN NN dobj False
. PUNCT . punct False
Is AUX VBZ ROOT True
there PRON EX expl True
a DET DT det True
limit NOUN NN attr False
to ADP IN prep True
how ADV WRB advmod True
many ADJ JJ amod True
times NOUN NNS npadvmod False
I PRON PRP nsubj True
can AUX MD aux True
use VERB VB pcomp False
it PRON PRP dobj True
? PUNCT . punct False


In [19]:
# Number of messages in the selected category.
# NOTE(review): the saved output (24) disagrees with the 9-row frame displayed for
# 'disposable_card_limits' — it appears to come from an earlier run on a different
# category. Re-run the notebook top to bottom to refresh it.
len(example_category)

24

In [54]:
def most_common(lst, n=3):
    """Return the ``n`` most frequent items of ``lst`` with their counts.

    Parameters
    ----------
    lst : iterable of hashable
        Items to count.
    n : int, default 3
        How many of the most frequent items to return.

    Returns
    -------
    list of (item, count) tuples, most frequent first. Items with equal counts
    keep the order in which they were first seen. An empty input gives ``[]``.
    """
    return collections.Counter(lst).most_common(n)

In [67]:
# Collect candidate label words for the selected category, bucketed by the role
# spaCy assigns to each non-stop-word token. A token lands in the first bucket
# that matches: sentence ROOT, then direct object, then any other noun.
# NOTE(review): ROOT tokens are not always verbs — the saved output lists 'cards'
# under `verbs`; consider also checking token.pos_ if true verbs are wanted.
verbs, dobjs, nouns = [], [], []

for text in example_category['text']:
    for token in nlp(text):
        if token.is_stop:
            # Stop words never contribute to any bucket.
            continue
        if token.dep_ == 'ROOT':
            verbs.append(token.text)
        elif token.dep_ == 'dobj':
            dobjs.append(token.text)
        elif token.pos_ == 'NOUN':
            nouns.append(token.text)

# Top terms per bucket, printed in the order: ROOT tokens, direct objects, nouns.
for bucket in (verbs, dobjs, nouns):
    print(most_common(bucket))

[('need', 2), ('cards', 1), ('tried', 1)]
[('cards', 4), ('card', 2)]
[('cards', 3), ('limit', 1), ('times', 1)]


In [34]:
# Full frequency table of the tokens collected in `verbs`.
# NOTE(review): the saved output contains words such as 'charged' that are absent
# from the top-3 printout for the current category, so it was presumably produced
# for a different category — re-run to refresh.
counter=collections.Counter(verbs)
print(counter)

Counter({'charged': 12, 'is': 4, 'charge': 2, 'took': 1, 'going': 1, 'seen': 1, 'charging': 1, 'went': 1, "'s": 1, 'was': 1, 'came': 1, 'have': 1, 'Is': 1, 'supposed': 1, 'had': 1, 'changed': 1, 'expect': 1, 'realize': 1, 'are': 1, 'cost': 1, 'Explain': 1})


In [38]:
# Full frequency table of the tokens collected in `dobjs`.
# This rebinds `counter`, replacing the table built from `verbs`.
# NOTE(review): the saved output ('fee', 'cash', ...) does not match the top-3
# direct objects printed for the current category — presumably a stale run.
counter=collections.Counter(dobjs)
print(counter)

Counter({'fee': 8, 'cash': 6, 'money': 3, 'ATM': 3, 'withdrawal': 1, 'charge': 1, 'fees': 1, 'groceries': 1, 'transaction': 1, 'look': 1, 'it': 1, 'me': 1, 'anything': 1})


In [51]:
# Two most frequent entries of whichever Counter was bound to `counter` last
# (the direct-object table when the cells are run in order).
counter.most_common(2)

[('fee', 8), ('cash', 6)]

In [None]:
# Build one document per topic by joining all of its messages with single spaces;
# the result has one row per topic with columns 'topic' and 'message'.
# NOTE(review): `intents_df` is not created in any visible cell — confirm where it comes from.
all_messages_per_topic = intents_df.groupby(['topic'], as_index=False).agg({'message': ' '.join})

In [None]:
def c_tf_idf(documents, m, ngram_range=(1,1)):
    """Compute class-based TF-IDF scores for a collection of per-class documents.

    Parameters
    ----------
    documents : sequence of str
        One string per class (e.g. all messages of a topic joined together).
    m : int
        Numerator of the IDF term; the caller passes the total number of
        original messages.
    ngram_range : tuple of (int, int), default (1, 1)
        Forwarded to ``CountVectorizer``.

    Returns
    -------
    tf_idf : numpy.ndarray of shape (n_terms, n_documents)
        ``tf * log(m / total_count_of_term)`` for every term/document pair.
    count : CountVectorizer
        The fitted vectorizer (needed to map row indices back to terms).

    Notes
    -----
    A document with no counted terms (e.g. only stop words) gets a term
    frequency of 0 instead of the NaN that 0/0 would yield; NaN scores would
    otherwise sort last in ``argsort`` and surface as "top" words downstream.
    """
    # NOTE(review): CountVectorizer is not imported in the visible cells — confirm
    # it is imported before this cell runs.
    count = CountVectorizer(ngram_range=ngram_range, stop_words='english').fit(documents)
    term_counts = count.transform(documents).toarray()      # (n_documents, n_terms)
    terms_per_doc = term_counts.sum(axis=1)                 # (n_documents,)

    # Term frequency per document; skip the division where a document is empty.
    tf = np.divide(
        term_counts.T,
        terms_per_doc,
        out=np.zeros(term_counts.T.shape, dtype=float),
        where=terms_per_doc != 0,
    )

    # Every vocabulary term occurs at least once in `documents` (the vocabulary
    # is fitted on them), so these totals are never zero.
    term_totals = term_counts.sum(axis=0)                   # (n_terms,)
    idf = np.log(np.divide(m, term_totals)).reshape(-1, 1)
    tf_idf = np.multiply(tf, idf)

    return tf_idf, count

def extract_top_n_words_per_topic(tf_idf, count, docs_per_topic, n=10):
    """Pick the ``n`` highest-scoring terms for every topic.

    Parameters
    ----------
    tf_idf : numpy.ndarray of shape (n_terms, n_topics)
        Scores as returned by ``c_tf_idf``.
    count : fitted vectorizer
        Must expose ``get_feature_names_out()`` (scikit-learn >= 1.0) or the
        older ``get_feature_names()``.
    docs_per_topic : pandas.DataFrame
        Has a ``topic`` column whose row order matches the columns of ``tf_idf``.
    n : int, default 10
        Number of terms to keep per topic.

    Returns
    -------
    top_n_words : dict
        ``{topic_label: [(term, score), ...]}``, best term first.
    top_n_words_only : dict
        ``{topic_label: [term, ...]}``, same order without the scores.
    """
    # get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
    # prefer the replacement and fall back for older installations.
    if hasattr(count, "get_feature_names_out"):
        words = list(count.get_feature_names_out())
    else:
        words = count.get_feature_names()

    labels = list(docs_per_topic.topic)
    scores_per_topic = tf_idf.T
    # argsort is ascending: take the last n columns, then flip to best-first.
    top_indices = scores_per_topic.argsort()[:, -n:][:, ::-1]

    top_n_words = {}
    top_n_words_only = {}
    for row, label in enumerate(labels):
        ranked = [(words[j], scores_per_topic[row][j]) for j in top_indices[row]]
        top_n_words[label] = ranked
        top_n_words_only[label] = [word for word, _ in ranked]

    return top_n_words, top_n_words_only

def extract_topic_sizes(df):
    """Count the messages in each topic, largest topic first.

    ``df`` needs ``topic`` and ``message`` columns. The result is a DataFrame
    with columns ``topic`` and ``size`` (non-null messages per topic), sorted
    by ``size`` in descending order.
    """
    messages_per_topic = df.groupby(['topic'])['message'].count()
    sizes = messages_per_topic.reset_index().rename(columns={'message': 'size'})
    return sizes.sort_values(by='size', ascending=False)

In [None]:
# Score every term against every per-topic document.
# NOTE(review): `m` is taken from `data_sample`, while the documents are built from
# `intents_df` — confirm both frames contain the same set of messages.
tf_idf, count = c_tf_idf(all_messages_per_topic.message.values, m = len(data_sample))

In [None]:
# Ten highest-scoring terms per topic, once with scores and once as plain words.
top_n_words, top_n_words_only = extract_top_n_words_per_topic(tf_idf, count, all_messages_per_topic, n=10)

In [None]:
# Scored top terms of the topic labelled 0 (`top_n_words` is a dict keyed by
# topic label, so this is a key lookup rather than positional indexing).
top_n_words[0]

In [None]:
# Scored top terms of the topic labelled -1. This is a dict key lookup, not
# "the last entry"; it raises KeyError if no topic carries the label -1.
top_n_words[-1]

In [None]:
# One row per topic: its label and the list of its top keywords.
df_keywords = pd.DataFrame({
    'topic': list(top_n_words_only.keys()),
    'keywords': list(top_n_words_only.values()),
})

In [None]:
# Topic sizes (largest first) with each topic's keyword list attached; the left
# join keeps every topic even if it has no keywords row.
extract_topic_sizes(intents_df).merge(df_keywords, how='left', on='topic')

In [None]:
# Spot-check: all messages assigned to topic 1.
intents_df[intents_df['topic']==1]

In [None]:
# Spot-check: all messages whose text contains 'card'.
intents_df[intents_df['message'].str.contains('card')]

In [None]:
# Project the embeddings to 2-D for plotting only. UMAP is stochastic, so the
# seed is fixed to make the figure reproducible across runs.
# NOTE(review): `umap`, `plt`, `use_embeddings` and `clusters` are not defined in
# the visible cells — confirm they exist before this cell runs.
umap_data = umap.UMAP(n_neighbors=15,
                      n_components=2,
                      min_dist=0.0,
                      metric='cosine',
                      random_state=42).fit_transform(use_embeddings)

result = pd.DataFrame(umap_data, columns=['x', 'y'])
result['labels'] = clusters.labels_

# Visualizing clusters: points labelled -1 are drawn in grey as outliers, all
# other points are coloured by their cluster label.
fig, ax = plt.subplots(figsize=(20, 10))
outliers = result.loc[result.labels == -1, :]
clustered = result.loc[result.labels != -1, :]
ax.scatter(outliers.x, outliers.y, color='#BDBDBD', s=0.8)
clustered_points = ax.scatter(clustered.x, clustered.y, c=clustered.labels, s=0.8, cmap='hsv_r')
# Pass the mappable explicitly: Axes.scatter does not register a "current image"
# for a bare colorbar call to pick up.
fig.colorbar(clustered_points, ax=ax, label='cluster label')
ax.set(title='UMAP projection of message embeddings by cluster',
       xlabel='UMAP dimension 1',
       ylabel='UMAP dimension 2');