# **Cluster Regrouping and Downsampling - Binary Classifiers**

In [None]:
from IPython.display import HTML, display

# Inject a CSS rule so the notebook container spans the full browser width.
_full_width_css = "<style>.container { width:100% !important; }</style>"
display(HTML(_full_width_css))

In [None]:
# Iteration tag embedded in the names of the CSV files this notebook saves and reloads.
iteration_number = 4

# Importing Libraries

In [None]:
import pandas as pd
import numpy as np
import boto3
from io import StringIO, BytesIO
from sklearn.model_selection import train_test_split
import plotly.graph_objects as go
from statistics import mean
import matplotlib.pyplot as plt
from sklearn.utils import resample

# Defining Functions to Save and Load Files

In [None]:
# saving file
def saving_file(df, filepath, filename, bucket='intelligent-social-media-tracking'):
    """Upload a DataFrame to S3 as a CSV object.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to serialise. It is written with ``df.to_csv`` defaults, so the
        index is stored as the first CSV column.
    filepath : str
        Key prefix inside the bucket; concatenated with ``filename`` as-is,
        so it should end with ``/``.
    filename : str
        Object name appended to ``filepath``.
    bucket : str, optional
        Target S3 bucket (must already exist). Defaults to the project bucket.
    """
    csv_buffer = StringIO()
    df.to_csv(csv_buffer)
    s3_object = boto3.resource('s3').Object(bucket, filepath + filename)
    s3_object.put(Body=csv_buffer.getvalue())

# loading file
def loading_file(filepath, filename, bucket='intelligent-social-media-tracking'):
    """Read a CSV object from S3 into a DataFrame.

    Parameters
    ----------
    filepath : str
        Key prefix inside the bucket; concatenated with ``filename`` as-is,
        so it should end with ``/``.
    filename : str
        Object name appended to ``filepath``.
    bucket : str, optional
        Source S3 bucket. Defaults to the project bucket.

    Returns
    -------
    pandas.DataFrame
        The CSV contents without its first column. ``saving_file`` writes the
        DataFrame index as that first column, so dropping it restores the
        original columns with a fresh default index.
    """
    data_location = f's3://{bucket}/{filepath}{filename}'
    return pd.read_csv(data_location).iloc[:, 1:]

# Loading Datasets

In [None]:
# Hyperparameters identifying which clustering run is loaded below.
n_neighbors, min_dist, min_cluster_size = 100, 0.0, 300

# The same string serves as a column name and as the prefix of the cluster file names.
col_name = "n_neighbors_{}_min_dist_{}_min_cluster_{}".format(n_neighbors, min_dist, min_cluster_size)
hyperparam_combi = col_name

# Cluster/topic assignments for the chosen hyperparameter combination.
topic_cluster_df = loading_file(
    "data/embeddings_stopword_cluster_topic_v3_revolut/",
    f"{hyperparam_combi}_embeddings_cluster_topic_v3_rev.csv",
)

# Sentence embeddings that are later split into training and testing sets.
sent_embeddings_df = loading_file(
    "data/processed/word_embeddings/",
    "sentence_embeddings_large_rev_df.csv",
)

# Grouping Similar Clusters and Assigning Topic

In [None]:
# grouping similar clusters and assigning topic
# Maps each original cluster label to its regrouped cluster id (-1 marks noise).
# Labels missing from this mapping are also treated as noise (-1) below.
grouping_dict = {0:-1, 1:0, 2:1, 3:10, 4:-1, 5:2, 7:-1, 8:-1, 9:-1, 
                 11:3, 12:4, 13:5, 16:-1, 17:-1, 18:6, 19:7, 
                 21:6, 22:-1, 24:-1, 25:8, 26:-1, 27:-1, 29:9, 
                 40:8, 6:6, 28:6, 30:6, 31:10, 32:11, 14:6, 
                 33:6, 23:6, 34:6, 35:6, 15:12, 36:12, 20:13, 
                 37:13, 38:13, 10:7, 39:7, 43:7, 41:14, 42:7, 
                 44:15, 45:16, 46:17, 47:16}

# Human-readable topic name for each regrouped cluster id.
topic_dict = {-1:"noise", 0:"crypto", 1:"stocks", 2:"app interface", 3:"bank subscription plans", 
              4:"usage abroad", 5:"refunds", 6:"customer_service", 7:"account issues",
              8:"tedious paperwork (request of documents)", 
              9:"additional fees (e.g. card delivery and transaction fees)", 
              10:"business accounts", 11:"account verification", 12:"fraudulent transactions", 
              13:"online payments", 14:"frozen account and money holdout", 15:"exchange rates", 16:"transfers", 
              17:"atm withdrawal"}

# Key-sorted view of the mapping, kept so these names stay available to later cells.
grouping_dict_keys = sorted(grouping_dict)
grouping_dict_sorted = {key: grouping_dict[key] for key in grouping_dict_keys}

# assigning new cluster and topic
# A single vectorised lookup replaces one boolean-mask assignment per key;
# unmapped labels fall back to -1 / "" exactly as the previous defaults did.
topic_cluster_df["grouped_cluster"] = (
    topic_cluster_df["cluster"].map(grouping_dict).fillna(-1).astype("int64")
)
topic_cluster_df["grouped_cluster_topics"] = (
    topic_cluster_df["grouped_cluster"].map(topic_dict).fillna("")
)

# assigning onto word embeddings as well
# Column assignment aligns on the index, so a row-count mismatch would silently
# introduce NaN labels; fail loudly instead.
assert len(sent_embeddings_df) == len(topic_cluster_df), (
    "sent_embeddings_df and topic_cluster_df must have the same number of rows"
)
for column in ("cluster", "grouped_cluster", "grouped_cluster_topics"):
    sent_embeddings_df[column] = topic_cluster_df[column]

topic_cluster_df

Unnamed: 0,0,1,2,cluster,probability,unigram_topics_nmf,unigram_topics_lda,bigram_topics_nmf,bigram_topics_lda,grouped_cluster,grouped_cluster_topics
0,19.039290,-5.334114,33.775944,44,0.009648,"rates, exchange, currency, not, rate, exchange","exchange, not, currency, currencies, change, chf","exchange rates, good exchange, exchange rate, ...","currency exchange, foreign currency, multiple ...",15,exchange rates
1,9.437366,4.854146,3.957566,29,0.005139,"not, order, delivery, express, ordered, physical","not, financial, not, delivery, ordered, than","express delivery, free express, still not, not...","express delivery, customer service, new one, s...",9,additional fees (e.g. card delivery and transa...
2,-8.335799,6.184116,3.678786,35,0.095125,"service, customer, support, chat, not, customers","support, not, customer, service, support, not","customer service, worst customer, customer sup...","customer support, support chat, customer servi...",6,customer_service
3,22.169800,-7.420339,-13.171553,-1,0.000000,,,,,-1,noise
4,2.446999,-5.532766,34.115635,45,0.018998,"not, transfer, transfers, not, account, transf...","ibkr, transfer, transfer, transfers, transfer,...","not transfer, could not, cannot transfer, tran...","made transfer, transfer not, into account, can...",16,transfers
...,...,...,...,...,...,...,...,...,...,...,...
56941,15.044211,3.181443,33.518993,-1,0.000000,,,,,-1,noise
56942,-5.182962,11.406325,-8.464464,35,0.110180,"service, customer, support, chat, not, customers","support, not, customer, service, support, not","customer service, worst customer, customer sup...","customer support, support chat, customer servi...",6,customer_service
56943,15.300141,5.051132,-10.872703,31,0.012350,"business, elsewhere, company, like, not, about","business, not, business, up, company, like","business elsewhere, take business, bad busines...","run business, business model, bad business, no...",10,business accounts
56944,1.971823,-0.409367,-7.278151,34,0.009169,"days, waiting, reply, below, answer, not","days, not, reply, not, not, reply","copy paste, paste reply, reply below, update r...","not answer, not even, still not, live chat, co...",6,customer_service


In [None]:
# Upload the regrouped clusters for this iteration via saving_file.
regrouped_cluster_dir = "data/processed/model_development/regrouped_clusters/"
regrouped_cluster_file = f"{hyperparam_combi}_regrouped_cluster_iteration_{iteration_number}.csv"
saving_file(topic_cluster_df, regrouped_cluster_dir, regrouped_cluster_file)

In [None]:
# Reload the sentence-level clustering file and the regrouped cluster file from the bucket.
sent_cluster_df = loading_file('data/processed_stopword_revolut/', 'sentence_clustering_google_large_v3_rev_9.csv')
topic_cluster_df = loading_file(
    "data/processed/model_development/regrouped_clusters/",
    f"{hyperparam_combi}_regrouped_cluster_iteration_{iteration_number}.csv",
)

# Keep the text columns plus the column named after the chosen hyperparameter combination.
sentence_columns = ["text", "sentences", "cleaned_sentences", hyperparam_combi]
sent_cluster_df = sent_cluster_df.loc[:, sentence_columns]

# Append every topic_cluster_df column from position 5 onward to the sentences
# (presumably the topic columns and regrouped labels; verify against the file schema).
topic_columns_df = topic_cluster_df.iloc[:, 5:]
sent_cluster_df = pd.concat([sent_cluster_df, topic_columns_df], axis=1)

saving_file(
    sent_cluster_df,
    "data/processed/model_development/regrouped_clusters/",
    f"{hyperparam_combi}_sentence_regrouped_cluster_iteration_{iteration_number}.csv",
)

In [None]:
# Print each regrouped cluster id followed by its number of rows.
grouped_clusters = list(set(topic_cluster_df["grouped_cluster"]))

for cluster in grouped_clusters:
    in_cluster = topic_cluster_df.loc[:, "grouped_cluster"] == cluster
    cluster_size = topic_cluster_df.loc[in_cluster].shape[0]
    print(cluster, cluster_size)

0 647
1 302
2 900
3 798
4 648
5 440
6 10367
7 3386
8 816
9 883
10 648
11 711
12 1639
13 1846
14 1071
15 545
16 1613
17 372
-1 29314


# Visualising Newly Grouped Clusters

## Getting New Cluster Labels and Sizes

In [None]:
def _cluster_labels_and_sizes(frame):
    """Summarise the ``grouped_cluster`` column of ``frame``.

    Returns a 4-tuple: the labels cast to int, the unique labels, a list of
    ``(label, row count)`` pairs, and that list sorted by ascending row count.
    """
    labels = frame.loc[:, "grouped_cluster"].astype(int)
    unique_labels = np.unique(labels)
    sizes = [
        (label, len(frame[frame.loc[:, "grouped_cluster"] == label]))
        for label in unique_labels
    ]
    return labels, unique_labels, sizes, sorted(sizes, key=lambda pair: pair[1])


# Cluster labels and sizes for all rows (noise included).
(new_clusters,
 unique_new_clusters,
 new_cluster_size,
 sorted_new_cluster_size) = _cluster_labels_and_sizes(topic_cluster_df)

# Same summary after dropping the rows labelled as noise (-1).
is_not_noise = topic_cluster_df.loc[:, "grouped_cluster"] != -1
topic_cluster_nn_df = topic_cluster_df[is_not_noise].reset_index(drop=True)

(new_clusters_wo_noise,
 unique_new_clusters_wo_noise,
 new_cluster_size_wo_noise,
 sorted_new_cluster_size_wo_noise) = _cluster_labels_and_sizes(topic_cluster_nn_df)

## 3D Scatter Plot of Newly Grouped Clusters

In [None]:
# First three columns of the noise-free frame are used as the x, y and z coordinates.
embedding_umap_3d_large_nn_df = topic_cluster_nn_df.iloc[:, [0, 1, 2]]

# One hover label per point, naming the regrouped cluster it belongs to.
hover_labels = [f"cluster {cluster}" for cluster in new_clusters_wo_noise]

marker_style = dict(
    size=5,
    color=new_clusters_wo_noise,  # colour each point by its regrouped cluster id
    colorscale='hsv',
    opacity=0.1,
)

scatter_trace = go.Scatter3d(
    x=embedding_umap_3d_large_nn_df.iloc[:, 0],
    y=embedding_umap_3d_large_nn_df.iloc[:, 1],
    z=embedding_umap_3d_large_nn_df.iloc[:, 2],
    hovertemplate='<b>%{text}</b><extra></extra>',
    text=hover_labels,
    mode='markers',
    marker=marker_style,
)

fig = go.Figure(data=[scatter_trace])
fig.update_layout(title_text="Regrouped Clusters Visualisation", showlegend=False)
fig.show()

## Distribution of Cluster Sizes

In [None]:
# Bar-chart inputs, ordered by ascending cluster size (noise included):
# topic names on the y axis, sizes on the x axis, cluster ids as hover text.
sorted_y_labels = [topic_dict[cluster] for cluster, _size in sorted_new_cluster_size]
sorted_x_labels = [size for _cluster, size in sorted_new_cluster_size]
sorted_text_labels = [cluster for cluster, _size in sorted_new_cluster_size]

# plotting distribution of clusters
size_bars = go.Bar(
    x=sorted_x_labels,
    y=sorted_y_labels,
    hovertemplate='<b>%{text}</b><extra></extra>',
    text=["cluster {0}".format(cluster) for cluster in sorted_text_labels],
    orientation='h',
)
fig = go.Figure(data=[size_bars])

fig.update_layout(title_text="Distribution of Regrouped Clusters with Noise",
                  yaxis = dict(tickfont = dict(size=10)),
                  xaxis = dict(tickfont = dict(size=10)),
                  showlegend=False)

fig.show()

# Message spelling fixed ("Averge" -> "Average").
print("Average cluster size with noise = {}".format(mean([size for _cluster, size in new_cluster_size])))

Averge cluster size with noise = 2997.157894736842


In [None]:
# Bar-chart inputs, ordered by ascending cluster size (noise excluded):
# topic names on the y axis, sizes on the x axis, cluster ids as hover text.
sorted_y_labels = [topic_dict[cluster] for cluster, _size in sorted_new_cluster_size_wo_noise]
sorted_x_labels = [size for _cluster, size in sorted_new_cluster_size_wo_noise]
sorted_text_labels = [cluster for cluster, _size in sorted_new_cluster_size_wo_noise]

# Mean size of the non-noise clusters.
avg_cluster_size_wo_noise = mean(sorted_x_labels)

# plotting distribution of clusters
size_bars = go.Bar(
    x=sorted_x_labels,
    y=sorted_y_labels,
    hovertemplate='<b>%{text}</b><extra></extra>',
    text=["cluster {0}".format(cluster) for cluster in sorted_text_labels],
    orientation='h',
)
fig = go.Figure(data=[size_bars])

fig.update_layout(title_text="Distribution of Regrouped Clusters without Noise",
                  yaxis = dict(tickfont = dict(size=10)),
                  xaxis = dict(tickfont = dict(size=10)),
                  showlegend=False)

fig.show()

# Message spelling fixed ("Averge" -> "Average").
print("Average cluster size without noise = {}".format(avg_cluster_size_wo_noise))

Averge cluster size without noise = 1535.111111111111


# Testing and Training Split

In [None]:
# Hold out 10% of the sentence embeddings for testing; the remainder is the
# training (with validation) set. The fixed random_state keeps the split repeatable.
train_validation_df, test_df = train_test_split(
    sent_embeddings_df,
    test_size=0.1,
    random_state=42,
)

# Save both partitions, tagged with the current iteration number.
testing_dir = 'data/processed/model_development/testing/'
training_validation_dir = 'data/processed/model_development/training_validation/'

saving_file(test_df, testing_dir, f'testing_df_iteration_{iteration_number}.csv')
saving_file(
    train_validation_df,
    training_validation_dir,
    f'training_validation_df_iteration_{iteration_number}.csv',
)

# Down-sampling Noise

In [None]:
# down-sampling noise to average cluster size
# Split the training/validation rows into noise (-1) and everything else.
is_noise = train_validation_df.loc[:, "grouped_cluster"] == -1
train_validation_noise_df = train_validation_df.loc[is_noise, :] # noise
train_validation_wo_noise_df = train_validation_df.loc[~is_noise, :] # not noise

# Target size is the average non-noise cluster size. Read it from its own
# variable rather than `sorted_x_labels`, which the plotting cells overwrite.
# Clamp to the number of noise rows because sampling without replacement
# cannot return more rows than it is given.
n_noise_samples = min(int(avg_cluster_size_wo_noise), len(train_validation_noise_df))

# down-sampling noise
# replace=False: resample() draws with replacement by default, which would
# put duplicate rows into the "down-sampled" set.
# random_state: makes the selection reproducible across runs.
train_validation_noise_resample_df = resample(
    train_validation_noise_df,
    replace=False,
    n_samples=n_noise_samples,
    random_state=42,
)

# merging down-sampled noise with remaining dataset
# drop=False keeps the original row index as an "index" column.
train_validation_resampled_df = pd.concat(
    [train_validation_wo_noise_df, train_validation_noise_resample_df]
).reset_index(drop=False)

# saving file
saving_file(train_validation_resampled_df, 'data/processed/model_development/resampled/', f'training_validation_resampled_df_iteration_{iteration_number}.csv')
train_validation_resampled_df

Unnamed: 0,index,0,1,2,3,4,5,6,7,8,...,505,506,507,508,509,510,511,cluster,grouped_cluster,grouped_cluster_topics
0,1298,-0.055236,0.003184,0.016944,-0.018619,0.032963,0.066310,-0.038245,0.002651,-0.037947,...,0.019553,-0.047200,-0.007941,0.044846,-0.011087,0.014982,-0.013049,37,13,online payments
1,53839,-0.010980,0.039448,0.013964,-0.022383,0.026393,-0.002197,0.083509,0.084897,-0.009534,...,0.006511,-0.027311,0.021034,0.090066,-0.000342,0.038786,-0.018333,35,6,customer_service
2,14014,0.016320,0.066595,0.012773,-0.015111,0.038297,0.103423,0.025933,-0.026357,0.059830,...,0.034463,0.029737,0.001701,-0.043678,0.008659,-0.005676,-0.021454,1,0,crypto
3,27465,-0.082296,0.005805,-0.046215,0.028224,0.039155,0.021807,0.111080,0.011511,0.007329,...,0.072875,-0.061023,0.033751,-0.004414,-0.033622,-0.007759,0.026315,35,6,customer_service
4,11710,0.054779,0.078618,0.005299,-0.005383,-0.054041,-0.022985,-0.044879,-0.040718,-0.001417,...,0.010525,0.011141,0.073240,0.026256,-0.008068,-0.017691,0.055849,29,9,additional fees (e.g. card delivery and transa...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
26418,207,0.016489,0.060812,-0.008844,0.012060,0.039927,0.002299,0.030227,0.022362,0.010995,...,-0.048200,-0.035204,0.053990,-0.061621,-0.026316,-0.024627,-0.040891,22,-1,noise
26419,24034,0.047236,-0.020685,-0.010668,-0.020830,0.072601,-0.047231,0.025005,0.037194,-0.022323,...,0.006491,-0.052513,0.072273,-0.004314,-0.057944,-0.018869,-0.032514,-1,-1,noise
26420,4315,-0.023807,0.017712,-0.022030,-0.062909,-0.003362,0.041430,-0.001908,0.006628,-0.054618,...,-0.036946,0.012290,0.053044,0.015908,-0.012547,0.042582,0.019582,-1,-1,noise
26421,13721,0.022906,0.033055,0.056278,0.025466,0.054614,0.072051,-0.002255,0.028361,-0.053913,...,0.030514,-0.007280,0.042603,-0.045904,0.001590,0.048489,-0.023558,-1,-1,noise
