# Importing Libraries

In [13]:
import pandas as pd
import spacy
import scipy
import csv
import os
import numpy as np
import tensorflow_hub as hub
import uuid
import boto3
from io import StringIO, BytesIO
from tqdm import tqdm
import multiprocessing
import time
import umap

# Defining Functions to Save and Load Files

In [2]:
from saving_loading_functions.saving_file import saving_file
from saving_loading_functions.saving_file_json import saving_file_json
from saving_loading_functions.loading_file import loading_file
from saving_loading_functions.loading_file_json import loading_file_json

# Declaring Filepaths

In [None]:
# declaring filepaths
tokenised_sentences_filepath = 'data/processed/tokenised_sentences/'
word_embedding_filepath = 'data/processed/word_embeddings/'
umap_filepath = 'data/processed/dimensionality_reduced_word_embeddings/'
hdbscan_filepath = 'data/processed/clustered_embeddings/'
relative_validity_filepath = 'data/processed/relative_validity/'

# 1) Word Embeddings over Tokenised Sentences

In [None]:
# loading embedding model
module_url_large = "https://tfhub.dev/google/universal-sentence-encoder-large/5"
model_large = hub.load(module_url_large)
print ("module %s loaded" % module_url_large)

def embed_large(input):
    return model_large(input)

# declaring filepath to tokenised sentences
tokenised_sentences_filepath = 'data/processed/tokenised_sentences/'

# declaring bank names
bank_names = [*bank names*]

for bank in bank_names:
    
    # loading tokenised sentences
    df_poor = loading_file(tokenised_sentences_filepath, f'{bank}_tokenized_sentences_df.csv', 1)
    
    # getting cleaned sentences
    clean_text_list = [sent for sent in df_poor.loc[:, 'cleaned_sentences']]
    
    # embedding
    sentence_embeddings_large = embed_large(clean_text_list)
    sentence_embeddings_large_df = pd.DataFrame(sentence_embeddings_large.numpy())
    
    saving_file(sentence_embeddings_large_df, word_embedding_filepath, f'sentence_embeddings_large_{bank}_df.csv')

module https://tfhub.dev/google/universal-sentence-encoder-large/5 loaded


# 2) Dimensionality Reduction and Clustering

In [14]:
bank = "revolut"

# loading sentences and embeddings
sentence_embeddings_large_df = loading_file(word_embedding_filepath, f'sentence_embeddings_large_{bank}_df.csv', 1)
df_poor = loading_file(tokenised_sentences_filepath, f'{bank}_tokenized_sentences_df.csv', 1)

# getting text, sentences and cleaned sentences
text_list = [sent for sent in df_poor.loc[:, 'text']]
sent_list = [sent for sent in df_poor.loc[:, 'sentences']]
clean_text_list = [sent for sent in df_poor.loc[:, 'cleaned_sentences']]

# create new df 
sent_cluster_df = pd.DataFrame({"text": text_list, "sentences": sent_list, "cleaned_sentences": clean_text_list})

# dimensionality reduction and clustering
def task(n_neighbours_list, min_dist_list, process):

    min_cluster_size_list = [100, 200, 300]

    min_samples = 1
    leaf_size = 20

    len_n_neighbours = len(n_neighbours_list)
    len_min_dist = len(min_dist_list)
    len_min_cluster = len(min_cluster_size_list)

    file_names = []
    for i in range(len_n_neighbours):
        for j in range(len_min_dist):
            
            # declaring hyperparameter
            n_neighbors = n_neighbours_list[i]
            min_dist = min_dist_list[j]
            col_name = f"n_neighbors_{n_neighbors}_min_dist_{min_dist}"
            file_name = col_name + f"_embedding_umap_{bank}.csv"

            print(f"Iterating over:\nN-neighbors = {n_neighbors}\nMin Dist = {min_dist}")

            # dimensionality reduction via UMAP
            reducer = umap.UMAP(n_neighbors=n_neighbors, n_components=3, min_dist=min_dist, metric='cosine', 
                                random_state=42, negative_sample_rate=300)
            embedding_umap_3d_large = reducer.fit_transform(sentence_embeddings_large_df)
            embedding_umap_3d_large_df = pd.DataFrame(embedding_umap_3d_large)
            
            # saving reduced embeddings
            saving_file(embedding_umap_3d_large_df, umap_filepath, file_name)

            file_names.append(file_name)

    relative_validity_df = pd.DataFrame({"Evaluation Metric": ["Relative Validity (DBCV)"]})
    for file in tqdm(file_names):

        # to account for any memory leaks
        from hdbscan import HDBSCAN, prediction

        # loading file from bucket
        embedding_umap_3d_large_df = loading_file(umap_filepath, file, 1)

        # iterate over each min_cluster_size
        for k in range(len_min_cluster):
            min_cluster_size = min_cluster_size_list[k]

            print(f"Iterating over:\nMin Cluster Size = {min_cluster_size}")

            # clustering via HDBSCAN
            clusterer = HDBSCAN(min_cluster_size=min_cluster_size, min_samples=min_samples, leaf_size=leaf_size,
                                prediction_data=True, gen_min_span_tree=True).fit(embedding_umap_3d_large_df)

            
            col_name = file[:-len(f"_embedding_umap_{bank}.csv")] + "_min_cluster_{}".format(min_cluster_size)
            col_name_prob = col_name + "_prob"

            # writing cluster onto dataframe
            sent_cluster_df[col_name] = clusterer.labels_

            # writing cluster probability onto df
            cluster_prob = prediction.all_points_membership_vectors(clusterer)
            sent_cluster_df[col_name_prob] = 0
            for i in range(len(sent_cluster_df)):
                entry_cluster = sent_cluster_df.loc[i, col_name]
                if entry_cluster != -1:
                    sent_cluster_df.loc[i, col_name_prob] = cluster_prob[i][entry_cluster]

            # saving sentence-cluster file onto bucket
            saving_file(sent_cluster_df, hdbscan_filepath, f'sentence_clustering_{bank}_{process}.csv')
            
            # writing relative validity onto dataframe        
            relative_validity_df[col_name] = clusterer.relative_validity_

            # saving relative validity file onto bucket
            saving_file(relative_validity_df, relative_validity_filepath, f'relative_validity_{bank}_{process}.csv')

# declaring hyperparameters
n_neighbours_list = [15, 50, 100]
min_dist_list = [0.0, 0.5, 0.99]

# track process number
i = 0

if __name__ == "__main__": 
    start_time = time.perf_counter()
    processes = []

    for n_neighbours in n_neighbours_list:
        for min_dist in min_dist_list:
            i += 1
            p = multiprocessing.Process(target = task([n_neighbours], [min_dist], i))
            p.start()
            processes.append(p)
    
    for p in processes:
        p.join()
    
    finish_time = time.perf_counter()
 
    print(f"Program finished in {finish_time-start_time} seconds")   

Iterating over:
N-neighbors = 15
Min Dist = 0.0


  0%|          | 0/1 [00:00<?, ?it/s]

Iterating over:
Min Cluster Size = 100
Iterating over:
Min Cluster Size = 200
Iterating over:
Min Cluster Size = 300


100%|██████████| 1/1 [02:28<00:00, 148.10s/it]


Iterating over:
N-neighbors = 15
Min Dist = 0.5


  0%|          | 0/1 [00:00<?, ?it/s]

Iterating over:
Min Cluster Size = 100
Iterating over:
Min Cluster Size = 200
Iterating over:
Min Cluster Size = 300


100%|██████████| 1/1 [01:46<00:00, 106.47s/it]


Iterating over:
N-neighbors = 15
Min Dist = 0.99


  0%|          | 0/1 [00:00<?, ?it/s]

Iterating over:
Min Cluster Size = 100
Iterating over:
Min Cluster Size = 200
Iterating over:
Min Cluster Size = 300


100%|██████████| 1/1 [01:36<00:00, 96.54s/it]


Iterating over:
N-neighbors = 50
Min Dist = 0.0


  0%|          | 0/1 [00:00<?, ?it/s]

Iterating over:
Min Cluster Size = 100
Iterating over:
Min Cluster Size = 200
Iterating over:
Min Cluster Size = 300


100%|██████████| 1/1 [02:36<00:00, 157.00s/it]


Iterating over:
N-neighbors = 50
Min Dist = 0.5


  0%|          | 0/1 [00:00<?, ?it/s]

Iterating over:
Min Cluster Size = 100
Iterating over:
Min Cluster Size = 200
Iterating over:
Min Cluster Size = 300


100%|██████████| 1/1 [01:40<00:00, 100.30s/it]


Iterating over:
N-neighbors = 50
Min Dist = 0.99


  0%|          | 0/1 [00:00<?, ?it/s]

Iterating over:
Min Cluster Size = 100
Iterating over:
Min Cluster Size = 200
Iterating over:
Min Cluster Size = 300


100%|██████████| 1/1 [01:42<00:00, 102.18s/it]


Iterating over:
N-neighbors = 100
Min Dist = 0.0


  0%|          | 0/1 [00:00<?, ?it/s]

Iterating over:
Min Cluster Size = 100
Iterating over:
Min Cluster Size = 200
Iterating over:
Min Cluster Size = 300


100%|██████████| 1/1 [03:01<00:00, 181.42s/it]


Iterating over:
N-neighbors = 100
Min Dist = 0.5


  0%|          | 0/1 [00:00<?, ?it/s]

Iterating over:
Min Cluster Size = 100
Iterating over:
Min Cluster Size = 200
Iterating over:
Min Cluster Size = 300


100%|██████████| 1/1 [01:48<00:00, 108.11s/it]


Iterating over:
N-neighbors = 100
Min Dist = 0.99


  0%|          | 0/1 [00:00<?, ?it/s]

Iterating over:
Min Cluster Size = 100
Iterating over:
Min Cluster Size = 200
Iterating over:
Min Cluster Size = 300


100%|██████████| 1/1 [01:46<00:00, 106.11s/it]


Program finished in 37656.239334967 seconds
