# **Topic Modelling**

# Importing Libraries

In [None]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.decomposition import NMF, LatentDirichletAllocation
import tensorflow_hub as hub
import boto3
from io import StringIO, BytesIO

# Loading Clusters

In [None]:
# Where the sentence-cluster table lives: bucket name, object name and the
# full S3 URL assembled from the two.
bucket = 'intelligent-social-media-tracking'
data_key = 'sentence_clustering_google_large_v3_rev_9.csv'
data_location = 's3://{}/data/processed_stopword_revolut/{}'.format(bucket, data_key)

# Read the CSV straight from S3 and keep every column except the first one.
sent_cluster_df = (
    pd.read_csv(data_location)
    .iloc[:, 1:]
)
sent_cluster_df

Unnamed: 0,text,sentences,cleaned_sentences,n_neighbors_15_min_dist_0.0_min_cluster_100,n_neighbors_15_min_dist_0.0_min_cluster_100_prob,n_neighbors_15_min_dist_0.0_min_cluster_200,n_neighbors_15_min_dist_0.0_min_cluster_200_prob,n_neighbors_15_min_dist_0.0_min_cluster_300,n_neighbors_15_min_dist_0.0_min_cluster_300_prob,n_neighbors_15_min_dist_0.5_min_cluster_100,...,n_neighbors_100_min_dist_0.5_min_cluster_200,n_neighbors_100_min_dist_0.5_min_cluster_200_prob,n_neighbors_100_min_dist_0.5_min_cluster_300,n_neighbors_100_min_dist_0.5_min_cluster_300_prob,n_neighbors_100_min_dist_0.99_min_cluster_100,n_neighbors_100_min_dist_0.99_min_cluster_100_prob,n_neighbors_100_min_dist_0.99_min_cluster_200,n_neighbors_100_min_dist_0.99_min_cluster_200_prob,n_neighbors_100_min_dist_0.99_min_cluster_300,n_neighbors_100_min_dist_0.99_min_cluster_300_prob
0,Very good in basic stuff like currency exchang...,Very good in basic stuff like currency exchang...,very good basic stuff like currency exchange t...,82,0.004273,34,0.009041,19,0.013177,3,...,2,0.827574,2,0.872031,2,0.780664,2,0.830362,1,0.916072
1,Very good in basic stuff like currency exchang...,Unfortunately very bad experience with profess...,unfortunately very bad experience professional...,137,0.006922,51,0.017554,34,0.022492,3,...,2,0.605116,2,0.604105,2,0.530467,2,0.540015,1,0.762556
2,Very good in basic stuff like currency exchang...,Customer support is good just for giving infor...,customer support good just giving information ...,109,0.006453,46,0.014311,-1,0.000000,3,...,2,0.881210,2,0.876603,2,0.784240,2,0.737037,1,0.812933
3,Stay away from these scammers. I tried to make...,Stay away from these scammers.,stay away these scammers .,-1,0.000000,-1,0.000000,-1,0.000000,3,...,2,0.921861,2,0.952102,2,0.719522,2,0.724099,1,0.877297
4,Stay away from these scammers. I tried to make...,I tried to make a transfer from my US bank acc...,tried transfer account $500 .,101,0.007110,41,0.013634,25,0.025260,3,...,2,0.877575,2,0.938872,2,0.825841,2,0.807937,1,0.905500
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
56941,STILL AFTER 1 YEAR CUSTOMER SUPPORT AND LIVE C...,"\r,\r,REVOLUT MADE TWO HUGE MISTAKES IN THINK...",made two huge mistakes thinking customers not ...,-1,0.000000,11,0.015866,6,0.028961,3,...,2,0.612780,2,0.611814,2,0.592559,2,0.574579,1,0.790184
56942,STILL AFTER 1 YEAR CUSTOMER SUPPORT AND LIVE C...,"\r,\r,DISGUSTING WAY TO TREAT DECENT AND LOYAL...",disgusting way treat decent loyal customers .,36,0.006563,13,0.013230,8,0.017870,3,...,2,0.733750,2,0.729737,2,0.840894,2,0.802455,1,0.832382
56943,The business idea is super cool but the timing...,The business idea is super cool but the timing...,business idea super cool timing completely wro...,-1,0.000000,-1,0.000000,-1,0.000000,3,...,2,0.993478,2,0.993378,2,0.678243,2,0.590176,1,0.778348
56944,The business idea is super cool but the timing...,I have been waiting a month for the card and i...,waiting month not arrived yet .,91,0.016108,-1,0.000000,-1,0.000000,3,...,2,0.976607,2,0.850677,2,0.953993,2,0.940388,1,0.972362


# LDA and NMF - Defining Function

In [None]:
def topic_name_out(df, hyperparam_combi, cluster, n_g, text_col, no_topics, no_top_words = 2, no_features = 1000):
    """
    Fit an NMF and an LDA topic model on the sentences of one cluster and
    return the top terms of every topic as comma-separated strings.

    Parameters
    ----------
    df : pandas.DataFrame
        Table with one row per sentence; must contain the columns
        `hyperparam_combi` and `text_col`.
    hyperparam_combi : str
        Name of the column holding the cluster labels to filter on.
    cluster
        Cluster label; only rows where ``df[hyperparam_combi] == cluster``
        are used.
    n_g : int
        n-gram size, used as both bounds of ``ngram_range`` (1 = unigrams,
        2 = bigrams, ...).
    text_col : str
        Name of the column holding the text of each sentence.
    no_topics : int
        Number of topics (``n_components``) for both models.
    no_top_words : int, default 2
        Number of top terms taken from each topic.
    no_features : int, default 1000
        ``max_features`` passed to both vectorizers.

    Returns
    -------
    (nmf_r, lda_r) : tuple of str
        ``", "``-joined top terms of all topics, in topic order; the first
        string comes from NMF on TF-IDF features, the second from LDA on
        raw counts. A term may appear more than once if it ranks at the top
        of several topics.

    Raises
    ------
    ValueError
        Not caught here: scikit-learn raises it, for example, when no term
        survives the ``min_df`` / ``max_df`` pruning for the selected rows.
    """

    cluster_rows = df[df[hyperparam_combi] == cluster]
    # Missing sentences (NaN) are not valid documents for the vectorizers and
    # would make the whole cluster fail; fit on the sentences that do exist.
    documents = cluster_rows[text_col].dropna().values

    def feature_names_of(vectorizer):
        # get_feature_names() was removed in scikit-learn 1.2; its replacement
        # get_feature_names_out() exists since 1.0. Support both.
        if hasattr(vectorizer, "get_feature_names_out"):
            return vectorizer.get_feature_names_out()
        return vectorizer.get_feature_names()

    # Build our Topic Models - Using 2 different ones (LDA and NMF)

    # LDA - (Latent Dirichlet Allocation) works on raw term counts
    tfVectorizer = CountVectorizer(max_df=0.95,
                                   min_df=2,
                                   ngram_range = (n_g,n_g),
                                   max_features=no_features
                                  )
    tf = tfVectorizer.fit_transform(documents)
    tfFeatureNames = feature_names_of(tfVectorizer)

    # Run LDA - (Latent Dirichlet Allocation)
    lda = LatentDirichletAllocation(n_components=no_topics,
                                    max_iter=5,
                                    learning_method='online',
                                    learning_offset=50.,
                                    random_state=0).fit(tf)

    # NMF - (Non-negative Matrix factorization) works on TF-IDF weights
    tfidfVectorizer = TfidfVectorizer(max_df=0.95,
                                      min_df=2,
                                      ngram_range = (n_g,n_g),
                                      max_features=no_features)
    tfidf = tfidfVectorizer.fit_transform(documents)
    tfidfFeatureNames = feature_names_of(tfidfVectorizer)

    # Run NMF - (Non-negative Matrix factorization)
    nmf_alpha = .1
    nmf_params = dict(n_components=no_topics,
                      random_state=1,
                      l1_ratio=.5,
                      max_iter=400,
                      init='nndsvd')
    try:
        nmf_model = NMF(alpha=nmf_alpha, **nmf_params)
    except TypeError:
        # scikit-learn >= 1.2 dropped `alpha` in favour of alpha_W / alpha_H,
        # whose penalties are additionally multiplied by n_features and
        # n_samples respectively. Dividing by those factors reproduces the
        # unscaled penalty that `alpha` applied to both W and H.
        n_samples, n_features = tfidf.shape
        nmf_model = NMF(alpha_W=nmf_alpha / n_features,
                        alpha_H=nmf_alpha / n_samples,
                        **nmf_params)
    nmf = nmf_model.fit(tfidf)

    def display_topics(model, feature_names, no_top_words = no_top_words):
        # For every topic take the `no_top_words` highest-weighted terms
        # (argsort is ascending, hence the reversed slice from the end).
        top_words = []
        for topic in model.components_:
            top_indices = topic.argsort()[:-no_top_words - 1:-1]
            top_words.extend(feature_names[i] for i in top_indices)
        return ", ".join(top_words)

    nmf_r = display_topics(nmf, tfidfFeatureNames, no_top_words)
    lda_r = display_topics(lda, tfFeatureNames, no_top_words)
    return nmf_r, lda_r

# LDA and NMF - Unigram & Bi-gram

In [None]:
# declaring parameters for nmf and lda
no_topics = 3
text_col = "cleaned_sentences"

# declaring hyperparameter combinations to iterate over
n_neighbours_list = [15, 50, 100]
min_dist_list = [0.0, 0.5, 0.99]
min_cluster_size_list = [100, 200, 300]

# n-gram size -> prefix of the "<prefix>_topics_nmf" / "<prefix>_topics_lda"
# columns that receive the topic words for that n-gram size
ngram_col_prefixes = [(1, "unigram"), (2, "bigram")]

# bucket holding both the input embedding files and the output files
bucket = 'intelligent-social-media-tracking'
# a single S3 handle is reused for every upload below
s3_resource = boto3.resource('s3')

# iterating over hyperparameter combinations
for n_neighbors in n_neighbours_list:
    for min_dist in min_dist_list:

        # The embedding file name depends only on (n_neighbors, min_dist), so
        # it is read from the bucket once per pair instead of once per
        # min_cluster_size.
        data_key = "n_neighbors_{0}_min_dist_{1}_embedding_umap_v3_rev.csv".format(n_neighbors, min_dist)
        data_location = 's3://{}/data/embeddings_stopword_revolut/{}'.format(bucket, data_key)
        embedding_base_df = pd.read_csv(data_location).iloc[:, 1:]

        for min_cluster_size in min_cluster_size_list:

            # column names of this combination inside sent_cluster_df
            hyperparam_combi = "n_neighbors_{0}_min_dist_{1}_min_cluster_{2}".format(n_neighbors, min_dist, min_cluster_size)
            col_name_prob = hyperparam_combi + "_prob"

            # fresh copy so the columns added below never leak into the next
            # min_cluster_size iteration
            embedding_umap_3d_large_df = embedding_base_df.copy()

            # assigning cluster and probability to the embedding rows
            # (pandas aligns the two frames on their index)
            embedding_umap_3d_large_df.loc[:, "cluster"] = sent_cluster_df[hyperparam_combi]
            embedding_umap_3d_large_df.loc[:, "probability"] = sent_cluster_df[col_name_prob]
            for _, prefix in ngram_col_prefixes:
                embedding_umap_3d_large_df.loc[:, prefix + "_topics_nmf"] = ""
                embedding_umap_3d_large_df.loc[:, prefix + "_topics_lda"] = ""

            # getting top unigrams and top bi-grams for every cluster
            failed_fits = 0
            for cluster in np.unique(sent_cluster_df[hyperparam_combi]):
                # rows labelled -1 get no topics; their columns stay ""
                if cluster == -1:
                    continue

                # rows of this cluster; the "cluster" column is not modified
                # below, so the mask is valid for all four assignments
                cluster_mask = embedding_umap_3d_large_df.loc[:, "cluster"] == cluster

                for n_g, prefix in ngram_col_prefixes:
                    try:
                        (nmf_words, lda_words) = topic_name_out(df=sent_cluster_df,
                                                                hyperparam_combi=hyperparam_combi,
                                                                cluster=cluster,
                                                                n_g=n_g,
                                                                text_col=text_col,
                                                                no_topics=no_topics)
                    except ValueError:
                        # best effort: a cluster the models cannot be fitted
                        # on keeps empty topic strings, but is counted so the
                        # gap is visible in the output
                        nmf_words = ""
                        lda_words = ""
                        failed_fits += 1
                    embedding_umap_3d_large_df.loc[cluster_mask, prefix + "_topics_nmf"] = nmf_words
                    embedding_umap_3d_large_df.loc[cluster_mask, prefix + "_topics_lda"] = lda_words

            if failed_fits:
                print("Topic models could not be fitted for {0} (cluster, n-gram) pairs of {1}".format(failed_fits, hyperparam_combi))

            # saving embedding with cluster and topic columns onto bucket
            save_file_name = "n_neighbors_{0}_min_dist_{1}_min_cluster_{2}_embeddings_cluster_topic_v3_rev.csv".format(n_neighbors, min_dist, min_cluster_size)
            print("Done and saving file: {0}".format(save_file_name))
            csv_buffer = StringIO()
            embedding_umap_3d_large_df.to_csv(csv_buffer)
            s3_resource.Object(bucket, 'data/embeddings_stopword_cluster_topic_v3_revolut/{}'.format(save_file_name)).put(Body=csv_buffer.getvalue())



Done and saving file: n_neighbors_15_min_dist_0.0_min_cluster_100_embeddings_cluster_topic_v3_rev.csv
Done and saving file: n_neighbors_15_min_dist_0.0_min_cluster_200_embeddings_cluster_topic_v3_rev.csv
Done and saving file: n_neighbors_15_min_dist_0.0_min_cluster_300_embeddings_cluster_topic_v3_rev.csv
Done and saving file: n_neighbors_15_min_dist_0.5_min_cluster_100_embeddings_cluster_topic_v3_rev.csv
Done and saving file: n_neighbors_15_min_dist_0.5_min_cluster_200_embeddings_cluster_topic_v3_rev.csv
Done and saving file: n_neighbors_15_min_dist_0.5_min_cluster_300_embeddings_cluster_topic_v3_rev.csv
Done and saving file: n_neighbors_15_min_dist_0.99_min_cluster_100_embeddings_cluster_topic_v3_rev.csv




Done and saving file: n_neighbors_15_min_dist_0.99_min_cluster_200_embeddings_cluster_topic_v3_rev.csv
Done and saving file: n_neighbors_15_min_dist_0.99_min_cluster_300_embeddings_cluster_topic_v3_rev.csv
Done and saving file: n_neighbors_50_min_dist_0.0_min_cluster_100_embeddings_cluster_topic_v3_rev.csv
Done and saving file: n_neighbors_50_min_dist_0.0_min_cluster_200_embeddings_cluster_topic_v3_rev.csv




Done and saving file: n_neighbors_50_min_dist_0.0_min_cluster_300_embeddings_cluster_topic_v3_rev.csv
Done and saving file: n_neighbors_50_min_dist_0.5_min_cluster_100_embeddings_cluster_topic_v3_rev.csv
Done and saving file: n_neighbors_50_min_dist_0.5_min_cluster_200_embeddings_cluster_topic_v3_rev.csv
Done and saving file: n_neighbors_50_min_dist_0.5_min_cluster_300_embeddings_cluster_topic_v3_rev.csv
Done and saving file: n_neighbors_50_min_dist_0.99_min_cluster_100_embeddings_cluster_topic_v3_rev.csv
Done and saving file: n_neighbors_50_min_dist_0.99_min_cluster_200_embeddings_cluster_topic_v3_rev.csv
Done and saving file: n_neighbors_50_min_dist_0.99_min_cluster_300_embeddings_cluster_topic_v3_rev.csv
Done and saving file: n_neighbors_100_min_dist_0.0_min_cluster_100_embeddings_cluster_topic_v3_rev.csv




Done and saving file: n_neighbors_100_min_dist_0.0_min_cluster_200_embeddings_cluster_topic_v3_rev.csv




Done and saving file: n_neighbors_100_min_dist_0.0_min_cluster_300_embeddings_cluster_topic_v3_rev.csv
Done and saving file: n_neighbors_100_min_dist_0.5_min_cluster_100_embeddings_cluster_topic_v3_rev.csv
Done and saving file: n_neighbors_100_min_dist_0.5_min_cluster_200_embeddings_cluster_topic_v3_rev.csv
Done and saving file: n_neighbors_100_min_dist_0.5_min_cluster_300_embeddings_cluster_topic_v3_rev.csv
Done and saving file: n_neighbors_100_min_dist_0.99_min_cluster_100_embeddings_cluster_topic_v3_rev.csv
Done and saving file: n_neighbors_100_min_dist_0.99_min_cluster_200_embeddings_cluster_topic_v3_rev.csv
Done and saving file: n_neighbors_100_min_dist_0.99_min_cluster_300_embeddings_cluster_topic_v3_rev.csv
