# Model Results

This notebook runs KMeans clustering over a range of k-values and bias amounts, then saves the per-run results and the average Silhouette score per k-value for further analysis.

## Load Data

In [1]:
# Mount Google Drive at /content/drive inside the Colab runtime.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
cd drive/MyDrive/ML_Trending_Topics/code

/content/drive/MyDrive/ML_Trending_Topics/code


In [3]:
import warnings
# Suppress all warnings for the rest of the session.
# NOTE(review): this also hides any warnings raised by the clustering code — confirm that is intended.
warnings.filterwarnings('ignore')

In [4]:
import numpy as np
import pandas as pd
from Biased_Clusters import get_silhouette

In [5]:
# Read the cleaned dataset from disk into a DataFrame.
df = pd.read_csv(
    '../extracted_files/data_cleaned.csv',
)

In [6]:
import json
import scipy.sparse

# load training data (NumPy .npy array file)
x_vector = np.load('../extracted_files/x_vector.npy')

# load terms sparse matrix (SciPy .npz sparse format)
terms_sparse_matrix = scipy.sparse.load_npz('../extracted_files/terms_sparse_matrix.npz')

# load terms label; the file is parsed as JSON despite its .txt extension.
# The encoding is stated explicitly so decoding does not depend on the platform default.
with open("../extracted_files/terms_label.txt", "r", encoding="utf-8") as fp:
    terms_label = json.load(fp)

## Run KMeans Models on Various k-values and Bias Amounts

In [7]:
def build_result_data(df, x_vector, n_clusters, max_range=1000, step=10):
    """Run the model over a sweep of bias amounts and tabulate the results.

    Bias amounts are generated as ``i * 0.01`` for every ``i`` in
    ``range(1, max_range, step)`` (0.01, 0.11, 0.21, ... with the defaults).
    Each bias amount is passed to ``get_silhouette`` together with ``df``,
    ``x_vector`` and ``n_clusters``, and whatever it returns is collected.

    Parameters
    ----------
    df, x_vector
        Forwarded unchanged to ``get_silhouette``.
    n_clusters : int
        Number of clusters (k), forwarded to ``get_silhouette``.
    max_range : int, default 1000
        Exclusive upper bound of the integer sweep; the largest bias amount
        tried is therefore below ``max_range * 0.01``.
    step : int, default 10
        Stride of the integer sweep (0.10 in bias-amount units by default).

    Returns
    -------
    pandas.DataFrame
        Built from the collected ``get_silhouette`` results, in sweep order
        (presumably one row per successful run — confirm against
        ``get_silhouette``). Empty when every run was skipped.

    Notes
    -----
    A bias amount for which ``get_silhouette`` raises ``ValueError`` is
    skipped so that the rest of the sweep still completes. The skips are
    reported with ``print`` rather than ``warnings.warn`` because this
    notebook silences all warnings.
    """
    data = []
    skipped = []   # (bias amount, error) for every run that raised ValueError
    for i in tqdm(range(1, max_range, step)):
        bias = i * .01
        try:
            data.append(get_silhouette(df, x_vector, bias, n_clusters))
        except ValueError as err:
            skipped.append((bias, err))

    if skipped:
        first_bias, first_err = skipped[0]
        print(
            f"n_clusters={n_clusters}: skipped {len(skipped)} bias amount(s) "
            f"that raised ValueError (first: bias={first_bias:.2f}: {first_err})"
        )

    # create a data frame of result
    return pd.DataFrame(data)

In [8]:
from tqdm import tqdm

# accumulators, filled in lockstep: position i in each list refers to the same k-value
result_list = []          # a list of result data frames, one per k-value
k_values = []             # a list of k-values
avg_sil_scores = []       # a list of average Silhouette score per k-value

# run KMeans model on 26 different k-values (5 through 30 inclusive)
for n_clusters in range(5, 31):
    # get model result and save to a list
    # (max_range=2000 sweeps bias amounts 0.01, 0.11, ..., 19.91: 200 runs per k)
    df_result = build_result_data(df, x_vector, n_clusters, 2000)
    result_list.append(df_result)

    # compute average Silhouette score for each k value
    k_values.append(n_clusters)
    if 'Silhouette Score' in df_result.columns:
        avg_sil_scores.append(df_result['Silhouette Score'].mean())
    else:
        # every run for this k was skipped, so the frame has no columns;
        # record NaN instead of aborting the whole sweep with a KeyError
        avg_sil_scores.append(float('nan'))

100%|██████████| 200/200 [03:53<00:00,  1.17s/it]
100%|██████████| 200/200 [03:55<00:00,  1.18s/it]
100%|██████████| 200/200 [03:54<00:00,  1.17s/it]
100%|██████████| 200/200 [03:53<00:00,  1.17s/it]
100%|██████████| 200/200 [04:09<00:00,  1.25s/it]
100%|██████████| 200/200 [04:13<00:00,  1.27s/it]
100%|██████████| 200/200 [04:36<00:00,  1.38s/it]
100%|██████████| 200/200 [04:27<00:00,  1.34s/it]
100%|██████████| 200/200 [04:36<00:00,  1.38s/it]
100%|██████████| 200/200 [04:42<00:00,  1.41s/it]
100%|██████████| 200/200 [04:46<00:00,  1.43s/it]
100%|██████████| 200/200 [04:51<00:00,  1.46s/it]
100%|██████████| 200/200 [05:01<00:00,  1.51s/it]
100%|██████████| 200/200 [05:09<00:00,  1.55s/it]
100%|██████████| 200/200 [05:17<00:00,  1.59s/it]
100%|██████████| 200/200 [05:09<00:00,  1.55s/it]
100%|██████████| 200/200 [05:18<00:00,  1.59s/it]
100%|██████████| 200/200 [05:26<00:00,  1.63s/it]
100%|██████████| 200/200 [05:37<00:00,  1.69s/it]
100%|██████████| 200/200 [05:31<00:00,  1.66s/it]


## Save Results

In [9]:
import os

# save the result data frames to csv files, one file per k-value
results_dir = '../extracted_files/results'
os.makedirs(results_dir, exist_ok=True)   # to_csv does not create missing directories

# pair each data frame with the k-value recorded for it instead of re-deriving
# the list position from a hard-coded offset, so a changed or partial sweep still saves correctly
for n_clusters, df_result in zip(k_values, result_list):
    df_result.to_csv(results_dir + '/results_' + str(n_clusters) + '.csv', index=False)

In [10]:
# Tabulate the average Silhouette score for each k-value and write it to csv.
df_avg_sil = pd.DataFrame({
    'Number of Topics': k_values,
    'Average Silhouette Score': avg_sil_scores,
})
df_avg_sil.to_csv('../extracted_files/avg_sil_per_k.csv', index=False)