In [None]:
import pandas as pd
import pickle 
import numpy as np
import os
import sys
sys.path.append('/home/stankeaa/wikiwho_inheritance/')
from sequencers.clusterers import kmeans, dbscan, token_similarity, lda
import ipywidgets as widgets
from IPython.display import display, clear_output
import re
from sequencers import tsne

In [None]:
dirs = []
for file in os.listdir( sys.path[-1] ):
    if re.match("^[0-9_-]*$", file):
        dirs.append(file)

## Clusterer
This notebook clusters WikiWho chobs.
It requires an **existing pickle** with chobs already vectorized (i.e. with **features** added). <br>
The following clustering methods are available:
- K-means (parameter: *random state*)
- DBSCAN (parameters: *eps* and the *minimum number of samples in one cluster*)
- Token similarity (parameters: *intersection*, i.e. the minimum number of tokens that must be the same in the context, and *token type*)
- LDA (the *number of topics* is set directly in the clustering cell)

In [None]:
# Article directory to work on. Starts unselected, so picking any entry
# fires a value-change event.
a = widgets.Dropdown(
    options=dirs,
    value=None,
    description='Article id:',
)

# Vectorizer pickle of the chosen article; a_on_change fills in the options.
v = widgets.Dropdown(
    options=[],
    value=None,
    description='Vectorizer:',
)

# Clustering method, written as "<module>.<Class>".
# Starts unselected like the two dropdowns above. A Dropdown without an
# explicit value preselects its first option, and choosing that preselected
# entry fires no change event, so c_on_change would never set `clusterer`
# nor show the K-means "Random state" input.
c = widgets.Dropdown(
    options=['kmeans.Kmeans', 'dbscan.DBscan', 'token_similarity.Token_Similarity', 'lda.LDA'],
    value=None,
    description='Clusterer:',
)

# Passed to every clusterer as 'min_samples'.
min_s = widgets.IntText(
    value=5,
    description='Min samples:',
)

# Generic integer parameter; c_on_change labels it "Random state" (K-means)
# or "Intersection" (token similarity) and sets a default.
p1 = widgets.IntText(
    value=None,
)

# Generic float parameter; c_on_change labels it "Eps" for DBscan.
p2 = widgets.FloatText(
    value=None,
)

# Token representation, shown only for token similarity.
p3 = widgets.Dropdown(
    options=['id', 'string'],
    description='Token type:',
)

# Random state handed to the t-SNE step.
ts = widgets.IntText(
    value=42,
    description='TSNE random state:',
)

def a_on_change(change):
    """Handle a new article selection.

    Fills the vectorizer dropdown ``v`` with the entries of
    ``<root>/<article>/vectorizers`` and displays the remaining controls.
    ``<root>`` is the last ``sys.path`` entry.

    Parameters
    ----------
    change : dict
        Change notification passed by ``observe``; only notifications with
        ``type == 'change'`` and ``name == 'value'`` are acted upon.
    """
    if change['type'] == 'change' and change['name'] == 'value':
        # os.path.join works whether or not the root ends with a separator;
        # plain string concatenation silently produced a wrong path otherwise.
        vectorizers_dir = os.path.join(sys.path[-1], change['new'], 'vectorizers')
        # os.listdir order is arbitrary; sort so the dropdown is stable.
        v.options = sorted(os.listdir(vectorizers_dir))
        display(v, c, min_s, ts)
        
def v_on_change(change):
    """Store the newly selected vectorizer file name in the global ``vectorizer``.

    Notifications that are not a ``'change'`` of the ``'value'`` trait are ignored.
    """
    global vectorizer
    is_value_change = change['type'] == 'change' and change['name'] == 'value'
    if not is_value_change:
        return
    vectorizer = change['new']

def c_on_change(change):
    """Handle a new clusterer selection.

    Stores the selected clusterer class in the global ``clusterer``, redraws
    the common controls and shows the parameter widget(s) belonging to the
    selected method:

    * ``kmeans.Kmeans`` -> ``p1`` as "Random state" (default 42)
    * ``token_similarity.Token_Similarity`` -> ``p1`` as "Intersection"
      (default 2) plus ``p3``
    * ``dbscan.DBscan`` -> ``p2`` as "Eps" (default 1.5)
    * any other selection (``lda.LDA``) -> no extra widget

    Parameters
    ----------
    change : dict
        Change notification passed by ``observe``; only notifications with
        ``type == 'change'`` and ``name == 'value'`` are acted upon.
    """
    global clusterer
    if not (change['type'] == 'change' and change['name'] == 'value'):
        return

    selection = change['new']

    # Resolve "<module>.<Class>" by attribute lookup on the imported clusterer
    # modules instead of eval(), so the widget value can never execute
    # arbitrary code and an unknown module fails with a clear KeyError.
    clusterer_modules = {
        'kmeans': kmeans,
        'dbscan': dbscan,
        'token_similarity': token_similarity,
        'lda': lda,
    }
    module_name, _, class_name = selection.partition('.')
    clusterer = getattr(clusterer_modules[module_name], class_name)

    # Redraw from scratch so parameter widgets of the previous choice vanish.
    clear_output()
    display(a, v, c, min_s, ts)

    if selection == 'kmeans.Kmeans':
        p1.value = 42
        p1.description = 'Random state: '
        display(p1)
    elif selection == 'token_similarity.Token_Similarity':
        p1.value = 2
        p1.description = 'Intersection: '
        display(p1, p3)
    elif selection == 'dbscan.DBscan':
        p2.value = 1.5
        p2.description = 'Eps: '
        display(p2)
    print('Data is saved')

# Hook each dropdown up to its change handler, then show the article picker.
# The other controls are displayed by the handlers once an article is chosen.
for _widget, _handler in ((a, a_on_change), (v, v_on_change), (c, c_on_change)):
    _widget.observe(_handler)
display(a)

In [None]:
## FOR CHOOSING NUM_TOPICS FOR LDA
# Builds an LDA clusterer on the selected vectorizer pickle and asks it for
# candidate models over start=20, limit=100, step=10.
# NOTE(review): num_topics is passed as 0 here, presumably a placeholder
# because this sweep is what picks the real value; confirm against lda.LDA.
# vectorizer[:-4] drops the last four characters (assumes a ".pkl" suffix).
save_path = ''.join(('../', a.value, '/clusterers/', vectorizer[:-4]))
cl = lda.LDA(
    pd.read_pickle(''.join(('../', a.value, '/vectorizers/', vectorizer))),
    {'num_topics': 0},
    save_path,
)
cl.get_corpus()
model_list, coherence_values = cl.choose_lda_model(start=20, limit=100, step=10)

In [None]:
# Run the selected clusterer on the selected vectorizer pickle and save it.
#
# Reads the widget state (a, min_s, p1, p2, p3) and the globals `vectorizer`
# and `clusterer` set by the dropdown handlers, so an article, a vectorizer
# and a clusterer must have been chosen before this cell is run.

# Number of LDA topics.
# NOTE(review): presumably taken from the coherence sweep in the LDA cell;
# confirm and update after re-running that sweep.
LDA_NUM_TOPICS = 37

article_root = '../' + a.value
# vectorizer[:-4] drops the last four characters (assumes a ".pkl" suffix).
save_path = article_root + '/clusterers/' + vectorizer[:-4]
# makedirs also creates a missing parent "clusterers" directory, where a
# plain os.mkdir would raise FileNotFoundError on a fresh article.
os.makedirs(save_path, exist_ok=True)

vectorized_chobs = pd.read_pickle(article_root + '/vectorizers/' + vectorizer)

# Method-specific parameters; 'min_samples' is passed to every clusterer.
if clusterer == kmeans.Kmeans:
    params = {'random_state': p1.value, 'min_samples': min_s.value}
elif clusterer == token_similarity.Token_Similarity:
    params = {'intersection': p1.value, 'token_type': p3.value, 'min_samples': min_s.value}
elif clusterer == lda.LDA:
    params = {'num_topics': LDA_NUM_TOPICS, 'min_samples': min_s.value}
else:
    # Remaining choice in the dropdown: DBscan.
    params = {'min_samples': min_s.value, 'eps': p2.value}

cl = clusterer(vectorized_chobs, params, save_path)

if clusterer == lda.LDA:
    cl.get_corpus()
elif clusterer != token_similarity.Token_Similarity:
    # K-means and DBscan: keep the silhouette result in the first row of a
    # dedicated column, leaving every other row NaN.
    cl.df['avg_sil'] = np.nan
    cl.df.loc[0, 'avg_sil'] = cl.silhouette()

cl.get_clusters()
cl.save()

In [None]:
# Run t-SNE for the clustering produced above and save it under
# <save_path>/tsne/, using the same file name as the clusterer output.
# Relies on `save_path` and `cl` from the clustering cell.
tsne_dir = save_path+'/tsne/'
if os.path.exists(tsne_dir) is False:
    os.mkdir(tsne_dir)

ft = tsne.Tsne(
    pd.read_pickle(cl.dirpath + '.pkl'),
    {'random_state': ts.value},
    tsne_dir + cl.dirpath.rsplit('/', 1)[-1],
)
# NOTE(review): dirpath is overwritten with path before saving, presumably
# so that save() writes to the location passed above; confirm in tsne.Tsne.
ft.dirpath = ft.path
ft.get_plot_data(cl.df)
ft.save()

In [None]:
# Load a saved result from a hard-coded location. Judging by the path, this
# is the t-SNE output of a DBscan run (min_samples=5, eps=1.0) for article
# 39570; it does not depend on the widget selection.
df = pd.read_pickle(
    '../39570/clusterers/Distilbert_LR___Chobs_context_5_gap_length_20/tsne/DBscan_min_samples_5_eps_1.0.pkl'
)

In [None]:
# Load a second saved result from a hard-coded location. Judging by the path,
# this is the t-SNE output of a DBscan run (min_samples=6, eps=1.0) for
# article 39570; it does not depend on the widget selection.
df_other = pd.read_pickle(
    '../39570/clusterers/Distilbert_LR___Chobs_context_5_gap_length_20/tsne/DBscan_min_samples_6_eps_1.0.pkl'
)

In [None]:
# Collect the per-k scores that the elbow plot in the next cell draws.
# NOTE(review): the parameters passed here are the ones the K-means branch
# uses, so this presumably expects `clusterer` to be kmeans.Kmeans; confirm.
# vectorizer[:-4] drops the last four characters (assumes a ".pkl" suffix).
save_path = ''.join(('../', a.value, '/clusterers/', vectorizer[:-4]))
cl = clusterer(
    pd.read_pickle(''.join(('../', a.value, '/vectorizers/', vectorizer))),
    {'random_state': p1.value, 'min_samples': min_s.value},
    save_path,
)
Sum_of_squared_distances = cl.evaluate_k()

In [None]:
import matplotlib.pyplot as plt

# Elbow plot of the scores returned by evaluate_k() in the previous cell.
# The x axis is derived from the data instead of a fixed range(1, 15), so the
# plot no longer raises on a length mismatch if the number of evaluated k
# values changes.
# NOTE(review): assumes evaluate_k() returns one value per k, starting at
# k = 1 -- TODO confirm against the Kmeans implementation.
K = range(1, len(Sum_of_squared_distances) + 1)

fig, ax = plt.subplots()
ax.plot(K, Sum_of_squared_distances, 'bx-')
ax.set_xlabel('k')
ax.set_ylabel('Sum_of_squared_distances')
ax.set_title('Elbow Method For Optimal k')
plt.show()