In [None]:
import ipywidgets as widgets
from IPython.display import display
from IPython.display import clear_output
import os
import sys
sys.path.append('/home/stankeaa/wikiwho_inheritance/')
import pandas as pd
import re
from sequencers.clusterers import kmeans, dbscan
%matplotlib inline  
import matplotlib.pyplot as plt
import numpy as np

In [None]:
def getting_vectorizers(dirs):
    """Map every folder name to the entries of its ``clusterers`` directory.

    The directory is resolved as ``sys.path[-1] + folder + '/clusterers/'``,
    so the last ``sys.path`` entry must be the data root and must end with
    a path separator.

    Parameters
    ----------
    dirs : iterable of str
        Folder names located directly under the data root.

    Returns
    -------
    dict
        ``{folder: list of entry names}``; the first entry returned by
        ``os.listdir`` is dropped from every list.
    """
    # NOTE(review): os.listdir returns entries in arbitrary order, so which
    # entry the [1:] slice drops is not guaranteed -- presumably it is meant
    # to skip a hidden/checkpoint directory; confirm.
    return {
        name: os.listdir(sys.path[-1] + name + '/clusterers/')[1:]
        for name in dirs
    }

def getting_pickles(vecs, chob_dir=None):
    """Collect the file names stored under each vectorizer's ``tsne`` directory.

    For every vectorizer the directory
    ``sys.path[-1] + <folder> + '/clusterers/' + vectorizer + '/tsne/'`` is
    walked recursively and the bare file names (without directory part) are
    collected. A directory that does not exist yields an empty list.

    Parameters
    ----------
    vecs : iterable of str
        Vectorizer directory names.
    chob_dir : str, optional
        Folder (relative to the data root) that contains the vectorizers.
        When omitted, the module-level variable ``folder`` is used, which is
        what the original implementation silently relied on.

    Returns
    -------
    dict
        ``{vectorizer: list of file names}``.

    Raises
    ------
    NameError
        If ``chob_dir`` is omitted, ``vecs`` is non-empty and no module-level
        ``folder`` exists.
    """
    vecs_to_clusters = {}
    for vectorizer in vecs:
        # Resolved inside the loop so that an empty `vecs` never touches the
        # global fallback (keeps the original behaviour for that case).
        base_folder = folder if chob_dir is None else chob_dir
        tsne_dir = sys.path[-1] + base_folder + '/clusterers/' + vectorizer + '/tsne/'
        vecs_to_clusters[vectorizer] = [
            file_name
            for _dirpath, _dirnames, files in os.walk(tsne_dir)
            for file_name in files
        ]
    return vecs_to_clusters

# Data folders ("chobs") are the entries of the data root (last sys.path
# entry) whose names consist only of digits, underscores and dashes.
dirs = [
    entry
    for entry in os.listdir(sys.path[-1])
    if re.match("^[0-9_-]*$", entry)
]
dir_to_vecs = getting_vectorizers(dirs)  # folder -> vectorizer names

# vectorizer -> clusterer pickles (with t-SNE coordinates).
# The results of every folder are merged; previously the dictionary was
# overwritten on each iteration, so only the vectorizers of the last folder
# were available to the dropdowns (and the name was undefined when no folder
# matched).
# NOTE(review): vectorizers with the same name in different folders still
# share one entry (the last folder wins) -- key by (folder, vectorizer) if
# their pickle lists can differ.
vecs_to_clusters = {}
for folder in dir_to_vecs:  # the name `folder` is read as a global by getting_pickles
    vecs = dir_to_vecs[folder]
    vecs_to_clusters.update(getting_pickles(vecs))
        

In [None]:
def print_data(vectorizer, clusterizer):
    """Load the pickle selected in the dropdowns into the notebook globals.

    The file is read from
    ``sys.path[-1] + d.value + '/clusterers/' + vectorizer + '/tsne/' + clusterizer``
    where ``d`` is the folder dropdown defined at module level.

    On success the module-level names ``vec``, ``clu`` and ``df`` are set to
    the vectorizer name, the clusterizer file name and the loaded object.
    When nothing can be loaded (no selection, or the selected combination
    does not exist on disk, which happens transiently while the dropdowns
    are being updated) a message is printed and the three globals are left
    untouched so that they stay consistent with each other.

    Parameters
    ----------
    vectorizer : str or None
        Vectorizer directory name.
    clusterizer : str or None
        Pickle file name inside the vectorizer's ``tsne`` directory.
    """
    global vec, clu, df
    if vectorizer is None or clusterizer is None:
        print('No vectorizer/clusterizer selected; nothing was loaded')
        return

    pickle_path = sys.path[-1] + d.value + '/clusterers/' + vectorizer + '/tsne/' + clusterizer
    if not os.path.isfile(pickle_path):
        print('No data found at {}; nothing was loaded'.format(pickle_path))
        return

    # NOTE: unpickling can execute arbitrary code -- only load trusted files.
    loaded = pd.read_pickle(pickle_path)
    vec, clu, df = vectorizer, clusterizer, loaded
    # [:-4] strips the 4-character file extension (e.g. '.pkl') for display.
    print('Downloaded data for {} vectorizer and {} clusterizer'.format(vectorizer, clusterizer[:-4]))
    
def select_vec(folder):
    """Show the vectorizers of ``folder`` in the vectorizer dropdown ``v``.

    Reads the module-level ``dir_to_vecs`` mapping and assigns the matching
    list to ``v.options``.
    """
    available_vectorizers = dir_to_vecs[folder]
    v.options = available_vectorizers

def select_clusterizer(vectorizer):
    """Show the clusterer pickles of ``vectorizer`` in the dropdown ``c``.

    Looks the vectorizer up in the module-level ``vecs_to_clusters`` mapping.
    A vectorizer that is not in the mapping (or ``None`` when nothing is
    selected) empties the dropdown instead of raising ``KeyError`` inside
    the widget callback.
    """
    c.options = vecs_to_clusters.get(vectorizer, [])

# Folder dropdown; its initial value seeds the vectorizer dropdown.
d = widgets.Dropdown(options=list(dir_to_vecs.keys()))
init = d.value
# .get(..., []) keeps the cell runnable when no folder was discovered.
v = widgets.Dropdown(options=dir_to_vecs.get(init, []))

# Clusterizer dropdown, seeded from the initial vectorizer. The vectorizer
# may be missing from vecs_to_clusters, so fall back to an empty list rather
# than raising KeyError.
init2 = v.value
c = widgets.Dropdown(options=vecs_to_clusters.get(init2, []))

# Wire the dropdowns together:
#   j loads the data for the selected vectorizer/clusterizer pair,
#   i refreshes the vectorizer options when the folder changes,
#   k refreshes the clusterizer options when the vectorizer changes.
j = widgets.interactive(print_data, vectorizer=v, clusterizer=c)
i = widgets.interactive(select_vec, folder=d)
k = widgets.interactive(select_clusterizer, vectorizer=v)

# k is intentionally not displayed: its only control is `v`, which is
# already shown as part of j.
display(i)
display(j)

In [None]:
import plotly.io
import plotly
from plotly.offline import init_notebook_mode, iplot
import plotly.graph_objs as go
import random
init_notebook_mode(connected=True)

def r():
    """Return a random colour channel value between 0 and 255 (inclusive)."""
    return random.randint(0, 255)


plot_data = df.copy()
if 'DBscan' in clu:
    # Rows labelled -1 are left out of the plots (presumably the DBSCAN
    # noise label -- confirm against the clusterer).
    plot_data = plot_data[plot_data['clusters'] != -1].reset_index()

# Path prefix for the saved figures. Defined outside the branch below so
# that later cells which save next to this figure can always rely on it.
fname = '../' + d.value + '/figures/' + vec + clu[:-4]

if 't-SNE-X' not in plot_data.columns:
    print('TSNE coordinates have not been added. Please run the desired clusterizer again in the corresponding notebook')
else:
    # One scatter trace per cluster, each with a random colour. The loop
    # variable must not be called `c`: that name is the clusterizer dropdown
    # and overwriting it breaks the widget callbacks.
    traces = []
    for cluster_id in plot_data.clusters.unique():
        members = plot_data[plot_data["clusters"] == cluster_id]
        traces.append(go.Scatter(
            x=members["t-SNE-X"],
            y=members["t-SNE-Y"],
            mode='markers',
            name=str(cluster_id),
            marker=go.scatter.Marker(size=4, color='#%02X%02X%02X' % (r(), r(), r())),
            showlegend=True,
        ))

    data = traces

    # Writing the HTML file fails if the figures directory is missing.
    os.makedirs(os.path.dirname(fname), exist_ok=True)

    # Plot and embed in the notebook
    plotly.offline.iplot(data, image_width=1280, image_height=800, image='png', filename='plot_image')

    # Save a standalone HTML copy
    plotly.offline.plot(data, filename=fname + '.html', auto_open=False,
                        image_width=1280, image_height=800)
        

## Silhouette plot

In [None]:
# Average silhouette score and per-sample silhouette values come from df.
sil_avg = df.loc[0, 'avg_sil']
sil_df = df.copy()

clusterer_name = clu.split('_')[0]
if clusterer_name == 'DBscan':
    # Show the ten largest clusters. Label -1 is excluded explicitly instead
    # of assuming it is always the most frequent label.
    # NOTE(review): -1 is presumably the DBSCAN noise label -- confirm.
    labelled = sil_df.loc[sil_df['clusters'] != -1, 'clusters']
    cl = labelled.value_counts().head(10).index
else:
    # Kmeans, and any other clusterer, shows every cluster (previously `cl`
    # was undefined for names other than Kmeans/DBscan).
    cl = sil_df.clusters.unique()

plt.figure(figsize=(18, 7))
plt.xlim(-1, 1)
print("For n_clusters =", len(cl),
      "The average silhouette_score is :", sil_avg)

y_lower = 10
# The loop variable must not be called `i`: that name is an interactive
# widget defined in an earlier cell.
for cluster_id in cl:
    # Silhouette values of the samples in this cluster, sorted ascending.
    # (The original called sort_values() without keeping the result, so the
    # values were plotted unsorted.)
    cluster_scores = np.sort(
        sil_df.loc[sil_df["clusters"] == cluster_id, "silhouette_value"].values
    )

    size_cluster = cluster_scores.shape[0]
    y_upper = y_lower + size_cluster

    plt.fill_betweenx(np.arange(y_lower, y_upper),
                      0, cluster_scores,
                      alpha=0.7)

    # Label the silhouette plots with their cluster numbers at the middle
    plt.text(-0.05, y_lower + 0.5 * size_cluster, str(cluster_id))

    # Leave a gap of 10 rows before the next cluster
    y_lower = y_upper + 10

plt.title("The silhouette plot for the various clusters.")
plt.xlabel("The silhouette coefficient values")
plt.ylabel("Cluster label")

# The vertical line for average silhouette score of all the values
plt.axvline(x=sil_avg, color="red", linestyle="--")

plt.yticks([])  # Clear the yaxis labels / ticks
plt.xticks([-1, -0.6, -0.4, -0.2, -0.1, 0, 0.2, 0.4, 0.6, 0.8, 1])

# Build the output path here rather than reusing `fname`, which another cell
# may not have defined or may have overwritten with a different path.
silhouette_path = '../' + d.value + '/figures/' + vec + clu[:-4] + '_silhouette.png'
os.makedirs(os.path.dirname(silhouette_path), exist_ok=True)
# Save before show(): with the inline backend the figure is gone after
# show(), and savefig would then write an empty image.
plt.savefig(silhouette_path)
plt.show()

## Lasso selection with a table view

In [None]:
import time
import qgrid
from ipywidgets import widgets, Output
import pdb
from IPython.display import display, clear_output
qgrid.set_grid_option('maxVisibleRows', 5)

# Columns shown in the table of selected points (used by the table cell too).
DISPLAYED_TABLE_COLUMNS = ["left_token_str", "right_token_str", "del_tokens_str", "ins_tokens_str", "clusters"]

if 't-SNE-X' not in plot_data.columns:
    print('TSNE coordinates have not been added. Please run the desired clusterizer again in the corresponding notebook')
else:
    f = go.FigureWidget()
    f.layout.dragmode = 'select'

    # Invisible trace holding every row of plot_data in row order, so the
    # point indices reported on selection are row positions in plot_data.
    f.add_scatter(x=plot_data["t-SNE-X"], y=plot_data["t-SNE-Y"], mode='markers',
                  marker=go.scatter.Marker(size=0, opacity=0), showlegend=False)
    # add_scatter returns the trace in plotly 3 but the figure in plotly 4+;
    # f.data[0] is the trace in both.
    scatter = f.data[0]

    r = lambda: random.randint(0,255)

    def selection_fn(trace, points, selector):
        """Remember the most recent selection in the global ``sel_points``."""
        global sel_points
        sel_points = points
        display("Finished: " + str(len(points.point_inds)) + " points selected")

    # One visible trace per cluster. The loop variable must not be called
    # `c`: that name is the clusterizer dropdown widget.
    for cluster_id in plot_data["clusters"].unique():
        members = plot_data[plot_data["clusters"] == cluster_id]
        f.add_trace(go.Scatter(
            x=members["t-SNE-X"],
            y=members["t-SNE-Y"],
            mode='markers',
            name=str(cluster_id),
            uid=str(cluster_id),
            marker=go.scatter.Marker(size=4, color='#%02X%02X%02X' % (r(),r(),r())),
            showlegend=True,
        ))

    scatter.on_selection(selection_fn)
    display(f)

In [None]:
# Table FigureWidget with the displayed columns of all plotted rows.
# NOTE(review): `t` is built but neither displayed nor updated in this cell;
# kept in case another cell uses it.
n_table_columns = len(DISPLAYED_TABLE_COLUMNS)
t = go.FigureWidget([go.Table(
    header=dict(values=DISPLAYED_TABLE_COLUMNS,
                fill = dict(color='#C2D4FF'),
                align = ['left'] * n_table_columns),

    cells=dict(values=[plot_data[col] for col in DISPLAYED_TABLE_COLUMNS],
               fill = dict(color='#F5F8FF'),
               align = ['left'] * n_table_columns
               ))])

# Rows picked with the box/lasso tool. `sel_points` only exists after a
# selection has been made in the scatter plot, so look it up defensively.
_selection = globals().get('sel_points')
selected_positions = list(_selection.point_inds) if _selection is not None else []
if not selected_positions:
    print('No points selected yet: select points in the scatter plot and re-run this cell')

# point_inds are positions within the plotted trace (built from plot_data in
# row order), so they must be resolved positionally; label-based .loc is only
# equivalent when plot_data has a default 0..n-1 index.
display(qgrid.show_grid(plot_data.iloc[selected_positions][DISPLAYED_TABLE_COLUMNS]))

## Nationality

In [None]:
r = lambda: random.randint(0,255)
if 't-SNE-X' not in plot_data.columns:
    print('TSNE coordinates have not been added. Please run the desired clusterizer again in the corresponding notebook')
else:
    # Rows without a nationality are drawn almost transparent. The column is
    # built as floats in a single assignment, which avoids writing 0.1 into
    # an integer column.
    plot_data['opacity'] = np.where(plot_data['nationality'].isna(), 0.1, 1.0)

    # One trace per cluster, skipping label -1. The append happens inside
    # the loop body for real clusters only; previously it ran for -1 as
    # well, re-adding the previous trace (or failing if -1 came first).
    # The loop variable must not be called `c`: that name is the clusterizer
    # dropdown widget.
    traces = []
    for cluster_id in plot_data.clusters.unique():
        if cluster_id == -1:
            continue
        members = plot_data[plot_data["clusters"] == cluster_id]
        traces.append(go.Scatter(
            x=members["t-SNE-X"],
            y=members["t-SNE-Y"],
            mode='markers',
            name=str(cluster_id),
            marker=go.scatter.Marker(size=4, color='#%02X%02X%02X' % (r(),r(),r()),
                                     opacity=members["opacity"]),
            showlegend=True,
        ))
    data = traces
    layout = go.Layout(dragmode='select')
    fig = go.Figure(data=data, layout=layout)

    # Separate name so the `fname` prefix used by other cells is not
    # overwritten with this cell's '.html' path.
    nationality_html = '../' + d.value + '/figures/nationality/' + vec + clu[:-4] + '.html'
    os.makedirs(os.path.dirname(nationality_html), exist_ok=True)

    # Plot and embed in the notebook
    iplot(fig)
    # Save the same figure (traces and layout) that is displayed above.
    plotly.offline.plot(fig, filename=nationality_html, auto_open=False,
                        image_width=1280, image_height=800)

## Silhouette score

In [None]:
# Pickled K-means results (random_state 42) of folder 39570, one file per
# vectorizer.
# NOTE(review): unpickling can execute arbitrary code -- only load trusted files.
_WORD_EMBED_PICKLE = '../39570/clusterers/Word_Embed_word_embed_size_300_use_gap_True_context_5___Chobs_context_5_gap_length_20/tsne/Kmeans_random_state_42.pkl'
_BERT_PICKLE = '../39570/clusterers/Bert_LR___Chobs_context_5_gap_length_20/tsne/Kmeans_random_state_42.pkl'
_DISTILBERT_PICKLE = '../39570/clusterers/Distilbert_LR___Chobs_context_5_gap_length_20/tsne/Kmeans_random_state_42.pkl'

df_we = pd.read_pickle(_WORD_EMBED_PICKLE)
df_bert = pd.read_pickle(_BERT_PICKLE)
df_distilbert = pd.read_pickle(_DISTILBERT_PICKLE)

In [None]:
from sequencers.clusterers import kmeans
def _kmeans_features(frame):
    """Wrap ``frame`` in ``kmeans.Kmeans(frame, {}, '')`` and return the
    wrapper together with the result of its ``transform_feat()`` call."""
    clusterer = kmeans.Kmeans(frame, {}, '')
    return clusterer, clusterer.transform_feat()


# Same construct-then-transform order as before, one data set at a time.
k_we, features_we = _kmeans_features(df_we)
k_b, features_b = _kmeans_features(df_bert)
k_db, features_db = _kmeans_features(df_distilbert)

In [None]:
from sklearn.cluster import KMeans
%matplotlib inline  
from yellowbrick.cluster import SilhouetteVisualizer
from yellowbrick.datasets import load_nfl
import matplotlib.pyplot as plt


# For each feature matrix: fit a fresh KMeans (random_state=42, default
# number of clusters) inside a yellowbrick SilhouetteVisualizer and show it.
# `model` and `visualizer` end up bound to the last (DistilBert) run.
for heading, feature_matrix in (
    ('Silhouette of Word Embeddence:', features_we),
    ('Silhouette of Bert:', features_b),
    ('Silhouette of DistilBert:', features_db),
):
    print(heading)
    model = KMeans(random_state=42)
    visualizer = SilhouetteVisualizer(model, colors='viridis')
    visualizer.fit(feature_matrix)
    visualizer.show()