This notebook provides three plots for visualizing the KMeans/DBSCAN clustering, as well as our ground-truth labels, of the embedded change objects created by `notebooks/t-SNE_plotting.ipynb`. The description of each plot is located directly above the corresponding plot.



# Load data for plotting


In [None]:
# Switch to the project directory first: the CSV path below is relative, and
# the `utils` package imported further down is presumably resolved from here too.
import os
os.chdir("/home/heuzerothp/wikiwho_tsne")
import pandas as pd
# Raise the column display width so long string cells are not cut off when shown.
pd.set_option("display.max_colwidth", 1000)

# Table used by all plots below; later cells read its "t-SNE-X", "t-SNE-Y",
# "nationality" and "cluster" columns.
plot_data = pd.read_csv("data/plotData_jlb_outer_kmeans.csv")


# Star import of the project settings. Later cells use GAP, LEFT_CONTEXT,
# RIGHT_CONTEXT and CLUSTERING_METHOD, which presumably come from this module.
from utils.vars import *

# Render matplotlib figures inline in the notebook output.
%matplotlib inline

import matplotlib.pyplot as plt


# Visualization and plotting


## Plot of t-SNEd data


Visualizes the raw t-SNE data (t-SNE-X against t-SNE-Y) in a scatter plot.


In [None]:
# Large white canvas with a black edge for the overview plot.
fig = plt.figure(
    figsize=(18, 16),
    dpi=80,
    facecolor='w',
    edgecolor='k',
)
# One small marker per row of plot_data at its two t-SNE coordinates.
plt.scatter(x=plot_data["t-SNE-X"], y=plot_data["t-SNE-Y"], s=10)

## Plot colored depending on Nationality
This plot shows the two-dimensional data reduced by t-SNE and is colored according to the ground-truth label we obtained from the ground truth table of John Logie Baird. 

How to use:
Hovering over a single entry shows its ID in plot_data. In the plot's toolbar, click the third button from the right (the single "arrow" labelled "Show closest data on hover") for a nicer and more exact display of the ID.
Click and select (while mouse button is down) values for displaying them in a table below. You might have to rerun the cell or reselect your data points as there are bugs, most probably caused by plotly. You have the choice between selecting in a rectangular-shaped fashion (4th button "Box select") and polygon-style (5th button "Lasso Select").





In [None]:
# Columns of plot_data that are shown in the table of selected points
# underneath the interactive plots.
displayed_table_columns = ['ins_tokens_str', 'del_tokens_str', 'to_rev', 'ins_tokens']

# Optional columns, switched on by flags that are not defined in this notebook
# (presumably provided by the star import from utils.vars).
optional_columns = []
if GAP:
    # NOTE(review): both GAP columns are already part of the base list above.
    # If GAP was meant to add different columns, name them here.
    optional_columns += ["ins_tokens_str", "del_tokens_str"]
if LEFT_CONTEXT:
    optional_columns.append("left_token_str")
if RIGHT_CONTEXT:
    optional_columns.append("right_token_str")

# Append each optional column only once. Appending the GAP columns
# unconditionally listed them twice, which duplicated them in every table.
for column in optional_columns:
    if column not in displayed_table_columns:
        displayed_table_columns.append(column)

In [None]:
%matplotlib inline
from plotly.offline import init_notebook_mode, iplot
import plotly.graph_objs as go
import random

init_notebook_mode()#connected=True)

r = lambda: random.randint(0,255)

f = go.FigureWidget()
f.layout.dragmode = 'lasso'

scatter = f.add_scatter(x = plot_data["t-SNE-X"], y = plot_data["t-SNE-Y"], mode = 'markers', 
                       marker=go.scatter.Marker(size=0, opacity=0),showlegend = False)
dtc = displayed_table_columns.copy()

dtc.append("nationality")

# Create a table FigureWidget that updates on selection from points in the scatter plot of f
t = go.FigureWidget([go.Table(  
    header=dict(values=dtc,             
                fill = dict(color='#C2D4FF'),
                align = ['left'] * 5),
    cells=dict(values=[plot_data[col] for col in dtc],              
               fill = dict(color='#F5F8FF'),
               align = ['left'] * 5
               ))])

def selection_fn(trace, points, selector):
    """Show the selected rows of plot_data in the `out` output widget.

    The entries of `points.point_inds` are used as row labels of plot_data.
    Rows without a "nationality" value are left out. `trace` and `selector`
    are part of the callback signature but are not used.
    """
    with out:
        clear_output()
        # WORKAROUND to hide rows that have no nationality value
        labelled_ids = []
        for point_id in points.point_inds:
            if pd.notna(plot_data.loc[point_id, "nationality"]):
                labelled_ids.append(point_id)
        display(plot_data.loc[labelled_ids, dtc])

        # Alternative display via qgrid (slow):
        #   display(qgrid.show_grid(plot_data.loc[points.point_inds, dtc]))
        # TODO: updating the table widget in place should speed up the display:
        #   t.data[0].cells.values = [plot_data.loc[points.point_inds][col] for col in dtc]

from IPython.display import display, clear_output
from ipywidgets import widgets, Output

traces = []
# One trace per ground-truth value: "Y", "N", and rows without a value (None).
for label in ("Y", "N", None):
    nationality = plot_data["nationality"]
    if label is None:
        # Rows without a label are drawn in a fixed grey.
        members = plot_data.loc[nationality.isna()]
        legend_name = str(label)
        colour = 'grey'
    else:
        # Labelled rows get a random colour per trace.
        members = plot_data.loc[nationality == label]
        legend_name = "Nationality" if label == "Y" else "No Nationality"
        colour = '#%02X%02X%02X' % (r(), r(), r())

    trace = go.Scatter(
        x=members["t-SNE-X"],
        y=members["t-SNE-Y"],
        mode='markers',
        name=legend_name,
        marker=go.scatter.Marker(size=4, color=colour),
        # Hover text: the plot_data row label of each point in this trace.
        text=members.index,
        showlegend=True,
    )
    trace.on_selection(selection_fn)
    f.add_trace(trace)

# The invisible all-points trace reports selections as well.
scatter.on_selection(selection_fn)

display(f)

# Output area below the figure; selection_fn writes the selected rows into it.
out = Output()
display(out)

## Plot on Mouse Selection returning selected elements in table

This plot shows the two-dimensional data reduced by t-SNE and is colored according to the clusters we obtained from K-Means or DBSCAN.

How to use: Hovering over a single entry shows its ID in plot_data. In the plot's toolbar, click the third button from the right (the single "arrow" labelled "Show closest data on hover") for a nicer and more exact display of the ID. Click and drag (while the mouse button is held down) to select values and display them in a table below. You might have to rerun the cell or reselect your data points, as there are bugs, most probably caused by plotly. You can choose between rectangular selection (4th button, "Box select") and polygon-style selection (5th button, "Lasso Select").

**NOTE: for this plot to work properly, the right value has to be set in `utils/vars.py:CLUSTERING_METHOD`, depending on which clustering method was executed in `notebooks/t-SNE_fasttextvectors_anyarticle.ipynb`.**



In [None]:
import pdb
import random
import time
from copy import copy

import numpy as np
import pandas as pd
import plotly.graph_objs as go
import plotly.offline as po
import qgrid
from ipywidgets import interactive, HBox, VBox

# Initialise plotly for notebook output.
po.init_notebook_mode()
# Limit qgrid tables to five visible rows.
qgrid.set_grid_option('maxVisibleRows', 5)

# Interactive figure whose drag gesture is a lasso selection.
f = go.FigureWidget()
f.layout.dragmode = 'lasso'

# Invisible trace (zero-size, fully transparent markers) over all rows of plot_data.
scatter = f.add_scatter(
    x=plot_data["t-SNE-X"],
    y=plot_data["t-SNE-Y"],
    mode='markers',
    marker=go.scatter.Marker(size=0, opacity=0),
    showlegend=False,
)


def r():
    """Return a random integer from 0 to 255, used as one RGB colour channel."""
    return random.randint(0, 255)


# Table columns for this plot: the shared columns plus the cluster label.
dtc = [*displayed_table_columns, "cluster"]

# Table widget over the same columns for all rows of plot_data.
# It is created here but not displayed by this cell.
t = go.FigureWidget([
    go.Table(
        header={
            "values": dtc,
            "fill": {"color": '#C2D4FF'},
            "align": ['left'] * 5,
        },
        cells={
            "values": [plot_data[name] for name in dtc],
            "fill": {"color": '#F5F8FF'},
            "align": ['left'] * 5,
        },
    )
])

def selection_fn(trace, points, selector):
    """Show the selected rows of plot_data in the `out` output widget.

    The entries of `points.point_inds` are used as row labels of plot_data.
    When CLUSTERING_METHOD is "dbscan", rows whose "cluster" value is -1 are
    left out. `trace` and `selector` are part of the callback signature but
    are not used.
    """
    with out:
        clear_output()
        # WORKAROUND to hide non-clustered points
        ids = points.point_inds
        if CLUSTERING_METHOD == "dbscan":
            ids = [point_id for point_id in ids
                   if plot_data.loc[point_id, "cluster"] != -1]
        display(plot_data.loc[ids, dtc])

        # Alternative display via qgrid (slow):
        #   display(qgrid.show_grid(plot_data.loc[points.point_inds, dtc]))
        # TODO: updating the table widget in place should speed up the display:
        #   t.data[0].cells.values = [plot_data.loc[points.point_inds][col] for col in dtc]
    
from IPython.display import display, clear_output
from ipywidgets import widgets, Output

traces = []
# One trace per distinct value of the "cluster" column, each with its own
# random colour and legend entry.
for c in plot_data["cluster"].unique():
    # Select this cluster's rows once and reuse them for x, y and the hover text.
    members = plot_data[plot_data["cluster"] == c]
    trace = go.Scatter(
        x=members["t-SNE-X"],
        y=members["t-SNE-Y"],
        mode='markers',
        name=str(c),
        uid=str(c),
        marker=go.scatter.Marker(size=4, color='#%02X%02X%02X' % (r(), r(), r())),
        # Hover text must be the plot_data row label of each point in THIS
        # trace. Passing the full plot_data.index paired the i-th point of the
        # cluster with the i-th row of the whole table, so the hover ID did not
        # identify the hovered point.
        text=members.index,
        showlegend=True,
    )
    trace.on_selection(selection_fn)
    f.add_trace(trace)

# The invisible all-points trace reports selections as well.
scatter.on_selection(selection_fn)

display(f)

# Output area below the figure; selection_fn writes the selected rows into it.
out = Output()
display(out)