In [None]:
import os
# All relative paths used below ('pickles', 'data/...') resolve against this working directory.
# NOTE(review): machine-specific absolute path; the notebook will not run elsewhere without editing it.
os.chdir("/home/heuzerothp/wikiwho_tsne")

import pandas as pd
from wikiwho_chobj import Chobjer

import nltk
# The "stopwords" corpus is needed later by filter_stopwords (stopwords.words('english')).
nltk.download("stopwords")


# Change objects for one article, read from the local 'pickles' directory.
# NOTE(review): context=5 is hard-coded here and happens to equal LEFT_CONTEXT / RIGHT_CONTEXT below;
# presumably they are meant to stay in sync - confirm.
co = Chobjer(article="39570", pickles_path='pickles', lang='en', context=5)
# iter_chobjs() is called twice: once for the rows, and a second, independent call whose
# first item only supplies the column names via .keys().
df = pd.DataFrame(co.iter_chobjs(), columns = next(co.iter_chobjs()).keys())


# Ground-truth labels; combine() reads this frame as a module-level global
# (columns used there: rev_id, token_id, nationality, birth_place, Link, Bulk, token, action).
jlogie = pd.read_csv("data/John_Logie_Baird_FULL.csv")


# Length of the zero vector create_features falls back to when no token of a phrase is embedded.
WORD_EMBED_SIZE = 300
# Number of tokens taken from the end of left_token_str / start of right_token_str (0 disables that side).
LEFT_CONTEXT = 5
RIGHT_CONTEXT = 5
# When truthy, inserted/deleted tokens are embedded too and over-long gaps are filtered out.
GAP = True

In [None]:
import numpy as np

def combine(chobj):
    """Attach the ground-truth labels from ``jlogie`` to a single change object.

    Intended to be called through ``DataFrame.apply(..., axis=1)`` on the frame of
    change objects provided by wikiwho. Reads the module-level ``jlogie`` frame.

    The rows of ``jlogie`` that belong to this change object are those whose
    ``rev_id`` equals ``chobj["to_rev"]`` and whose ``token_id`` is one of
    ``chobj["ins_tokens"]``:

    * no such row    -> all six label fields are set to ``None``;
    * exactly one    -> the six label fields are copied from that row;
    * more than one  -> ``nationality``, ``birth_place``, ``Link`` and ``Bulk`` are
      copied when all matching rows agree on the value, otherwise set to ``None``
      (with a printed notice); ``token`` and ``action`` are set to ``None``.

    Parameters
    ----------
    chobj : mapping-like (e.g. a DataFrame row)
        Must provide ``"to_rev"`` and the list-like ``"ins_tokens"``.
        It is modified in place.

    Returns
    -------
    The same ``chobj`` object with the label fields added.
    """
    label_cols = ["nationality", "birth_place", "Link", "Bulk", "token", "action"]

    rev_rows = jlogie[jlogie["rev_id"] == chobj["to_rev"]]
    # Select the matching rows themselves. Taking the first index of the boolean mask
    # would always yield the first row of the revision, whether or not it matched.
    matched = rev_rows[rev_rows["token_id"].isin(chobj["ins_tokens"])]

    if len(matched) == 1:
        label_row = matched.iloc[0]
        for col in label_cols:
            chobj[col] = label_row[col]
    elif len(matched) > 1:
        for col in ["nationality", "birth_place", "Link", "Bulk"]:
            values = matched[col].unique()
            if len(values) == 1:
                chobj[col] = values[0]
            else:
                chobj[col] = None
                print("non congruent values found for df['to_rev'] == ", str(chobj["to_rev"]), " and token ids: ", list(matched["token_id"]), " in jlogie. Setting None to column ", str(col))
        # Several tokens matched, so there is no single token/action to report.
        chobj["token"] = None
        chobj["action"] = None
    else:
        for col in label_cols:
            chobj[col] = None
    return chobj

# Label every change object, drop rows that came back completely empty, then keep only
# the rows carrying at least one of the four ground-truth annotations.
merged = df.apply(combine, axis=1)
merged = merged.dropna(how="all")
all_labels_missing = merged[["birth_place", "Bulk", "nationality", "Link"]].isna().all(axis=1)
merged = merged[~all_labels_missing]

In [None]:
# Keep only change objects whose inserted AND deleted token sequences hold at most 10 entries each.
if GAP:
    ins_short_enough = merged["ins_tokens_str"].apply(lambda toks: len(toks) <= 10)
    del_short_enough = merged["del_tokens_str"].apply(lambda toks: len(toks) <= 10)
    merged = merged[ins_short_enough & del_short_enough]

## Embed words: average the 300-dimensional word vectors of the inserted tokens and of the deleted tokens, giving a vector of length 600 per change object (each enabled context side, left or right, adds another 300 dimensions)

In [None]:
import io

def load_vectors(fname):
    """Read a text-format word-vector file into a dict.

    The first line of the file is a header with two integers; every following
    line is ``<word> <v1> <v2> ...`` separated by single spaces.

    Parameters
    ----------
    fname : str
        Path of the vector file.

    Returns
    -------
    dict
        Maps each word to a ``list`` of floats. The values are materialised
        lists, not lazy ``map`` iterators, so a vector can be read any number
        of times.

    Raises
    ------
    ValueError
        If the header line does not consist of exactly two integers or a
        vector component is not a valid float.
    """
    data = {}
    # The context manager closes the file even if parsing fails part-way through.
    with io.open(fname, 'r', encoding='utf-8', newline='\n', errors='ignore') as fin:
        # Header values are parsed only to validate the file format; they are not used further.
        _n_words, _dim = map(int, fin.readline().split())
        for line in fin:
            tokens = line.rstrip().split(' ')
            data[tokens[0]] = list(map(float, tokens[1:]))
    return data

# Module-level word -> vector mapping, read from the local .vec file; used by create_features
# and by the nearest-word lookup for the cluster centroids.
embed = load_vectors('data/wiki-news-300d-1M-subword.vec')

In [None]:
from nltk.corpus import stopwords
import numpy as np
from gensim.sklearn_api import W2VTransformer
from gensim.models import KeyedVectors
from copy import deepcopy
import pdb

def transform(phrase : list, embedding):
    """Look up the embedding vector of every word in ``phrase`` that has one.

    Words missing from ``embedding`` are skipped. Each stored vector is
    deep-copied before being turned into a list, so the entries of
    ``embedding`` are left untouched.

    Returns a 2-D array with one row per embedded word (in phrase order),
    or ``None`` when no word of the phrase is in ``embedding``.
    """
    found = [list(deepcopy(embedding[word])) for word in phrase if word in embedding]
    if not found:
        return None
    return np.stack(found)

def filter_stopwords(phrase):
    """Return the words of ``phrase`` that are not English stop words.

    The stop-word list is fetched once per call and turned into a set, instead
    of calling ``stopwords.words('english')`` again for every single word and
    scanning the returned list.

    Parameters
    ----------
    phrase : iterable of str

    Returns
    -------
    list of str
        The non-stop-words, in their original order (comparison is case-sensitive,
        exactly as the membership test against the corpus list).
    """
    english_stopwords = frozenset(stopwords.words('english'))
    return [word for word in phrase if word not in english_stopwords]

def _mean_token_vector(tokens):
    """Average embedding of ``tokens`` after stop-word removal; a zero vector if none is embedded."""
    vectors = transform(filter_stopwords(list(tokens)), embed)
    if vectors is None:
        return np.zeros(WORD_EMBED_SIZE)
    return np.mean(vectors, axis=0)


def create_features(chobj, use_gap, left_context, right_context):
    """Build the feature vector of one change object.

    Each enabled part contributes one averaged word vector. The parts are
    concatenated in the fixed order left context, right context, inserted
    tokens, deleted tokens. Reads the module-level ``embed`` and
    ``WORD_EMBED_SIZE`` and calls ``transform`` / ``filter_stopwords``.

    Parameters
    ----------
    chobj : mapping-like (e.g. a DataFrame row)
        Provides ``left_token_str``, ``right_token_str``, ``ins_tokens_str``
        and ``del_tokens_str`` (only the enabled ones are read).
    use_gap : bool
        Include the inserted and the deleted tokens.
    left_context, right_context : int
        Number of tokens taken from the end of ``left_token_str`` / the start
        of ``right_token_str``; a value <= 0 disables that side.

    Returns
    -------
    pandas.Series
        The concatenated vector with NaNs replaced by ``np.nan_to_num``.

    Raises
    ------
    ValueError
        If every part is disabled, so there is nothing to concatenate.
        (Previously this dropped into ``pdb`` and then failed on an unbound local.)
    """
    parts = []
    if left_context > 0:
        parts.append(_mean_token_vector(chobj["left_token_str"][-left_context:]))
    if right_context > 0:
        parts.append(_mean_token_vector(chobj["right_token_str"][:right_context]))
    if use_gap:
        parts.append(_mean_token_vector(chobj["ins_tokens_str"]))
        parts.append(_mean_token_vector(chobj["del_tokens_str"]))

    if not parts:
        raise ValueError(
            "create_features needs at least one enabled part: "
            "use_gap=True, left_context > 0 or right_context > 0"
        )

    return pd.Series(np.nan_to_num(np.concatenate(parts)))

# One feature vector per labelled change object; the keyword arguments are forwarded by
# DataFrame.apply to create_features for every row.
Embedded = merged.apply(
    create_features,
    axis=1,
    use_gap=GAP,
    left_context=LEFT_CONTEXT,
    right_context=RIGHT_CONTEXT,
)

## K-means clustering

In [None]:
from sklearn.cluster import KMeans

# Cluster the change-object feature vectors. random_state pins the seed of the initialisation.
# NOTE(review): the number of clusters is not passed, so KMeans uses its library default - confirm
# that this is intended rather than chosen for this data set.
clusterer = KMeans(random_state=42)
clusters = clusterer.fit_predict(Embedded)

In [None]:
# Display the distinct cluster labels that were actually assigned (in order of first appearance).
pd.Series(clusters).unique()


### Find the words closest to each centroid to label the clusters

In [None]:
from sklearn.neighbors import NearestNeighbors
import numpy as np

def average_vectors_to_shape_300(vec, size=300):
    """Fold a concatenation of equally sized vectors into their element-wise mean.

    ``vec`` is treated as ``fold = len(vec) // size`` consecutive chunks of
    ``size`` entries each; element ``i`` of the result is the mean of element
    ``i`` of every chunk. Entries beyond the last complete chunk are ignored.

    Parameters
    ----------
    vec : 1-D array-like
    size : int, default 300
        Length of one chunk and of the result.

    Returns
    -------
    numpy.ndarray of shape ``(size,)``
        All-NaN when ``vec`` holds fewer than ``size`` entries (no complete chunk).
    """
    vec = np.asarray(vec)
    fold = vec.shape[0] // size
    # One row per chunk, so averaging over axis 0 averages position i across all chunks.
    return vec[:fold * size].reshape(fold, size).mean(axis=0, dtype=float)

# Each centroid lives in the concatenated feature space; fold it back to one word-vector-sized
# vector so it can be compared with single word embeddings.
centroids = clusterer.cluster_centers_
centroids_300 = [average_vectors_to_shape_300(centroid) for centroid in centroids]

embed_keys = list(embed.keys())
# deepcopy keeps the values stored in `embed` untouched while they are materialised into lists.
embed_vals = [list(deepcopy(vector)) for vector in embed.values()]

# NOTE: X is the embedding matrix here; the t-SNE cell further down rebinds the same name.
X = np.array(embed_vals)
nbrs = NearestNeighbors(n_neighbors=5, algorithm='ball_tree').fit(X)
dists, inds = nbrs.kneighbors(centroids_300)

# Row i of inds/dists belongs to centroid i, so iterate over the centroids themselves rather
# than over the number of distinct labels that happen to occur in `clusters`.
closest_words = {
    i: [(embed_keys[word_idx], dist) for word_idx, dist in zip(inds[i], dists[i])]
    for i in range(len(centroids_300))
}

## Visualization and plotting

In [None]:
from sklearn.manifold import TSNE

# Project the feature vectors with t-SNE (seed pinned via random_state); the plots below read
# columns 0 and 1 of the result.
# NOTE: this rebinds X, which the nearest-word cell above used for the word-embedding matrix.
X = TSNE(random_state=42).fit_transform(Embedded)

%matplotlib inline


import matplotlib.pyplot as plt

In [None]:
import random

def convert_editors_to_colors(editors):
    """Map every entry of ``editors`` to a hex colour string.

    The first occurrence of an editor draws a random ``#RRGGBB`` colour
    (three ``random.randint(0, 255)`` draws); later occurrences of the same
    editor reuse it. Returns the colours as a list aligned with ``editors``.
    """
    palette = {}
    colours = []
    for editor in editors:
        if editor not in palette:
            channels = (random.randint(0,255), random.randint(0,255), random.randint(0,255))
            palette[editor] = '#%02X%02X%02X' % channels
        colours.append(palette[editor])

    return colours

In [None]:
# Static overview: every row of X drawn as one small point (column 0 on the x axis, column 1 on the y axis).
fig = plt.figure(figsize=(18, 16), dpi= 80, facecolor='w', edgecolor='k')
plt.scatter(X[:,0], X[:,1], s=10)

### interactively

In [None]:
%matplotlib inline
from plotly.offline import init_notebook_mode, iplot
import plotly.graph_objs as go

# enable javascript support
init_notebook_mode(connected=True)

# Random colour channel; colours are not seeded, so they differ between runs.
r = lambda: random.randint(0,255)

# One scatter trace per cluster label, so every cluster gets its own colour and legend entry.
traces = []
for c in pd.Series(clusters).unique():
    in_cluster = clusters == c

    trace = go.Scatter(
        x=X[in_cluster, 0],
        y=X[in_cluster, 1],
        mode='markers',
        name=str(c),
        marker=go.scatter.Marker(size=4, color='#%02X%02X%02X' % (r(),r(),r())),
        # Hover text must be restricted to the same rows as x/y; passing the full index
        # would label the j-th point of this trace with the j-th row of the whole frame.
        text=Embedded.index[in_cluster],
        showlegend=True,
    )
    traces.append(trace)

data = traces

# Plot and embed in ipython notebook
iplot(data, filename='basic-scatter')

In [None]:
%matplotlib inline
from plotly.offline import init_notebook_mode, iplot
import plotly.graph_objs as go

# enable javascript support
init_notebook_mode(connected=True)

# Random colour channel; colours are not seeded, so they differ between runs.
r = lambda: random.randint(0,255)

# One scatter trace per value of the "nationality" label ("Y", "N", or missing).
traces = []
for c in ["Y", "N", None]:
    # Missing labels cannot be found with ==, so they get their own mask.
    if c is None:
        has_label = merged["nationality"].isna()
    else:
        has_label = merged["nationality"] == c
    # Plain boolean array: rows of X and of Embedded are selected by position.
    has_label = has_label.values

    trace = go.Scatter(
        x=X[has_label, 0],
        y=X[has_label, 1],
        mode='markers',
        name=str(c),
        marker=go.scatter.Marker(size=4, color='#%02X%02X%02X' % (r(),r(),r())),
        # Hover text restricted to the same rows as x/y so that labels match their points.
        text=Embedded.index[has_label],
        showlegend=True,
    )
    traces.append(trace)

data = traces

# Plot and embed in ipython notebook
iplot(data, filename='basic-scatter')

##### Selection Plot

In [None]:
import pandas as pd
import plotly.graph_objs as go
import plotly.offline as po
import numpy as np
from ipywidgets import interactive, HBox, VBox
po.init_notebook_mode()
import time
import qgrid
qgrid.set_grid_option('maxVisibleRows', 5)

import pdb

from IPython.display import display, clear_output
from ipywidgets import widgets, Output

# Columns shown in the selection table: the two plot coordinates plus the token columns of
# every feature part that is enabled in the configuration.
DISPLAYED_TABLE_COLUMNS = ["t-SNE-X", "t-SNE-Y"]
if GAP:
    DISPLAYED_TABLE_COLUMNS.append("ins_tokens_str")
    DISPLAYED_TABLE_COLUMNS.append("del_tokens_str")
if LEFT_CONTEXT:
    DISPLAYED_TABLE_COLUMNS.append("left_token_str")
if RIGHT_CONTEXT:
    DISPLAYED_TABLE_COLUMNS.append("right_token_str")

# Give merged a 0..n-1 index so that it lines up row by row with the coordinates below.
if not (merged.index == range(len(merged))).all():
    merged = merged.reset_index()

# Row i of X is the projection of row i of merged; a length mismatch would make the
# concat below pad with NaN silently, so fail loudly instead.
assert len(X) == len(merged), "t-SNE coordinates and merged change objects differ in length"

# Only the two coordinate columns are named here; every column of merged keeps its own
# name instead of being overwritten by a hard-coded positional list.
tsne_coords = pd.DataFrame(X, columns=["t-SNE-X", "t-SNE-Y"])
plot_data = pd.concat([tsne_coords, merged], axis=1)

f = go.FigureWidget()
f.add_scatter(x = plot_data["t-SNE-X"], y = plot_data["t-SNE-Y"], mode = 'markers')
# Fetch the trace from the figure: the return value of add_scatter is the trace in some
# plotly versions and the figure in others.
scatter = f.data[0]

# TODO: per-cluster coloured traces, each with its own selection callback.

f.layout.dragmode = 'lasso'

# Jitter the displayed points by up to 10% of each axis range. The table below keeps
# showing the un-jittered coordinates from plot_data.
N = len(plot_data)
scatter.x = np.asarray(scatter.x) + np.random.rand(N)/10 *(plot_data['t-SNE-X'].max() - plot_data['t-SNE-X'].min())
scatter.y = np.asarray(scatter.y) + np.random.rand(N)/10 *(plot_data['t-SNE-Y'].max() - plot_data['t-SNE-Y'].min())
scatter.marker.opacity = 0.5

# Create a table FigureWidget that updates on selection from points in the scatter plot of f
t = go.FigureWidget([go.Table(
    header=dict(values=DISPLAYED_TABLE_COLUMNS,
                fill = dict(color='#C2D4FF'),
                align = 'left'),

    cells=dict(values=[plot_data[col] for col in DISPLAYED_TABLE_COLUMNS],
               fill = dict(color='#F5F8FF'),
               align = 'left'
               ))])

# Output area that receives the qgrid table of the current selection.
out = Output()

def selection_fn(trace,points,selector):
    """Show the rows behind the selected points in the qgrid output and in the table widget."""
    # point_inds are positions within the trace, hence positional (iloc) row lookup.
    selected = plot_data.iloc[points.point_inds][DISPLAYED_TABLE_COLUMNS]
    with out:
        clear_output()
        display(qgrid.show_grid(selected))

    t.data[0].cells.values = [selected[col] for col in DISPLAYED_TABLE_COLUMNS]
scatter.on_selection(selection_fn)

display(f)
display(out)