In [None]:
import pandas as pd
from wikiwho_chobj import Chobjer

# Context size handed to Chobjer; its exact meaning is defined by wikiwho_chobj.
context = 10

# Change-object source for article 39570 (English); pickles are read from ../../bert.
co = Chobjer(
    article="39570",
    pickles_path='../../bert',
    lang='en',
    context=context,
)

# The frame is filled from one iterator while a second, fresh iterator supplies the
# column names (keys of the first change object), so no object is consumed and lost.
chobj_stream = co.iter_chobjs()
column_names = next(co.iter_chobjs()).keys()
df_all = pd.DataFrame(chobj_stream, columns=column_names)

In [None]:
# `nltk.download` lives on the top-level package, which the original cell never
# imported (only `nltk.corpus.stopwords` was), so it failed with NameError on a
# fresh kernel.
import nltk
from nltk.corpus import stopwords

# Make sure the stop-word corpus is available before reading it.
nltk.download("stopwords")

# English stop words as a set for fast membership tests in `making_list`.
stop_words = set(stopwords.words('english'))

In [None]:
import sys
sys.path.insert(1, '../utils/')
from merge import combine

# Keep only change objects whose inserted + deleted token lengths sum to at most 20.
change_size = df_all['ins_tokens_str'].str.len() + df_all['del_tokens_str'].str.len()
df = df_all[change_size <= 20]

# Ground-truth labels; `combine` (from ../utils/merge.py) is applied to every
# change object together with this table.
jlogie = pd.read_csv("../../John_Logie_Baird_FULL.csv")
merged = df.apply(lambda chobj: combine(chobj, jlogie), axis=1)

# captures if we also want to use changeobjects that do not have tokens that are ground-truth labelled
OUTER_JOIN = True

# Rows for which every column is missing carry no information at all.
merged = merged.dropna(how="all")
if not OUTER_JOIN:
    # Inner-join behaviour: keep a row only if at least one label column is set.
    has_label = (
        merged["birth_place"].notna()
        | merged["Bulk"].notna()
        | merged["nationality"].notna()
        | merged["Link"].notna()
    )
    merged = merged[has_label]

In [None]:
def making_list(tokens):
    """Turn each word sequence into one space-separated string without stop words.

    Args:
        tokens: iterable of word sequences.

    Returns:
        list of str, one entry per input sequence, made of the words that are not
        in the module-level ``stop_words`` collection, joined by single spaces.
        A sequence with no remaining words yields the empty string.
    """
    return [
        ' '.join(filter(lambda word: word not in stop_words, words))
        for words in tokens
    ]


# One [left context, right context] pair of stop-word-free strings per row of
# `merged`. The inserted/deleted token columns are currently not part of the text.
all_tokens = [
    making_list([left_context, right_context])
    for left_context, right_context in zip(merged['left_token_str'], merged['right_token_str'])
]

In [None]:
from bert_serving.client import BertClient
import numpy as np
# Width of the zero placeholder used for empty contexts; it has to equal the
# length of the vectors returned by `bc.encode` — TODO confirm against the served model.
EMBEDDING_DIM = 768

bc = BertClient()

# Collect one (1, 2 * EMBEDDING_DIM) array per change object and build the frame
# once at the end. The original grew `features` with DataFrame.append inside the
# loop, which is quadratic and no longer exists in pandas >= 2.0.
feature_rows = []
for row in all_tokens:
    vector = []
    for token in row:
        if token.isspace() or token == '':
            # Nothing to encode: stand in with an all-zero vector.
            vector.append(np.full((1, EMBEDDING_DIM), 0))
        else:
            vector.append(bc.encode([token]))

    # Left and right context embeddings side by side form one feature row.
    feature_rows.append(np.concatenate((vector[0], vector[1]), axis=1))

if feature_rows:
    features = pd.DataFrame(np.concatenate(feature_rows, axis=0))
else:
    # No change objects at all: same empty frame the original started from.
    features = pd.DataFrame()

In [None]:
from sklearn.cluster import KMeans

# k-means over the context embeddings. The number of clusters is left at the
# estimator's default; the seed is fixed so repeated runs give the same labels.
KMEANS_SEED = 42
clusterer = KMeans(random_state=KMEANS_SEED)
clusters = clusterer.fit_predict(features)

In [None]:
# Distinct k-means labels, in order of first appearance.
pd.unique(clusters)

In [None]:
from sklearn.manifold import TSNE


# t-SNE embedding of the feature matrix for plotting (fixed seed for a repeatable layout).
tsne = TSNE(random_state=42)
X = tsne.fit_transform(features)

In [None]:
# `X` and `clusters` are positional arrays with one entry per row of `merged`.
# Without an explicit index they would be labelled 0..n-1, and concat(axis=1)
# would then align them against the filtered (non-contiguous) index of `merged`,
# pairing coordinates with the wrong change objects and adding NaN-padded rows.
tsne_coords = pd.DataFrame(X, index=merged.index)
cluster_labels = pd.Series(clusters, index=merged.index)
plot_data = pd.concat([tsne_coords, cluster_labels, merged], axis=1)

# First three names label the t-SNE coordinates and the cluster id; the rest
# must match the column order of `merged` exactly.
plot_data.columns = ['t-SNE-X', 't-SNE-Y', 'cluster', 'Bulk',
                  'Link',          'action',     'birth_place',
           'del_end_pos',   'del_start_pos',      'del_tokens',
        'del_tokens_str',          'editor',        'from_rev',
        'from_timestamp',     'ins_end_pos',   'ins_start_pos',
            'ins_tokens',  'ins_tokens_str',      'left_neigh',
            'left_token',  'left_token_str',     'nationality',
               'page_id',     'right_neigh',     'right_token',
       'right_token_str',            'text',          'to_rev',
          'to_timestamp',           'token']

In [None]:
import matplotlib.pyplot as plt
%matplotlib inline
# Uncoloured overview of the t-SNE projection, one dot per row of `plot_data`.
fig = plt.figure(
    figsize=(16, 10),
    dpi=80,
    facecolor='w',
    edgecolor='k',
)
tsne_x, tsne_y = plot_data["t-SNE-X"], plot_data["t-SNE-Y"]
plt.scatter(tsne_x, tsne_y, s=10)

## t-SNE projection coloured by k-means cluster

In [None]:
import plotly.io
from plotly.offline import init_notebook_mode, iplot
import plotly.graph_objs as go
import random

# enable javascript support
init_notebook_mode(connected=True)

# Random colour channel value (0-255); three calls make one RGB colour.
r = lambda: random.randint(0,255)

# One scatter trace per k-means cluster so each gets its own colour and legend entry.
traces = []
for c in plot_data.cluster.unique():
    # Select the cluster's rows once so x, y and the hover text describe the
    # same points.
    members = plot_data.loc[plot_data["cluster"] == c]

    trace = go.Scatter(
        x=members["t-SNE-X"],
        y=members["t-SNE-Y"],
        mode='markers',
        name=str(c),
        marker=go.scatter.Marker(size=4, color='#%02X%02X%02X' % (r(), r(), r())),
        # Hover text is matched to points by position. The original passed the
        # index of the whole frame, so labels did not belong to the points shown.
        text=members.index,
        showlegend=True,
    )
    traces.append(trace)

data = traces

# NOTE(review): sys.path only affects module imports, not where the image is
# written; kept so the interpreter's import path is unchanged for later cells.
sys.path.insert(1, '../bert-tfidf/graphs')

# Plot and embed in ipython notebook; the image name follows the configured
# context size instead of a hard-coded 10.
iplot(data, filename='kmeans_context_%d' % context, image='png', image_height=500, image_width=700)

## Nationality

In [None]:
# enable javascript support
init_notebook_mode(connected=True)

# Random colour channel value (0-255); three calls make one RGB colour.
r = lambda: random.randint(0,255)

# One trace per nationality label; None stands for rows whose label is missing.
traces = []
for c in ["Y", "N", None]:
    if c is None:
        selected = plot_data["nationality"].isna()
    else:
        selected = plot_data["nationality"] == c
    # Select the rows once so x, y and the hover text describe the same points.
    members = plot_data.loc[selected]

    trace = go.Scatter(
        x=members["t-SNE-X"],
        y=members["t-SNE-Y"],
        mode='markers',
        name=str(c),
        marker=go.scatter.Marker(size=4, color='#%02X%02X%02X' % (r(), r(), r())),
        # Hover text is matched to points by position. The original passed the
        # index of the whole frame, so labels did not belong to the points shown.
        text=members.index,
        showlegend=True,
    )
    traces.append(trace)

data = traces

# Plot and embed in ipython notebook
iplot(data, filename='basic-scatter')

## t-SNE projection coloured by DBSCAN cluster

In [None]:
from sklearn.cluster import DBSCAN
from sklearn import metrics
# Density-based clustering of the same feature matrix used for k-means.
db = DBSCAN(eps=10, min_samples=8)
db.fit(features)
labels = db.labels_

# The label -1 marks noise points: leave it out of the cluster count and
# tally it separately.
distinct_labels = set(labels)
distinct_labels.discard(-1)
n_clusters_ = len(distinct_labels)
n_noise_ = sum(1 for label in labels if label == -1)

print('Estimated number of clusters: %d' % n_clusters_)
print('Estimated number of noise points: %d' % n_noise_)

In [None]:
# `X` and `labels` are positional arrays with one entry per row of `merged`.
# Without an explicit index they would be labelled 0..n-1, and concat(axis=1)
# would then align them against the filtered (non-contiguous) index of `merged`,
# pairing coordinates with the wrong change objects and adding NaN-padded rows.
tsne_coords_db = pd.DataFrame(X, index=merged.index)
dbscan_labels = pd.Series(labels, index=merged.index)
plot_data_dbscan = pd.concat([tsne_coords_db, dbscan_labels, merged], axis=1)

# First three names label the t-SNE coordinates and the DBSCAN cluster id; the
# rest must match the column order of `merged` exactly.
plot_data_dbscan.columns = ['t-SNE-X', 't-SNE-Y', 'db_cluster', 'Bulk',
                  'Link',          'action',     'birth_place',
           'del_end_pos',   'del_start_pos',      'del_tokens',
        'del_tokens_str',          'editor',        'from_rev',
        'from_timestamp',     'ins_end_pos',   'ins_start_pos',
            'ins_tokens',  'ins_tokens_str',      'left_neigh',
            'left_token',  'left_token_str',     'nationality',
               'page_id',     'right_neigh',     'right_token',
       'right_token_str',            'text',          'to_rev',
          'to_timestamp',           'token']

In [None]:
import plotly.io
from plotly.offline import init_notebook_mode, iplot
import plotly.graph_objs as go
import random
# enable javascript support
init_notebook_mode(connected=True)

# Random colour channel value (0-255); three calls make one RGB colour.
r = lambda: random.randint(0,255)

# One scatter trace per DBSCAN label so each gets its own colour and legend entry.
traces = []
for c in plot_data_dbscan.db_cluster.unique():
    # Select the label's rows once so x, y and the hover text describe the
    # same points.
    members = plot_data_dbscan.loc[plot_data_dbscan["db_cluster"] == c]

    # Noise (label -1) is always drawn grey; real clusters get a random colour.
    if c == -1:
        colour = 'grey'
    else:
        colour = '#%02X%02X%02X' % (r(), r(), r())

    trace = go.Scatter(
        x=members["t-SNE-X"],
        y=members["t-SNE-Y"],
        mode='markers',
        name=str(c),
        marker=go.scatter.Marker(size=4, color=colour),
        # Hover text is matched to points by position. The original passed the
        # index of the whole frame, so labels did not belong to the points shown.
        text=members.index,
        showlegend=True,
    )
    traces.append(trace)

data = traces

# NOTE(review): sys.path only affects module imports, not where the image is
# written; kept so the interpreter's import path is unchanged for later cells.
sys.path.insert(1, '../bert-tfidf/graphs')

# Plot and embed in ipython notebook. The original reused the k-means file name
# here (copy-paste), so the DBSCAN image was exported under the k-means name.
iplot(data, filename='dbscan_context_%d' % context, image='png', image_height=500, image_width=700)