In [None]:
import os

# All relative paths used in this notebook ('pickles', 'data/...') are resolved
# against this directory. Set WIKIWHO_TSNE_DIR to run the notebook from another
# checkout; the default is the original hard-coded location.
# NOTE(review): the chdir is kept before the imports below, as in the original --
# presumably so that wikiwho_chobj can be imported from this directory; confirm.
os.chdir(os.environ.get("WIKIWHO_TSNE_DIR", "/home/heuzerothp/wikiwho_tsne"))

import pandas as pd
from wikiwho_chobj import Chobjer

import nltk
nltk.download("stopwords")

co = Chobjer(article="1636145", pickles_path='pickles', lang='en', context=5)

# Materialise the change objects once. The column order is taken from the first
# record; an empty result yields an empty frame instead of a StopIteration.
chobjs = list(co.iter_chobjs())
df = pd.DataFrame(chobjs, columns=list(chobjs[0].keys()) if chobjs else None)

## Embed words by creating one vector of length 300 for the inserted tokens and one for the deleted tokens, so the resulting vector for one change object has length 600

In [None]:
import io

def load_vectors(fname):
    """Read a text file of word vectors into a dictionary.

    The first line must hold two whitespace-separated integers; it is parsed
    (so a malformed header still raises ``ValueError``) but otherwise unused.
    Every following line is split on single spaces: the first field is the
    word, the remaining fields are its vector components.

    Args:
        fname: Path of the vector file. It is read as UTF-8 and undecodable
            bytes are ignored.

    Returns:
        dict mapping each word to a ``list`` of floats.
    """
    data = {}
    # The context manager closes the file even if a line fails to parse.
    with io.open(fname, 'r', encoding='utf-8', newline='\n', errors='ignore') as fin:
        _n_words, _dim = map(int, fin.readline().split())
        for line in fin:
            tokens = line.rstrip().split(' ')
            # Store a real list: a bare map() is a one-shot iterator in Python 3
            # and would be empty after its first use.
            data[tokens[0]] = list(map(float, tokens[1:]))
    return data

# Location of the word-vector file, relative to the current working directory.
VECTORS_FILE = 'data/wiki-news-300d-1M.vec'
embed = load_vectors(VECTORS_FILE)

In [None]:
from nltk.corpus import stopwords
import numpy as np
from gensim.sklearn_api import W2VTransformer
from gensim.models import KeyedVectors
from copy import deepcopy
import pdb

# Length of the NaN placeholder vector that create_features substitutes when
# none of a group's tokens has an embedding.
WORD_EMBED_SIZE = 300
# Number of tokens taken from the end of "left_token_str"; passed to
# create_features as left_context.
LEFT_CONTEXT = 5
# Number of tokens taken from the start of "right_token_str"; passed to
# create_features as right_context.
RIGHT_CONTEXT = 5
# Passed to create_features as use_gap: when True the inserted and deleted
# tokens are embedded as well.
GAP = False

def transform(phrase : list, embedding):
    """Look up the vectors of all tokens of ``phrase`` that ``embedding`` knows.

    Args:
        phrase: List of tokens.
        embedding: Mapping from token to its vector.

    Returns:
        A stacked array with one row per token found in ``embedding`` (in
        phrase order), or ``None`` when no token is found.
    """
    # Each vector is copied and materialised as a list so the entry stored in
    # ``embedding`` is left untouched.
    found = [list(deepcopy(embedding[token])) for token in phrase if token in embedding]
    if not found:
        return None
    return np.stack(found)

# Stopword sets already loaded, keyed by language name.
_STOPWORD_SETS = {}


def _stopword_set(language):
    """Return the NLTK stopwords of ``language`` as a frozenset, loading them only once."""
    if language not in _STOPWORD_SETS:
        _STOPWORD_SETS[language] = frozenset(stopwords.words(language))
    return _STOPWORD_SETS[language]


def filter_stopwords(phrase):
    """Drop English stopwords from a sequence of tokens.

    Args:
        phrase: Iterable of tokens (strings).

    Returns:
        list of the tokens of ``phrase`` that are not in NLTK's English
        stopword list, in their original order.
    """
    # Load the list once instead of once per token, and test membership
    # against a set rather than scanning a list.
    stop = _stopword_set('english')
    return [word for word in phrase if word not in stop]

def _mean_embedding(tokens):
    """Average the embeddings of the non-stopword tokens; all-NaN vector if none is embedded."""
    wordvecs = transform(filter_stopwords(list(tokens)), embed)
    if wordvecs is None:
        return np.full(WORD_EMBED_SIZE, np.nan)
    return np.mean(wordvecs, axis=0)


def create_features(chobj, use_gap, left_context, right_context):
    """Build the feature vector of one change object.

    Every selected token group is reduced to the mean of its word embeddings
    and the group vectors are concatenated in the order left context, right
    context, inserted tokens, deleted tokens.

    Args:
        chobj: Row providing "left_token_str" and "right_token_str" and, when
            ``use_gap`` is set, "ins_tokens_str" and "del_tokens_str".
        use_gap: Whether to include the inserted and deleted tokens.
        left_context: Number of tokens taken from the end of the left context;
            the group is skipped unless this is > 0.
        right_context: Number of tokens taken from the start of the right
            context; the group is skipped unless this is > 0.

    Returns:
        pd.Series holding the concatenated group vectors. A group without any
        embedded token contributes WORD_EMBED_SIZE zeros (NaNs are replaced
        by 0).

    Raises:
        ValueError: If no group is selected, i.e. both context sizes are
            <= 0 and ``use_gap`` is false.
    """
    parts = []
    if left_context > 0:
        parts.append(_mean_embedding(chobj["left_token_str"][-left_context:]))
    if right_context > 0:
        parts.append(_mean_embedding(chobj["right_token_str"][:right_context]))
    if use_gap:
        parts.append(_mean_embedding(chobj["ins_tokens_str"]))
        parts.append(_mean_embedding(chobj["del_tokens_str"]))

    # Fail with a clear message instead of dropping into the debugger.
    if not parts:
        raise ValueError(
            "create_features needs at least one token group: "
            "set left_context > 0, right_context > 0 or use_gap=True"
        )

    return pd.Series(np.nan_to_num(np.concatenate(parts)))

def _features_for_row(row):
    """Compute the feature vector of one row of ``df`` with the notebook-level settings."""
    return create_features(
        row,
        use_gap=GAP,
        left_context=LEFT_CONTEXT,
        right_context=RIGHT_CONTEXT,
    )


# One feature vector per change object (row-wise apply).
Embedded = df.apply(_features_for_row, axis=1)

## Visualization and plotting

In [None]:
from sklearn.manifold import TSNE

# t-SNE is stochastic: fix the seed so the layout (and every plot below) is
# reproducible across kernel restarts.
TSNE_SEED = 0
X = TSNE(random_state=TSNE_SEED).fit_transform(Embedded)

%matplotlib inline
import matplotlib.pyplot as plt

### of birth place

In [None]:
# Scatter plot of the two t-SNE dimensions on an explicitly created axes.
fig, ax = plt.subplots(figsize=(18, 16), dpi=80, facecolor='w', edgecolor='k')
ax.scatter(X[:, 0], X[:, 1], s=10)

### of nationality

In [None]:
# Scatter plot of the two t-SNE dimensions on an explicitly created axes.
fig, ax = plt.subplots(figsize=(18, 16), dpi=80, facecolor='w', edgecolor='k')
ax.scatter(X[:, 0], X[:, 1], s=10)

### interactively

In [None]:
%matplotlib inline
from plotly.offline import init_notebook_mode, iplot
import plotly.graph_objs as go

# enable javascript support
init_notebook_mode(connected=True)

# Marker-only scatter trace of the two t-SNE dimensions; each point's text is
# taken from the index of Embedded.
marker_style = go.scatter.Marker(size=4)
trace = go.Scatter(x=X[:, 0], y=X[:, 1], mode='markers', marker=marker_style, text=Embedded.index)

data = [trace]

# Plot and embed in ipython notebook
iplot(data, filename='basic-scatter')