In [None]:
import os
os.chdir("..")

import pandas as pd
from wikiwho_chobj import Chobjer

import nltk
nltk.download("stopwords")

# Change objects of article "39570" as exposed by wikiwho_chobj's Chobjer.
co = Chobjer(article="39570", pickles_path='pickles', lang='en', context=5)
# Rows come from one iter_chobjs() pass; the column names are the keys of the
# first change object produced by a second, separate pass.
df = pd.DataFrame(
    co.iter_chobjs(),
    columns=next(co.iter_chobjs()).keys(),
)

# Table that the merge step below uses as ground-truth labels.
jlogie = pd.read_csv("data/John_Logie_Baird_FULL.csv")

## Merge ground-truth labels with change object dataframe

In [None]:
import numpy as np

def combine(chobj):
    """Attach the ground-truth labels from ``jlogie`` to one change object.

    Intended to be called through ``DataFrame.apply(..., axis=1)`` on a
    dataframe of change objects as provided by wikiwho. Reads the module-level
    ``jlogie`` dataframe, which holds the ground-truth labels.

    A ``jlogie`` row *matches* the change object when its ``rev_id`` equals
    ``chobj["to_rev"]`` and its ``token_id`` is contained in
    ``chobj["ins_tokens"]``.

    Parameters
    ----------
    chobj : pandas.Series
        One change object; must provide ``to_rev`` and ``ins_tokens``. It is
        not modified: the labels are written to a copy.

    Returns
    -------
    pandas.Series
        * no matching row: an empty Series (the caller removes such rows
          with ``dropna(how="all")``);
        * exactly one matching row: a copy of ``chobj`` extended with
          ``nationality``, ``birth_place``, ``Link``, ``Bulk``, ``token`` and
          ``action`` taken from that row;
        * several matching rows: a copy of ``chobj`` extended with
          ``nationality``, ``birth_place``, ``Link`` and ``Bulk`` only. Each
          label is the common value if all matching rows agree on it,
          otherwise ``None`` (and a message is printed).
    """
    label_columns = ["nationality", "birth_place", "Link", "Bulk"]
    # Explicit dtype: the default dtype of an empty Series differs between
    # pandas versions and older releases warn about it.
    no_match = pd.Series(dtype=object)

    rev_rows = jlogie[jlogie["rev_id"] == chobj["to_rev"]]
    if rev_rows.empty:
        return no_match

    # Select the matching rows themselves. Taking "the first row of the
    # revision" instead would pick the wrong labels whenever the revision has
    # several labelled tokens and the match is not the first of them.
    matched = rev_rows[rev_rows["token_id"].isin(chobj["ins_tokens"])]
    if matched.empty:
        return no_match

    labelled = chobj.copy()

    if len(matched) == 1:
        source_row = matched.iloc[0]
        for col in label_columns + ["token", "action"]:
            labelled[col] = source_row[col]
        return labelled

    # Several labelled tokens were inserted by this change object: keep a
    # label only when all of them agree on it.
    for col in label_columns:
        if len(matched[col].unique()) == 1:
            labelled[col] = matched[col].iloc[0]
        else:
            labelled[col] = None
            print("non congruent values found for df['to_rev'] == ", str(chobj["to_rev"]), " and token ids: ", list(matched["token_id"]), " in jlogie. Setting None to column ", str(col))
    return labelled

# Only change objects whose target revision appears in the label table can
# match, so the others are discarded before the row-wise merge.
pre_merge_optimization = df.loc[df["to_rev"].isin(jlogie["rev_id"].unique())]
# Rows for which combine() returned an empty Series carry no values and are
# removed by dropna(how="all").
merged = pre_merge_optimization.apply(combine, axis=1).dropna(how="all")

## Embed words: average the 300-dimensional word vectors of the inserted tokens and of the deleted tokens, then concatenate the two averages, so that each change object is represented by a vector of length 600

In [None]:
from nltk.corpus import stopwords
import numpy as np
from gensim.sklearn_api import W2VTransformer
from gensim.models import KeyedVectors
from copy import deepcopy

# Length of a single word vector. The embedding file loaded in this cell is
# named "...negative300" — presumably 300-dimensional; TODO confirm.
WORD_EMBED_SIZE = 300

def transform(phrase : list, embedding):
    """Stack the embedding vectors of the words of ``phrase``.

    Words that are not contained in ``embedding`` are skipped; the remaining
    vectors are copied and stacked in phrase order.

    Parameters
    ----------
    phrase : list
        Words to look up.
    embedding
        Mapping-like object supporting ``word in embedding`` and
        ``embedding[word]``.

    Returns
    -------
    numpy.ndarray or None
        One row per known word, or ``None`` when no word of ``phrase`` is
        contained in ``embedding`` (including an empty phrase).
    """
    known_vectors = [
        deepcopy(embedding[word]) for word in phrase if word in embedding
    ]
    if not known_vectors:
        return None
    return np.stack(known_vectors)

def filter_stopwords(phrase):
    """Return the words of ``phrase`` that are not English stopwords.

    The stopword list is fetched once per call and turned into a set, instead
    of calling ``stopwords.words('english')`` again for every single word.

    Parameters
    ----------
    phrase : iterable of str
        Words to filter.

    Returns
    -------
    list
        The words that are not in the stopword list, in their original order.
        Membership is an exact string comparison, so it is case-sensitive.
    """
    english_stopwords = set(stopwords.words('english'))
    return [word for word in phrase if word not in english_stopwords]

# Read the word vectors (word2vec binary format) straight from the file
# into `embed`.
embed = KeyedVectors.load_word2vec_format(
    'data/GoogleNews-vectors-negative300.bin',
    binary=True,
)

def create_features(chobj):
    """Build the feature vector of one change object.

    The inserted tokens (``chobj["ins_tokens_str"]``) and the deleted tokens
    (``chobj["del_tokens_str"]``) are handled the same way: stopwords are
    removed, the remaining words are looked up in the module-level ``embed``
    and their vectors are averaged. The two averages are concatenated,
    inserted part first.

    A side without any usable word contributes ``WORD_EMBED_SIZE`` zeros.

    Parameters
    ----------
    chobj
        Change object providing ``ins_tokens_str`` and ``del_tokens_str``.

    Returns
    -------
    pandas.Series
        Concatenation of the two mean vectors with NaN replaced via
        ``np.nan_to_num``.
    """
    def _mean_vector(tokens):
        # Mean embedding of the usable tokens, or a NaN placeholder of the
        # embedding size when transform() found no usable token.
        vectors = transform(filter_stopwords(list(tokens)), embed)
        if vectors is None:
            return np.full(WORD_EMBED_SIZE, np.nan)
        return np.mean(vectors, axis=0)

    ins_mean = _mean_vector(chobj["ins_tokens_str"])
    del_mean = _mean_vector(chobj["del_tokens_str"])

    # nan_to_num turns the NaN placeholders into zeros.
    return pd.Series(np.nan_to_num(np.concatenate((ins_mean, del_mean))))

# One feature vector (row) per merged change object.
Embedded = merged.apply(create_features, axis=1)

## Visualization and plotting

In [None]:
from sklearn.manifold import TSNE

# t-SNE is stochastic: fix the seed so that Restart & Run All reproduces the
# same 2-D layout.
X = TSNE(random_state=42).fit_transform(Embedded)

In [None]:
def convert_to_colors(entries):
    """Map label values to matplotlib colour codes, one colour per entry.

    ``"Y"`` becomes ``"r"``, ``"N"`` becomes ``"b"`` and every other value
    (``None``, NaN, or an unrecognised label) becomes ``"g"``.

    The result always has exactly as many elements as ``entries``. Skipping
    unrecognised values would make the colour list shorter than the point
    array it is plotted against.

    Parameters
    ----------
    entries : iterable
        Label values.

    Returns
    -------
    list of str
        Colour codes in the order of ``entries``.
    """
    palette = {"Y": "r", "N": "b"}
    fallback = "g"
    # Only strings are looked up, so NaN and unhashable values fall through to
    # the fallback colour.
    return [
        palette.get(entry, fallback) if isinstance(entry, str) else fallback
        for entry in entries
    ]
            
%matplotlib inline
import matplotlib.pyplot as plt


### Colored by birth place label

In [None]:
# Scatter of the 2-D t-SNE coordinates, one point per change object, coloured
# by its birth_place label.
fig, ax = plt.subplots(figsize=(18, 16), dpi=80, facecolor='w', edgecolor='k')
ax.scatter(X[:, 0], X[:, 1], c=convert_to_colors(merged["birth_place"]), s=10)
ax.set_title("t-SNE of change-object embeddings by birth_place label (red = Y, blue = N, green = other / missing)")
ax.set_xlabel("t-SNE dimension 1")
ax.set_ylabel("t-SNE dimension 2");

### Colored by nationality label

In [None]:
# Scatter of the 2-D t-SNE coordinates, one point per change object, coloured
# by its nationality label.
fig, ax = plt.subplots(figsize=(18, 16), dpi=80, facecolor='w', edgecolor='k')
ax.scatter(X[:, 0], X[:, 1], c=convert_to_colors(merged["nationality"]), s=10)
ax.set_title("t-SNE of change-object embeddings by nationality label (red = Y, blue = N, green = other / missing)")
ax.set_xlabel("t-SNE dimension 1")
ax.set_ylabel("t-SNE dimension 2");