Merges change objects from wikiwho with our ground truth labels from the John Logie Baird Wikipedia article (section **Merging**). We could not use the normal join operator since we have to merge elements of the John Logie Baird table (**one token**) with elements from the change object table (**list of tokens**).

Afterwards we calculate the features (section **Feature Creation**) using word embeddings, in this case [fasttext subword embeddings](https://fasttext.cc/docs/en/english-vectors.html). To switch to other fasttext embeddings, just replace the file (see dependencies); for Google embeddings there is code in the notebooks folder, or just ask me ;). Depending on the variables ``GAP``, ``LEFT_CONTEXT`` and ``RIGHT_CONTEXT`` (from `utils/const.py`), 300 to 1200 dimensions are created for each change object. `LEFT_CONTEXT` and `RIGHT_CONTEXT` each account for 300 dimensions; their values are integers that state how many tokens of the left and right context are considered. `GAP` is a boolean variable that states whether we include the gap, i.e. the lists of deleted and inserted tokens. If `GAP` is set, we get an additional 600 dimensions (300 for the inserted tokens and 300 for the deleted tokens).
Since each of these parts is a list of tokens, we *average* the token embeddings separately for the inserted tokens, the deleted tokens, the left context and the right context.

Then you have the choice of clustering the embeddings either with K-Means (section **(a) K-means clustering**) or with DBSCAN (section **(b) DBSCAN clustering**). Note that the subsection **reverse look-up** searches for the words closest to the centroids of the clustering in order to label our clusters. This step is optional; its result is not used anywhere else in the code.

Finally we reduce the dimensionality of our data with t-SNE (section **t-SNE**) so that it can be plotted in a two-dimensional graph, and save the data to be plotted by `notebooks/t-SNE_plotting.ipynb`.


**Dependencies**:
- `utils/const.py`: for constants shared with `notebooks/t-SNE_plotting.ipynb`
- `utils/merge.py`: for merging of the two dataframes `jlogie` and `df`
- `data/wiki-news-300d-1M-subword.vec`: the pre-trained word embeddings
- locally installed [wikiwho wrapper](https://github.com/gesiscss/wikiwho_wrapper)




# Merging


In [None]:
import os
os.chdir("/home/heuzerothp/wikiwho_tsne")

import pandas as pd
from wikiwho_chobj import Chobjer




from utils.vars import *





import nltk
nltk.download("stopwords")


# Change-object extractor for article "39570"; the context argument is the
# larger of the two configured context sizes.
context_width = max(LEFT_CONTEXT, RIGHT_CONTEXT)
co = Chobjer(article="39570", pickles_path='pickles', lang='en', context=context_width)

# One iterator supplies the rows; a second one is opened only to read the
# column names from the keys of its first change object.
chobj_rows = co.iter_chobjs()
chobj_columns = next(co.iter_chobjs()).keys()
df = pd.DataFrame(chobj_rows, columns=chobj_columns)

# Ground-truth labelled tokens.
jlogie = pd.read_csv("data/John_Logie_Baird_FULL.csv")



In [None]:
import numpy as np

from utils.merge import combine

# Run utils.merge.combine on every change object (one row of df) together with
# the labelled token table.
merged = df.apply(lambda chobj: combine(chobj, jlogie), axis=1)

# captures if we also want to use change objects that do not match tokens that are ground-truth labelled
OUTER_JOIN = False

# Rows in which every column is NaN are dropped in any case.
merged = merged.dropna(how="all")
if not OUTER_JOIN:
    # Inner join: discard rows where all four ground-truth label columns are
    # missing, i.e. keep rows with at least one label.
    label_columns = ["birth_place", "Bulk", "nationality", "Link"]
    unlabelled = merged[label_columns].isna().all(axis=1)
    merged = merged[~unlabelled]

### remove Bulks

In [None]:
# remove gaps longer than 20 tokens
if GAP:
    # Per-row element counts of the inserted and the deleted token sequences.
    inserted_len = merged['ins_tokens_str'].str.len()
    deleted_len = merged['del_tokens_str'].str.len()
    gap_is_short = (inserted_len + deleted_len) <= 20
    merged = merged[gap_is_short]

##  Feature Creation


In [None]:
import io

def load_vectors(fname):
    """Load word vectors from a text file with a two-integer header line.

    The first line must consist of exactly two integers (it is parsed and then
    ignored). Every following line holds a word followed by its
    space-separated vector components.

    Args:
        fname: Path of the vector file. It is read as UTF-8; undecodable
            bytes are ignored.

    Returns:
        dict mapping each word to its vector as a ``list`` of ``float``.
        The values are real lists rather than lazy ``map`` iterators, so they
        can be read any number of times.

    Raises:
        ValueError: If the header line is not two integers or a vector
            component cannot be parsed as a float.
    """
    data = {}
    # The context manager closes the file even if parsing fails half-way.
    with open(fname, 'r', encoding='utf-8', newline='\n', errors='ignore') as fin:
        # Header line; unpacking into two names validates its shape.
        _n_words, _dim = map(int, fin.readline().split())
        for line in fin:
            tokens = line.rstrip().split(' ')
            # Materialise the floats now: a bare map() object would be
            # exhausted after the first read.
            data[tokens[0]] = list(map(float, tokens[1:]))
    return data

# Word -> vector lookup built from the pre-trained embedding file.
embed = load_vectors('data/wiki-news-300d-1M-subword.vec')

In [None]:
from nltk.corpus import stopwords
import numpy as np
from gensim.sklearn_api import W2VTransformer
from gensim.models import KeyedVectors
from copy import deepcopy
import pdb

def transform(phrase : list, embedding):
    """Look up the embedding of every token of ``phrase`` that is known.

    Tokens missing from ``embedding`` are skipped. Each stored value is
    deep-copied before it is turned into a list, so an iterator-valued entry
    of ``embedding`` is not consumed by the look-up.

    Args:
        phrase: Sequence of tokens.
        embedding: Mapping from token to its vector.

    Returns:
        An array with one row per known token (in phrase order), or ``None``
        when no token of ``phrase`` is in ``embedding``.
    """
    known_vectors = [
        list(deepcopy(embedding[token]))
        for token in phrase
        if token in embedding
    ]
    if not known_vectors:
        return None
    return np.stack(known_vectors)

def filter_stopwords(phrase):
    """Return the words of ``phrase`` that are not English stopwords.

    Args:
        phrase: Iterable of words (assumed to be hashable, e.g. ``str``).

    Returns:
        list of the remaining words, in their original order.
    """
    # Fetch the stopword list once per call and keep it as a set, instead of
    # re-reading and linearly scanning the list for every single word.
    english_stopwords = set(stopwords.words('english'))
    return [word for word in phrase if word not in english_stopwords]

def _mean_embedding(tokens):
    """Average the embeddings of the non-stopword tokens; zeros if none is known."""
    vectors = transform(filter_stopwords(list(tokens)), embed)
    if vectors is None:
        # No token has an embedding: fall back to a float zero vector.
        return np.zeros(WORD_EMBED_SIZE)
    return np.mean(vectors, axis=0)


def create_features(chobj, use_gap, left_context, right_context):
    """Build the feature vector of one change object.

    The vector is the concatenation, in this order, of the averaged
    embeddings of
      * the last ``left_context`` left-context tokens (if ``left_context > 0``),
      * the first ``right_context`` right-context tokens (if ``right_context > 0``),
      * the inserted tokens and then the deleted tokens (if ``use_gap``).
    Stopwords and tokens without an embedding are ignored; a part without any
    usable token contributes ``WORD_EMBED_SIZE`` zeros.

    Args:
        chobj: Row-like object providing ``left_token_str``,
            ``right_token_str``, ``ins_tokens_str`` and ``del_tokens_str``.
        use_gap: Whether to include the inserted/deleted token parts.
        left_context: Number of left-context tokens to use (0 disables).
        right_context: Number of right-context tokens to use (0 disables).

    Returns:
        pd.Series holding the concatenated features with NaNs replaced by 0.

    Raises:
        ValueError: If no part is enabled, i.e. both context sizes are 0 and
            ``use_gap`` is false. (Previously this case dropped into the
            debugger and then failed on an unbound variable.)
    """
    parts = []
    if left_context > 0:
        # Guarded because a slice of [-0:] would select the whole context.
        parts.append(_mean_embedding(chobj["left_token_str"][-left_context:]))
    if right_context > 0:
        parts.append(_mean_embedding(chobj["right_token_str"][:right_context]))
    if use_gap:
        parts.append(_mean_embedding(chobj["ins_tokens_str"]))
        parts.append(_mean_embedding(chobj["del_tokens_str"]))

    if not parts:
        raise ValueError(
            "no features requested: left_context and right_context are 0 "
            "and use_gap is false"
        )

    return pd.Series(np.nan_to_num(np.concatenate(parts)))

# One feature vector per row of merged: apply() hands each row to
# create_features as first argument and forwards the extra keyword arguments.
Embedded = merged.apply(
    create_features,
    axis=1,
    use_gap=GAP,
    left_context=LEFT_CONTEXT,
    right_context=RIGHT_CONTEXT,
)

## (a) K-means clustering


In [None]:
from sklearn.cluster import KMeans

# K-means on the feature matrix. n_clusters is not passed, so the library
# default number of clusters applies; random_state is pinned to 42.
clusterer = KMeans(random_state=42)
# Fit on Embedded and keep the cluster label assigned to each row.
clusters = clusterer.fit_predict(Embedded)

In [None]:
# Notebook output: the distinct cluster labels that were assigned.
pd.Series(clusters).unique()


### reverse look-up


In [None]:
from sklearn.neighbors import NearestNeighbors
import numpy as np

def average_vectors_to_shape_300(vec):
    """Fold a long vector into 300 dimensions by averaging its 300-wide chunks.

    ``vec`` is treated as ``fold = len(vec) // 300`` consecutive chunks of 300
    values; element ``i`` of the result is the mean of element ``i`` of every
    chunk. Values after the last complete chunk are ignored.

    Args:
        vec: 1-D array or sequence of numbers.

    Returns:
        np.ndarray of shape ``(300,)`` and dtype float64. If ``vec`` has fewer
        than 300 elements there is no complete chunk and every entry is NaN
        (NumPy emits a RuntimeWarning).
    """
    flat = np.asarray(vec, dtype=float)
    fold = flat.shape[0] // 300
    # View the complete chunks as rows of a (fold, 300) matrix and average
    # column-wise: one vectorised reduction instead of 300 * fold Python steps.
    return flat[:fold * 300].reshape(fold, 300).mean(axis=0)

# Fold every cluster centre down to 300 dimensions so it can be compared with
# single word vectors.
centroids = clusterer.cluster_centers_
centroids_300 = [average_vectors_to_shape_300(centroid) for centroid in centroids]

# Vocabulary and its vectors, both in the iteration order of the dict.
embed_keys = list(embed.keys())
embed_vals = [list(deepcopy(vector)) for vector in embed.values()]

X = np.array(embed_vals)
nbrs = NearestNeighbors(n_neighbors=5, algorithm='ball_tree').fit(X)
dists, inds = nbrs.kneighbors(centroids_300)

# cluster number -> [(word, distance), ...] for the neighbours found above
closest_words = {}
for i in range(len(pd.Series(clusters).unique())):
    print(i)
    closest_words[i] = [
        (embed_keys[word_idx], dist)
        for word_idx, dist in zip(inds[i], dists[i])
    ]

## (b) DBSCAN clustering


In [None]:
from sklearn.cluster import DBSCAN

# Alternative to K-means: running this cell rebinds `clusterer` and `clusters`,
# replacing whatever an earlier clustering cell stored in them.
clusterer = DBSCAN(eps=0.75, min_samples=5)
# Fit on Embedded and keep the label assigned to each row.
clusters = clusterer.fit_predict(Embedded)


In [None]:
# Notebook output: the distinct labels produced by the clustering above.
pd.Series(clusters).unique()

## t-SNE


In [None]:
from sklearn.manifold import TSNE

# Low-dimensional projection of the feature vectors for plotting, with the
# random state pinned to 42. Rebinds X, which the reverse look-up cell also
# uses for the embedding matrix.
X = TSNE(random_state=42).fit_transform(Embedded)


# Save data for plotting



In [None]:
# saves data for plotting

# pd.concat(axis=1) aligns rows by index label, while the frames built from X
# and clusters are indexed 0..n-1. Give merged the same positional index.
# drop=True discards the old index instead of inserting it as an extra
# "index" column, which would leave the frame with one column more than the
# names assigned below.
if not (merged.index == range(len(merged))).all():
    merged = merged.reset_index(drop=True)

plot_data = pd.concat([pd.DataFrame(X), pd.Series(clusters), merged], axis=1)

# Positional renaming: two t-SNE coordinates, the cluster label, then the
# columns of merged in their existing order.
plot_data.columns = [
    't-SNE-X', 't-SNE-Y', 'cluster',
    'Bulk', 'Link', 'action', 'birth_place',
    'del_end_pos', 'del_start_pos', 'del_tokens',
    'del_tokens_str', 'editor', 'from_rev',
    'from_timestamp', 'ins_end_pos', 'ins_start_pos',
    'ins_tokens', 'ins_tokens_str', 'left_neigh',
    'left_token', 'left_token_str', 'nationality',
    'page_id', 'right_neigh', 'right_token',
    'right_token_str', 'text', 'to_rev',
    'to_timestamp', 'token',
]

# NOTE: the file name is fixed; it says "inner" and "kmeans" regardless of the
# OUTER_JOIN setting and of which clustering cell was run last.
plot_data.to_csv("data/plotData_jlb_inner_kmeans.csv")



