## Clustering named entities - OECD corpus

### 1. Import and preprocess named entities 

In [None]:
import warnings
warnings.filterwarnings('ignore')
import gensim
import numpy as np
from sklearn.manifold import TSNE
from nltk.corpus import stopwords
import pandas as pd
import plotly.offline as pyo
import plotly.graph_objs as go
pyo.init_notebook_mode()

# Load the master NER dataset (one row per detected entity mention).
df = pd.read_csv('full-flair-ner-list-oecd-with-single-tokens.csv')

# Split the mentions by entity type.
orgs_df = df.loc[df['entity_type'] == 'ORG']
locs_df = df.loc[df['entity_type'] == 'LOC']
gpes_df = df.loc[df['entity_type'] == 'GPE']
persons_df = df.loc[df['entity_type'] == 'PERSON']

# Keep only the n most frequently mentioned entities of each type
# (value_counts() sorts by descending frequency).
n = 200
orgs = orgs_df['entity_as_single_token'].value_counts().head(n).index.tolist()
locs = locs_df['entity_as_single_token'].value_counts().head(n).index.tolist()
gpes = gpes_df['entity_as_single_token'].value_counts().head(n).index.tolist()
persons = persons_df['entity_as_single_token'].value_counts().head(n).index.tolist()

# De-duplicated union of the ORG, PERSON and GPE entities (LOC is not included).
all_entities = list(set(orgs + persons + gpes))
# print(orgs)
# print()
# print(gpes)
# print()
# print(persons)
# print()
# print(all_entities)

### 2. Import and prepare trained word vectors

In [None]:
# Alternative model (vector size 200, window size 20), currently disabled:
# filepath_200_20 = '/Users/kodymoodley/Documents/nlesc-projects/disaster-capitalism/embeddings/models/gensim-oecd-word2vec-200-20.model'
# d_testmodel = gensim.models.Word2Vec.load(filepath_200_20)
# Load the chosen model: vector size 100, window size 20.
# NOTE(review): this is an absolute path on one developer's machine; it has to
# be adapted before the notebook can run anywhere else.
filepath_100_20 = '/Users/kodymoodley/Documents/nlesc-projects/disaster-capitalism/embeddings/models/gensim-oecd-word2vec-100-20.model'
c_testmodel = gensim.models.Word2Vec.load(filepath_100_20)

In [None]:
# Vocabulary of the model, in the model's own index order.
words_in_model = list(c_testmodel.wv.index_to_key)

# Embedding of every vocabulary word; all_vectors[i] belongs to words_in_model[i].
all_vectors = [c_testmodel.wv[token] for token in words_in_model]

# Quick sanity check: show the first few vocabulary entries.
print(words_in_model[:10])

### 3. Use t-SNE to reduce dimensions of vectors to 2D and 3D

In [None]:
# define function to compute 3D / 2D coordinates (dimensionality reduction) for word vectors
def get_coordinates(model, vector_size, words, dimensions):
    """Project the vectors of ``words`` to 2 or 3 dimensions with t-SNE.

    Words are coloured by membership of the module-level ``orgs``, ``gpes``
    and ``persons`` lists (checked in that order); every other word is white
    and gets an empty label.

    Args:
        model: object whose ``wv`` attribute maps a word to its vector.
        vector_size: unused; kept so existing calls keep working.
        words: iterable of words to project. Words for which ``model.wv``
            raises ``KeyError`` are skipped.
        dimensions: 2 or 3.

    Returns:
        ``(x, y, colors, labels)`` for 2 dimensions or
        ``(x, y, z, colors, labels)`` for 3 dimensions. ``colors`` and
        ``labels`` have one entry per projected point, in the same order.
        For any other ``dimensions`` value a message is printed and three
        empty lists are returned (historical behaviour, kept as is).
    """
    if dimensions not in [2, 3]:
        print("incorrect number of dimensions: possible values are the integers 2 or 3")
        return [], [], []

    # Build the lookup sets once instead of scanning / concatenating the
    # entity lists for every single word.
    org_set, gpe_set, person_set = set(orgs), set(gpes), set(persons)

    colors = []
    vectors = []
    labels = []
    for wrd in words:
        # Look the vector up first: a word that is missing from the model must
        # not contribute a colour, otherwise colours and points drift apart.
        try:
            wrd_vector = model.wv[wrd]
        except KeyError:
            continue

        if wrd in org_set:
            curr_color = 'rgb(67, 198, 252)'
        elif wrd in gpe_set:
            curr_color = 'rgb(253, 151, 32)'
        elif wrd in person_set:
            curr_color = 'rgb(166, 226, 45)'
        else:
            curr_color = None

        vectors.append(wrd_vector)
        if curr_color is None:
            # not one of the tracked entities: white marker, no label
            colors.append('rgb(255, 255, 255)')
            labels.append('')
        else:
            colors.append(curr_color)
            labels.append(wrd)

    tsne = TSNE(n_components=dimensions, random_state=0)
    np.set_printoptions(suppress=True)
    Y = tsne.fit_transform(np.asarray(vectors))

    if dimensions == 2:
        return Y[:, 0], Y[:, 1], colors, labels
    return Y[:, 0], Y[:, 1], Y[:, 2], colors, labels

In [None]:
# t-SNE clustering
x, y, z, colors, labels = get_coordinates(c_testmodel, 100, words_in_model, 3)
x, y, colors, labels = get_coordinates(c_testmodel, 100, words_in_model, 2)

In [None]:
def get_colors_and_labels(old_colors, old_labels, words_in_model, enttypes='org'):
    global orgs
    global persons
    global gpes

    new_colors = []
    new_labels = []
    
    selected_type = orgs
    
    if enttypes == 'org':
        selected_type = orgs
    elif enttypes == 'per':
        selected_type = persons
    else:
        selected_type = gpes
    
    for i in range(0, len(words_in_model)):
        try:
            if words_in_model[i] not in selected_type:
                curr_color = f'rgb({255}, {255}, {255})'
                new_colors.append(curr_color)
                new_labels.append('')
            else:
                if (words_in_model[i] in orgs):
                    curr_color = f'rgb({67}, {198}, {252})'
                    new_colors.append(curr_color)
                    new_labels.append(words_in_model[i])
                elif (words_in_model[i] in gpes):
                    curr_color = f'rgb({253}, {151}, {32})'
                    new_colors.append(curr_color)
                    new_labels.append(words_in_model[i])
                elif (words_in_model[i] in persons):
                    curr_color = f'rgb({166}, {226}, {45})'
                    new_colors.append(curr_color)
                    new_labels.append(words_in_model[i])
                else:
                    curr_color = f'rgb({255}, {255}, {255})'
                    new_colors.append(curr_color)   
                    new_labels.append('')
        except:
            pass
                    
    return new_colors, new_labels

org_colors, org_labels = get_colors_and_labels(colors, labels, words_in_model, enttypes='org')
per_colors, per_labels = get_colors_and_labels(colors, labels, words_in_model, enttypes='per')
gpe_colors, gpe_labels = get_colors_and_labels(colors, labels, words_in_model, enttypes='gpe')

### 4. Plot the 2D and 3D vectors from t-SNE

#### a. All entity types

In [None]:
plot_all_3d = [go.Scatter3d(x = x,
                    y = y,
                    z = z,
                    mode = 'markers+text',
                    text = labels,
                    textposition='bottom center',
                    hoverinfo = 'text',
                    marker=dict(color=colors,size=6,opacity=0.8))]

# data = [trace]
layout_all_3d = go.Layout(title='OECD actor clusters - ALL TYPES - 3D', autosize=False, width=1000, height=1000)
fig_all_3d = go.Figure(data=plot_all_3d, layout=layout_all_3d)
pyo.iplot(fig_all_3d) 
fig_all_3d.write_html("cluster-plots/actors-3d-alltypes.html")

In [None]:
plot_all_2d = [go.Scatter(x = x,
                    y = y,
                    mode = 'markers+text',
                    text = labels,
                    textposition='bottom center',
                    hoverinfo = 'text',
                    marker=dict(color=colors,size=6,opacity=0.8))]

layout_all_2d = go.Layout(title='OECD actor clusters - ALL TYPES - 2D',autosize=False, width=1000, height=1000)
fig_all_2d = go.Figure(data=plot_all_2d, layout=layout_all_2d)
pyo.iplot(fig_all_2d) 
fig_all_2d.write_html("cluster-plots/actors-2d-alltypes.html")

#### b. ORGs only


In [None]:
plot_all_3d = [go.Scatter3d(x = x,
                    y = y,
                    z = z,
                    mode = 'markers+text',
                    text = org_labels,
                    textposition='bottom center',
                    hoverinfo = 'text',
                    marker=dict(color=org_colors,size=6,opacity=0.8))]

# data = [trace]
layout_all_3d = go.Layout(title='OECD actor clusters - ORGs - 3D', autosize=False, width=1000, height=1000)
fig_all_3d = go.Figure(data=plot_all_3d, layout=layout_all_3d)
pyo.iplot(fig_all_3d) 
fig_all_3d.write_html("cluster-plots/actors-3d-orgs.html")

In [None]:
plot_all_2d = [go.Scatter(x = x,
                    y = y,
                    mode = 'markers+text',
                    text = org_labels,
                    textposition='bottom center',
                    hoverinfo = 'text',
                    marker=dict(color=org_colors,size=6,opacity=0.8))]

layout_all_2d = go.Layout(title='OECD actor clusters - ORGs - 2D',autosize=False, width=1000, height=1000)
fig_all_2d = go.Figure(data=plot_all_2d, layout=layout_all_2d)
pyo.iplot(fig_all_2d) 
fig_all_2d.write_html("cluster-plots/actors-2d-orgs.html")

#### c. PERs only

In [None]:
plot_all_3d = [go.Scatter3d(x = x,
                    y = y,
                    z = z,
                    mode = 'markers+text',
                    text = per_labels,
                    textposition='bottom center',
                    hoverinfo = 'text',
                    marker=dict(color=per_colors,size=6,opacity=0.8))]

# data = [trace]
layout_all_3d = go.Layout(title='OECD actor clusters - PERSONs - 3D', autosize=False, width=1000, height=1000)
fig_all_3d = go.Figure(data=plot_all_3d, layout=layout_all_3d)
pyo.iplot(fig_all_3d) 
fig_all_3d.write_html("cluster-plots/actors-3d-persons.html")

In [None]:
plot_all_2d = [go.Scatter(x = x,
                    y = y,
                    mode = 'markers+text',
                    text = per_labels,
                    textposition='bottom center',
                    hoverinfo = 'text',
                    marker=dict(color=per_colors,size=6,opacity=0.8))]

layout_all_2d = go.Layout(title='OECD actor clusters - PERSONs - 2D',autosize=False, width=1000, height=1000)
fig_all_2d = go.Figure(data=plot_all_2d, layout=layout_all_2d)
pyo.iplot(fig_all_2d) 
fig_all_2d.write_html("cluster-plots/actors-2d-persons.html")

#### d. GPEs only

In [None]:
plot_all_3d = [go.Scatter3d(x = x,
                    y = y,
                    z = z,
                    mode = 'markers+text',
                    text = gpe_labels,
                    textposition='bottom center',
                    hoverinfo = 'text',
                    marker=dict(color=gpe_colors,size=6,opacity=0.8))]

# data = [trace]
layout_all_3d = go.Layout(title='OECD actor clusters - GPEs - 3D', autosize=False, width=1000, height=1000)
fig_all_3d = go.Figure(data=plot_all_3d, layout=layout_all_3d)
pyo.iplot(fig_all_3d) 
fig_all_3d.write_html("cluster-plots/actors-3d-gpes.html")

In [None]:
plot_all_2d = [go.Scatter(x = x,
                    y = y,
                    mode = 'markers+text',
                    text = gpe_labels,
                    textposition='bottom center',
                    hoverinfo = 'text',
                    marker=dict(color=gpe_colors,size=6,opacity=0.8))]

layout_all_2d = go.Layout(title='OECD actor clusters - GPEs - 2D',autosize=False, width=1000, height=1000)
fig_all_2d = go.Figure(data=plot_all_2d, layout=layout_all_2d)
pyo.iplot(fig_all_2d) 
fig_all_2d.write_html("cluster-plots/actors-2d-gpes.html")

In [None]:
def _trim_context(sentence, span, window=3):
    """Cut ``sentence`` down to the entity plus up to ``window`` words per side.

    ``span`` is a ``'start:end'`` string of character offsets of the entity
    inside ``sentence``. The sentence is returned unchanged when neither side
    has more than ``window`` words.
    """
    span_parts = span.split(':')
    l_span = int(span_parts[0])
    r_span = int(span_parts[1])

    # The character directly before / after the entity is skipped. The left
    # bound is clamped at 0: for an entity at the very start of the sentence
    # an index of -1 would select almost the whole sentence as "left" context.
    left_words = sentence[:max(l_span - 1, 0)].split()
    right_words = sentence[r_span + 1:].split()

    if len(left_words) <= window and len(right_words) <= window:
        return sentence
    if len(left_words) <= window:
        # short left side: keep it verbatim, trim only the right side
        return sentence[:r_span] + ' ' + ' '.join(right_words[:window])
    if len(right_words) <= window:
        # short right side: keep it verbatim, trim only the left side
        return ' '.join(left_words[-window:]) + ' ' + sentence[l_span:]
    return (' '.join(left_words[-window:]) + ' ' + sentence[l_span:r_span]
            + ' ' + ' '.join(right_words[:window]))


def get_more_info_about_entity(entity):
    """Collect types, documents and short contexts for one entity.

    Reads the module-level dataframe ``df`` and selects the rows whose
    ``entity_as_single_token`` equals ``entity``.

    Args:
        entity: single-token form of the entity to look up.

    Returns:
        dict with the keys ``'name'`` (the given entity), ``'types'`` (distinct
        ``entity_type`` values), ``'docs'`` (distinct ``docid`` values) and
        ``'contexts'`` (one trimmed sentence per matching row). The lists are
        empty when the entity does not occur in ``df``.
    """
    relevant_df = df[df['entity_as_single_token'] == entity].reset_index(drop=True)

    # unique() keeps first-appearance order, so the result is reproducible
    types = relevant_df['entity_type'].unique().tolist()
    docs = relevant_df['docid'].unique().tolist()

    new_contexts = [
        _trim_context(sentence, span)
        for sentence, span in zip(relevant_df['sentence'], relevant_df['span'])
    ]
    return {'name' : entity, 'types' : types, 'docs' : docs, 'contexts' : new_contexts}

def pretty_print_entity_info(entity_info):
    """Print an entity summary in a tab-aligned, human-readable layout.

    Args:
        entity_info: dict with the keys ``'name'``, ``'types'``, ``'docs'``
            and ``'contexts'`` (a list, possibly empty).

    Returns:
        None; the summary is written to standard output.
    """
    print()
    print("name\t\t:\t", entity_info['name'])
    print()
    print("types\t\t:\t", entity_info['types'])
    print()
    print("documents\t:\t", entity_info['docs'])
    print()
    print("contexts\t:\t", end="")

    contexts = entity_info['contexts']
    if not contexts:
        # unknown entity: close the "contexts" line instead of indexing [0]
        print()
    for position, context in enumerate(contexts, start=1):
        # the first context continues the "contexts" line, the rest are
        # indented so the numbering lines up underneath it
        lead = " " if position == 1 else "\t\t\t "
        print(f"{lead}{position}. {context}")
    print()
    

In [None]:
# Example: look up one entity and show its summary.
result = get_more_info_about_entity('ibnet')
# pretty_print_entity_info prints by itself and returns None, so it must not
# be wrapped in print() (that would emit an extra "None" line).
pretty_print_entity_info(result)

### 5. KMeans clustering

In [None]:
from nltk.cluster import KMeansClusterer
import nltk
# Number of clusters requested from the KMeans clusterer.
NUM_CLUSTERS = 30
# Feature matrix for clustering: one embedding per vocabulary word.
X = all_vectors

In [None]:
# Cluster the word vectors with NLTK's KMeans using cosine distance.
# NOTE(review): no random generator is passed here, so the cluster assignment
# is presumably not reproducible between runs - confirm whether that matters.
kclusterer = KMeansClusterer(
    NUM_CLUSTERS,
    distance=nltk.cluster.util.cosine_distance,
    repeats=25,
)
# assigned_clusters[i] is the cluster index of X[i].
assigned_clusters = kclusterer.cluster(X, assign_clusters=True)

In [None]:
def lookup_word_from_word2vec(vector, model):
    """Return the first vocabulary word whose vector equals ``vector``.

    Args:
        vector: array-like embedding to look up.
        model: object whose ``wv`` attribute offers ``index_to_key`` (the
            vocabulary) and item access from word to vector.

    Returns:
        The matching word, or ``None`` when no vocabulary vector is
        element-wise equal to ``vector``.
    """
    # convert the query once instead of once per vocabulary word
    target = np.asarray(vector)
    for word in model.wv.index_to_key:
        if np.array_equal(model.wv[word], target):
            return word
    return None

In [None]:
import json

# (cluster id, word) for every vocabulary word, in vocabulary order.
raw_clusters = [
    (assigned_clusters[position], token)
    for position, token in enumerate(c_testmodel.wv.index_to_key)
]

# The distinct cluster ids that were actually assigned.
cluster_indexes = {cluster_id for cluster_id, _ in raw_clusters}

# Group the words by cluster id.
clusters = {}
for cluster_id, token in raw_clusters:
    clusters.setdefault(cluster_id, []).append(token)

with open('clustering-data/kmeans_clusters.json', 'w') as fp:
    json.dump(clusters, fp)

In [None]:
# Restrict every KMeans cluster to the tracked ORG / PERSON / GPE entities.
# A cluster is kept for an entity type only if it contains at least one
# entity of that type.

orgs_clusters = {}
persons_clusters = {}
gpes_clusters = {}
for filtered, wanted in (
    (orgs_clusters, orgs),
    (persons_clusters, persons),
    (gpes_clusters, gpes),
):
    for cluster_id, members in clusters.items():
        hits = [member for member in members if member in wanted]
        if hits:
            filtered[cluster_id] = hits

# One JSON file per entity type.
for out_path, payload in (
    ('clustering-data/kmeans_clusters_orgs.json', orgs_clusters),
    ('clustering-data/kmeans_clusters_persons.json', persons_clusters),
    ('clustering-data/kmeans_clusters_gpes.json', gpes_clusters),
):
    with open(out_path, 'w') as fp:
        json.dump(payload, fp)

### 6. Spectral clustering

In [None]:
from sklearn.cluster import SpectralClustering
import numpy as np
# Spectral clustering of the word vectors into 30 clusters.
# The assign_labels value must be exactly 'discretize'; a stray typographic
# quote inside the string made it a value scikit-learn does not accept.
sp_clustering = SpectralClustering(n_clusters=30, assign_labels='discretize', random_state=0).fit(X)
print(sp_clustering.labels_)
print(sp_clustering)

In [None]:
# len(sp_clustering.labels_)
# len(list(set(sp_clustering.labels_)))

In [None]:
# (cluster label, word) for every vocabulary word, in vocabulary order.
# The labels are converted to plain ints before being used as dict keys.
raw_sp_clusters = [
    (int(sp_clustering.labels_[position]), token)
    for position, token in enumerate(c_testmodel.wv.index_to_key)
]

# The distinct spectral cluster labels that were actually assigned.
cluster_sp_indexes = {cluster_id for cluster_id, _ in raw_sp_clusters}

# Group the words by cluster label.
sp_clusters = {}
for cluster_id, token in raw_sp_clusters:
    sp_clusters.setdefault(cluster_id, []).append(token)

with open('clustering-data/spectral_clusters.json', 'w') as fp:
    json.dump(sp_clusters, fp)

In [None]:
# Restrict every spectral cluster to the tracked ORG / PERSON / GPE entities.
# A cluster is kept for an entity type only if it contains at least one
# entity of that type.

orgs_sp_clusters = {}
persons_sp_clusters = {}
gpes_sp_clusters = {}
for filtered, wanted in (
    (orgs_sp_clusters, orgs),
    (persons_sp_clusters, persons),
    (gpes_sp_clusters, gpes),
):
    for cluster_id, members in sp_clusters.items():
        hits = [member for member in members if member in wanted]
        if hits:
            filtered[cluster_id] = hits

# One JSON file per entity type.
for out_path, payload in (
    ('clustering-data/spectral_clusters_orgs.json', orgs_sp_clusters),
    ('clustering-data/spectral_clusters_persons.json', persons_sp_clusters),
    ('clustering-data/spectral_clusters_gpes.json', gpes_sp_clusters),
):
    with open(out_path, 'w') as fp:
        json.dump(payload, fp)

### 7. Find the closest actors to each STM topic

In [None]:
# Import the topics from STM and link each topic word (using fuzzy string matching) to the word form present in the word embeddings vocabulary.
# This is necessary because stemming is applied to the STM topic words but not when generating the word embeddings.
# Lemmatization generally improves the quality of embeddings over stemming because it takes the meaning of the word
# into account when reducing it to a canonical form; stemming does not.

from thefuzz import fuzz
from thefuzz import process

def find_partial_match(given_word):
    """Find the vocabulary word that most plausibly corresponds to ``given_word``.

    The module-level ``words_in_model`` list is searched in three steps:

    1. the best fuzzy match (token-set ratio), accepted at a score of 90+;
    2. the first vocabulary word that starts with ``given_word``;
    3. the first vocabulary word that starts with ``given_word`` minus its
       last character.

    Args:
        given_word: the (typically stemmed) word to look up.

    Returns:
        The matching vocabulary word, or ``None`` when nothing matches or
        ``given_word`` is empty / whitespace only.
    """
    needle = given_word.strip()
    if not needle:
        # an empty query would "prefix-match" the first vocabulary word
        return None

    match = process.extractOne(given_word, words_in_model, scorer=fuzz.token_set_ratio, score_cutoff=80)
    if match is not None and match[1] >= 90:
        return match[0]

    # No convincing fuzzy match: fall back to prefix comparison.
    prefix_length = len(given_word)
    for word in words_in_model:
        if needle == word[:prefix_length].strip():
            return word

    # Last resort: drop the final character of the query. Skipped for
    # one-character queries, whose empty stem would match every word.
    stem = needle[:-1]
    if stem:
        stem_length = len(stem)
        for word in words_in_model:
            if stem == word[:stem_length].strip():
                return word

    return None

def get_word2vec_topic_words(topics):
    """Map topic words onto words of the embedding vocabulary.

    Each word is resolved, in this order, by an exact vocabulary hit, by a
    fuzzy match with a score cutoff of 91 (after replacing '-' with '_'), or
    by ``find_partial_match``. Words that cannot be resolved are dropped.

    Args:
        topics: iterable of topic words.

    Returns:
        List of distinct vocabulary words in order of first appearance.
    """
    # one set for O(1) exact lookups instead of scanning the list per word
    vocabulary = set(words_in_model)

    word2vec_topics = []
    for word in topics:
        if word in vocabulary:  # already verbatim in there
            word2vec_topics.append(word)
            continue

        word = word.replace('-', '_')
        match = process.extractOne(word, words_in_model, scorer=fuzz.token_set_ratio, score_cutoff=91)
        if match is not None:
            word2vec_topics.append(match[0])
            continue

        partial_match = find_partial_match(word)
        if partial_match is not None:
            word2vec_topics.append(partial_match)

    # dict.fromkeys removes duplicates but keeps a stable order, so the
    # result (and the JSON written from it) is the same on every run
    return list(dict.fromkeys(word2vec_topics))
    
def map_topics_words_to_word2vec_vocab(stm_topics_df):
    """Map the words of every STM topic onto the embedding vocabulary.

    Args:
        stm_topics_df: dataframe with an ``id`` column (topic id), a
            ``metric`` column and further columns holding the topic words.

    Returns:
        dict mapping ``int(topic id)`` to the list returned by
        ``get_word2vec_topic_words`` for all words of that topic.
    """
    results = {}
    unique_topic_ids = stm_topics_df['id'].unique()

    for topic_id in unique_topic_ids:
        # progress output: current topic id and total number of topics
        print(topic_id, ' : ', len(unique_topic_ids))
        topic_rows = stm_topics_df[stm_topics_df['id'] == topic_id].drop(['id', 'metric'], axis=1)

        # Flatten all word cells of the topic. Empty CSV cells arrive as NaN
        # (a float) and blank cells as '', so both are skipped here.
        topic_words_as_list = [
            cell.strip()
            for row in topic_rows.values.tolist()
            for cell in row
            if isinstance(cell, str) and cell.strip()
        ]

        results[int(topic_id)] = get_word2vec_topic_words(topic_words_as_list)

    return results
    
# Read the STM topic labels (semicolon-separated file).
stm_topics_df = pd.read_csv('stm_final_topic_labels.csv', sep=';')

# topic id -> topic words expressed in the embedding vocabulary
results_dict = map_topics_words_to_word2vec_vocab(stm_topics_df)

# Persist the mapping.
with open('clustering-data/stm_topics_word2vec_vocab.json', 'w') as topics_file:
    json.dump(results_dict, topics_file)


In [None]:
# Kmeans clusters for actors (ORGs)
# For every STM topic, compute its similarity to every ORG cluster and record
# the most similar cluster under the key 'highest'.

similarity_results = {}

for topic_key, topic_words in results_dict.items(): # for each topic
    # Similarities are stored as strings, as before (presumably because the
    # numpy floats returned by n_similarity are not JSON-serialisable).
    topic_scores = {
        org_cluster_key: str(c_testmodel.wv.n_similarity(topic_words, cluster_words))
        for org_cluster_key, cluster_words in orgs_clusters.items() # for each cluster
    }

    # Take the true maximum. Starting the search from 0.0 reported cluster 0
    # whenever all similarities were negative. None means "no ORG clusters".
    best_cluster = max(
        topic_scores,
        key=lambda cluster_key: float(topic_scores[cluster_key]),
        default=None,
    )
    topic_scores['highest'] = best_cluster
    similarity_results[topic_key] = topic_scores

with open('clustering-data/stm_topics_vs_org_clusters_kmeans.json', 'w') as fp:
    json.dump(similarity_results, fp)

In [None]:
# Spectral clusters for actors (ORGs)
# For every STM topic, compute its similarity to every spectral ORG cluster
# and record the most similar cluster under the key 'highest'.

similarity_results_sp = {}

for topic_key, topic_words in results_dict.items(): # for each topic
    # Similarities are stored as strings, as before (presumably because the
    # numpy floats returned by n_similarity are not JSON-serialisable).
    topic_scores = {
        org_cluster_key: str(c_testmodel.wv.n_similarity(topic_words, cluster_words))
        for org_cluster_key, cluster_words in orgs_sp_clusters.items() # for each cluster
    }

    # Compare similarity values with each other. The running maximum used to
    # be overwritten with the cluster id, so similarities ended up being
    # compared against cluster numbers. None means "no ORG clusters".
    best_cluster = max(
        topic_scores,
        key=lambda cluster_key: float(topic_scores[cluster_key]),
        default=None,
    )
    topic_scores['highest'] = best_cluster
    similarity_results_sp[topic_key] = topic_scores

with open('clustering-data/stm_topics_vs_org_clusters_spectral.json', 'w') as fp:
    json.dump(similarity_results_sp, fp)

In [None]:
# Report, for every STM topic, the ORG cluster recorded under 'highest'.
print("STM topics mapping to Kmeans clusters (ORGs):")
print("---------------------------------------------")
print()
for topic_key, topic_scores in similarity_results.items():
    print(topic_key, " : ", topic_scores['highest'])
print()
print()

print("STM topics mapping to Spectral clusters (ORGs):")
print("-----------------------------------------------")
print()
for topic_key, topic_scores in similarity_results_sp.items():
    print(topic_key, " : ", topic_scores['highest'])
print()
print()