In [2]:
import os
import copy
import math
import pickle
import spacy
from tqdm import tqdm
import numpy as np
import pandas as pd
import plotly.express as px
from graphdatascience import GraphDataScience
from sklearn.manifold import TSNE
from sklearn.metrics.pairwise import cosine_similarity
import ray

# Load spaCy's small English pipeline once for the whole notebook.
# NOTE(review): `nlp` is not referenced by any cell visible in this section - presumably it is
# used by entity-extraction code elsewhere; confirm before removing.
nlp = spacy.load('en_core_web_sm')

# Location of the tables this notebook works with.
tables_path = 'tables/tables_52_88/'

# Entity labels that are omitted whenever they are found.
uninformative_entities = [
    'DATE',
    'TIME',
    'QUANTITY',
    'ORDINAL',
    'CARDINAL',
    'MONEY',
    'PERCENT',
]

# Hyperparameter: count-based threshold.
min_ne_count = 50

# Hyperparameter: size of one bin, in years; also encoded in the name suffix below.
bin_size = 4
name_extension = f'_{bin_size}yearbinned'

  from .autonotebook import tqdm as notebook_tqdm


In [9]:
# Occurrences of each time-binned variant of the entity 'Turkey'.
# NOTE(review): the recorded output includes a 'Turkey nan' bucket, so some rows apparently
# carry no bin label - confirm where `dynamic_named_entity` is built.
(
    ne2doc_df
    .loc[ne2doc_df['named_entity'] == 'Turkey', 'dynamic_named_entity']
    .value_counts()
)

Turkey 54-58    673
Turkey 50-54    542
Turkey 58-62    445
Turkey 70-74    357
Turkey 74-78    342
Turkey 62-66    337
Turkey 66-70    332
Turkey 78-82    175
Turkey 82-86     15
Turkey nan       12
Name: dynamic_named_entity, dtype: int64

##### Next steps:
##### 1. Execute `python3 ne_conversion.py`.
##### 2. Run the Cypher commands in "ne2vec/cypher_commands.txt" on the database.

In [5]:
# Connection settings are read from the environment so that no credentials live in the notebook.
# NEO4J_URI, NEO4J_USER and NEO4J_DATABASE fall back to the values this notebook used originally;
# NEO4J_PASSWORD deliberately has no fallback.
neo4j_password = os.environ.get('NEO4J_PASSWORD')
if neo4j_password is None:
    raise RuntimeError("Set the NEO4J_PASSWORD environment variable before running this cell.")

gds = GraphDataScience(
    os.environ.get('NEO4J_URI', "bolt://localhost:7687"),
    auth=(os.environ.get('NEO4J_USER', 'neo4j'), neo4j_password),
    database=os.environ.get('NEO4J_DATABASE', 'frus5288'),
)

# One row per DynamicEntity4YearBinned node: its name and its stored 'fastrp-embedding' property.
embedding_df = gds.run_cypher(
    """
        match (e:DynamicEntity4YearBinned)
        return e.name as entity, e['fastrp-embedding'] as fastrp_embedding
    """
)

In [None]:
# t-SNE is stochastic: without a fixed seed every run yields a different 2-D layout, so the
# saved figure could not be reproduced. Pin the seed explicitly.
tsne_random_state = 42
reduced_emb_mat = TSNE(
    n_components=2,
    perplexity=50,
    random_state=tsne_random_state,
).fit_transform(np.stack(embedding_df['fastrp_embedding']))

x, y = reduced_emb_mat[:, 0], reduced_emb_mat[:, 1]

# One text-labelled point per entity, saved as a standalone interactive HTML file.
# NOTE(review): the file name mentions "69_76" and "mincnt20", while the config cell uses
# 'tables/tables_52_88/' and min_ne_count = 50 - confirm the name is still accurate.
fig = px.scatter(x=x, y=y, text=embedding_df['entity'].values, width=900, height=900)
fig.write_html("ne2vec/69_76_dynamic_mincnt20_fastrp128.html")

In [6]:
# Pairwise cosine similarities between all entity embeddings, stacked in embedding_df row order.
cossim_mat = cosine_similarity(
    np.stack(embedding_df['fastrp_embedding'])
)

def most_similar(word, top_n):
    """Return the entities most cosine-similar to ``word``.

    Reads the module-level ``embedding_df`` (column ``'entity'``) and ``cossim_mat``.
    Row ``i`` of ``cossim_mat`` is expected to describe the ``i``-th row of
    ``embedding_df`` (positional order, independent of the DataFrame index labels).

    Parameters
    ----------
    word : str
        Entity name to look up in ``embedding_df['entity']``. If the name occurs
        more than once, the first occurrence is used.
    top_n : int
        Maximum number of neighbours to return; values below 1 give an empty result.

    Returns
    -------
    numpy.ndarray
        Object array of shape ``(k, 2)`` with ``k <= top_n``; column 0 holds entity
        names and column 1 their cosine similarity to ``word``, most similar first.

    Raises
    ------
    IndexError
        If ``word`` is not present in ``embedding_df['entity']`` (same exception
        type as the original positional lookup, but with an explicit message).
    """
    entity_names = embedding_df['entity'].to_numpy(dtype=object)

    # Positional lookup, so the result does not depend on embedding_df having a default index.
    matches = np.flatnonzero(entity_names == word)
    if matches.size == 0:
        raise IndexError(f"entity {word!r} not found in embedding_df['entity']")
    word_idx = matches[0]

    ranked_idx = np.argsort(cossim_mat[word_idx])[::-1]

    # Drop the query entity by index rather than assuming it is ranked first: a duplicate
    # embedding, a rounding tie or a zero vector can put another entity ahead of it.
    similar_entity_idx = ranked_idx[ranked_idx != word_idx][:max(top_n, 0)]

    similar_entity_names = entity_names[similar_entity_idx]
    similar_entity_sims = cossim_mat[word_idx][similar_entity_idx]

    # dtype=object keeps names as strings and similarities as floats in the same array.
    return np.array([similar_entity_names, similar_entity_sims], dtype=object).T

In [92]:
# The 15 nearest neighbours of the 78-82 variant of 'Gibraltar'.
most_similar('Gibraltar 78-82', top_n=15)

array([['the Argentine Government 78-82', 0.9991129461274546],
       ['Islands 78-82', 0.9991089446600828],
       ['the Falkland Islands 78-82', 0.9989217215538105],
       ['Galtieri 78-82', 0.998819607077223],
       ['Buenos Aires 78-82', 0.9988029566236453],
       ['North Atlantic 78-82', 0.9986363347037847],
       ['Henderson 78-82', 0.9985909515754992],
       ['Argentines 78-82', 0.9980166802772713],
       ['RAF 78-82', 0.9979366901069627],
       ['the South Atlantic 78-82', 0.9978699649030066],
       ['Falklands 78-82', 0.9978487026803599],
       ['Islanders 78-82', 0.9978153407369099],
       ['Costa Mendez 78-82', 0.9975161984650041],
       ['Argentine 78-82', 0.9972919972464009],
       ['SAM 78-82', 0.9970316558844142]], dtype=object)

In [153]:
# Cosine similarity between the embeddings of two specific dynamic entities.
entity1, entity2 = 'Angola 82-86', 'Portugal 82-86'

# Each embedding is reshaped to a single-row 2-D array before being passed to cosine_similarity.
embedding1, embedding2 = (
    np.array(
        embedding_df.loc[embedding_df['entity'] == name, 'fastrp_embedding'].values[0]
    ).reshape(1, -1)
    for name in (entity1, entity2)
)

cosine_similarity(embedding1, embedding2)

array([[0.98785991]])