In [1]:
import os
import copy
import math
import pickle
import spacy
import numpy as np
import pandas as pd
import plotly.express as px
from graphdatascience import GraphDataScience
from sklearn.manifold import TSNE
from sklearn.metrics.pairwise import cosine_similarity

# spaCy pipeline shared by the whole notebook; apply_ner() runs every text through it.
nlp = spacy.load(name='en_core_web_sm')

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
def apply_ner(sentence, nlp_model=None):
    """Return the distinct informative named entities found in ``sentence``.

    Consecutive tokens tagged B/I are merged into one entity whose text is the
    token texts joined by single spaces (so the result can differ from the raw
    span text, e.g. around hyphens).

    Parameters
    ----------
    sentence : str
        Text to analyse.
    nlp_model : callable, optional
        Callable that maps a text to an iterable of tokens exposing ``text``,
        ``ent_iob_`` and ``ent_type_``. Defaults to the notebook-level ``nlp``
        pipeline.

    Returns
    -------
    numpy.ndarray
        Array of unique ``[entity_text, entity_type]`` rows (sorted by
        ``np.unique``). An array with zero rows is returned when nothing
        informative is found.
    """
    model = nlp if nlp_model is None else nlp_model
    doc = model(sentence)

    named_entities = []
    entity_chunk = None  # (entity text, entity type) currently being assembled

    for token in doc:
        iob = token.ent_iob_

        if iob == 'I' and entity_chunk is not None:
            # Continuation of the entity in progress.
            entity_chunk = (entity_chunk[0] + ' ' + token.text, entity_chunk[1])
            continue

        # Every other tag ('B', 'O', or '' = no tag set) closes the open entity.
        if entity_chunk is not None:
            named_entities.append(entity_chunk)
            entity_chunk = None

        if iob in ('B', 'I'):
            # 'I' can only reach this point without an open chunk; treat it as
            # the start of an entity instead of crashing on a missing 'B'.
            entity_chunk = (token.text, token.ent_type_)

    # Flush an entity that runs up to the very last token of the text.
    if entity_chunk is not None:
        named_entities.append(entity_chunk)

    uninformative_entities = frozenset(
        ['DATE', 'TIME', 'QUANTITY', 'ORDINAL', 'CARDINAL', 'MONEY', 'PERCENT', 'PERSON']
    )
    informative = [chunk for chunk in named_entities if chunk[1] not in uninformative_entities]

    if not informative:
        # Keep the two-column layout even when there is nothing to report.
        return np.empty((0, 2), dtype=str)

    return np.unique(informative, axis=0)

In [5]:
# Read the document table and keep everything except editorial notes.
doc_df = pd.read_csv('tables/tables_69_76/doc_69_76.csv')
doc_df = doc_df.loc[doc_df['subtype'].ne('editorial-note')]

# Parallel per-document sequences; position i in each refers to the same row of doc_df.
id_to_text_list = doc_df['id_to_text'].values
free_text_list = doc_df['text'].values
year_list = [str(int(year_value)) for year_value in doc_df['year'].values]
era_list = doc_df['era'].values

In [6]:
name_extension = '_69_76_2yearbinned'
ner_dict_path = 'ne2vec/ner_dict' + name_extension

# ner_dict maps an entity text to the list of (id_to_text, year, era) tuples of
# the documents it was found in. The NER pass is expensive, so the result is
# cached on disk and reused when the cache file exists.
if os.path.isfile(ner_dict_path):
    # NOTE(review): pickle.load can execute arbitrary code; only load cache
    # files that were written by this notebook.
    with open(ner_dict_path, "rb") as fp:
        ner_dict = pickle.load(fp)
    print('ner dict loaded.')

else:
    ner_dict = {}

    for idx, text in enumerate(free_text_list):

        if isinstance(text, float) and math.isnan(text):  # missing text is read as NaN
            continue

        occurrence = (id_to_text_list[idx], year_list[idx], era_list[idx])

        for ne_tuple in apply_ner(text):
            ner_dict.setdefault(ne_tuple[0], []).append(occurrence)

    # Create the output directory first so the expensive pass above is not
    # lost to a FileNotFoundError when saving.
    os.makedirs(os.path.dirname(ner_dict_path), exist_ok=True)
    with open(ner_dict_path, 'wb') as f:
        pickle.dump(ner_dict, f)
    print('ner dict computed and saved.')


min_ne_count = 20 # 50 in original

# Keep the unfiltered mapping reachable under its previous name, and build the
# filtered one without deep-copying every occurrence list.
copy_ner_dict = ner_dict
ner_dict = {
    named_entity: occurrences
    for named_entity, occurrences in copy_ner_dict.items()
    if len(occurrences) >= min_ne_count
}

ner dict computed and saved.


In [7]:
# One row per (named entity, document) occurrence recorded in ner_dict.
# The rows are collected first and the frame is built once: concatenating a
# one-row frame per occurrence re-copies the whole frame every time, and the
# former loop variables `tuple` and `id` shadowed the builtins for the rest of
# the session.
ne2doc_df = pd.DataFrame(
    [
        {'id_to_text': doc_id, 'named_entity': named_entity, 'year': int(year), 'era': era}
        for named_entity, occurrences in ner_dict.items()
        for doc_id, year, era in occurrences
    ],
    columns=['id_to_text', 'named_entity', 'year', 'era'],
)

In [8]:
# Two-year bin edges: 1960, 1962, ..., 1980.
bins = list(range(1960, 1981, 2))

# One label per pair of adjacent edges, built from their last two digits (e.g. '68-70').
labels = [f"{str(lower)[-2:]}-{str(upper)[-2:]}" for lower, upper in zip(bins[:-1], bins[1:])]

# right=True makes each bin closed on its upper edge: (lower, upper].
ne2doc_df['bin'] = pd.cut(ne2doc_df['year'], bins=bins, labels=labels, right=True)

# "Dynamic" entity name = entity text followed by the label of its year bin.
ne2doc_df['dynamic_named_entity'] = (
    ne2doc_df['named_entity'].astype(str)
    + ' '
    + ne2doc_df['bin'].astype(str)
)

In [11]:
# Number of rows per dynamic entity label for the entity 'Turkey'.
ne2doc_df.loc[ne2doc_df['named_entity'] == 'Turkey', 'dynamic_named_entity'].value_counts()

Turkey 74-76    196
Turkey 68-70    185
Turkey 72-74    185
Turkey 70-72    175
Turkey 78-80      8
Turkey 76-78      3
Name: dynamic_named_entity, dtype: int64

In [10]:
# Write the entity/document table to disk.
ne2doc_df.to_parquet(f'ne2vec/ne2doc_df{name_extension}.parquet')

# Manual steps to perform before running the cells below:
#   1. execute `python3 ne_conversion.py`
#   2. run the Cypher commands in "ne2vec/cypher_commands.txt" on the database

In [25]:
# Connection settings can be overridden with environment variables; the
# defaults reproduce the values that used to be hard-coded in this cell.
# NOTE(review): a database password is committed in this notebook. Rotate it,
# set NEO4J_PASSWORD in the environment and remove the fallback below.
gds = GraphDataScience(
    os.environ.get('NEO4J_URI', 'bolt://localhost:7687'),
    auth=(
        os.environ.get('NEO4J_USER', 'neo4j'),
        os.environ.get('NEO4J_PASSWORD', 'bos'),
    ),
    database=os.environ.get('NEO4J_DATABASE', 'entity2vec18mar'),
)

# One row per Entity node: its name and the value of its 'fastrp-embedding' property.
embedding_df = gds.run_cypher(
    """
        match (e:Entity)
        return e.name as entity, e['fastrp-embedding'] as fastrp_embedding
    """
)

In [28]:
# Project the entity embeddings to 2-D for plotting. t-SNE is stochastic, so
# the seed is fixed to get the same layout on every run.
reduced_emb_mat = TSNE(n_components=2, perplexity=50, random_state=42).fit_transform(
    np.stack(embedding_df['fastrp_embedding'])
)

x, y = reduced_emb_mat[:, 0], reduced_emb_mat[:, 1]

# Labelled scatter plot of all entities, written as a standalone HTML file.
fig = px.scatter(x=x, y=y, text=embedding_df['entity'].values, width=900, height=900)
fig.write_html("ne2vec/69_76_dynamic_mincnt20_fastrp128.html")



In [29]:
# Pairwise cosine similarity of all entity embeddings; row/column i corresponds to row i of embedding_df.
cossim_mat = cosine_similarity(np.stack(embedding_df['fastrp_embedding'].to_numpy()))

def most_similar(word, top_n, embeddings=None, similarities=None):
    """Return the ``top_n`` entities most similar to ``word``.

    Parameters
    ----------
    word : str
        Entity name to look up in the ``'entity'`` column.
    top_n : int
        Maximum number of neighbours to return.
    embeddings : pandas.DataFrame, optional
        Table with an ``'entity'`` column. Defaults to the notebook-level
        ``embedding_df``.
    similarities : array-like, optional
        Square similarity matrix whose row/column order matches the row order
        of ``embeddings``. Defaults to the notebook-level ``cossim_mat``.

    Returns
    -------
    numpy.ndarray
        Object array of shape ``(k, 2)`` with ``[entity_name, similarity]``
        rows in descending similarity, ``k <= top_n``. ``word`` itself is
        never included.

    Raises
    ------
    IndexError
        If ``word`` is not present in the ``'entity'`` column.
    """
    if embeddings is None:
        embeddings = embedding_df
    if similarities is None:
        similarities = cossim_mat

    entity_names = embeddings['entity'].to_numpy()

    # Positional lookup: the similarity matrix is indexed by row position,
    # not by the DataFrame's index labels.
    matches = np.flatnonzero(entity_names == word)
    if matches.size == 0:
        raise IndexError(f"entity {word!r} not found in the embedding table")
    word_idx = matches[0]

    word_sims = np.asarray(similarities[word_idx])

    # Drop the query entity explicitly instead of assuming it ranks first:
    # another entity with the same similarity could otherwise take its place
    # and the query itself would be reported as a neighbour.
    ranked = np.argsort(-word_sims, kind='stable')
    ranked = ranked[ranked != word_idx][:max(top_n, 0)]

    result = np.empty((len(ranked), 2), dtype=object)
    result[:, 0] = entity_names[ranked]
    result[:, 1] = word_sims[ranked]
    return result

In [69]:
# Ten nearest neighbours of the entity 'DEMIREL 74-76'.
most_similar('DEMIREL 74-76', top_n=10)

array([['Ecevit 74-76', 0.9993930953681697],
       ['Caramanlis 74-76', 0.999386941191969],
       ['Bitsios 74-76', 0.999161144287318],
       ['Aegean 74-76', 0.9985918268278424],
       ['the Greek Government 74-76', 0.9984054825012745],
       ['Nicosia 74-76', 0.9982051251928411],
       ['Famagusta 74-76', 0.9980222253797764],
       ['Esenbel 74-76', 0.9978805230426075],
       ['Greek - Turkish 74-76', 0.9978783952754865],
       ['Clerides 74-76', 0.9976393110088014]], dtype=object)