In [1]:
import os
import copy
import math
import pickle
import spacy
from tqdm import tqdm
import numpy as np
import pandas as pd
import plotly.express as px
from graphdatascience import GraphDataScience
from sklearn.manifold import TSNE
from sklearn.metrics.pairwise import cosine_similarity
import ray
ray.init(num_cpus=12)

nlp = spacy.load('en_core_web_sm')

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
def apply_ner(sentence):

    doc = nlp(sentence)

    named_entities = []

    entity_chunk = None #(entity,type)

    for token in doc:
        if token.ent_iob_ == 'O':
            if entity_chunk:
                named_entities.append(entity_chunk)
                entity_chunk = None
        elif token.ent_iob_ == 'B':
            if entity_chunk:
                named_entities.append(entity_chunk)
                entity_chunk = None
            entity_chunk = (token.text,token.ent_type_)
        else:
            entity_chunk_text = entity_chunk[0]
            entity_chunk_type = entity_chunk[1]
            entity_chunk = (entity_chunk_text+' '+token.text,entity_chunk_type)


    uninformative_entities = ['DATE','TIME','QUANTITY','ORDINAL','CARDINAL','MONEY','PERCENT','PERSON']

    named_entities = list(filter(lambda x: True if x[1] not in uninformative_entities else False, named_entities))
    named_entities = np.unique(named_entities,axis=0) 

    return named_entities

In [3]:
tables_path = 'tables/tables_52_88/'

doc_df = pd.read_csv(tables_path+'doc.csv')
doc_df = doc_df[doc_df['subtype']!='editorial-note'] # removing editorial notes

id_to_text_list = doc_df['id_to_text'].values
free_text_list = doc_df['text'].values
year_list = list(map(lambda x: str(int(x)),doc_df['year'].values))
era_list = doc_df['era'].values

In [4]:
if os.path.isfile(tables_path+'ne2doc_original.parquet'):
    ne2doc_df = pd.read_parquet(tables_path+'ne2doc_original.parquet')
    print('ne2doc_df loaded.')

else:
    ner_dict = {'id_to_text':[], 'named_entity':[], 'year':[], 'era':[]}

    for idx,text in enumerate(tqdm(free_text_list)):

        if not(isinstance(text, float) and math.isnan(text)): # check if NaN
            id_to_text = id_to_text_list[idx]
            year = year_list[idx]
            era = era_list[idx]
            ne_list = apply_ner(text)

            for ne_tuple in ne_list:
                ne = ne_tuple[0]

                ner_dict['id_to_text'].append(id_to_text)
                ner_dict['named_entity'].append(ne)
                ner_dict['year'].append(int(year))
                ner_dict['era'].append(era)
    
    ne2doc_df = pd.DataFrame(data=ner_dict)
    ne2doc_df.to_parquet(tables_path+'ne2doc_original.parquet')
    print('ne2doc_df computed and saved.')

100%|██████████| 81930/81930 [2:07:35<00:00, 10.70it/s]  


ne2doc_df computed and saved.


In [33]:
# threshold based on count 
min_ne_count = 50
ne2doc_df = ne2doc_df.groupby('named_entity').filter(lambda x: len(x) >= min_ne_count)

In [35]:
# x year bins
name_extension = '_4yearbinned'
bins = list(range(1950,1990,4))

labels = []
for i in range(1,len(bins)):
    labels.append(str(bins[i-1])[-2:]+'-'+str(bins[i])[-2:])

ne2doc_df['bin'] = pd.cut(ne2doc_df['year'], bins=bins, labels=labels, right=True)

ne2doc_df['dynamic_named_entity'] = ne2doc_df['named_entity'].astype(str) + ' ' + ne2doc_df['bin'].astype(str)

In [49]:
ne2doc_df[ne2doc_df['named_entity']=='Turkey']['dynamic_named_entity'].value_counts()

Turkey 54-58    679
Turkey 50-54    542
Turkey 58-62    448
Turkey 70-74    360
Turkey 74-78    346
Turkey 62-66    341
Turkey 66-70    336
Turkey 78-82    178
Turkey 82-86     15
Turkey nan       12
Name: dynamic_named_entity, dtype: int64

In [None]:
ne2doc_df.to_parquet(tables_path+'ne2doc'+name_extension+'.parquet')

##### now:
##### 1- execute python3 ne_conversion.py
##### 2- run cypher commands in "ne2vec/cypher_commands.txt" on database.

In [None]:
gds = GraphDataScience("bolt://localhost:7687", auth=('neo4j', 'bos'), database='entity2vec18mar')

embedding_df = gds.run_cypher(
    """
        match (e:Entity)
        return e.name as entity, e['fastrp-embedding'] as fastrp_embedding
    """
)

In [None]:
reduced_emb_mat = TSNE(n_components=2, perplexity=50).fit_transform(np.stack(embedding_df['fastrp_embedding']))

x,y = reduced_emb_mat[:,0],reduced_emb_mat[:,1]

fig = px.scatter(x=x, y=y, text=embedding_df['entity'].values, width=900, height=900)
fig.write_html("ne2vec/69_76_dynamic_mincnt20_fastrp128.html")

In [None]:
cossim_mat = cosine_similarity(np.stack(embedding_df['fastrp_embedding']))

def most_similar(word, top_n):

    word_idx = embedding_df[embedding_df['entity']==word].index[0]

    similar_entity_idx = np.argsort(cossim_mat[word_idx])[::-1][1:top_n+1]

    similar_entity_names = embedding_df['entity'].values[similar_entity_idx]
    similar_entity_sims = cossim_mat[word_idx][similar_entity_idx]

    return np.array([similar_entity_names,similar_entity_sims]).T

In [None]:
most_similar('DEMIREL 74-76',10)