In [1]:
import math
import random

import numpy as np
import pandas as pd
from sklearn.cluster import KMeans
from sklearn_extra.cluster import KMedoids

import word_dist_script as ws

In [2]:
# Keep the result of the project helper's Twitter-embedding loader.
# NOTE(review): `twitEmbs` is not referenced by any later cell visible here — confirm it is still needed.
twitEmbs = ws.load_twitter_embs()

loading finished


In [3]:
# Toggle for the word2vec load (the loader's own log message says it can take several minutes).
load_word2vec = True

# NOTE(review): when the flag is False, `word2vec` is never bound here, yet the following
# cells use it unconditionally.
if load_word2vec:
    word2vec = ws.load_word2vec_embs()


Loading word2vec model...
This can take several minutes...


In [4]:
# Feature matrix for the first clustering, built by the ws helper from the word2vec model.
every_emb = ws.get_every_embeddings(word2vec)

# Five clusters with a fixed seed. fit() returns the estimator itself, so the
# prediction on the training data can be chained directly onto it.
clf = KMeans(n_clusters=5, random_state=0)
pred_emb = clf.fit(every_emb).predict(every_emb)



In [5]:
# Feature matrix for the second clustering, built by the ws helper from the word2vec model.
every_dist = ws.get_every_distance(word2vec)

# Seed the stdlib RNG (`random` is imported in the notebook's import cell).
# KMeans itself is seeded through `random_state`; the stdlib seed is kept in case
# helper code draws from `random` — NOTE(review): confirm it is still required.
random.seed(0)

# Same cluster count and seed as the embedding run, but a single initialisation (n_init=1).
clf = KMeans(n_clusters=5, random_state=0, n_init=1)
clf.fit(every_dist)
pred_dist = clf.predict(every_dist)



In [6]:
# Turn each array of predicted cluster labels into per-cluster groups via the ws helpers
# (the groups printed by the following cells are lists of entity names, one list per cluster).

# Clustering fitted on `every_dist`:
sorted_dist = ws.sort_zip_labels(pred_dist)
groups_dist = ws.group_by_cluster(sorted_dist)

# Clustering fitted on `every_emb`:
sorted_emb = ws.sort_zip_labels(pred_emb)
groups_emb = ws.group_by_cluster(sorted_emb)

In [7]:
# One line per cluster of the distance-based run: position in the list, then its members.
for cluster_idx, members in enumerate(groups_dist):
    print(cluster_idx, members)

0 ['journal', 'astronomer', 'book', 'magazine', 'researcher', 'scientist', 'theory', 'university', 'writer']
1 ['album', 'band', 'genre', 'artist', 'instrument', 'poem', 'politics', 'politician', 'song']
2 ['country', 'event', 'field', 'location', 'metrics', 'person', 'product', 'java', 'task']
3 ['algorithm', 'chemical', 'enzyme', 'protein']
4 ['award', 'conference', 'discipline', 'election', 'misc', 'Organisation']


In [8]:
# One line per cluster of the embedding-based run: position in the list, then its members.
for cluster_idx, members in enumerate(groups_emb):
    print(cluster_idx, members)

0 ['astronomer', 'researcher', 'scientist']
1 ['algorithm', 'chemical', 'conference', 'country', 'discipline', 'election', 'event', 'field', 'location', 'metrics', 'instrument', 'Organisation', 'person', 'politics', 'politician', 'product', 'task', 'university']
2 ['journal', 'album', 'award', 'band', 'book', 'genre', 'magazine', 'artist', 'poem', 'java', 'song', 'theory', 'writer']
3 ['enzyme', 'protein']
4 ['misc']


In [9]:
# Print the label word the ws helper chooses for each distance-based cluster next to
# the cluster's members (original note: the choice is based on distance to the centroid).
for members in groups_dist:
    print(ws.find_label_for_cluster(word2vec, members), members)

journal ['journal', 'astronomer', 'book', 'magazine', 'researcher', 'scientist', 'theory', 'university', 'writer']
artist ['album', 'band', 'genre', 'artist', 'instrument', 'poem', 'politics', 'politician', 'song']
location ['country', 'event', 'field', 'location', 'metrics', 'person', 'product', 'java', 'task']
enzyme ['algorithm', 'chemical', 'enzyme', 'protein']
conference ['award', 'conference', 'discipline', 'election', 'misc', 'Organisation']


In [10]:
# Print the label word the ws helper chooses for each embedding-based cluster next to
# the cluster's members (original note: the choice is based on distance to the centroid).
for members in groups_emb:
    print(ws.find_label_for_cluster_emb(word2vec, members), members)

scientist ['astronomer', 'researcher', 'scientist']
country ['algorithm', 'chemical', 'conference', 'country', 'discipline', 'election', 'event', 'field', 'location', 'metrics', 'instrument', 'Organisation', 'person', 'politics', 'politician', 'product', 'task', 'university']
book ['journal', 'album', 'award', 'band', 'book', 'genre', 'magazine', 'artist', 'poem', 'java', 'song', 'theory', 'writer']
enzyme ['enzyme', 'protein']
misc ['misc']


In [11]:
# For every embedding-based cluster, print the (word, similarity) pair ranked first by
# the model for the cluster vector. The search is not restricted to cluster members,
# so the winner can be an unrelated vocabulary token.
for members in groups_emb:
    cluster_vec = ws.find_vector_for_cluster(word2vec, members)
    nearest = word2vec.most_similar(cluster_vec)
    print(nearest[0])

('scientist', 0.9096941351890564)
('By_Jonas_Elmerraji', 0.5770610570907593)
('album', 0.7002332210540771)
('enzyme', 0.9519215226173401)
('misc', 1.0)


In [12]:
# entity -> cluster key, one dict per clustering (distance run first, then embedding run)
entity2cluster_dist, entity2cluster_emb = (
    ws.create_entity2cluster_dict(labels) for labels in (pred_dist, pred_emb)
)

# cluster key -> label word, in the same order
cluster2label_dist, cluster2label_emb = (
    ws.create_cluster2label_dict(word2vec, labels) for labels in (pred_dist, pred_emb)
)


## Mapping entities

In [13]:
## load the csv of manually grouped entities
df = pd.read_csv('../data/manual_groups.csv')
## get entity names (the "entity_name" column, kept as a Series for the mapping steps)
entities = df["entity_name"]

In [14]:
def find_problematic_words(word2vec_model, entities):
    """Return the entities that cannot be looked up in ``word2vec_model``.

    Parameters
    ----------
    word2vec_model : mapping-like
        Any object supporting ``word2vec_model[key]`` that raises ``KeyError``
        for keys it does not know.
    entities : iterable
        Candidate words; they are probed one by one, in order.

    Returns
    -------
    list
        The entities whose lookup failed, in their original order
        (empty when every entity is known or ``entities`` is empty).
    """
    problematic_words = []
    for entity in entities:
        try:
            word2vec_model[entity]
        except (KeyError, TypeError):
            # KeyError: the word is not in the vocabulary.
            # TypeError: the key is not a usable word at all (e.g. NaN from an empty csv cell).
            # Anything else (KeyboardInterrupt, SystemExit, real bugs) must propagate.
            problematic_words.append(entity)
    return problematic_words

In [15]:
## find the entities whose lookup in the word2vec model fails
problem_words = find_problematic_words(word2vec_model=word2vec, entities=entities)

## hand-written replacement word for each such entity
problem2work = {
    "musicalartist": "artist",
    "organisation": "Organisation",
    "politicalparty": "politics",
    "academicjournal": "journal",
    "chemicalcompound": "chemical",
    "chemicalelement": "chemical",
    "astronomicalobject": "astronomer",
    "musicgenre": "genre",
    "literarygenre": "genre",
    "programlang": "java",
    "musicalinstrument": "instrument",
}

In [16]:
## swap each problematic entity for its replacement; names without one pass through unchanged
working = entities.map(lambda name: problem2work.get(name, name))

In [20]:
## two-step lookup per word: entity -> cluster key -> label word of that cluster
## (a word missing from either dict raises KeyError)
mapped_words_dist = working.map(lambda name: cluster2label_dist[entity2cluster_dist[name]])
mapped_words_emb = working.map(lambda name: cluster2label_emb[entity2cluster_emb[name]])

In [21]:
## add the new columns to the dataframe (this mutates `df` in place):
## the cluster label word per entity from the distance-based and the embedding-based run
df["label_word_dist"] = mapped_words_dist
df["label_word_emb"] = mapped_words_emb

In [19]:
## write to csv
## NOTE(review): this overwrites the very file `df` was read from, so the input is replaced by
## the augmented frame. The stored execution count of this cell (19) is also lower than those
## of the cells that add the label columns (20, 21) — re-run top to bottom before trusting
## the file on disk.
df.to_csv('../data/manual_groups.csv', index=False)

In [32]:
# NOTE(review): scratch list that no other visible cell uses; "jounal" looks like a deliberately
# misspelled probe next to the correct "journal" — confirm, or delete the cell.
test_list = ["jounal", "journal"]

In [41]:
# Re-read the csv, this time with its first column as the row index.
# NOTE(review): same path as `df`; the index listed further down holds entity names, so the
# first column is presumably `entity_name` — confirm.
file = pd.read_csv("../data/manual_groups.csv", index_col=0)

In [47]:
# Spot-check one entry of the `label_ours` column with a single label-based
# (row, column) lookup instead of chained indexing.
file.loc["politicalparty", "label_ours"]

'organisation'

In [55]:
# Row labels of `file` as a plain Python list, in file order;
# the bare name on the last line displays the list as the cell output.
list_to_map = file.index.tolist()
list_to_map

['politician',
 'person',
 'writer',
 'researcher',
 'scientist',
 'musicalartist',
 'organisation',
 'politicalparty',
 'university',
 'band',
 'country',
 'location',
 'event',
 'election',
 'award',
 'conference',
 'album',
 'song',
 'academicjournal',
 'poem',
 'magazine',
 'book',
 'metrics',
 'enzyme',
 'protein',
 'chemicalcompound',
 'chemicalelement',
 'astronomicalobject',
 'theory',
 'musicgenre',
 'field',
 'discipline',
 'algorithm',
 'literarygenre',
 'product',
 'programlang',
 'misc',
 'musicalinstrument',
 'task']

In [51]:
# The `label_ours` column as a Series, keyed by the row index of `file`.
# NOTE(review): despite the name this is a Series, not a DataFrame.
df_to_use = file["label_ours"]

In [54]:
# `label_ours` value for every entity name, in list order.
# NOTE(review): the stored output of this cell is a bare map object, which this code cannot
# produce — the cell needs a re-run.
[df_to_use[entity] for entity in list_to_map]

<map at 0x178b7248a60>