# Entity type clustering

In [1]:
import numpy as np
import pandas as pd
from sklearn.cluster import KMeans, DBSCAN, AffinityPropagation, SpectralClustering, AgglomerativeClustering
from sklearn_extra.cluster import KMedoids
from sklearn.decomposition import PCA
from sklearn.manifold import MDS, SpectralEmbedding
from gensim.models import Word2Vec, KeyedVectors
from  gensim import downloader
import math
import pickle

from tqdm import tqdm
from collections import Counter

import dist_util as util

[nltk_data] Downloading package wordnet to
[nltk_data]     /home/davidsule/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [2]:
random_state = 4012

# Avoid downloading / loading the whole Word2Vec Google News 300 model
quick = False

if quick:
    entities = ['genre', 'song', 'writer', 'university', 'javascript', 'enzyme', 'award', 'chemical', 'person', 'event', 'conference', 'protein', 'magazine', 'task', 'galaxy', 'journal', 'album', 'researcher', 'discipline', 'band', 'book', 'country', 'election', 'algorithm', 'organization', 'location', 'poem', 'product', 'metrics', 'miscellaneous', 'musician', 'field', 'politician', 'coalition', 'theory', 'violin', 'scientist']
    with open("embs.pkl", "rb") as f:
        embeddings = pickle.load(f)

In [3]:
if quick == False:
    # Labels (Manual copy from .env file)
    entities_orig = "academicjournal album algorithm astronomicalobject award band book chemicalcompound chemicalelement conference country discipline election enzyme event field literarygenre location magazine metrics misc musicalartist musicalinstrument musicgenre organisation person poem politicalparty politician product programlang protein researcher scientist song task theory university writer"
    entities_orig = entities_orig.split()

    # Load Word2Vec embeddings - First time download: ~1.6 GB
    print("Loading pretrained Word2Vec model, this may take a while.")
    w2v = downloader.load("word2vec-google-news-300")

    # Check which words are not in Word2Vec model
    missing = util.find_missing(w2v, entities_orig)
    print(f"These entities are not in the model:\n{missing}")

Loading pretrained Word2Vec model, this may take a while.
These entities are not in the model:
['academicjournal', 'astronomicalobject', 'chemicalcompound', 'chemicalelement', 'literarygenre', 'musicalartist', 'musicalinstrument', 'musicgenre', 'organisation', 'politicalparty', 'programlang']


In [4]:
if quick == False:
    # Manual correction
    substitute = {"musicalartist": "musician", "organisation": "organization", "politicalparty": "coalition", "academicjournal": "journal", "chemicalcompound": "chemical", "chemicalelement": "chemical", "astronomicalobject": "galaxy", "musicgenre": "genre", "literarygenre": "genre", "programlang": "javascript", "musicalinstrument": "violin", "misc": "miscellaneous"}
    entities = set()
    for entity in entities_orig:
        if entity in substitute:
            entities.add(substitute[entity])
        else:
            entities.add(entity)

    entities = list(entities)

    still_missing = util.find_missing(w2v, entities)
    if len(still_missing) > 0:
        print("These entities are not in the model:")
        print(still_missing)
    else:
        print("All entities are in the model. Final list:")
        print(entities)
        print("Loading embeddings for them.")
        embeddings = w2v[entities]

All entities are in the model. Final list:
['journal', 'metrics', 'coalition', 'conference', 'protein', 'theory', 'field', 'event', 'election', 'researcher', 'politician', 'musician', 'enzyme', 'poem', 'band', 'magazine', 'product', 'writer', 'country', 'award', 'scientist', 'location', 'algorithm', 'genre', 'miscellaneous', 'javascript', 'task', 'galaxy', 'university', 'book', 'organization', 'violin', 'song', 'person', 'album', 'chemical', 'discipline']
Loading embeddings for them.


In [5]:
# Dimensionality Reduction
pca = PCA(n_components=6, svd_solver="full") # random state not needed for "full" solver
pca = pca.fit_transform(embeddings)

mds = MDS(n_components=6, random_state=random_state)
mds = mds.fit_transform(embeddings)

se = SpectralEmbedding(n_components=6, random_state=random_state)
se = se.fit_transform(embeddings)



In [6]:
# K-Means example
km = KMeans(n_clusters=7, random_state=random_state, n_init=100)
km = km.fit_predict(se)

categories = util.get_categories(km, entities)
for category, entity in categories.items():
    print(f"{category}:\t{entity}")

5:	['journal', 'poem', 'magazine', 'book']
3:	['metrics', 'field', 'election', 'country', 'miscellaneous', 'galaxy', 'university']
0:	['coalition', 'conference', 'event', 'award', 'location', 'task', 'organization', 'person', 'discipline']
2:	['protein', 'enzyme', 'chemical']
4:	['theory', 'product', 'algorithm', 'javascript']
1:	['researcher', 'politician', 'writer', 'scientist']
6:	['musician', 'band', 'genre', 'violin', 'song', 'album']
