In [181]:
import numpy as np
from sklearn.cluster import KMeans
from sklearn_extra.cluster import KMedoids

In [135]:
import gensim.models
from gensim.models import Word2Vec
# Prerequisite: download the twitter.bin file from the Google Drive folder
# below and place it in the same folder (word_dist):
# https://drive.google.com/drive/folders/12vBvFPjpHx3gwkaCeuBf5MjBHdAnyk4N
#
# The file is read from the relative path 'twitter.bin' in binary
# word2vec format and kept in the notebook-level name `twitEmbs`.
# NOTE(review): `twitEmbs` is not referenced by any other cell visible here
# (the helpers below read `Word2Vec_google_news`) -- confirm it is still needed.
twitEmbs = gensim.models.KeyedVectors.load_word2vec_format(
                                'twitter.bin', binary=True)
print('loading finished')

loading finished


In [136]:
from  gensim import downloader
## Load the 'word2vec-google-news-300' vectors through gensim's downloader.
## This will take a very long time to download.
## The result is bound to the notebook-level name `Word2Vec_google_news`,
## which distance_to_every_word and get_every_embeddings read directly.
Word2Vec_google_news = downloader.load('word2vec-google-news-300')


In [139]:
import nltk
# Request the 'wordnet' package through nltk's downloader, then import the
# corpus object.
# NOTE(review): `wordnet` is not referenced in the cells visible here --
# confirm it is used elsewhere in the notebook.
nltk.download('wordnet')
from nltk.corpus import wordnet

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Nicklas\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [140]:
# Entity label vocabulary written as one whitespace-separated string
# (37 words; note that 'Organisation' is the only capitalized entry).
ENTITY_LABELS = "journal album algorithm astronomer award band book chemical conference country discipline election enzyme event field genre location magazine metrics misc artist instrument Organisation person poem politics politician product java protein researcher scientist song task theory university writer"
# The same labels as a list of individual words, in the order written above.
ENTITY_LABELS_SPLIT = ENTITY_LABELS.split()

In [141]:
def distance_to_every_word(word, ENTITY_LABELS_SPLIT):
    """Measure how far `word` is from each entity label.

    Args:
        word: Word to compare; passed as the first argument to
            `Word2Vec_google_news.distance`.
        ENTITY_LABELS_SPLIT: Sequence of label words to compare against.

    Returns:
        list: One distance per label, in the same order as
        `ENTITY_LABELS_SPLIT`.
    """
    # Relies on the notebook-level `Word2Vec_google_news` model.
    return [
        Word2Vec_google_news.distance(word, label)
        for label in ENTITY_LABELS_SPLIT
    ]

In [142]:
def get_every_distance(ENTITY_LABELS_SPLIT):
    """Build the label-by-label distance table.

    Args:
        ENTITY_LABELS_SPLIT: Sequence of label words.

    Returns:
        list: One inner list per label (in input order), each produced by
        `distance_to_every_word(label, ENTITY_LABELS_SPLIT)`.
    """
    return [
        distance_to_every_word(label, ENTITY_LABELS_SPLIT)
        for label in ENTITY_LABELS_SPLIT
    ]

def get_every_embeddings(ENTITY_LABELS_SPLIT):
    """Look up the stored vector for each entity label.

    Args:
        ENTITY_LABELS_SPLIT: Sequence of label words.

    Returns:
        list: `Word2Vec_google_news[label]` for every label, in input order.
    """
    # Relies on the notebook-level `Word2Vec_google_news` model.
    return [Word2Vec_google_news[label] for label in ENTITY_LABELS_SPLIT]

In [185]:
# Per-label distance rows and per-label embedding vectors.
# NOTE(review): every_emb is computed here but not used in this cell.
every_dist = get_every_distance(ENTITY_LABELS_SPLIT)
every_emb = get_every_embeddings(ENTITY_LABELS_SPLIT)

# K-medoids with 5 clusters; every other option is left at the library default.
clf_kmedoids = KMedoids(n_clusters=5)

# The distance rows are passed as the samples to fit on.
# NOTE(review): no metric= argument is given, so presumably the default metric
# is applied to these rows instead of treating every_dist as a precomputed
# distance matrix -- confirm this is intended.
clf_kmedoids.fit(every_dist)

# Cluster assignment for the same rows the model was fitted on.
# NOTE(review): the name is spelled "kmediods" while the classifier is "kmedoids".
pred_kmediods = clf_kmedoids.predict(every_dist)





array([3, 4, 1, 3, 1, 4, 4, 0, 3, 3, 1, 3, 0, 2, 3, 4, 2, 3, 1, 1, 4, 2,
       3, 2, 4, 3, 2, 1, 1, 0, 3, 3, 4, 2, 3, 3, 2], dtype=int64)

In [183]:
def predict_using_distance(ENTITY_LABELS_SPLIT, n_clusters=5):
    """Cluster the labels with K-means on their distance rows.

    Every label is represented by its row from `get_every_distance`; K-means
    is fitted on those rows and then asked to assign each row to a cluster.

    Args:
        ENTITY_LABELS_SPLIT: Sequence of label words to cluster.
        n_clusters: Number of clusters passed to `KMeans`.

    Returns:
        list: One length-1 array per label holding its cluster index, in the
        same order as `ENTITY_LABELS_SPLIT`.
    """
    ## make classifier (random_state is pinned to 0)
    clf = KMeans(n_clusters=n_clusters, random_state=0)
    ## get distances
    every_distance = get_every_distance(ENTITY_LABELS_SPLIT)
    ## fit classifier
    clf.fit(every_distance)
    ## predict labels for all rows in a single call instead of one call per row
    cluster_ids = clf.predict(every_distance)
    ## reshape(-1, 1) + list() yields one length-1 array per label, which is
    ## the shape callers received from the former per-row predict calls
    return list(cluster_ids.reshape(-1, 1))

def predict_using_embeddings(ENTITY_LABELS_SPLIT, n_clusters=5):
    """Cluster the labels with K-means on their embedding vectors.

    Every label is represented by its vector from `get_every_embeddings`;
    K-means is fitted on those vectors and then asked to assign each one to
    a cluster.

    Args:
        ENTITY_LABELS_SPLIT: Sequence of label words to cluster.
        n_clusters: Number of clusters passed to `KMeans`.

    Returns:
        list: One length-1 array per label holding its cluster index, in the
        same order as `ENTITY_LABELS_SPLIT`.
    """
    ## make classifier (random_state is pinned to 0)
    clf = KMeans(n_clusters=n_clusters, random_state=0)
    ## get embeddings
    every_embeddings = get_every_embeddings(ENTITY_LABELS_SPLIT)
    ## fit classifier
    clf.fit(every_embeddings)
    ## predict labels for all vectors in a single call instead of one call per vector
    cluster_ids = clf.predict(every_embeddings)
    ## reshape(-1, 1) + list() yields one length-1 array per label, which is
    ## the shape callers received from the former per-vector predict calls
    return list(cluster_ids.reshape(-1, 1))

In [178]:
def sort_zip_labels(ENTITY_LABELS_SPLIT, predictions):
    """Pair each label with its prediction and order the pairs by prediction.

    Args:
        ENTITY_LABELS_SPLIT: Sequence of label words.
        predictions: Sequence of predicted values, aligned with the labels.

    Returns:
        list: `(label, prediction)` tuples sorted by the prediction element;
        pairs with equal predictions keep their input order.
    """
    pairs = list(zip(ENTITY_LABELS_SPLIT, predictions))
    # list.sort is stable, so ties stay in their original relative order.
    pairs.sort(key=lambda pair: pair[1])
    return pairs

In [180]:
# Cluster the labels into 6 groups twice: once from their distance rows and
# once from their embedding vectors.
predicted_labels_dist = predict_using_distance(ENTITY_LABELS_SPLIT, n_clusters=6)
predicted_labels_emb = predict_using_embeddings(ENTITY_LABELS_SPLIT, n_clusters=6)

# Pair every label with its predicted cluster and order the pairs by cluster.
sorted_zipped_dist = sort_zip_labels(ENTITY_LABELS_SPLIT, predicted_labels_dist)
sorted_zipped_emb = sort_zip_labels(ENTITY_LABELS_SPLIT, predicted_labels_emb)

# Display the embedding-based grouping; sorted_zipped_dist is computed but not shown.
sorted_zipped_emb



[('algorithm', array([0])),
 ('award', array([0])),
 ('book', array([0])),
 ('conference', array([0])),
 ('country', array([0])),
 ('event', array([0])),
 ('field', array([0])),
 ('location', array([0])),
 ('magazine', array([0])),
 ('metrics', array([0])),
 ('misc', array([0])),
 ('instrument', array([0])),
 ('Organisation', array([0])),
 ('person', array([0])),
 ('product', array([0])),
 ('java', array([0])),
 ('task', array([0])),
 ('election', array([1])),
 ('politics', array([1])),
 ('politician', array([1])),
 ('journal', array([2])),
 ('discipline', array([2])),
 ('theory', array([2])),
 ('university', array([2])),
 ('album', array([3])),
 ('band', array([3])),
 ('genre', array([3])),
 ('artist', array([3])),
 ('poem', array([3])),
 ('song', array([3])),
 ('writer', array([3])),
 ('chemical', array([4])),
 ('enzyme', array([4])),
 ('protein', array([4])),
 ('astronomer', array([5])),
 ('researcher', array([5])),
 ('scientist', array([5]))]