# Calculate Usage Types

Now that we have collected usages of each word from both of our data sources, we want to cluster the usages from each data source into senses (usage types).
For this, we follow the method of Giulianelli et al. (2020).

In [11]:
import numpy as np
import pandas as pd
# import networkx as nx
# import plotly.graph_objs as go
# import plotly.io as pio

from collections import defaultdict
#from deprecated import deprecated
from scipy.spatial.distance import cdist
from tqdm import tqdm
from string import ascii_uppercase
from sklearn import preprocessing
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score, calinski_harabasz_score
from sklearn.mixture import GaussianMixture
from transformers import BertTokenizer

In [12]:
# Adapted from Giulianelli et al. (2020).

def best_kmeans(X, max_range=np.arange(2, 11), criterion='silhouette'):
    """
    Return the best K-Means clustering given the data, a range of K values, and a K-selection criterion.

    Every candidate K in `max_range` that is strictly smaller than the number of rows of X is
    fitted with `KMeans(n_clusters=K, random_state=SEED)` and scored; the fit with the highest
    score is returned. `SEED` is read from the enclosing (notebook/module) namespace and must be
    defined before a clustering is actually fitted.

    :param X: usage matrix (made of usage vectors, one per row)
    :param max_range: iterable of candidate numbers of clusters; values >= X.shape[0] are skipped.
                      Ties are resolved in favour of the K that comes first in this iterable,
                      so pass it in ascending order to prefer the smaller K.
    :param criterion: K-selection criterion: 'silhouette' for the silhouette score; 'calinski',
                      'harabasz' or 'calinski-harabasz' for the Calinski-Harabasz score
    :return: best_model: fitted KMeans model (sklearn.cluster.KMeans) with the best clustering
                         according to the criterion, or None if no candidate K could be scored
             scores: list of tuples (k, s) indicating the clustering score s obtained using k clusters
    :raises AssertionError: if `criterion` is not one of the accepted names
    """
    assert criterion in ['silhouette', 'calinski', 'harabasz', 'calinski-harabasz']

    # Start from -inf (not -1) so that any real score, whatever its range, can become the best.
    best_model, best_score = None, -np.inf
    scores = []

    for k in max_range:
        # A clustering with as many clusters as samples cannot be scored.
        if k >= X.shape[0]:
            continue

        kmeans = KMeans(n_clusters=k, random_state=SEED)
        cluster_labels = kmeans.fit_predict(X)

        # With duplicated usage vectors KMeans can collapse to a single distinct label.
        # Both scoring functions raise ValueError in that case, so skip this K instead of
        # aborting the whole search.
        if np.unique(cluster_labels).size < 2:
            continue

        if criterion == 'silhouette':
            score = silhouette_score(X, cluster_labels)
        else:
            score = calinski_harabasz_score(X, cluster_labels)

        scores.append((k, score))

        # Strict '>' keeps the earlier K on ties (the smaller K for an ascending max_range).
        if score > best_score:
            best_model, best_score = kmeans, score

    return best_model, scores

In [13]:
# First, we need the matrix of BERT context vectors for a word. Let's start with 'model'.

In [14]:
# Target word to inspect first; its collected usages live in a CSV named after the word.
word = 'model'

tokens = pd.read_csv(f'./collected_tokens/acl/{word}.csv')

In [15]:
# Preview the first five collected usages (head() defaults to 5 rows).
tokens.head()

Unnamed: 0.1,Unnamed: 0,corpus_id,sentence,start_idx,end_idx
0,0,18022704,Since the similarity measure based on the vect...,9,10
1,1,18022704,e) Words with similar contexts might not be sy...,19,20
2,2,18022704,"Therefore, the vector space model should incor...",5,6
3,3,18022704,"Conclusions In this paper, we have adopted the...",11,12
4,4,16703040,Based on a review of our misclassified instanc...,20,21
