# Cluster labelling
Try to label the clusters automatically; if the automatic labels are not good enough, label them manually.

In [1]:
import pandas as pd
import numpy as np
import pickle
from sklearn.feature_extraction.text import TfidfVectorizer

In [2]:
# Load the paper records into a DataFrame. The file is read as a JSON list of
# records; later cells rely on it providing an `abstract` column.
df = pd.read_json(
    path_or_buf='data/arxiv_papers.json',
    orient='records',
)

In [3]:
# Load the precomputed abstract embeddings from their .npy file.
# assumes one embedding row per row of `df`, in the same order — TODO confirm
embeddings = np.load(file='data/abstract_embeddings.npy')

In [4]:
# Restore the pickled clustering model; later cells use its `predict` method
# and `n_clusters` attribute.
# NOTE(review): unpickling runs arbitrary code — only load model files that
# come from a trusted source.
with open('model/clusteriser.model', mode='rb') as model_file:
    clusteriser = pickle.load(model_file)

In [5]:
# Predict one cluster id per embedding and store it in a new `cluster` column.
# assumes `embeddings` rows are aligned with `df` rows — TODO confirm
cluster_labels = clusteriser.predict(embeddings)
df['cluster'] = list(cluster_labels)

In [6]:
# Build one "document" per cluster by concatenating (space-separated) every
# abstract assigned to that cluster. Index i of `corpus` is cluster id i;
# a cluster with no papers yields an empty string.
corpus = [
    ' '.join(df.loc[df['cluster'] == cluster_id, 'abstract'])
    for cluster_id in range(clusteriser.n_clusters)
]

In [7]:
# TF-IDF over the per-cluster documents: English stop words are removed and
# terms range from single words up to three-word phrases.
vectoriser = TfidfVectorizer(
    stop_words='english',
    ngram_range=(1, 3),
)
# Dense score matrix: one row per cluster document, one column per term.
tfidf_scores = vectoriser.fit_transform(corpus)
X = tfidf_scores.toarray()

In [10]:
# Name each cluster after its five highest-scoring TF-IDF terms, joined
# with ' ; ' (best term first).
#
# `get_feature_names()` was deprecated in scikit-learn 1.0 and removed in 1.2,
# so prefer `get_feature_names_out()` and fall back only on older releases.
if hasattr(vectoriser, 'get_feature_names_out'):
    feature_names = np.asarray(vectoriser.get_feature_names_out())
else:
    feature_names = np.asarray(vectoriser.get_feature_names())

# argsort is ascending, so reverse every row and keep the first five columns.
top_term_indices = X.argsort(axis=1)[:, ::-1][:, :5]
cluster_names = [' ; '.join(terms) for terms in feature_names[top_term_indices]]

In [11]:
# Table mapping each cluster id (used as the index) to its generated name.
clusters_df = (
    pd.DataFrame({
        'cluster_id': list(range(clusteriser.n_clusters)),
        'cluster_name': cluster_names,
    })
    .set_index('cluster_id')
)

In [12]:
# Persist the id -> name table; the `cluster_id` index is written as the
# first CSV column.
clusters_df.to_csv(path_or_buf='model/cluster_names.csv')