In [1]:
import numpy as np
import pickle
import pandas as pd

# Load the data
with open('ParlaMint_GB_commons_embeddings_truncated.pkl', 'rb') as f:
    embeddings_dict = pickle.load(f)

In [2]:

embeddings_matrix = np.array(list(embeddings_dict.values()))
embeddings_matrix.shape

(472782, 768)

### K-Means with FAISS

In [5]:
import faiss

ncentroids = 2 # Number of clusters
niter = 100
verbose = True
d = embeddings_matrix.shape[1]
kmeans = faiss.Kmeans(d, ncentroids, niter=niter, verbose=verbose)
kmeans.train(embeddings_matrix)


Sampling a subset of 512 / 472782 for training
Clustering 512 points in 768D to 2 clusters, redo 1 times, 100 iterations
  Preprocessing in 0.19 s
  Iteration 99 (23.28 s, search 13.86 s): objective=2593.88 imbalance=1.006 nsplit=0       


2593.877197265625

Create D, I, where I is the cluster and D is the L2 distance to the cluster.

In [16]:
D, I = kmeans.index.search(embeddings_matrix, 1)
I = I.flatten()
I.shape

(472782,)

Check the shape of the centroid data in `kmeans.centroids`

In [15]:
kmeans.centroids.shape

(2, 768)

Create a dataframe with the cluster id:s

In [8]:
# Get the number of cluster members from I

cluster_members = pd.DataFrame(I, columns=['cluster_id'])

Show the counts of different clusters

In [9]:
cluster_members['cluster_id'].value_counts()

0    247187
1    225595
Name: cluster_id, dtype: int64

Load the metadata for the embeddings

In [7]:
metadata = pd.read_feather('ParlaMint_GB_commons.feather')
metadata.head()

Unnamed: 0,ID,Title,Date,Body,Term,Session,Meeting,Sitting,Agenda,Subcorpus,Speaker_role,Speaker_MP,Speaker_Minister,Speaker_party,Speaker_party_name,Party_status,Speaker_name,Speaker_gender,Speaker_birth
0,ParlaMint-GB_2017-11-20-commons.u1,"Minutes of the House of Commons, Daily Session...",2017-11-20,Lower house,57,,,2017-11-20,,Reference,Chairperson,MP,-,CON,Conservative,,"Bercow, John Simon",M,-
1,ParlaMint-GB_2017-11-20-commons.u2,"Minutes of the House of Commons, Daily Session...",2017-11-20,Lower house,57,,,2017-11-20,,Reference,Regular,MP,-,CON,Conservative,,"Jayawardena, Ranil Malcolm",M,-
2,ParlaMint-GB_2017-11-20-commons.u3,"Minutes of the House of Commons, Daily Session...",2017-11-20,Lower house,57,,,2017-11-20,,Reference,Regular,MP,-,CON,Conservative,,"Rudd, Amber",F,-
3,ParlaMint-GB_2017-11-20-commons.u4,"Minutes of the House of Commons, Daily Session...",2017-11-20,Lower house,57,,,2017-11-20,,Reference,Regular,MP,-,CON,Conservative,,"Jayawardena, Ranil Malcolm",M,-
4,ParlaMint-GB_2017-11-20-commons.u5,"Minutes of the House of Commons, Daily Session...",2017-11-20,Lower house,57,,,2017-11-20,,Reference,Regular,MP,-,CON,Conservative,,"Rudd, Amber",F,-


In [40]:
metadata.dtypes

ID                     object
Title                  object
Date                   object
Body                   object
Term                    int64
Session               float64
Meeting               float64
Sitting                object
Agenda                float64
Subcorpus              object
Speaker_role           object
Speaker_MP             object
Speaker_Minister       object
Speaker_party          object
Speaker_party_name     object
Party_status           object
Speaker_name           object
Speaker_gender         object
Speaker_birth          object
dtype: object

Attach the speech id:s to the cluster id:s

In [10]:
cluster_members['speech_id'] = embeddings_dict.keys()

Merge the metadata with the cluster labels

In [11]:
merged_df = pd.merge(metadata, cluster_members, left_on='ID', right_on='speech_id')

Calculate the Normalized Mutual Information (NMI) between the cluster labels and a selected variable from the metadata. A perfect score is 1.0.

In [19]:
from sklearn.metrics import normalized_mutual_info_score
# Change this according to your preference. The number of unique values should be equal to ncentroids.
labels_true = merged_df['Speaker_party']

# Do not change this. This is the cluster id assigned by k-means.
labels_pred = merged_df['cluster_id']

normalized_mutual_info_score(labels_true, labels_pred)

0.0037991005579201646

Save the metadata with the cluster labels

In [39]:
merged_df.to_feather(f'ParlaMint_GB_commons_clustered_{ncentroids}.feather')