In [58]:
import pandas as pd
import numpy as np
from sklearn.manifold import TSNE
from sklearn.decomposition import PCA
from sklearn.decomposition import TruncatedSVD
from sklearn.cluster import KMeans
from sklearn import metrics
import plotly.express as px

In [2]:
policy_filepath = "policy_database.xlsx"

In [3]:
policy_df = pd.read_excel(policy_filepath)
policy_df

Unnamed: 0,state,year,felony,invcommitment,invoutpatient,danger,drugmisdemeanor,alctreatment,alcoholism,relinquishment,...,expartedating,dvrosurrender,dvrosurrendernoconditions,dvrosurrenderdating,expartesurrender,expartesurrendernoconditions,expartesurrenderdating,dvroremoval,stalking,lawtotal
0,Alabama,1991,0,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,15
1,Alabama,1992,0,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,1,16
2,Alabama,1993,0,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,1,16
3,Alabama,1994,0,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,1,16
4,Alabama,1995,0,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,1,16
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1495,Wyoming,2016,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,8
1496,Wyoming,2017,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,8
1497,Wyoming,2018,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,7
1498,Wyoming,2019,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,7


# Clustering

## Metrics

In [21]:
def intrinsic_metrics(data, predictions):
    #sillhouette score ranges from -1 to 1, where 1 is best and 0 indicates cluster overlap
    ss = metrics.silhouette_score(data, predictions, metric='euclidean')
    print("Sillhouette score:", ss)
    # variance ratio criterion-- how tightly clustered (higher is better)
    chs = metrics.calinski_harabasz_score(data, predictions)
    print("Calinski-Harabasz Index:", chs)
    # similarity between clusters (lower is better)
    dbs = metrics.davies_bouldin_score(data, predictions)   
    print("Davies-Bouldin Index:", dbs)
    return [ss, chs, dbs]

## Model Training

In [52]:
def run_kMeans(X, k):
    best_model = 0
    best_score = 0
    for i in range(2, k+1):
        clusterer = KMeans(n_clusters=i, init='k-means++')
        predictions = clusterer.fit_predict(X)
        score = metrics.davies_bouldin_score(X, predictions) 
        if score > best_score:
            best_score = score
            best_model = predictions
    return predictions        

In [53]:
best_preds = run_kMeans(X, 10)

In [54]:
intrinsic_metrics(X, best_preds)

Sillhouette score: 0.12817562276732422
Calinski-Harabasz Index: 144.9226724759816
Davies-Bouldin Index: 1.9813402751145592


[0.12817562276732422, 144.9226724759816, 1.9813402751145592]

In [55]:
tsne = TSNE(n_components=2, verbose=1)
data_embedded = tsne.fit_transform(sentence_embeddings)

df_tsne = pd.DataFrame()
df_tsne["label"] = best_preds
df_tsne["dimension1"] = data_embedded[:, 0]
df_tsne["dimension2"] = data_embedded[:, 1]
fig = px.scatter(df_tsne, x="dimension1", y="dimension2",  color="label", title="2-D t-SNE of Policy Embeddings")
fig.show()

Clusters are more obviously separated with PCA visualization

In [56]:
pca = PCA(n_components=2)
pca_data = pca.fit_transform(X_scaled)

df_pca = pd.DataFrame()
df_pca["label"] = best_preds
df_pca["dimension1"] = pca_data[:, 0]
df_pca["dimension2"] = pca_data[:, 1]
fig = px.scatter(df_pca, x="dimension1", y="dimension2",  color="label", title="2-D PCA of Policy Embeddings")
fig.show()

In [59]:
svd = TruncatedSVD(n_components=2, algorithm='randomized',
                   random_state=0)
svd_data = svd.fit_transform(X)

df_svd = pd.DataFrame()
df_svd["label"] = best_preds
df_svd["dimension1"] = svd_data[:, 0]
df_svd["dimension2"] = svd_data[:, 1]
fig = px.scatter(df_pca, x="dimension1", y="dimension2",  color="label", title="2-D Truncated SVD of Policy Embeddings")
fig.show()