<a href="https://colab.research.google.com/github/christophersingh/Mental-Health-Classification-Research/blob/main/Improvised%20ML%20Model%20Optimization.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Import statistically significant data

In [None]:
import pandas as pd
import numpy as np 

# Load the feature table from a CSV path relative to the current working directory.
df = pd.read_csv('./statistically_significant_data.csv')
# Bare expression: render the frame as the cell output.
df

Unnamed: 0,time_period,value,CASE_RATE,HOSPITALIZED_RATE,sentiment__negative,sentiment__positive,indicator
0,1.0,28.7,171.46,13.55,500.0,50.0,1.0
1,1.0,28.7,1554.45,153.69,500.0,50.0,1.0
2,1.0,28.7,2529.03,630.34,500.0,50.0,1.0
3,1.0,28.7,2552.64,1192.50,500.0,50.0,1.0
4,1.0,28.7,2976.74,1830.07,500.0,50.0,1.0
...,...,...,...,...,...,...,...
35779,23.0,35.8,20207.72,1160.60,501.0,49.0,2.0
35780,23.0,35.8,15941.19,2540.10,501.0,49.0,2.0
35781,23.0,35.8,7067.11,2418.35,501.0,49.0,2.0
35782,23.0,35.8,7477.43,3995.12,501.0,49.0,2.0


Prepare Pipelines and Mixture Components

In [None]:
import pandas as pd
import numpy as np
from numpy import random
from sklearn.cluster import KMeans
from sklearn.mixture import GaussianMixture
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import make_pipeline

Normalize Data

In [None]:
# Standardise every column of df in one step and keep the result as X.
# NOTE(review): df still contains the 'indicator' column here, so the label is
# part of the clustering input below — confirm that this is intended.
X = StandardScaler().fit_transform(df.values)

In [None]:
# Baseline k-means (3 clusters, fixed seed) fitted on the standardised matrix X.
scaler = StandardScaler()  # created here but not added to the pipeline below
model = KMeans(
    n_clusters=3,
    max_iter=5000,
    n_init=10,
    random_state=0,
)
pipeline = make_pipeline(model)
# Pipeline.fit returns the fitted pipeline, so fit and predict can be chained.
res = pipeline.fit(X).predict(X)

In [None]:
# Three-component Gaussian mixture on the same X; this rebinds `model`, replacing
# the KMeans estimator assigned in the previous cell.
# n_init=100 repeats the fit from 100 initialisations, so this cell can be slow.
model = GaussianMixture(n_components=3, max_iter=10000, n_init=100, random_state=0) 
# Fit the mixture and keep the predicted component for every row.
res2 = model.fit_predict(X)

In [None]:
# Attach both label vectors to df as new columns (mutates df in place, so every
# later cell that reads df sees these two extra columns until they are dropped).
df['K-Means Cluster'] = res
df['GMM Result'] = res2
# Bare expression: render the augmented frame as the cell output.
df

Unnamed: 0,time_period,value,CASE_RATE,HOSPITALIZED_RATE,sentiment__negative,sentiment__positive,indicator,K-Means Cluster,GMM Result
0,1.0,28.7,171.46,13.55,500.0,50.0,1.0,1,1
1,1.0,28.7,1554.45,153.69,500.0,50.0,1.0,1,1
2,1.0,28.7,2529.03,630.34,500.0,50.0,1.0,1,1
3,1.0,28.7,2552.64,1192.50,500.0,50.0,1.0,1,1
4,1.0,28.7,2976.74,1830.07,500.0,50.0,1.0,1,1
...,...,...,...,...,...,...,...,...,...
35779,23.0,35.8,20207.72,1160.60,501.0,49.0,2.0,0,2
35780,23.0,35.8,15941.19,2540.10,501.0,49.0,2.0,0,2
35781,23.0,35.8,7067.11,2418.35,501.0,49.0,2.0,0,2
35782,23.0,35.8,7477.43,3995.12,501.0,49.0,2.0,0,2


**Weighted k-means**

In [None]:
import random

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from pandas import DataFrame
from scipy.stats import skewnorm
from sklearn.cluster import KMeans
# make_blobs is part of the public sklearn.datasets namespace; the private
# sklearn.datasets.samples_generator module was removed in scikit-learn 0.24.
from sklearn.datasets import make_blobs

sns.set()  # for plot styling

In [None]:
# Estimator for the weighted k-means section (3 clusters, fixed seed).
kmeans = KMeans(n_clusters=3, random_state=0, max_iter=5000)
# Drop the label by keyword: DataFrame.drop no longer accepts the axis as a
# positional argument in pandas 2.0+.
# NOTE(review): df still carries 'K-Means Cluster' and 'GMM Result' at this point,
# so X has eight columns; the later `del df[6]` / `del df[7]` cell relies on that.
X = np.array(df.drop(columns=['indicator']).astype(float))
# Label vector as floats.
Y = np.array(df['indicator'].astype(float))

In [None]:
import pandas as pd
from sklearn import preprocessing
from sklearn.preprocessing import StandardScaler

# Standardise X and rebind X to a DataFrame of the scaled values.
# Re-running this cell scales the already-scaled X again because X is overwritten.
x_scaled = StandardScaler().fit_transform(X)
X = pd.DataFrame(x_scaled)
# Instantiated here but not applied in this cell.
min_max_scaler = preprocessing.MinMaxScaler()

In [None]:
# KMeans.fit returns the estimator itself, so wt_kmeansclus and kmeans refer to
# the same fitted object; the labels are read back through that handle.
wt_kmeansclus = kmeans.fit(X)
predicted_kmeans = wt_kmeansclus.predict(X)

Show weighted centers of Features

In [None]:
# NOTE(review): centersdf is first assigned in the "Get weight by column and by
# observation" cell further down, so this cell raises NameError when the notebook
# is run top to bottom on a fresh kernel.
centersdf

Unnamed: 0,Weight time_period,Weight value,Weight case_rate,Weight hospitalized_rate,Weight sentiment__negative,Weight sentiment__positive,6,7,cluster_assignment
0,-0.170563,0.98346,-0.239989,-0.115356,-0.169931,0.169931,1.307777,1.216099,0
1,1.13111,0.316017,1.587126,0.782792,1.099392,-1.099392,-1.468534,0.247146,1
2,-0.190397,-1.178939,-0.265806,-0.137239,-0.176591,0.176591,-0.285642,-0.405569,2


Get weight by column and by observation

In [None]:
# Tag each observation with its k-means cluster.
df['cluster_assignment'] = predicted_kmeans

# One row per cluster centre; the row index doubles as the cluster id.
centers = wt_kmeansclus.cluster_centers_
centersdf = pd.DataFrame(centers)
centersdf['cluster_assignment'] = centersdf.index

# Give the first six centre coordinates readable names; columns 6 and 7 keep
# their integer labels.
weight_column_names = {
    0: 'Weight time_period',
    1: 'Weight value',
    2: 'Weight case_rate',
    3: 'Weight hospitalized_rate',
    4: 'Weight sentiment__negative',
    5: 'Weight sentiment__positive',
}
centersdf = centersdf.rename(columns=weight_column_names)

# Left-join so every observation carries the coordinates of its own centre.
df = df.merge(centersdf, on='cluster_assignment', how='left')
df.head()

Unnamed: 0,time_period,value,CASE_RATE,HOSPITALIZED_RATE,sentiment__negative,sentiment__positive,indicator,K-Means Cluster,GMM Result,cluster_assignment,Weight time_period,Weight value,Weight case_rate,Weight hospitalized_rate,Weight sentiment__negative,Weight sentiment__positive,6,7
0,1.0,28.7,171.46,13.55,500.0,50.0,1.0,1,1,2,-0.190397,-1.178939,-0.265806,-0.137239,-0.176591,0.176591,-0.285642,-0.405569
1,1.0,28.7,1554.45,153.69,500.0,50.0,1.0,1,1,2,-0.190397,-1.178939,-0.265806,-0.137239,-0.176591,0.176591,-0.285642,-0.405569
2,1.0,28.7,2529.03,630.34,500.0,50.0,1.0,1,1,2,-0.190397,-1.178939,-0.265806,-0.137239,-0.176591,0.176591,-0.285642,-0.405569
3,1.0,28.7,2552.64,1192.5,500.0,50.0,1.0,1,1,2,-0.190397,-1.178939,-0.265806,-0.137239,-0.176591,0.176591,-0.285642,-0.405569
4,1.0,28.7,2976.74,1830.07,500.0,50.0,1.0,1,1,2,-0.190397,-1.178939,-0.265806,-0.137239,-0.176591,0.176591,-0.285642,-0.405569


**Get Weights**

In [None]:
# Remove the join key and the two unnamed centre coordinates (integer labels 6
# and 7) from df in place, leaving only the six named weight columns.
df.drop(columns=['cluster_assignment', 6, 7], inplace=True)
df

Unnamed: 0,time_period,value,CASE_RATE,HOSPITALIZED_RATE,sentiment__negative,sentiment__positive,indicator,K-Means Cluster,GMM Result,Weight time_period,Weight value,Weight case_rate,Weight hospitalized_rate,Weight sentiment__negative,Weight sentiment__positive
0,1.0,28.7,171.46,13.55,500.0,50.0,1.0,1,1,-0.190397,-1.178939,-0.265806,-0.137239,-0.176591,0.176591
1,1.0,28.7,1554.45,153.69,500.0,50.0,1.0,1,1,-0.190397,-1.178939,-0.265806,-0.137239,-0.176591,0.176591
2,1.0,28.7,2529.03,630.34,500.0,50.0,1.0,1,1,-0.190397,-1.178939,-0.265806,-0.137239,-0.176591,0.176591
3,1.0,28.7,2552.64,1192.50,500.0,50.0,1.0,1,1,-0.190397,-1.178939,-0.265806,-0.137239,-0.176591,0.176591
4,1.0,28.7,2976.74,1830.07,500.0,50.0,1.0,1,1,-0.190397,-1.178939,-0.265806,-0.137239,-0.176591,0.176591
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
35779,23.0,35.8,20207.72,1160.60,501.0,49.0,2.0,0,2,1.131110,0.316017,1.587126,0.782792,1.099392,-1.099392
35780,23.0,35.8,15941.19,2540.10,501.0,49.0,2.0,0,2,1.131110,0.316017,1.587126,0.782792,1.099392,-1.099392
35781,23.0,35.8,7067.11,2418.35,501.0,49.0,2.0,0,2,1.131110,0.316017,1.587126,0.782792,1.099392,-1.099392
35782,23.0,35.8,7477.43,3995.12,501.0,49.0,2.0,0,2,1.131110,0.316017,1.587126,0.782792,1.099392,-1.099392


In [None]:
# Summary statistics for every column of df, including the merged weight columns.
df.describe()

Unnamed: 0,time_period,value,CASE_RATE,HOSPITALIZED_RATE,sentiment__negative,sentiment__positive,indicator,K-Means Cluster,GMM Result,Weight time_period,Weight value,Weight case_rate,Weight hospitalized_rate,Weight sentiment__negative,Weight sentiment__positive
count,35784.0,35784.0,35784.0,35784.0,35784.0,35784.0,35784.0,35784.0,35784.0,35784.0,35784.0,35784.0,35784.0,35784.0,35784.0
mean,14.045775,31.663146,4320.33733,1208.542817,500.704225,49.295775,1.0,1.057903,1.178404,0.057451,-0.263939,0.081119,0.037707,0.059015,-0.059015
std,6.431857,5.701017,3303.88542,998.74775,8.953674,8.953674,0.816508,0.72039,0.675609,0.508385,0.979133,0.71309,0.35288,0.492563,0.492563
min,1.0,22.0,171.46,13.55,469.0,30.0,0.0,0.0,0.0,-0.190397,-1.178939,-0.265806,-0.137239,-0.176591,-1.099392
25%,10.0,25.1,2593.3625,280.3275,495.0,43.0,0.0,1.0,1.0,-0.190397,-1.178939,-0.265806,-0.137239,-0.176591,0.169931
50%,14.5,33.0,3797.395,905.785,501.0,49.0,1.0,1.0,1.0,-0.190397,-1.178939,-0.265806,-0.137239,-0.176591,0.176591
75%,20.0,36.1,4745.6475,1886.595,507.0,55.0,2.0,2.0,2.0,-0.170563,0.98346,-0.239989,-0.115356,-0.169931,0.176591
max,23.0,41.7,20207.72,3995.12,520.0,81.0,2.0,2.0,2.0,1.13111,0.98346,1.587126,0.782792,1.099392,0.176591


In [None]:
# Summary statistics restricted to the six merged weight columns.
weight_columns = [
    'Weight time_period',
    'Weight value',
    'Weight case_rate',
    'Weight hospitalized_rate',
    'Weight sentiment__negative',
    'Weight sentiment__positive',
]
weights = df[weight_columns].describe()
weights

Unnamed: 0,Weight time_period,Weight value,Weight case_rate,Weight hospitalized_rate,Weight sentiment__negative,Weight sentiment__positive
count,35784.0,35784.0,35784.0,35784.0,35784.0,35784.0
mean,0.057451,-0.263939,0.081119,0.037707,0.059015,-0.059015
std,0.508385,0.979133,0.71309,0.35288,0.492563,0.492563
min,-0.190397,-1.178939,-0.265806,-0.137239,-0.176591,-1.099392
25%,-0.190397,-1.178939,-0.265806,-0.137239,-0.176591,0.169931
50%,-0.190397,-1.178939,-0.265806,-0.137239,-0.176591,0.176591
75%,-0.170563,0.98346,-0.239989,-0.115356,-0.169931,0.176591
max,1.13111,0.98346,1.587126,0.782792,1.099392,0.176591


Take standard deviation to obtain feature weights

In [None]:
# Take the 'std' row of the describe() table: one standard deviation per weight column.
helper = weights.loc['std']
# Bare expression: show the selected row.
helper

In [None]:
# Unpack the six standard deviations from `helper` into individually named scalars.
(
    time_period_weight,
    value_weight,
    case_rate_weight,
    hospitalized_rate_weight,
    sentiment_negative_weight,
    sentiment_positive_weight,
) = (
    helper[column]
    for column in (
        'Weight time_period',
        'Weight value',
        'Weight case_rate',
        'Weight hospitalized_rate',
        'Weight sentiment__negative',
        'Weight sentiment__positive',
    )
)

In [None]:
# Strip the cluster labels and merged weight columns so df is back to the
# original feature columns plus 'indicator'.
derived_columns = [
    'K-Means Cluster',
    'GMM Result',
    'Weight time_period',
    'Weight value',
    'Weight case_rate',
    'Weight hospitalized_rate',
    'Weight sentiment__negative',
    'Weight sentiment__positive',
]
df.drop(columns=derived_columns, inplace=True)
df

Unnamed: 0,time_period,value,CASE_RATE,HOSPITALIZED_RATE,sentiment__negative,sentiment__positive,indicator
0,1.0,28.7,171.46,13.55,500.0,50.0,1.0
1,1.0,28.7,1554.45,153.69,500.0,50.0,1.0
2,1.0,28.7,2529.03,630.34,500.0,50.0,1.0
3,1.0,28.7,2552.64,1192.50,500.0,50.0,1.0
4,1.0,28.7,2976.74,1830.07,500.0,50.0,1.0
...,...,...,...,...,...,...,...
35779,23.0,35.8,20207.72,1160.60,501.0,49.0,2.0
35780,23.0,35.8,15941.19,2540.10,501.0,49.0,2.0
35781,23.0,35.8,7067.11,2418.35,501.0,49.0,2.0
35782,23.0,35.8,7477.43,3995.12,501.0,49.0,2.0


In [None]:
import pandas as pd
from sklearn import preprocessing

# Set the label aside, then remove it from df so only features are scaled.
indicator = list(df['indicator'])
df.drop(columns=['indicator'], inplace=True)

# Rescale each feature column with MinMaxScaler and rebuild df from the result;
# the new frame has integer column labels instead of the original names.
min_max_scaler = preprocessing.MinMaxScaler()
x = df.values  # numpy array of the remaining feature columns
x_scaled = min_max_scaler.fit_transform(x)
df = pd.DataFrame(x_scaled)
df

Unnamed: 0,0,1,2,3,4,5
0,0.0,0.340102,0.000000,0.000000,0.607843,0.392157
1,0.0,0.340102,0.069024,0.035197,0.607843,0.392157
2,0.0,0.340102,0.117665,0.154911,0.607843,0.392157
3,0.0,0.340102,0.118844,0.296102,0.607843,0.392157
4,0.0,0.340102,0.140010,0.456232,0.607843,0.392157
...,...,...,...,...,...,...
35779,1.0,0.700508,1.000000,0.288090,0.627451,0.372549
35780,1.0,0.700508,0.787060,0.634561,0.627451,0.372549
35781,1.0,0.700508,0.344159,0.603983,0.627451,0.372549
35782,1.0,0.700508,0.364637,1.000000,0.627451,0.372549


Obtain adjusted feature weight

In [None]:
feature_weights = [
    time_period_weight,
    value_weight,
    case_rate_weight,
    hospitalized_rate_weight,
    sentiment_negative_weight,
    sentiment_positive_weight,
]

def cluster(f1, f2, f3, f4, f5, f6):
  """Return the natural log of the weighted sum of six feature values.

  Each argument is multiplied by the entry of the module-level
  `feature_weights` list at the same position; the products are added left to
  right and the logarithm of that total is returned.
  """
  total = f1 * feature_weights[0]
  for feature, weight in zip((f2, f3, f4, f5, f6), feature_weights[1:]):
    total = total + feature * weight
  return np.log(total)

# Score every row of df from its six feature columns (integer labels 0-5).
clusterResult = [
    cluster(row[0], row[1], row[2], row[3], row[4], row[5])
    for _, row in df.iterrows()
]

df['feature_weight'] = clusterResult

In [None]:
df['indicator'] = indicator
df

Unnamed: 0,0,1,2,3,4,5,feature_weight,indicator
0,0.0,0.340102,0.000000,0.000000,0.607843,0.392157,-0.191684,1.0
1,0.0,0.340102,0.069024,0.035197,0.607843,0.392157,-0.119675,1.0
2,0.0,0.340102,0.117665,0.154911,0.607843,0.392157,-0.036520,1.0
3,0.0,0.340102,0.118844,0.296102,0.607843,0.392157,0.014694,1.0
4,0.0,0.340102,0.140010,0.456232,0.607843,0.392157,0.082872,1.0
...,...,...,...,...,...,...,...,...
35779,1.0,0.700508,1.000000,0.288090,0.627451,0.372549,0.916926,2.0
35780,1.0,0.700508,0.787060,0.634561,0.627451,0.372549,0.905030,2.0
35781,1.0,0.700508,0.344159,0.603983,0.627451,0.372549,0.763320,2.0
35782,1.0,0.700508,0.364637,1.000000,0.627451,0.372549,0.832795,2.0


In [None]:
# Summary statistics for the rows whose indicator equals 0.
df[df['indicator']==0].describe()

Unnamed: 0,0,1,2,3,4,5,feature_weight,indicator
count,11928.0,11928.0,11928.0,11928.0,11928.0,11928.0,11928.0,11928.0
mean,0.59299,0.539179,0.207068,0.300131,0.621651,0.378349,0.431477,0.0
std,0.292365,0.129753,0.1649,0.25085,0.175567,0.175567,0.217639,0.0
min,0.0,0.243655,0.0,0.0,0.0,0.0,-0.246252,0.0
25%,0.409091,0.436548,0.120876,0.067003,0.509804,0.254902,0.281836,0.0
50%,0.613636,0.558376,0.180969,0.224091,0.627451,0.372549,0.4434,0.0
75%,0.863636,0.624365,0.228295,0.470429,0.745098,0.490196,0.587654,0.0
max,1.0,0.741117,1.0,1.0,1.0,1.0,0.844852,0.0


In [None]:
# NOTE(review): indicator0, indicator1 and indicator2 are not assigned in any
# cell visible in this notebook, so this cell raises NameError on a fresh kernel.
# The recorded indicator0 (0.2818...) equals the 25% quantile of feature_weight
# for indicator == 0 shown above — presumably these are per-class quantiles;
# confirm and restore the cell that defines them.
indicator0, indicator1, indicator2

(0.28183599677213284, -0.02747191424547771, 0.4296134921993718)

Find distance between all features

In [None]:
# NOTE(review): the recorded output shows a 'clusters' column, but no visible
# cell adds one to df — the cell that created it appears to be missing.
df

Unnamed: 0,0,1,2,3,4,5,feature_weight,indicator,clusters
0,0.0,0.340102,0.000000,0.000000,0.607843,0.392157,-0.191684,1.0,1
1,0.0,0.340102,0.069024,0.035197,0.607843,0.392157,-0.119675,1.0,1
2,0.0,0.340102,0.117665,0.154911,0.607843,0.392157,-0.036520,1.0,1
3,0.0,0.340102,0.118844,0.296102,0.607843,0.392157,0.014694,1.0,1
4,0.0,0.340102,0.140010,0.456232,0.607843,0.392157,0.082872,1.0,1
...,...,...,...,...,...,...,...,...,...
35779,1.0,0.700508,1.000000,0.288090,0.627451,0.372549,0.916926,2.0,2
35780,1.0,0.700508,0.787060,0.634561,0.627451,0.372549,0.905030,2.0,2
35781,1.0,0.700508,0.344159,0.603983,0.627451,0.372549,0.763320,2.0,2
35782,1.0,0.700508,0.364637,1.000000,0.627451,0.372549,0.832795,2.0,2


In [None]:
# Remove the derived columns before the scores are recomputed.
# errors='ignore': no visible cell creates 'clusters', so on a top-to-bottom run
# that column does not exist and a strict drop would raise KeyError.
df.drop(columns=['feature_weight', 'clusters'], errors='ignore', inplace=True)
df

Unnamed: 0,0,1,2,3,4,5,indicator
0,0.0,0.340102,0.000000,0.000000,0.607843,0.392157,1.0
1,0.0,0.340102,0.069024,0.035197,0.607843,0.392157,1.0
2,0.0,0.340102,0.117665,0.154911,0.607843,0.392157,1.0
3,0.0,0.340102,0.118844,0.296102,0.607843,0.392157,1.0
4,0.0,0.340102,0.140010,0.456232,0.607843,0.392157,1.0
...,...,...,...,...,...,...,...
35779,1.0,0.700508,1.000000,0.288090,0.627451,0.372549,2.0
35780,1.0,0.700508,0.787060,0.634561,0.627451,0.372549,2.0
35781,1.0,0.700508,0.344159,0.603983,0.627451,0.372549,2.0
35782,1.0,0.700508,0.364637,1.000000,0.627451,0.372549,2.0


In [None]:
import math

# The standard-library `math` module defines no `phi`, so `math.phi` raises
# AttributeError on a fresh kernel. Honour a value attached by an earlier cell
# if there is one; otherwise use 1.0, which is the factor the per-feature table
# recorded further down implies (0.340102 * 0.979133 = 0.333005).
PHI = getattr(math, "phi", 1.0)

feature_weights = [time_period_weight, value_weight, case_rate_weight, hospitalized_rate_weight, sentiment_negative_weight, sentiment_positive_weight]

# Integer labels of the six scaled feature columns in df.
FEATURE_COLUMNS = [0, 1, 2, 3, 4, 5]

def cluster(f1, f2, f3, f4, f5, f6):
  """Return the natural log of the PHI-scaled weighted sum of six features.

  Args:
    f1..f6: scaled feature values, in the same order as `feature_weights`.

  Returns:
    np.log of sum(feature * weight * PHI) over the six features.
  """
  total = 0.0
  for feature, weight in zip((f1, f2, f3, f4, f5, f6), feature_weights):
    total += feature * weight * PHI
  return np.log(total)

# Score every row from its six feature columns only.
clusterResult = [cluster(*features) for features in df[FEATURE_COLUMNS].values]

df['feature_weight'] = clusterResult

Summation Operator

Iterate through dataframe to convert all observations

In [None]:
import math

# The standard-library `math` module defines no `phi`, so `math.phi` raises
# AttributeError on a fresh kernel. Honour a value attached by an earlier cell
# if there is one; otherwise use 1.0, which is the factor the recorded output of
# this cell implies (0.340102 * 0.979133 = 0.333005 in the 'value' column).
PHI = getattr(math, "phi", 1.0)

feature_weights = [time_period_weight, value_weight, case_rate_weight, hospitalized_rate_weight, sentiment_negative_weight, sentiment_positive_weight]

# Integer labels of the six scaled feature columns in df.
FEATURE_COLUMNS = [0, 1, 2, 3, 4, 5]

def cluster(f1, f2, f3, f4, f5, f6):
  """Return the six features individually scaled by their weight and PHI.

  Args:
    f1..f6: scaled feature values, in the same order as `feature_weights`.

  Returns:
    A 6-tuple of feature * weight * PHI, in argument order.
  """
  return tuple(
      feature * weight * PHI
      for feature, weight in zip((f1, f2, f3, f4, f5, f6), feature_weights)
  )

# One weighted 6-tuple per observation, built from the feature columns only.
weighted_rows = [cluster(*features) for features in df[FEATURE_COLUMNS].values]

new_df = pd.DataFrame(
    weighted_rows,
    columns=['time_period', 'value', 'case_rate', 'hospitalized_rate', 'negative_tweets', 'positive_tweets'],
)
# Carry the label across positionally; it becomes the last column.
new_df['indicator'] = df['indicator'].values
new_df

Unnamed: 0,time_period,value,case_rate,hospitalized_rate,negative_tweets,positive_tweets,indicator
0,0.000000,0.333005,0.000000,0.000000,0.299401,0.193162,1.0
1,0.000000,0.333005,0.049221,0.012420,0.299401,0.193162,1.0
2,0.000000,0.333005,0.083906,0.054665,0.299401,0.193162,1.0
3,0.000000,0.333005,0.084746,0.104488,0.299401,0.193162,1.0
4,0.000000,0.333005,0.099840,0.160995,0.299401,0.193162,1.0
...,...,...,...,...,...,...,...
35779,0.508385,0.685890,0.713090,0.101661,0.309059,0.183504,2.0
35780,0.508385,0.685890,0.561245,0.223924,0.309059,0.183504,2.0
35781,0.508385,0.685890,0.245416,0.213134,0.309059,0.183504,2.0
35782,0.508385,0.685890,0.260019,0.352880,0.309059,0.183504,2.0


Partition using derived metrics

In [None]:
value = new_df['value'].values

# Thresholds sit one standard deviation below and above the mean of `value`.
std_err = value.std()
low = value.mean() - std_err
high = value.mean() + std_err

# [0, low] -> cluster 1, (low, high] -> cluster 0, everything else -> cluster 2.
clusters = [
    1 if 0 <= v <= low else (0 if low < v <= high else 2)
    for v in value
]

new_df['clusters'] = clusters

In [None]:
# These three metric functions are not imported by any earlier cell, so bring
# them into scope here to keep the cell runnable on a fresh kernel.
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# Compare the threshold-derived clusters against the ground-truth indicator.
print("Accuracy:",accuracy_score(new_df['indicator'] , new_df['clusters']))
print(classification_report(new_df['indicator'] , new_df['clusters']))
print(confusion_matrix(new_df['indicator'] , new_df['clusters']))

Accuracy: 0.8615023474178404
              precision    recall  f1-score   support

         0.0       0.85      0.70      0.77     11928
         1.0       0.93      0.95      0.94     11928
         2.0       0.81      0.93      0.86     11928

    accuracy                           0.86     35784
   macro avg       0.86      0.86      0.86     35784
weighted avg       0.86      0.86      0.86     35784

[[ 8400   882  2646]
 [  588 11340     0]
 [  840     0 11088]]


Get Cluster Entropy

In [None]:
import math
def get_entropys(df_clustered, label_col_name='indicator', cluster_col_name='cluster', cluster_values=range(1,3,1)):
    """Return one entropy value per entry of `cluster_values`.

    For every cluster id in `cluster_values`, the share of rows carrying each
    label id (the label ids are also taken from `cluster_values`) is rounded to
    three decimals; zero shares are skipped and the remaining ones are folded
    into -sum(p * ln p), itself rounded to three decimals.

    Raises ZeroDivisionError when a listed cluster id has no rows.
    """
    labels = df_clustered[label_col_name]
    assigned = df_clustered[cluster_col_name]

    entropies = []
    for cluster in cluster_values:
        in_cluster = assigned == cluster
        terms = []
        for label in cluster_values:
            matching = df_clustered[(labels == label) & in_cluster].shape[0]
            share = round(matching / df_clustered[in_cluster].shape[0], 3)
            if share != 0:
                terms.append(share * math.log(share))
        entropies.append(-round(sum(terms), 3))
    return entropies
  
# Both the cluster ids and the indicator labels take the values 0, 1 and 2, so
# all three must be scanned; starting the range at 1 would leave cluster 0 and
# label 0 out of the entropy.
entropys_5 = get_entropys(new_df, label_col_name='indicator', cluster_col_name='clusters', cluster_values=range(3))
print("Clusters' Entropy")
# Report the lowest per-cluster entropy.
print("Entropy: {0:.3f}".format(min(entropys_5)))

Clusters' Entropy
Entropy: 0.259


Get Cluster Purity

In [None]:
import numpy as np
from sklearn import metrics

def purity_score(y_true, y_pred):
    """Return the purity of a clustering against the true labels.

    Purity is the sum, over predicted clusters, of the size of the largest
    true-label group inside each cluster, divided by the total sample count.
    """
    # Rows are true labels, columns are predicted clusters.
    table = metrics.cluster.contingency_matrix(y_true, y_pred)
    dominant_per_cluster = np.amax(table, axis=0)
    return np.sum(dominant_per_cluster) / np.sum(table)

# Purity of the threshold-derived clusters against the ground-truth indicator.
purity = purity_score(new_df['indicator'], new_df['clusters'])
print("The purity is:",purity)

The purity is: 0.8615023474178404


In [None]:
from sklearn.metrics.cluster import v_measure_score
# V-measure of the threshold-derived clusters against the ground-truth indicator.
v_measure = v_measure_score(new_df['indicator'], new_df['clusters'])
print("The v_measure is:",v_measure)

The v_measure is: 0.6226071463542034


In [None]:
from sklearn.metrics.cluster import adjusted_rand_score
# Adjusted Rand index of the clusters against the ground-truth indicator.
# The printed label says "rand_score", but the value is the adjusted variant.
randScore = adjusted_rand_score(new_df['indicator'], new_df['clusters'])
print("The rand_score is:",randScore)

The rand_score is: 0.6509110504747214


In [None]:
from sklearn.metrics import silhouette_score 
from sklearn.metrics import calinski_harabasz_score
from sklearn.metrics import davies_bouldin_score

# Score only the weighted feature columns. Passing 'indicator' and 'clusters'
# as coordinates would let the labels themselves separate the groups and
# inflate all three internal-validity metrics.
cluster_features = new_df.drop(columns=['indicator', 'clusters'])

score_kemans_s = silhouette_score(cluster_features, clusters, metric='euclidean')
score_kemans_c = calinski_harabasz_score(cluster_features, clusters)
score_kemans_d = davies_bouldin_score(cluster_features, clusters)
print('Silhouette Score: %.4f' % score_kemans_s)
print('Calinski Harabasz Score: %.4f' % score_kemans_c)
print('Davies Bouldin Score: %.4f' % score_kemans_d)

Silhouette Score: 0.7032
Calinski Harabasz Score: 184390.7946
Davies Bouldin Score: 0.4429
