In [33]:
import pandas as pd
import numpy as np 
from sklearn.cluster import KMeans
from sklearn.metrics import accuracy_score

In [34]:
# cleaning up the data 
dataset = pd.read_csv("house-votes-84.csv", header=None, index_col=False)

dataset = dataset.replace({"republican": 0, "democrat" : 1, "y" : 0, "n" : 1, "?" : 3})

dataset

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16
0,0,1,0,1,0,0,0,1,1,1,0,3,0,0,0,1,0
1,0,1,0,1,0,0,0,1,1,1,1,1,0,0,0,1,3
2,1,3,0,0,3,0,0,1,1,1,1,0,1,0,0,1,1
3,1,1,0,0,1,3,0,1,1,1,1,0,1,0,1,1,0
4,1,0,0,0,1,0,0,1,1,1,1,0,3,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
430,0,1,1,0,0,0,0,1,1,0,0,1,0,0,0,1,0
431,1,1,1,0,1,1,1,0,0,0,0,1,1,1,1,1,0
432,0,1,3,1,0,0,0,1,1,1,1,0,0,0,0,1,0
433,0,1,1,1,0,0,0,3,3,3,3,1,0,0,0,1,0


In [49]:
#splitting the dataset into parties & votes
data = dataset.to_numpy()

parties = data[:,0:1]
votes = data[:,1:18]

In [50]:
# run kmeans on the votes 

kmeans = KMeans(n_clusters = 2, random_state = 0, max_iter = 300).fit(votes)

print("Number of Iterations: " + str(kmeans.n_iter_))
print("Error/Distance between centroids: " + str(kmeans.inertia_))

Number of Iterations: 7
Error/Distance between centroids: 3196.3626738104354


In [51]:
kmeans.labels_

array([1, 1, 1, 0, 0, 1, 1, 1, 1, 0, 1, 1, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 1, 0, 1, 1, 1, 1, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0, 0, 0, 1,
       1, 1, 0, 0, 0, 0, 0, 1, 0, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1,
       1, 1, 0, 0, 0, 0, 1, 0, 1, 1, 0, 1, 1, 1, 1, 1, 0, 0, 1, 0, 0, 0,
       0, 1, 0, 1, 0, 0, 0, 1, 0, 1, 1, 1, 1, 1, 0, 1, 1, 0, 0, 0, 0, 0,
       1, 1, 1, 1, 1, 0, 0, 0, 0, 1, 1, 0, 0, 0, 1, 0, 1, 0, 1, 1, 0, 0,
       1, 1, 1, 0, 1, 0, 1, 1, 1, 1, 1, 0, 1, 1, 1, 0, 0, 1, 0, 1, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 1, 0, 1,
       0, 0, 0, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 1, 0, 0, 1, 1, 1, 1, 0, 0,
       0, 0, 0, 1, 1, 1, 0, 0, 1, 1, 1, 1, 0, 1, 0, 1, 0, 0, 0, 1, 1, 0,
       0, 0, 0, 0, 0, 1, 0, 0, 1, 1, 0, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0,
       0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 0, 0,
       1, 0, 1, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 1,

In [57]:
score = accuracy_score(parties, kmeans.labels_)

print("Accuracy of Model: " + str(score * 100) + "%")

Accuracy of Model: 13.563218390804598%


In [82]:
# run kmeans on the votes with more parties

kmeans2 = KMeans(n_clusters = 5, random_state = 0, max_iter = 300).fit(votes)

print("Number of Iterations: " + str(kmeans2.n_iter_))
print("Error/Distance between centroids: " + str(kmeans2.inertia_))

Number of Iterations: 5
Error/Distance between centroids: 2292.5327930331955


In [83]:
kmeans2.labels_

array([0, 2, 0, 1, 1, 0, 0, 0, 0, 4, 0, 2, 4, 4, 2, 2, 1, 1, 0, 1, 1, 1,
       1, 1, 4, 1, 1, 1, 0, 1, 0, 4, 1, 0, 1, 0, 0, 0, 0, 1, 4, 4, 1, 1,
       4, 4, 1, 4, 1, 0, 1, 0, 4, 0, 2, 0, 0, 0, 0, 2, 4, 0, 4, 1, 1, 0,
       0, 0, 1, 1, 4, 1, 4, 0, 1, 0, 2, 0, 0, 0, 4, 1, 0, 0, 0, 0, 0, 0,
       0, 2, 1, 1, 1, 1, 1, 1, 0, 0, 1, 0, 0, 1, 2, 0, 2, 1, 0, 3, 4, 4,
       1, 0, 4, 0, 4, 1, 1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 1, 1, 3, 4, 1,
       2, 0, 2, 0, 0, 1, 1, 1, 1, 0, 0, 1, 4, 1, 0, 1, 0, 1, 0, 0, 1, 1,
       0, 0, 0, 4, 0, 4, 0, 0, 0, 0, 2, 4, 0, 1, 2, 1, 4, 0, 1, 0, 4, 1,
       1, 1, 4, 1, 4, 1, 1, 3, 1, 1, 1, 4, 2, 1, 2, 2, 1, 1, 1, 2, 4, 0,
       4, 4, 1, 1, 1, 1, 0, 1, 0, 0, 1, 1, 1, 0, 1, 1, 0, 2, 2, 0, 4, 1,
       4, 2, 4, 0, 0, 0, 1, 1, 0, 0, 0, 0, 4, 0, 4, 0, 1, 1, 4, 0, 0, 1,
       1, 1, 1, 1, 1, 0, 3, 1, 0, 0, 1, 0, 1, 1, 0, 0, 1, 1, 1, 4, 1, 4,
       4, 1, 0, 1, 4, 1, 1, 1, 4, 0, 0, 0, 0, 0, 0, 0, 1, 2, 0, 0, 1, 1,
       2, 1, 1, 2, 1, 1, 1, 1, 2, 2, 0, 4, 1, 1, 0,

In [84]:
score = accuracy_score(parties, kmeans2.labels_)

print("Accuracy of Model: " + str(score * 100) + "%")

Accuracy of Model: 67.58620689655173%
