In [17]:
from lib.spectral_clustering import spectral_clustering
from lib.categorical_similarity_functions import categorical_preprocessing_csv
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D
import numpy as np

First thing's first, we need to take our csv file and make it something that we can work with.

In [25]:
numOfAtts, dataA = categorical_preprocessing_csv("lib/data/mushrooms.csv")

Now, let's take that data and cluster it!

In [6]:
assns = spectral_clustering(data,2,"rw",numOfAtts=numOfAtts)

Please be a real thing:  [[1.         0.96085267 0.95796875 ... 0.9229095  0.94593881 0.92519783]
 [0.96085267 1.         0.99351086 ... 0.95104249 0.91610079 0.95333082]
 [0.95796875 0.99351086 1.         ... 0.95163808 0.91610079 0.95104249]
 ...
 [0.9229095  0.95104249 0.95163808 ... 1.         0.91495269 0.99606841]
 [0.94593881 0.91610079 0.91610079 ... 0.91495269 1.         0.91495269]
 [0.92519783 0.95333082 0.95104249 ... 0.99606841 0.91495269 1.        ]]


In [7]:
print(assns)

[0 1 1 ... 1 0 1]


Hooray! We have our assignments! Now, we need to be able to compare these assignments to the actual data (we want to see how well we clustered the data into poisonous and edible).

In [9]:
dataT = np.array(data.T)
verify = dataT[0] # @Max I called it verify because I want to verify the clusters
print(verify)
print(verify.shape)

['p' 'e' 'e' ... 'e' 'p' 'e']
(8124,)


Now that we have taken the first column and reshaped the data, we can convert our 'p''s to 1's and 'e''s to 0's so we can compare this vector to our assignments.

In [11]:
bindata = [] # @Max I called it bindata because i'm taking the data and converting it to 0's and 1's, like binary
for i in range(len(verify)):
    if verify[i]=='p':
        bindata.append(0)
    elif verify[i] == 'e':
        bindata.append(1)
bindata = np.array(bindata)
print(bindata)

[0 1 1 ... 1 0 1]


Now, we'll compute the error by taking the norm of the difference between the cluster assignments vector and the actual categorical assignments of the mushrooms.

In [20]:
errvec = assns-bindata # @Max I called it errvec because i'm going to use it to compute the error
n= errvec.shape[0]
err = np.linalg.norm(errvec)/n
print(err)

0.002892008893271967


This does pretty well! But it's also worth noting that we use the column that tells us whether a mushroom is poisonous or not to make our clusters. To get a more accurate representation of how well our clustering algorithm works, we should ignore this column when we cluster and then make the same comparison as we did here. 

In [21]:
data = data.T[1:]
data = data.T
numOfAtts = numOfAtts[1:]
print(data.shape)
print(numOfAtts.shape)

(8124, 22)
(22,)


In [23]:
assns, SandU = spectral_clustering(data,2,"rw", with_eigen = True, numOfAtts=numOfAtts)
print(assns)

Please be a real thing:  [[1.         0.97422476 0.97120975 ... 0.9345569  0.94348149 0.93694924]
 [0.97422476 1.         0.9932159  ... 0.94881715 0.92743871 0.95120949]
 [0.97120975 0.9932159  1.         ... 0.94943981 0.92743871 0.94881715]
 ...
 [0.9345569  0.94881715 0.94943981 ... 1.         0.92623842 0.9958897 ]
 [0.94348149 0.92743871 0.92743871 ... 0.92623842 1.         0.92623842]
 [0.93694924 0.95120949 0.94881715 ... 0.9958897  0.92623842 1.        ]]
[1 1 1 ... 0 0 0]


In [60]:
dataT = np.array(dataA.T)
verify = dataT[0] 
bindata = [] 
for i in range(len(verify)):
    if verify[i]=='p':
        bindata.append(0)
    elif verify[i] == 'e':
        bindata.append(1)
bindata = np.array(bindata)
errvec = assns-bindata 
n= errvec.shape[0]
err = np.linalg.norm(errvec)/n
print(err)

# to check in how many spots the assingments match reality
count=0
for i in range(len(errvec)):
    if errvec[i]==0:
        count +=1
print(count)

0.00567826814114761
5996


Seeing the error is nice and all, but we all know that visuals are where it's at. The only problem is that our data is HUGE. So, why don't we take advantage of the dimension reduction done by spectral clustering, and project our data into \mathbb{R}^2? 

In [59]:
print(verify[0:30])
print(bindata[0:30])
print(assns[0:30])
S, U = SandU
print(U.shape)

['p' 'e' 'e' 'p' 'e' 'e' 'e' 'e' 'p' 'e' 'e' 'e' 'e' 'p' 'e' 'e' 'e' 'p'
 'p' 'p' 'e' 'p' 'e' 'e' 'e' 'p' 'e' 'e' 'e' 'e']
[0 1 1 0 1 1 1 1 0 1 1 1 1 0 1 1 1 0 0 0 1 0 1 1 1 0 1 1 1 1]
[1 1 1 1 0 1 1 1 1 1 1 1 1 1 0 0 0 1 1 1 1 1 1 1 1 1 1 1 0 0]
(8124, 2)
[4.01819435e-15 9.86980042e-01]


In [None]:
# ignore this crap...it's bad and makes no sense i was tripping on shrooms :)
print(SandU)
S, U = SandU

proj_of_d_on_u = []
i = 0
u0 = U[:,0]
u1 = U[:,1]
d = data[i,:]
u0norm = np.sqrt(sum(u0**2))
u1norm = np.sqrt(sum(u1**2))
proj = ((np.dot(d,u0)/(u0norm**2))*u0)+((np.dot(d,u1)/(u1norm**2))*u1)

print(proj)

proj_of_d_on_u.append(proj)

for i in range of len(U):
    ui