# Data Mining Project

#### load packages

In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
plt.rcParams["figure.figsize"] = (20,10)

#### load data

In [3]:
gene_df = pd.read_csv('genedata.csv', index_col = 0)
gene_df.head()

Unnamed: 0_level_0,class,f1,f2,f3,f4,f5,f6,f7,f8,f9,...,f6991,f6992,f6993,f6994,f6995,f6996,f6997,f6998,f6999,f7000
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,5,2.3383,10.544,11.4653,10.4441,10.6311,7.7036,9.6444,7.8501,7.7853,...,11.9394,11.1349,5.7066,8.2176,9.9062,9.0815,10.5098,11.8648,8.4341,6.7618
2,4,3.9151,9.5815,10.3992,9.8333,8.9781,7.0265,9.2761,7.3903,8.2744,...,11.6547,7.7178,9.3558,7.5026,10.2106,8.9278,10.2466,11.3329,7.8209,7.435
3,5,2.9322,9.3102,11.0756,9.849,9.4291,8.0618,8.3874,7.9274,10.0544,...,11.9372,11.2369,6.586,9.1668,10.3315,9.7551,11.4392,10.7036,8.0041,8.538
4,5,3.947,8.5315,10.9836,9.3676,9.6856,7.5842,8.5774,8.3737,9.0443,...,13.2541,11.8221,9.5261,7.4231,11.2862,9.8992,10.4067,11.9044,7.9843,8.537
5,1,4.4618,9.1392,10.9183,10.049,9.829,6.5658,9.8122,9.0646,8.3383,...,11.6416,11.5142,6.5539,7.8319,11.01,10.1013,11.2682,10.8969,8.1959,8.2906


#### check for missing values

In [4]:
gene_cols_with_missing = [col for col in gene_df.columns if gene_df[col].isnull().any()]

print("gene dataframe has "+str(len(gene_cols_with_missing))+" columns with missing values.")

gene dataframe has 0 columns with missing values.


It seems that there are no missing values.

#### Save labels in seperate array

In [5]:
#print(gene_df['class'])
print(len(gene_df.columns))
if len(gene_df.columns) == 7001:
    true_labels = gene_df['class']
    gene_df = gene_df.drop('class',axis=1)

gene = gene_df.to_numpy()
gene = gene.astype(np.float)
#print(gene_labels)

7001


## Gene dataset

### Processing

In [6]:
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MaxAbsScaler
from sklearn.decomposition import PCA
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.metrics.pairwise import euclidean_distances
from sklearn.neighbors import DistanceMetric

In [7]:
gene_std = StandardScaler().fit_transform(gene)

In [8]:
gene_mas = MaxAbsScaler().fit_transform(gene)

In [9]:
gene_cs = cosine_similarity(gene)
gene_cs_std = cosine_similarity(gene_std)
gene_cs_mas = cosine_similarity(gene_mas)

In [10]:
def euclidean_similarity(data):
    return 1/(1+euclidean_distances(data, data))

In [11]:
gene_ed = euclidean_similarity(gene)
gene_ed_std = euclidean_similarity(gene_std)
gene_ed_mas = euclidean_similarity(gene_mas)

## Clustering

In [14]:
from sklearn.cluster import KMeans
from sklearn.metrics.cluster import normalized_mutual_info_score
from sklearn.cluster import SpectralClustering
from sklearn.cluster import AgglomerativeClustering

def get_nmi(labels1,labels2):
    return normalized_mutual_info_score(labels1,labels2,average_method='geometric')

### K-means

In [15]:
def perform_KMeans(data):
    kmeans_list = []
    for i in range(1,9):
        kmeans = KMeans(n_clusters=i).fit(data)
        labels = kmeans.labels_
        kmeans_list.append((i,get_nmi(true_labels,labels),labels))
    sorted_kmeans_list = sorted(kmeans_list, key=lambda x:x[1], reverse=True)
    return sorted_kmeans_list

In [16]:
kmeans_0 = perform_KMeans(gene)
kmeans_ss = perform_KMeans(gene_std)
kmeans_mas = perform_KMeans(gene_mas)

It seems that 4-8 clusters is optimal, let's see if we can find a better clustering by chance:

In [17]:
for i in range(3):
    print(i)
    print(kmeans_0[i][0:2])
    print(kmeans_ss[i][0:2])
    print(kmeans_mas[i][0:2])

0
(1, 2.625)
(1, 2.625)
(1, 2.625)
1
(6, 0.8824258875040402)
(5, 0.788158868791317)
(5, 0.8446159380386106)
2
(7, 0.8586095137185736)
(7, 0.7847710683634179)
(7, 0.837214360827197)


### Hierarchical = garbage

In [18]:
def perform_hierarchical(data,linkage):
    hier_list = []
    for i in range(1,9):
        hier = AgglomerativeClustering(n_clusters=i,linkage=linkage).fit(data)
        labels = hier.labels_
        hier_list.append((i,get_nmi(true_labels,labels),labels))
    sorted_hier_list = sorted(hier_list, key=lambda x:x[1], reverse=True)
    return sorted_hier_list  

In [19]:
linkages = ['ward','complete','average','single']
hier_0 = []
hier_ss = []
hier_mas = []
for linkage in linkages:
    print(linkage)
    hier_0.append((linkage,perform_hierarchical(gene,linkage)))
    hier_ss.append((linkage,perform_hierarchical(gene_std,linkage)))
    hier_mas.append((linkage,perform_hierarchical(gene_mas,linkage)))

ward
complete
average
single


In [20]:
for i in range(4):
    print("============================================")
    print(linkages[i])
    print("0")
    for j in range(3):
        print(hier_0[i][1][j][0:2])
    print("ss")
    for j in range(3):
        print(hier_ss[i][1][j][0:2])
    print("mas")
    for j in range(3):
        print(hier_mas[i][1][j][0:2])

ward
0
(1, 2.625)
(6, 0.9179157442581702)
(7, 0.8878777908498132)
ss
(1, 2.625)
(7, 0.8378882021311446)
(8, 0.8282447174039829)
mas
(1, 2.625)
(7, 0.88845314017232)
(6, 0.8512060961094837)
complete
0
(1, 2.625)
(8, 0.6100507558366003)
(7, 0.6003261469427305)
ss
(1, 2.625)
(8, 0.5083856029139342)
(7, 0.49403066238187526)
mas
(1, 2.625)
(8, 0.6181491399834758)
(7, 0.5279598097586954)
average
0
(1, 2.625)
(8, 0.046350697557658097)
(7, 0.04604423405299482)
ss
(1, 2.625)
(8, 0.0765473397628901)
(7, 0.07449474820534495)
mas
(1, 2.625)
(8, 0.0493062446937199)
(7, 0.04404218670429313)
single
0
(1, 2.625)
(8, 0.03600217293124191)
(7, 0.03468494456958169)
ss
(1, 2.625)
(8, 0.03600217293124191)
(7, 0.03155397027375823)
mas
(1, 2.625)
(8, 0.03600217293124191)
(7, 0.031643766137050604)


### Spectral = garbage

nearest neighbour 

In [21]:
def perform_spectral_nn(data):
    spectral_list = []
    for nc in range(2,8):
        for nn in range(5,9,2):
            spectral = SpectralClustering(n_clusters=nc, affinity='nearest_neighbors',n_neighbors=nn,n_jobs=3).fit(data)
            labels = spectral.labels_
            spectral_list.append((nc,nn,get_nmi(true_labels,labels),labels))
    sorted_spectral_list = sorted(spectral_list, key=lambda x:x[2], reverse=True)
    return sorted_spectral_list 

In [22]:
spectral_0 = perform_spectral_nn(gene)
spectral_ss = perform_spectral_nn(gene_std)
spectral_mas = perform_spectral_nn(gene_mas)

In [23]:
print("0")
for i in range(3):
    print(spectral_0[i][0:3])

print("ss")
for i in range(3):
    print(spectral_ss[i][0:3])

print("mas")
for i in range(3):
    print(spectral_mas[i][0:3])

0
(5, 5, 0.9847702095491678)
(5, 7, 0.9762785050842379)
(6, 5, 0.9372444167772979)
ss
(5, 7, 0.9665000951941088)
(5, 5, 0.9485802550113134)
(6, 7, 0.9247786084922212)
mas
(5, 7, 0.9774353402417695)
(5, 5, 0.9759616118626921)
(6, 5, 0.9323883209786678)


RBF

In [24]:
def perform_spectral_rbf(data):
    spectral_list = []
    for nc in range(2,8):
        spectral = SpectralClustering(n_clusters=nc, affinity='rbf',n_jobs=3).fit(data)
        labels = spectral.labels_
        spectral_list.append((nc,get_nmi(true_labels,labels),labels))
    sorted_spectral_list = sorted(spectral_list, key=lambda x:x[1], reverse=True)
    return sorted_spectral_list 

In [25]:
spectral_rbf_0 = perform_spectral_rbf(gene)
spectral_rbf_ss = perform_spectral_rbf(gene_std)
spectral_rbf_mas = perform_spectral_rbf(gene_mas)

In [26]:
print("0")
for i in range(3):
    print(spectral_rbf_0[i][0:2])

print("ss")
for i in range(3):
    print(spectral_rbf_ss[i][0:2])

print("mas") 
for i in range(3):
    print(spectral_rbf_mas[i][0:2])

0
(6, 0.015463222435514722)
(7, 0.010027876006086229)
(5, 0.008843401879973015)
ss
(7, 0.007491396037058029)
(6, 0.006744086073076299)
(2, 0.006721349129332039)
mas
(7, 0.031643766137050604)
(6, 0.03005573358925145)
(5, 0.028462663673074918)


Precomputed

In [27]:
def perform_spectral_pc(data):
    spectral_list = []
    for nc in range(2,8):
        spectral = SpectralClustering(n_clusters=nc, affinity='precomputed',n_jobs=3).fit(data)
        labels = spectral.labels_
        spectral_list.append((nc,get_nmi(true_labels,labels),labels))
    sorted_spectral_list = sorted(spectral_list, key=lambda x:x[1], reverse=True)
    return sorted_spectral_list 

In [28]:
spectral_pc_cs = perform_spectral_pc(gene_cs)
#spectral_pc_cs_ss = perform_spectral_pc(ms_cs_ss)
spectral_pc_cs_mas = perform_spectral_pc(gene_cs_mas)
spectral_pc_ed = perform_spectral_pc(gene_ed)
spectral_pc_ed_ss = perform_spectral_pc(gene_ed_std)
spectral_pc_ed_mas = perform_spectral_pc(gene_ed_mas)

In [29]:
print("cs 0")
for i in range(3):
    print(spectral_pc_cs[0][0:2])
#print(spectral_pc_cs_ss[0][0:2])
print("cs mas")
for i in range(3):
    print(spectral_pc_cs_mas[0][0:2])
print("ed 0")
for i in range(3):
    print(spectral_pc_ed[0][0:2])
print("ed ss")
for i in range(3):
    print(spectral_pc_ed_ss[0][0:2])
print("ed mas")
for i in range(3):
    print(spectral_pc_ed_mas[0][0:2])

cs 0
(6, 0.7762300398086431)
(6, 0.7762300398086431)
(6, 0.7762300398086431)
cs mas
(6, 0.7574254553108777)
(6, 0.7574254553108777)
(6, 0.7574254553108777)
ed 0
(5, 0.8001183745485337)
(5, 0.8001183745485337)
(5, 0.8001183745485337)
ed ss
(7, 0.6660144100221334)
(7, 0.6660144100221334)
(7, 0.6660144100221334)
ed mas
(5, 0.7576765658439144)
(5, 0.7576765658439144)
(5, 0.7576765658439144)
