In [1]:
import math
import os
import pickle
import sys

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import plotly.express as px
import plotly.graph_objs as go
from plotly.offline import init_notebook_mode, iplot
from sklearn.cluster import KMeans, AgglomerativeClustering, SpectralClustering
from sklearn.decomposition import PCA
from sklearn.metrics import silhouette_score, adjusted_rand_score
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from tqdm import tqdm

from myclass.BonferroniTtest import Bonferroni_Ttest
# Clean_Merge_Dataset takes two dataframes (e.g. illumina normal and tumor),
# merges them and returns the whole dataset together with its labels.
from myclass.CleanMergeDataset import Clean_Merge_Dataset
from myclass.ResultTable import ResultTable

In [4]:
# Read the Illumina frames for the normal and the tumor samples.
# NOTE(review): pd.read_pickle can execute arbitrary code while unpickling;
# only load files that come from a trusted source.
data_normal = pd.read_pickle('data-ready/illumina-27-450-normal')
data_tumor = pd.read_pickle('data-ready/illumina450-27-tumor')

# Merge both frames into one dataset plus its labels and case ids.
dataset, labels, cases_id = Clean_Merge_Dataset().transform(data_normal, data_tumor)

# The former bare `dataset.head()` / `labels.head()` statements were removed:
# only the last expression of a cell is rendered, so they displayed nothing.
# Use display(dataset.head(), labels.head()) if a preview is wanted.

# Bonferroni step: fit on (dataset, labels), then transform the dataset.
# Presumably this keeps only the features that pass a Bonferroni-corrected
# t-test at alpha=0.05 -- confirm in myclass.BonferroniTtest.
Bonferroni_dataset = pd.DataFrame(Bonferroni_Ttest(alpha=0.05).fit(dataset, labels).transform(dataset))
Bonferroni_dataset.head()

Data_normal: (125, 25981)
Data_tumor: (1082, 25981)
All data: (1207, 25981)
{'TCGA-LUSC', 'TCGA-LUAD'}
Features completly 0 values 0 removed
Features completely Nan 2597 removed
Final dataset shape (1206, 19697)
Final dataset shape: (1206, 15698)


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,15688,15689,15690,15691,15692,15693,15694,15695,15696,15697
0,0.145436,0.238829,0.061273,0.574314,0.179827,0.019343,0.015581,0.012865,0.012288,0.040996,...,0.029687,0.027,0.13633,0.642762,0.609087,0.497102,0.012065,0.010047,0.010551,0.234456
1,0.023482,0.127048,0.018317,0.411468,0.599194,0.017732,0.107776,0.01374,0.017746,0.423045,...,0.043639,0.020692,0.021095,0.891056,0.186336,0.590133,0.008278,0.006162,0.012283,0.150736
2,0.022901,0.124989,0.039252,0.904835,0.407803,0.016394,0.018574,0.012854,0.019546,0.032972,...,0.043652,0.02877,0.03539,0.747453,0.708645,0.671824,0.0093,0.00698,0.01282,0.207062
3,0.152088,0.20444,0.055004,0.585254,0.334862,0.014521,0.233927,0.016145,0.014847,0.129278,...,0.030561,0.023523,0.108554,0.656074,0.452591,0.459527,0.008094,0.007547,0.012393,0.239422
4,0.177851,0.179807,0.044693,0.548487,0.257736,0.014981,0.012817,0.017158,0.016132,0.043027,...,0.043748,0.020173,0.072768,0.661479,0.336484,0.28985,0.009491,0.006965,0.010071,0.177258


In [7]:
def Illumina_analysis(df, true_labels, table, n_clusters=3, random_state=None):
    """Cluster ``df`` with three algorithms and record their scores in ``table``.

    KMeans, Agglomerative and Spectral clustering are run in that order. For
    each one the function fits the estimator on ``df``, prints a heading, the
    silhouette score and the adjusted Rand index against ``true_labels``,
    stores both scores in ``table`` and calls ``table.update()``.

    Parameters
    ----------
    df : array-like of shape (n_samples, n_features)
        Data to cluster (a DataFrame or a NumPy array).
    true_labels : sequence
        Ground-truth class of every sample, aligned with the rows of ``df``.
    table : object
        Result collector exposing ``setClusteringAlghorithm(name=...)``,
        ``setSilhouette``, ``setRandIndex`` and ``update``.
    n_clusters : int, default 3
        Number of clusters requested from every algorithm.
    random_state : int or None, default None
        Seed forwarded to the stochastic estimators (KMeans and
        SpectralClustering). ``None`` keeps the previous unseeded behaviour;
        pass an integer to make the scores reproducible across runs.

    Returns
    -------
    None
        Results are printed and stored in ``table`` only, so calling this as
        the last statement of a cell renders no extra output.
    """
    # (name stored in the table, heading printed, unfitted estimator)
    algorithms = [
        ('KMeans', "KMEANS CLUSTERING",
         KMeans(n_clusters=n_clusters, max_iter=600, random_state=random_state)),
        ('Agglomerative', "AGGLOMERATIVE CLUSTERING",
         AgglomerativeClustering(n_clusters=n_clusters)),
        ('Spectral', "SPECTRAL CLUSTERING",
         SpectralClustering(n_clusters=n_clusters, random_state=random_state)),
    ]
    last_position = len(algorithms) - 1

    for position, (name, heading, estimator) in enumerate(algorithms):
        table.setClusteringAlghorithm(name=name)
        clustering = estimator.fit(df)
        print(heading)

        # Internal quality: cohesion/separation of the clusters that were found.
        sil = silhouette_score(df, clustering.labels_)
        print(sil)
        # External quality: agreement with the known classes (adjusted Rand index).
        RI = adjusted_rand_score(true_labels, clustering.labels_)
        print(RI)

        table.setSilhouette(sil)
        table.setRandIndex(RI)
        # A blank line separates the algorithms; none is printed after the last.
        if position < last_position:
            print()
        table.update()

In [8]:
# Result collector shared by all the experiment cells below. The two arguments
# presumably are the omic name and the initial clustering-algorithm name
# (they show up in the first two columns of the final table) -- confirm in
# myclass.ResultTable.
table = ResultTable('Illumina','KMeans')
# Mark every row as computed on the Bonferroni-selected dataset. Being the last
# expression of the cell, the returned object's repr is rendered as output.
table.setBonf(True)

<myclass.ResultTable.ResultTable at 0x26314784e80>

In [9]:
# First experiment: describe every sample (row) by five summary statistics of
# its beta values: min, max, mean, standard deviation and median.
# The reductions run over axis=1 of the underlying array in a single pass
# instead of iterating the frame row by row.
beta_values = Bonferroni_dataset.to_numpy()

df = pd.DataFrame({
    'min_beta': beta_values.min(axis=1),
    'max_beta': beta_values.max(axis=1),
    'mean_beta': beta_values.mean(axis=1),
    # Population standard deviation (ddof=0), i.e. NumPy's default for np.std.
    'std_dev_beta': beta_values.std(axis=1),
    'median_beta': np.median(beta_values, axis=1),
})
df.head()

Unnamed: 0,min_beta,max_beta,mean_beta,std_dev_beta,median_beta
0,0.005524,0.99208,0.199056,0.282626,0.036754
1,0.004696,0.993646,0.189619,0.280057,0.032385
2,0.005203,0.993301,0.224595,0.319448,0.035493
3,0.005182,0.992891,0.216389,0.286012,0.045624
4,0.005381,0.992665,0.169648,0.255723,0.032651


In [10]:
# Experiment: statistical-features dataset, used as-is
# (neither StandardScaler nor MinMaxScaler).
# Record this configuration in the result table.
table.setPca(False)
table.setStandardScaler(False)
table.setMaxMinScaler(False)
table.setStatisticalFeatures(True)
table.setLogarithmTransformation(False)

# Integer ground truth: LUAD -> 0, LUSC -> 1, label False -> 2 (presumably the
# normal samples). Labels matching none of the three are left out.
label_codes = {'TCGA-LUAD': 0, 'TCGA-LUSC': 1, 'False': 2}
true_labels = [label_codes[str(label)] for label in labels if str(label) in label_codes]

print("###############STATISTICS DATASET (NO STD SCALER NO MIN/MAX SCALER)#######")
print()
Illumina_analysis(df, true_labels, table)

###############STATISTICS DATASET (NO STD SCALER NO MIN/MAX SCALER)#######

KMEANS CLUSTERING
0.42066024904413174
0.3724623952536867

AGGLOMERATIVE CLUSTERING
0.41932140598217316
0.38321521196619024

SPECTRAL CLUSTERING
0.416069269792155
0.320818903831565


In [11]:
# Experiment: statistical-features dataset after StandardScaler.
# Record this configuration in the result table.
table.setPca(False)
table.setStandardScaler(True)
table.setMaxMinScaler(False)
table.setStatisticalFeatures(True)
table.setLogarithmTransformation(False)

# Integer ground truth: LUAD -> 0, LUSC -> 1, label False -> 2 (presumably the
# normal samples). Labels matching none of the three are left out.
label_codes = {'TCGA-LUAD': 0, 'TCGA-LUSC': 1, 'False': 2}
true_labels = [label_codes[str(label)] for label in labels if str(label) in label_codes]

# Standardise each statistical feature; fit_transform returns a NumPy array.
scaler = StandardScaler()
scaled_df = scaler.fit_transform(df)

print("###############STATISTICS DATASET (STANDARD SCALER)#######")
print()
Illumina_analysis(scaled_df, true_labels, table)

###############STATISTICS DATASET (STANDARD SCALER)#######

KMEANS CLUSTERING
0.33057495458361513
0.4302439688268274

AGGLOMERATIVE CLUSTERING
0.3129737249849424
0.4641569183579978

SPECTRAL CLUSTERING
0.45597061584781223
-0.0014477098390929927


In [12]:
# Experiment: statistical-features dataset after MinMaxScaler.
# Record this configuration in the result table.
table.setPca(False)
table.setStandardScaler(False)
table.setMaxMinScaler(True)
table.setStatisticalFeatures(True)
table.setLogarithmTransformation(False)

# Integer ground truth: LUAD -> 0, LUSC -> 1, label False -> 2 (presumably the
# normal samples). Labels matching none of the three are left out.
label_codes = {'TCGA-LUAD': 0, 'TCGA-LUSC': 1, 'False': 2}
true_labels = [label_codes[str(label)] for label in labels if str(label) in label_codes]

# Rescale each statistical feature with MinMaxScaler's defaults.
scaler = MinMaxScaler()
scaled_df = scaler.fit_transform(df)

print("###############STATISTICS DATASET (MIN/MAX SCALER)#######")
print()
Illumina_analysis(scaled_df, true_labels, table)

###############STATISTICS DATASET (MIN/MAX SCALER)#######

KMEANS CLUSTERING
0.3781110722080605
0.4851985992100818

AGGLOMERATIVE CLUSTERING
0.3676242914559082
0.4894935194476812

SPECTRAL CLUSTERING
0.32197689222788717
0.22627736151542016


In [13]:
# Second experiment: transform the beta values with log2(b / (1 - b)).
def log_series(x):
    """Return log2(b / (1 - b)) for every value b of ``x`` as a new Series.

    ``x`` must offer ``to_numpy()``. The result is built from the raw array,
    so it carries a fresh default index rather than the index of ``x``.
    Values of exactly 0 or 1 map to -inf / +inf.
    """
    beta = x.to_numpy()
    odds = beta / (1 - beta)
    return pd.Series(np.log2(odds))

# Run log_series over the Bonferroni-selected frame through DataFrame.transform.
log_df = Bonferroni_dataset.transform(log_series)
# Turn +/-inf (produced by beta values of exactly 0 or 1) into NaN, then drop
# every column that contains at least one NaN.
non_finite_as_nan = log_df.replace([np.inf, -np.inf], np.nan)
log_df = non_finite_as_nan.dropna(axis=1)
log_df.describe()

Unnamed: 0,1,2,3,4,5,6,7,8,9,10,...,15687,15688,15689,15690,15691,15692,15693,15695,15696,15697
count,1206.0,1206.0,1206.0,1206.0,1206.0,1206.0,1206.0,1206.0,1206.0,1206.0,...,1206.0,1206.0,1206.0,1206.0,1206.0,1206.0,1206.0,1206.0,1206.0,1206.0
mean,-0.973118,-3.573737,0.099241,-1.282246,-4.319731,-4.146227,-5.266595,-4.653474,-3.530582,-6.037485,...,-3.036138,-3.841257,-3.981521,-2.381613,0.575094,0.622271,-0.476963,-4.630617,-3.853255,-2.207359
std,1.045698,1.061152,0.93634,1.107086,1.042945,1.568529,0.703134,0.940831,0.902582,0.588787,...,1.141547,0.75452,0.905235,1.221481,1.112868,1.088142,0.823121,1.390095,1.56292,0.500812
min,-5.142051,-7.091701,-2.976016,-5.87164,-6.459395,-7.580761,-6.602702,-6.629086,-5.383675,-11.376035,...,-5.842471,-6.180068,-6.044861,-5.853793,-3.686929,-5.088362,-4.492192,-7.552243,-7.185183,-4.270495
25%,-1.591274,-4.217,-0.472761,-1.937919,-5.26216,-4.967817,-5.83968,-5.315149,-4.18112,-6.310283,...,-3.742162,-4.357933,-4.569061,-3.067927,0.013422,-0.006448,-0.890246,-6.325293,-5.296795,-2.524066
50%,-0.899323,-3.292933,0.055585,-1.335687,-4.184155,-4.097285,-5.471275,-4.692706,-3.633571,-6.065675,...,-2.958041,-3.882358,-3.917458,-2.369902,0.582589,0.684758,-0.387007,-4.287456,-3.191535,-2.172991
75%,-0.312349,-2.82828,0.685248,-0.486834,-3.38128,-3.432991,-4.603453,-3.812604,-2.909394,-5.758821,...,-2.340278,-3.234778,-3.220448,-1.753321,1.260854,1.361011,0.031884,-3.51252,-2.721674,-1.870313
max,3.629616,-0.502198,3.519839,3.943763,0.392531,0.868366,-3.420686,-2.444258,1.84708,-3.761292,...,4.392058,-2.139872,-1.274644,3.435751,5.58607,5.640941,2.674802,-2.093369,-1.523587,-0.086903


In [14]:
# Experiment: log-transformed dataset, used as-is
# (neither StandardScaler nor MinMaxScaler).
# Record this configuration in the result table.
table.setPca(False)
table.setStandardScaler(False)
table.setMaxMinScaler(False)
table.setStatisticalFeatures(False)
table.setLogarithmTransformation(True)

# Integer ground truth: LUAD -> 0, LUSC -> 1, label False -> 2 (presumably the
# normal samples). Labels matching none of the three are left out.
label_codes = {'TCGA-LUAD': 0, 'TCGA-LUSC': 1, 'False': 2}
true_labels = [label_codes[str(label)] for label in labels if str(label) in label_codes]

print("###############LOG DATASET (NO STD SCALER NO MIN/MAX SCALER)#######")
print()
Illumina_analysis(log_df, true_labels, table)

###############LOG DATASET (NO STD SCALER NO MIN/MAX SCALER)#######

KMEANS CLUSTERING
0.24094634543080154
0.48991092865817565

AGGLOMERATIVE CLUSTERING
0.24091615548098194
0.501748412130219

SPECTRAL CLUSTERING
-0.01391906456219195
-0.0022607246442117803


In [15]:
# Experiment: log-transformed dataset after StandardScaler.
# Record this configuration in the result table.
table.setPca(False)
table.setStandardScaler(True)
table.setMaxMinScaler(False)
table.setStatisticalFeatures(False)
table.setLogarithmTransformation(True)

# Integer ground truth: LUAD -> 0, LUSC -> 1, label False -> 2 (presumably the
# normal samples). Labels matching none of the three are left out.
label_codes = {'TCGA-LUAD': 0, 'TCGA-LUSC': 1, 'False': 2}
true_labels = [label_codes[str(label)] for label in labels if str(label) in label_codes]

# Standardise each log-transformed column; fit_transform returns a NumPy array.
scaler = StandardScaler()
scaled_df = scaler.fit_transform(log_df)

print("###############LOG DATASET (STANDARD SCALER)#######")
print()
Illumina_analysis(scaled_df, true_labels, table)

###############LOG DATASET (STANDARD SCALER)#######

KMEANS CLUSTERING
0.28559225711352015
0.49697686274302455

AGGLOMERATIVE CLUSTERING
0.2856848681514263
0.4968833892935592

SPECTRAL CLUSTERING
-0.013157614352757445
0.00806220060173182


In [16]:
# Experiment: log-transformed dataset after MinMaxScaler.
# Record this configuration in the result table.
table.setPca(False)
table.setStandardScaler(False)
table.setMaxMinScaler(True)
table.setStatisticalFeatures(False)
table.setLogarithmTransformation(True)

# Integer ground truth: LUAD -> 0, LUSC -> 1, label False -> 2 (presumably the
# normal samples). Labels matching none of the three are left out.
label_codes = {'TCGA-LUAD': 0, 'TCGA-LUSC': 1, 'False': 2}
true_labels = [label_codes[str(label)] for label in labels if str(label) in label_codes]

# Rescale each log-transformed column with MinMaxScaler's defaults.
scaler = MinMaxScaler()
scaled_df = scaler.fit_transform(log_df)

print("###############LOG DATASET (MIN/MAX SCALER)#######")
print()
Illumina_analysis(scaled_df, true_labels, table)

###############LOG DATASET (MIN/MAX SCALER)#######

KMEANS CLUSTERING
0.33558997636969085
0.4993128256090324

AGGLOMERATIVE CLUSTERING
0.33554778066834196
0.501748412130219

SPECTRAL CLUSTERING
0.2574996900259849
-0.000732794392418621


In [17]:
# Third experiment: PCA (which achieves good results on the beta values).
# PCA comes from the notebook's top-level import cell; the former in-cell
# `from sklearn.decomposition import PCA` was removed.

# First on the original (Bonferroni-selected) dataset.
# A float n_components in (0, 1) makes scikit-learn keep as many components as
# are needed to explain that fraction of the variance (here 90%).
transformer = PCA(n_components=0.9)
PCA_df = transformer.fit_transform(Bonferroni_dataset)
print(PCA_df.shape)

(1206, 399)


In [18]:
# Experiment: PCA-reduced dataset, used as-is
# (neither StandardScaler nor MinMaxScaler).
# Record this configuration in the result table.
table.setPca(True)
table.setStandardScaler(False)
table.setMaxMinScaler(False)
table.setStatisticalFeatures(False)
table.setLogarithmTransformation(False)

# Integer ground truth: LUAD -> 0, LUSC -> 1, label False -> 2 (presumably the
# normal samples). Labels matching none of the three are left out.
label_codes = {'TCGA-LUAD': 0, 'TCGA-LUSC': 1, 'False': 2}
true_labels = [label_codes[str(label)] for label in labels if str(label) in label_codes]

print("###############PCA DATASET (NO STD SCALER NO MIN/MAX SCALER)#######")
print()
Illumina_analysis(PCA_df, true_labels, table)

###############PCA DATASET (NO STD SCALER NO MIN/MAX SCALER)#######

KMEANS CLUSTERING
0.10886011610284559
0.5141498918729018

AGGLOMERATIVE CLUSTERING
0.0670759136245856
0.8128855030773817

SPECTRAL CLUSTERING
0.3609867123568902
-0.000732794392418621


In [19]:
# Experiment: PCA-reduced dataset after StandardScaler.
# Record this configuration in the result table.
table.setPca(True)
table.setStandardScaler(True)
table.setMaxMinScaler(False)
table.setStatisticalFeatures(False)
table.setLogarithmTransformation(False)

# Integer ground truth: LUAD -> 0, LUSC -> 1, label False -> 2 (presumably the
# normal samples). Labels matching none of the three are left out.
label_codes = {'TCGA-LUAD': 0, 'TCGA-LUSC': 1, 'False': 2}
true_labels = [label_codes[str(label)] for label in labels if str(label) in label_codes]

# Standardise every principal component; fit_transform returns a NumPy array.
scaler = StandardScaler()
scaled_df = scaler.fit_transform(PCA_df)

print("###############PCA DATASET (STANDARD SCALER)#######")
print()
Illumina_analysis(scaled_df, true_labels, table)

###############PCA DATASET (STANDARD SCALER)#######

KMEANS CLUSTERING
0.3026267411663188
-0.000732794392418621

AGGLOMERATIVE CLUSTERING
0.3248071005429922
-0.0021419266126533215

SPECTRAL CLUSTERING
0.19777147119959243
-2.76491018819624e-05


In [20]:
# Experiment: PCA-reduced dataset after MinMaxScaler.
# Record this configuration in the result table.
table.setPca(True)
table.setStandardScaler(False)
table.setMaxMinScaler(True)
table.setStatisticalFeatures(False)
table.setLogarithmTransformation(False)

# Integer ground truth: LUAD -> 0, LUSC -> 1, label False -> 2 (presumably the
# normal samples). Labels matching none of the three are left out.
label_codes = {'TCGA-LUAD': 0, 'TCGA-LUSC': 1, 'False': 2}
true_labels = [label_codes[str(label)] for label in labels if str(label) in label_codes]

# Rescale every principal component with MinMaxScaler's defaults.
scaler = MinMaxScaler()
scaled_df = scaler.fit_transform(PCA_df)

print("###############PCA DATASET (MIN/MAX SCALER)#######")
print()
Illumina_analysis(scaled_df, true_labels, table)

###############PCA DATASET (MIN/MAX SCALER)#######

KMEANS CLUSTERING
0.02628118657402271
0.6045314652383712

AGGLOMERATIVE CLUSTERING
-0.1254552376084501
0.4537334740998066

SPECTRAL CLUSTERING
-0.20619427472455198
0.15886303538584412


In [21]:
# Then PCA on the dataset transformed through log2.
# PCA comes from the notebook's top-level import cell; the duplicated in-cell
# import was removed.
# NOTE: this overwrites the PCA_df computed from the untransformed beta values,
# so the cells that follow operate on the log-transformed projection.
transformer = PCA(n_components=0.9)
PCA_df = transformer.fit_transform(log_df)
print(PCA_df.shape)

(1206, 287)


In [22]:
# Experiment: PCA of the log-transformed dataset, used as-is
# (neither StandardScaler nor MinMaxScaler).
# Record this configuration in the result table.
table.setPca(True)
table.setStandardScaler(False)
table.setMaxMinScaler(False)
table.setStatisticalFeatures(False)
table.setLogarithmTransformation(True)

# Integer ground truth: LUAD -> 0, LUSC -> 1, label False -> 2 (presumably the
# normal samples). Labels matching none of the three are left out.
label_codes = {'TCGA-LUAD': 0, 'TCGA-LUSC': 1, 'False': 2}
true_labels = [label_codes[str(label)] for label in labels if str(label) in label_codes]

# Fit KMeans once; its assignments are kept as plain Python ints in pred_labels
# and the same fitted model is scored below.
kmeans_model = KMeans(n_clusters=3, max_iter=600).fit(PCA_df)
pred_labels = [int(cluster_id) for cluster_id in kmeans_model.labels_]

print("###############LOG-PCA DATASET (NO STD SCALER NO MIN/MAX SCALER)#######")
print()


def _score_and_record(algorithm_name, heading, fitted_model):
    """Print and store silhouette / adjusted Rand index of one fitted model.

    Returns whatever ``table.update()`` returns, so that using the call as the
    last expression of the cell renders the same output as calling
    ``table.update()`` directly.
    """
    table.setClusteringAlghorithm(name=algorithm_name)
    print(heading)
    silhouette = silhouette_score(PCA_df, fitted_model.labels_)
    print(silhouette)
    rand_index = adjusted_rand_score(true_labels, fitted_model.labels_)
    print(rand_index)
    table.setSilhouette(silhouette)
    table.setRandIndex(rand_index)
    print()
    return table.update()


# KMeans
_score_and_record('KMeans', "KMEANS CLUSTERING", kmeans_model)

# Agglomerative
# NOTE(review): unlike the other experiments, Spectral clustering is not run
# for this configuration -- confirm whether that is intentional.
clustering = AgglomerativeClustering(n_clusters=3).fit(PCA_df)
_score_and_record('Agglomerative', "AGGLOMERATIVE CLUSTERING", clustering)


###############LOG-PCA DATASET (NO STD SCALER NO MIN/MAX SCALER)#######

KMEANS CLUSTERING
0.28544795888053814
0.48991092865817565

AGGLOMERATIVE CLUSTERING
0.28545747067998994
0.501748412130219



<myclass.ResultTable.ResultTable at 0x26314784e80>

In [23]:
# Experiment: PCA of the log-transformed dataset after StandardScaler.
# Record this configuration in the result table.
table.setPca(True)
table.setStandardScaler(True)
table.setMaxMinScaler(False)
table.setStatisticalFeatures(False)
table.setLogarithmTransformation(True)

# Integer ground truth: LUAD -> 0, LUSC -> 1, label False -> 2 (presumably the
# normal samples). Labels matching none of the three are left out.
label_codes = {'TCGA-LUAD': 0, 'TCGA-LUSC': 1, 'False': 2}
true_labels = [label_codes[str(label)] for label in labels if str(label) in label_codes]

# Standardise every principal component; fit_transform returns a NumPy array.
scaler = StandardScaler()
scaled_df = scaler.fit_transform(PCA_df)

print("###############LOG-PCA DATASET (STANDARD SCALER)#######")
print()
Illumina_analysis(scaled_df, true_labels, table)

###############LOG-PCA DATASET (STANDARD SCALER)#######

KMEANS CLUSTERING
0.372615220893088
-0.00038492271575386937

AGGLOMERATIVE CLUSTERING
0.3875108424454306
-0.00251331573347103

SPECTRAL CLUSTERING
0.2960494921054117
-0.0007542007874733234


In [24]:
# Experiment: PCA of the log-transformed dataset after MinMaxScaler.
# Record this configuration in the result table.
table.setPca(True)
table.setStandardScaler(False)
table.setMaxMinScaler(True)
table.setStatisticalFeatures(False)
table.setLogarithmTransformation(True)

# Integer ground truth: LUAD -> 0, LUSC -> 1, label False -> 2 (presumably the
# normal samples). Labels matching none of the three are left out.
label_codes = {'TCGA-LUAD': 0, 'TCGA-LUSC': 1, 'False': 2}
true_labels = [label_codes[str(label)] for label in labels if str(label) in label_codes]

# Rescale every principal component with MinMaxScaler's defaults.
scaler = MinMaxScaler()
scaled_df = scaler.fit_transform(PCA_df)

print("###############LOG-PCA DATASET (MIN/MAX SCALER)#######")
print()
Illumina_analysis(scaled_df, true_labels, table)

###############LOG-PCA DATASET (MIN/MAX SCALER)#######

KMEANS CLUSTERING
0.1321749772330635
0.03665608500993342

AGGLOMERATIVE CLUSTERING
-0.005997701630487908
0.45199376403029445

SPECTRAL CLUSTERING
-0.009092087147845433
0.4637317615945232


In [25]:
# Fetch the collected results from the table object; judging by the rendered
# output this is a DataFrame with one row per table.update() call made above.
df_result = table.getDF()
# Last expression of the cell, so the frame is rendered as the cell output.
df_result

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,Unnamed: 4_level_0,Unnamed: 5_level_0,Unnamed: 6_level_0,Unnamed: 7_level_0,Silhouette,RandIndex
Omnic Name,Cluster Algorithm,BonferroniTtest,MaxMinScaler,StandardScaler,PCA,Logarithm Transformation,Statistical Features,Unnamed: 8_level_1,Unnamed: 9_level_1
Illumina,KMeans,Yes,No,No,No,No,Yes,0.42066,0.372462
Illumina,Agglomerative,Yes,No,No,No,No,Yes,0.419321,0.383215
Illumina,Spectral,Yes,No,No,No,No,Yes,0.416069,0.320819
Illumina,KMeans,Yes,No,Yes,No,No,Yes,0.330575,0.430244
Illumina,Agglomerative,Yes,No,Yes,No,No,Yes,0.312974,0.464157
Illumina,Spectral,Yes,No,Yes,No,No,Yes,0.455971,-0.001448
Illumina,KMeans,Yes,Yes,No,No,No,Yes,0.378111,0.485199
Illumina,Agglomerative,Yes,Yes,No,No,No,Yes,0.367624,0.489494
Illumina,Spectral,Yes,Yes,No,No,No,Yes,0.321977,0.226277
Illumina,KMeans,Yes,No,No,No,Yes,No,0.240946,0.489911
