In [1]:
import os
import pickle
import sys

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import plotly.express as px
import plotly.graph_objs as go
from plotly.offline import init_notebook_mode, iplot
from sklearn.cluster import KMeans, AgglomerativeClustering, SpectralClustering
from sklearn.decomposition import PCA
from sklearn.metrics import silhouette_score, adjusted_rand_score
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from tqdm import tqdm

from myclass.BonferroniTtest import Bonferroni_Ttest
# Clean_Merge_Dataset takes two dataframes (e.g. Illumina normal and tumor),
# merges them and returns the whole dataset together with the labels.
from myclass.CleanMergeDataset import Clean_Merge_Dataset
from myclass.ResultTable import ResultTable

In [2]:
# Load the pickled miRNA expression tables (normal first, then tumor).
# NOTE(review): unpickling runs arbitrary code from the file -- only load trusted data.
data_normal, data_tumor = (
    pd.read_pickle(pickle_path)
    for pickle_path in ('data-ready//miRNA_dataframe_normal',
                        'data-ready//miRNA_dataframe')
)

# Merge both tables; the transformer hands back the merged dataset, its labels
# and the case identifiers.
merged_dataset, merged_labels, merged_cases = Clean_Merge_Dataset().transform(data_normal, data_tumor)
dataset, labels, cases_id = merged_dataset, merged_labels, merged_cases

print(dataset.head())
labels = pd.DataFrame(labels)

# Bonferroni feature selection at alpha=0.05
# (presumably a Bonferroni-corrected t-test -- confirm in myclass.BonferroniTtest).
bonferroni_selector = Bonferroni_Ttest(alpha=0.05).fit(dataset, labels)
Bonferroni_dataset = pd.DataFrame(bonferroni_selector.transform(dataset))
Bonferroni_dataset.head()

Data_normal: (136, 1884)
Data_tumor: (623, 1884)
All data: (759, 1884)
{'TCGA-LUSC', 'TCGA-LUAD'}
Features completly 0 values 287 removed
Features completely Nan 0 removed
Final dataset shape (558, 1596)
   hsa-let-7a-1  hsa-let-7a-2  hsa-let-7a-3    hsa-let-7b   hsa-let-7c  \
0   6182.784984   6271.883461   6284.689112  13527.096668  1690.346000   
1  18145.485059  18149.595704  18265.789948  22571.828048  1234.563841   
2   5918.739305   5848.764062   6131.256710  11719.557368   342.101188   
3   6498.141197   6469.266585   6615.444308  10226.575471   542.752472   
4   7494.708394   7488.796575   7498.508849  14368.464669  2484.653024   

    hsa-let-7d   hsa-let-7e  hsa-let-7f-1  hsa-let-7f-2  hsa-let-7g  ...  \
0   359.279688  1506.738208   3236.042246   3288.888103  604.029957  ...   
1  1016.973677   952.299522   8729.914701   8874.883463  655.236880  ...   
2   510.560106   731.500457   1276.400266   1302.317023  687.441971  ...   
3   334.313867   893.759474   2850.465603   289

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,225,226,227,228,229,230,231,232,233,234
0,6182.784984,6271.883461,6284.689112,359.279688,1506.738208,3236.042246,3288.888103,604.029957,0.901806,2185.618099,...,0.0,3.246503,3843.318705,3385.201032,353.868849,6762.285805,1.983974,2.525058,2.344697,509.881364
1,18145.485059,18149.595704,18265.789948,1016.973677,952.299522,8729.914701,8874.883463,655.23688,3.014473,3379.224579,...,0.0,4.110645,7210.620154,6607.999535,453.267168,7776.519007,13.702151,11.509807,1.644258,408.872198
2,5918.739305,5848.764062,6131.25671,510.560106,731.500457,1276.400266,1302.317023,687.441971,0.0,4471.288445,...,0.0,1.295838,4164.822797,4019.041041,104.962865,4306.71704,5.183351,12.310459,1.295838,88.116973
3,6498.141197,6469.266585,6615.444308,334.313867,893.759474,2850.465603,2897.386848,628.473977,2.255829,7843.968815,...,0.0,3.609326,2611.347723,2388.471811,131.289251,2762.037104,2.255829,6.767487,1.804663,278.369306
4,7494.708394,7488.796575,7498.508849,514.117107,807.385551,3075.623776,3113.206053,533.541655,22.591594,8697.974674,...,0.211136,3.800455,2872.299434,2694.522594,421.639368,2201.730263,3.800455,5.067273,4.645001,1064.1274


In [3]:
#drop 'CPTAC-3' labeled samples (currently disabled: both statements below are commented out)

#Bonferroni_dataset = Bonferroni_dataset[labels!='CPTAC-3']
#labels = labels[labels!='CPTAC-3']

In [4]:
# Report the (rows, columns) of the Bonferroni-selected matrix, then preview
# its first five rows as the cell output.
selected_shape = Bonferroni_dataset.shape
print(selected_shape)
Bonferroni_dataset.head(5)

(558, 235)


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,225,226,227,228,229,230,231,232,233,234
0,6182.784984,6271.883461,6284.689112,359.279688,1506.738208,3236.042246,3288.888103,604.029957,0.901806,2185.618099,...,0.0,3.246503,3843.318705,3385.201032,353.868849,6762.285805,1.983974,2.525058,2.344697,509.881364
1,18145.485059,18149.595704,18265.789948,1016.973677,952.299522,8729.914701,8874.883463,655.23688,3.014473,3379.224579,...,0.0,4.110645,7210.620154,6607.999535,453.267168,7776.519007,13.702151,11.509807,1.644258,408.872198
2,5918.739305,5848.764062,6131.25671,510.560106,731.500457,1276.400266,1302.317023,687.441971,0.0,4471.288445,...,0.0,1.295838,4164.822797,4019.041041,104.962865,4306.71704,5.183351,12.310459,1.295838,88.116973
3,6498.141197,6469.266585,6615.444308,334.313867,893.759474,2850.465603,2897.386848,628.473977,2.255829,7843.968815,...,0.0,3.609326,2611.347723,2388.471811,131.289251,2762.037104,2.255829,6.767487,1.804663,278.369306
4,7494.708394,7488.796575,7498.508849,514.117107,807.385551,3075.623776,3113.206053,533.541655,22.591594,8697.974674,...,0.211136,3.800455,2872.299434,2694.522594,421.639368,2201.730263,3.800455,5.067273,4.645001,1064.1274


In [5]:
# Accumulator for the clustering scores produced by the cells below.
# NOTE(review): the two positional arguments are presumably the omic name and the
# initial clustering-algorithm name -- confirm against myclass.ResultTable.
table = ResultTable('miRNA','KMeans')
# Record that the Bonferroni filter was applied. Being the last expression of the
# cell, the value returned by setBonf is what the notebook displays.
table.setBonf(True)

<myclass.ResultTable.ResultTable at 0x13cd4993460>

In [6]:
# Filter on counts per million: a feature (column) is kept only when at least
# `min_samples` samples have a value at or above `thresh`; every other column
# is dropped.
thresh=0.5
min_samples = 2

# Count, per column, how many samples reach the threshold in a single vectorised
# pass (NaN never satisfies >=, exactly like the former element-wise comparison).
keep = ((Bonferroni_dataset >= thresh).sum(axis=0) >= min_samples).to_numpy()

# One message per removed column, so the printed log is unchanged.
for _ in range(int((~keep).sum())):
    print('dropped 1 column')

# Build a new frame instead of dropping in place.
Bonferroni_dataset = Bonferroni_dataset.loc[:, keep]
print(Bonferroni_dataset.shape)
# `df` is the working alias consumed by the first analysis cells.
df = Bonferroni_dataset
print(type(df))

dropped 1 column
dropped 1 column
dropped 1 column
dropped 1 column
dropped 1 column
dropped 1 column
dropped 1 column
(558, 228)
<class 'pandas.core.frame.DataFrame'>


In [7]:
def miRNA_analysis(df, true_labels, table, n_clusters=3, random_state=None):
    """Cluster `df` with KMeans, Agglomerative and Spectral clustering and log the scores.

    For each algorithm, in that order, the function fits the estimator on `df`,
    prints a banner, the silhouette score and the adjusted Rand index, stores
    both scores in `table` and calls `table.update()`.

    Parameters
    ----------
    df : array-like of shape (n_samples, n_features)
        Feature matrix to cluster (DataFrame or ndarray).
    true_labels : sequence of int
        Ground-truth class of every sample, same length as `df`.
    table : ResultTable
        Result accumulator; its clustering-algorithm name, silhouette and
        Rand-index fields are set before each `update()` call.
    n_clusters : int, default 3
        Number of clusters requested from every algorithm.
    random_state : int or None, default None
        Seed forwarded to the stochastic estimators (KMeans and
        SpectralClustering). None keeps the previous unseeded behaviour;
        pass an int to make the scores reproducible across runs.
    """
    # (name stored in the table, banner printed, unfitted estimator)
    algorithms = (
        ('KMeans', "KMEANS CLUSTERING",
         KMeans(n_clusters=n_clusters, max_iter=600, random_state=random_state)),
        ('Agglomerative', "AGGLOMERATIVE CLUSTERING",
         AgglomerativeClustering(n_clusters=n_clusters)),
        ('Spectral', "SPECTRAL CLUSTERING",
         SpectralClustering(n_clusters=n_clusters, random_state=random_state)),
    )
    last_position = len(algorithms) - 1

    for position, (name, banner, estimator) in enumerate(algorithms):
        table.setClusteringAlghorithm(name=name)
        clustering = estimator.fit(df)
        print(banner)

        # Internal quality: how well separated the found clusters are.
        sil = silhouette_score(df, clustering.labels_)
        print(sil)
        # External quality: agreement with the known classes. This is the
        # *adjusted* Rand index, stored in the table's RandIndex field.
        ari = adjusted_rand_score(true_labels, clustering.labels_)
        print(ari)

        table.setSilhouette(sil)
        table.setRandIndex(ari)
        # Blank separator line between algorithms (none after the last one).
        if position != last_position:
            print()
        table.update()

In [8]:
################################DATASET (NO STD SCALER NO MIN/MAX SCALER)##########################
table.setPca(False)
table.setStandardScaler(False)
table.setMaxMinScaler(False)
table.setStatisticalFeatures(False)
table.setLogarithmTransformation(False)

# np.asarray accepts both the DataFrame built when the data was loaded and the
# ndarray left behind by a previous run of this cell, so the cell can be re-run
# safely (calling .to_numpy() a second time would fail on an ndarray).
labels = np.asarray(labels)
print(len(labels))
print(labels[0])

# Encode the class of every sample as an integer:
#   'TCGA-LUAD' -> 0, 'TCGA-LUSC' -> 1, False -> 2
# Each `label` is a one-element row of the label array, hence `== False`
# (an element-wise comparison) rather than `is False`.
true_labels = []
for label in labels:
    if label == 'TCGA-LUAD':
        true_labels.append(0)
    elif label == 'TCGA-LUSC':
        true_labels.append(1)
    elif label == False:
        true_labels.append(2)

# A label outside the three known values would be skipped above and leave
# true_labels shorter than the dataset; fail here with a clear message instead
# of later inside the scoring function.
if len(true_labels) != len(labels):
    raise ValueError(
        "%d sample(s) carry a label other than 'TCGA-LUAD', 'TCGA-LUSC' or False"
        % (len(labels) - len(true_labels))
    )
print(len(true_labels))

# Use the filtered dataset by its own name: the `df` alias is re-bound to the
# statistics table further down, which would make a re-run of this cell score
# the wrong data.
print(Bonferroni_dataset.shape)
print("###############DATASET (NO STD SCALER NO MIN/MAX SCALER)#######")
print()
miRNA_analysis(Bonferroni_dataset, true_labels, table)

558
['TCGA-LUAD']
558
(558, 228)
###############DATASET (NO STD SCALER NO MIN/MAX SCALER)#######

KMEANS CLUSTERING
0.5360397871627334
0.09745578652439597

AGGLOMERATIVE CLUSTERING
0.6325626450552725
0.0847010512251581

SPECTRAL CLUSTERING
-0.021495451878949596
0.0002750450474265977


In [9]:
################################DATASET (STANDARD SCALER)##########################
table.setPca(False)
table.setStandardScaler(True)
table.setMaxMinScaler(False)
table.setStatisticalFeatures(False)
table.setLogarithmTransformation(False)

# Standardise the filtered dataset. It is referenced by its own name because
# the `df` alias is re-bound to the statistics table further down.
scaler = StandardScaler()
scaled_df = scaler.fit_transform(Bonferroni_dataset)

# true_labels was already encoded from `labels` in the unscaled-dataset cell and
# is not modified afterwards, so it is reused instead of being rebuilt here.
print("###############DATASET (STANDARD SCALER)#######")
print()
miRNA_analysis(scaled_df, true_labels, table)

###############DATASET (STANDARD SCALER)#######

KMEANS CLUSTERING
0.19802149620589454
0.283009787681506

AGGLOMERATIVE CLUSTERING
0.15879420393299123
0.38288884171642773

SPECTRAL CLUSTERING
0.40610754976195934
0.008316936154180663


In [10]:
################################DATASET (MIN/MAX SCALER)##########################
table.setPca(False)
table.setStandardScaler(False)
table.setMaxMinScaler(True)
table.setStatisticalFeatures(False)
table.setLogarithmTransformation(False)

# Min/max-scale the filtered dataset. It is referenced by its own name because
# the `df` alias is re-bound to the statistics table further down.
scaler = MinMaxScaler()
scaled_df = scaler.fit_transform(Bonferroni_dataset)

# true_labels was already encoded from `labels` in the unscaled-dataset cell and
# is not modified afterwards, so it is reused instead of being rebuilt here.
print("###############DATASET (MIN/MAX SCALER)#######")
print()
miRNA_analysis(scaled_df, true_labels, table)

###############DATASET (MIN/MAX SCALER)#######

KMEANS CLUSTERING
0.049683488057548804
0.21156587181215494

AGGLOMERATIVE CLUSTERING
0.14600175183602387
0.3536882246504176

SPECTRAL CLUSTERING
0.4259060807198584
0.02294667609022335


In [11]:
# First experiment: describe every sample (row) with five summary statistics of
# its counts-per-million values: min, max, mean, standard deviation and median.
counts = Bonferroni_dataset.to_numpy()

# Row-wise reductions over the whole matrix replace the per-row Python loop.
# NOTE: from here on `df` is the per-sample statistics table, no longer the
# filtered expression matrix.
df = pd.DataFrame({
    'min_count': counts.min(axis=1),
    'max_count': counts.max(axis=1),
    'mean_count': counts.mean(axis=1),
    # ndarray.std defaults to ddof=0 (population std), the same as np.std.
    'std_dev_count': counts.std(axis=1),
    'median_count': np.median(counts, axis=1),
})
df.head()

Unnamed: 0,min_count,max_count,mean_count,std_dev_count,median_count
0,0.0,44314.047241,1076.901658,4321.09894,5.951923
1,0.0,82406.382791,1718.45051,8108.934145,7.125119
2,0.0,154975.077799,1591.081412,10588.350173,5.83127
3,0.0,154677.235978,1688.286216,11576.740875,3.609326
4,0.0,189713.224,2276.431799,15955.485988,4.539433


In [12]:
################################STATISTICS DATASET (NO STD SCALER NO MIN/MAX SCALER)##########################
table.setPca(False)
table.setStandardScaler(False)
table.setMaxMinScaler(False)
table.setStatisticalFeatures(True)
table.setLogarithmTransformation(False)

# `df` is the per-sample statistics table built in the previous cell.
# true_labels was already encoded from `labels` in the first analysis cell and
# is not modified afterwards, so it is reused instead of being rebuilt here.
print("###############STATISTICS DATASET (NO STD SCALER NO MIN/MAX SCALER)#######")
print()
miRNA_analysis(df, true_labels, table)

###############STATISTICS DATASET (NO STD SCALER NO MIN/MAX SCALER)#######

KMEANS CLUSTERING
0.5822493009647378
0.06799005481569453

AGGLOMERATIVE CLUSTERING
0.5133829056155359
0.044122236104546374

SPECTRAL CLUSTERING
-0.02454314821956624
-0.008318277641972233


In [13]:
################################STATISTICS DATASET (STANDARD SCALER)##########################
table.setPca(False)
table.setStandardScaler(True)
table.setMaxMinScaler(False)
table.setStatisticalFeatures(True)
table.setLogarithmTransformation(False)

# Standardise the per-sample statistics table (`df`).
scaler = StandardScaler()
scaled_df = scaler.fit_transform(df)

# true_labels was already encoded from `labels` in the first analysis cell and
# is not modified afterwards, so it is reused instead of being rebuilt here.
print("###############STATISTICS DATASET (STANDARD SCALER)#######")
print()
miRNA_analysis(scaled_df, true_labels, table)

###############STATISTICS DATASET (STANDARD SCALER)#######

KMEANS CLUSTERING
0.3312549522880574
0.08523048519256766

AGGLOMERATIVE CLUSTERING
0.3006431904542592
0.09806111822336416

SPECTRAL CLUSTERING
0.6037614183528752
0.08476428378865808


In [14]:
################################STATISTICS DATASET (MIN/MAX SCALER)##########################
table.setPca(False)
table.setStandardScaler(False)
table.setMaxMinScaler(True)
table.setStatisticalFeatures(True)
table.setLogarithmTransformation(False)

# Min/max-scale the per-sample statistics table (`df`).
scaler = MinMaxScaler()
scaled_df = scaler.fit_transform(df)

# true_labels was already encoded from `labels` in the first analysis cell and
# is not modified afterwards, so it is reused instead of being rebuilt here.
print("###############STATISTICS DATASET (MIN/MAX SCALER)#######")
print()
miRNA_analysis(scaled_df, true_labels, table)

###############STATISTICS DATASET (MIN/MAX SCALER)#######

KMEANS CLUSTERING
0.34198632282909164
0.08511690283557188

AGGLOMERATIVE CLUSTERING
0.3406343356466155
0.15434176964917296

SPECTRAL CLUSTERING
0.33631938556890695
0.18459652161935647


In [15]:
# Second experiment: apply PCA, keeping as many components as needed to retain
# 85% of the variance (PCA is imported with the other dependencies at the top).

# First attempt: fit it on the filtered, unscaled Bonferroni dataset.
transformer = PCA(n_components=0.85)
PCA_df = transformer.fit_transform(Bonferroni_dataset)
print(PCA_df.shape)

(558, 3)


In [16]:
################################PCA DATASET (NO STD SCALER NO MIN/MAX SCALER)##########################
table.setPca(True)
table.setStandardScaler(False)
table.setMaxMinScaler(False)
table.setStatisticalFeatures(False)
table.setLogarithmTransformation(False)

# true_labels was already encoded from `labels` in the first analysis cell and
# is not modified afterwards, so it is reused instead of being rebuilt here.
print("###############PCA DATASET (NO STD SCALER NO MIN/MAX SCALER)#######")
print()
miRNA_analysis(PCA_df, true_labels, table)

###############PCA DATASET (NO STD SCALER NO MIN/MAX SCALER)#######

KMEANS CLUSTERING
0.4235920040429153
0.13817929365750042

AGGLOMERATIVE CLUSTERING
0.37647263330963215
0.1190021093991923

SPECTRAL CLUSTERING
-0.03862862183794525
0.007215899461028647


In [17]:
################################PCA DATASET (STD SCALER)##########################
table.setPca(True)
table.setStandardScaler(True)
table.setMaxMinScaler(False)
table.setStatisticalFeatures(False)
table.setLogarithmTransformation(False)

# Standardise the PCA projection (scaling is applied after PCA here).
scaler = StandardScaler()
scaled_df = scaler.fit_transform(PCA_df)

# true_labels was already encoded from `labels` in the first analysis cell and
# is not modified afterwards, so it is reused instead of being rebuilt here.
print("###############PCA DATASET (STANDARD SCALER)#######")
print()
miRNA_analysis(scaled_df, true_labels, table)

###############PCA DATASET (STANDARD SCALER)#######

KMEANS CLUSTERING
0.5275049295236922
0.08548585987762869

AGGLOMERATIVE CLUSTERING
0.6353605689455178
0.0847010512251581

SPECTRAL CLUSTERING
0.6700145184203883
0.053826676152521395


In [18]:
################################PCA DATASET (MIN/MAX SCALER)##########################
table.setPca(True)
table.setStandardScaler(False)
table.setMaxMinScaler(True)
table.setStatisticalFeatures(False)
table.setLogarithmTransformation(False)

# Min/max-scale the PCA projection (scaling is applied after PCA here).
scaler = MinMaxScaler()
scaled_df = scaler.fit_transform(PCA_df)

# true_labels was already encoded from `labels` in the first analysis cell and
# is not modified afterwards, so it is reused instead of being rebuilt here.
print("###############PCA DATASET (MIN/MAX SCALER)#######")
print()
miRNA_analysis(scaled_df, true_labels, table)

###############PCA DATASET (MIN/MAX SCALER)#######

KMEANS CLUSTERING
0.5270018346513178
0.08045680756628232

AGGLOMERATIVE CLUSTERING
0.5601455991878495
0.09502901806055436

SPECTRAL CLUSTERING
0.5520143485676934
0.0936283797010809


In [19]:
# Fetch the accumulated scores as a DataFrame and display it as the cell output.
# NOTE(review): presumably one row per table.update() call -- confirm in ResultTable.
df_result = table.getDF()
df_result

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,Unnamed: 4_level_0,Unnamed: 5_level_0,Unnamed: 6_level_0,Unnamed: 7_level_0,Silhouette,RandIndex
Omnic Name,Cluster Algorithm,BonferroniTtest,MaxMinScaler,StandardScaler,PCA,Logarithm Transformation,Statistical Features,Unnamed: 8_level_1,Unnamed: 9_level_1
miRNA,KMeans,Yes,No,No,No,No,No,0.53604,0.097456
miRNA,Agglomerative,Yes,No,No,No,No,No,0.632563,0.084701
miRNA,Spectral,Yes,No,No,No,No,No,-0.021495,0.000275
miRNA,KMeans,Yes,No,Yes,No,No,No,0.198021,0.28301
miRNA,Agglomerative,Yes,No,Yes,No,No,No,0.158794,0.382889
miRNA,Spectral,Yes,No,Yes,No,No,No,0.406108,0.008317
miRNA,KMeans,Yes,Yes,No,No,No,No,0.049683,0.211566
miRNA,Agglomerative,Yes,Yes,No,No,No,No,0.146002,0.353688
miRNA,Spectral,Yes,Yes,No,No,No,No,0.425906,0.022947
miRNA,KMeans,Yes,No,No,No,No,Yes,0.582249,0.06799
