In [79]:
import pandas as pd
import numpy as np
import pickle
import plotly.express as px
import plotly.graph_objs as go
import os
import sys
import imp
import json
from tqdm import tqdm

from plotly.offline import init_notebook_mode, iplot
import plotly.figure_factory as ff

from matplotlib import pyplot as plt

from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler, MinMaxScaler, LabelEncoder
from sklearn.cluster import DBSCAN, KMeans, AgglomerativeClustering, SpectralClustering
from sklearn.pipeline import Pipeline
from sklearn.metrics import silhouette_score, accuracy_score, adjusted_rand_score
from sklearn.manifold import TSNE 

from scipy.cluster.hierarchy import dendrogram, linkage

from myclass.BonferroniTtest import Bonferroni_Ttest
from myclass.MappingEngGen import MappingENG
from myclass.CleanMergeDataset import Clean_Merge_Dataset
from myclass.ResultTable import ResultTable
#from myclass.FeatureSelection import FeatureSelection # FAILED

### Reading the data from the pickle files and cleaning of the data

In [74]:
if os.path.exists('./data-ready/data_mapped_RNA.pkl') is False:
    data_normal = pd.read_pickle('./data-ready/RNA_dataframe_normal').replace('/', '\\')
    data_tumor = pd.read_pickle('./data-ready/RNA_dataframe').replace('/', '\\')
    dataset, y, cases_id = Clean_Merge_Dataset(name='RNA').transform(data_normal, data_tumor)

    mapENG = MappingENG(name='RNA')
    dataset = mapENG.fit_transform(dataset, y)
    
else:
    dataset = pd.read_pickle('./data-ready/data_mapped_RNA.pkl')

In [80]:
data_normal = pd.read_pickle('./data-ready/RNA_dataframe_normal').replace('/', '\\')
data_tumor = pd.read_pickle('./data-ready/RNA_dataframe').replace('/', '\\')
dataset, y, cases_id = Clean_Merge_Dataset(name='RNA').transform(data_normal, data_tumor)

mapENG = MappingENG(name='RNA')
dataset = mapENG.fit_transform(dataset, y)
print(dataset.shape)

Data_normal: (305, 60486)
Data_tumor: (1071, 60486)
All data: (1376, 60486)
{'TCGA-LUAD', 'TCGA-LUSC'}
Features completly 0 values 2227 removed
Features completely Nan 0 removed
Final dataset shape (959, 58258)


100%|██████████| 58256/58256 [14:31:11<00:00,  1.11it/s]        


(959, 58256)


In [81]:
dataset.describe()

Unnamed: 0,RP11-368I23.2,RP11-742D12.2,RAB4B,ENSG00000273842.1,C12orf5,RNF44,NUP210P2,DNAH3,RPL23A,CTD-2382E5.4,...,TENM1,CYP4F2,RP11-486E2.1,AC010525.4,RP11-713D19.1,PPP6R1,RP4-569M23.2,ENSG00000280861.1,BATF3,OR8D4
count,959.0,959.0,959.0,959.0,959.0,959.0,959.0,959.0,959.0,959.0,...,959.0,959.0,959.0,959.0,959.0,959.0,959.0,959.0,959.0,959.0
mean,0.094264,0.007452,4.1475,0.001463,4.857505,12.94731,0.002687,0.489015,138.91174,0.141317,...,0.604147,0.073425,0.001043,0.180663,0.059561,17.275161,0.502305,0.002205,1.55048,0.00264
std,0.404352,0.038579,2.08503,0.026007,2.086634,5.38633,0.025259,0.927192,57.893797,0.486565,...,2.121799,0.350132,0.014065,0.181504,0.101847,6.810052,0.857857,0.037163,1.596573,0.029357
min,0.0,0.0,0.520163,0.0,0.755503,1.488724,0.0,0.002849,23.706158,0.0,...,0.0,0.0,0.0,0.0,0.0,2.499193,0.0,0.0,0.121123,0.0
25%,0.0,0.0,2.856123,0.0,3.496863,9.480507,0.0,0.102176,97.850687,0.037785,...,0.040003,0.0,0.0,0.056087,0.0,13.007338,0.153991,0.0,0.646829,0.0
50%,0.01823,0.0,3.780238,0.0,4.449093,11.961984,0.0,0.215449,127.518943,0.082277,...,0.103719,0.005068,0.0,0.13156,0.033872,16.047071,0.309361,0.0,1.058032,0.0
75%,0.072504,0.005423,4.884539,0.0,5.730334,15.196104,0.0,0.499029,167.307584,0.139131,...,0.283205,0.02738,0.0,0.254937,0.085938,20.672096,0.57924,0.0,1.72995,0.0
max,9.004566,0.956318,21.352727,0.690224,17.03692,61.809897,0.579273,12.72151,500.893573,8.030229,...,31.37123,6.111623,0.423827,1.672513,1.482554,53.267206,14.536727,1.029548,14.182301,0.662189


In [82]:
dataset.shape

(959, 58256)

In [83]:
y.shape

(959,)

### Application of the dataset to the clustering methods with PCA and StandardScaler

In [84]:
from itertools import product
my_record = ResultTable(name='RNA')

y_ = [str(i) for i in y]
labelEnc = LabelEncoder().fit(y_)
labels = labelEnc.transform(y_)

all_false = False

clusters = {
    'Agglomerative': AgglomerativeClustering(n_clusters=3),
    'K-Means': KMeans(n_clusters=3),
    'Spectral': SpectralClustering(n_clusters=3), 
}

methods = [('bonferroni', Bonferroni_Ttest(alpha=0.05)),
           ('minmax', MinMaxScaler()),
           ('scaler', StandardScaler()),
           ('pca', PCA(n_components=0.8))
          ]

for combo in product([True, False], repeat=4):
    
    jump = False
    if combo[3] is False and combo[2] is False:
        jump = True
    
    if jump is False:
        my_record.setBonf(combo[0])
        my_record.setMaxMinScaler(combo[1])
        my_record.setStandardScaler(combo[2])
        if combo[2] is True and combo[3] is True:
            my_record.setPca(combo[3])
        else:
            my_record.setPca(False)

        final_pipe = list()

        for i, status, mth in zip(range(0, len(methods)), combo, methods):
            if status is True:
                if i!= 3:
                    final_pipe.append(mth)
                    if combo[2] is True and i == 3:
                        final_pipe.append(mth)


        if len(final_pipe) > 0:
            pipe = Pipeline(final_pipe)
            df = pipe.fit_transform(dataset, y)
        else:
            df = dataset
            
        for k in clusters.keys():
            clustering = clusters[k]
            clustering.fit(df)
            
            print('Silhouette score ', k,'cluster:', silhouette_score(df, clustering.labels_))
            print('Rand Index', k,'cluster:', adjusted_rand_score(labels, clustering.labels_))
            print('\n')
            my_record.setClusteringAlghorithm(k)
            my_record.setSilhouette(silhouette_score(df, clustering.labels_))
            my_record.setRandIndex(adjusted_rand_score(labels, clustering.labels_))
            my_record.update()
print("\n")

Final dataset shape: (959, 12896)
Silhouette score  Agglomerative cluster: 0.11622216352711279
Rand Index Agglomerative cluster: 0.5594627862671556


Silhouette score  K-Means cluster: 0.1231135729149337
Rand Index K-Means cluster: 0.5567586646928226


Silhouette score  Spectral cluster: -0.017715061104463838
Rand Index Spectral cluster: -0.011722620143502532


Final dataset shape: (959, 12896)
Silhouette score  Agglomerative cluster: 0.11622216352711279
Rand Index Agglomerative cluster: 0.5594627862671556


Silhouette score  K-Means cluster: 0.1227685210366503
Rand Index K-Means cluster: 0.5513763803759797


Silhouette score  Spectral cluster: -0.025803325465486988
Rand Index Spectral cluster: -0.0036641450566045363


Final dataset shape: (959, 12896)
Silhouette score  Agglomerative cluster: 0.07272600594209219
Rand Index Agglomerative cluster: 0.7653315676291866


Silhouette score  K-Means cluster: 0.07817271566183681
Rand Index K-Means cluster: 0.7608899002107056


Silhouette score 

In [85]:
import dataframe_image as dfi

df_styled = my_record.getDF().style.background_gradient()
dfi.export(df_styled, 'images/RNA_results.png')

In [86]:
def plot_best_score(best_sil):
    table_data = []

    column = []
    for el in best_sil:
        column.append(el)
    table_data.append(column)

    column = []
    for el in best_sil:
        column.append(best_sil[el])
    table_data.append(column)

    #create table
    fig = go.Figure(data=[go.Table(header=dict(values=['Index', 'Value']),
                     cells=dict(values=table_data))
                         ])
    fig.show()
    return

In [87]:
plot_best_score(my_record.maxSilhouette())
plot_best_score(my_record.maxRandIndex())

In [88]:
# Selecting and plotting the best Silhouette score
dict_result = my_record.maxSilhouette()
boolResult = my_record.getBoolForPipe(dict_result)

final_pipe = list()
methods = {'BonferroniTtest': Bonferroni_Ttest(alpha=0.05),
           'MaxMinScaler': MinMaxScaler(),
           'StandardScaler': StandardScaler(),
           'PCA': PCA(n_components=0.8)
          }

for k in boolResult.keys():
    if boolResult[k] is True:
        final_pipe.append((k, methods[k]))

if len(final_pipe) > 0:
    pipe = Pipeline(final_pipe)
    df = pipe.fit_transform(dataset, y)
else:
    df = dataset
    
clustering = clusters[dict_result['Cluster Algorithm']]
clustering.fit(df)
print('Silhouette: ', silhouette_score(df, clustering.labels_))
print('RandIndex: ', adjusted_rand_score(labels, clustering.labels_))

X_t = TSNE(n_components=3).fit_transform(df)
fig = px.scatter_3d(X_t, x=0, y=1, z=2 ,color=clustering.labels_)
fig.show()
my_record.getDF()

Silhouette:  0.8322108091926181
RandIndex:  -0.00330077136562291


Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,Unnamed: 4_level_0,Unnamed: 5_level_0,Unnamed: 6_level_0,Unnamed: 7_level_0,Silhouette,RandIndex
Omnic Name,Cluster Algorithm,BonferroniTtest,MaxMinScaler,StandardScaler,PCA,Logarithm Transformation,Statistical Features,Unnamed: 8_level_1,Unnamed: 9_level_1
RNA,Agglomerative,Yes,Yes,Yes,Yes,No,No,0.116222,0.559463
RNA,K-Means,Yes,Yes,Yes,Yes,No,No,0.123114,0.556759
RNA,Spectral,Yes,Yes,Yes,Yes,No,No,-0.017715,-0.011723
RNA,Agglomerative,Yes,Yes,Yes,No,No,No,0.116222,0.559463
RNA,K-Means,Yes,Yes,Yes,No,No,No,0.122769,0.551376
RNA,Spectral,Yes,Yes,Yes,No,No,No,-0.025803,-0.003664
RNA,Agglomerative,Yes,Yes,No,No,No,No,0.072726,0.765332
RNA,K-Means,Yes,Yes,No,No,No,No,0.078173,0.76089
RNA,Spectral,Yes,Yes,No,No,No,No,0.490914,-0.000982
RNA,Agglomerative,Yes,No,Yes,Yes,No,No,0.116222,0.559463


In [90]:
# Selecting and plotting the best RandIndex score
dict_result = my_record.maxRandIndex()
boolResult = my_record.getBoolForPipe(dict_result)

final_pipe = list()
methods = {'BonferroniTtest': Bonferroni_Ttest(alpha=0.05),
           'MaxMinScaler': MinMaxScaler(),
           'StandardScaler': StandardScaler(),
           'PCA': PCA(n_components=0.8)
          }

for k in boolResult.keys():
    if boolResult[k] is True:
        final_pipe.append((k, methods[k]))


if len(final_pipe) > 0:
    pipe = Pipeline(final_pipe)
    df = pipe.fit_transform(dataset, y)
else:
    df = dataset
    
clustering = clusters[dict_result['Cluster Algorithm']]
clustering.fit(df)
print('Silhouette: ', silhouette_score(df, clustering.labels_))
print('RandIndex: ', adjusted_rand_score(labels, clustering.labels_))

X_t = TSNE(n_components=3).fit_transform(df)
fig = px.scatter_3d(X_t, x=0, y=1, z=2 ,color=clustering.labels_)
fig.show()

Final dataset shape: (959, 12896)
Silhouette:  0.07272600594209219
RandIndex:  0.7653315676291866


(1375, 5553)
