In [1]:
import pandas as pd
from sklearn.metrics.cluster import silhouette_score, silhouette_samples
import numpy as np
import matplotlib.pyplot as plt
from matplotlib import cm
from master_thesis.hsbmpy import colors_cycle

In [2]:
#GTEx
# Gene-level TPM table; the first two lines of the .gct file are skipped before
# the header row is read.
df = pd.read_csv('https://storage.googleapis.com/gtex_analysis_v7/rna_seq_data/GTEx_Analysis_2016-01-15_v7_RNASeQCv1.1.8_gene_tpm.gct.gz', skiprows=2, compression='gzip', sep='\t')
# Index genes by the first 15 characters of 'Name' (the length of the ENSG ids
# shown in the table preview below).
df['ensg'] = df['Name'].str[:15]
# Keep only the per-sample columns. `columns=` is used because the positional
# `axis` argument of DataFrame.drop was removed in pandas 2.0; 'Name' is dropped
# explicitly instead of being set as an index that is immediately replaced.
df = df.set_index('ensg').drop(columns=['Name', 'Description'])
# Restrict to the genes listed in the reference gene-name file.
genelist = pd.read_csv("https://stephenslab.github.io/count-clustering/project/utilities/gene_names_all_gtex.txt", header=None).values.ravel()
df = df[df.index.isin(genelist)]
# Sample annotations: the two tissue columns, indexed by sample id.
df_file = (
    pd.read_csv("https://storage.googleapis.com/gtex_analysis_v7/annotations/GTEx_v7_Annotations_SampleAttributesDS.txt", sep='\t')
    .loc[:, ['SAMPID', 'SMTS', 'SMTSD']]
    .set_index('SAMPID')
)

In [None]:
#TCGA
# Alternative input to the GTEx cell: rebinds `df` and `df_file` from local CSV
# files, each read with its first column as the index.
# NOTE(review): both paths are relative to the current working directory, and a
# later cell chdir()s into this same folder — confirm this cell is meant to run
# before that chdir.
df=pd.read_csv("datasets/tcga/oversampling_10tissue/mainTable_all.csv", index_col=0)
df_file=pd.read_csv("datasets/tcga/oversampling_10tissue/files.dat", index_col=0)

In [3]:
import os
# Make the dataset folder the working directory, so the relative file names used
# by later cells (CSV inputs, saved figures, pickles, files.dat) resolve there.
# NOTE(review): the path is itself relative, so running this cell twice in the
# same kernel looks for the folder inside itself — presumably raises unless such
# a nested folder exists.
os.chdir("datasets/tcga/oversampling_10tissue/")

In [4]:
# Sample ids present both in the annotation index and among the expression
# columns, in annotation order. Built as a list rather than a one-shot
# generator: a generator is exhausted by the first `list(files)`, so re-running
# the column-selection cell would silently select zero columns.
files = [file for file in df_file.index if file in df.columns]

In [5]:
# Restrict the expression table to the annotated samples, then preview it.
df = df.loc[:, list(files)]
df.head()

Unnamed: 0_level_0,GTEX-1117F-0226-SM-5GZZ7,GTEX-1117F-0426-SM-5EGHI,GTEX-1117F-0526-SM-5EGHJ,GTEX-1117F-0626-SM-5N9CS,GTEX-1117F-0726-SM-5GIEN,GTEX-1117F-1326-SM-5EGHH,GTEX-1117F-2226-SM-5N9CH,GTEX-1117F-2426-SM-5EGGH,GTEX-1117F-2526-SM-5GZY6,GTEX-1117F-2826-SM-5GZXL,...,GTEX-ZZPU-1126-SM-5N9CW,GTEX-ZZPU-1226-SM-5N9CK,GTEX-ZZPU-1326-SM-5GZWS,GTEX-ZZPU-1426-SM-5GZZ6,GTEX-ZZPU-1826-SM-5E43L,GTEX-ZZPU-2126-SM-5EGIU,GTEX-ZZPU-2226-SM-5EGIV,GTEX-ZZPU-2426-SM-5E44I,GTEX-ZZPU-2626-SM-5E45Y,GTEX-ZZPU-2726-SM-5NQ8O
ensg,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
ENSG00000227232,21.4,15.03,17.53,23.82,7.443,13.0,31.46,31.17,35.71,20.81,...,4.758,9.184,16.52,8.087,7.667,14.46,6.013,10.35,6.26,7.699
ENSG00000238009,0.2392,0.06782,0.1972,0.02808,0.07535,0.2125,0.1795,0.3079,0.0687,0.02258,...,0.1288,0.1209,0.2049,0.2263,0.1579,0.1495,0.1256,0.3233,0.06529,0.2103
ENSG00000237683,9.467,1.139,2.326,3.739,1.446,2.964,10.4,2.631,8.982,4.116,...,7.126,16.33,49.8,16.38,8.755,38.48,20.79,6.515,1.096,11.58
ENSG00000268903,0.0,0.0,0.0,0.6211,0.0,0.0,0.6616,0.0,1.52,0.0,...,2.216,1.003,4.532,1.251,0.3881,1.323,0.5556,0.0,0.0,0.4652
ENSG00000228463,0.9219,0.3351,0.4483,0.5162,0.2346,0.2048,0.2483,0.2282,0.1358,0.3749,...,0.9253,1.926,1.914,0.7381,0.6659,2.802,0.5362,0.703,0.4355,4.103


In [6]:
# Fixed-seed subsample: 3000 columns first, then 5000 rows of what remains.
df = (
    df
    .sample(n=3000, axis='columns', random_state=42)
    .sample(n=5000, axis='index', random_state=42)
)

In [7]:
# Transposed numeric matrix: one row per column of df, one column per row of df.
data = df.T.values
# Missing values become zeros.
data[np.isnan(data)] = 0
# Keep the axis labels before the frame itself is released.
files, genes = df.columns, df.index
del df

In [8]:
# Display the annotation columns available in df_file.
df_file.columns

Index(['SMTS', 'SMTSD'], dtype='object')

In [10]:
# Look up the 'SMTS' annotation of every sample in `files`; np.unique returns the
# distinct values (classes) and, per sample, the index of its value (cluster_labels).
classes, cluster_labels = np.unique(
    [df_file.at[sample_id, 'SMTS'] for sample_id in files],
    return_inverse=True,
)
# Labels are 0-based, so the count is the largest label plus one.
n_clusters = cluster_labels.max() + 1

In [11]:
# log2(x + 1) transform of the whole matrix.
data = np.log2(1 + data)
# Any infinite entry is replaced in place by log2(1e6 + 1).
np.putmask(data, np.isinf(data), np.log2(1e6 + 1))
#np.random.shuffle(cluster_labels)

In [12]:
# Matrix dimensions: rows follow `files`, columns follow `genes`.
data.shape

(3000, 5000)

In [None]:
import os

from telepyth import TelepythClient

# The client token is a credential, so it must not live in the notebook source.
# It is read from the TELEPYTH_TOKEN environment variable instead; a missing
# variable raises KeyError here rather than failing later when a figure is sent.
tc = TelepythClient(os.environ["TELEPYTH_TOKEN"])

In [None]:
metrics = ['euclidean', 'cosine']
label = 'pca'
def make_silhouette(datas, labels):
    """Draw, save and send one silhouette plot per (representation, metric) pair.

    Parameters
    ----------
    datas : iterable of arrays
        Representations to score. Each array is passed to ``silhouette_samples``
        together with the global ``cluster_labels`` and its ``shape[0]`` sizes
        the y axis.
    labels : iterable of str
        One name per entry of ``datas``; used in the plot title and file name.

    Notes
    -----
    Reads the notebook globals ``metrics``, ``cluster_labels``, ``n_clusters``,
    ``classes`` and ``tc``. For every pair it writes
    ``silhouette_gtex_<label>_<metric>.pdf`` to the working directory, shows the
    figure, sends it through ``tc.send_figure`` and then closes it.
    """
    for data, label in zip(datas, labels):
        for k in metrics:
            sample_silhouette_values = silhouette_samples(data, cluster_labels, metric=k)
            # The silhouette score is the mean of the per-sample coefficients;
            # averaging them here avoids a second pairwise-distance computation
            # that a separate silhouette_score() call would repeat.
            silhouette_avg = np.mean(sample_silhouette_values)

            fig, ax = plt.subplots(1, 1)
            fig.set_size_inches(15, 35)

            # Silhouette coefficients lie in [-1, 1].
            ax.set_xlim([-1, 1])
            # (n_clusters + 1) * 10 leaves a 10-row gap between consecutive
            # cluster bands so they stay visually separated.
            ax.set_ylim([0, data.shape[0] + (n_clusters + 1) * 10])

            y_lower = 10
            for i in range(n_clusters):
                # Coefficients of the samples in cluster i, in ascending order.
                ith_cluster_silhouette_values = np.sort(
                    sample_silhouette_values[cluster_labels == i])
                y_upper = y_lower + ith_cluster_silhouette_values.shape[0]

                color = cm.gnuplot(float(i) / n_clusters)
                ax.fill_betweenx(np.arange(y_lower, y_upper),
                                 0, ith_cluster_silhouette_values,
                                 facecolor=color, edgecolor=color, alpha=0.7)

                # Cluster name, vertically centred on its band.
                ax.text(-0.8, (y_lower + y_upper) / 2, "%s" % classes[i], fontsize=18)

                # Start of the next band, after the 10-row gap.
                y_lower = y_upper + 10

            ax.set_title("gtex %s\n%s metric\n%d clusters\n%s space"%(label,k,n_clusters, label), fontsize=20)
            ax.set_xlabel("score", fontsize=20)
            # Dashed red line at the average coefficient.
            ax.axvline(x=silhouette_avg, color="red", linestyle="--", lw=2)
            ax.tick_params(labelsize=20)

            # Save before showing so the file is complete under any backend.
            fig.savefig("silhouette_gtex_%s_%s.pdf"%(label,k))
            plt.show()
            tc.send_figure(fig)
            # Release the (large) figure; one is created per data/metric pair.
            plt.close(fig)

In [None]:
# Print the shape of every alternative representation, then draw silhouette
# plots for the raw matrix and each of them.
# NOTE(review): red_data, lda_data, emb_data, hsbm_data and tm_data are assigned
# in cells further down this notebook, so on a fresh kernel this cell only works
# after those cells have been run.
print(red_data.shape)
print(lda_data.shape)
print(emb_data.shape)
print(hsbm_data.shape)
print(tm_data.shape)
make_silhouette([data, red_data, lda_data, emb_data, hsbm_data, tm_data],
               ['data', 'pca', 'lda', 'tsne', 'hsbm', 'tm'])

## PCA

In [None]:
from sklearn.manifold import MDS
from sklearn.decomposition import PCA

In [None]:
# Two-component PCA. The seed matches the random_state=42 used elsewhere in this
# notebook, so the projection is repeatable when a randomized SVD solver is
# selected.
model = PCA(n_components=2, random_state=42)

In [None]:
# Fit `model` on data and keep the transformed coordinates.
red_data=model.fit_transform(data)

In [None]:
#%matplotlib notebook
# Scatter of the first two columns of red_data, one colour/legend entry per label.
fig, ax = plt.subplots(figsize=(15, 15))
for i in range(cluster_labels.max() + 1):
    in_cluster = cluster_labels == i
    ax.scatter(red_data[in_cluster, 0], red_data[in_cluster, 1],
               label=classes[i], c=colors_cycle[i])
ax.legend(ncol=4, fontsize=10)
ax.tick_params(labelsize=20)
plt.show()
fig.savefig("mds_02.pdf")

In [None]:
import pickle

In [None]:
# Persist the current `model` object to pca_model.pkl in the working directory.
# (A stray `emb_data` token fused to `with` made this cell a SyntaxError.)
with open("pca_model.pkl", 'wb') as f:
    pickle.dump(model, f)

In [None]:
# Rebind `model` from pca_model.pkl, the file name the preceding cell writes to.
# NOTE(review): pickle.load can execute arbitrary code contained in the file —
# only load a pca_model.pkl that you produced yourself.
with open("pca_model.pkl",'rb') as f:
    model = pickle.load(f)

In [None]:
# Project data with the current `model`.
# NOTE(review): `model` is rebound to other estimators in later cells — assumes
# it is still the PCA object here.
red_data = model.transform(data)

## kmeans

In [None]:
from sklearn.cluster import KMeans

In [None]:
# Rebinds `model` to a k-means estimator with n_clusters clusters, 10
# initialisations and a fixed seed.
model = KMeans(n_clusters=n_clusters, n_init=10, random_state=42)

In [None]:
# Overwrite cluster_labels with the assignments predicted by `model`; classes
# becomes the sorted distinct label values. n_clusters is not recomputed here.
cluster_labels = model.fit_predict(data)
classes = np.unique(cluster_labels)

## hierarchical

In [None]:
from sklearn.cluster import AgglomerativeClustering
import scipy.cluster.hierarchy as shc

In [None]:
# Average-linkage hierarchical clustering of the rows of data.
Z = shc.linkage(data, method='average')
# Draw the tree on an explicit axes, then show it and save it.
fig, ax = plt.subplots()
dend = shc.dendrogram(Z, ax=ax, leaf_rotation=90.0, leaf_font_size=8.0)
ax.set_xlabel("samples", fontsize=16)
plt.show()
fig.savefig("dendogram.pdf")

In [None]:
# Cut the linkage tree Z at a fixed height and flatten the result to one cluster
# id per row; the last expression displays each id with its number of rows.
# NOTE(review): 20000 is a hard-coded height in linkage-distance units — confirm
# it is still appropriate for the log2-transformed matrix.
cut = shc.cut_tree(Z, height=20000).ravel()
np.unique(cut, return_counts=True)

In [None]:
# Complete-linkage agglomerative clustering on cosine distance.
# scikit-learn renamed `affinity` to `metric` (deprecated in 1.2, removed in
# 1.4); try the current keyword first and fall back for older installations,
# where the unknown keyword raises TypeError.
try:
    model = AgglomerativeClustering(n_clusters=n_clusters, metric='cosine', linkage='complete')
except TypeError:
    model = AgglomerativeClustering(n_clusters=n_clusters, affinity='cosine', linkage='complete')

In [None]:
# Cluster only the rows of data whose cut-tree id is 0; classes becomes the
# sorted distinct label values.
# NOTE(review): cluster_labels now has one entry per selected row rather than
# per row of data — cells that use it to mask full-size arrays assume every row
# has cut == 0; confirm.
cluster_labels = model.fit_predict(data[cut==0])
classes = np.unique(cluster_labels)

In [None]:
# out_to_file is otherwise imported only in a cell near the end of the notebook;
# importing it here keeps a top-to-bottom run from failing with NameError.
from master_thesis.hsbmpy import out_to_file

# Export the hierarchical labels together with the sample ids they belong to
# (the same cut == 0 selection that was clustered).
out_to_file(cluster_labels, files[cut==0], name='hier')

## LDA

In [None]:
from sklearn.decomposition import LatentDirichletAllocation

In [None]:
# LDA with one component per cluster. Seeded like the other stochastic steps in
# this notebook (random_state=42) so repeated runs give the same topics.
lda = LatentDirichletAllocation(n_components=n_clusters, random_state=42)

In [None]:
# Fit LDA and keep the per-row component weights.
lda_data = lda.fit_transform(data)
# Each row is assigned to its highest-weight component.
cluster_labels = lda_data.argmax(axis=1)
# Class ids run from 0 up to the largest assigned component.
classes = np.arange(cluster_labels.max() + 1)

In [None]:
# Scatter of columns 3 and 4 of lda_data, one colour/legend entry per label.
fig, ax = plt.subplots(figsize=(15, 15))
for i in range(cluster_labels.max() + 1):
    in_cluster = cluster_labels == i
    ax.scatter(lda_data[in_cluster, 3], lda_data[in_cluster, 4],
               label=classes[i], c=colors_cycle[i])
ax.legend(ncol=4, fontsize=10)
ax.tick_params(labelsize=20)
plt.show()
fig.savefig("lda_02.pdf")

## hSBM

In [13]:
from master_thesis.hsbm.sbmtm import sbmtm



In [14]:
# Create an sbmtm model object with its constructor defaults.
hsbm = sbmtm()

In [15]:
# Build the model's graph from data wrapped in a DataFrame with default integer
# row and column labels.
# NOTE(review): which axis make_graph_from_BoW_df treats as documents is not
# visible here — confirm the orientation of data matches it.
hsbm.make_graph_from_BoW_df(pd.DataFrame(data=data))

<master_thesis.hsbm.sbmtm.sbmtm at 0x7fd2b6e84828>

In [16]:
# Save the graph under the name graph.xml.gz (relative to the working directory).
hsbm.save_graph("graph.xml.gz")

In [None]:
# Fit the model with its default arguments.
hsbm.fit()

In [None]:
# Topic-distribution table read from disk, aligned to the analysed samples: rows
# are looked up by the first 24 characters of each sample id, and samples absent
# from the file become all-zero rows.
# `drop(columns=...)` replaces the positional axis argument removed in pandas 2.0.
hsbm_data = (
    pd.read_csv("topsbm/topsbm_level_3_topic-dist.csv")
    .set_index('doc')
    .reindex(index=[s[:24] for s in files])
    .drop(columns='i_doc')
    .fillna(0)
    .values
)

In [None]:
# Scatter of the first two columns of hsbm_data, one colour/legend entry per label.
fig, ax = plt.subplots(figsize=(15, 15))
for i in range(cluster_labels.max() + 1):
    in_cluster = cluster_labels == i
    ax.scatter(hsbm_data[in_cluster, 0], hsbm_data[in_cluster, 1],
               label=classes[i], c=colors_cycle[i])
ax.legend(ncol=4, fontsize=10)
ax.tick_params(labelsize=20)
plt.show()
fig.savefig("hsbm_02.pdf")

## t-SNE

In [None]:
from sklearn.manifold import TSNE

In [None]:
# Two-dimensional t-SNE. Seeded like the other stochastic steps in this notebook
# (random_state=42) so the embedding is repeatable.
tsne = TSNE(n_components=2, random_state=42)

In [None]:
# Fit the t-SNE model on data and keep the embedded coordinates.
emb_data = tsne.fit_transform(data)

In [None]:
# Scatter of the two columns of emb_data, one colour/legend entry per label.
fig, ax = plt.subplots(figsize=(15, 15))
for i in range(cluster_labels.max() + 1):
    in_cluster = cluster_labels == i
    ax.scatter(emb_data[in_cluster, 0], emb_data[in_cluster, 1],
               label=classes[i], c=colors_cycle[i])
ax.legend(ncol=4, fontsize=10)
ax.tick_params(labelsize=20)
plt.show()
fig.savefig("tsne_02.pdf")

## TM

In [None]:
# Topic-distribution table read from disk, aligned to `files`; samples absent
# from the file become all-zero rows.
# `drop(columns=...)` replaces the positional axis argument removed in pandas 2.0.
tm_data = (
    pd.read_csv("tm/tm_level_0_topic-dist.csv")
    .set_index('doc')
    .reindex(index=files)
    .drop(columns='i_doc')
    .fillna(0)
    .values
)

In [None]:
# Scatter of columns 0 and 2 of tm_data, one colour/legend entry per label.
fig, ax = plt.subplots(figsize=(15, 15))
for i in range(cluster_labels.max() + 1):
    in_cluster = cluster_labels == i
    ax.scatter(tm_data[in_cluster, 0], tm_data[in_cluster, 2],
               label=classes[i], c=colors_cycle[i])
ax.legend(ncol=4, fontsize=10)
ax.tick_params(labelsize=20)
plt.show()
fig.savefig("tm_02.pdf")

In [None]:
from master_thesis.hsbmpy import out_to_file

In [None]:
# Export the current cluster_labels with the sample ids in `files`, tagged 'lda'.
# NOTE(review): cluster_labels is rebound by several cells (tissue, k-means,
# hierarchical, LDA) — confirm it holds the LDA assignment when this runs.
out_to_file(cluster_labels, files, name='lda')

In [None]:
# Download the sample-attribute table again, keeping the two tissue columns and
# indexing the rows by sample id.
df_file = (
    pd.read_csv("https://storage.googleapis.com/gtex_analysis_v7/annotations/GTEx_v7_Annotations_SampleAttributesDS.txt", sep='\t')
    .loc[:, ['SAMPID', 'SMTS', 'SMTSD']]
    .set_index('SAMPID')
)

In [None]:
# Write the annotation rows for the samples in `files`, in that order, to
# files.dat in the working directory.
df_file.reindex(index=files).to_csv("files.dat")