In [None]:
import pandas as pd
from sklearn.metrics.cluster import silhouette_score, silhouette_samples
import numpy as np
import matplotlib.pyplot as plt
from matplotlib import cm

In [None]:
# Load the GTEx v7 gene read-count table. skiprows=2 skips the two lines that
# precede the column header row in the .gct file.
df = pd.read_csv(
    'https://storage.googleapis.com/gtex_analysis_v7/rna_seq_data/GTEx_Analysis_2016-01-15_v7_RNASeQCv1.1.8_gene_reads.gct.gz',
    skiprows=2, compression='gzip', sep='\t')

# Index rows by the first 15 characters of the gene name
# (presumably the Ensembl gene id without its version suffix -- TODO confirm).
df['ensg'] = df['Name'].str[:15]

# Keep only the per-sample count columns. `columns=` is used because pandas 2.0
# no longer accepts the axis as a positional argument to DataFrame.drop.
df = df.set_index('ensg').drop(columns=['Name', 'Description'])

# Restrict to the genes listed in the reference gene list.
genelist = pd.read_csv(
    "https://stephenslab.github.io/count-clustering/project/utilities/gene_names_all_gtex.txt",
    header=None).values.ravel()
df = df.loc[df.index.isin(genelist)]

# Sample annotations, indexed by sample id; SMTS / SMTSD are the columns used
# later to label samples.
df_file = pd.read_csv(
    "https://storage.googleapis.com/gtex_analysis_v7/annotations/GTEx_v7_Annotations_SampleAttributesDS.txt",
    sep='\t')
df_file = df_file[['SAMPID', 'SMTS', 'SMTSD']].set_index('SAMPID')

In [None]:
# Annotated sample ids that also appear as columns of the expression table,
# kept in annotation order.
files = df_file.index[df_file.index.isin(df.columns)].tolist()

In [None]:
# Keep only the sample columns that have an annotation entry.
df = df.loc[:, list(files)]

In [None]:
# Preview the first rows of the expression table (genes as rows, samples as columns).
df.head()

In [None]:
# Keep the axis labels, then switch to a plain array with one row per sample
# (the frame is transposed) and release the DataFrame.
genes = df.index
files = df.columns
data = df.T.values
del df

In [None]:
# Show which annotation columns are available for labelling samples.
df_file.columns

In [None]:
# Look up the SMTSD annotation of every sample, then encode it as integers:
# `classes` holds the sorted unique labels and `cluster_labels[j]` is the
# position in `classes` of sample j's label.
sample_subtissues = [df_file.at[sample_id, 'SMTSD'] for sample_id in files]
classes, cluster_labels = np.unique(sample_subtissues, return_inverse=True)
n_clusters = cluster_labels.max() + 1

In [None]:
# Dimensions of the matrix used below; rows are samples because the frame was transposed.
data.shape

In [None]:
# Distance metrics to evaluate; one silhouette figure is produced per entry.
# 'euclidean' was the alternative tried previously and can be appended here.
metrics = [
    'cosine',
]

In [None]:
# Draw one silhouette plot per distance metric, using the annotated
# sub-tissue of each sample as its cluster label, and save each figure as PDF.
for k in metrics:
    sample_silhouette_values = silhouette_samples(data, cluster_labels, metric=k)
    # The silhouette score is the mean of the per-sample silhouette values;
    # averaging here avoids recomputing all pairwise distances a second time.
    silhouette_avg = sample_silhouette_values.mean()

    fig, ax = plt.subplots(1, 1)
    fig.set_size_inches(15, 35)

    # The silhouette coefficient ranges over [-1, 1].
    ax.set_xlim([-1, 1])
    # The (n_clusters+1)*10 is for inserting blank space between silhouette
    # plots of individual clusters, to demarcate them clearly.
    ax.set_ylim([0, data.shape[0] + (n_clusters + 1) * 10])

    # Reset the vertical cursor for every figure; otherwise a second metric
    # would start drawing where the previous figure ended, above the y-limits.
    y_lower = 10

    for i in range(n_clusters):
        # Silhouette values of the samples in cluster i, sorted ascending.
        # Boolean indexing returns a copy, so the in-place sort does not
        # reorder sample_silhouette_values.
        ith_cluster_silhouette_values = \
            sample_silhouette_values[cluster_labels == i]
        ith_cluster_silhouette_values.sort()

        size_cluster_i = ith_cluster_silhouette_values.shape[0]
        y_upper = y_lower + size_cluster_i

        color = cm.gnuplot(float(i) / n_clusters)
        ax.fill_betweenx(np.arange(y_lower, y_upper),
                         0, ith_cluster_silhouette_values,
                         facecolor=color, edgecolor=color, alpha=0.7)

        # Label the band with its class name at the band's vertical middle.
        ax.text(-0.8, (y_lower + y_upper) / 2, "%s" % classes[i], fontsize=18)

        # Leave a 10-row gap before the next cluster's band.
        y_lower = y_upper + 10

    ax.set_title("gtex subtissues\n%s metric\n%d clusters" % (k, n_clusters), fontsize=20)
    ax.set_xlabel("score", fontsize=20)
    # Dashed red line marks the average silhouette value over all samples.
    ax.axvline(x=silhouette_avg, color="red", linestyle="--", lw=2)
    ax.tick_params(labelsize=20)

    # Save before showing so the file is written regardless of how the
    # backend treats the figure once it has been displayed.
    fig.savefig("silhouette_gtex_subtissues_%s.pdf" % (k))
    plt.show()
    # Release the figure so repeated metrics do not accumulate open figures.
    plt.close(fig)

    del sample_silhouette_values
    del silhouette_avg

In [None]:
from sklearn.manifold import MDS

In [None]:
# Multidimensional scaling down to 5 components. MDS starts from a random
# configuration, so the seed is fixed to make the embedding reproducible
# across kernel restarts.
model = MDS(n_components=5, random_state=42)

In [None]:
# Fit the MDS model on the sample matrix and keep the low-dimensional
# coordinates (one row per sample).
red_data = model.fit_transform(
    data
)

In [None]:
# Scatter the samples in the plane of MDS components 0 and 2, with one
# colour / legend entry per class.
fig = plt.figure(figsize=(15, 10))
for i in range(cluster_labels.max() + 1):
    in_class = cluster_labels == i  # boolean mask, computed once per class
    plt.scatter(red_data[in_class, 0], red_data[in_class, 2], label=classes[i])
plt.legend(ncol=4, fontsize=10)
plt.show()

In [None]:
# Inspect the pairwise dissimilarity matrix stored on the fitted MDS model.
model.dissimilarity_matrix_