In [None]:
%load_ext autoreload
%autoreload 2
import pandas as pd
import numpy as np
from silhouette import make_silhouette
import gc
from topicpy.hsbmpy import color_iterator
import logging

# Notebook-wide logger. It is created here because no visible import provides
# a `log` object; a stream handler is attached so INFO/DEBUG records are shown.
log = logging.getLogger("silhouette_notebook")
if not log.handlers:  # avoid stacking handlers when the cell is re-run
    log.addHandler(logging.StreamHandler())
log.setLevel("DEBUG")

In [None]:
# Flat array of gene names used below to filter the expression table.
genelist = pd.read_csv("https://stephenslab.github.io/count-clustering/project/utilities/gene_names_all_gtex.txt", header=None).values.ravel()
# Sample annotations restricted to the sample id and the two SMTS* columns.
df_file = pd.read_csv("https://storage.googleapis.com/gtex_analysis_v8/annotations/GTEx_Analysis_v8_Annotations_SampleAttributesDS.txt", sep='\t').loc[:,['SAMPID','SMTS', 'SMTSD']]
# Index by sample id so later cells can look samples up with df_file.at[...].
df_file.set_index('SAMPID', inplace=True)

In [None]:
#GTEx
# Stream the expression table in 1000-row chunks, keep only rows whose
# truncated id is in `genelist`, and concatenate once at the end.
df_generator = pd.read_csv('../GTEx_Analysis_2017-06-05_v8_RNASeQCv1.1.9_gene_tpm.gct', skiprows=2, sep='\t', chunksize=1000)

log.info("read generator")

filtered_chunks = []
n_rows = 0
for subdf in df_generator:
    # The row id is the first 15 characters of the "Name" column.
    subdf['ensg'] = [x[:15] for x in subdf['Name']]
    subdf = subdf.set_index("ensg").drop(["Name", "Description"], axis=1)
    # Boolean-mask selection (not reindex) so duplicated truncated ids do not raise.
    kept = subdf.loc[subdf.index.isin(genelist)]
    filtered_chunks.append(kept)
    if len(filtered_chunks) == 1:
        log.info("first chunk OK")
    else:
        log.debug("new chunk")
    n_rows += len(kept)
    log.info((n_rows, kept.shape[1]))

# Single concat instead of growing the frame chunk by chunk.
df = pd.concat(filtered_chunks)

In [None]:
# Peek at the first two rows of the filtered expression table.
df.head(n=2)

In [None]:
# Annotated sample ids that are also columns of the expression table.
# Built as a list (not a generator) so cells using `files` can be re-run.
files = [file for file in df_file.index if file in df.columns]

In [None]:
# Restrict the table to the sample columns collected in `files`.
df = df.loc[:, list(files)]
df.head()

In [None]:
# Seeded subsample: 3000 columns first, then 5000 rows.
df = (
    df
    .sample(n=3000, axis=1, random_state=42)
    .sample(5000, axis=0, random_state=42)
)

In [None]:
# Samples become rows of the matrix; NaN entries are replaced by 0.
data = df.T.values
data[np.isnan(data)] = 0
# Keep the labels before dropping the frame.
files, genes = df.columns, df.index
del df

In [None]:
# Show which annotation columns are available in df_file.
df_file.columns

In [None]:
# Look up the 'SMTS' annotation of every retained sample, then encode it as
# integer labels; `classes` holds the unique annotation values.
sample_annotations = [df_file.at[sample, 'SMTS'] for sample in files]
classes, cluster_labels = np.unique(sample_annotations, return_inverse=True)
n_clusters = cluster_labels.max() + 1

In [None]:
# log2(x + 1) transform; any infinite result is replaced by log2(1e6 + 1).
data = np.log2(data + 1)
infinite_entries = np.isinf(data)
data[infinite_entries] = np.log2(1e6+1)
#np.random.shuffle(cluster_labels)

In [None]:
# Dimensions of the data matrix.
np.shape(data)

In [None]:
# Evaluate the gnuplot colormap at 2 / n_clusters.
# NOTE(review): `cm` is not imported in the visible cells — confirm its origin.
cm.gnuplot(2.0 / n_clusters)

In [None]:
# Silhouette analysis of the data matrix under the "SMTS" labelling.
for d, label in [(data, "SMTS")]:
    make_silhouette(d, label, classes, cluster_labels, n_clusters)

In [None]:
# NOTE(review): red_data, lda_data, emb_data, hsbm_data and tm_data are only
# assigned in cells further down, so this cell needs those cells run first.
print(red_data.shape)
print(lda_data.shape)
print(emb_data.shape)
print(hsbm_data.shape)
print(tm_data.shape)
# NOTE(review): make_silhouette is called with five arguments elsewhere in this
# notebook; confirm that this two-list form is supported.
embeddings = [data, red_data, lda_data, emb_data, hsbm_data, tm_data]
embedding_names = ['data', 'pca', 'lda', 'tsne', 'hsbm', 'tm']
make_silhouette(embeddings, embedding_names)

## PCA

In [None]:
from sklearn.manifold import MDS
from sklearn.decomposition import PCA

In [None]:
# Two-component PCA model.
model = PCA(n_components=2)

In [None]:
# Fit the model on the data matrix and keep the transformed samples.
red_data = model.fit_transform(data)

In [None]:
#%matplotlib notebook
# Scatter of the first two columns of red_data, one colour per label.
# NOTE(review): `plt` and `colors_cycle` are not defined in the visible cells —
# confirm where they come from.
fig = plt.figure(figsize=(15, 15))
for i in range(cluster_labels.max() + 1):
    members = red_data[cluster_labels == i]
    plt.scatter(members[:, 0], members[:, 1], label=classes[i], c=colors_cycle[i])
plt.legend(ncol=4, fontsize=10)
#plt.xlim(red_data.T[0].min(),red_data.T[0].max())
plt.tick_params(labelsize=20)
plt.show()
fig.savefig("mds_02.pdf")

In [None]:
import pickle

In [None]:
# Persist the current model so it can be reloaded without refitting.
with open("pca_model.pkl", 'wb') as f:
    pickle.dump(model, f)

In [None]:
# Reload the pickled model. pickle.load executes arbitrary code, so only use
# it on a file this notebook wrote itself.
with open("pca_model.pkl", 'rb') as model_file:
    model = pickle.load(model_file)

In [None]:
# Transform the data matrix with the (re)loaded model.
red_data = model.transform(
    data
)

## kmeans

In [None]:
from sklearn.cluster import KMeans

In [None]:
# Seeded k-means with as many clusters as there are label classes.
model = KMeans(
    n_clusters=n_clusters,
    n_init=10,
    random_state=42,
)

In [None]:
# Overwrites the earlier labels with the model's cluster assignment;
# `classes` becomes the sorted unique cluster ids.
cluster_labels = model.fit_predict(data)
classes = np.unique(cluster_labels)

## hierarchical

In [None]:
from sklearn.cluster import AgglomerativeClustering
import scipy.cluster.hierarchy as shc

In [None]:
# Average-linkage hierarchical clustering of the samples, drawn as a dendrogram.
fig = plt.figure()
Z = shc.linkage(data, method='average')
dend = shc.dendrogram(
    Z,
    leaf_rotation=90.,
    leaf_font_size=8.,
)
plt.xlabel("samples", fontsize=16)
plt.show()
fig.savefig("dendogram.pdf")

In [None]:
# Cut the linkage tree at height 20000 and count the samples per branch.
cut = np.ravel(shc.cut_tree(Z, height=20000))
np.unique(cut, return_counts=True)

In [None]:
# Complete-linkage agglomerative clustering with cosine distance.
# Newer scikit-learn names the distance argument `metric`; older releases only
# know `affinity`, so fall back when `metric` is rejected.
try:
    model = AgglomerativeClustering(n_clusters=n_clusters, metric='cosine', linkage='complete')
except TypeError:
    model = AgglomerativeClustering(n_clusters=n_clusters, affinity='cosine', linkage='complete')

In [None]:
# Cluster only the samples that fell into branch 0 of the tree cut.
in_first_branch = cut == 0
cluster_labels = model.fit_predict(data[in_first_branch])
classes = np.unique(cluster_labels)

In [None]:
# Imported here because this is the first use of the helper in the notebook.
from master_thesis.hsbmpy import out_to_file

# Write the labels for the branch-0 samples under the name 'hier'.
out_to_file(cluster_labels, files[cut==0], name='hier')

## LDA

In [None]:
from sklearn.decomposition import LatentDirichletAllocation

In [None]:
# One topic per label class; seeded like the other stochastic steps in this
# notebook so the fit is reproducible.
lda = LatentDirichletAllocation(n_components=n_clusters, random_state=42)

In [None]:
# Fit on the data matrix; each sample is labelled by the column with the
# largest value in its row of lda_data.
lda_data = lda.fit_transform(data)
cluster_labels = lda_data.argmax(axis=1)
classes = np.arange(0, cluster_labels.max() + 1)

In [None]:
# Scatter of columns 3 and 4 of lda_data, one colour per label.
fig = plt.figure(figsize=(15, 15))
for i in range(cluster_labels.max() + 1):
    members = lda_data[cluster_labels == i]
    plt.scatter(members[:, 3], members[:, 4], label=classes[i], c=colors_cycle[i])
plt.legend(ncol=4, fontsize=10)
#plt.xlim(red_data.T[0].min(),red_data.T[0].max())
plt.tick_params(labelsize=20)
plt.show()
fig.savefig("lda_02.pdf")

## hSBM

In [None]:
from master_thesis.hsbm.sbmtm import sbmtm

In [None]:
# Fresh sbmtm model instance.
hsbm = sbmtm(
)

In [None]:
# Wrap the data matrix in a DataFrame and build the model's graph from it.
bow_frame = pd.DataFrame(data=data)
hsbm.make_graph_from_BoW_df(bow_frame)

In [None]:
# Save the graph to disk.
hsbm.save_graph(
    "graph.xml.gz"
)

In [None]:
# Fit the model on the graph built above.
hsbm.fit(
)

In [None]:
# Load the level-3 topic distribution, align its rows with the first 24
# characters of each sample id, drop the 'i_doc' column and fill missing rows
# with 0. `axis` is passed by keyword because pandas 2 rejects a positional axis.
hsbm_data = (
    pd.read_csv("topsbm/topsbm_level_3_topic-dist.csv")
    .set_index('doc')
    .reindex(index=[s[:24] for s in files])
    .drop('i_doc', axis=1)
    .fillna(0)
    .values
)

In [None]:
# Scatter of the first two columns of hsbm_data, one colour per label.
fig = plt.figure(figsize=(15, 15))
for i in range(cluster_labels.max() + 1):
    members = hsbm_data[cluster_labels == i]
    plt.scatter(members[:, 0], members[:, 1], label=classes[i], c=colors_cycle[i])
plt.legend(ncol=4, fontsize=10)
#plt.xlim(red_data.T[0].min(),red_data.T[0].max())
plt.tick_params(labelsize=20)
plt.show()
fig.savefig("hsbm_02.pdf")

## t-SNE

In [None]:
from sklearn.manifold import TSNE

In [None]:
# Two-dimensional t-SNE; seeded like the other stochastic steps in this
# notebook so the embedding is reproducible.
tsne = TSNE(n_components=2, random_state=42)

In [None]:
# Embed the data matrix with t-SNE.
emb_data = tsne.fit_transform(
    data
)

In [None]:
# Scatter of the two columns of emb_data, one colour per label.
fig = plt.figure(figsize=(15, 15))
for i in range(cluster_labels.max() + 1):
    members = emb_data[cluster_labels == i]
    plt.scatter(members[:, 0], members[:, 1], label=classes[i], c=colors_cycle[i])
plt.legend(ncol=4, fontsize=10)
#plt.xlim(red_data.T[0].min(),red_data.T[0].max())
plt.tick_params(labelsize=20)
plt.show()
fig.savefig("tsne_02.pdf")

## TM

In [None]:
# Load the level-0 topic distribution, align its rows with the sample ids,
# drop the 'i_doc' column and fill missing rows with 0. `axis` is passed by
# keyword because pandas 2 rejects a positional axis.
tm_data = (
    pd.read_csv("tm/tm_level_0_topic-dist.csv")
    .set_index('doc')
    .reindex(index=files)
    .drop('i_doc', axis=1)
    .fillna(0)
    .values
)

In [None]:
# Scatter of columns 0 and 2 of tm_data, one colour per label.
fig = plt.figure(figsize=(15, 15))
for i in range(cluster_labels.max() + 1):
    members = tm_data[cluster_labels == i]
    plt.scatter(members[:, 0], members[:, 2], label=classes[i], c=colors_cycle[i])
plt.legend(ncol=4, fontsize=10)
#plt.xlim(red_data.T[0].min(),red_data.T[0].max())
plt.tick_params(labelsize=20)
plt.show()
fig.savefig("tm_02.pdf")

In [None]:
from master_thesis.hsbmpy import out_to_file

In [None]:
# Write the current labels for all samples under the name 'lda'.
out_to_file(
    cluster_labels,
    files,
    name='lda',
)

In [None]:
# Reload the sample annotations, indexed by sample id.
# NOTE(review): this reads the v7 annotation file, while the top of the
# notebook reads v8 — confirm that the switch is intended.
df_file = (
    pd.read_csv("https://storage.googleapis.com/gtex_analysis_v7/annotations/GTEx_v7_Annotations_SampleAttributesDS.txt", sep='\t')
    .loc[:, ['SAMPID', 'SMTS', 'SMTSD']]
    .set_index('SAMPID')
)

In [None]:
# Export the annotations of the retained samples, in `files` order.
sample_annotations_out = df_file.reindex(index=files)
sample_annotations_out.to_csv("files.dat")