In [1]:
import matplotlib.pyplot as plt  
import pandas as pd  
import numpy as np 
import seaborn as sns
import os, sys
from hsbmpy import get_file, define_labels, get_cluster_given_l, get_max_available_L
from geneontology import topic_analysis
import tensorflow as tf
from sklearn.metrics import homogeneity_completeness_v_measure
import scipy.cluster.hierarchy as shc
from sklearn.cluster import AgglomerativeClustering
from sklearn.decomposition import LatentDirichletAllocation
from lda import lda

  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])
  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])


In [2]:
# Switch the working directory to the thesis project folder.
thesis_dir = "/home/fvalle/phd/master_thesis"
os.chdir(thesis_dir)

In [3]:
# Run configuration. The commented-out assignments are alternative settings
# kept for reference; only the active ones below take effect.
level = 2
label = 'lda'
#setup = 'oversigma_10tissue'
#label = 'disease_type'
#label = 'uniq'
#labels = ['primary_site', 'disease_type']
#labels = ['primary_site', 'secondary_site']

# Dataset folder: the relative file names used by the following cells
# (e.g. "mainTable_all.csv", "files.dat") are resolved against it.
directory = r"/home/fvalle/phd/datasets/tcga/TCGA/"
#L=get_max_available_L(directory)
os.chdir(directory)

In [4]:
# Load the count table (first CSV column is the index, first row the header),
# drop every row that has a missing value and cast the counts to int.
df = pd.read_csv(
    "mainTable_all.csv",
    index_col=[0],
    header=[0],
)
df = df.dropna().astype(int)

# Number of columns of the table (one per sample file).
totalobjcets = df.shape[1]
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 15659 entries, ENSG00000000419 to ENSG00000273489
Columns: 11093 entries, a62cfb55-6820-404c-959b-e33a2f732f25.htseq.counts.gz to 89e20bd9-15e5-48e6-9dfe-0b919c817216.htseq.counts.gz
dtypes: int64(11093)
memory usage: 1.3+ GB


In [5]:
# Per-file metadata table, indexed by its first column.
df_files = pd.read_csv(
    "files.dat",
    index_col=[0],
)
df_files.info()

<class 'pandas.core.frame.DataFrame'>
Index: 11093 entries, d1051cf3-e258-4250-b31e-26a1711c90ee.htseq.counts.gz to dd1d19b2-681b-447b-b16f-f9b0bf4f4c30.htseq.counts.gz
Data columns (total 4 columns):
primary_site         11093 non-null object
tcga_id              11093 non-null object
disease_type         11093 non-null object
primary_diagnosis    11053 non-null object
dtypes: object(4)
memory usage: 433.3+ KB


In [6]:
# Reference label ('primary_site') for every column of df, in column order.
true_out = []
for sample in df.columns.values:
    try:
        true_out.append(get_file(sample, df_files)['primary_site'])
    except Exception as err:
        # Best effort: report the problem and store a placeholder so that
        # true_out stays aligned with df.columns. Catching Exception (instead
        # of a bare except) lets Ctrl-C / kernel interrupts still stop the loop.
        print(err)
        true_out.append('unknown')

In [7]:
# Collects the metric lists of each clustering method, keyed by method name
# (later cells add the 'hierarchical' and 'hierhsbm' entries).
scores = dict()

In [8]:
# Cluster counts to test: one integer per line of clustersizes.txt.
# Splitting on any whitespace keeps the last value even when the file has no
# trailing newline and ignores blank lines.
try:
    with open("clustersizes.txt", 'r') as f:
        xl = np.array(f.read().split()).astype(int)
except (OSError, ValueError):
    # File missing/unreadable or not made of integers: fall back to five
    # evenly spaced sizes between 2 and 50.
    xl = np.linspace(2, 50, 5, dtype=int)
xl

array([ 2, 14, 26, 38, 50])

In [9]:
# Shape of the transposed table (df's columns become the rows), i.e. the shape
# of the matrix passed to the clustering below. Reversing df.shape gives the
# same tuple without materialising the transposed frame.
df.shape[::-1]

(11093, 15659)

## hierarchical

In [None]:
# Ward-linkage dendrogram over the columns of df (one leaf per column).
linkage_matrix = shc.linkage(df.T.values, method='ward')

fig = plt.figure()
dend = shc.dendrogram(
    linkage_matrix,
    leaf_rotation=90.,
    leaf_font_size=8.,
)
plt.xlabel("samples", fontsize=16)
plt.show()
fig.savefig("hierarchical_dendogram.pdf")

In [None]:
#hierarchical
# Complete-linkage agglomerative clustering of the columns of df, repeated for
# every cluster count in xl. For each run the cluster membership is written to
# hierarchical/hierarchical_level_<i>_clusters.csv (one column per cluster) and
# homogeneity / completeness / V-measure against true_out are accumulated.
scores['hierarchical'] = {
    'h': [],
    'c': [],
    'V': []
}
print("hierarchical")
os.makedirs('hierarchical', exist_ok=True)

# The distance keyword is left at its default (euclidean): it was renamed from
# `affinity` to `metric` in newer scikit-learn, so omitting it works on both.
hierarchical_model = AgglomerativeClustering(n_clusters=1, linkage='complete')

# Loop-invariant: build the (columns x rows) matrix once instead of per run.
samples_matrix = df.T.values

for l, x in enumerate(xl):
    print("testing with %d clusters" % x)
    hierarchical_model.n_clusters = x
    out = hierarchical_model.fit_predict(samples_matrix)

    #save clusters
    print("saving clusters")
    df_clusters = pd.DataFrame(index=np.arange(totalobjcets))
    for c in range(out.max() + 1):
        # Boolean mask instead of np.argwhere: indexing an Index with a 2-D
        # array is rejected by newer pandas.
        c_objects = df.columns[out == c].values
        # Assigning a Series aligns on the row index, so shorter clusters are
        # padded with NaN automatically.
        df_clusters["Cluster %d" % (c + 1)] = pd.Series(c_objects)
    df_clusters = df_clusters.dropna(axis=0, how='all')
    df_clusters.to_csv("hierarchical/hierarchical_level_%d_clusters.csv"%(l), index=False, header=True)

    score = homogeneity_completeness_v_measure(true_out, out)
    scores['hierarchical']['h'].append(score[0])
    scores['hierarchical']['c'].append(score[1])
    scores['hierarchical']['V'].append(score[2])

pd.DataFrame(data=scores['hierarchical']).to_csv("hierarchical.scores", header=True, index=False)

hierarchical
testing with 2 clusters
saving clusters
testing with 14 clusters


## LDA

In [None]:
# Topic counts to test: one integer per line of topicsizes.txt.
# Splitting on any whitespace keeps the last value even when the file has no
# trailing newline and ignores blank lines.
with open("topicsizes.txt", 'r') as f:
    tl = np.array(f.read().split()).astype(int)

In [None]:
#xl = [2,3,4,5]
# Start with an empty Sigmas list and reuse the cluster sizes as topic sizes
# (this replaces any tl value assigned by a previous cell).
Sigmas = []
tl = xl

In [None]:
# Project LDA wrapper: build it with 12 jobs and verbose level 2, then run its
# full analysis on the dataset folder for the sizes in xl and tl.
model = lda(
    n_jobs=12,
    verbose=2,
)
model.full_analysis(directory, xl, tl)

In [None]:
# Overlaid, density-normalised step histograms (log y axis) of every column of
# df_D, saved as a PDF inside `directory`.
# NOTE(review): df_D is not assigned in any visible cell -- confirm where it
# is expected to come from before relying on a fresh-kernel run.
hist_style = dict(histtype='step', density=True, lw=2)

fig, ax = plt.subplots(figsize=(20, 10))
for topic in df_D.columns:
    df_D[topic].hist(ax=ax, label=topic, **hist_style)

ax.set_yscale('log')
ax.set_xlabel('$D^g[k]$', fontsize=20)
ax.tick_params(labelsize=20)
ax.tick_params(axis='x', rotation=60)
ax.legend(ncol=5)

plt.show()
fig.savefig("%s/distinctivness.pdf" % directory)

In [None]:
# Run the project's topic_analysis helper on the dataset folder for the 'lda'
# results; the meaning of the second argument (3) is defined by that helper.
topic_analysis(
    directory,
    3,
    'lda',
)

In [None]:
# Display Sigmas.
# NOTE(review): the visible cells only create it as an empty list and never
# append to it -- confirm what is expected to fill it.
Sigmas

In [None]:
# Homogeneity, completeness and V-measure of `out` against the reference labels.
# NOTE(review): no visible cell in this LDA section assigns `out`; it is
# whatever the last executed clustering cell left behind -- confirm that this
# scores the intended labels.
homogeneity_completeness_v_measure(true_out, out)

## LDA Mallet

In [None]:
from gensim.models.wrappers import LdaMallet
from gensim.corpora.dictionary import Dictionary
from gensim.test.utils import get_tmpfile
from gensim.corpora import MmCorpus

In [None]:
# One document per column of df: a list of (row position, value) pairs covering
# every row of the table, in row order.
n_rows = len(df.index)
corpus = [list(zip(range(n_rows), df[sample])) for sample in df.columns]

# Show only a small preview (first 5 pairs of the first 3 documents); echoing
# the whole corpus would print one pair per cell of the table.
[doc[:5] for doc in corpus[:3]]

In [None]:
# Build the gensim Dictionary from a single "document" holding every row label
# of df, then list its (id, token) items.
# NOTE(review): the corpus above uses row positions as ids -- confirm they
# coincide with the ids this Dictionary assigns.
dictionary = Dictionary([df.index])
list(dictionary.items())

In [None]:
# Write the corpus (with its dictionary) in Matrix Market format to the path
# returned by get_tmpfile.
out_file = get_tmpfile(
    "/home/fvalle/phd/datasets/tcga/oversampling_10tissue/corpus.mm"
)
MmCorpus.serialize(out_file, corpus, dictionary)

In [None]:
# Create the Mallet LDA wrapper with 15 topics, using the local Mallet binary
# and 5 workers.
model = LdaMallet(
    "/home/fvalle/phd/Mallet/bin/mallet",
    corpus=corpus,
    id2word=dictionary,
    num_topics=15,
    workers=5,
)

In [None]:
# Topic-word table: one row per dictionary token, one column per topic.
# The number of topic columns is taken from the matrix itself rather than
# hard-coded, so the labels always match the number of topics in the model
# (the model above is built with num_topics=15, not 5).
topic_word = model.get_topics()
df_topics = pd.DataFrame(
    data=topic_word.T,
    index=[a[1] for a in dictionary.items()],
    columns=["Topic %d" % (t + 1) for t in range(topic_word.shape[0])],
)
df_topics

In [None]:
# Document-topic table read from the file reported by model.fdoctopics():
# first column is the index, column 1 is discarded, the remaining columns are
# labelled "Topic 1..N" and the rows are relabelled with df's column names.
df_topic_distr = pd.read_csv(model.fdoctopics(), sep='\t', header=None, index_col=0)
# axis passed by keyword: newer pandas no longer accepts it positionally.
df_topic_distr = df_topic_distr.drop(1, axis=1)
# Label as many topics as the file actually contains instead of assuming 5.
df_topic_distr.columns = ["Topic %d" % (t + 1) for t in range(df_topic_distr.shape[1])]
df_topic_distr.index = df.columns
df_topic_distr

In [None]:
# For every row, take the label of the column with the largest value
# ("Topic N") and keep only the part after the space ("N").
dominant_topic_label = df_topic_distr.idxmax(axis=1)
dominant_topic_label.map(lambda name: name.split(" ")[1])

## hierarchical on Altmann's output

In [None]:
# Ward-linkage agglomerative model; n_clusters is overwritten before each fit
# in the loop below. The distance keyword is left at its default (euclidean,
# the only one ward supports): it was renamed from `affinity` to `metric` in
# newer scikit-learn, so omitting it works on both old and new versions.
hiermodel = AgglomerativeClustering(n_clusters=10, linkage='ward')

In [None]:
# Cluster counts to test: one integer per line of clustersizes.txt.
# Splitting on any whitespace keeps the last value even when the file has no
# trailing newline and ignores blank lines.
with open('clustersizes.txt') as f:
    xl = np.array(f.read().split()).astype(int)

In [None]:
# Create the output folder (no error if it already exists) without shelling out.
os.makedirs("hierhsbm", exist_ok=True)

In [None]:
# Reference label ('primary_site') for every row of the level-0 topsbm
# topic-distribution table, in row order.
level0_topic_dist = pd.read_csv(
    "%s/%s_level_%d_topic-dist.csv"%('topsbm','topsbm',0), index_col=1
).drop('i_doc', axis=1)

true_out = []
for sample in level0_topic_dist.index.values:
    try:
        true_out.append(get_file(sample, df_files)['primary_site'])
    except Exception as err:
        # Best effort: report the exception type and store an empty label so
        # that true_out stays aligned with the table rows. Catching Exception
        # (instead of a bare except) lets kernel interrupts stop the loop.
        print(type(err))
        true_out.append('')

In [None]:
# Agglomerative clustering of the topsbm topic distributions: for the i-th
# entry of xl, the level-i topic-distribution table is clustered into that many
# groups, the membership is written to hierhsbm/hierhsbm_level_<i>_clusters.csv
# (one column per cluster) and homogeneity / completeness / V-measure against
# true_out are accumulated and finally saved.
scores['hierhsbm'] = {
    'h': [],
    'c': [],
    'V': []
}
for l, n_clusters in enumerate(xl):
    print("Fitting level %d with %d clusters"%(l, n_clusters))
    df_topics = pd.read_csv("%s/%s_level_%d_topic-dist.csv"%('topsbm','topsbm',l), index_col=1).drop('i_doc', axis=1)

    hiermodel.n_clusters = n_clusters
    out = hiermodel.fit_predict(df_topics.values)

    df_clusters = pd.DataFrame(
        index=np.arange(len(df_topics.index)),
        columns=["Cluster %d" % c for c in np.arange(n_clusters) + 1],
    )
    for c in range(out.max() + 1):
        # Boolean mask instead of np.argwhere: indexing an Index with a 2-D
        # array is rejected by newer pandas.
        c_objects = df_topics.index[out == c].values
        # Assigning a Series aligns on the row index, so shorter clusters are
        # padded with NaN automatically.
        df_clusters["Cluster %d" % (c + 1)] = pd.Series(c_objects)
    df_clusters = df_clusters.dropna(axis=0, how='all')
    df_clusters.to_csv("hierhsbm/hierhsbm_level_%d_clusters.csv"%(l), index=False, header=True)

    #metrics
    print("saving metrics")
    score = homogeneity_completeness_v_measure(true_out, out)
    scores['hierhsbm']['h'].append(score[0])
    scores['hierhsbm']['c'].append(score[1])
    scores['hierhsbm']['V'].append(score[2])

pd.DataFrame(data=scores['hierhsbm']).to_csv("%s/hierhsbm.scores"%directory, header=True, index=False)