In [None]:
import matplotlib.pyplot as plt  
import pandas as pd  
import numpy as np 
import seaborn as sns
from hsbmpy import get_file, define_labels, get_cluster_given_l
from sklearn.metrics import homogeneity_completeness_v_measure
import scipy.cluster.hierarchy as shc
from sklearn.cluster import AgglomerativeClustering
from sklearn.decomposition import LatentDirichletAllocation

In [None]:
# --- Analysis configuration -------------------------------------------------
# Settings that are not referenced by the cells visible in this section
# (presumably hSBM hierarchy level / depth -- TODO confirm).
level = 2
L = 4

# Metadata column treated as the ground-truth annotation.
# Alternatives tried previously: 'disease_type', 'uniq'
# (or pairs: ['primary_site', 'disease_type'], ['primary_site', 'secondary_site']).
label = 'primary_site'

# Folder holding the input tables; the outputs are written there as well.
# Earlier runs used a `setup` name such as 'oversigma_10tissue' together with
#   "results/hSBM/%s" % setup   or   "gtex/hsbm/%s" % setup
directory = "merged/hsbm"

# Main table: the first CSV column becomes the index, the first row the header.
main_table_path = "%s/mainTable.csv" % (directory)
df = pd.read_csv(main_table_path, index_col=[0], header=[0])
df.head()

In [None]:
# Load the cluster counts to evaluate (one integer per line).
# Splitting on any whitespace -- rather than on '\n' and discarding the last
# element -- keeps the final value when the file lacks a trailing newline and
# tolerates blank lines, which would otherwise fail the int conversion.
with open("%s/clustersizes.txt"%directory,'r') as f:
    xl = np.array(f.read().split()).astype(int)

In [None]:
# Shape of the transposed main table, i.e. (number of df columns, number of
# df rows); reversing df.shape gives the same tuple without building the
# transposed array.
df.shape[::-1]

In [None]:
# Ward-linkage dendrogram over the columns of `df` (one leaf per column).
fig = plt.figure()
ward_links = shc.linkage(df.T.values, method='ward')
dend = shc.dendrogram(
    ward_links,
    leaf_rotation=90.,
    leaf_font_size=8.,
)
plt.xlabel("samples", fontsize=16)
plt.show()
# Write the figure into the data directory as a PDF.
fig.savefig("%s/hierarchical_dendogram.pdf"%directory)

In [None]:
# Per-file metadata table; its first column is used as the index.
files_table_path = "%s/files.dat"%directory
df_files = pd.read_csv(files_table_path, index_col=[0])
df_files.head()

In [None]:
# Score Ward agglomerative clustering of the raw sample vectors against the
# annotation selected by `label`, once for every cluster count in `xl`.
# Homogeneity, completeness and V-measure are collected in
# scores['hierarchical'] and written to "<directory>/hierarchical.scores".
scores = {}
scores['hierarchical']={
    'h':[],
    'c':[],
    'V':[]
}

# The ground-truth annotation does not depend on the number of clusters, so
# it is looked up once instead of once per iteration.
true_out = []
n_unannotated = 0
for sample in df.columns.values:
    try:
        true_out.append(get_file(sample, df_files)[label])
    except Exception:
        # Best effort: samples whose metadata cannot be retrieved share the
        # empty label. `Exception` (not a bare except) keeps Ctrl-C working.
        true_out.append('')
        n_unannotated += 1
if n_unannotated:
    print("warning: %d samples have no '%s' annotation"%(n_unannotated, label))

# One row per sample (columns of `df` are the samples).
samples_matrix = df.T.values

for x in xl:
    print("testing with %d clusters"%x)
    # Ward linkage only supports euclidean distances, which is the default;
    # the `affinity` keyword was removed in scikit-learn 1.4, so it is omitted.
    cluster = AgglomerativeClustering(n_clusters=x, linkage='ward')
    out = cluster.fit_predict(samples_matrix)
    score = (homogeneity_completeness_v_measure(true_out, out))
    scores['hierarchical']['h'].append(score[0])
    scores['hierarchical']['c'].append(score[1])
    scores['hierarchical']['V'].append(score[2])

pd.DataFrame(data=scores['hierarchical']).to_csv("%s/hierarchical.scores"%directory, header=True, index=False)

In [None]:
# Fit a 100-component LDA (fixed seed) on the transposed main table and keep
# the transformed representation of each row in `topics`.
lda = LatentDirichletAllocation(
    n_components=100,
    random_state=42,
)
topics = lda.fit_transform(df.T.values)

In [None]:
# Score Ward agglomerative clustering of the LDA representation (`topics`)
# against the annotation selected by `label`, once for every cluster count in
# `xl`. Homogeneity, completeness and V-measure are collected in scores['lda']
# and written to "<directory>/lda.scores".
scores['lda']={
    'h':[],
    'c':[],
    'V':[]
}

# The ground-truth annotation does not depend on the number of clusters, so
# it is looked up once instead of once per iteration.
true_out = []
n_unannotated = 0
for sample in df.columns.values:
    try:
        true_out.append(get_file(sample, df_files)[label])
    except Exception:
        # Best effort: samples whose metadata cannot be retrieved share the
        # empty label. `Exception` (not a bare except) keeps Ctrl-C working.
        true_out.append('')
        n_unannotated += 1
if n_unannotated:
    print("warning: %d samples have no '%s' annotation"%(n_unannotated, label))

for x in xl:
    # Ward linkage only supports euclidean distances, which is the default;
    # the `affinity` keyword was removed in scikit-learn 1.4, so it is omitted.
    cluster = AgglomerativeClustering(n_clusters=x, linkage='ward')
    out = cluster.fit_predict(topics)
    score = (homogeneity_completeness_v_measure(true_out, out))
    scores['lda']['h'].append(score[0])
    scores['lda']['c'].append(score[1])
    scores['lda']['V'].append(score[2])
pd.DataFrame(data=scores['lda']).to_csv("%s/lda.scores"%directory, header=True, index=False)