In [None]:
import requests as rq
import json
import pandas as pd
import numpy as np
from textwrap import wrap
from matplotlib import pyplot as plt
from TCGA_files import *
from ensembleAPI import geneinfo, genesinfo
from IPython.display import HTML
from sklearn import metrics

In [None]:
# GDC REST endpoint for case-level metadata.
cases_endpt = 'https://api.gdc.cancer.gov/cases'

# Comma-separated list of case fields, in the form the GDC API expects for
# its `fields` query parameter.
fields = ','.join((
    "submitter_id",
    "case_id",
    "primary_site",
    "disease_type",
    "project.project_id",
    "diagnoses.tumor_stage",
    "diagnoses.tumor_grade",
    "diagnoses.primary_diagnosis",
    "diagnoses.classification_of_tumor",
    "annotations.classification",
    "samples.tumor_code",
))

# Query file

See the [GDC file fields reference](https://docs.gdc.cancer.gov/API/Users_Guide/Appendix_A_Available_Fields/#file-fields) for the *fields* that can be requested.

In [None]:
# Look up a single FPKM expression file by its file name.
# NOTE(review): queryFile is not defined in this notebook; presumably it comes
# from the star import of TCGA_files — confirm what it returns/displays.
queryFile('007f49fe-d251-4ae4-9313-86d765b13499.FPKM.txt.gz')

In [None]:
# Load the local metadata table: the first column becomes the index and the
# first row the column names. Later cells look samples up with
# df_files.loc[<cluster member>], so the index is presumably the sample/file
# identifier used in the cluster files — TODO confirm.
df_files = pd.read_csv("files.txt", index_col=[0], header=[0])

## query many

In [None]:
# --- Run configuration -------------------------------------------------------
# Name of the hSBM run directory under results/hSBM/. Alternatives tried so far:
#setup='5000doc_0.5occ_fpkmweight_thr'
#setup = 'overlap_1000doc_0.5occ_thr_tfidf'
setup = 'overlap_1000doc_0.5occ_thr'
# Hierarchy level whose cluster file is loaded.
level = 0
# Metadata column used as ground truth in the benchmark cells.
#label='primary_site'
label = 'disease_type'

# One column per cluster; each column lists the members of that cluster.
clusters_csv = "results/hSBM/%s/topsbm/topsbm_level_%d_clusters.csv"%(setup, level)
df_clusters = pd.read_csv(clusters_csv, header=[0])

In [None]:
# Map cluster number (column position) -> array of its members, with the NaN
# padding of shorter columns removed.
cluster = {
    position: df_clusters[column].dropna().values
    for position, column in enumerate(df_clusters.columns)
}

In [None]:
# Draw the label composition of the first few clusters only; plotting every
# cluster would flood the notebook output.
MAX_PIE_CLUSTERS = 7  # same clusters as the former `if i > 5: break` guard

for c in list(cluster)[:MAX_PIE_CLUSTERS]:
    print(c)
    # Alternative data source: fetch the metadata from the API instead, e.g.
    # queryFiles([f+".FPKM.txt.gz" for f in cluster[c]]).

    # Only keep members that have a metadata row. Passing labels missing from
    # the index to .loc raises KeyError on pandas >= 1.0, and the benchmark
    # cell below already treats such samples as skippable.
    known_samples = [f for f in cluster[c] if f in df_files.index]
    if not known_samples:
        print("no metadata for any sample in cluster %s" % c)
        continue
    datatotest = df_files.loc[known_samples]
    makePie(datatotest, level, c, ['primary_site','disease_type'])

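# Benchmark

Compare the cluster assignments with the annotated labels using scikit-learn's clustering metrics; see https://scikit-learn.org/stable/modules/clustering.html for their definitions.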

In [None]:
# Build two parallel lists, one entry per sample that has metadata:
#   true_labels[k]      -> annotated value of `label` for the sample
#   predicted_labels[k] -> id of the cluster the sample was assigned to
true_labels = []
predicted_labels = []
for c in cluster:
    print(c)
    for sample in cluster[c]:
        try:
            sample_label = df_files.loc[sample, label]
        except KeyError:
            # Sample (or the label column) is absent from the metadata table:
            # report it and leave it out of both lists so they stay aligned.
            # Only KeyError is caught so that interrupts and genuine bugs
            # are no longer silently swallowed.
            print("error in %s"%sample)
            continue
        true_labels.append(sample_label)
        predicted_labels.append(c)

In [None]:
# Encode the annotated labels as integer codes: element k is the position of
# true_labels[k] in the sorted array of unique labels.
labels_true = np.unique(true_labels, return_inverse=True)[1]

In [None]:
# Clustering-quality scores for the current (setup, level, label) combination,
# collected in one named Series so it can be stored as a single row.
# The Series is built in one step with an explicit float dtype: creating an
# empty Series without a dtype is deprecated in pandas 1.x and yields an
# object-dtype Series in pandas 2.x.
series_metrics = pd.Series(
    {
        'MI': metrics.cluster.normalized_mutual_info_score(labels_true, predicted_labels, average_method='arithmetic'),
        'Homogeneity': metrics.cluster.homogeneity_score(labels_true, predicted_labels),
        # 'Completness' (sic): spelling kept because it is used as a column name.
        'Completness': metrics.cluster.completeness_score(labels_true, predicted_labels),
        'Vmeasure': metrics.cluster.v_measure_score(labels_true, predicted_labels),
        'Fowlkes-Mallows': metrics.cluster.fowlkes_mallows_score(labels_true, predicted_labels),
        'AdjustedRandIndex': metrics.cluster.adjusted_rand_score(labels_true, predicted_labels),
    },
    name=setup+'__level_%d_%s'%(level,label),
    dtype=float,
)

In [None]:
# Display the scores computed for the current setup/level/label.
series_metrics

In [None]:
#df_metrics = pd.DataFrame(columns=['AdjustedRandIndex','MI','Homogeneity','Completness','Vmeasure','Fowlkes-Mallows'])

In [None]:
# Add the current scores as a new row of results/hSBM/metrics.csv.
# The existing table is read from disk here, so this cell no longer depends
# on a `df_metrics` variable left over from a previous run.
metrics_row = series_metrics.to_frame().T  # one row, indexed by the Series name

try:
    metrics_previous = pd.read_csv("results/hSBM/metrics.csv", index_col=[0], header=[0])
except FileNotFoundError:
    # First run: the new row is the whole table.
    metrics_updated = metrics_row
else:
    # verify_integrity=True raises ValueError if a row with the same
    # setup/level/label name is already stored, instead of duplicating it.
    # pd.concat replaces DataFrame.append, which was removed in pandas 2.0.
    metrics_updated = pd.concat([metrics_previous, metrics_row], verify_integrity=True)

metrics_updated.to_csv("results/hSBM/metrics.csv", index=True,header=True)

In [None]:
# Reload every stored benchmark row and rank them, best first, by MI and then
# by Homogeneity.
df_metrics = (
    pd.read_csv("results/hSBM/metrics.csv", index_col=[0],header=[0])
    .sort_values(by=['MI','Homogeneity'], ascending=False, axis=0)
)

# Show the ranked table with two decimals (df_metrics keeps full precision).
df_metrics.round(decimals=2)

In [None]:
# Render the ranked metrics table as a stand-alone figure and export it to PDF.
# `plt` is the pyplot module imported in the notebook's import cell.
fig, ax = plt.subplots(figsize=(24, 20))  # canvas large enough for the table

# Only the table should be visible: hide both axes and the surrounding frame.
ax.xaxis.set_visible(False)
ax.yaxis.set_visible(False)
ax.set_frame_on(False)

metrics_rounded = df_metrics.round(decimals=2)
column_widths = [0.12]*len(df_metrics.columns)
tabla = pd.plotting.table(ax, metrics_rounded, loc='upper right', colWidths=column_widths)

# Use a fixed font size instead of auto-fitting; if the font size is
# increased, widen the columns as well.
tabla.auto_set_font_size(False)
tabla.set_fontsize(12)
tabla.scale(1, 2)  # scale factors passed as (x, y): x unchanged, y doubled

fig.savefig('results/hSBM/metrics.pdf', transparent=True)

In [None]:
# Cross-tabulate the annotated labels against the cluster assignments
# (sample counts for every label/cluster pair).
metrics.cluster.contingency_matrix(true_labels, predicted_labels)

In [None]:
# Show the metadata rows whose primary site is 'Brain'.
is_brain = df_files['primary_site'].eq('Brain')
df_files[is_brain]