In [None]:
import os
import glob
import pickle
import sys  

sys.path.insert(0, '../py')
from graviti import *

import numpy as np
import pandas as pd

from  matplotlib import pyplot
from matplotlib.colors import ListedColormap
from sklearn import neighbors, datasets
import seaborn

import umap

import warnings
warnings.filterwarnings('ignore')

In [None]:
# Load the 2d umap projection for BRCA
umap_proj_filename = '../data/descriptor_withI.umap.csv'
umap_xy = pd.read_csv(filepath_or_buffer=umap_proj_filename)

# Parse the sample id in order to compare with the subtype table:
# keep the part of `sample` that precedes its last three '-'-separated fields.
umap_xy = umap_xy.assign(
    **{'Sample.ID': umap_xy['sample'].str.rsplit('-', n=3, expand=True)[0]}
)

In [None]:
# Load the molecular subtype table from literature.
# header=1 tells pandas to take the column names from the second line of the file.
subtypes_filename = '../data/subtypes.csv'
subtypes = pd.read_csv(filepath_or_buffer=subtypes_filename, header=1)

In [None]:
#print(subtypes.head())
#print(set(subtypes['Tumor.Type']))
#print(subtypes.shape)

#print(umap_xy.head())

In [None]:
# Join the projection and the subtype table on their shared 'Sample.ID' key
# (pandas' default inner join), then discard the 'Unnamed: 0' column.
# NOTE(review): 'Unnamed: 0' is presumably a saved row index from one of the CSVs - confirm.
df_merged = (
    umap_xy
    .merge(subtypes, on="Sample.ID")
    .drop(columns='Unnamed: 0')
)
df_merged.columns

In [None]:
# Define a function showing a feature distribution on the umap projection
import seaborn
seaborn.set(style='white')
def show_umap_proj(feature,df):
    """Scatter the 2-D projection coloured by ``feature`` and save it as a PNG.

    Parameters
    ----------
    feature : str
        Name of the column of ``df`` used to colour the points.
    df : pandas.DataFrame
        Table holding the coordinates in columns ``'x'`` and ``'y'`` plus the
        ``feature`` column.

    Returns
    -------
    None
        The figure is written to ``umap_<feature>.png`` in the working directory.

    Raises
    ------
    KeyError
        If ``'x'``, ``'y'`` or ``feature`` is not a column of ``df``.
    """
    dff = df[['x', 'y', feature]]
    counts = dff[feature].value_counts()

    fg = seaborn.FacetGrid(data=dff,
                           hue=feature,
                           height=10, aspect=1
                          )
    fg.map(pyplot.scatter, 'x', 'y', s=50, alpha=0.5).add_legend()

    # Draw on, and save through, the grid's own axes/figure instead of going
    # through the pyplot "current figure": the notebook never binds the name
    # `plt` explicitly (only `pyplot` is imported), and this also cannot pick
    # up an unrelated figure by accident.
    ax = fg.axes.flat[0]  # no row/col faceting is requested, so this is the only axes
    # Per-category sample counts, placed at fixed data coordinates (-8, -3).
    ax.text(-8.0, -3.0, str(counts),
            horizontalalignment='left', size='medium', color='black')#, weight='semibold')

    filename = 'umap_'+str(feature)
    ax.figure.savefig(filename+'.png')

In [None]:
# Define the feature set: the annotation columns that are passed, one at a
# time, to the plotting functions together with the merged table.
feature_set = [
    'pathologic_stage',
    'BRCA_Pathology',
    'BRCA_Subtype_PAM50',
    'CNV Clusters',
    'Mutation Clusters',
    'DNA.Methylation Clusters',
    'mRNA Clusters',
    'miRNA Clusters',
    'lncRNA Clusters',
    'Protein Clusters',
    'PARADIGM Clusters',
    'Pan-Gyn Clusters',
]

In [None]:
# Draw and save one coloured projection per entry of the feature set.
for feature in feature_set:
    show_umap_proj(feature=feature, df=df_merged)

In [None]:
# Evaluate the distance structure of the individual clusters considering the neighborhood of each sample
from sklearn.neighbors import NearestNeighbors
import seaborn as sns
import collections
from collections import Counter


def _neighbor_label_pmi(labels, indices):
    """Build the source x target PMI table of labels that share a neighbourhood.

    ``labels`` is a 1-D array with one label per sample; ``indices`` is the
    (n_samples, k) neighbour-index array returned by ``kneighbors``.  For every
    row the label of the first listed neighbour is the *source* and the labels
    of all k listed neighbours are the *targets*.  Pairs in which either label
    is missing are ignored.

    Returns a DataFrame (index ``source``, columns ``target``) holding
    ``log(n_samples * cooccurrence / (count[source] * count[target]))``;
    pairs that never co-occur are NaN.
    """
    n_samples, k = indices.shape
    pairs = pd.DataFrame({
        # repeat each source k times so it lines up with the row-major ravel below
        'source': np.repeat(labels[indices[:, 0]], k),
        'target': labels[indices].ravel(),
    }).dropna()
    if pairs.empty:
        raise ValueError('no labelled neighbour pairs: every label is missing')

    # One pass over all pairs instead of appending one small frame per sample.
    cooccurrence = pairs.groupby(['source', 'target']).size().unstack('target')

    # Number of samples carrying each label (missing labels are not counted).
    label_counts = pd.Series(labels).value_counts()
    expected = np.outer(label_counts.reindex(cooccurrence.index),
                        label_counts.reindex(cooccurrence.columns))
    return np.log(n_samples * cooccurrence / expected)


def show_pmi(df_merged, feature, n_neighbors=10):
    """Plot and save a heatmap of label co-occurrence among nearest neighbours.

    The co-occurrence of two labels is scored with an analogue of the
    pointwise mutual information: the number of times label ``t`` appears in
    the ``n_neighbors`` nearest neighbours (in the ``x``/``y`` plane) of a
    sample labelled ``s``, normalised by the product of the two label counts.

    Parameters
    ----------
    df_merged : pandas.DataFrame
        Table with coordinate columns ``'x'`` and ``'y'`` and the ``feature``
        column.
    feature : str
        Name of the label column to analyse.
    n_neighbors : int, default 10
        Size of the neighbourhood queried for every sample.  The query points
        are the fitted points themselves, so each neighbourhood normally
        includes the sample itself.

    Returns
    -------
    None
        The heatmap is written to ``PMI_<feature>.<n_neighbors>nn.png``.

    Raises
    ------
    KeyError
        If ``'x'``, ``'y'`` or ``feature`` is not a column of ``df_merged``.
    ValueError
        If no sample has a non-missing label.
    """
    #feature = 'BRCA_Subtype_PAM50'

    dff = df_merged[['x', 'y', feature]]
    X = dff[['x', 'y']].to_numpy()
    nbrs = NearestNeighbors(n_neighbors=n_neighbors, algorithm='ball_tree').fit(X)
    _, indices = nbrs.kneighbors(X)

    # Labels are extracted once; the per-sample work is vectorised in the helper.
    pv_pmi = _neighbor_label_pmi(dff[feature].to_numpy(), indices)

    # Plot the point-wise mutual information of the clusters
    filename = 'PMI_' + str(feature) + '.' + str(n_neighbors) + 'nn'

    # Explicit figure/axes: `pyplot` is the name the notebook imports
    # (no cell binds `plt`), and saving via `fig` cannot hit another figure.
    fig, ax = pyplot.subplots(figsize=(10, 10))
    sns.heatmap(pv_pmi, annot=True, fmt=".3", cmap="Blues", ax=ax)
    # NOTE(review): explicit y-limits over all rows, kept from the original;
    # presumably a workaround for heatmap rows being clipped by some
    # matplotlib releases - confirm whether it is still needed.
    ax.set_ylim([0, pv_pmi.shape[0]])
    ax.set_title(filename)
    fig.savefig(filename + '.png')
    #pv_pmi.to_csv(filename+'.csv')

In [None]:
# Build and save the neighbourhood PMI heatmap of every feature,
# echoing each feature name once its heatmap is done.
for feature in feature_set:
    show_pmi(df_merged=df_merged, feature=feature)
    print(feature)