In [20]:
import os
import glob
import pickle
import sys  

sys.path.insert(0, '../py')
from graviti import *

import numpy as np
import pandas as pd
from sklearn import neighbors, datasets
from sklearn.neighbors import NearestNeighbors

from  matplotlib import pyplot
from matplotlib.colors import ListedColormap
import seaborn as sns
sns.set(style='white')
import umap

import collections
from collections import Counter

import warnings
warnings.filterwarnings('ignore')

from sklearn.model_selection import cross_val_score, cross_val_predict
from sklearn.model_selection import train_test_split
from sklearn import preprocessing 
from sklearn.neighbors import KNeighborsClassifier
from sklearn import tree
from sklearn.metrics import balanced_accuracy_score

In [2]:
# Define a function showing a feature distribution on the umap projection
def show_umap_proj(feature,df):
    """Scatter the 2D projection colored by `feature` and save it as a PNG.

    Parameters
    ----------
    feature : hashable
        Name of the column of `df` used to color (hue) the points.
    df : pandas.DataFrame
        Table providing the columns 'x', 'y' and `feature`.

    Side effects
    ------------
    Draws a new seaborn FacetGrid figure and writes it to
    'umap_<feature>.png' in the current working directory.
    """
    dff = df[['x','y',feature]]

    # Use the aliases this notebook imports explicitly (`sns`, `pyplot`)
    # instead of `seaborn`/`plt`, which are only defined if a star import
    # happens to leak them into the namespace.
    fg = sns.FacetGrid(data=dff,
                       hue=feature,
                       height=10, aspect=1
                      )
    fg.map(pyplot.scatter, 'x', 'y', s=50, alpha=0.5).add_legend()

    # Annotate the figure with the number of samples per category,
    # anchored at the fixed position (-8, -3).
    pyplot.text(-8.0, -3.0, str(dff[feature].value_counts()),
                horizontalalignment='left', size='medium', color='black')

    filename = 'umap_'+str(feature)
    pyplot.savefig(filename+'.png')

In [3]:
# Evaluate the distance structure of the individual clusters considering the neighborhood of each sample

def show_pmi(df_merged,feature,n_neighbors=10):
    """Plot a PMI-like co-occurrence heatmap of `feature` categories among nearest neighbors.

    For every sample the `n_neighbors` nearest points in the ('x', 'y') plane
    are retrieved and the categories found in that neighborhood are counted.
    Counts are summed per (source, target) category pair, where the source is
    the category of the first returned neighbor (presumably the query sample
    itself, since the model is queried with the data it was fitted on).
    The score of a pair is

        log( N * cooccurrence(source, target) / (count(source) * count(target)) )

    with N the number of rows and count(.) the number of samples per category.

    Parameters
    ----------
    df_merged : pandas.DataFrame
        Table providing the columns 'x', 'y' and `feature`.
    feature : hashable
        Name of the categorical column to analyse.
    n_neighbors : int, default 10
        Size of the neighborhood passed to NearestNeighbors.

    Side effects
    ------------
    Draws a 10x10 inch heatmap and saves it to
    'PMI_<feature>.<n_neighbors>nn.png' in the current working directory.
    """
    dff = df_merged[['x','y',feature]]
    X = dff[['x','y']].to_numpy()
    nbrs = NearestNeighbors(n_neighbors=n_neighbors, algorithm='ball_tree').fit(X)
    _, indices = nbrs.kneighbors(X)

    # Materialize the labels once; positions match the rows of X.
    labels = dff[feature].to_list()

    # Collect the inter-subtype proximity counts as plain records and build
    # the frame once: DataFrame.append was removed in pandas 2.0 and growing a
    # frame row by row is quadratic anyway.
    records = []
    for neighbor_ids in indices:
        neighbor_labels = [labels[i] for i in neighbor_ids]
        source = neighbor_labels[0]
        for target, count in Counter(neighbor_labels).items():
            records.append((source, target, count))
    inter_types_df = pd.DataFrame(records, columns=['source','target','count'])

    pv_data = pd.pivot_table(inter_types_df, index=["source"], columns=["target"], values=["count"], aggfunc="sum")

    c = Counter(labels)
    # `v == v` is False only for NaN, so missing annotations are left out.
    set1 = {s for s in set(inter_types_df.source) if s==s}
    set2 = {t for t in set(inter_types_df.target) if t==t}

    pmi_rows = []
    for s in set1:
        for t in set2:
            num = pv_data.loc[s,('count', t)] # NaN when the pair never co-occurs
            den = c[s]*c[t]
            pmi = np.log(dff.shape[0]*num/den) # the cooccurrence is evaluated using an analogue of the pointwise mutual information
            pmi_rows.append((s,t,num,c[s],c[t],pmi))
    df_pmi = pd.DataFrame(pmi_rows,
                          columns=['source','target','cooccurence','source_count','target_count','pmi'])

    # Plot the point-wise mutual information of the clusters
    filename = 'PMI_'+str(feature)+'.'+str(n_neighbors)+'nn'

    pv_pmi = df_pmi.pivot(index='source',columns='target',values='pmi')
    pyplot.figure(figsize=(10,10))
    ax = sns.heatmap(pv_pmi,annot=True, fmt=".3", cmap="Blues")
    ax.set_ylim([0,len(set1)])
    ax.set_title(filename)
    pyplot.savefig(filename+'.png')

In [4]:
# Read the 2D UMAP projection for BRCA
umap_proj_filename = '../data/descriptor_withI.umap.csv'
umap_xy = pd.read_csv(filepath_or_buffer=umap_proj_filename)

# Parse the sample id so it can be compared with the subtype table:
# split on '-' from the right at most three times and keep the leading piece.
sample_id_parts = umap_xy['sample'].str.rsplit(pat='-', n=3, expand=True)
umap_xy['Sample.ID'] = sample_id_parts[0]

In [5]:
# Read the molecular subtype table taken from the literature.
# header=1: the column names come from the second line of the CSV file.
subtypes_filename = '../data/subtypes.csv'
subtypes = pd.read_csv(filepath_or_buffer=subtypes_filename, header=1)

In [6]:
# Join the projection with the subtype annotations on the shared Sample.ID
# and discard the 'Unnamed: 0' column.
df_merged = (
    umap_xy
    .merge(subtypes, on="Sample.ID")
    .drop(columns='Unnamed: 0')
)
df_merged.columns

Index(['x', 'y', 'sample', 'Sample.ID', 'Tumor.Type',
       'Included_in_previous_marker_papers', 'vital_status', 'days_to_birth',
       'days_to_death', 'days_to_last_followup',
       'age_at_initial_pathologic_diagnosis', 'pathologic_stage',
       'Tumor_Grade', 'BRCA_Pathology', 'BRCA_Subtype_PAM50', 'CESC_Pathology',
       'OV_Subtype', 'UCS_Histology', 'UCEC_Histology', 'MSI_status',
       'HPV_Status', 'tobacco_smoking_history', 'CNV Clusters',
       'Mutation Clusters', 'DNA.Methylation Clusters', 'mRNA Clusters',
       'miRNA Clusters', 'lncRNA Clusters', 'Protein Clusters',
       'PARADIGM Clusters', 'Pan-Gyn Clusters'],
      dtype='object')

In [7]:
# Annotation columns of the merged table that are analysed below
feature_set = [
    'pathologic_stage',
    'BRCA_Pathology',
    'BRCA_Subtype_PAM50',
    'CNV Clusters',
    'Mutation Clusters',
    'DNA.Methylation Clusters',
    'mRNA Clusters',
    'miRNA Clusters',
    'lncRNA Clusters',
    'Protein Clusters',
    'PARADIGM Clusters',
    'Pan-Gyn Clusters',
]

In [None]:
# Draw and save one colored projection per annotation column
for feature in feature_set:
    show_umap_proj(feature=feature, df=df_merged)

In [None]:
# Draw and save one PMI heatmap per annotation column, echoing the column
# name once its heatmap is done
for feature in feature_set:
    show_pmi(df_merged=df_merged, feature=feature)
    print(feature)

# Supervised learning of molecular features
We use the coordinates of the 2D UMAP projection as predictors to infer the annotated molecular subtypes.

In [18]:
le = preprocessing.LabelEncoder()
feature_set = [
    'pathologic_stage', 'BRCA_Pathology', 'BRCA_Subtype_PAM50',
    'CNV Clusters', 'Mutation Clusters', 'DNA.Methylation Clusters',
    'mRNA Clusters', 'miRNA Clusters', 'lncRNA Clusters',
    'Protein Clusters', 'PARADIGM Clusters', 'Pan-Gyn Clusters',
]
feature = 'Pan-Gyn Clusters'

# Keep only the samples that carry the target annotation, label-encode the
# target, and retain the projection coordinates as the predictors.
labelled = df_merged[['x','y',feature]].dropna(axis=0, subset=[feature])
y = le.fit_transform(labelled[feature])
X_full = labelled.drop(columns=[feature])

# Hold out 20% of the samples as a validation set
X_train_full, X_valid_full, y_train, y_valid = train_test_split(
    X_full, y,
    train_size=0.8, test_size=0.2,
    random_state=0,
)

# Columns stored as Python objects are treated as categorical ...
categorical_cols = [
    cname for cname, col_dtype in X_train_full.dtypes.items()
    if col_dtype == "object"
]

# ... and 64-bit integer / float columns as numerical
numerical_cols = [
    cname for cname, col_dtype in X_train_full.dtypes.items()
    if col_dtype in ['int64', 'float64']
]

# Restrict the train / validation frames to the selected columns
my_cols = categorical_cols + numerical_cols
X_train = X_train_full[my_cols].copy()
X_valid = X_valid_full[my_cols].copy()

# 3-fold cross-validation of a shallow decision tree, then of a
# distance-weighted 5-nearest-neighbors classifier, on all labelled samples
for clf in (
    tree.DecisionTreeClassifier(random_state=0, max_depth=3),
    KNeighborsClassifier(n_neighbors=5, weights='distance'),
):
    probs = cross_val_predict(clf, X_full, y, cv=3, method='predict_proba')
    scores = cross_val_score(clf, X_full, y, cv=3)
    print("Accuracy: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std() * 2))

Accuracy: 0.33 (+/- 0.01)
Accuracy: 0.27 (+/- 0.06)


In [136]:
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder

# Numerical columns: impute missing entries with the 'constant' strategy
numerical_transformer = SimpleImputer(strategy='constant')

# Categorical columns: impute with the most frequent value, then one-hot
# encode, ignoring categories that were not seen during fitting
categorical_steps = [
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore')),
]
categorical_transformer = Pipeline(steps=categorical_steps)

# Route each group of columns to its own transformer
column_transformers = [
    ('num', numerical_transformer, numerical_cols),
    ('cat', categorical_transformer, categorical_cols),
]
preprocessor = ColumnTransformer(transformers=column_transformers)

In [156]:
# Decision tree allowed to grow up to 100 levels deep, with a fixed seed
model = tree.DecisionTreeClassifier(
    max_depth=100,
    random_state=0,
)

In [201]:
from sklearn.metrics import multilabel_confusion_matrix, confusion_matrix

# Chain the preprocessing and the classifier into a single estimator
pipeline_steps = [
    ('preprocessor', preprocessor),
    ('model', model),
]
my_pipeline = Pipeline(steps=pipeline_steps)

# Fit on the training split (preprocessing is fitted as part of the pipeline)
my_pipeline.fit(X_train, y_train)

# Predict the held-out validation split
preds = my_pipeline.predict(X_valid)

# Report the per-class (one-vs-rest) confusion matrices first, then the
# full multi-class confusion matrix
for matrix_fn in (multilabel_confusion_matrix, confusion_matrix):
    print(matrix_fn(y_valid, preds))

[[[138  35]
  [ 35   7]]

 [[183  16]
  [ 14   2]]

 [[ 55  58]
  [ 45  57]]

 [[140  26]
  [ 40   9]]

 [[204   5]
  [  6   0]]]
[[ 7  4 21  9  1]
 [ 5  2  6  3  0]
 [21  7 57 14  3]
 [ 7  4 28  9  1]
 [ 2  1  3  0  0]]
