In [29]:
import os
import glob
import numpy as np
import pandas as pd

from sklearn.linear_model import SGDClassifier
from sklearn.model_selection import train_test_split, cross_val_predict
from dask_searchcv import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

from scripts.tcga_util import get_threshold_metrics

In [2]:
np.random.seed(123)

In [82]:
# Load constants
genes = ['TP53', 'RAS']
filter_prop = 0.05
filter_count = 15
folds = 5
algorithms = ['pca', 'ica']
alphas = [0.1, 0.13, 0.15, 0.2, 0.25, 0.3]
l1_ratios = [0.15, 0.16, 0.2, 0.25, 0.3, 0.4]

In [128]:
# What results to track
full_metrics_list = []
full_auc_list = []
full_aupr_list = []
full_coef_list = []

# Obtain a list of locations for each feature matrix (X)
z_matrix_dict = {}
for signal in signals:
    z_matrix_dict[signal] = {}

    if signal == 'signal':
        results_dir = 'TCGA_results'
    else:
        results_dir = 'TCGA_shuffled_results'

    matrix_dir = os.path.join('..', '2.ensemble-z-analysis', 'results',
                              results_dir, 'ensemble_z_matrices')

    for comp_dir in os.listdir(matrix_dir):
        matrix_comp_dir = os.path.join(matrix_dir, comp_dir)
        z_dim = comp_dir.split('_')[2]
        z_matrix_dict[signal][z_dim] = {}

        for z_file in glob.glob('{}/*_z_*'.format(matrix_comp_dir)):
            seed = os.path.basename(z_file).split('_')[1]

            if seed not in z_matrix_dict[signal][z_dim].keys():
                z_matrix_dict[signal][z_dim][seed] = {}

            if '_test_' in z_file:
                z_matrix_dict[signal][z_dim][seed]['test'] = z_file
            else:
                z_matrix_dict[signal][z_dim][seed]['train'] = z_file

for gene in genes:
    # Process the y matrix for the given gene or pathway
    if gene == 'RAS':
        subset_gene = ['KRAS', 'HRAS', 'NRAS']
        y_copy_number_df = copy_gain_df.loc[:, subset_gene].max(axis='columns')
        y_mutation_df = mutation_df.loc[:, subset_gene].max(axis='columns')
    else:
        y_copy_number_df = copy_loss_df.loc[:, gene]
        y_mutation_df = mutation_df.loc[:, gene]

    y_df = y_copy_number_df + y_mutation_df
    y_df.loc[y_df > 1] = 1
    y_df = pd.DataFrame(y_df)
    y_df.columns = ['status']

    y_df = (
        y_df
        .merge(sample_freeze_df, how='left', left_index=True,
               right_on='SAMPLE_BARCODE')
        .set_index('SAMPLE_BARCODE')
        .merge(mut_burden_df, left_index=True, right_index=True)
    )

    # Get statistics per gene and disease
    disease_counts_df = pd.DataFrame(y_df.groupby('DISEASE').sum()['status'])

    disease_proportion_df = (
        disease_counts_df
        .divide(y_df['DISEASE']
                .value_counts(sort=False)
                .sort_index(), axis=0)
    )

    # Filter diseases with low counts or proportions for classification balance
    filter_disease_df = ((disease_counts_df > filter_count) &
                         (disease_proportion_df > filter_prop))
    filter_disease_df.columns = ['disease_included']

    disease_stats_df = (
        disease_counts_df
        .merge(disease_proportion_df,
               left_index=True,
               right_index=True,
               suffixes=('_count', '_proportion'))
        .merge(filter_disease_df,
               left_index=True,
               right_index=True)
    )

    filter_file = '{}_filtered_cancertypes.tsv'.format(gene)
    filter_file = os.path.join('results', filter_file)
    disease_stats_df.to_csv(filter_file, sep='\t')

    # Filter
    use_diseases = disease_stats_df.query("disease_included").index.tolist()
    burden_filter = y_df['log10_mut'] < 5 * y_df['log10_mut'].std()
    y_df = y_df.loc[burden_filter, :].query("DISEASE in @use_diseases")

    # Now, perform all the analyses for each X matrix
    for signal in z_matrix_dict.keys():
        z_dim_dict = z_matrix_dict[signal]
        for z_dim in ['2', '3']:
            seed_z_dim_dict = z_dim_dict[z_dim]
            for seed in seed_z_dim_dict.keys():
                z_train_file = z_matrix_dict[signal][z_dim][seed]['train']
                z_test_file = z_matrix_dict[signal][z_dim][seed]['test']

                for alg in algorithms:
                    # Load Data
                    x_train_df = pd.read_table(z_train_file, index_col=0)
                    use_col = x_train_df.columns.str.contains(alg)
                    x_train_df = x_train_df.loc[:, use_col]

                    x_test_df = pd.read_table(z_test_file, index_col=0)
                    use_col = x_test_df.columns.str.contains(alg)
                    x_test_df = x_test_df.loc[:, use_col]

                    # Subset samples
                    train_samples = (
                        set(y_df.index).intersection(set(x_train_df.index))
                    )
                    test_samples = (
                        set(y_df.index).intersection(set(x_test_df.index))
                    )

                    x_train_df = x_train_df.reindex(train_samples)
                    y_train_df = y_df.reindex(train_samples)

                    x_test_df = x_test_df.reindex(test_samples)
                    y_test_df = y_df.reindex(test_samples)

                    # Add in covariate info
                    covar_train_df = pd.get_dummies(y_train_df.DISEASE)
                    covar_test_df = pd.get_dummies(y_test_df.DISEASE)

                    mut_covar_train_df = pd.DataFrame(y_train_df.loc[:, 'log10_mut'])
                    mut_covar_test_df = pd.DataFrame(y_test_df.loc[:, 'log10_mut'])

                    x_train_df = (
                        x_train_df
                        .merge(covar_train_df, left_index=True,
                               right_index=True)
                        .merge(mut_covar_train_df, left_index=True,
                               right_index=True)
                    )
                    x_test_df = (
                        x_test_df
                        .merge(covar_test_df, left_index=True,
                               right_index=True)
                        .merge(mut_covar_test_df, left_index=True,
                               right_index=True)
                    )

                    # Setup the classifier parameters
                    clf_parameters = {'classify__loss': ['log'],
                                      'classify__penalty': ['elasticnet'],
                                      'classify__alpha': alphas,
                                      'classify__l1_ratio': l1_ratios}

                    estimator = (
                        Pipeline(
                            steps=[('classify',
                                    SGDClassifier(random_state=0,
                                                  class_weight='balanced',
                                                  loss='log',
                                                  max_iter=100,
                                                  tol=1e-3))]
                                )
                    )

                    cv_pipeline = GridSearchCV(estimator=estimator,
                                               param_grid=clf_parameters,
                                               n_jobs=-1,
                                               cv=folds,
                                               scoring='roc_auc',
                                               return_train_score=True)

                    print('Training model... gene: {}, '
                          'algorithm: {}, signal: {}, z_dim: {}, '
                          'seed: {}'.format(gene, alg, signal, z_dim, seed))

                    # Fit the model
                    output_cv = cv_pipeline.fit(X=x_train_df,
                                                y=y_train_df.status)

                    # Obtain cross validation results
                    y_cv_df = (
                        cross_val_predict(cv_pipeline.best_estimator_,
                                          X=x_train_df,
                                          y=y_train_df.status,
                                          cv=folds,
                                          method='decision_function')
                    )

                    # Get all performance results
                    y_predict_train_df = (
                        cv_pipeline.decision_function(x_train_df)
                    )
                    y_predict_test_df = (
                        cv_pipeline.decision_function(x_test_df)
                    )

                    # Get metric  predictions
                    y_train_results = (
                        get_threshold_metrics(y_train_df.status,
                                              y_predict_train_df,
                                              drop=False)
                    )
                    y_test_results = (
                        get_threshold_metrics(y_test_df.status,
                                              y_predict_test_df,
                                              drop=False)
                    )
                    y_cv_results = (
                        get_threshold_metrics(y_train_df.status,
                                              y_cv_df,
                                              drop=False)
                    )

                    # Get coefficients
                    final_pipeline = cv_pipeline.best_estimator_
                    final_classifier = final_pipeline.named_steps['classify']

                    coef_df = pd.DataFrame.from_dict(
                        {'feature': x_train_df.columns,
                         'weight': final_classifier.coef_[0]})

                    coef_df = (
                        coef_df
                        .assign(abs=coef_df['weight'].abs())
                        .sort_values('abs', ascending=False)
                        .reset_index(drop=True)
                        .assign(gene=gene,
                                signal=signal,
                                z_dim=z_dim,
                                seed=seed,
                                algorithm=alg)
                    )

                    # Store all results
                    train_metrics_, train_roc_df, train_pr_df = (
                        summarize_results(y_train_results, gene, signal, z_dim,
                                          seed, alg, 'train')
                    )
                    test_metrics_, test_roc_df, test_pr_df = (
                        summarize_results(y_test_results, gene, signal, z_dim,
                                          seed, alg, 'test')
                    )
                    cv_metrics_, cv_roc_df, cv_pr_df = (
                        summarize_results(y_cv_results, gene, signal, z_dim,
                                          seed, alg, 'cv')
                    )

                    # Compile summary metrics
                    cols = ['auroc', 'aupr', 'gene', 'signal', 'z_dim', 'seed',
                            'algorithm', 'data_type']
                    metrics_ = [train_metrics_, test_metrics_, cv_metrics_]
                    metric_df_ = pd.DataFrame(metrics_, columns=cols)
                    full_metrics_list.append(metric_df_)

                    full_auc_df = pd.concat(
                        [train_roc_df, test_roc_df, cv_roc_df]
                    )
                    full_auc_list.append(full_auc_df)

                    full_aupr_df = pd.concat(
                        [train_pr_df, test_pr_df, cv_pr_df]
                    )
                    full_aupr_list.append(full_aupr_df)
                    full_coef_list.append(coef_df)

# Now, compile all results and write to file
final_metrics_df = pd.concat(full_metrics_list)
final_auc_df = pd.concat(full_auc_list)
final_aupr_df = pd.concat(full_aupr_list)
final_coef_df = pd.concat(full_coef_list)

file = os.path.join('results', 'classify_metrics.tsv')
final_metrics_df.to_csv(file, sep='\t', index=False)

file = os.path.join('results', 'auc_threshold_metrics.tsv')
final_auc_df.to_csv(file, sep='\t', index=False)

file = os.path.join('results', 'aupr_threshold_metrics.tsv')
final_aupr_df.to_csv(file, sep='\t', index=False)

file = os.path.join('results', 'coefficients.tsv')
final_coef_df.to_csv(file, sep='\t', index=False)

Training model... gene: TP53, algorithm: pca, signal: signal, z_dim: 2, seed: 313363
Training model... gene: TP53, algorithm: ica, signal: signal, z_dim: 2, seed: 313363
Training model... gene: TP53, algorithm: pca, signal: signal, z_dim: 2, seed: 693066
Training model... gene: TP53, algorithm: ica, signal: signal, z_dim: 2, seed: 693066
Training model... gene: TP53, algorithm: pca, signal: signal, z_dim: 2, seed: 291359
Training model... gene: TP53, algorithm: ica, signal: signal, z_dim: 2, seed: 291359
Training model... gene: TP53, algorithm: pca, signal: signal, z_dim: 2, seed: 630060
Training model... gene: TP53, algorithm: ica, signal: signal, z_dim: 2, seed: 630060
Training model... gene: TP53, algorithm: pca, signal: signal, z_dim: 2, seed: 713701
Training model... gene: TP53, algorithm: ica, signal: signal, z_dim: 2, seed: 713701
Training model... gene: TP53, algorithm: pca, signal: signal, z_dim: 3, seed: 719912
Training model... gene: TP53, algorithm: ica, signal: signal, z_d

In [129]:
file = os.path.join('results', 'auc_threshold_metrics.tsv.gz')
final_auc_df.to_csv(file, sep='\t', index=False, compression='gzip',
                    float_format='%.5g')

file = os.path.join('results', 'aupr_threshold_metrics.tsv.gz')
final_aupr_df.to_csv(file, sep='\t', index=False, compression='gzip',
                     float_format='%.5g')

file = os.path.join('results', 'coefficients.tsv.gz')
final_coef_df.to_csv(file, sep='\t', index=False, compression='gzip',
                     float_format='%.5g')

In [122]:
x_train_df

Unnamed: 0,pca_0,pca_1,BLCA,BRCA,CESC,COAD,ESCA,GBM,HNSC,KICH,...,OV,PAAD,PRAD,READ,SARC,SKCM,STAD,UCEC,UCS,log10_mut
TCGA-AP-A0LN-01,1.618292,1.915562,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,1.792392
TCGA-GM-A2DK-01,0.260184,0.477376,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1.322219
TCGA-CL-5918-01,2.151609,-0.440165,0,0,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,2.071882
TCGA-D1-A17B-01,0.831326,0.685772,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,2.451786
TCGA-N6-A4VF-01,0.385513,-0.019445,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,1.623249
TCGA-VS-A9UB-01,1.031097,-1.551026,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1.991226
TCGA-AY-6196-01,-0.733639,-0.363069,0,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,2.340444
TCGA-VQ-A91S-01,0.025868,-2.836798,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,2.369216
TCGA-BH-A1EV-01,-0.988460,-1.846175,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1.929419
TCGA-E9-A1NC-01,1.582088,0.499138,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1.812913


In [114]:
# Load Data
x_train_df = pd.read_table(z_train_file, index_col=0)
use_col = x_train_df.columns.str.contains(alg)
x_train_df = x_train_df.loc[:, use_col]

# Subset samples
train_samples = (
    set(y_df.index).intersection(set(x_train_df.index))
)
test_samples = (
    set(y_df.index).intersection(set(x_test_df.index))
)
x_train_df = x_train_df.reindex(train_samples)
y_train_df = y_df.reindex(test_samples)
y_train_df
pd.DataFrame(y_train_df.loc[:, 'log10_mut'])

Unnamed: 0_level_0,log10_mut
SAMPLE_BARCODE,Unnamed: 1_level_1
TCGA-AP-A0LN-01,1.792392
TCGA-GM-A2DK-01,1.322219
TCGA-CL-5918-01,2.071882
TCGA-D1-A17B-01,2.451786
TCGA-N6-A4VF-01,1.623249
TCGA-VS-A9UB-01,1.991226
TCGA-AY-6196-01,2.340444
TCGA-VQ-A91S-01,2.369216
TCGA-BH-A1EV-01,1.929419
TCGA-E9-A1NC-01,1.812913


In [99]:
pd.DataFrame(y_df)#, columns=['status'])

Unnamed: 0,TP53
TCGA-02-0047-01,0
TCGA-02-0055-01,1
TCGA-02-2483-01,1
TCGA-02-2485-01,1
TCGA-02-2486-01,0
TCGA-04-1348-01,1
TCGA-04-1357-01,1
TCGA-04-1362-01,1
TCGA-05-4244-01,0
TCGA-05-4249-01,0


In [4]:
# Load data to build y matrices
base_url = 'https://github.com/greenelab/pancancer/raw'
commit='9fd9afbecdbb4f855ecc85bb282fc59e75c7744f'

# File names
mut_file = '{}/{}/data/pancan_mutation_freeze.tsv.gz'.format(base_url, commit)
copy_gain_file = '{}/{}/data/copy_number_gain_status.tsv.gz'.format(base_url, commit)
copy_loss_file = '{}/{}/data/copy_number_loss_status.tsv.gz'.format(base_url, commit)
mut_burden_file = '{}/{}/data/mutation_burden_freeze.tsv'.format(base_url, commit)
sample_freeze_file = '../../pancancer/data/sample_freeze.tsv'

# Load data
sample_freeze_df = pd.read_table(sample_freeze_file, index_col=0)
mutation_df = pd.read_table(mut_file, index_col=0)
copy_loss_df = pd.read_table(copy_loss_file, index_col=0)
copy_gain_df = pd.read_table(copy_gain_file, index_col=0)
mut_burden_df = pd.read_table(mut_burden_file, index_col=0)

In [5]:
# Process the y matrix for the given gene or pathway
gene = 'RAS'
if gene == 'RAS':
    subset_genes = ['KRAS', 'HRAS', 'NRAS']
    y_copy_number_df = copy_gain_df.loc[:, subset_genes].max(axis='columns')
    y_mutation_df = mutation_df.loc[:, subset_genes].max(axis='columns')
else:
    y_copy_number_df = copy_loss_df.loc[:, gene]
    y_mutation_df = mutation_df.loc[:, gene]
    
y_df = y_copy_number_df + y_mutation_df
y_df.loc[y_df > 1] = 1

y_df = (
    pd.DataFrame(y_df, columns=['status'])
    .merge(sample_freeze_df, how='left', left_index=True, right_on='SAMPLE_BARCODE')
    .set_index('SAMPLE_BARCODE')
    .merge(mut_burden_df, left_index=True, right_index=True)
)

y_df.head(2)

Unnamed: 0_level_0,status,PATIENT_BARCODE,DISEASE,SUBTYPE,log10_mut
SAMPLE_BARCODE,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
TCGA-02-0047-01,0,TCGA-02-0047,GBM,IDHwt,1.812913
TCGA-02-0055-01,0,TCGA-02-0055,GBM,IDHwt,1.70757


In [6]:
# Get statistics per gene and disease
disease_counts_df = pd.DataFrame(y_df.groupby('DISEASE').sum()['status'])

disease_proportion_df = (
    disease_counts_df
    .divide(y_df['DISEASE']
            .value_counts(sort=False)
            .sort_index(), axis=0)
)

# Filter diseases with low counts or proportions to maintain classification balance
filter_disease_df = (disease_counts_df > filter_count) & (disease_proportion_df > filter_prop)
filter_disease_df.columns = ['disease_included']

disease_stats_df = (
    disease_counts_df
    .merge(disease_proportion_df,
           left_index=True,
           right_index=True,
           suffixes=('_count', '_proportion'))
    .merge(filter_disease_df,
           left_index=True,
           right_index=True)
)

disease_stats_df

Unnamed: 0_level_0,status_count,status_proportion,disease_included
DISEASE,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
ACC,1,0.013158,False
BLCA,53,0.133166,True
BRCA,36,0.036697,False
CESC,18,0.066176,True
CHOL,3,0.083333,False
COAD,171,0.501466,True
DLBC,2,0.054054,False
ESCA,20,0.118343,True
GBM,3,0.024194,False
HNSC,37,0.075975,True


In [7]:
# Filter
use_diseases = disease_stats_df.query("disease_included").index.tolist()
burden_filter = y_df['log10_mut'] < 5 * y_df['log10_mut'].std()
y_df = y_df.loc[burden_filter, :].query("DISEASE in @use_diseases")

In [8]:
# Obtain a list of locations for each feature matrix (X)
z_matrix_dict = {}
signals = ['signal', 'shuffled']
for signal in signals:
    z_matrix_dict[signal] = {}
    if signal == 'signal':
        results_dir = 'TCGA_results'
    else:
        results_dir = 'TCGA_shuffled_results'
        
    matrix_dir = os.path.join('..', '2.ensemble-z-analysis', 'results',
                              results_dir, 'ensemble_z_matrices')
    for comp_dir in os.listdir(matrix_dir):
        matrix_comp_dir = os.path.join(matrix_dir, comp_dir)
        z_dim = comp_dir.split('_')[2]
        z_matrix_dict[signal][z_dim] = {}
        for z_file in glob.glob('{}/*_z_*'.format(matrix_comp_dir)):
            seed = os.path.basename(z_file).split('_')[1]
            
            if seed not in z_matrix_dict[signal][z_dim].keys():
                z_matrix_dict[signal][z_dim][seed] = {}

            if '_test_' in z_file:
                z_matrix_dict[signal][z_dim][seed]['test'] = z_file
            else:
                z_matrix_dict[signal][z_dim][seed]['train'] = z_file

In [9]:
signal = 'signal'
z_dim = '3'
seed = '719912'
algorithm = 'pca'

z_train_file = z_matrix_dict[signal][z_dim][seed]['train']
z_test_file = z_matrix_dict[signal][z_dim][seed]['test']

In [10]:
x_train_df = pd.read_table(z_train_file, index_col=0)
x_train_df = x_train_df.loc[:, x_train_df.columns.str.contains(algorithm)]

x_test_df = pd.read_table(z_test_file, index_col=0)
x_test_df = x_test_df.loc[:, x_test_df.columns.str.contains(algorithm)]

In [11]:
# Subset samples
training_samples = set(y_df.index).intersection(set(x_train_df.index))
testing_samples = set(y_df.index).intersection(set(x_test_df.index))

x_train_df = x_train_df.reindex(training_samples)
y_train_df = y_df.reindex(training_samples)

x_test_df = x_test_df.reindex(testing_samples)
y_test_df = y_df.reindex(testing_samples)

In [12]:
# Add in covariate info
covar_train_df = pd.get_dummies(y_train_df.DISEASE)
covar_test_df = pd.get_dummies(y_test_df.DISEASE)

mut_covar_train_df = pd.DataFrame(y_train_df.loc[:, 'log10_mut'], index=y_train_df.index)
mut_covar_test_df = pd.DataFrame(y_test_df.loc[:, 'log10_mut'], index=y_test_df.index)

x_train_df = (
    x_train_df
    .merge(covar_train_df, left_index=True, right_index=True)
    .merge(mut_covar_train_df, left_index=True, right_index=True)
)
x_test_df = (
    x_test_df
    .merge(covar_test_df, left_index=True, right_index=True)
    .merge(mut_covar_test_df, left_index=True, right_index=True)
)

In [13]:
clf_parameters = {'classify__loss': ['log'],
                  'classify__penalty': ['elasticnet'],
                  'classify__alpha': alphas,
                  'classify__l1_ratio': l1_ratios}

In [21]:
estimator = Pipeline(steps=[('classify', SGDClassifier(random_state=0,
                                                       class_weight='balanced',
                                                       loss='log',
                                                       max_iter=100,
                                                       tol=1e-3))])

In [25]:
cv_pipeline = GridSearchCV(estimator=estimator,
                           param_grid=clf_parameters,
                           n_jobs=-1,
                           cv=folds,
                           scoring='roc_auc',
                           return_train_score=True)

In [26]:
output_cv = cv_pipeline.fit(X=x_train_df, y=y_train_df.status)

In [39]:
y_cv_df = cross_val_predict(cv_pipeline.best_estimator_,
                            X=x_train_df,
                            y=y_train_df.status,
                            cv=folds,
                            method='decision_function')

In [40]:
# Obtain y predictions
y_predict_train_df = cv_pipeline.decision_function(x_train_df)
y_predict_test_df = cv_pipeline.decision_function(x_test_df)

In [53]:
# Get metric  predictions
y_train_results = get_threshold_metrics(y_train_df.status, y_predict_train_df, drop=False)
y_test_results = get_threshold_metrics(y_test_df.status, y_predict_test_df, drop=False)
y_cv_results = get_threshold_metrics(y_train_df.status, y_cv_df, drop=False)

In [79]:
# Get coefficients
final_pipeline = cv_pipeline.best_estimator_
final_classifier = final_pipeline.named_steps['classify']

coef_df = pd.DataFrame.from_dict(
    {'feature': x_train_df.columns,
     'weight': final_classifier.coef_[0]})

coef_df = (
    coef_df
    .assign(abs=coef_df['weight'].abs())
    .sort_values('abs', ascending=False)
    .reset_index(drop=True)
    .assign(gene=gene,
            signal=signal,
            z_dim=z_dim,
            seed=seed,
            algorithm=algorithm)
)

In [80]:
coef_df

Unnamed: 0,feature,weight,abs,gene,signal,z_dim,seed,algorithm
0,pca_2,0.226085,0.226085,RAS,signal,3,719912,pca
1,COAD,0.107055,0.107055,RAS,signal,3,719912,pca
2,PAAD,0.062577,0.062577,RAS,signal,3,719912,pca
3,pca_0,0.050441,0.050441,RAS,signal,3,719912,pca
4,HNSC,-0.044458,0.044458,RAS,signal,3,719912,pca
5,LUSC,-0.02283,0.02283,RAS,signal,3,719912,pca
6,READ,0.0,0.0,RAS,signal,3,719912,pca
7,UCEC,0.0,0.0,RAS,signal,3,719912,pca
8,THCA,0.0,0.0,RAS,signal,3,719912,pca
9,TGCT,0.0,0.0,RAS,signal,3,719912,pca


In [70]:
def summarize_results(results, gene, signal, z_dim, seed, algorithm, data_type):
    """
    Summarize and output all pertinent files
    """
    
    results_append_list = [gene, signal, z_dim, seed, algorithm, data_type]

    metrics_out_ = [results['auroc'], results['aupr']] + results_append_list

    roc_df_ = results['roc_df']
    pr_df_ = results['pr_df']
    
    roc_df_ = roc_df_.assign(
        gene=gene,
        signal=signal,
        z_dim=z_dim,
        seed=seed,
        algorithm=algorithm,
        data_type=data_type
    )
    
    pr_df_ = pr_df_.assign(
        gene=gene,
        signal=signal,
        z_dim=z_dim,
        seed=seed,
        algorithm=algorithm,
        data_type=data_type
    )
    
    return metrics_out_, roc_df_, pr_df_

In [71]:
train_metrics_, train_roc_df, train_pr_df = summarize_results(y_train_results, gene, signal, z_dim, seed, algorithm, 'train')
test_metrics_, test_roc_df, test_pr_df = summarize_results(y_test_results, gene, signal, z_dim, seed, algorithm, 'test')
cv_metrics_, cv_roc_df, cv_pr_df = summarize_results(y_cv_results, gene, signal, z_dim, seed, algorithm, 'cv')

In [73]:
pd.DataFrame([train_metrics_, test_metrics_, cv_metrics_],
             columns = ['auroc', 'aupr', 'gene', 'signal', 'z_dim',
                        'seed', 'algorithm', 'data_type'])

Unnamed: 0,auroc,aupr,gene,signal,z_dim,seed,algorithm,data_type
0,0.65983,0.379573,RAS,signal,3,719912,pca,train
1,0.646968,0.32783,RAS,signal,3,719912,pca,test
2,0.663956,0.369755,RAS,signal,3,719912,pca,cv


Unnamed: 0,fpr,tpr,threshold,gene,signal,z_dim,seed,algorithm,data_type
0,0.000000,0.000000,1.821327,RAS,signal,3,719912,pca,train
1,0.000296,0.000000,0.821327,RAS,signal,3,719912,pca,train
2,0.000593,0.000000,0.587707,RAS,signal,3,719912,pca,train
3,0.000593,0.001083,0.572504,RAS,signal,3,719912,pca,train
4,0.000889,0.001083,0.555312,RAS,signal,3,719912,pca,train
5,0.000889,0.002167,0.508068,RAS,signal,3,719912,pca,train
6,0.000889,0.003250,0.486784,RAS,signal,3,719912,pca,train
7,0.001186,0.003250,0.475059,RAS,signal,3,719912,pca,train
8,0.001186,0.004334,0.468309,RAS,signal,3,719912,pca,train
9,0.001482,0.004334,0.468261,RAS,signal,3,719912,pca,train


In [60]:
y_train_results.keys()

dict_keys(['auroc', 'aupr', 'roc_df', 'pr_df'])

In [57]:
train_roc_df = y_train_results['roc_df']

In [62]:
train_roc_df.assign(
    gene=gene,
    signal=signal,
    z_dim=z_dim,
    seed=seed,
    algorithm=algorithm,
    data_type='train'
)

Unnamed: 0,fpr,tpr,threshold,gene,signal,z_dim,seed,algorithm,data_type
0,0.000000,0.000000,1.821327,RAS,signal,3,719912,pca,train
1,0.000296,0.000000,0.821327,RAS,signal,3,719912,pca,train
2,0.000593,0.000000,0.587707,RAS,signal,3,719912,pca,train
3,0.000593,0.001083,0.572504,RAS,signal,3,719912,pca,train
4,0.000889,0.001083,0.555312,RAS,signal,3,719912,pca,train
5,0.000889,0.002167,0.508068,RAS,signal,3,719912,pca,train
6,0.000889,0.003250,0.486784,RAS,signal,3,719912,pca,train
7,0.001186,0.003250,0.475059,RAS,signal,3,719912,pca,train
8,0.001186,0.004334,0.468309,RAS,signal,3,719912,pca,train
9,0.001482,0.004334,0.468261,RAS,signal,3,719912,pca,train
