## Collect full models and save into tsv files

As a resource to provide with our paper, we want to provide the fit models (coefficients/effect sizes, and parameter choices) as a data file.

This notebook collects the results of the bash script at `07_train_final_classifiers/scripts/run_all_genes.sh` and assembles them into dataframes/`.tsv` files.

In [1]:
from pathlib import Path
import pickle as pkl

import numpy as np
import pandas as pd

import mpmp.config as cfg
import mpmp.utilities.data_utilities as du

%load_ext autoreload
%autoreload 2

In [2]:
results_dir = Path(cfg.results_dirs['final'],
                   'merged_all_params',
                   'gene').resolve()

### Get all possible gene expression features

This will be the index for our coefficient dataframe. We'll use `NA` to denote features that weren't used in that particular model (either a gene expression feature that wasn't in the top 8000 by MAD, or a cancer type indicator that wasn't included for that gene).

In [3]:
# get all cancer types
sample_info_df = du.load_sample_info('expression')
cancer_types = np.sort(sample_info_df.cancer_type.unique())
print(cancer_types.shape)
print(cancer_types)

(33,)
['ACC' 'BLCA' 'BRCA' 'CESC' 'CHOL' 'COAD' 'DLBC' 'ESCA' 'GBM' 'HNSC'
 'KICH' 'KIRC' 'KIRP' 'LAML' 'LGG' 'LIHC' 'LUAD' 'LUSC' 'MESO' 'OV' 'PAAD'
 'PCPG' 'PRAD' 'READ' 'SARC' 'SKCM' 'STAD' 'TGCT' 'THCA' 'THYM' 'UCEC'
 'UCS' 'UVM']


In [4]:
# the columns will include the sample id, so use all but the first one
gene_features = pd.read_csv(
    cfg.data_types['expression'], sep='\t', nrows=0
).columns[1:].values

print(gene_features.shape)
gene_features[:5]

(15369,)


array(['1', '100', '1000', '10000', '10001'], dtype=object)

In [5]:
all_feats = np.concatenate((
    gene_features,
    cancer_types,
    np.array(['log10_mut'])
))
print(all_feats.shape)

(15403,)


### Load coefficients and assemble into dataframe

In [6]:
coefs = {}
genes = []

# load coefficient vectors from output files, into dict
for gene_dir in results_dir.iterdir():
    gene_name = gene_dir.stem
    gene_dir = Path(results_dir, gene_dir)
    if gene_dir.is_file(): continue
    genes.append(gene_name)
    for results_file in gene_dir.iterdir():
        if not results_file.is_file(): continue
        results_filename = str(results_file.stem)
        if 'coefficients' not in results_filename: continue
        coefs_df = pd.read_csv(results_file, sep='\t')
        coefs[gene_name] = (coefs_df
            .loc[:, ['feature', 'weight']]
            .set_index('feature')
            .reindex(all_feats)
            .rename(columns={'weight': gene_name})
        )
                    
print(genes[:5])
print(len(genes))

['MAP3K1', 'ZFHX3', 'TP53', 'FAS', 'ABL2']
217


In [7]:
# make sure all genes with parameters have classifiers
# the set difference should be empty
print(len(set(genes) - set(coefs.keys())))
print(set(genes) - set(coefs.keys()))

0
set()


In [8]:
gene = 'PIK3CA'
print(coefs[gene].isna().sum())
coefs[gene].head()

PIK3CA    7390
dtype: int64


Unnamed: 0_level_0,PIK3CA
feature,Unnamed: 1_level_1
1,
100,
1000,0.0
10000,0.0
10001,


In [9]:
coefs[gene][coefs[gene][gene].isna()].head()

Unnamed: 0_level_0,PIK3CA
feature,Unnamed: 1_level_1
1,
100,
10001,
10002,
10003,


In [10]:
# concatenate coefficient vectors into a single dataframe
coefs_df = (
    pd.concat(coefs.values(), axis='columns')
      .sort_index(axis='columns')
)
coefs_df.index.name = None

print(coefs_df.shape)
coefs_df.iloc[:5, :5]

(15403, 217)


Unnamed: 0,ABL2,ACVR2A,AFF3,ALK,AMER1
1,,,,,
100,0.0,,0.0,,
1000,0.0,0.0,0.0,0.0,
10000,0.0,0.0,0.0,2.5468,5.1894
10001,,,,,


In [11]:
(cfg.data_dir / 'final_models').mkdir(exist_ok=True)
coefs_df.to_csv(cfg.final_coefs_df, sep='\t')

### Load parameters and assemble into dataframe

In [12]:
params = {}

# load coefficient vectors from output files, into dict
for gene_dir in results_dir.iterdir():
    gene_name = gene_dir.stem
    gene_dir = Path(results_dir, gene_dir)
    if gene_dir.is_file(): continue
    for results_file in gene_dir.iterdir():
        if not results_file.is_file(): continue
        results_filename = str(results_file.stem)
        if 'params' not in results_filename: continue
        with open(results_file, 'rb') as f:
            gene_params = pkl.load(f)
        params[gene_name] = pd.DataFrame(
            gene_params, index=[gene_name]
        )
        
print(list(params.keys())[:5])
print(len(params.keys()))

['MAP3K1', 'ZFHX3', 'TP53', 'FAS', 'ABL2']
217


In [13]:
params[gene].head()

Unnamed: 0,alpha,l1_ratio
PIK3CA,0.1,0.1


In [14]:
# concatenate coefficient vectors into a single dataframe
params_df = (
    pd.concat(params.values(), axis='rows')
      .sort_index(axis='rows')
)

print(params_df.shape)
params_df.iloc[:5, :5]

(217, 2)


Unnamed: 0,alpha,l1_ratio
ABL2,1.0,0.1
ACVR2A,0.1,0.5
AFF3,1.0,0.05
ALK,0.01,0.9
AMER1,0.001,0.9


In [15]:
params_df.to_csv(cfg.final_params_df, sep='\t')