In [1]:
import os

import numpy as np
import pandas as pd
import shap
import xgboost as xgb
from sklearn.preprocessing import StandardScaler

import mgitools.os_helpers as os_helpers

In [2]:
import matplotlib
matplotlib.rcParams['pdf.fonttype'] = 42
matplotlib.rcParams['ps.fonttype'] = 42

In [3]:
result_dir = '../results/06242021_new_results_format'
from pathlib import Path
Path(result_dir).mkdir(parents=True, exist_ok=True)

In [4]:
combined_fp = '../data/aggregated_10072021.txt.gz'
# combined_fp = '../data/aggregated_08012021_clinical.txt.gz'
# combined_fp = '../data/aggregated_08022021_clinical.txt.gz'


In [5]:
combined = pd.read_csv(combined_fp, sep='\t', index_col=0)

Columns (14819,63912) have mixed types.Specify dtype option on import or set low_memory=False.


In [6]:
d = pd.read_csv('../data/199_driver_genes.txt', sep='\t')
d

Unnamed: 0,Gene,Tumor suppressor or oncogene prediction (by 20/20+)
0,PHF6,possible tsg
1,ABL1,
2,ALK,
3,AR,
4,ARAF,
...,...,...
183,KMT2A,tsg
184,KMT2B,tsg
185,MAX,oncogene
186,MED12,oncogene


In [7]:
# target_genes = ['PIK3CA', 'TP53', 'KRAS']
target_genes = sorted(set(d['Gene']))

###### protein pairs

In [8]:
# get pik3ca related genes
pathways = pd.read_csv('../data/protein_pair_table_v2.txt', sep='\t')
pathways

Unnamed: 0,GENE,SUB_GENE,pair_pro,SUB_GENE.is_TF_downstream,SUB_GENE.is_TF_upstream,SUB_GENE.is_kinase_substrate,SUB_GENE.is_phosphatase_substrate,SUB_GENE.is_upstream_kinase,SUB_GENE.is_upstream_phosphatase,SUB_GENE.is_complex_partner
0,TP53,CDKN1A,TP53:CDKN1A,True,False,False,False,False,False,False
1,TP53,SIAH1,TP53:SIAH1,True,False,False,False,False,False,False
2,TP53,SFN,TP53:SFN,True,False,False,False,False,False,False
3,TP53,RPRM,TP53:RPRM,True,False,False,False,False,False,False
4,TP53,GADD45A,TP53:GADD45A,True,False,False,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...
831929,SETD2,SETD2,SETD2:SETD2,False,False,False,False,False,False,False
831930,PUMA,PUMA,PUMA:PUMA,False,False,False,False,False,False,False
831931,NOXA,NOXA,NOXA:NOXA,False,False,False,False,False,False,False
831932,FOXR2,FOXR2,FOXR2:FOXR2,False,False,False,False,False,False,False


In [9]:
pathways[[True if g in target_genes else False
         for g in pathways['GENE']]]

Unnamed: 0,GENE,SUB_GENE,pair_pro,SUB_GENE.is_TF_downstream,SUB_GENE.is_TF_upstream,SUB_GENE.is_kinase_substrate,SUB_GENE.is_phosphatase_substrate,SUB_GENE.is_upstream_kinase,SUB_GENE.is_upstream_phosphatase,SUB_GENE.is_complex_partner
0,TP53,CDKN1A,TP53:CDKN1A,True,False,False,False,False,False,False
1,TP53,SIAH1,TP53:SIAH1,True,False,False,False,False,False,False
2,TP53,SFN,TP53:SFN,True,False,False,False,False,False,False
3,TP53,RPRM,TP53:RPRM,True,False,False,False,False,False,False
4,TP53,GADD45A,TP53:GADD45A,True,False,False,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...
830817,IDH1,IDH1,IDH1:IDH1,False,False,False,False,False,False,False
831488,BCOR,BCOR,BCOR:BCOR,False,False,False,False,False,False,False
831552,ATRX,ATRX,ATRX:ATRX,False,False,False,False,False,False,False
831798,RQCD1,RQCD1,RQCD1:RQCD1,False,False,False,False,False,False,False


In [10]:
gene_to_subgenes = {t:[g for g in sorted(set(pathways[pathways['GENE']==t]['SUB_GENE'])) if g != t]
                    for t in target_genes}
target_genes[0], len(gene_to_subgenes[target_genes[0]])

('ABL1', 126)

#### train ols

In [11]:
combined.columns

Index(['AAAS_proteome', 'AAK1_proteome', 'AATF_proteome', 'ABCA1_proteome',
       'ABCA2_proteome', 'ABCB1_proteome', 'ABCB11_proteome', 'ABCC2_proteome',
       'ABCC3_proteome', 'ABCE1_proteome',
       ...
       'WIZ-211|WIZ|1066_288_299_2_2_S294S299_phospho',
       'WIZ-211|WIZ|1066_507_521_1_1_S521_phospho',
       'WIZ-211|WIZ|1066_542_549_1_1_S549_phospho',
       'WIZ-211|WIZ|1066_561_574_1_1_S561_phospho',
       'WIZ-211|WIZ|1066_750_755_1_1_T752_phospho',
       'WIZ-211|WIZ|1066_895_895_1_1_S895_phospho',
       'WIZ-211|WIZ|1066_932_936_1_1_S932_phospho',
       'TSGA10-219|TSGA10|789_173_179_1_1_S173_phospho',
       'TSGA10-219|TSGA10|789_779_786_1_1_S779_phospho',
       'SVIL-215|SVIL|1904_459_461_1_1_S459_phospho'],
      dtype='object', length=158939)

In [12]:
{'_'.join(x.split('_')[1:]) for x in combined.columns if 'expression' not in x}

{'',
 '1101_1115_2_2_S1103S1106_phospho',
 '235_244_1_0_phospho',
 '16_44_1_1_S33_phospho',
 '40_42_1_1_S42_phospho',
 '1649_1649_1_1_T1649_phospho',
 '838_855_1_1_S852_phospho',
 '1009_1016_1_1_S1013_phospho',
 '11_53_1_0_phospho',
 '1231_1231_1_1_S1231_phospho',
 '2390_2402_1_1_S2398_phospho',
 '27_36_1_1_S28_phospho',
 '425_441_3_0_phospho',
 '195_211_1_1_S199_phospho',
 '381_382_1_1_S381_phospho',
 '341_345_1_0_phospho',
 '2323_2332_1_1_T2332_phospho',
 '90_101_1_0_phospho',
 '10_25_1_1_S20_phospho',
 '2394_2406_1_0_phospho',
 '256_279_1_0_phospho',
 '23_27_2_0_phospho',
 '405_413_1_1_S413_phospho',
 '118_120_1_1_S120_phospho',
 '154_157_2_2_S154S157_phospho',
 '1097_1114_1_0_phospho',
 '1246_1261_1_0_phospho',
 '1248_1265_2_0_phospho',
 '2328_2335_1_1_S2335_phospho',
 '347_355_1_1_S354_phospho',
 '586_597_1_1_S587_phospho',
 '358_389_2_0_phospho',
 '264_275_1_1_S268_phospho',
 '27_48_1_1_S33_phospho',
 '164_177_1_1_S168_phospho',
 '123_132_1_1_S132_phospho',
 '1019_1028_1_1_S1028_

In [13]:
# [x for x in combined.columns if 'subtype' in x]

In [14]:
[x for x in combined.columns if 'clinical' in x]

['clinical_is_tumor',
 'clinical_age',
 'clinical_is_female',
 'clinical_predicted_ancestry_is_AFR',
 'clinical_predicted_ancestry_is_AMR',
 'clinical_predicted_ancestry_is_EAS',
 'clinical_predicted_ancestry_is_EUR',
 'clinical_predicted_ancestry_is_SAS']

In [14]:
def get_data_for_pair(gene, subgene, combined, features=['cnv', 'is_mutated', 'is_pathogenic_germline'],
                     standalone=['TumorPurity'], target='proteome', scale_features=['TumorPurity', 'cnv'],
                     scale_y=False):
    cols = [f'{gene}_{feat}' for feat in features]
    cols += standalone
    cols = sorted(set(cols))
    
    # check to make sure all columns are in X, if not then return None, None
    if f'{subgene}_{target}' not in combined.columns:
        return None, None
    if len([x for x in cols if x not in combined.columns]):
#         print(cols, f'{subgene}_{target}')
        return None, None
    
    X = combined[cols].copy()
    y = combined[[f'{subgene}_{target}']].copy()
    
    # filter NA
    X = X.dropna()
    
    if not X.shape[0]:
        return None, None
    
    to_scale = [c for c in cols for f in scale_features if f in c]
    prev = X[[c for c in X.columns if c not in to_scale]]
    
    if to_scale:
        scale = X[to_scale]
        scale = pd.DataFrame(data=StandardScaler().fit_transform(scale.values), columns=scale.columns, index=scale.index)
        X = pd.merge(scale, prev, left_index=True, right_index=True)
    
    # look for scale features in y
    if scale_y:
        y = pd.DataFrame(data=StandardScaler().fit_transform(y.values), columns=y.columns, index=y.index)
    
    try:
        X = X.astype(np.float32)
    except ValueError:
        return None, None
    y = y.loc[X.index]
    
    # make sure no y null
    y = y.dropna()
    X = X.loc[y.index]
    
    X.columns = [c.replace(gene, 'driver_gene') for c in X.columns]

    return X, y
    

In [15]:
import statsmodels.api as sm
from patsy import dmatrices
def run_ols(X, y):
    ground = y.values.flatten()
    df = pd.merge(X.copy(), y.copy(), left_index=True, right_index=True)
#     print(df.columns)
    command = f'{y.columns[0]} ~ ' + ' + '.join(X.columns)
    y_sm, X_sm = dmatrices(command, data=df, return_type='dataframe')
    
    model = sm.OLS(y_sm, X_sm)
    results = model.fit()
    
    coef_df = pd.DataFrame.from_dict({'coef': results.params.to_list(),
                           'p-value': results.pvalues.to_list()})
    coef_df.index = results.params.index.to_list()
    
    return {'result': results, 'coef_df': coef_df, 'r-squared': results.rsquared_adj,
           'r-squared p-value': results.f_pvalue, 'groundtruth': ground, 'predicted': results.fittedvalues.to_list(),
           'features': results.params.index.to_list(), 'X': X, 'y': y}

In [16]:
#### input_features: ['cnv', 'is_mutated', 'is_framshift', 'is_pathogenic_germline', 'Tumor purity', 'expression']

In [17]:
# features = ['cnv', 'has_truncating_mutation', 'is_pathogenic_germline']
# standalone = ['TumorPurity']
# target = 'expression'
# scale_features = ['TumorPurity', 'cnv']
# scale_y = False

In [20]:
# features = ['cnv', 'has_nonsilent_mutation', 'mutation_is_Silent', 'is_pathogenic_germline']
# standalone = ['TumorPurity']
# target = 'proteome'
# scale_features = ['TumorPurity', 'cnv']
# scale_y = False

In [21]:
# [c for c in combined.columns if 'subtype_BR' in c]

In [22]:
# features = ['cnv', 'has_nonsilent_mutation', 'is_pathogenic_germline']
# standalone = ['TumorPurity',
#              'is_subtype_BR_Basal',
#              'is_subtype_BR_Her2',
#              'is_subtype_BR_LumA',
#              'is_subtype_BR_LumB',
#              'is_subtype_BR_Normal-like',]
# target = 'proteome'
# scale_features = ['TumorPurity', 'cnv']
# scale_y = False

In [20]:
features = []
standalone = ['Purity',
 'clinical_is_tumor',
 'clinical_age',
 'clinical_is_female',
 'clinical_predicted_ancestry_is_AFR',
 'clinical_predicted_ancestry_is_AMR',
 'clinical_predicted_ancestry_is_EAS',
 'clinical_predicted_ancestry_is_EUR',
 'clinical_predicted_ancestry_is_SAS']
target = 'proteome'
scale_features = []
scale_y = False

In [18]:
features = ['cnv', 'has_nonsilent_mutation', 'is_pathogenic_germline']
standalone = ['TumorPurity']
target = 'phospho'
scale_features = ['TumorPurity', 'cnv']
scale_y = False

In [18]:
features = ['cnv', 'has_nonsilent_mutation', 'is_pathogenic_germline']
standalone = ['TumorPurity']
target = 'expression'
scale_features = ['TumorPurity', 'cnv']
scale_y = False

In [None]:
# features = ['cnv', 'has_nonsilent_mutation', 'is_pathogenic_germline']
# standalone = ['TumorPurity',
#              'methylation_subtype_1',
#              'methylation_subtype_2',
#              'methylation_subtype_3',
#              'methylation_subtype_4',
#              'methylation_subtype_5',
#              'methylation_subtype_6']
# target = 'proteome'
# scale_features = ['TumorPurity', 'cnv']
# scale_y = False

In [67]:
[c for c in combined.columns if 'clinical' in c]

['clinical_is_tumor',
 'clinical_age',
 'clinical_is_female',
 'clinical_predicted_ancestry_is_AFR',
 'clinical_predicted_ancestry_is_AMR',
 'clinical_predicted_ancestry_is_EAS',
 'clinical_predicted_ancestry_is_EUR',
 'clinical_predicted_ancestry_is_SAS']

In [32]:
features

['clinical_is_tumor',
 'clinical_age',
 'clinical_is_female',
 'clinical_predicted_ancestry_is_AFR',
 'clinical_predicted_ancestry_is_AMR',
 'clinical_predicted_ancestry_is_EAS',
 'clinical_predicted_ancestry_is_EUR',
 'clinical_predicted_ancestry_is_SAS']

In [23]:
results_dict = {}
for disease in sorted({x for x in combined['disease'] if not pd.isnull(x)}):
# for disease in ['BR']:
    print(disease)
    results_dict[disease] = {}
    filtered = combined[combined['disease']==disease]
    for i, (gene, subgenes) in enumerate(gene_to_subgenes.items()):
        if i % 10 == 0:
            print(i, gene)
        genes = [gene]
        genes += subgenes
        genes = sorted(set(genes))
        for subgene in genes:
            X, y = get_data_for_pair(gene, subgene, filtered, features=features,
                                    standalone=standalone, target=target, scale_features=scale_features,
                                    scale_y=scale_y)
            # make sure features have no -
            if X is not None and X.shape[0] and '-' not in gene and '-' not in subgene:

                X.columns = [c.replace('-', '_') for c in X.columns]
                r = run_ols(X.copy(), y.copy())
#                 print(r)
                results_dict[disease][f'{gene}_{subgene}'] = r
    
            

BRCA
0 ABL1
10 ARID1A
20 BCL2
30 CDK4
40 CSDE1
50 EGR3
60 FAT1
70 GNAQ
80 IDH1
90 KIF1A
100 MACF1
110 MGMT
120 NFE2L2
130 PHF6
140 PPP2R1A
150 RET
160 SETD2
170 STK11
180 TP53
CO
0 ABL1
10 ARID1A
20 BCL2
30 CDK4
40 CSDE1
50 EGR3
60 FAT1
70 GNAQ
80 IDH1
90 KIF1A
100 MACF1
110 MGMT
120 NFE2L2
130 PHF6
140 PPP2R1A
150 RET
160 SETD2
170 STK11
180 TP53
HNSCC
0 ABL1
10 ARID1A
20 BCL2
30 CDK4
40 CSDE1
50 EGR3
60 FAT1
70 GNAQ
80 IDH1
90 KIF1A
100 MACF1
110 MGMT
120 NFE2L2
130 PHF6
140 PPP2R1A
150 RET
160 SETD2
170 STK11
180 TP53
LSCC
0 ABL1
10 ARID1A
20 BCL2
30 CDK4
40 CSDE1
50 EGR3
60 FAT1
70 GNAQ
80 IDH1
90 KIF1A
100 MACF1
110 MGMT
120 NFE2L2
130 PHF6
140 PPP2R1A
150 RET
160 SETD2
170 STK11
180 TP53
LUAD
0 ABL1
10 ARID1A
20 BCL2
30 CDK4
40 CSDE1
50 EGR3
60 FAT1
70 GNAQ
80 IDH1
90 KIF1A
100 MACF1
110 MGMT
120 NFE2L2
130 PHF6
140 PPP2R1A
150 RET
160 SETD2
170 STK11
180 TP53
OV
0 ABL1
10 ARID1A
20 BCL2
30 CDK4
40 CSDE1
50 EGR3
60 FAT1
70 GNAQ
80 IDH1
90 KIF1A
100 MACF1
110 MGMT
120 NFE2L2
130 P

run this one for phospho


In [19]:
[x for x in combined.columns if 'phospho' in x][:10]

['M6PR-201|M6PR|277_267_267_1_1_S267_phospho',
 'ESRRA-201|ESRRA|423_19_22_1_0_phospho',
 'ESRRA-201|ESRRA|423_19_22_2_2_S19S22_phospho',
 'ESRRA-201|ESRRA|423_19_44_2_0_phospho',
 'ESRRA-201|ESRRA|423_19_44_2_2_S19S22_phospho',
 'ESRRA-201|ESRRA|423_19_44_3_0_phospho',
 'ESRRA-201|ESRRA|423_19_44_3_3_S22S26S27_phospho',
 'ESRRA-201|ESRRA|423_26_44_1_1_S27_phospho',
 'FKBP4-201|FKBP4|459_258_263_1_0_phospho',
 'FKBP4-201|FKBP4|459_258_263_1_1_S258_phospho']

In [43]:
results_dict = {}
x, y = None, None
for disease in sorted({x for x in combined['disease'] if not pd.isnull(x)}):
# for disease in ['BR']:
    print(disease)
    results_dict[disease] = {}
    filtered = combined[combined['disease']==disease]
    for i, (gene, subgenes) in enumerate(gene_to_subgenes.items()):
        if i % 10 == 0:
            print(i, gene)
        genes = [gene]
        genes += subgenes
        genes = sorted(set(genes))
        for subgene in genes:
            sites = ['_'.join(s.split('_')[:-1]) for s in filtered.columns if subgene in s and target in s]
            for site in sites:
#                 print(gene, site)
                X, y = get_data_for_pair(gene, site, filtered, features=features,
                                        standalone=standalone, target=target, scale_features=scale_features,
                                        scale_y=scale_y)
                x, y = (X, y)
# #                 if X is not None:
#                 print(gene, site, X.shape)
#                 print(X is not None)
#                 raise RuntimeError()
                # make sure features have no -
                if '-' in site and X is not None and y is not None:
                    y.columns = [y.columns[0].replace('-', '_').replace(' ', '_').replace('|', '_')]
                if X is not None and X.shape[0] and '-' not in gene and '-' not in subgene:
            
                    X.columns = [c.replace('-', '_') for c in X.columns]
                    r = run_ols(X.copy(), y.copy())
    #                 print(r)
                    results_dict[disease][f'{gene}_{site}'] = r

BR
0 ABL1
10 ARID1A
20 BCL2
30 CDK4
40 CSDE1
50 EGR3
60 FAT1
70 GNAQ
80 IDH1
90 KIF1A
100 MACF1
110 MGMT
120 NFE2L2
130 PHF6
140 PPP2R1A
150 RET
160 SETD2
170 STK11
180 TP53
CO
0 ABL1
10 ARID1A
20 BCL2
30 CDK4
40 CSDE1
50 EGR3
60 FAT1
70 GNAQ
80 IDH1
90 KIF1A
100 MACF1
110 MGMT
120 NFE2L2
130 PHF6
140 PPP2R1A
150 RET
160 SETD2
170 STK11
180 TP53
EC
0 ABL1
10 ARID1A
20 BCL2
30 CDK4
40 CSDE1
50 EGR3
60 FAT1
70 GNAQ
80 IDH1
90 KIF1A
100 MACF1
110 MGMT
120 NFE2L2
130 PHF6
140 PPP2R1A
150 RET
160 SETD2
170 STK11
180 TP53
GBM
0 ABL1
10 ARID1A
20 BCL2
30 CDK4
40 CSDE1
50 EGR3
60 FAT1
70 GNAQ
80 IDH1
90 KIF1A
100 MACF1
110 MGMT
120 NFE2L2
130 PHF6
140 PPP2R1A
150 RET
160 SETD2
170 STK11
180 TP53
HNSCC
0 ABL1
10 ARID1A
20 BCL2
30 CDK4
40 CSDE1
50 EGR3
60 FAT1
70 GNAQ
80 IDH1
90 KIF1A
100 MACF1
110 MGMT
120 NFE2L2
130 PHF6
140 PPP2R1A
150 RET
160 SETD2
170 STK11
180 TP53
LSCC
0 ABL1
10 ARID1A
20 BCL2
30 CDK4
40 CSDE1
50 EGR3
60 FAT1
70 GNAQ
80 IDH1
90 KIF1A
100 MACF1
110 MGMT
120 NFE2L2
130 PHF6

In [47]:
# adjust p-values for fdr in both R and coefs
import statsmodels
for disease, results in results_dict.items():
    print(disease)
    if results:
        keys, ps = zip(*[(k, d['r-squared p-value']) for k, d in results.items()])
        corrected = statsmodels.stats.multitest.fdrcorrection(ps)[1]
        for k, c in zip(keys, corrected): results[k]['r-squared FDR'] = c

        xs, ls, feats = [], [], []
        for feat in list(results.values())[0]['features']:
            temp_xs, temp_ls = zip(*[(k, d['coef_df'].loc[feat, 'p-value']) for k, d in results.items()
                          if feat in d['features']])
            xs += temp_xs
            ls += temp_ls
            feats += [feat] * len(temp_xs)

        keys, ps, fts = zip(*[(x, l, f) for x, l, f in zip(xs, ls, feats) if not pd.isnull(l)])
        corrected = statsmodels.stats.multitest.fdrcorrection(ps)[1]
        for k, c, feat in zip(keys, corrected, fts):
            if 'FDR' not in results[k]['coef_df'].columns:
                results[k]['coef_df']['FDR'] = np.nan
                results[k]['coef_df']['-log10(FDR)'] = np.nan
            else:
                results[k]['coef_df'].loc[feat, 'FDR'] = c
                results[k]['coef_df'].loc[feat, '-log10(FDR)'] = -np.log10(c)

    

BR
CO
EC
GBM
HNSCC
LSCC
LUAD
OV
PDA
ccRCC


In [83]:
results_dict['BR']['TP53_TP53']['result'].summary()

KeyError: 'BR'

In [85]:
results_dict['BR']

KeyError: 'BR'

In [None]:
results_dict['BR']['TP53_TP53']['coef_df']

In [None]:
results_dict['BR']['TP53_TP53']['result'].summary()

In [48]:
import json
def save_results_dict(filepath, results_dict):
    save_dict = {}
    for disease, d in results_dict.items():
        save_dict[disease] = {}
        for pair, d2 in d.items():
            save_dict[disease][pair] = {}
            for k, val in d2.items():
                if isinstance(val, pd.DataFrame):
                    save_dict[disease][pair][k] = val.to_dict()
                elif 'numpy.ndarray' in str(type(val)) and len(val):
                    save_dict[disease][pair][k] = [float(v) for v in val]
                elif 'numpy.float' in str(type(val)):
                    save_dict[disease][pair][k] = float(val)
                elif isinstance(val, statsmodels.regression.linear_model.RegressionResultsWrapper):
                    pass
                else:
                    save_dict[disease][pair][k] = val
    json.dump(save_dict, open(filepath, 'w'))
    
def load_results_dict(filepath):
    loaded = json.load(open(filepath))
    results_dict = {}
    for disease, d in loaded.items():
        results_dict[disease] = {}
        for pair, d2 in d.items():
            results_dict[disease][pair] = {}
            for k, val in d2.items():
                if k in ['coef_df', 'X', 'y']:
                    results_dict[disease][pair][k] = pd.DataFrame.from_dict(val)
                elif k in ['groundtruth', 'predicted', 'features']:
                    results_dict[disease][pair][k] = np.asarray(val)
                else:
                    results_dict[disease][pair][k] = val
    return results_dict

In [49]:
run_dir = os.path.join(result_dir, 'som_germ_cnv_pur_target_phospho')
Path(run_dir).mkdir(exist_ok=True, parents=True)

In [50]:
filepath = os.path.join(run_dir, 'results.json')
save_results_dict(filepath, results_dict)

In [51]:
# also save metadata
summary = {
    'features': features,
    'standalone': standalone,
    'target': target,
    'scale_features': scale_features,
    'scale_target': scale_y,
    'input_source': combined_fp,
}
json.dump(summary, open(os.path.join(run_dir, 'summary.json'), 'w'))

In [54]:
def get_master_coef_df(results_dict):
    master_coef_dict = None
    for disease, results in results_dict.items():
        for k, r in results.items():
            pieces = k.split('_')[0], '_'.join(k.split('_')[1:])
            driver, subgene = pieces
            n = r['coef_df'].shape[0]-1
            df_dict = r['coef_df'].iloc[1:, :].to_dict()
            order = r['coef_df'].index[1:].to_list()
            for k, v in df_dict.items():
                df_dict[k] = [v[o] for o in order]
            df_dict['feature'] = order

            # replace 1. fdr with np.nan
            if 'FDR' in df_dict:
                df_dict['FDR'] = [np.nan if pd.isnull(p) else x
                                  for p, x in zip(df_dict['p-value'], df_dict['FDR'])]
                df_dict['-log10(FDR)'] = [np.nan if pd.isnull(p) else x
                                          for p, x in zip(df_dict['p-value'], df_dict['-log10(FDR)'])]
            else:
                df_dict['FDR'] = [np.nan for i in range(n)]
                df_dict['-log10(FDR)'] = [np.nan for i in range(n)]
                
            

            df_dict['driver'] = [driver for i in range(n)]
            df_dict['target'] = [subgene for i in range(n)]
            df_dict['disease'] = [disease for i in range(n)]
            df_dict['model_r2'] = [r['r-squared'] for i in range(n)]
            df_dict['model_r2_FDR'] = [r['r-squared FDR'] for i in range(n)]

            if master_coef_dict is None:
                master_coef_dict = df_dict
            else:
                for k in master_coef_dict.keys():
                    master_coef_dict[k] += df_dict[k]
    master_coef_df = pd.DataFrame.from_dict(master_coef_dict)     
    master_coef_df.index = np.arange(master_coef_df.shape[0])
    
    return master_coef_df

In [55]:
master_coef_df = get_master_coef_df(results_dict)
master_coef_df.to_csv(os.path.join(run_dir, 'coef_results.txt'), sep='\t', index=False)

In [56]:
run_dir

'../results/06242021_new_results_format/som_germ_cnv_pur_target_phospho'

In [57]:
master_coef_df

Unnamed: 0,coef,p-value,FDR,-log10(FDR),feature,driver,target,disease,model_r2,model_r2_FDR
0,-0.016586,5.696457e-01,7.578885e-01,0.120395,driver_gene_cnv,ABL1,ABI1-204|ABI1|452_174_187_1_0,BR,0.020068,3.566061e-01
1,-0.061267,3.770408e-02,1.026869e-01,0.988485,TumorPurity,ABL1,ABI1-204|ABI1|452_174_187_1_0,BR,0.020068,3.566061e-01
2,-0.234853,3.007376e-01,5.128425e-01,0.290016,driver_gene_has_nonsilent_mutation,ABL1,ABI1-204|ABI1|452_174_187_1_0,BR,0.020068,3.566061e-01
3,0.000000,,,,driver_gene_is_pathogenic_germline,ABL1,ABI1-204|ABI1|452_174_187_1_0,BR,0.020068,3.566061e-01
4,-0.024269,3.489810e-01,5.651451e-01,0.247840,driver_gene_cnv,ABL1,ABI1-204|ABI1|452_174_187_1_1_S183,BR,0.018621,3.711768e-01
...,...,...,...,...,...,...,...,...,...,...
3518311,0.000000,,,,driver_gene_is_pathogenic_germline,ZNF750,ZNF777-201|ZNF777|831_604_610_1_0,ccRCC,0.041873,5.061344e-04
3518312,0.125712,8.089208e-21,2.521268e-20,19.598381,TumorPurity,ZNF750,ZNF839-209|ZNF839|811_670_677_1_0,ccRCC,0.246058,1.064838e-21
3518313,-0.019994,1.701699e-01,2.168608e-01,0.663819,driver_gene_cnv,ZNF750,ZNF839-209|ZNF839|811_670_677_1_0,ccRCC,0.246058,1.064838e-21
3518314,0.515714,7.632312e-04,1.355022e-03,2.868054,driver_gene_has_nonsilent_mutation,ZNF750,ZNF839-209|ZNF839|811_670_677_1_0,ccRCC,0.246058,1.064838e-21
