In [1]:
import pandas as pd
import numpy as np

# Data loading

### Expression levels

In [2]:
# Load the expression table stored under the 'non_brca' key of the HDF5 file.
non_brca_ex = pd.read_hdf(
    '../data/TCGA_gene_exp_20k_std-MAD.h5',
    key='non_brca',
)

In [3]:
# Dimensions (rows, columns) of the freshly loaded expression frame.
non_brca_ex.shape

(9285, 20000)

In [4]:
import ast

# The text file holds a Python literal; read it fully, then parse it with
# literal_eval, which only accepts literals and never executes code.
with open('../common_genes_1-5.txt', 'r') as f:
    common_genes_text = f.read()
common_genes = ast.literal_eval(common_genes_text)

In [5]:
# Keep only the columns named in common_genes (all rows retained).
non_brca_ex = non_brca_ex.loc[:, list(common_genes)]

In [6]:
# Re-check the dimensions: only the columns listed in common_genes should remain.
non_brca_ex.shape

(9285, 1492)

In [7]:
# Read the CSV and discard its first column
# (presumably a saved index column -- TODO confirm against the file).
non_collinear_genes = pd.read_csv('non_collinear_genes.csv').iloc[:, 1:]

In [8]:
# Collapse the frame to a plain list holding the entries of its first row.
# NOTE: this rebinds the name from a DataFrame to a list, so the cell cannot
# be re-run without first re-running the cell that reads the CSV.
non_collinear_genes = non_collinear_genes.iloc[0].tolist()

In [9]:
# Restrict the expression frame to the columns listed in non_collinear_genes.
non_brca_ex = non_brca_ex.loc[:, non_collinear_genes]

In [10]:
# Dimensions after restricting the columns to non_collinear_genes.
non_brca_ex.shape

(9285, 149)

### Overall survival (OS)

In [11]:
# Load the table stored under 'non_brca_clinical' and keep only the two
# survival columns: 'OS' and 'OS.time'.
non_brca_os = pd.read_hdf(
    '../data/TCGA_data.h5',
    key='non_brca_clinical',
)[['OS', 'OS.time']]

In [12]:
## non_brca
# Columns to clean, in processing order. They are used further down as the
# duration ('OS.time') and event ('OS') columns of the Cox model.
survival_cols = ['OS.time', 'OS']

# The literal string 'NaN' marks a missing entry; turn it into a real np.nan
# so that dropna() can detect it.
for col in survival_cols:
    non_brca_os[col] = non_brca_os[col].map(lambda x: np.nan if x == 'NaN' else x)

# Discard rows lacking either the survival time or the event status.
non_brca_os.dropna(subset=survival_cols, inplace=True)

# Cast what remains to float.
for col in survival_cols:
    non_brca_os[col] = non_brca_os[col].astype(float)

In [13]:
# Row count here = rows that still have both 'OS' and 'OS.time' after dropna.
non_brca_os.shape

(9224, 2)

In [14]:
# Align the expression rows with the survival table: keep exactly the index
# labels present in non_brca_os, in that order (a missing label raises KeyError).
non_brca_ex = non_brca_ex.loc[non_brca_os.index, :]

In [15]:
# Count the positions where both indexes carry the same label; a value equal
# to the number of rows means the two frames are aligned row for row.
(non_brca_ex.index == non_brca_os.index).sum()

9224

In [16]:
# Put the expression columns and the two survival columns side by side.
# `join_axes` was deprecated in pandas 0.25 and removed in 1.0, so passing it
# raises TypeError on current pandas. The documented replacement is to
# reindex the concatenated result, which keeps the same row labels and order
# (those of non_brca_ex).
non_brca_data = pd.concat([non_brca_ex, non_brca_os], axis=1).reindex(non_brca_ex.index)

In [17]:
# Expect the expression columns plus the two survival columns ('OS', 'OS.time').
non_brca_data.shape

(9224, 151)

In [18]:
from lifelines import CoxPHFitter

In [19]:
# Fit a Cox proportional-hazards model on the combined frame: 'OS.time' is the
# duration, 'OS' the event indicator, and every remaining column is treated as
# a covariate.
cph = CoxPHFitter()
cph.fit(
    non_brca_data,
    duration_col='OS.time',
    event_col='OS',
    show_progress=True,  # log one line per optimisation iteration
)

# Print the fitted-model summary (also available via cph.summary).
cph.print_summary()  # access the results using cph.summary

Iteration 1: norm_delta = 0.97759, step_size = 0.9500, ll = -25414.45343, newton_decrement = 987.05193, seconds_since_start = 1.3
Iteration 2: norm_delta = 0.25606, step_size = 0.9500, ll = -24590.62100, newton_decrement = 134.50378, seconds_since_start = 2.3
Iteration 3: norm_delta = 0.07815, step_size = 0.9500, ll = -24442.55146, newton_decrement = 13.53672, seconds_since_start = 3.6
Iteration 4: norm_delta = 0.02494, step_size = 1.0000, ll = -24427.70157, newton_decrement = 0.87339, seconds_since_start = 4.9
Iteration 5: norm_delta = 0.00214, step_size = 1.0000, ll = -24426.78879, newton_decrement = 0.00467, seconds_since_start = 6.2
Iteration 6: norm_delta = 0.00001, step_size = 1.0000, ll = -24426.78410, newton_decrement = 0.00000, seconds_since_start = 7.0
Iteration 7: norm_delta = 0.00000, step_size = 1.0000, ll = -24426.78410, newton_decrement = 0.00000, seconds_since_start = 7.7
Convergence completed after 7 iterations.
<lifelines.CoxPHFitter: fitted with 9224 observations, 61