# CTD2 (aka CTRP) Drug Sensitivity Data Import
**Local Version**: 2
**Source Version**: 2

This notebook will import raw CTD (release 2) drug sensitivity data.

In [4]:
%run -m ipy_startup
%run -m ipy_logging
%matplotlib inline
from mgds.data_aggregation import database as db
from mgds.data_aggregation import source as src
from mgds.data_aggregation import io_utils as io

source_file = 'ftp://caftpd.nci.nih.gov/pub/dcc_ctd2/Broad/CTRPv2.0_2015_ctd2_ExpandedDataset/CTRPv2.0_2015_ctd2_ExpandedDataset.zip'
dest_file = db.raw_file(src.CTD_v2, 'drug-sensitivity.zip')

In [5]:
def get_data(filename):
    return io.extract_ftp_zip_to_data_frame(source_file, dest_file, filename, sep='\t')

In [6]:
d_auc = get_data('v20.data.curves_post_qc.txt')
d_auc.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 395263 entries, 0 to 395262
Data columns (total 17 columns):
experiment_id         395263 non-null int64
conc_pts_fit          395263 non-null int64
fit_num_param         395263 non-null int64
p1_conf_int_high      395263 non-null float64
p1_conf_int_low       395263 non-null float64
p2_conf_int_high      395263 non-null float64
p2_conf_int_low       395263 non-null float64
p4_conf_int_high      306346 non-null float64
p4_conf_int_low       306346 non-null float64
p1_center             395263 non-null float64
p2_slope              395263 non-null float64
p3_total_decline      395263 non-null float64
p4_baseline           395263 non-null float64
apparent_ec50_umol    395263 non-null float64
pred_pv_high_conc     395263 non-null float64
area_under_curve      395263 non-null float64
master_cpd_id         395263 non-null int64
dtypes: float64(13), int64(4)
memory usage: 51.3 MB


In [7]:
d_cmpd = get_data('v20.meta.per_compound.txt')
d_cmpd.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 545 entries, 0 to 544
Data columns (total 11 columns):
master_cpd_id                     545 non-null int64
cpd_name                          545 non-null object
broad_cpd_id                      545 non-null object
top_test_conc_umol                545 non-null float64
cpd_status                        545 non-null object
inclusion_rationale               545 non-null object
gene_symbol_of_protein_target     415 non-null object
target_or_activity_of_compound    545 non-null object
source_name                       495 non-null object
source_catalog_id                 359 non-null object
cpd_smiles                        545 non-null object
dtypes: float64(1), int64(1), object(9)
memory usage: 46.9+ KB


In [8]:
d_exp = get_data('v20.meta.per_experiment.txt')
d_exp.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1061 entries, 0 to 1060
Data columns (total 9 columns):
experiment_id      1061 non-null int64
run_id             1061 non-null object
experiment_date    1061 non-null int64
culture_media      1061 non-null object
baseline_signal    1061 non-null float64
cells_per_well     1061 non-null int64
growth_mode        1061 non-null object
snp_fp_status      1061 non-null object
master_ccl_id      1061 non-null int64
dtypes: float64(1), int64(4), object(4)
memory usage: 74.7+ KB


In [9]:
d_cl = get_data('v20.meta.per_cell_line.txt')
d_cl.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1107 entries, 0 to 1106
Data columns (total 6 columns):
master_ccl_id          1107 non-null int64
ccl_name               1107 non-null object
ccl_availability       1107 non-null object
ccle_primary_site      1045 non-null object
ccle_primary_hist      1032 non-null object
ccle_hist_subtype_1    779 non-null object
dtypes: int64(1), object(5)
memory usage: 52.0+ KB


## Merge Datasets

In [10]:
d = pd.merge(d_auc, d_cmpd, on='master_cpd_id')
d = pd.merge(d, d_exp, on='experiment_id')
d = pd.merge(d, d_cl, on='master_ccl_id')
d = d[['ccl_name', 'cpd_name', 'area_under_curve', 'experiment_id']]
d.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 462784 entries, 0 to 462783
Data columns (total 4 columns):
ccl_name            462784 non-null object
cpd_name            462784 non-null object
area_under_curve    462784 non-null float64
experiment_id       462784 non-null int64
dtypes: float64(1), int64(1), object(2)
memory usage: 17.7+ MB


In [11]:
# At TOW, there were no unique cell line + drug + experiment id combinations that resulted in different 
# response AUC values, so make sure that this is always true (so that uniqueness can be assumed along these
# dimensions otherwise)
is_valid = np.all(d.groupby(['ccl_name', 'cpd_name', 'experiment_id'])['area_under_curve'].nunique() == 1)
assert is_valid, 'Found at least one cell line, compound, and experiment with conflicting AUC values'

In [12]:
# Distribution of number of experimental repetitions per cell line + drug combination
d.groupby(['ccl_name', 'cpd_name']).size().value_counts()

1    315186
2     68990
3      2198
4       756
dtype: int64

In [13]:
# Determine the mean AUC value across possibly duplicated experiments for the same cell line + compound combinations
d_exp = d.groupby(['ccl_name', 'cpd_name'])['area_under_curve']\
    .agg({'VALUE_MEAN': np.mean, 'VALUE_STD': np.std, 'VALUE_CT': 'count'}).reset_index().fillna(0)
d_exp = d_exp.rename(columns={'ccl_name': 'TUMOR_ID', 'cpd_name': 'DRUG_ID'})
d_exp.head()

Unnamed: 0,TUMOR_ID,DRUG_ID,VALUE_STD,VALUE_MEAN,VALUE_CT
0,2004,16-beta-bromoandrosterone,0.0,13.712,1
1,2004,"1S,3R-RSL-3",0.0,6.5377,1
2,2004,A-804598,0.0,15.839,1
3,2004,AA-COCF3,0.0,12.008,1
4,2004,ABT-199,0.0,14.476,1


In [14]:
assert d_exp.groupby(['TUMOR_ID', 'DRUG_ID']).size().max() <= 1, \
    'Found at least one cell line + drug combination with more than one record (should not be possible)'

## Export

In [15]:
assert np.all(pd.notnull(d_exp))
db.save(d_exp, src.CTD_v2, db.IMPORT, 'drug-sensitivity')

'/Users/eczech/data/research/mgds/import/ctd_v2_drug-sensitivity.pkl'