# CTD1 (aka CTRP) Drug Sensitivity Data Import
**Local Version**: 1
**Source Version**: 1

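This notebook imports the raw CTD (release 1, aka CTRP v1.0) drug sensitivity data, i.e. the area-under-curve (AUC) value measured for each cell line + compound combination.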

In [1]:
%run -m ipy_startup
%run -m ipy_logging

from mgds.data_aggregation import database as db
from mgds.data_aggregation import source as src
from mgds.data_aggregation import io_utils as io

# FTP URL of the CTRP v1.0 zip archive that contains the raw data files
source_file = 'ftp://caftpd.nci.nih.gov/pub/dcc_ctd2/Broad/CTRPv1.0_2013_pub_Cell_154_1151/CTRPv1.0_2013_pub_Cell_154_1151.zip'
# Relative path passed to the extraction helper together with source_file
# (presumably where the downloaded archive is written locally -- TODO confirm in io_utils)
dest_file = 'downloads/data.zip'

In [2]:
# Load the tab-separated AUC table stored inside the zip archive referenced by source_file
# (dest_file is forwarded to the helper; presumably the local copy of the archive)
d = io.extract_ftp_zip_to_data_frame(
    source_file,
    dest_file,
    'v10.D3.area_under_conc_curve.txt',
    sep='\t'
)

In [3]:
# Summarize the loaded frame (columns, non-null counts, dtypes, memory usage) to confirm
# that the cell line, compound and AUC columns were parsed and contain no missing values
d.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 50531 entries, 0 to 50530
Data columns (total 3 columns):
ccl_name            50531 non-null object
cpd_name            50531 non-null object
area_under_curve    50531 non-null float64
dtypes: float64(1), object(2)
memory usage: 1.2+ MB


In [4]:
# Collapse possibly duplicated experiments for the same cell line + compound combination into
# the mean and standard deviation of their AUC values.
#
# Notes:
# - The aggregations are requested as a list and renamed afterwards, because passing a
#   {new_name: func} dict to SeriesGroupBy.agg was deprecated in pandas 0.20 and raises
#   SpecificationError ("nested renamer is not supported") from pandas 1.0 on.
# - 'std' is the sample standard deviation (ddof=1), which is NaN for combinations with only a
#   single experiment; those are reported as 0. Only VALUE_STD is filled so that an unexpected
#   missing value in any other column is not silently turned into 0 and is still caught by the
#   null check performed before saving.
d_exp = d.groupby(['ccl_name', 'cpd_name'])['area_under_curve']\
    .agg(['mean', 'std'])\
    .rename(columns={'mean': 'VALUE_MEAN', 'std': 'VALUE_STD'})\
    .reset_index()\
    .fillna({'VALUE_STD': 0})
d_exp = d_exp.rename(columns={'ccl_name': 'TUMOR_ID', 'cpd_name': 'DRUG_ID'})
d_exp.head()

Unnamed: 0,TUMOR_ID,DRUG_ID,VALUE_MEAN,VALUE_STD
0,5637,"16,16-dimethylprostaglandin-E2",6.7149,0.0
1,5637,16-beta-bromoandrosterone,5.8866,0.0
2,5637,2-deoxyglucose,6.2317,0.0
3,5637,5-benzyl-9-tert-butyl-paullone,6.397,0.0
4,5637,6-NBDG,6.9988,0.0


In [5]:
# Sanity check on the aggregation: every (cell line, drug) pair may appear at most once
records_per_pair = d_exp.groupby(['TUMOR_ID', 'DRUG_ID']).size()
assert records_per_pair.max() <= 1, \
    'Found at least one cell line + drug combination with more than one record (should not be possible)'

In [6]:
# Refuse to persist the frame if any value is null, then store it via the project database
# helper under the CTD v1 source / RAW stage with the 'sensitivity' label
contains_no_nulls = np.all(pd.notnull(d_exp))
assert contains_no_nulls
db.save(d_exp, src.CTD_v1, db.RAW, 'sensitivity')

'/Users/eczech/data/research/musc_genomics_db/raw/ctd_v1_sensitivity.pkl'