In [1]:
%run -m ipy_startup
%run -m ipy_logging
%matplotlib inline
%load_ext rpy2.ipython
from musc_genomics.data_import import cgds, api, prep
from musc_genomics.data_modeling import modeling
from musc_genomics import data
# Raise the row threshold below which DataFrame.info() reports per-column
# non-null counts, so the large frames in this notebook still show them.
# The fully-qualified option name is used because abbreviated names are
# resolved by pattern matching and can become ambiguous as pandas adds options.
pd.set_option('display.max_info_rows', int(1E9))

# Load Cosmic Response Data

In [2]:
# Load the saved 'response_cosmic' dataset stored under 'prepared' via the
# project's data module, then print its column / dtype / non-null summary.
d_cosmic = data.load('prepared', 'response_cosmic')
d_cosmic.info()

2016-07-19 14:58:23,185:DEBUG:research.project.manager: Loading saved data from location "/Users/eczech/data/research/musc_genomics/prepared/response_cosmic.pkl"


<class 'pandas.core.frame.DataFrame'>
Int64Index: 958836 entries, 0 to 1187759
Data columns (total 8 columns):
CellLine             958836 non-null object
CosmicID             958836 non-null int64
CancerType           958836 non-null object
Tissue               958836 non-null object
ExperimentDetails    958836 non-null object
Value                958836 non-null float64
DrugName             958836 non-null object
ValueUnit            958836 non-null object
dtypes: float64(1), int64(1), object(6)
memory usage: 65.8+ MB


# Load CGDS Features

In [4]:
# Load the saved 'features_raw' dataset stored under 'prepared' and preview
# its top-left corner (first 5 rows, first 10 columns).
d_cgds = data.load('prepared', 'features_raw')
d_cgds.iloc[:5, :10]

2016-07-19 14:58:28,537:DEBUG:research.project.manager: Loading saved data from location "/Users/eczech/data/research/musc_genomics/prepared/features_raw.pkl"


FEATURE,CN:A1BG,CN:A1BG-AS1,CN:A1CF,CN:A2M,CN:A2ML1,CN:A4GALT,CN:A4GNT,CN:AAAS,CN:AACS,CN:AACSP1
TUMOR_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
1321N1,-0.1544,-0.1544,-0.0985,-0.1819,-0.1819,-0.1732,0.2575,-0.0404,-0.1284,0.1682
143B,-0.212,-0.212,0.158,-0.1753,-0.1753,-0.2536,0.206,-0.1445,-0.2757,0.215
22RV1,-0.08084,-0.08084,-0.0025,0.4486,0.4486,-0.0574,-0.0303,0.4649,0.4406,-0.0518
2313287,-0.0331,-0.0331,-0.05348,-0.1917,-0.1917,-0.0512,-0.04608,-0.01474,-0.0516,-0.0514
253JBV,,,,,,,,,,


In [6]:
# For each row (TUMOR_ID), compute the fraction of 'GE:'-prefixed feature
# columns that are null, then count how many rows share each fraction.
# DataFrame.isnull() yields the same boolean frame as the column-by-column
# .apply(pd.isnull), without a Python-level call per column.
d_cgds.filter(regex='^GE:').isnull().mean(axis=1).value_counts()

0.0    967
1.0     52
dtype: int64

In [9]:
# Fraction of null values per row (TUMOR_ID) across the feature columns whose
# names start with 'MU:AAC', bucketed into left-closed intervals
# ([0.8, 0.9), [0.9, 0.97), [0.97, 0.99), [0.99, 1), [1, inf)) and counted.
# Rows with a fraction below 0.8 fall outside every bin and are not counted.
# DataFrame.isnull() replaces the slower column-by-column .apply(pd.isnull).
(
    d_cgds.filter(regex='^MU:AAC')
    .isnull()
    .mean(axis=1)
    .pipe(pd.cut, bins=[.8, .9, .97, .99, 1, np.inf], right=False)
    .value_counts()
)

[0.99, 1)       889
[1, inf)        115
[0.97, 0.99)     15
[0.9, 0.97)       0
[0.8, 0.9)        0
dtype: int64

# Merge

In [10]:
# Drugs selected for further study.  Keep this as the single list that grows
# over time, rather than repeating it per scenario and saving several
# differently-merged results.
drug_names = [
    'PD-0332991',  # Palbociclib
    'ABT-263',     # Navitoclax
    'SB590885',
    'PLX4720',
    'Nutlin-3a',
    'AZD2281',
    'AG-014699',
]

d = modeling.get_modeling_data(d_cosmic, d_cgds, drug_names)
d.info()

# Sanity check: each requested drug is expected to produce a response column.
# The recorded output of this notebook shows columns named 'RES:VAL:' followed
# by the upper-cased drug name, and only six such columns for the seven drugs
# requested here (none for 'AZD2281'), so report any drug that was dropped
# instead of letting it disappear silently from the exported data.
# NOTE(review): the column naming is inferred from the recorded output, not
# from get_modeling_data itself -- confirm against that function.
missing_drugs = [
    name for name in drug_names
    if 'RES:VAL:' + name.upper() not in d.columns
]
if missing_drugs:
    print('WARNING: no response column for requested drug(s): {}'.format(missing_drugs))

<class 'pandas.core.frame.DataFrame'>
Index: 416 entries, 22RV1 to ZR7530
Columns: 107384 entries, CN:A1BG to RES:VAL:SB590885
dtypes: float64(107378), object(6)
memory usage: 340.8+ MB


In [11]:
# Summarise the response ('RES:') columns of the merged frame.  The pattern is
# anchored with '^' so that only columns whose name starts with 'RES:' are
# selected, consistent with the '^RES:|^CL:' filter used in the analysis section.
d.filter(regex='^RES:').info()

<class 'pandas.core.frame.DataFrame'>
Index: 416 entries, 22RV1 to ZR7530
Data columns (total 8 columns):
RES:CANCERTYPE        416 non-null object
RES:TISSUE            416 non-null object
RES:VAL:ABT-263       411 non-null float64
RES:VAL:AG-014699     409 non-null float64
RES:VAL:NUTLIN-3A     411 non-null float64
RES:VAL:PD-0332991    389 non-null float64
RES:VAL:PLX4720       411 non-null float64
RES:VAL:SB590885      396 non-null float64
dtypes: float64(6), object(2)
memory usage: 29.2+ KB


# Export

In [12]:
# Persist the merged modeling frame as 'modeling_data_merge_raw' under 'prepared'.
# The call's return value (the saved file's path, per the recorded output) is
# what this cell displays.
data.save('prepared', 'modeling_data_merge_raw', d)

2016-07-19 15:00:20,829:DEBUG:research.project.manager: Saving data to location "/Users/eczech/data/research/musc_genomics/prepared/modeling_data_merge_raw.pkl"


'/Users/eczech/data/research/musc_genomics/prepared/modeling_data_merge_raw.pkl'

# Analysis

In [13]:
# Preview the first rows of the columns whose names start with 'RES:' or 'CL:'.
d.filter(regex='^RES:|^CL:').head()

FEATURE,CL:CANCER_TYPE,CL:HISTOLOGY,CL:PRIMARY_SITE,CL:GENDER,RES:CANCERTYPE,RES:TISSUE,RES:VAL:ABT-263,RES:VAL:AG-014699,RES:VAL:NUTLIN-3A,RES:VAL:PD-0332991,RES:VAL:PLX4720,RES:VAL:SB590885
TUMOR_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
22RV1,solid,carcinoma,prostate,male,urogenital_system,prostate,5.010086,2.676501,2.549906,0.517856,6.697773,4.639378
2313287,solid,carcinoma,stomach,male,digestive_system,stomach,3.649499,6.000943,3.039341,3.912658,6.753041,6.373648
5637,solid,carcinoma,urinary_tract,male,urogenital_system,bladder,1.283993,2.897053,6.412843,5.88713,2.998122,5.108025
639V,solid,carcinoma,urinary_tract,male,urogenital_system,bladder,3.378504,5.784257,3.779543,3.054728,6.816524,6.274532
647V,solid,carcinoma,urinary_tract,male,urogenital_system,bladder,4.51045,3.973549,6.739784,5.881208,6.842532,6.625275


In [14]:
# Rows where CL:PRIMARY_SITE is 'skin' while RES:TISSUE is 'large_intestine';
# only the first 5 rows and first 5 columns of the match are shown.
(
    d
    .loc[d['CL:PRIMARY_SITE'].eq('skin') & d['RES:TISSUE'].eq('large_intestine')]
    .iloc[:5, :5]
)

FEATURE,CN:A1BG,CN:A1BG-AS1,CN:A1CF,CN:A2M,CN:A2ML1
TUMOR_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
COLO741,-0.8174,-0.8174,-0.2233,0.0218,0.0218


In [15]:
# Count rows for every observed (CL:PRIMARY_SITE, RES:TISSUE) combination.
d.groupby(['CL:PRIMARY_SITE', 'RES:TISSUE']).size()

CL:PRIMARY_SITE                     RES:TISSUE                        
autonomic_ganglia                   neuroblastoma                          9
bone                                bone_other                             1
                                    ewings_sarcoma                         1
                                    osteosarcoma                           3
                                    rhabdomyosarcoma                       1
breast                              breast                                34
central_nervous_system              glioma                                25
                                    medulloblastoma                        2
endometrium                         endometrium                            9
haematopoietic_and_lymphoid_tissue  acute_myeloid_leukaemia               11
                                    anaplastic_large_cell_lymphoma         1
                                    b_cell_leukemia                        3
     