# CCLE Raw Clinical Data Importation
**Local Version**: 1
**Source Version**: NA

This notebook imports raw CCLE clinical data through the [CGDS](http://www.cbioportal.org/cgds_r.jsp) ("Cancer Genomics Data Server") portal.

In [9]:
%run -m ipy_startup
%run -m ipy_logging
%matplotlib inline
from mgds.data_aggregation import database as db
from mgds.data_aggregation import source as src
from mgds.data_aggregation.import_lib import cgds
from mgds.data_aggregation.import_lib import ccle
from mgds.data_aggregation import entity
from py_utils import assertion_utils

In [2]:
# Identifier of the CCLE case list whose clinical data is requested from CGDS
case_list_id = ccle.CASE_LIST_ID


def op():
    """Fetch the clinical data for the CCLE case list via the CGDS client."""
    return cgds.get_clinical_data(case_list_id)


# Route the fetch through the raw-operation cache (keyed by source + 'cellline-meta'),
# then apply the CGDS clinical-data preparation step to whatever comes back
d = cgds.prep_clinical_data(
    db.cache_raw_operation(op, src.CCLE_v1, 'cellline-meta')
)
d.info()

2016-12-01 14:59:41,640:DEBUG:mgds.data_aggregation.io_utils: Restoring serialized object from "/Users/eczech/data/research/mgds/raw/ccle_v1_cellline-meta.pkl"


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1019 entries, 0 to 1018
Data columns (total 9 columns):
CANCER_TYPE             1019 non-null object
CANCER_TYPE_DETAILED    1019 non-null object
CELL_LINE_ID            1019 non-null object
DATA_SOURCE             946 non-null object
GENDER                  834 non-null object
HISTOLOGICAL_SUBTYPE    703 non-null object
HISTOLOGY               946 non-null object
PRIMARY_SITE            946 non-null object
TUMOR_TYPE              946 non-null object
dtypes: object(9)
memory usage: 71.7+ KB


In [3]:
# Tally the GENDER labels present in the raw data; value_counts() leaves out
# nulls by default, so the counts cover only the rows with a recorded gender
d['GENDER'].value_counts()

Male      469
Female    365
Name: GENDER, dtype: int64

In [17]:
# Work on a copy so the prepared frame `d` is never mutated and this cell can be re-run safely
d_cl = d.copy()

# Everything after the first underscore of the cell line id is taken as the site
# (e.g. HT115_LARGE_INTESTINE -> LARGE_INTESTINE); an id with no underscore yields ''
d_cl_site = d_cl['CELL_LINE_ID'].str.split('_').str[1:].str.join('_')

# Make sure that suffix on cell line id is equal to primary site and that when they're
# not equal, it's because there is no primary site recorded
mask = (d_cl_site == d_cl['PRIMARY_SITE'].str.upper())
assert d_cl.loc[~mask, 'PRIMARY_SITE'].isnull().all(), \
    'Cell line id suffix disagrees with a non-null PRIMARY_SITE'

# Default to primary site in cell line id suffix when primary site field is null,
# and normalize every site to upper case
d_cl['PRIMARY_SITE'] = d_cl['PRIMARY_SITE'].fillna(d_cl_site).str.upper()

# Make sure primary site is no longer ever null -- nor empty, which is what an id
# without a "_SITE" suffix would otherwise silently produce
assert d_cl['PRIMARY_SITE'].notnull().all(), \
    'PRIMARY_SITE still contains null values'
assert d_cl['PRIMARY_SITE'].str.len().gt(0).all(), \
    'PRIMARY_SITE contains empty values (cell line id without a site suffix)'

# Convert gender to M/F; rows with no recorded gender stay null after the mapping
assert d_cl['GENDER'].dropna().isin(['Male', 'Female']).all(), \
    'Unexpected GENDER value (expected only Male/Female or null)'
d_cl['GENDER'] = d_cl['GENDER'].map({'Male': 'M', 'Female': 'F'})

# Add generic properties map with more source-specific metadata (one dict per row,
# keyed by the detail column names)
c_detail = ['CANCER_TYPE', 'CANCER_TYPE_DETAILED', 'DATA_SOURCE', 'HISTOLOGY', 'HISTOLOGICAL_SUBTYPE', 'TUMOR_TYPE']
d_cl['PROPERTIES'] = d_cl[c_detail].apply(lambda r: r.to_dict(), axis=1)

d_cl.head()

Unnamed: 0,CANCER_TYPE,CANCER_TYPE_DETAILED,CELL_LINE_ID,DATA_SOURCE,GENDER,HISTOLOGICAL_SUBTYPE,HISTOLOGY,PRIMARY_SITE,TUMOR_TYPE,PROPERTIES
0,Cancer of Unknown Primary,Mixed Cancer Types,HCC78_LUNG,DSMZ,M,adenocarcinoma,carcinoma,LUNG,lung_NSC,"{'CANCER_TYPE': 'Cancer of Unknown Primary', '..."
1,Cancer of Unknown Primary,Mixed Cancer Types,COLO800_SKIN,DSMZ,,,malignant_melanoma,SKIN,melanoma,"{'CANCER_TYPE': 'Cancer of Unknown Primary', '..."
2,Cancer of Unknown Primary,Mixed Cancer Types,SKMEL1_SKIN,ATCC,M,,malignant_melanoma,SKIN,melanoma,"{'CANCER_TYPE': 'Cancer of Unknown Primary', '..."
3,Cancer of Unknown Primary,Mixed Cancer Types,HT115_LARGE_INTESTINE,ECACC,,,carcinoma,LARGE_INTESTINE,colorectal,"{'CANCER_TYPE': 'Cancer of Unknown Primary', '..."
4,Cancer of Unknown Primary,Mixed Cancer Types,ECC12_STOMACH,RIKEN,,small_cell_adenocarcinoma,carcinoma,STOMACH,stomach,"{'CANCER_TYPE': 'Cancer of Unknown Primary', '..."


In [5]:
# Summarize the dtypes and non-null counts of the transformed cell line frame.
# NOTE(review): the recorded output reports GENDER as fully non-null, while the
# head() output of the transformation cell shows blank GENDER values and the
# execution counts are out of order -- this output looks stale; re-run to confirm.
d_cl.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1019 entries, 0 to 1018
Data columns (total 10 columns):
CANCER_TYPE             1019 non-null object
CANCER_TYPE_DETAILED    1019 non-null object
CELL_LINE_ID            1019 non-null object
DATA_SOURCE             946 non-null object
GENDER                  1019 non-null object
HISTOLOGICAL_SUBTYPE    703 non-null object
HISTOLOGY               946 non-null object
PRIMARY_SITE            1019 non-null object
TUMOR_TYPE              946 non-null object
PROPERTIES              1019 non-null object
dtypes: object(10)
memory usage: 79.7+ KB


In [6]:
# Count cell lines per (upper-cased) primary site to eyeball the site vocabulary
d_cl['PRIMARY_SITE'].value_counts()

LUNG                                  184
HAEMATOPOIETIC_AND_LYMPHOID_TISSUE    180
SKIN                                   62
LARGE_INTESTINE                        60
BREAST                                 59
CENTRAL_NERVOUS_SYSTEM                 55
OVARY                                  51
PANCREAS                               46
STOMACH                                38
UPPER_AERODIGESTIVE_TRACT              33
KIDNEY                                 33
BONE                                   29
ENDOMETRIUM                            28
URINARY_TRACT                          28
LIVER                                  28
OESOPHAGUS                             26
SOFT_TISSUE                            20
AUTONOMIC_GANGLIA                      17
THYROID                                12
PLEURA                                 11
PROSTATE                                8
BILIARY_TRACT                           8
SALIVARY_GLAND                          2
SMALL_INTESTINE                   

In [7]:
# Count cell lines per DATA_SOURCE; value_counts() leaves out nulls by default,
# so rows without a recorded source are not represented here
d_cl['DATA_SOURCE'].value_counts()

ATCC            428
DSMZ            212
HSRRB           116
RIKEN            62
ECACC            54
KCLB             51
Academic Lab     10
NCI/DCTD          7
ICLC              6
Name: DATA_SOURCE, dtype: int64

## Export

In [19]:
# Columns carried into the exported cell line metadata
c_export = ['CELL_LINE_ID', 'PRIMARY_SITE', 'GENDER', 'PROPERTIES']

# Subset to the export columns and pass the result through the shared
# cell line metadata preparation routine, then inspect what comes back
d_exp = entity.prepare_cellline_meta(d_cl[c_export])
d_exp.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1019 entries, 0 to 1018
Data columns (total 4 columns):
CELL_LINE_ID    1019 non-null object
PRIMARY_SITE    1019 non-null object
GENDER          1019 non-null object
PROPERTIES      1019 non-null object
dtypes: object(4)
memory usage: 31.9+ KB


In [20]:
# Persist the prepared cell line metadata under the CCLE v1 source in the IMPORT
# stage; the value returned by db.save is displayed (the recorded output is a file path)
db.save(d_exp, src.CCLE_v1, db.IMPORT, 'cellline-meta')

'/Users/eczech/data/research/mgds/import/ccle_v1_cellline-meta.pkl'