# GDSC Meta Data Importation
**Local Version**: 2
**Source Version**: 6.0

This notebook imports Cell Line and Cancer Type metadata from the [GDSC](http://www.cancerrxgene.org/downloads) portal.  Files for this are hosted on the [Sanger FTP Server](ftp://ftp.sanger.ac.uk/pub/project/cancerrxgene/releases/) (release-6.0 in this case), and the only file of interest here is [Cell_Lines_Details.xlsx](ftp://ftp.sanger.ac.uk/pub/project/cancerrxgene/releases/release-6.0/Cell_Lines_Details.xlsx).  This file contains a mapping of COSMIC cell line IDs to common cell line IDs and notes which types of data are available for each cell line.  It also contains the cancer site and histology of each cell line.

In [1]:
%run -m ipy_startup
%run -m ipy_logging
from mgds.data_aggregation import database as db
from mgds.data_aggregation import source as src
from mgds.data_aggregation import io_utils as io
from py_utils.collection_utils import subset
import re
pd.set_option('display.max_info_rows', 50000000)

In [2]:
# Fetch the GDSC release-6.0 cell line details workbook into the raw data area
url = 'ftp://ftp.sanger.ac.uk/pub/project/cancerrxgene/releases/release-6.0/Cell_Lines_Details.xlsx'
# The local destination is resolved by the database helper; check_exists=True is
# forwarded to the download helper (presumably so an existing copy is reused)
filepath = io.download(
    url,
    db.raw_file(src.GDSC_v2, 'cell-line-details.xlsx'),
    check_exists=True
)
filepath

2016-11-21 15:14:48,528:DEBUG:mgds.data_aggregation.io_utils: Returning previously downloaded path for "/Users/eczech/data/research/mgds/raw/gdsc_v2_cell-line-details.xlsx"


'/Users/eczech/data/research/mgds/raw/gdsc_v2_cell-line-details.xlsx'

In [3]:
# Load every sheet of the workbook at once: a sheet selector of None makes
# read_excel return a dict keyed by sheet name.
# NOTE: `sheet_name` replaces the original `sheetname` keyword, which pandas
# deprecated (0.21) and later removed, so the old spelling fails on current releases.
d = pd.read_excel(filepath, sheet_name=None)
d.keys()

dict_keys(['Cell line details', 'Decode', 'COSMIC tissue classification'])

## Prepare Cell Line Details

In [4]:
# Pull the per-cell-line sheet and strip the line breaks embedded in its column headers
d_cl_det = d['Cell line details'].rename(columns=lambda name: name.replace('\n', ''))
d_cl_det.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1002 entries, 0 to 1001
Data columns (total 13 columns):
Sample Name                                1002 non-null object
COSMIC identifier                          1001 non-null float64
Whole Exome Sequencing (WES)               1002 non-null object
Copy Number Alterations (CNA)              1002 non-null object
Gene Expression                            1002 non-null object
Methylation                                1002 non-null object
DrugResponse                               1002 non-null object
GDSCTissue descriptor 1                    1001 non-null object
GDSCTissuedescriptor 2                     1001 non-null object
Cancer Type(matching TCGA label)           826 non-null object
Microsatellite instability Status (MSI)    986 non-null object
Screen Medium                              1001 non-null object
Growth Properties                          999 non-null object
dtypes: float64(1), object(12)
memory usage: 101.8+ KB


In [5]:
def convert_ids_to_string(d):
    """Return a copy of ``d`` with both cell line identifier columns converted to strings.

    The conversion is deliberately strict so that no identifier is silently altered:

    - ``CELL_LINE_ID:COSMIC`` must hold only whole numbers (e.g. ``906794.0``). Values
      are cast to ``np.int64`` before being stringified so the result carries no
      ``.0`` suffix. Null values must be removed beforehand, since the integer cast
      cannot represent them.
    - ``CELL_LINE_ID`` must hold only ``str`` or ``int`` values (a float here would
      suggest the spreadsheet reader mangled a numeric-looking name).

    :param d: DataFrame with ``CELL_LINE_ID:COSMIC`` and ``CELL_LINE_ID`` columns
    :return: New DataFrame with both identifier columns as strings; ``d`` itself
        is left unmodified
    :raises AssertionError: If either column contains a value violating the rules above
    """
    # Work on a copy: callers may pass a filtered slice of another frame, and
    # assigning columns on such a slice would either mutate data the caller still
    # holds or trigger pandas' SettingWithCopy warning.
    d = d.copy()

    # For COSMIC IDs, make sure no value differs from its integer representation (ie only ".0")
    cosmic_ids = d['CELL_LINE_ID:COSMIC']
    cosmic_ints = cosmic_ids.astype(np.int64)
    assert np.all(cosmic_ids == cosmic_ints), \
        'Found COSMIC cell line ID that is not a whole number'
    d['CELL_LINE_ID:COSMIC'] = cosmic_ints.astype(str)

    # For common cell line IDs, make sure there are no floats and then convert each to string
    assert np.all(d['CELL_LINE_ID'].apply(type).isin([str, int])), \
        'Found cell line ID not given as string or int'
    d['CELL_LINE_ID'] = d['CELL_LINE_ID'].astype(str)
    return d

In [6]:
# Rename and cleanup field names, and ensure cell line ids are strings
def clean(x):
    """Normalize a spreadsheet column header into an upper-case, underscore-separated name.

    Parenthesized text is removed first (the match is greedy, spanning from the first
    ``(`` to the last ``)``), surrounding whitespace is trimmed, each remaining run of
    whitespace becomes a single underscore, and the result is upper-cased.

    :param x: Original column header
    :return: Normalized header, e.g. ``'Whole Exome Sequencing (WES)'`` becomes
        ``'WHOLE_EXOME_SEQUENCING'``
    """
    # Raw strings keep the regex escapes (\( and \s) from being parsed as invalid
    # Python string escapes, which emit warnings on newer interpreters
    x = re.sub(r'\(.*\)', '', x).strip()
    x = re.sub(r'\s+', '_', x)
    return x.upper()

# Standardize the headers, then map the identifier and tissue columns onto the shared naming scheme
d_cl_det = d_cl_det.rename(columns=clean).rename(columns={
    'SAMPLE_NAME': 'CELL_LINE_ID',
    'COSMIC_IDENTIFIER': 'CELL_LINE_ID:COSMIC',
    'GDSCTISSUE_DESCRIPTOR_1': 'GDSC_TISSUE_DESCRIPTOR_1',
    'GDSCTISSUEDESCRIPTOR_2': 'GDSC_TISSUE_DESCRIPTOR_2'
})

# A record missing either kind of cell line ID is of no use when joined to other data, so drop it
for id_col, op_label in [
    ('CELL_LINE_ID:COSMIC', 'Remove records with null COSMIC cell line ID'),
    ('CELL_LINE_ID', 'Remove records with null common cell line ID'),
]:
    d_cl_det = subset(d_cl_det, lambda df: df[df[id_col].notnull()], subset_op=op_label)

d_cl_det = convert_ids_to_string(d_cl_det)

# Each common ID must pair with exactly one COSMIC ID, and vice versa
for key_col, paired_col in [
    ('CELL_LINE_ID', 'CELL_LINE_ID:COSMIC'),
    ('CELL_LINE_ID:COSMIC', 'CELL_LINE_ID'),
]:
    assert d_cl_det.groupby(key_col)[paired_col].nunique().max() == 1

# No common cell line ID may occur more than once
assert d_cl_det['CELL_LINE_ID'].value_counts().max() == 1

d_cl_det.info()

[Remove records with null COSMIC cell line ID] Records before = 1002, Records after = 1001, Records removed = 1 (%0.10)
[Remove records with null common cell line ID] Records before = 1001, Records after = 1001, Records removed = 0 (%0.00)
<class 'pandas.core.frame.DataFrame'>
Int64Index: 1001 entries, 0 to 1000
Data columns (total 13 columns):
CELL_LINE_ID                         1001 non-null object
CELL_LINE_ID:COSMIC                  1001 non-null object
WHOLE_EXOME_SEQUENCING               1001 non-null object
COPY_NUMBER_ALTERATIONS              1001 non-null object
GENE_EXPRESSION                      1001 non-null object
METHYLATION                          1001 non-null object
DRUGRESPONSE                         1001 non-null object
GDSC_TISSUE_DESCRIPTOR_1             1001 non-null object
GDSC_TISSUE_DESCRIPTOR_2             1001 non-null object
CANCER_TYPE                          826 non-null object
MICROSATELLITE_INSTABILITY_STATUS    986 non-null object
SCREEN_MEDIUM    

In [7]:
# Preview the first few cleaned cell line detail records
d_cl_det.head()

Unnamed: 0,CELL_LINE_ID,CELL_LINE_ID:COSMIC,WHOLE_EXOME_SEQUENCING,COPY_NUMBER_ALTERATIONS,GENE_EXPRESSION,METHYLATION,DRUGRESPONSE,GDSC_TISSUE_DESCRIPTOR_1,GDSC_TISSUE_DESCRIPTOR_2,CANCER_TYPE,MICROSATELLITE_INSTABILITY_STATUS,SCREEN_MEDIUM,GROWTH_PROPERTIES
0,A253,906794,Y,Y,Y,Y,Y,aero_dig_tract,head and neck,,MSS/MSI-L,D/F12,Adherent
1,BB30-HNC,753531,Y,Y,Y,Y,Y,aero_dig_tract,head and neck,HNSC,MSS/MSI-L,D/F12,Adherent
2,BB49-HNC,753532,Y,Y,Y,Y,Y,aero_dig_tract,head and neck,HNSC,MSS/MSI-L,D/F12,Adherent
3,BHY,753535,Y,Y,Y,Y,Y,aero_dig_tract,head and neck,HNSC,MSS/MSI-L,D/F12,Adherent
4,BICR10,1290724,Y,Y,Y,Y,Y,aero_dig_tract,head and neck,HNSC,MSS/MSI-L,D/F12,Adherent


## Prepare Tissue Metadata

In [8]:
# Pull the COSMIC tissue classification sheet and strip any line breaks from its column headers
d_cl_tis = d['COSMIC tissue classification'].rename(columns=lambda name: name.replace('\n', ''))
d_cl_tis.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1029 entries, 0 to 1028
Data columns (total 4 columns):
Line         1029 non-null object
COSMIC_ID    1029 non-null int64
Site         1029 non-null object
Histology    1029 non-null object
dtypes: int64(1), object(3)
memory usage: 32.2+ KB


In [9]:
# Align the field names with the other datasets and turn both identifiers into strings
d_cl_tis = convert_ids_to_string(d_cl_tis.rename(columns={
    'Line': 'CELL_LINE_ID',
    'COSMIC_ID': 'CELL_LINE_ID:COSMIC',
    'Site': 'SITE',
    'Histology': 'HISTOLOGY'
}))

# Each common ID must pair with exactly one COSMIC ID, and vice versa
for key_col, paired_col in [
    ('CELL_LINE_ID', 'CELL_LINE_ID:COSMIC'),
    ('CELL_LINE_ID:COSMIC', 'CELL_LINE_ID'),
]:
    assert d_cl_tis.groupby(key_col)[paired_col].nunique().max() == 1

# No common cell line ID may occur more than once
assert d_cl_tis['CELL_LINE_ID'].value_counts().max() == 1

d_cl_tis.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1029 entries, 0 to 1028
Data columns (total 4 columns):
CELL_LINE_ID           1029 non-null object
CELL_LINE_ID:COSMIC    1029 non-null object
SITE                   1029 non-null object
HISTOLOGY              1029 non-null object
dtypes: object(4)
memory usage: 32.2+ KB


## Merge Metadata

In [10]:
# Outer-join the two sheets on both identifiers so records found in only one sheet are retained.
# ID1 / ID2 are temporary row counters marking which source contributed each merged record.
d_cl = d_cl_det.assign(ID1=np.arange(len(d_cl_det))).merge(
    d_cl_tis.assign(ID2=np.arange(len(d_cl_tis))),
    how='outer', on=['CELL_LINE_ID', 'CELL_LINE_ID:COSMIC']
)

# Disable the next statement to keep ID1 / ID2 and make it clearer
# which records joined between the two datasets
d_cl = d_cl.drop(['ID1', 'ID2'], axis=1)

d_cl.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1034 entries, 0 to 1033
Data columns (total 15 columns):
CELL_LINE_ID                         1034 non-null object
CELL_LINE_ID:COSMIC                  1034 non-null object
WHOLE_EXOME_SEQUENCING               1001 non-null object
COPY_NUMBER_ALTERATIONS              1001 non-null object
GENE_EXPRESSION                      1001 non-null object
METHYLATION                          1001 non-null object
DRUGRESPONSE                         1001 non-null object
GDSC_TISSUE_DESCRIPTOR_1             1001 non-null object
GDSC_TISSUE_DESCRIPTOR_2             1001 non-null object
CANCER_TYPE                          826 non-null object
MICROSATELLITE_INSTABILITY_STATUS    986 non-null object
SCREEN_MEDIUM                        1001 non-null object
GROWTH_PROPERTIES                    999 non-null object
SITE                                 1029 non-null object
HISTOLOGY                            1029 non-null object
dtypes: object(15)
memory 

In [None]:
# Scratch lookup of merged records by a pattern within the common cell line ID.
# NOTE(review): the pattern is empty, and an empty pattern matches every string, so
# this currently selects every row -- fill in a search term or remove this cell.
d_cl[d_cl['CELL_LINE_ID'].str.contains('')]

In [22]:
# Both identifier columns must hold genuine str values before the merged table is saved
for id_col in ['CELL_LINE_ID', 'CELL_LINE_ID:COSMIC']:
    assert np.all(d_cl[id_col].apply(type) == str)
db.save(d_cl, src.GDSC_v2, db.IMPORT, 'cellline-meta')

'/Users/eczech/data/research/mgds/import/gdsc_v2_cellline-meta.pkl'