# GDSC Meta Data Importation
**Local Version**: 2
**Source Version**: 6.0

This notebook imports Cell Line and Cancer Type metadata through the [GDSC](http://www.cancerrxgene.org/downloads) portal.  Files for this are hosted on the [Sanger FTP Server](ftp://ftp.sanger.ac.uk/pub/project/cancerrxgene/releases/) (release-6.0 in this case), and the only file of interest here is [Cell_Lines_Details.xlsx](ftp://ftp.sanger.ac.uk/pub/project/cancerrxgene/releases/release-6.0/Cell_Lines_Details.xlsx).  This file maps COSMIC cell line ids to common cell line ids and notes which data types are available for each cell line.  It also contains the cancer site and histology of each cell line.

In [23]:
%run -m ipy_startup
%run -m ipy_logging
from mgds.data_aggregation import database as db
from mgds.data_aggregation import source as src
from mgds.data_aggregation import io_utils as io
from mgds.data_aggregation import entity
from py_utils.collection_utils import subset
from py_utils import assertion_utils
import re
pd.set_option('display.max_info_rows', 50000000)

In [2]:
# Fetch the release-6.0 Cell Line Details workbook into the raw data area.
# check_exists=True is passed to the download helper; the DEBUG log line in
# the output shows a previously downloaded path being returned.
url = 'ftp://ftp.sanger.ac.uk/pub/project/cancerrxgene/releases/release-6.0/Cell_Lines_Details.xlsx'
filepath = io.download(
    url,
    db.raw_file(src.GDSC_v2, 'cellline-meta.xlsx'),
    check_exists=True
)
filepath

2016-12-01 15:05:40,848:DEBUG:mgds.data_aggregation.io_utils: Returning previously downloaded path for "/Users/eczech/data/research/mgds/raw/gdsc_v2_cellline-meta.xlsx"


'/Users/eczech/data/research/mgds/raw/gdsc_v2_cellline-meta.xlsx'

In [3]:
# Load every sheet of the workbook; the result is keyed by sheet name
# (see the dict_keys output below).
# NOTE(review): later pandas releases renamed this keyword to `sheet_name` and
# dropped `sheetname` -- confirm against the pandas version in use before re-running.
d = pd.read_excel(filepath, sheetname=None)
d.keys()

dict_keys(['Cell line details', 'Decode', 'COSMIC tissue classification'])

## Prepare Cell Line Details

In [4]:
# Take the per-cell-line sheet and strip the line breaks embedded in its headers
d_cl_det = d['Cell line details'].rename(
    columns=lambda name: name.replace('\n', '')
)
d_cl_det.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1002 entries, 0 to 1001
Data columns (total 13 columns):
Sample Name                                1002 non-null object
COSMIC identifier                          1001 non-null float64
Whole Exome Sequencing (WES)               1002 non-null object
Copy Number Alterations (CNA)              1002 non-null object
Gene Expression                            1002 non-null object
Methylation                                1002 non-null object
DrugResponse                               1002 non-null object
GDSCTissue descriptor 1                    1001 non-null object
GDSCTissuedescriptor 2                     1001 non-null object
Cancer Type(matching TCGA label)           826 non-null object
Microsatellite instability Status (MSI)    986 non-null object
Screen Medium                              1001 non-null object
Growth Properties                          999 non-null object
dtypes: float64(1), object(12)
memory usage: 101.8+ KB


In [5]:
def convert_ids_to_string(d):
    """Return a copy of ``d`` with both cell line identifier columns as strings.

    The conversion is done carefully so that no identifier silently changes:

    * ``CELL_LINE_ID:COSMIC`` must hold only whole numbers (e.g. ``906794.0``);
      each value is cast to ``int64`` and then to ``str`` so it is rendered
      without a trailing ``.0``.
    * ``CELL_LINE_ID`` values must each be exactly a ``str`` or an ``int``
      (a float here would indicate a mangled name); each is cast to ``str``.

    The input frame is not modified; callers must use the returned frame.

    :param d: DataFrame containing 'CELL_LINE_ID' and 'CELL_LINE_ID:COSMIC'
        columns; COSMIC IDs must already be free of nulls since they are cast
        to int64
    :return: new DataFrame with both identifier columns converted to strings
    :raises AssertionError: if a COSMIC ID is not integer-valued or a common
        cell line ID is neither a string nor an int
    """
    # Work on a copy: callers pass filtered slices of other frames, and writing
    # columns into those in place would alter (or warn about) the caller's data
    d = d.copy()

    # For COSMIC IDs, make sure we have no floats not equal to integer representation (ie only ".0")
    cosmic_ids = d['CELL_LINE_ID:COSMIC']
    cosmic_ints = cosmic_ids.astype(np.int64)
    assert np.all(cosmic_ids == cosmic_ints), \
        'Found COSMIC cell line ID that is not an integer value'
    d['CELL_LINE_ID:COSMIC'] = cosmic_ints.astype(str)

    # For common cell line IDs, make sure there are no floats and then convert each to string
    is_str_or_int = d['CELL_LINE_ID'].map(lambda v: type(v) in (str, int))
    assert is_str_or_int.all(), \
        'Found cell line ID not given as string or int'
    d['CELL_LINE_ID'] = d['CELL_LINE_ID'].astype(str)
    return d

In [6]:
# Rename and cleanup field names, and ensure cell line ids are strings
def clean(x):
    """Normalize a spreadsheet column header into an upper-case field name.

    Any parenthesized text is removed (from the first "(" to the last ")"),
    surrounding whitespace is stripped, each remaining run of whitespace
    becomes a single underscore, and the result is upper-cased, e.g.
    'Whole Exome Sequencing (WES)' -> 'WHOLE_EXOME_SEQUENCING'.

    :param x: original column header (string)
    :return: cleaned, upper-case column name
    """
    # Raw strings keep the regex backslashes from being parsed as (invalid)
    # string escape sequences, which Python warns about
    x = re.sub(r'\(.*\)', '', x).strip()
    x = re.sub(r'\s+', '_', x)
    return x.upper()

# Normalize every header with clean(), then give the identifier and tissue
# descriptor fields their canonical names
d_cl_det = d_cl_det.rename(columns=clean).rename(columns={
    'SAMPLE_NAME': 'CELL_LINE_ID',
    'COSMIC_IDENTIFIER': 'CELL_LINE_ID:COSMIC',
    'GDSCTISSUE_DESCRIPTOR_1': 'GDSC_TISSUE_DESCRIPTOR_1',
    'GDSCTISSUEDESCRIPTOR_2': 'GDSC_TISSUE_DESCRIPTOR_2'
})

# Records missing either identifier are of no use once joined to other data,
# so drop them (COSMIC ID first, then the common ID)
d_cl_det = subset(
    d_cl_det,
    lambda frame: frame[frame['CELL_LINE_ID:COSMIC'].notnull()],
    subset_op='Remove records with null COSMIC cell line ID'
)
d_cl_det = subset(
    d_cl_det,
    lambda frame: frame[frame['CELL_LINE_ID'].notnull()],
    subset_op='Remove records with null common cell line ID'
)

# Both identifier columns become strings from here on
d_cl_det = convert_ids_to_string(d_cl_det)

# A common ID may map to only one COSMIC ID, and vice versa
assert d_cl_det.groupby('CELL_LINE_ID')['CELL_LINE_ID:COSMIC'].nunique().max() == 1
assert d_cl_det.groupby('CELL_LINE_ID:COSMIC')['CELL_LINE_ID'].nunique().max() == 1

# No cell line may appear more than once
assert d_cl_det['CELL_LINE_ID'].value_counts().max() == 1

d_cl_det.info()

[Remove records with null COSMIC cell line ID] Records before = 1002, Records after = 1001, Records removed = 1 (%0.10)
[Remove records with null common cell line ID] Records before = 1001, Records after = 1001, Records removed = 0 (%0.00)
<class 'pandas.core.frame.DataFrame'>
Int64Index: 1001 entries, 0 to 1000
Data columns (total 13 columns):
CELL_LINE_ID                         1001 non-null object
CELL_LINE_ID:COSMIC                  1001 non-null object
WHOLE_EXOME_SEQUENCING               1001 non-null object
COPY_NUMBER_ALTERATIONS              1001 non-null object
GENE_EXPRESSION                      1001 non-null object
METHYLATION                          1001 non-null object
DRUGRESPONSE                         1001 non-null object
GDSC_TISSUE_DESCRIPTOR_1             1001 non-null object
GDSC_TISSUE_DESCRIPTOR_2             1001 non-null object
CANCER_TYPE                          826 non-null object
MICROSATELLITE_INSTABILITY_STATUS    986 non-null object
SCREEN_MEDIUM    

In [7]:
# Preview the first rows of the cleaned cell line details
d_cl_det.head()

Unnamed: 0,CELL_LINE_ID,CELL_LINE_ID:COSMIC,WHOLE_EXOME_SEQUENCING,COPY_NUMBER_ALTERATIONS,GENE_EXPRESSION,METHYLATION,DRUGRESPONSE,GDSC_TISSUE_DESCRIPTOR_1,GDSC_TISSUE_DESCRIPTOR_2,CANCER_TYPE,MICROSATELLITE_INSTABILITY_STATUS,SCREEN_MEDIUM,GROWTH_PROPERTIES
0,A253,906794,Y,Y,Y,Y,Y,aero_dig_tract,head and neck,,MSS/MSI-L,D/F12,Adherent
1,BB30-HNC,753531,Y,Y,Y,Y,Y,aero_dig_tract,head and neck,HNSC,MSS/MSI-L,D/F12,Adherent
2,BB49-HNC,753532,Y,Y,Y,Y,Y,aero_dig_tract,head and neck,HNSC,MSS/MSI-L,D/F12,Adherent
3,BHY,753535,Y,Y,Y,Y,Y,aero_dig_tract,head and neck,HNSC,MSS/MSI-L,D/F12,Adherent
4,BICR10,1290724,Y,Y,Y,Y,Y,aero_dig_tract,head and neck,HNSC,MSS/MSI-L,D/F12,Adherent


## Prepare Tissue Metadata

In [8]:
# Take the COSMIC tissue classification sheet and strip any line breaks from its headers
d_cl_tis = d['COSMIC tissue classification'].rename(
    columns=lambda name: name.replace('\n', '')
)
d_cl_tis.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1029 entries, 0 to 1028
Data columns (total 4 columns):
Line         1029 non-null object
COSMIC_ID    1029 non-null int64
Site         1029 non-null object
Histology    1029 non-null object
dtypes: int64(1), object(3)
memory usage: 32.2+ KB


In [9]:
# Align the field names with the other datasets and convert both identifier
# columns to strings
d_cl_tis = convert_ids_to_string(d_cl_tis.rename(columns={
    'Line': 'CELL_LINE_ID',
    'COSMIC_ID': 'CELL_LINE_ID:COSMIC',
    'Site': 'PRIMARY_SITE',
    'Histology': 'HISTOLOGY'
}))

# A common ID may map to only one COSMIC ID, and vice versa
assert d_cl_tis.groupby('CELL_LINE_ID')['CELL_LINE_ID:COSMIC'].nunique().max() == 1
assert d_cl_tis.groupby('CELL_LINE_ID:COSMIC')['CELL_LINE_ID'].nunique().max() == 1

# No cell line may appear more than once
assert d_cl_tis['CELL_LINE_ID'].value_counts().max() == 1

d_cl_tis.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1029 entries, 0 to 1028
Data columns (total 4 columns):
CELL_LINE_ID           1029 non-null object
CELL_LINE_ID:COSMIC    1029 non-null object
PRIMARY_SITE           1029 non-null object
HISTOLOGY              1029 non-null object
dtypes: object(4)
memory usage: 32.2+ KB


## Merge Metadata

In [10]:
# Outer join on both identifiers so that records present in only one of the
# two sheets are kept; ID1 / ID2 are positional row markers for each side
d_cl = d_cl_det.assign(ID1=np.arange(len(d_cl_det))).merge(
    d_cl_tis.assign(ID2=np.arange(len(d_cl_tis))),
    on=['CELL_LINE_ID', 'CELL_LINE_ID:COSMIC'],
    how='outer'
)

# Comment this out to make it more clear which records are joining
# between the two datasets
d_cl = d_cl.drop(['ID1', 'ID2'], axis=1)

# Collect the more source-specific descriptors into one dict per record
c_detail = ['CANCER_TYPE', 'HISTOLOGY', 'GDSC_TISSUE_DESCRIPTOR_1', 'GDSC_TISSUE_DESCRIPTOR_2']
d_cl['PROPERTIES'] = d_cl[c_detail].apply(lambda row: row.to_dict(), axis=1)

d_cl.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1034 entries, 0 to 1033
Data columns (total 16 columns):
CELL_LINE_ID                         1034 non-null object
CELL_LINE_ID:COSMIC                  1034 non-null object
WHOLE_EXOME_SEQUENCING               1001 non-null object
COPY_NUMBER_ALTERATIONS              1001 non-null object
GENE_EXPRESSION                      1001 non-null object
METHYLATION                          1001 non-null object
DRUGRESPONSE                         1001 non-null object
GDSC_TISSUE_DESCRIPTOR_1             1001 non-null object
GDSC_TISSUE_DESCRIPTOR_2             1001 non-null object
CANCER_TYPE                          826 non-null object
MICROSATELLITE_INSTABILITY_STATUS    986 non-null object
SCREEN_MEDIUM                        1001 non-null object
GROWTH_PROPERTIES                    999 non-null object
PRIMARY_SITE                         1029 non-null object
HISTOLOGY                            1029 non-null object
PROPERTIES                

In [12]:
# Most frequent values of the second GDSC tissue descriptor
d_cl['GDSC_TISSUE_DESCRIPTOR_2'].value_counts().head()

lung_NSCLC_adenocarcinoma    67
lung_small_cell_carcinoma    66
melanoma                     55
glioma                       53
breast                       52
Name: GDSC_TISSUE_DESCRIPTOR_2, dtype: int64

In [13]:
# Most frequent histology labels (field sourced from the COSMIC tissue classification sheet)
d_cl['HISTOLOGY'].value_counts().head()

carcinoma                  595
lymphoid_neoplasm          131
malignant_melanoma          57
glioma                      54
haematopoietic_neoplasm     44
Name: HISTOLOGY, dtype: int64

In [14]:
# Full distribution of the cancer type labels
d_cl['CANCER_TYPE'].value_counts()

SCLC                  66
LUAD                  64
SKCM                  55
BRCA                  51
COAD/READ             51
HNSC                  42
GBM                   36
DLBC                  35
ESCA                  35
OV                    34
KIRC                  32
NB                    32
PAAD                  30
LAML                  28
ALL                   26
STAD                  25
MESO                  21
BLCA                  19
MM                    18
LGG                   17
LIHC                  17
THCA                  16
LUSC                  15
CESC                  14
UNABLE TO CLASSIFY    14
LCML                  10
UCEC                   9
PRAD                   6
MB                     4
CLL                    3
ACC                    1
Name: CANCER_TYPE, dtype: int64

In [15]:
# Full distribution of primary sites
d_cl['PRIMARY_SITE'].value_counts()

lung                                  179
haematopoietic_and_lymphoid_tissue    175
skin                                   62
central_nervous_system                 58
large_intestine                        56
breast                                 52
ovary                                  45
bone                                   44
upper_aerodigestive_tract              43
oesophagus                             36
autonomic_ganglia                      34
kidney                                 33
pancreas                               32
stomach                                29
soft_tissue                            23
pleura                                 21
urinary_tract                          20
liver                                  17
thyroid                                16
cervix                                 14
endometrium                            11
prostate                                8
biliary_tract                           5
NS                                

## Resolve Duplicate Cell Line IDs

In [16]:
# List every record whose COSMIC ID occurs more than once after the merge,
# sorted so that the conflicting rows sit next to each other
cts = d_cl['CELL_LINE_ID:COSMIC'].value_counts()
d_cl.loc[
    d_cl['CELL_LINE_ID:COSMIC'].isin(cts[cts > 1].index.values)
].sort_values(by='CELL_LINE_ID:COSMIC')

Unnamed: 0,CELL_LINE_ID,CELL_LINE_ID:COSMIC,WHOLE_EXOME_SEQUENCING,COPY_NUMBER_ALTERATIONS,GENE_EXPRESSION,METHYLATION,DRUGRESPONSE,GDSC_TISSUE_DESCRIPTOR_1,GDSC_TISSUE_DESCRIPTOR_2,CANCER_TYPE,MICROSATELLITE_INSTABILITY_STATUS,SCREEN_MEDIUM,GROWTH_PROPERTIES,PRIMARY_SITE,HISTOLOGY,PROPERTIES
372,Hep 3B2_1-7,1240147,Y,Y,Y,Y,Y,digestive_system,liver,LIHC,MSS/MSI-L,D/F12,Adherent,,,"{'HISTOLOGY': nan, 'GDSC_TISSUE_DESCRIPTOR_1':..."
1021,Hep_3B2_1-7,1240147,,,,,,,,,,,,NS,NS,"{'HISTOLOGY': 'NS', 'GDSC_TISSUE_DESCRIPTOR_1'..."
948,PC-3 [JPC-3],1240202,Y,Y,Y,N,Y,lung_NSCLC,lung_NSCLC_adenocarcinoma,LUAD,MSS/MSI-L,D/F12,Adherent,,,"{'HISTOLOGY': nan, 'GDSC_TISSUE_DESCRIPTOR_1':..."
1012,PC-3_[JPC-3],1240202,,,,,,,,,,,,lung,carcinoma,"{'HISTOLOGY': 'carcinoma', 'GDSC_TISSUE_DESCRI..."
261,G-292 Clone A141B1,1290807,Y,Y,Y,Y,Y,bone,osteosarcoma,,MSS/MSI-L,R,Adherent,,,"{'HISTOLOGY': nan, 'GDSC_TISSUE_DESCRIPTOR_1':..."
1010,G-292_Clone_A141B1,1290807,,,,,,,,,,,,bone,osteosarcoma,"{'HISTOLOGY': 'osteosarcoma', 'GDSC_TISSUE_DES..."
911,UWB1.289,1480374,Y,Y,Y,Y,Y,urogenital_system,ovary,OV,MSS/MSI-L,D/F12,Adherent,,,"{'HISTOLOGY': nan, 'GDSC_TISSUE_DESCRIPTOR_1':..."
1032,UWB1_289,1480374,,,,,,,,,,,,ovary,carcinoma,"{'HISTOLOGY': 'carcinoma', 'GDSC_TISSUE_DESCRI..."
921,NTERA-2 cl.D1,908454,Y,Y,Y,Y,Y,urogenital_system,testis,,MSS/MSI-L,D/F12,Adherent,,,"{'HISTOLOGY': nan, 'GDSC_TISSUE_DESCRIPTOR_1':..."
1022,NTERA-2_cl_D1,908454,,,,,,,,,,,,testis,germ_cell_tumour,"{'HISTOLOGY': 'germ_cell_tumour', 'GDSC_TISSUE..."


In [17]:
# Of each duplicated pair listed in the previous output, drop the spelling that
# uses underscores; those rows only carry PRIMARY_SITE / HISTOLOGY values.
# NOTE(review): the retained rows show null PRIMARY_SITE / HISTOLOGY in that
# output, so the site information on the dropped rows is not carried over
# here -- confirm this is intended or handled downstream.
rm_cl = ['Hep_3B2_1-7', 'PC-3_[JPC-3]', 'G-292_Clone_A141B1', 'UWB1_289', 'NTERA-2_cl_D1']
n_before = len(d_cl)
d_cl = subset(d_cl, lambda df: df[~df['CELL_LINE_ID'].isin(rm_cl)], subset_op='Remove bad cell line duplicates')
# After removal both identifiers must be unique ...
assert d_cl['CELL_LINE_ID:COSMIC'].value_counts().max() == 1
assert d_cl['CELL_LINE_ID'].value_counts().max() == 1
# ... and exactly one record must have been removed per listed name
assert len(d_cl) == n_before - len(rm_cl)

[Remove bad cell line duplicates] Records before = 1034, Records after = 1029, Records removed = 5 (%0.48)


In [18]:
# The "TT" cell lines: two similarly named lines with different COSMIC IDs
# (T-T and TT in the output below)
d_cl[d_cl['CELL_LINE_ID:COSMIC'].isin(['1299064', '930299'])]

Unnamed: 0,CELL_LINE_ID,CELL_LINE_ID:COSMIC,WHOLE_EXOME_SEQUENCING,COPY_NUMBER_ALTERATIONS,GENE_EXPRESSION,METHYLATION,DRUGRESPONSE,GDSC_TISSUE_DESCRIPTOR_1,GDSC_TISSUE_DESCRIPTOR_2,CANCER_TYPE,MICROSATELLITE_INSTABILITY_STATUS,SCREEN_MEDIUM,GROWTH_PROPERTIES,PRIMARY_SITE,HISTOLOGY,PROPERTIES
76,T-T,1299064,Y,Y,Y,Y,Y,aero_dig_tract,oesophagus,ESCA,MSS/MSI-L,D/F12,Adherent,oesophagus,carcinoma,"{'HISTOLOGY': 'carcinoma', 'GDSC_TISSUE_DESCRI..."
828,TT,930299,Y,Y,Y,Y,Y,thyroid,thyroid,THCA,MSS/MSI-L,D/F12,Adherent,thyroid,carcinoma,"{'HISTOLOGY': 'carcinoma', 'GDSC_TISSUE_DESCRI..."


In [19]:
# The "KMH" cell lines: two similarly named lines with different COSMIC IDs
# (KM-H2 and KMH-2 in the output below)
d_cl[d_cl['CELL_LINE_ID:COSMIC'].isin(['909976', '1298167'])]

Unnamed: 0,CELL_LINE_ID,CELL_LINE_ID:COSMIC,WHOLE_EXOME_SEQUENCING,COPY_NUMBER_ALTERATIONS,GENE_EXPRESSION,METHYLATION,DRUGRESPONSE,GDSC_TISSUE_DESCRIPTOR_1,GDSC_TISSUE_DESCRIPTOR_2,CANCER_TYPE,MICROSATELLITE_INSTABILITY_STATUS,SCREEN_MEDIUM,GROWTH_PROPERTIES,PRIMARY_SITE,HISTOLOGY,PROPERTIES
180,KM-H2,909976,Y,Y,Y,Y,Y,lymphoma,Hodgkin_lymphoma,,MSS/MSI-L,R,Suspension,haematopoietic_and_lymphoid_tissue,lymphoid_neoplasm,"{'HISTOLOGY': 'lymphoid_neoplasm', 'GDSC_TISSU..."
965,KMH-2,1298167,Y,Y,N,Y,Y,thyroid,thyroid,THCA,MSS/MSI-L,R,Adherent,thyroid,carcinoma,"{'HISTOLOGY': 'carcinoma', 'GDSC_TISSUE_DESCRI..."


In [20]:
# The "786-0" cell line (which is given as 786-O from the CCLE), looked up by COSMIC ID
d_cl[d_cl['CELL_LINE_ID:COSMIC'].isin(['905947'])]

Unnamed: 0,CELL_LINE_ID,CELL_LINE_ID:COSMIC,WHOLE_EXOME_SEQUENCING,COPY_NUMBER_ALTERATIONS,GENE_EXPRESSION,METHYLATION,DRUGRESPONSE,GDSC_TISSUE_DESCRIPTOR_1,GDSC_TISSUE_DESCRIPTOR_2,CANCER_TYPE,MICROSATELLITE_INSTABILITY_STATUS,SCREEN_MEDIUM,GROWTH_PROPERTIES,PRIMARY_SITE,HISTOLOGY,PROPERTIES
416,786-0,905947,Y,Y,Y,Y,Y,kidney,kidney,KIRC,MSS/MSI-L,R,Adherent,kidney,carcinoma,"{'HISTOLOGY': 'carcinoma', 'GDSC_TISSUE_DESCRI..."


In [21]:
# Look up the record with COSMIC ID 753563 (IM-9 in the output below)
d_cl[d_cl['CELL_LINE_ID:COSMIC'].isin(['753563'])]

Unnamed: 0,CELL_LINE_ID,CELL_LINE_ID:COSMIC,WHOLE_EXOME_SEQUENCING,COPY_NUMBER_ALTERATIONS,GENE_EXPRESSION,METHYLATION,DRUGRESPONSE,GDSC_TISSUE_DESCRIPTOR_1,GDSC_TISSUE_DESCRIPTOR_2,CANCER_TYPE,MICROSATELLITE_INSTABILITY_STATUS,SCREEN_MEDIUM,GROWTH_PROPERTIES,PRIMARY_SITE,HISTOLOGY,PROPERTIES
220,IM-9,753563,Y,Y,Y,Y,Y,myeloma,myeloma,MM,MSS/MSI-L,R,Suspension,haematopoietic_and_lymphoid_tissue,lymphoid_neoplasm,"{'HISTOLOGY': 'lymphoid_neoplasm', 'GDSC_TISSU..."


## Export

In [24]:
# Keep only the fields needed for export (as an independent copy) and pass
# them through the entity module's cell line metadata preparation
d_exp = entity.prepare_cellline_meta(
    d_cl[['CELL_LINE_ID', 'CELL_LINE_ID:COSMIC', 'PRIMARY_SITE', 'PROPERTIES']].copy()
)
d_exp.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1029 entries, 0 to 1033
Data columns (total 4 columns):
CELL_LINE_ID           1029 non-null object
CELL_LINE_ID:COSMIC    1029 non-null object
PRIMARY_SITE           1029 non-null object
PROPERTIES             1029 non-null object
dtypes: object(4)
memory usage: 40.2+ KB


In [26]:
# Save the prepared metadata as the GDSC v2 'cellline-meta' import dataset;
# the saved file path is shown in the output
db.save(d_exp, src.GDSC_v2, db.IMPORT, 'cellline-meta')

'/Users/eczech/data/research/mgds/import/gdsc_v2_cellline-meta.pkl'