# GDSC Raw Exome Sequencing Data Importation
**Local Version**: 2
**Source Version**: 6.0

This notebook imports raw GDSC exome sequencing data from the [GDSC](http://www.cancerrxgene.org/downloads) portal, which hosts its files on the [Sanger FTP Server](ftp://ftp.sanger.ac.uk/pub/project/cancerrxgene/releases/) (release-6.0 in this case).

Note that the GDSC exposes 3 sequencing datasets, labeled as the following:

1. Raw - "WES data for Cell lines"
2. Preprocessed - "Cell-line sequence variants"
3. Preprocessed - "Sequencing BEMs for Cell lines"

In this case option 2 will be used, but the others are worth future consideration.

In [1]:
# Execute the shared startup and logging helper modules through IPython's %run magic.
# NOTE(review): `pd` and `np` are used throughout this notebook but never imported
# explicitly; presumably `ipy_startup` places them in the namespace -- confirm.
%run -m ipy_startup
%run -m ipy_logging
# Project helpers: storage locations/persistence, data-source identifiers, download utilities
from mgds.data_aggregation import database as db
from mgds.data_aggregation import source as src
from mgds.data_aggregation import io_utils as io
# Raise the row-count threshold under which DataFrame.info() computes per-column
# non-null counts
pd.set_option('display.max_info_rows', 50000000)

In [2]:
# Alternative source kept for reference (appears to be the BEM archive, not used here):
# http://www.cancerrxgene.org/gdsc1000/GDSC1000_WebResources/Data/BEMs/CellLines/CellLines_CG_BEMs.zip
url = 'ftp://ftp.sanger.ac.uk/pub/project/cancerrxgene/releases/release-6.0/WES_variants.xlsx'

# Download the workbook to the project's raw-file location for this source;
# with check_exists=True a previously downloaded copy is returned instead (see log output)
filepath = io.download(url, db.raw_file(src.GDSC_v2, 'exome-seq.xlsx'), check_exists=True)
filepath

2016-11-21 13:47:46,687:DEBUG:mgds.data_aggregation.io_utils: Returning previously downloaded path for "/Users/eczech/data/research/mgds/raw/gdsc_v2_exome-seq.xlsx"


'/Users/eczech/data/research/mgds/raw/gdsc_v2_exome-seq.xlsx'

In [3]:
# This initial read is slow -- avoid where possible.
# The sheet name is passed positionally (second argument) instead of through the
# `sheetname=` keyword: pandas renamed that keyword to `sheet_name` and later removed
# the old spelling, whereas the positional form is accepted by old and new releases alike.
d = pd.read_excel(filepath, 'WES_variants')
d.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 486243 entries, 0 to 486242
Data columns (total 13 columns):
SAMPLE               486243 non-null object
COSMIC_ID            486243 non-null int64
Cancer Type          409611 non-null object
Gene                 486243 non-null object
Transcript           486243 non-null object
cDNA                 486243 non-null object
AA                   486243 non-null object
Classification       486243 non-null object
Gene_list            24214 non-null object
Recurrence Filter    29214 non-null object
Subs                 52568 non-null float64
Truncating           65745 non-null float64
inframe              441 non-null float64
dtypes: float64(3), int64(1), object(9)
memory usage: 48.2+ MB


In [9]:
# Preview the first rows of the raw variant table as read from the workbook
d.head()

Unnamed: 0,SAMPLE,COSMIC_ID,Cancer Type,Gene,Transcript,cDNA,AA,Classification,Gene_list,Recurrence Filter,Subs,Truncating,inframe
0,KARPAS-45,907272,ALL,A1BG,ENST00000263100,c.842A>G,p.Y281C,missense,,,,,
1,Jurkat,998184,ALL,A1BG,ENST00000263100,c.589G>A,p.A197T,missense,,,,,
2,KARPAS-45,907272,ALL,A1BG,ENST00000263100,c.571G>A,p.A191T,missense,,,2.0,,
3,HT-115,907289,COAD/READ,A1BG,ENST00000263100,c.1456G>A,p.D486N,missense,,,,,
4,KM12,905989,COAD/READ,A1BG,ENST00000263100,c.770C>A,p.T257N,missense,,,,,


In [10]:
# Example: two rows for the same sample and gene that agree in almost every
# column and differ in "cDNA"
d.loc[d['SAMPLE'].eq(5637) & d['Gene'].eq('ZCCHC14')]

Unnamed: 0,SAMPLE,COSMIC_ID,Cancer Type,Gene,Transcript,cDNA,AA,Classification,Gene_list,Recurrence Filter,Subs,Truncating,inframe
465668,5637,687452,BLCA,ZCCHC14,ENST00000268616,c.2034_2035insCA,p.A679fs*11,frameshift,,,,10.0,
465669,5637,687452,BLCA,ZCCHC14,ENST00000268616,c.2029_2030insAC,p.A679fs*11,frameshift,,,,10.0,


In [12]:
# Notes:
# - The 'AA' ('Amino acid position and alteration') field has values matching
#   those in the CCLE mutation dataset

# Normalize the column names (upper case, spaces -> underscores), then switch
# the identifier columns to the project's naming
d_exp = (
    d
    .rename(columns=lambda c: c.upper().replace(' ', '_'))
    .rename(columns={
        'SAMPLE': 'CELL_LINE_ID',
        'COSMIC_ID': 'CELL_LINE_ID:COSMIC',
        'GENE': 'GENE_ID'
    })
)

# Common cell line IDs may only be str or int (i.e. no floats); once that is
# confirmed they are all cast to str
assert d_exp['CELL_LINE_ID'].apply(type).isin([str, int]).all(), \
    'Found cell line ID not given as string or int'
d_exp['CELL_LINE_ID'] = d_exp['CELL_LINE_ID'].astype(str)

# Every COSMIC cell line ID must be an integer
assert (d_exp['CELL_LINE_ID:COSMIC'].apply(type) == int).all()

# Every gene name must be a string
assert (d_exp['GENE_ID'].apply(type) == str).all()

# The two cell line identifiers must not conflict: each value of one maps to
# exactly one value of the other
assert d_exp.groupby('CELL_LINE_ID')['CELL_LINE_ID:COSMIC'].nunique().eq(1).all()
assert d_exp.groupby('CELL_LINE_ID:COSMIC')['CELL_LINE_ID'].nunique().eq(1).all()

# Rows are unique only over the combination of:
# 'CELL_LINE_ID', 'CELL_LINE_ID:COSMIC', 'GENE_ID', 'CDNA', 'AA', 'TRANSCRIPT'
# Any less specific key matches multiple records ('AA' could be dropped from the
# key, but it seems worth keeping as part of any unique key)
d_exp.head()

Unnamed: 0,CELL_LINE_ID,CELL_LINE_ID:COSMIC,CANCER_TYPE,GENE_ID,TRANSCRIPT,CDNA,AA,CLASSIFICATION,GENE_LIST,RECURRENCE_FILTER,SUBS,TRUNCATING,INFRAME
0,KARPAS-45,907272,ALL,A1BG,ENST00000263100,c.842A>G,p.Y281C,missense,,,,,
1,Jurkat,998184,ALL,A1BG,ENST00000263100,c.589G>A,p.A197T,missense,,,,,
2,KARPAS-45,907272,ALL,A1BG,ENST00000263100,c.571G>A,p.A191T,missense,,,2.0,,
3,HT-115,907289,COAD/READ,A1BG,ENST00000263100,c.1456G>A,p.D486N,missense,,,,,
4,KM12,905989,COAD/READ,A1BG,ENST00000263100,c.770C>A,p.T257N,missense,,,,,


In [13]:
# Summarize dtypes and non-null counts of the renamed frame
d_exp.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 486243 entries, 0 to 486242
Data columns (total 13 columns):
CELL_LINE_ID           486243 non-null object
CELL_LINE_ID:COSMIC    486243 non-null int64
CANCER_TYPE            409611 non-null object
GENE_ID                486243 non-null object
TRANSCRIPT             486243 non-null object
CDNA                   486243 non-null object
AA                     486243 non-null object
CLASSIFICATION         486243 non-null object
GENE_LIST              24214 non-null object
RECURRENCE_FILTER      29214 non-null object
SUBS                   52568 non-null float64
TRUNCATING             65745 non-null float64
INFRAME                441 non-null float64
dtypes: float64(3), int64(1), object(9)
memory usage: 48.2+ MB


In [14]:
# The fields listed here must be fully populated; the remaining columns are
# mostly extraneous metadata and are allowed to contain NA values
non_na_cols = [
    'CELL_LINE_ID', 'CELL_LINE_ID:COSMIC', 'GENE_ID',
    'TRANSCRIPT', 'CDNA', 'AA', 'CLASSIFICATION'
]
for c in non_na_cols:
    # Fail on the first required field that holds any null
    assert not d_exp[c].isnull().any(), 'Found null values for field "{}"'.format(c)

In [15]:
# Count the distinct COSMIC cell line identifiers present in the data
d_exp['CELL_LINE_ID:COSMIC'].nunique()

1001

In [16]:
# Persist the prepared frame through the project's database helper; the value
# displayed by this cell is the path it reports
db.save(d_exp, src.GDSC_v2, db.IMPORT, 'gene-exome-seq')

'/Users/eczech/data/research/mgds/import/gdsc_v2_gene-exome-seq.pkl'