# GDSC Raw Copy Number Data Importation
**Local Version**: 2
**Source Version**: 6.0

This notebook imports raw GDSC copy number data from the [GDSC](http://www.cancerrxgene.org/downloads) portal, which hosts its files on the [Sanger FTP Server](ftp://ftp.sanger.ac.uk/pub/project/cancerrxgene/releases/) (release-6.0 in this case).

Note that the GDSC exposes 4 copy number datasets, labeled as follows:

1. Raw - "Copy number data for Cell lines"
2. Preprocessed - "Gene level copy number data"
3. Preprocessed - "RACS in cell lines"
4. Preprocessed - "RACSs CNV BEMs for cell lines"

In this case option 2 ("Gene level copy number data") will be used, but the others are worth future consideration.

In [1]:
# Run the shared startup module inside this notebook's namespace.
# NOTE(review): presumably this is what defines `pd` and `np`, which are used
# throughout the notebook but never imported explicitly -- confirm.
%run -m ipy_startup
# NOTE(review): presumably configures logging (a DEBUG log line shows up in the
# recorded output of the download cell) -- confirm.
%run -m ipy_logging
from mgds.data_aggregation import database as db
from mgds.data_aggregation import source as src
from mgds.data_aggregation import io_utils as io
# Raise the row threshold below which DataFrame.info() reports per-column
# non-null counts, so the counts are still shown for very large frames.
pd.set_option('display.max_info_rows', 50000000)

In [2]:
# Fetch the release-6.0 gene-level copy number workbook into the project's raw
# file location for the GDSC_v2 source and keep the resulting local path.
# Judging by the log line in the recorded output, check_exists=True makes the
# helper return a previously downloaded file instead of fetching it again.
# NOTE(review): the recorded output shows a path ending in
# 'gdsc_v2_copy-number.xlsx', which does not match the file name used here --
# the output appears to predate the current file name; re-run to refresh it.
url = 'ftp://ftp.sanger.ac.uk/pub/project/cancerrxgene/releases/release-6.0/Gene_level_CN.xlsx'
filepath = io.download(
    url,
    db.raw_file(src.GDSC_v2, 'gene-copy-number.xlsx'),
    check_exists=True
)
filepath

2016-11-22 06:32:32,481:DEBUG:mgds.data_aggregation.io_utils: Returning previously downloaded path for "/Users/eczech/data/research/mgds/raw/gdsc_v2_copy-number.xlsx"


'/Users/eczech/data/research/mgds/raw/gdsc_v2_copy-number.xlsx'

In [3]:
# Load the 'Gene_level_CN' sheet of the downloaded workbook.
#
# The sheet name is passed positionally on purpose: the keyword for it was
# renamed from `sheetname` to `sheet_name` across pandas releases (and the old
# spelling was later removed), while its position as the second argument has
# stayed the same, so this call works on both old and current pandas.
#
# converters={'gene': str} runs every value of the `gene` column through str()
# so that gene identifiers are always strings.
d = pd.read_excel(filepath, 'Gene_level_CN', converters={'gene': str})
d.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 46222 entries, 0 to 46221
Columns: 1000 entries, gene to no-11
dtypes: float64(2), object(998)
memory usage: 352.6+ MB


In [4]:
# Preview the raw sheet. Note that row 0 is not a gene record: its gene/position
# fields are empty and each cell line column holds an integer ID instead.
d.head()

Unnamed: 0,gene,chr,start,stop,201T,22RV1,23132-87,42-MG-BA,451Lu,5637,...,WSU-NHL,YAPC,YH-13,YKG-1,YMB-1-E,YT,ZR-75-30,huH-1,no-10,no-11
0,,,,,1287381,924100,910924,687561,1287706,687452,...,909785,909904,909905,687592,1303911,946358,909907,1298146,908452,908450
1,DDX11L1,1.0,11869.0,14412.0,"-1,-1,-,-","-1,-1,-,-","-1,-1,-,-","-1,-1,-,-","-1,-1,-,-","-1,-1,-,-",...,"-1,-1,-,-","-1,-1,-,-","-1,-1,-,-","-1,-1,-,-","-1,-1,-,-","-1,-1,-,-","-1,-1,-,-","-1,-1,-,-","-1,-1,-,-","-1,-1,-,-"
2,WASH7P,1.0,14363.0,29806.0,"-1,-1,-,-","-1,-1,-,-","-1,-1,-,-","-1,-1,-,-","-1,-1,-,-","-1,-1,-,-",...,"-1,-1,-,-","-1,-1,-,-","-1,-1,-,-","-1,-1,-,-","-1,-1,-,-","-1,-1,-,-","-1,-1,-,-","-1,-1,-,-","-1,-1,-,-","-1,-1,-,-"
3,FAM138A,1.0,34554.0,36081.0,"-1,-1,-,-","-1,-1,-,-","-1,-1,-,-","-1,-1,-,-","-1,-1,-,-","-1,-1,-,-",...,"-1,-1,-,-","-1,-1,-,-","-1,-1,-,-","-1,-1,-,-","-1,-1,-,-","-1,-1,-,-","-1,-1,-,-","-1,-1,-,-","-1,-1,-,-","-1,-1,-,-"
4,OR4G11P,1.0,62948.0,63887.0,"5,5,L,-","2,2,H,-","2,2,H,-","5,5,H,-","4,4,H,-","3,3,H,-",...,"2,2,L,-","4,4,H,-","2,2,H,-","3,3,H,-","2,2,H,-","5,5,L,-","2,2,L,-","2,2,L,-","3,3,H,-","3,3,L,-"


In [5]:
# Row 0 carries one ID per cell line column rather than gene data. Take that
# row, then skip its first four entries (the gene, chr, start and stop columns)
# so only the per-cell-line IDs remain, indexed by the cell line column label.
header_row = d.iloc[0]
cosmic_ids = header_row.iloc[4:]
cosmic_ids.head()

201T        1287381
22RV1        924100
23132-87     910924
42-MG-BA     687561
451Lu       1287706
Name: 0, dtype: object

In [6]:
# Count the Python types of the column labels: cell line names that look like
# numbers can come through as ints rather than strings.
pd.Series(list(map(type, d.columns))).value_counts()

<class 'str'>    998
<class 'int'>      2
dtype: int64

In [7]:
# Preview the frame with the first row (the per-cell-line ID row) skipped, i.e.
# only the actual gene records.
d.iloc[1:,:].head()

Unnamed: 0,gene,chr,start,stop,201T,22RV1,23132-87,42-MG-BA,451Lu,5637,...,WSU-NHL,YAPC,YH-13,YKG-1,YMB-1-E,YT,ZR-75-30,huH-1,no-10,no-11
1,DDX11L1,1,11869.0,14412.0,"-1,-1,-,-","-1,-1,-,-","-1,-1,-,-","-1,-1,-,-","-1,-1,-,-","-1,-1,-,-",...,"-1,-1,-,-","-1,-1,-,-","-1,-1,-,-","-1,-1,-,-","-1,-1,-,-","-1,-1,-,-","-1,-1,-,-","-1,-1,-,-","-1,-1,-,-","-1,-1,-,-"
2,WASH7P,1,14363.0,29806.0,"-1,-1,-,-","-1,-1,-,-","-1,-1,-,-","-1,-1,-,-","-1,-1,-,-","-1,-1,-,-",...,"-1,-1,-,-","-1,-1,-,-","-1,-1,-,-","-1,-1,-,-","-1,-1,-,-","-1,-1,-,-","-1,-1,-,-","-1,-1,-,-","-1,-1,-,-","-1,-1,-,-"
3,FAM138A,1,34554.0,36081.0,"-1,-1,-,-","-1,-1,-,-","-1,-1,-,-","-1,-1,-,-","-1,-1,-,-","-1,-1,-,-",...,"-1,-1,-,-","-1,-1,-,-","-1,-1,-,-","-1,-1,-,-","-1,-1,-,-","-1,-1,-,-","-1,-1,-,-","-1,-1,-,-","-1,-1,-,-","-1,-1,-,-"
4,OR4G11P,1,62948.0,63887.0,"5,5,L,-","2,2,H,-","2,2,H,-","5,5,H,-","4,4,H,-","3,3,H,-",...,"2,2,L,-","4,4,H,-","2,2,H,-","3,3,H,-","2,2,H,-","5,5,L,-","2,2,L,-","2,2,L,-","3,3,H,-","3,3,L,-"
5,OR4F5,1,65882.0,70008.0,"5,5,L,-","2,2,H,-","2,2,H,-","5,5,H,-","4,4,H,-","3,3,H,-",...,"2,2,L,-","4,4,H,-","2,2,H,-","3,3,H,-","2,2,H,-","5,5,L,-","2,2,L,-","2,2,L,-","3,3,H,-","3,3,L,-"


In [10]:
# Reshape from wide (one column per cell line) to long format: one row per
# gene + cell line pair. Row 0 is excluded because it holds IDs, not gene data.
# The former column labels land in CELL_LINE_ID and the cell contents in VALUE.
melt_options = dict(
    id_vars=['gene', 'chr', 'start', 'stop'],
    var_name='CELL_LINE_ID',
    value_name='VALUE',
)
d_tr = pd.melt(d.iloc[1:,:], **melt_options)
d_tr.head()

Unnamed: 0,gene,chr,start,stop,CELL_LINE_ID,VALUE
0,DDX11L1,1,11869.0,14412.0,201T,"-1,-1,-,-"
1,WASH7P,1,14363.0,29806.0,201T,"-1,-1,-,-"
2,FAM138A,1,34554.0,36081.0,201T,"-1,-1,-,-"
3,OR4G11P,1,62948.0,63887.0,201T,"5,5,L,-"
4,OR4F5,1,65882.0,70008.0,201T,"5,5,L,-"


In [11]:
# Normalise the melted frame: upper-case column names, string-typed cell line
# IDs, and a separate COSMIC ID column, validating every step with assertions.
#
# The steps are ordered so that this cell can safely be re-run on its own
# output. The COSMIC lookup is keyed by the *string* form of each cell line
# label; a lookup keyed by the raw labels would no longer match integer-valued
# labels once CELL_LINE_ID has been converted to strings, which would trip the
# not-null assertion on a second run.
d_tr = d_tr.rename(columns=lambda c: c.upper()).rename(columns={'GENE': 'GENE_ID:HGNC'})

# For common cell line IDs, make sure there are no floats and then convert each to string
assert np.all(d_tr['CELL_LINE_ID'].apply(type).isin([str, int])), \
    'Found cell line ID not given as string or int'
d_tr['CELL_LINE_ID'] = d_tr['CELL_LINE_ID'].astype(str)

# Add COSMIC integer IDs as a separate field, looked up by stringified cell line label
cosmic_by_label = pd.Series(cosmic_ids.values, index=[str(label) for label in cosmic_ids.index])
assert cosmic_by_label.index.is_unique, \
    'Found cell line labels that collide once converted to strings'
d_tr['CELL_LINE_ID:COSMIC'] = d_tr['CELL_LINE_ID'].map(cosmic_by_label)
assert np.all(d_tr['CELL_LINE_ID:COSMIC'].notnull())

# For COSMIC IDs, make sure all values are integers first
assert np.all(d_tr['CELL_LINE_ID:COSMIC'].apply(type) == int)
d_tr['CELL_LINE_ID:COSMIC'] = d_tr['CELL_LINE_ID:COSMIC'].astype(str)

# Ensure that no cell line IDs ever conflict with one another
assert d_tr.groupby('CELL_LINE_ID')['CELL_LINE_ID:COSMIC'].nunique().max() == 1
assert d_tr.groupby('CELL_LINE_ID:COSMIC')['CELL_LINE_ID'].nunique().max() == 1

# Ensure that each cell line + gene combination has no more than 1 record
assert d_tr.groupby(['CELL_LINE_ID', 'GENE_ID:HGNC']).size().max() == 1, \
    'Found at least one duplicated cell line + gene combinaton'

d_tr.head()

Unnamed: 0,GENE_ID:HGNC,CHR,START,STOP,CELL_LINE_ID,VALUE,CELL_LINE_ID:COSMIC
0,DDX11L1,1,11869.0,14412.0,201T,"-1,-1,-,-",1287381
1,WASH7P,1,14363.0,29806.0,201T,"-1,-1,-,-",1287381
2,FAM138A,1,34554.0,36081.0,201T,"-1,-1,-,-",1287381
3,OR4G11P,1,62948.0,63887.0,201T,"5,5,L,-",1287381
4,OR4F5,1,65882.0,70008.0,201T,"5,5,L,-",1287381


In [12]:
# Summarise the long-format frame: per-column non-null counts, dtypes and
# memory usage.
d_tr.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 46036116 entries, 0 to 46036115
Data columns (total 7 columns):
GENE_ID:HGNC           46036116 non-null object
CHR                    46036116 non-null object
START                  46036116 non-null float64
STOP                   46036116 non-null float64
CELL_LINE_ID           46036116 non-null object
VALUE                  46036116 non-null object
CELL_LINE_ID:COSMIC    46036116 non-null object
dtypes: float64(2), object(5)
memory usage: 2.4+ GB


In [13]:
# Count the Python types present in the gene ID column (a single `str` entry
# means every gene ID is a string).
d_tr['GENE_ID:HGNC'].apply(type).value_counts()

<class 'str'>    46036116
Name: GENE_ID:HGNC, dtype: int64

In [14]:
# Show the most frequent raw copy number value strings.
d_tr['VALUE'].value_counts().head()

2,2,H,-    13527476
3,3,H,-    10046271
4,4,H,-     7211121
2,2,L,-     6579659
3,3,L,-     2454722
Name: VALUE, dtype: int64

In [15]:
# Number of distinct cell lines in the long-format frame.
d_tr['CELL_LINE_ID'].nunique()

996

In [16]:
# Final sanity check before persisting: there must be no nulls anywhere.
# The null mask is reduced through its underlying array so the result is always
# a single bool; np.all() on a DataFrame delegates to DataFrame.all, whose
# handling of axis=None has differed between pandas/numpy versions.
assert d_tr.notnull().values.all(), \
    'Found null values in the copy number data to be saved'
# Persist the validated frame as the imported 'gene-copy-number' dataset.
db.save(d_tr, src.GDSC_v2, db.IMPORT, 'gene-copy-number')

'/Users/eczech/data/research/mgds/import/gdsc_v2_gene-copy-number.pkl'