# NCI60 Gene Expression Data Importation
**Local Version**: 2
**Source Version**: NA

This notebook imports raw NCI60 gene expression data through the [CGDS](http://www.cbioportal.org/cgds_r.jsp) ("Cancer Genomics Data Server") portal.

In [1]:
# Shared startup/logging modules are executed into the notebook namespace.
# NOTE(review): `pd` and `np` are used below but never imported in this notebook;
# presumably they are injected by ipy_startup -- confirm.
%run -m ipy_startup
%run -m ipy_logging
# Project modules: storage helpers, source identifiers, gene id lookup and CGDS/NCI60 import helpers
from mgds.data_aggregation import database as db
from mgds.data_aggregation import source as src
from mgds.data_aggregation import api
from mgds.data_aggregation.import_lib import cgds
from mgds.data_aggregation.import_lib import nci60
# Raise the row threshold below which DataFrame.info() still reports per-column non-null counts
pd.set_option('display.max_info_rows', 25000000)

In [2]:
# Request parameters: which NCI60 case list / genetic profile to query and
# how many gene ids are passed per batch (gene_id_batch_size).
batch_size = 50
genetic_profile_id = nci60.PROF_GENE_EXPRESSION
case_list_id = nci60.CASE_LIST_ID


def op():
    """Fetch the gene expression profile data via the CGDS helper.

    Gene ids come from api.get_hugo_gene_ids() and are only looked up when
    this function is actually invoked.
    """
    gene_ids = api.get_hugo_gene_ids()
    return cgds.get_genetic_profile_data(
        case_list_id,
        genetic_profile_id,
        gene_ids,
        gene_id_batch_size=batch_size
    )


# NOTE(review): with overwrite=False this presumably reuses a previously cached
# result instead of calling op() again (the recorded log shows a pickle being
# restored) -- confirm against db.cache_raw_operation.
d = db.cache_raw_operation(op, src.NCI60_v2, 'gene-expression', overwrite=False)

2016-11-28 14:15:44,535:DEBUG:mgds.data_aggregation.io_utils: Restoring serialized object from "/Users/eczech/data/research/mgds/raw/nci60_v2_gene-expression.pkl"


In [3]:
# Preview the raw frame; per the recorded output it is wide, with GENE_ID/COMMON
# columns followed by one column per cell line.
d.head()

Unnamed: 0,GENE_ID,COMMON,BT_549,HS578T,MCF7,MDA_MB_231,T47D,SF_268,SF_295,SF_539,...,DU_145,PC_3,786_0,A498,ACHN,CAKI_1,RXF_393,SN12C,TK_10,UO_31
0,1,A1BG,,,,,,,,,...,,,,,,,,,,
1,503538,A1BG-AS1,1.7,0.87,1.36,-0.15,1.38,,,,...,0.3,-1.19,-1.1,-0.81,-0.64,0.18,-0.02,-0.35,-1.05,-0.51
2,29974,A1CF,-0.1,0.15,-0.14,-0.18,-0.16,,,,...,-0.37,-0.32,-0.12,-0.05,-0.07,-0.1,-0.19,-0.23,-0.07,-0.27
3,2,A2M,-0.39,-0.24,-0.38,-0.4,-0.38,,,,...,-0.36,-0.51,-0.45,-0.35,-0.36,-0.39,-0.4,-0.43,-0.37,-0.4
4,144571,A2M-AS1,,,,,,,,,...,,,,,,,,,,


In [4]:
# Convert the raw frame with cgds.melt_raw_data; per the recorded output the result
# has GENE_ID:CGDS, GENE_ID:HGNC, CELL_LINE_ID and VALUE columns, with null VALUE rows removed.
# NOTE(review): this rebinds `d`, so re-running this cell without first re-running
# the load cell would pass the already-converted frame back into melt_raw_data.
d = cgds.melt_raw_data(d)
# Summarize dtypes and non-null counts of the converted frame
d.info()

[Remove null values for column "VALUE"] Records before = 2341920, Records after = 1209798, Records removed = 1132122 (%48.34)
<class 'pandas.core.frame.DataFrame'>
Int64Index: 1209798 entries, 1 to 2341919
Data columns (total 4 columns):
GENE_ID:CGDS    1209798 non-null int64
GENE_ID:HGNC    1209798 non-null object
CELL_LINE_ID    1209798 non-null object
VALUE           1209798 non-null float64
dtypes: float64(1), int64(1), object(2)
memory usage: 46.2+ MB


In [5]:
# Aggregate the long-format values. Per the recorded outputs, d_agg carries
# VALUE_MEAN / VALUE_STD / VALUE_CT columns and d_dist is a "Number of Replicates"
# frequency table (presumably counted per gene / cell line pair -- confirm in cgds.aggregate).
d_agg, d_dist = cgds.aggregate(d)
# Display the replicate-count distribution
d_dist

1    1206236
2       1457
3        216
Name: Number of Replicates, dtype: int64

In [6]:
# Check column dtypes and non-null counts of the aggregated frame before saving
d_agg.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1207909 entries, 0 to 1207908
Data columns (total 6 columns):
CELL_LINE_ID    1207909 non-null object
GENE_ID:HGNC    1207909 non-null object
GENE_ID:CGDS    1207909 non-null int64
VALUE_MEAN      1207909 non-null float64
VALUE_STD       1207909 non-null float64
VALUE_CT        1207909 non-null int64
dtypes: float64(2), int64(2), object(2)
memory usage: 55.3+ MB


In [8]:
# Guard the export: every field of the aggregated frame must be populated.
# The check reduces over the underlying boolean array so the result is always a
# single bool regardless of how the installed pandas/numpy versions reduce a
# DataFrame, and a failure reports the null count of every column.
assert d_agg.notnull().values.all(), (
    'Aggregated gene expression data contains null values:\n{}'
    .format(d_agg.isnull().sum())
)
# Persist the validated frame; the call is kept as the last expression so its
# return value (a file path in the recorded output) is still displayed.
db.save(d_agg, src.NCI60_v2, db.IMPORT, 'gene-expression')

'/Users/eczech/data/research/mgds/import/nci60_v2_gene-expression.pkl'