# NCI DREAM Gene Expression Data Import

Gene expression data import for the NCI DREAM drug sensitivity challenge.

Source data README: /Users/eczech/.synapseCache/428/756428/DREAM7_DrugSensitivity1_GeneExpression_README.txt

In [1]:
%run -m ipy_startup
from mgds.data_aggregation import database as db
from mgds.data_aggregation import source as src
from mgds.data_aggregation import excel_utils
from mgds.data_aggregation.import_lib import nci_dream
from py_utils.collection_utils import subset
from py_utils import assertion_utils

In [7]:
# Locate and read the tab-separated NCI DREAM gene expression file
file_path = nci_dream.get_file('GeneExpression.txt')
d = pd.read_csv(file_path, sep='\t')

# At the time of writing, none of the gene ids in this file were "Excel Dates";
# fail immediately if that ever stops being true
excel_date_mask = excel_utils.is_excel_date(d['HGNC_ID'])
assert not np.any(excel_date_mask)

# Apply the manual conversions for known special cases in NCI DREAM gene symbols
d = d.assign(HGNC_ID=nci_dream.convert_hgnc_id(d['HGNC_ID']))

d.head()

Unnamed: 0,HGNC_ID,184B5,600MPE,AU565,BT20,BT474,BT483,CAMA1,HCC38,HCC70,...,SUM185PE,SUM225CWN,SUM1315MO2,T47D,UACC812,ZR751,ZR7530,ZR75B,BT549,MCF10A
0,C9orf152,3.61303,8.463736,7.935823,3.098928,9.010836,6.238427,7.53196,2.941374,5.048602,...,8.611076,7.479183,2.743167,7.885278,6.478226,9.30671,7.672073,9.016769,3.312954,3.257463
1,ELMO2,7.20594,7.959597,8.073369,8.164778,8.120015,8.950747,7.747369,8.551765,7.708224,...,9.39123,8.086427,7.653533,8.027774,8.112902,8.274168,8.654507,7.907656,8.463506,7.348165
2,RPS11,10.476276,9.939852,9.81743,10.0385,9.533423,9.471906,9.889735,9.852145,9.496845,...,9.025056,9.624821,9.447288,8.779309,9.069048,9.333409,9.360958,10.064785,9.693886,9.858231
3,CREB3L1,5.565989,9.835957,5.696639,4.972852,8.155372,6.437281,6.57289,7.007242,5.386509,...,9.897796,8.059426,7.569412,5.437983,8.168436,7.360383,8.892531,7.051816,7.973954,5.256
4,PNMA1,7.664038,7.645439,7.761882,7.766212,8.393242,8.081416,7.643172,7.922658,7.409595,...,7.985742,7.395009,8.757817,8.315187,8.430294,7.001755,7.582231,7.348228,8.558804,7.668157


## Melt to Long Format

In [8]:
# Reshape from one column per cell line to one row per (gene, cell line) pair,
# relabelling the gene symbol column on the way in
id_vars = ['GENE_ID:HGNC']
d = pd.melt(
    d.rename(columns={'HGNC_ID': 'GENE_ID:HGNC'}),
    id_vars=id_vars,
    value_name='VALUE',
    var_name='CELL_LINE_ID'
)

# Each gene + cell line combination must appear exactly once
key_counts = d.groupby(id_vars + ['CELL_LINE_ID']).size()
assert key_counts.max() == 1

# Drop any records whose measured value is null
d = subset(d, lambda df: df[pd.notnull(df['VALUE'])], subset_op='Remove null values for column "VALUE"')

# Verify the types held in object columns (expected to all be strings)
assertion_utils.assert_object_types(d)

d.head()

[Remove null values for column "VALUE"] Records before = 857072, Records after = 857072, Records removed = 0 (%0.00)


Unnamed: 0,GENE_ID:HGNC,CELL_LINE_ID,VALUE
0,C9orf152,184B5,3.61303
1,ELMO2,184B5,7.20594
2,RPS11,184B5,10.476276
3,CREB3L1,184B5,5.565989
4,PNMA1,184B5,7.664038


In [9]:
# Summarize the long-format frame: row count, per-column non-null counts and dtypes
d.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 857072 entries, 0 to 857071
Data columns (total 3 columns):
GENE_ID:HGNC    857072 non-null object
CELL_LINE_ID    857072 non-null object
VALUE           857072 non-null float64
dtypes: float64(1), object(2)
memory usage: 26.2+ MB


## Export

In [10]:
# Final guard before export: no cell anywhere in the frame may be null
assert d.notnull().values.all()

# Persist the long-format frame; the call's return value is displayed as the cell output
db.save(d, src.NCIDREAM_v1, db.IMPORT, 'gene-expression')

'/Users/eczech/data/research/mgds/import/ncidream_v1_gene-expression.pkl'