# NCI DREAM Methylation Data Import

Imports the DNA methylation data for the NCI DREAM drug sensitivity challenge.

Source data README: /Users/eczech/.synapseCache/428/756428/DREAM7_DrugSensitivity1_Methylation_README.txt

Note that the values here are "beta" values, and that some of them should be ignored or treated as unreliable depending on the accompanying "CGct1" and "Cct1" values (see the README for more details).

In [1]:
%run -m ipy_startup
from mgds.data_aggregation import database as db
from mgds.data_aggregation import source as src
from mgds.data_aggregation.import_lib import nci_dream

In [6]:
# Locate the challenge's methylation table through the project import helper
# (presumably a local/cached file path -- confirm against nci_dream.get_file).
file_path = nci_dream.get_file('Methylation.txt')

# The file is tab-delimited; the columns after the id/quality columns are one per cell line.
d = pd.read_csv(file_path, delimiter='\t')

# Preview the first rows of the wide-format table.
d.head()

Unnamed: 0,Illumina_ID,HGNC_ID,CGct1,Cct1,600MPE,AU565,BT20,BT474,BT483,BT549,...,SUM52PE,SUM149PT,SUM159PT,SUM185PE,SUM225CWN,SUM1315MO2,T47D,UACC812,ZR751,ZR7530
0,cg00000292,ATP2A1,2,10,0.881395,0.625653,0.922304,0.908254,0.932414,0.891903,...,0.910745,0.939521,0.889001,0.926362,0.813929,0.613688,0.862319,0.859278,0.888635,0.893735
1,cg00002426,SLMAP,3,10,0.067202,0.056657,0.057735,0.224676,0.365351,0.038872,...,0.061264,0.052474,0.045397,0.642288,0.046751,0.909573,0.777622,0.899977,0.098025,0.094077
2,cg00003994,MEOX2,1,13,0.863474,0.038321,0.541085,0.587678,0.038327,0.84666,...,0.181697,0.066858,0.066631,0.191392,0.051345,0.071066,0.158072,0.368934,0.757326,0.822995
3,cg00005847,HOXD3,1,12,0.842265,0.102223,0.893619,0.856988,0.87687,0.811981,...,0.838893,0.569239,0.478482,0.835688,0.428073,0.679917,0.57999,0.852463,0.836502,0.837975
4,cg00006414,ZNF398,4,9,0.084663,0.091306,0.065515,0.089713,0.128638,0.070089,...,0.055451,0.0728,0.074124,0.138269,0.087123,0.048023,0.188648,0.085332,0.074521,0.122249


In [7]:
# Normalize the identifier/quality column names to the upper-case names used by
# the remaining cells of this notebook.
d = d.rename(columns={
    'HGNC_ID': 'GENE_ID:HGNC',
    'Illumina_ID': 'GENE_ID:ILLUMINA',
    'CGct1': 'CGCT1',
    'Cct1': 'CCT1',
})
id_vars = ['GENE_ID:HGNC', 'GENE_ID:ILLUMINA', 'CGCT1', 'CCT1']

# Wide -> long: every column that is not an id column is treated as a cell line
# and contributes one (CELL_LINE_ID, VALUE) row per input row.
d = pd.melt(d, id_vars=id_vars, value_name='VALUE', var_name='CELL_LINE_ID')

# Each (id columns, cell line) combination must occur exactly once.
# duplicated() is used rather than groupby(...).size() because groupby drops rows
# whose key contains NaN by default, which would let duplicates among such rows
# slip through; duplicated() treats NaN keys as equal and so still catches them.
key_cols = id_vars + ['CELL_LINE_ID']
assert not d.empty, 'Melted methylation frame is empty'
assert not d.duplicated(subset=key_cols).any(), \
    'Duplicate rows found for key columns: {}'.format(key_cols)
d.head()

Unnamed: 0,GENE_ID:HGNC,GENE_ID:ILLUMINA,CGCT1,CCT1,CELL_LINE_ID,VALUE
0,ATP2A1,cg00000292,2,10,600MPE,0.881395
1,SLMAP,cg00002426,3,10,600MPE,0.067202
2,MEOX2,cg00003994,1,13,600MPE,0.863474
3,HOXD3,cg00005847,1,12,600MPE,0.842265
4,ZNF398,cg00006414,4,9,600MPE,0.084663


In [8]:
# Print a summary of `d`: row count, per-column non-null counts and dtypes, and memory usage.
d.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1129591 entries, 0 to 1129590
Data columns (total 6 columns):
GENE_ID:HGNC        1129591 non-null object
GENE_ID:ILLUMINA    1129591 non-null object
CGCT1               1129591 non-null int64
CCT1                1129591 non-null int64
CELL_LINE_ID        1129591 non-null object
VALUE               1129591 non-null float64
dtypes: float64(1), int64(2), object(3)
memory usage: 51.7+ MB


In [9]:
# Guard: refuse to persist the frame if any cell in it is null.
assert not d.isnull().values.any()

# Hand the long-format frame to the project database layer as the raw
# 'gene-methylation' dataset for the NCI DREAM v1 source; the call's return value
# is displayed as the cell output (the recorded run shows a .pkl file path).
db.save(d, src.NCIDREAM_v1, db.RAW, 'gene-methylation')

'/Users/eczech/data/research/musc_genomics_db/raw/ncidream_v1_gene-methylation.pkl'