# NCI DREAM Methylation Data Import

DNA Methylation data import for NCI DREAM drug sensitivity challenge.

Source data README: /Users/eczech/.synapseCache/428/756428/DREAM7_DrugSensitivity1_Methylation_README.txt

Note that the values here are methylation "beta" values, and that an individual value is sometimes ignored or treated as unreliable depending on the accompanying "CGct1" or "Cct1" value for its probe (see the README for more details).

In [1]:
%run -m ipy_startup
%run -m ipy_seaborn
from mgds.data_aggregation import database as db
from mgds.data_aggregation import source as src
from mgds.data_aggregation import excel_utils
from mgds.data_aggregation.import_lib import nci_dream
from py_utils import assertion_utils
from py_utils import collection_utils

In [2]:
# Read the raw tab-delimited methylation table shipped with the challenge data.
# Note: Illumina_ID is an array (probe) id; it is unrelated to cell line or gene ids.
file_path = nci_dream.get_file('Methylation.txt')
d = pd.read_csv(file_path, sep='\t')

# Apply the manual conversions for known special-case gene symbols in NCI DREAM,
# replacing the HGNC_ID column with the converted values.
d = d.assign(HGNC_ID=nci_dream.convert_hgnc_id(d['HGNC_ID']))

d.head()

Unnamed: 0,Illumina_ID,HGNC_ID,CGct1,Cct1,600MPE,AU565,BT20,BT474,BT483,BT549,...,SUM52PE,SUM149PT,SUM159PT,SUM185PE,SUM225CWN,SUM1315MO2,T47D,UACC812,ZR751,ZR7530
0,cg00000292,ATP2A1,2,10,0.881395,0.625653,0.922304,0.908254,0.932414,0.891903,...,0.910745,0.939521,0.889001,0.926362,0.813929,0.613688,0.862319,0.859278,0.888635,0.893735
1,cg00002426,SLMAP,3,10,0.067202,0.056657,0.057735,0.224676,0.365351,0.038872,...,0.061264,0.052474,0.045397,0.642288,0.046751,0.909573,0.777622,0.899977,0.098025,0.094077
2,cg00003994,MEOX2,1,13,0.863474,0.038321,0.541085,0.587678,0.038327,0.84666,...,0.181697,0.066858,0.066631,0.191392,0.051345,0.071066,0.158072,0.368934,0.757326,0.822995
3,cg00005847,HOXD3,1,12,0.842265,0.102223,0.893619,0.856988,0.87687,0.811981,...,0.838893,0.569239,0.478482,0.835688,0.428073,0.679917,0.57999,0.852463,0.836502,0.837975
4,cg00006414,ZNF398,4,9,0.084663,0.091306,0.065515,0.089713,0.128638,0.070089,...,0.055451,0.0728,0.074124,0.138269,0.087123,0.048023,0.188648,0.085332,0.074521,0.122249


## Correct Excel Dates in Gene Names

In [3]:
# Gene symbols such as "5-Mar" / "2-Sep" are Excel date-mangled names; convert them
# back and display the original -> converted pairs for review.
original_gene_id = d['HGNC_ID']
converted_gene_id = excel_utils.convert_gene_ids(original_gene_id)
excel_utils.get_gene_conversions(original_gene_id, converted_gene_id)

Unnamed: 0,ORIGINAL_GENE_ID,CONVERTED_GENE_ID
744,5-Mar,MARCH5
789,2-Sep,SEPT2
922,11-Sep,SEPT11
2474,7-Mar,MARCH7
2699,8-Mar,MARCH8
3343,9-Sep,SEPT9
3568,10-Sep,SEPT10
4306,3-Sep,SEPT3
4474,9-Sep,SEPT9
4543,7-Sep,SEPT7


In [4]:
# Replace the gene symbols with their corrected versions.
d['HGNC_ID'] = converted_gene_id

# Keep only rows that still have a gene symbol; the helper reports the
# before/after record counts under the given operation label.
d = collection_utils.subset(
    d,
    lambda frame: frame[frame['HGNC_ID'].notnull()],
    subset_op='Remove records with null gene ids'
)
d.info()

[Remove records with null gene ids] Records before = 27551, Records after = 27549, Records removed = 2 (%0.01)
<class 'pandas.core.frame.DataFrame'>
Int64Index: 27549 entries, 0 to 27550
Data columns (total 45 columns):
Illumina_ID    27549 non-null object
HGNC_ID        27549 non-null object
CGct1          27549 non-null int64
Cct1           27549 non-null int64
600MPE         27549 non-null float64
AU565          27549 non-null float64
BT20           27549 non-null float64
BT474          27549 non-null float64
BT483          27549 non-null float64
BT549          27549 non-null float64
CAMA1          27549 non-null float64
HCC38          27549 non-null float64
HCC70          27549 non-null float64
HCC202         27549 non-null float64
HCC1143        27549 non-null float64
HCC1187        27549 non-null float64
HCC1428        27549 non-null float64
HCC1569        27549 non-null float64
HCC1937        27549 non-null float64
HCC1954        27549 non-null float64
HCC2185        27549 non-n

## Melt to Long Format

In [5]:
# Identifier columns retained on every long-format row
id_vars = ['GENE_ID:HGNC', 'ARRAY_ID:ILLUMINA', 'CGCT1', 'CCT1']

# Source column name -> standardized column name
column_names = {
    'HGNC_ID': 'GENE_ID:HGNC',
    'Illumina_ID': 'ARRAY_ID:ILLUMINA',
    'CGct1': 'CGCT1',
    'Cct1': 'CCT1',
}

# Every remaining (per-cell-line) column becomes one row per probe,
# with the column name in CELL_LINE_ID and the measurement in VALUE
d_tr = pd.melt(
    d.rename(columns=column_names),
    id_vars=id_vars,
    var_name='CELL_LINE_ID',
    value_name='VALUE'
)

# Make sure there are no duplicate cell line + gene + array combinations
# (i.e. no repeated rows once VALUE is disregarded)
key_columns = [c for c in d_tr.columns if c != 'VALUE']
assert not d_tr.duplicated(subset=key_columns).any()

# Make sure that all object fields contain only strings
assertion_utils.assert_object_types(d_tr, na_ok=False)

d_tr.head()

Unnamed: 0,GENE_ID:HGNC,ARRAY_ID:ILLUMINA,CGCT1,CCT1,CELL_LINE_ID,VALUE
0,ATP2A1,cg00000292,2,10,600MPE,0.881395
1,SLMAP,cg00002426,3,10,600MPE,0.067202
2,MEOX2,cg00003994,1,13,600MPE,0.863474
3,HOXD3,cg00005847,1,12,600MPE,0.842265
4,ZNF398,cg00006414,4,9,600MPE,0.084663


In [6]:
# Summarize the melted (long-format) frame: row count plus non-null count and dtype per column
d_tr.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1129509 entries, 0 to 1129508
Data columns (total 6 columns):
GENE_ID:HGNC         1129509 non-null object
ARRAY_ID:ILLUMINA    1129509 non-null object
CGCT1                1129509 non-null int64
CCT1                 1129509 non-null int64
CELL_LINE_ID         1129509 non-null object
VALUE                1129509 non-null float64
dtypes: float64(1), int64(2), object(3)
memory usage: 51.7+ MB


## Aggregate Replicate Measurements

In [7]:
# Collapse multiple probes for the same cell line + gene into a single record
# holding the mean, sample standard deviation, and number of contributing values.
#
# According to the documentation for this source, it is common to ignore probes
# with a CGCT1 or CCT1 of < 3 (or at least I think that's what the docs mean)
# so consider doing that here in the future:

# d_agg = d_tr[(d_tr['CGCT1'] >= 3) & (d_tr['CCT1'] >= 3)]\
#     .groupby(['CELL_LINE_ID', 'GENE_ID:HGNC'])['VALUE']\
#     .agg([('VALUE_MEAN', 'mean'), ('VALUE_STD', 'std'), ('VALUE_CT', 'count')]).reset_index()

# A list of (output name, function) pairs is used rather than a {name: function} dict:
# the dict "renamer" form on a grouped Series was deprecated in pandas 0.20 and raises
# SpecificationError in pandas >= 1.0. The list form also gives a deterministic
# column order (MEAN, STD, CT).
d_agg = d_tr.groupby(['CELL_LINE_ID', 'GENE_ID:HGNC'])['VALUE']\
    .agg([('VALUE_MEAN', 'mean'), ('VALUE_STD', 'std'), ('VALUE_CT', 'count')]).reset_index()

# The sample standard deviation is undefined (NaN) for groups with a single value;
# record those as 0 so the result contains no nulls
d_agg['VALUE_STD'] = d_agg['VALUE_STD'].fillna(0)
d_agg.head()

Unnamed: 0,CELL_LINE_ID,GENE_ID:HGNC,VALUE_STD,VALUE_MEAN,VALUE_CT
0,600MPE,7A5,0.0,0.252937,1
1,600MPE,A1BG,0.0,0.962454,1
2,600MPE,A2BP1,0.089486,0.496695,2
3,600MPE,A2M,0.0,0.706011,1
4,600MPE,A2ML1,0.011319,0.785992,2


In [8]:
# Summarize the aggregated frame: row count plus non-null count and dtype per column
d_agg.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 593475 entries, 0 to 593474
Data columns (total 5 columns):
CELL_LINE_ID    593475 non-null object
GENE_ID:HGNC    593475 non-null object
VALUE_STD       593475 non-null float64
VALUE_MEAN      593475 non-null float64
VALUE_CT        593475 non-null int64
dtypes: float64(2), int64(1), object(2)
memory usage: 22.6+ MB


## Export

In [9]:
# Refuse to export if any cell of the aggregated frame is null
assert d_agg.notnull().values.all()

# Persist the aggregated frame as the 'gene-methylation' import for the NCI DREAM source;
# the cell displays the value returned by the save call
db.save(d_agg, src.NCIDREAM_v1, db.IMPORT, 'gene-methylation')

'/Users/eczech/data/research/mgds/import/ncidream_v1_gene-methylation.pkl'