# NCI Dream Cell Line Metadata Import

There appears to be no source of deeper metadata on the NCI Dream cell lines, so for now the metadata will consist of just the cell line ids and their known primary site/type, Breast Cancer.  This could be created from any combination of training/test genomics data with records for all 53 cell lines, but for simplicity the [supplementary materials](https://www.synapse.org/#!Synapse:syn2785786) spreadsheet will be used instead, since it contains several tables with all 53 cell line ids in one place.

In [15]:
# Run the shared startup module in this kernel's namespace.
# NOTE(review): `pd` and `np` are used further down but never imported in this
# notebook, so they presumably come from ipy_startup -- confirm.
%run -m ipy_startup
# Render matplotlib figures inline in the notebook output.
%matplotlib inline
from mgds.data_aggregation import database as db
from mgds.data_aggregation import source as src
from mgds.data_aggregation import data_type
from mgds.data_aggregation import entity
from mgds.data_aggregation.import_lib import nci_dream
from mgds.data_aggregation import excel_utils
from py_utils import assertion_utils

In [11]:
# Read the supplementary-materials sheet that lists every cell line id.
# The first four spreadsheet rows are skipped (presumably title/caption rows
# above the real header -- confirm against the workbook).
# NOTE(review): `sheetname` is the legacy keyword; newer pandas releases spell
# it `sheet_name`. Update this if the environment's pandas is upgraded.
d = pd.read_excel(
    nci_dream.get_file('NCI-DREAM_SC1_Supplementary_Tables.xlsx'), 
    sheetname='Supp Table 10', skiprows=[0,1,2,3]
)

# Only the id column is needed; rename it to the metadata column name.
d = d[['Cell line']].rename(columns={'Cell line': 'CELL_LINE_ID'})

# Every cell line in this dataset has the same known primary site.
d['PRIMARY_SITE'] = 'BREAST'

# Give each row its OWN empty dict. `np.repeat({}, n)` would place n references
# to a single shared dict in the column, so adding a property to one cell line
# would silently add it to all of them.
d['PROPERTIES'] = [{} for _ in range(len(d))]

d = entity.prepare_cellline_meta(d)
d.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 53 entries, 0 to 52
Data columns (total 3 columns):
CELL_LINE_ID    53 non-null object
PRIMARY_SITE    53 non-null object
PROPERTIES      53 non-null object
dtypes: object(3)
memory usage: 1.3+ KB


In [12]:
# Show the distinct cell line ids as a visual sanity check on the import.
d.loc[:, 'CELL_LINE_ID'].unique()

array(['HCC1954', 'AU565', 'HCC1937', 'CAMA1', 'T47DKBLUC', 'UACC812',
       'HCC1569', 'MCF12A', 'HCC1187', 'HCC38', 'SUM229PE', 'ZR751',
       'BT483', 'T47D', 'ZR7530', 'BT549', 'MDAMB231', 'MDAMB453',
       'MCF10F', 'MDAMB157', 'HCC1428', 'MDAMB134VI', 'BT20', 'HCC1419',
       'MDAMB361', 'HCC202', 'MDAMB415', 'MCF7', 'MDAMB175VII', 'HCC1395',
       'HCC1143', 'HCC70', 'BT474', 'HCC1806', 'HS578T', '184A1', '184B5',
       '21MT1', '21NT', '600MPE', 'HCC2185', 'HCC3153', 'LY2', 'MCF10A',
       'MX1', 'SKBR3', 'SUM1315MO2', 'SUM149PT', 'SUM159PT', 'SUM185PE',
       'SUM225CWN', 'SUM52PE', 'ZR75B'], dtype=object)

In [16]:
# Persist the prepared metadata frame as the NCI DREAM v1 import-stage
# cell line metadata; the value echoed below the cell is the written file path.
db.save(
    d,
    src.NCIDREAM_v1,
    db.IMPORT,
    data_type.CELLLINE_META,
)

'/Users/eczech/data/research/mgds/import/ncidream_v1_cellline-meta.pkl'