# NCI60 Cell Line Metadata Importation
**Local Version**: 2
**Source Version**: NA

This notebook imports the raw NCI60 cell line metadata from the CellMiner downloads page hosted at: https://discover.nci.nih.gov/cellminer/metadata.do

In [1]:
%run -m ipy_startup
%run -m ipy_logging
%matplotlib inline
import re

from mgds.data_aggregation import io_utils
from mgds.data_aggregation import database as db
from mgds.data_aggregation import source as src
from mgds.data_aggregation import entity
from mgds.data_aggregation.import_lib import cgds
from mgds.data_aggregation.import_lib import nci60
from py_utils import assertion_utils

In [2]:
# Fetch the CellMiner cell line metadata file into the raw data area for the
# NCI60_v2 source and keep the resulting local path for the parsing step.
# With check_exists=True an earlier download is reused (see the DEBUG log below).
url = 'https://discover.nci.nih.gov/cellminer/samples/CELLMINER_CELL_LINE_METADATA.txt'
filepath = io_utils.download(
    url,
    db.raw_file(src.NCI60_v2, 'cellline-meta.txt'),
    check_exists=True,
)
filepath

2016-12-01 15:06:36,775:DEBUG:mgds.data_aggregation.io_utils: Returning previously downloaded path for "/Users/eczech/data/research/mgds/raw/nci60_v2_cellline-meta.txt"


'/Users/eczech/data/research/mgds/raw/nci60_v2_cellline-meta.txt'

In [3]:
# Parse the tab-delimited metadata file.  The first 7 lines of the file are skipped
# and lines with an unexpected field count are dropped instead of raising (the
# parser reports each one, e.g. "Skipping line 69").  Only the first 60 rows are kept.
# NOTE(review): presumably everything past the 60 cell line rows is footnote text -- confirm.
# NOTE(review): `error_bad_lines` was deprecated in pandas 1.3 and removed in 2.0 in
# favor of `on_bad_lines='skip'`; switch over when this is re-run on a newer pandas.
d = pd.read_csv(
    filepath,
    sep='\t',
    skiprows=list(range(7)),
    error_bad_lines=False,
).iloc[:60, :]
d.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 60 entries, 0 to 59
Data columns (total 15 columns):
Cell Line Name           60 non-null object
tissue of origin (a)     60 non-null object
age (a)                  45 non-null float64
sex (a)                  54 non-null object
prior treatment (a,b)    39 non-null object
Epithelial               60 non-null object
histology (a,c)          60 non-null object
source                   20 non-null object
ploidy (d)               59 non-null object
p53 (e)                  60 non-null object
mdr (f)                  59 non-null float64
doubling time (g)        60 non-null float64
Institute                60 non-null object
Contributor              60 non-null object
Reference                60 non-null object
dtypes: float64(3), object(12)
memory usage: 7.1+ KB


b'Skipping line 69: expected 15 fields, saw 29\n'


In [25]:
# Reshape the raw CellMiner table into the cell line metadata layout used for export:
# normalized column names, bare cell line ids, and one PROPERTIES dict per row.
d_cl = d.copy()

# Matches a parenthesized annotation, e.g. the footnote marker in "age (a)" or the
# suffix in "LOXIMVI (h)".  A compiled raw-string pattern is applied through `re`
# rather than `Series.str.replace` with a plain string literal because:
#   * '\(' inside a non-raw string literal is an invalid escape sequence, and
#   * pandas >= 2.0 treats the `str.replace` pattern as a literal by default, which
#     would silently leave the annotations in place.
paren_note = re.compile(r'\(.*\)')

# Column names: drop footnote markers, trim, replace spaces with underscores and
# upper-case (e.g. "tissue of origin (a)" -> "TISSUE_OF_ORIGIN")
d_cl.columns = [
    paren_note.sub('', c).strip().replace(' ', '_').upper()
    for c in d_cl.columns
]
c_m = {
    'CELL_LINE_NAME': 'CELL_LINE_ID',
    'SEX': 'GENDER',
    'TISSUE_OF_ORIGIN': 'PRIMARY_SITE'
}
d_cl = d_cl.rename(columns=c_m)

# Strip off leading tissue type abbreviation on cell line ids (eg: BR:MCF7)
assert np.all(d_cl['CELL_LINE_ID'].apply(lambda x: len(x.split(':')) == 2)), \
    'Expected every cell line name to contain exactly one ":" separator'
d_cl['CELL_LINE_ID'] = d_cl['CELL_LINE_ID'].apply(lambda x: x.split(':')[1])

# Also strip out paren enclosures in cell line ids -- this caused 4 known issues
# where a cell line id here like "LOXIMVI (h)" does not match "LOXIMVI" in omics data sets
d_cl['CELL_LINE_ID'] = d_cl['CELL_LINE_ID'].apply(lambda x: paren_note.sub('', x).strip())

# Add generic properties map with more source-specific metadata
c_detail = ['HISTOLOGY', 'INSTITUTE', 'PRIOR_TREATMENT']
d_cl['PROPERTIES'] = d_cl[c_detail].apply(lambda r: r.to_dict(), axis=1)

# Subset to only most relevant fields
d_cl = d_cl[['CELL_LINE_ID', 'PRIMARY_SITE', 'AGE', 'GENDER', 'PROPERTIES']]

d_cl.head()

Unnamed: 0,CELL_LINE_ID,PRIMARY_SITE,AGE,GENDER,PROPERTIES
0,MCF7,Breast,69.0,F,"{'INSTITUTE': 'Michigan Cancer Foundtion', 'PR..."
1,MDA_MB_231,Breast,51.0,F,{'INSTITUTE': 'MD Anderson Hospital and Tumor ...
2,HS578T,Breast,74.0,F,"{'INSTITUTE': 'Naval Bioscience Laboratory', '..."
3,BT_549,Breast,72.0,F,"{'INSTITUTE': '?', 'PRIOR_TREATMENT': nan, 'HI..."
4,T47D,Breast,54.0,F,"{'INSTITUTE': '?', 'PRIOR_TREATMENT': nan, 'HI..."


In [26]:
# List every cleaned id to eyeball that the tissue prefixes and paren suffixes are gone
d_cl['CELL_LINE_ID'].values

array(['MCF7', 'MDA_MB_231', 'HS578T', 'BT_549', 'T47D', 'SF_268',
       'SF_295', 'SF_539', 'SNB_19', 'SNB_75', 'U251', 'COLO205',
       'HCC_2998', 'HCT_116', 'HCT_15', 'HT29', 'KM12', 'SW_620',
       'CCRF_CEM', 'HL_60', 'K_562', 'MOLT_4', 'RPMI_8226', 'SR',
       'LOXIMVI', 'MALME_3M', 'M14', 'SK_MEL_2', 'SK_MEL_28', 'SK_MEL_5',
       'UACC_257', 'UACC_62', 'MDA_MB_435', 'MDA_N', 'A549', 'EKVX',
       'HOP_62', 'HOP_92', 'NCI_H226', 'NCI_H23', 'NCI_H322M', 'NCI_H460',
       'NCI_H522', 'IGROV1', 'OVCAR_3', 'OVCAR_4', 'OVCAR_5', 'OVCAR_8',
       'SK_OV_3', 'NCI_ADR_RES', 'PC_3', 'DU_145', '786_0', 'A498', 'ACHN',
       'CAKI_1', 'RXF_393', 'SN12C', 'TK_10', 'UO_31'], dtype=object)

In [27]:
# Check dtypes and non-null counts of the reduced frame
d_cl.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 60 entries, 0 to 59
Data columns (total 5 columns):
CELL_LINE_ID    60 non-null object
PRIMARY_SITE    60 non-null object
AGE             45 non-null float64
GENDER          54 non-null object
PROPERTIES      60 non-null object
dtypes: float64(1), object(4)
memory usage: 2.4+ KB


In [28]:
# Tally GENDER values (value_counts leaves out missing entries by default)
d_cl['GENDER'].value_counts()

M    28
F    26
Name: GENDER, dtype: int64

In [29]:
# Tally cell lines per PRIMARY_SITE (renamed from the source's "tissue of origin" column)
d_cl['PRIMARY_SITE'].value_counts()

Melanoma                  10
Non-Small Cell Lung        9
Renal                      8
Ovarian                    7
Colon                      7
Leukemia                   6
Central nervous system     6
Breast                     5
Prostate                   2
Name: PRIMARY_SITE, dtype: int64

In [30]:
# Ten most frequent HISTOLOGY values, read back out of the per-row PROPERTIES dicts
d_cl['PROPERTIES'].apply(lambda x: x['HISTOLOGY']).value_counts().head(10)

Adenocarcinoma-md                                                                    4
Malignant melanotic melanoma                                                         4
Adenocarcinoma                                                                       3
Melanotic melanoma                                                                   3
Glioblastoma, ud                                                                     3
Adenocarcinoma-vpd                                                                   2
Carcinoma-ud                                                                         2
Ductal carcinoma- mammary gland; breast; duct; metastatic site: pleural effusion;    2
Clear cell carcinoma                                                                 1
Malignant amelanotic melanoma                                                        1
Name: PROPERTIES, dtype: int64

In [31]:
# Ten most frequent PRIOR_TREATMENT values, read back out of the per-row PROPERTIES dicts
# NOTE(review): the "None" entries counted here appear to be literal strings from the
# source file, distinct from missing (NaN) values, which value_counts omits -- confirm
d_cl['PROPERTIES'].apply(lambda x: x['PRIOR_TREATMENT']).value_counts().head(10)

None                    24
Rad                      3
CyPh/CsPt/Adr            2
Rad/VB/CCNU/Mto/Pred     1
Thiotepa                 1
Rad/BCNU/5FU/HU/6MP      1
Rad/HU/5FU/Mtx/Ctx       1
None (non smoker)        1
Ctx/Adr/CsPt/CyPh        1
VB/6MP/Pred              1
Name: PROPERTIES, dtype: int64

## Export

In [32]:
# Pass the cleaned frame through the shared cell line metadata preparation step
# NOTE(review): GENDER goes from 54 to 60 non-null in the output below, so
# prepare_cellline_meta presumably fills missing genders with a placeholder -- confirm
d_exp = entity.prepare_cellline_meta(d_cl)
d_exp.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 60 entries, 0 to 59
Data columns (total 5 columns):
CELL_LINE_ID    60 non-null object
PRIMARY_SITE    60 non-null object
AGE             45 non-null float64
GENDER          60 non-null object
PROPERTIES      60 non-null object
dtypes: float64(1), object(4)
memory usage: 2.4+ KB


In [33]:
# Sanity check: count ids containing a literal "(" -- every row should report False
d_exp['CELL_LINE_ID'].str.contains('(', regex=False).value_counts()

False    60
Name: CELL_LINE_ID, dtype: int64

In [36]:
# Persist the prepared frame as the 'cellline-meta' import-stage table for NCI60_v2;
# the call returns the path written (a .pkl file in the output below)
db.save(d_exp, src.NCI60_v2, db.IMPORT, 'cellline-meta')

'/Users/eczech/data/research/mgds/import/nci60_v2_cellline-meta.pkl'