# TCGA Meta Prep

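This notebook inspects the TCGA BRCA (breast cancer) clinical metadata and prepares a compact per-sample table of outcome fields and resolved ER, PR, HER2 and triple-negative statuses.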

In [58]:
%run -m ipy_startup
from mgds.data_aggregation.import_lib import cgds
from mgds.data_aggregation.import_lib import tcga
from mgds.data_aggregation import database as db
from mgds.data_aggregation import source as src
from mgds.data_aggregation import data_type as dtyp

In [30]:
# Load the clinical table restricted to the 'brca' cohort and index it by CASE_ID
d_tcga_meta = (
    tcga.load_clinical_data(cohorts=['brca'])
    .set_index('CASE_ID')
)
# NOTE(review): disabled preparation step, kept as found -- confirm whether it can be removed
#d_tcga_meta = cgds.prep_clinical_data(d_tcga_meta, keep_cols=['COHORT'])
d_tcga_meta.info()

<class 'pandas.core.frame.DataFrame'>
Index: 1105 entries, TCGA-A7-A3J0-01 to TCGA-AN-A0FD-01
Columns: 108 entries, AGE to COHORT
dtypes: bool(1), float64(19), int64(1), object(87)
memory usage: 933.4+ KB


In [31]:
# Preview the first rows of the clinical table (indexed by CASE_ID)
d_tcga_meta.head()

Unnamed: 0_level_0,AGE,AJCC_METASTASIS_PATHOLOGIC_PM,AJCC_NODES_PATHOLOGIC_PN,AJCC_PATHOLOGIC_TUMOR_STAGE,AJCC_STAGING_EDITION,AJCC_TUMOR_PATHOLOGIC_PT,BRACHYTHERAPY_TOTAL_DOSE_POINT_A,CANCER_TYPE,CANCER_TYPE_DETAILED,CENT17_COPY_NUMBER,...,STAGING_SYSTEM_OTHER,SURGERY_FOR_POSITIVE_MARGINS,SURGERY_FOR_POSITIVE_MARGINS_OTHER,SURGICAL_PROCEDURE_FIRST,TISSUE_SOURCE_SITE,TUMOR_STATUS,TUMOR_TISSUE_SITE,VIAL_NUMBER,VITAL_STATUS,COHORT
CASE_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
TCGA-A7-A3J0-01,62.0,M0,N0,Stage IIA,7th,T2,,Breast Cancer,Breast Invasive Mixed Mucinous Carcinoma,,...,,,,Lumpectomy,A7,TUMOR FREE,Breast,A,Alive,brca
TCGA-OL-A66N-01,59.0,MX,N3,Stage IIIC,7th,T3,,Breast Cancer,Breast Invasive Lobular Carcinoma,,...,,,,Modified Radical Mastectomy,OL,TUMOR FREE,Breast,A,Alive,brca
TCGA-AQ-A0Y5-01,70.0,MX,N2a,Stage IIIA,7th,T2,,Breast Cancer,Breast Invasive Ductal Carcinoma,,...,,,,Modified Radical Mastectomy,AQ,TUMOR FREE,Breast,A,Dead,brca
TCGA-E9-A22H-01,42.0,M0,N1,Stage IIB,7th,T2,,Breast Cancer,Breast Invasive Ductal Carcinoma,,...,,,,Modified Radical Mastectomy,E9,TUMOR FREE,Breast,A,Alive,brca
TCGA-BH-A0EB-01,69.0,M0,N0 (i-),Stage IA,,T1c,H-SCORE 300,Breast Cancer,Breast Invasive Ductal Carcinoma,2.13,...,,,,Other,BH,TUMOR FREE,Breast,A,Alive,brca


In [32]:
# Raw status columns fed to the status resolution step, two sources per marker
status_cols = [
    # ER
    'NTE_ER_STATUS', 'ER_STATUS_BY_IHC',
    # HER2
    'IHC_HER2', 'HER2_FISH_STATUS',
    # PR
    'PR_STATUS_BY_IHC', 'NTE_PR_STATUS_BY_IHC',
]

# Outcome columns carried over as-is:
#   DFS_STATUS   -> Recurrent vs DiseaseFree
#   VITAL_STATUS -> Alive/Dead
meta_cols = ['DFS_STATUS', 'VITAL_STATUS']

In [33]:
# Show the non-null count of each column listed in status_cols
d_tcga_meta[status_cols].info()

<class 'pandas.core.frame.DataFrame'>
Index: 1105 entries, TCGA-A7-A3J0-01 to TCGA-AN-A0FD-01
Data columns (total 6 columns):
NTE_ER_STATUS           13 non-null object
ER_STATUS_BY_IHC        1054 non-null object
IHC_HER2                922 non-null object
HER2_FISH_STATUS        422 non-null object
PR_STATUS_BY_IHC        1053 non-null object
NTE_PR_STATUS_BY_IHC    12 non-null object
dtypes: object(6)
memory usage: 60.4+ KB


In [34]:
# Tally NTE_ER_STATUS values; missing entries are relabelled 'x' so they are counted too
(
    d_tcga_meta['NTE_ER_STATUS']
    .fillna('x')
    .value_counts()
)

x           1092
Positive       9
Negative       4
Name: NTE_ER_STATUS, dtype: int64

In [35]:
# Tally ER_STATUS_BY_IHC values; missing entries are relabelled 'x' so they are counted too
(
    d_tcga_meta['ER_STATUS_BY_IHC']
    .fillna('x')
    .value_counts()
)

Positive         813
Negative         239
x                 51
Indeterminate      2
Name: ER_STATUS_BY_IHC, dtype: int64

In [50]:
def _resolve_status(primary, secondary):
    """Combine two normalized calls for one marker into ``(status, conflict)``.

    The primary call is used whenever it is known; the secondary call is only
    a fallback. ``conflict`` is True only when both calls are known and differ.
    """
    both_known = primary != 'Unknown' and secondary != 'Unknown'
    conflict = both_known and primary != secondary
    status = primary if primary != 'Unknown' else secondary
    return status, conflict


def get_cancer_status(r):
    """ Resolve various status values to ER +/-, PR +/-, HER2 +/-

    Parameters
    ----------
    r : pd.Series
        One row containing the raw status fields 'NTE_ER_STATUS',
        'ER_STATUS_BY_IHC', 'IHC_HER2', 'HER2_FISH_STATUS',
        'PR_STATUS_BY_IHC' and 'NTE_PR_STATUS_BY_IHC'. Missing values,
        'Indeterminate' and 'Equivocal' are all treated as 'Unknown'.

    Returns
    -------
    pd.Series
        'ER_STATUS', 'PR_STATUS', 'HER2_STATUS' ('Positive', 'Negative' or
        'Unknown'), 'TN_STATUS' ('Positive' or 'Negative') and the boolean
        flags 'ER_CONFLICT', 'PR_CONFLICT', 'HER2_CONFLICT'.

    Raises
    ------
    AssertionError
        If any value in the row is not 'Positive', 'Negative' or 'Unknown'
        after normalization; the message lists the values encountered.
    """
    # Collapse missing and inconclusive calls into a single 'Unknown' label
    r = r.fillna('Unknown').replace(['Indeterminate', 'Equivocal'], 'Unknown')

    allowed = ['Positive', 'Negative', 'Unknown']
    assert r.isin(allowed).all(), \
        'Encountered unexpected value: {}'.format(r.unique())

    # For every marker the IHC column is the primary source; the other column
    # (very sparse for ER/PR, about half as populated for HER2) is the fallback
    er, er_conflict = _resolve_status(r['ER_STATUS_BY_IHC'], r['NTE_ER_STATUS'])
    pr, pr_conflict = _resolve_status(r['PR_STATUS_BY_IHC'], r['NTE_PR_STATUS_BY_IHC'])
    her2, her2_conflict = _resolve_status(r['IHC_HER2'], r['HER2_FISH_STATUS'])

    # Triple negative only when all three markers are explicitly 'Negative'.
    # NOTE(review): a row with an 'Unknown' marker and no 'Positive' marker is
    # therefore labelled 'Negative' rather than 'Unknown' -- confirm that this
    # binary labelling is what downstream consumers expect.
    triple_negative = 'Positive' if er == pr == her2 == 'Negative' else 'Negative'

    return pd.Series({
        'HER2_STATUS': her2, 'ER_STATUS': er, 'PR_STATUS': pr, 'TN_STATUS': triple_negative,
        'HER2_CONFLICT': her2_conflict, 'ER_CONFLICT': er_conflict, 'PR_CONFLICT': pr_conflict
    })

In [51]:
# Assemble the prepared frame column-wise: the outcome columns with gaps
# labelled 'Unknown', next to the per-row output of get_cancer_status
d_meta = pd.concat(
    [
        d_tcga_meta[meta_cols].fillna('Unknown'),
        d_tcga_meta[status_cols].apply(get_cancer_status, axis=1),
    ],
    axis=1,
)
d_meta.index.name = 'Sample'
d_meta.head()

Unnamed: 0_level_0,DFS_STATUS,VITAL_STATUS,ER_CONFLICT,ER_STATUS,HER2_CONFLICT,HER2_STATUS,PR_CONFLICT,PR_STATUS,TN_STATUS
Sample,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
TCGA-A7-A3J0-01,DiseaseFree,Alive,False,Positive,False,Negative,False,Positive,Negative
TCGA-OL-A66N-01,DiseaseFree,Alive,False,Positive,False,Negative,False,Negative,Negative
TCGA-AQ-A0Y5-01,Unknown,Dead,False,Positive,False,Positive,False,Positive,Negative
TCGA-E9-A22H-01,DiseaseFree,Alive,False,Positive,False,Positive,False,Positive,Negative
TCGA-BH-A0EB-01,DiseaseFree,Alive,False,Positive,False,Negative,False,Positive,Negative


In [52]:
# Count True/False values in every column whose name contains 'CONFLICT'
d_meta.filter(regex='CONFLICT').apply(lambda flags: flags.value_counts())

Unnamed: 0,ER_CONFLICT,HER2_CONFLICT,PR_CONFLICT
False,1105.0,1090,1104
True,,15,1


In [53]:
# Remove every column whose name contains 'CONFLICT', then re-check the frame
d_meta = d_meta.drop(
    list(d_meta.filter(regex='CONFLICT').columns),
    axis=1,
)
d_meta.info()

<class 'pandas.core.frame.DataFrame'>
Index: 1105 entries, TCGA-A7-A3J0-01 to TCGA-AN-A0FD-01
Data columns (total 6 columns):
DFS_STATUS      1105 non-null object
VITAL_STATUS    1105 non-null object
ER_STATUS       1105 non-null object
HER2_STATUS     1105 non-null object
PR_STATUS       1105 non-null object
TN_STATUS       1105 non-null object
dtypes: object(6)
memory usage: 60.4+ KB


In [54]:
# Value frequencies for each remaining column, aligned into one table
d_meta.apply(lambda col: col.value_counts())

Unnamed: 0,DFS_STATUS,VITAL_STATUS,ER_STATUS,HER2_STATUS,PR_STATUS,TN_STATUS
Alive,,948.0,,,,
Dead,,155.0,,,,
DiseaseFree,896.0,,,,,
Negative,,,239.0,766.0,345.0,944.0
Positive,,,813.0,195.0,704.0,161.0
Recurred/Progressed,112.0,,,,,
Unknown,97.0,2.0,53.0,144.0,56.0,


In [60]:
# The prepared frame must not contain any missing values before it is saved
assert d_meta.notnull().values.all()
# Note that this kind of metadata is very specific to breast cancer, so there's
# no need to make this kind of preparation more generic.
# NOTE(review): the key below says 'cellline' although the frame is built from
# TCGA clinical data -- confirm the name is intended before changing it, since
# anything reading the saved file depends on it.
db.save(d_meta, src.TCGA_v1, db.PREP, 'brca-cellline-meta')

'/Users/eczech/data/research/mgds/prep/tcga_v1_brca-cellline-meta.pkl'