# CGDS Genetic Feature Data Assembly/Prep

In [1]:
%run -m ipy_startup
%run -m ipy_logging
%matplotlib inline
from musc_genomics.data_import import cgds, api, prep
from musc_genomics import data
pd.set_option('max_info_rows', int(1E9))

## Load All Raw Data

In [2]:
# Load every materialized CGDS dataset in a single pass, in this order:
# copy number, gene expression, mutations, mutation metadata, clinical data
d_cn, d_ge, d_mu, d_mm, d_cl = [
    data.load('materialized', dataset_name)
    for dataset_name in (
        'cgds_genetic_cn',
        'cgds_genetic_ge',
        'cgds_genetic_mu',
        'cgds_meta_mu',
        'cgds_clinical',
    )
]

2016-07-19 14:41:31,210:DEBUG:research.project.manager: Loading saved data from location "/Users/eczech/data/research/musc_genomics/materialized/cgds_genetic_cn.pkl"
2016-07-19 14:41:31,803:DEBUG:research.project.manager: Loading saved data from location "/Users/eczech/data/research/musc_genomics/materialized/cgds_genetic_ge.pkl"
2016-07-19 14:41:32,403:DEBUG:research.project.manager: Loading saved data from location "/Users/eczech/data/research/musc_genomics/materialized/cgds_genetic_mu.pkl"
2016-07-19 14:41:35,375:DEBUG:research.project.manager: Loading saved data from location "/Users/eczech/data/research/musc_genomics/materialized/cgds_meta_mu.pkl"
2016-07-19 14:41:35,472:DEBUG:research.project.manager: Loading saved data from location "/Users/eczech/data/research/musc_genomics/materialized/cgds_clinical.pkl"


## Copy Number Prep

In [3]:
# Prepare the raw copy number data and relabel the gene column as FEATURE, the
# column name the pivot in the following cell expects.
# NOTE: d_cn is rebound here, so re-run the load cell before re-running this one.
d_cn = (
    prep.prep_raw_cgds_genetic_data(d_cn)
    .rename(columns={'GENE_ID': 'FEATURE'})
)
d_cn.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20279095 entries, 0 to 20279094
Data columns (total 3 columns):
FEATURE     20279095 non-null object
TUMOR_ID    20279095 non-null object
VALUE       20279095 non-null float64
dtypes: float64(1), object(2)
memory usage: 464.2+ MB


In [4]:
# Preview the long-format copy number data (FEATURE, TUMOR_ID, VALUE columns)
d_cn.head()

Unnamed: 0,FEATURE,TUMOR_ID,VALUE
0,A1BG,1321N1_CENTRAL_NERVOUS_SYSTEM,-0.1544
1,A1BG-AS1,1321N1_CENTRAL_NERVOUS_SYSTEM,-0.1544
2,A1CF,1321N1_CENTRAL_NERVOUS_SYSTEM,-0.0985
3,A2M,1321N1_CENTRAL_NERVOUS_SYSTEM,-0.1819
4,A2ML1,1321N1_CENTRAL_NERVOUS_SYSTEM,-0.1819


In [5]:
# Reshape to wide format: one row per tumor, one column per gene.
# pivot_table aggregates with the mean by default, so any repeated
# (TUMOR_ID, FEATURE) pair would be averaged rather than raising an error.
d_cn = d_cn.pivot_table(
    values='VALUE',
    index='TUMOR_ID',
    columns='FEATURE',
)
d_cn.iloc[:5, :10]

FEATURE,A1BG,A1BG-AS1,A1CF,A2M,A2ML1,A4GALT,A4GNT,AAAS,AACS,AACSP1
TUMOR_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
1321N1_CENTRAL_NERVOUS_SYSTEM,-0.1544,-0.1544,-0.0985,-0.1819,-0.1819,-0.1732,0.2575,-0.0404,-0.1284,0.1682
143B_BONE,-0.212,-0.212,0.158,-0.1753,-0.1753,-0.2536,0.206,-0.1445,-0.2757,0.215
22RV1_PROSTATE,-0.08084,-0.08084,-0.0025,0.4486,0.4486,-0.0574,-0.0303,0.4649,0.4406,-0.0518
2313287_STOMACH,-0.0331,-0.0331,-0.05348,-0.1917,-0.1917,-0.0512,-0.04608,-0.01474,-0.0516,-0.0514
42MGBA_CENTRAL_NERVOUS_SYSTEM,0.0322,0.0322,-0.0522,-0.3905,-0.3905,-0.0163,-0.2686,-0.3059,0.0857,0.2794


## Gene Expression Prep

In [6]:
# Prepare the raw gene expression data and relabel the gene column as FEATURE, the
# column name the pivot in the following cell expects.
# NOTE: d_ge is rebound here, so re-run the load cell before re-running this one.
d_ge = (
    prep.prep_raw_cgds_genetic_data(d_ge)
    .rename(columns={'GENE_ID': 'FEATURE'})
)
d_ge.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 15512614 entries, 0 to 15512613
Data columns (total 3 columns):
FEATURE     15512614 non-null object
TUMOR_ID    15512614 non-null object
VALUE       15512614 non-null float64
dtypes: float64(1), object(2)
memory usage: 355.1+ MB


In [7]:
# Preview the long-format gene expression data (FEATURE, TUMOR_ID, VALUE columns)
d_ge.head()

Unnamed: 0,FEATURE,TUMOR_ID,VALUE
0,A1BG,1321N1_CENTRAL_NERVOUS_SYSTEM,-0.1004
1,A1CF,1321N1_CENTRAL_NERVOUS_SYSTEM,-0.4882
2,A2M,1321N1_CENTRAL_NERVOUS_SYSTEM,-0.2826
3,A2M-AS1,1321N1_CENTRAL_NERVOUS_SYSTEM,-0.7492
4,A2ML1,1321N1_CENTRAL_NERVOUS_SYSTEM,0.0017


In [8]:
# Reshape to wide format: one row per tumor, one column per gene.
# pivot_table aggregates with the mean by default, so any repeated
# (TUMOR_ID, FEATURE) pair would be averaged rather than raising an error.
d_ge = d_ge.pivot_table(
    values='VALUE',
    index='TUMOR_ID',
    columns='FEATURE',
)
d_ge.iloc[:5, :10]

FEATURE,A1BG,A1CF,A2M,A2M-AS1,A2ML1,A4GALT,A4GNT,AAAS,AACS,AADAC
TUMOR_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
1321N1_CENTRAL_NERVOUS_SYSTEM,-0.1004,-0.4882,-0.2826,-0.7492,0.0017,0.8475,0.0203,0.2452,-1.3409,-0.5405
143B_BONE,-0.4436,-0.3827,0.0548,-0.2432,-0.3059,-0.3778,0.6206,0.3606,-0.755,-0.4748
22RV1_PROSTATE,-0.9152,3.3331,-0.0267,1.5792,-0.441,-1.3142,-0.7317,1.6733,1.9962,-0.5656
2313287_STOMACH,-1.3652,1.927,-0.4622,0.0741,-0.356,0.3141,-0.622,-0.3155,0.043,-0.3193
42MGBA_CENTRAL_NERVOUS_SYSTEM,1.7218,-0.3239,-0.3587,-1.2605,0.2094,-1.2627,-1.3619,-0.1937,-0.3963,-0.5527


## Mutation Data Prep

In [9]:
# Run the mutation data through the same prep step as the copy number and expression data.
# Unlike those, the gene column keeps the name GENE_ID here, and VALUE has object dtype
# (see the info output below) rather than float.
d_mu = prep.prep_raw_cgds_genetic_data(d_mu)
d_mu.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 53541 entries, 0 to 53540
Data columns (total 3 columns):
GENE_ID     53541 non-null object
TUMOR_ID    53541 non-null object
VALUE       53541 non-null object
dtypes: object(3)
memory usage: 1.2+ MB


In [10]:
# Group on every column except VALUE and hand each group's VALUE entries to
# prep.split_mutation_value; the displayed result carries an extra MUTATION column
id_cols = [col for col in d_mu if col != 'VALUE']
d_mu_ft = (
    d_mu
    .groupby(id_cols)['VALUE']
    .apply(prep.split_mutation_value)
    .reset_index()
)
d_mu_ft.head()

Unnamed: 0,GENE_ID,TUMOR_ID,MUTATION,VALUE
0,AAK1,22RV1_PROSTATE,R634H,1
1,AAK1,CMLT1_HAEMATOPOIETIC_AND_LYMPHOID_TISSUE,Q545_Q546del,1
2,AAK1,HEC108_ENDOMETRIUM,Q533H,1
3,AAK1,HEC251_ENDOMETRIUM,N713S,1
4,AAK1,HEC59_ENDOMETRIUM,P461S,1


In [11]:
# Combine the split mutation rows with the mutation metadata (d_mm); the displayed result
# gains MUTATION_TYPE and IMPACT_SCORE columns
d_mu_all = prep.add_mutation_metadata(d_mu_ft, d_mm)
d_mu_all.head()

Unnamed: 0,GENE_ID,TUMOR_ID,MUTATION,VALUE,MUTATION_TYPE,IMPACT_SCORE
0,AAK1,22RV1_PROSTATE,R634H,1,missense_mutation,low
1,AAK1,CMLT1_HAEMATOPOIETIC_AND_LYMPHOID_TISSUE,Q545_Q546del,1,in_frame_del,unknown
2,AAK1,HEC108_ENDOMETRIUM,Q533H,1,missense_mutation,neutral
3,AAK1,HEC251_ENDOMETRIUM,N713S,1,missense_mutation,low
4,AAK1,HEC59_ENDOMETRIUM,P461S,1,missense_mutation,neutral


In [12]:
# Build long-format mutation features for each (tumor, gene) pair; the output below
# shows feature names prefixed with AAC:, TYP: and IMP:
d_mu_lng = (
    d_mu_all
    .groupby(['TUMOR_ID', 'GENE_ID'])
    .apply(prep.get_gene_mutation_features)
    .reset_index()
)
d_mu_lng.head()

Unnamed: 0,TUMOR_ID,GENE_ID,FEATURE,VALUE
0,22RV1_PROSTATE,AAK1,AAC:AAK1:R634H,1
1,22RV1_PROSTATE,AAK1,TYP:AAK1:missense_mutation,1
2,22RV1_PROSTATE,AAK1,IMP:AAK1:low,1
3,22RV1_PROSTATE,ABCC4,AAC:ABCC4:L883Wfs*7,1
4,22RV1_PROSTATE,ABCC4,TYP:ABCC4:frame_shift_del,1


In [13]:
# Reshape to wide format: one row per tumor, one column per mutation feature, summing
# VALUE over repeated (TUMOR_ID, FEATURE) pairs. Tumor/feature combinations with no
# rows come out as NaN (not 0), as the preview below shows.
d_mu_mm = d_mu_lng.pivot_table(
    values='VALUE',
    index='TUMOR_ID',
    columns='FEATURE',
    aggfunc='sum',
)
d_mu_mm.iloc[:5, :10]

FEATURE,AAC:AAK1:F312Lfs*26,AAC:AAK1:G15D,AAC:AAK1:H189R,AAC:AAK1:N713S,AAC:AAK1:P336T,AAC:AAK1:P461S,AAC:AAK1:P771R,AAC:AAK1:Q533H,AAC:AAK1:Q545_Q546del,AAC:AAK1:Q743*
TUMOR_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
22RV1_PROSTATE,,,,,,,,,,
2313287_STOMACH,,,,,,,,,,
253JBV_URINARY_TRACT,,,,,,,,,,
253J_URINARY_TRACT,,,,,,,,,,
42MGBA_CENTRAL_NERVOUS_SYSTEM,,,,,,,,,,


## Clinical Data Prep

In [14]:
# Preview the raw clinical data (one row per CASE_ID)
d_cl.head()

Unnamed: 0,CASE_ID,CANCER_TYPE,DATA_SOURCE,GENDER,HISTOLOGICAL_SUBTYPE,HISTOLOGY,PRIMARY_SITE,TUMOR_TYPE
0,HCC78_LUNG,solid,DSMZ,Male,adenocarcinoma,carcinoma,lung,lung_NSC
1,COLO800_SKIN,solid,DSMZ,,,malignant_melanoma,skin,melanoma
2,SKMEL1_SKIN,solid,ATCC,Male,,malignant_melanoma,skin,melanoma
3,HT115_LARGE_INTESTINE,solid,ECACC,,,carcinoma,large_intestine,colorectal
4,ECC12_STOMACH,solid,RIKEN,,small_cell_adenocarcinoma,carcinoma,stomach,stomach


In [15]:
# Work on a copy so the raw clinical frame (d_cl) is left untouched
d_cd = d_cl.copy()

# Derive the primary site encoded in the case/tumor ID: everything after the first
# underscore, lower-cased (e.g. 'HT115_LARGE_INTESTINE' -> 'large_intestine')
d_cd['PRIMARY_SITE_ID'] = d_cd['CASE_ID'].str.split('_').str[1:].str.join('_').str.lower()

# Verify that when present, the primary site value from the clinical data is always equal to
# the site name in the case/tumor ID
d_site = d_cd[d_cd['PRIMARY_SITE'].notnull()]
site_matches = d_site['PRIMARY_SITE'] == d_site['PRIMARY_SITE_ID']
assert site_matches.all(), \
    'Clinical primary site differs from the site in the case ID for: {}'\
    .format(d_site.loc[~site_matches, 'CASE_ID'].tolist())

# Fill in missing primary site values using the site derived from the case/tumor ID
d_cd['PRIMARY_SITE'] = d_cd['PRIMARY_SITE'].fillna(d_cd['PRIMARY_SITE_ID'])
d_cd = d_cd.drop('PRIMARY_SITE_ID', axis=1)

# Keep the selected clinical features as columns indexed by tumor ID, label missing
# values as 'unknown' and lower-case every value. No null check is needed in the
# lower-casing step because fillna has already replaced all nulls.
cols = ['TUMOR_ID', 'CANCER_TYPE', 'HISTOLOGY', 'PRIMARY_SITE', 'GENDER']
d_cd = d_cd.rename(columns={'CASE_ID': 'TUMOR_ID'})[cols]\
    .set_index('TUMOR_ID')\
    .fillna('unknown')\
    .applymap(lambda x: x.lower())
# Name the column axis like the other wide frames so it survives the later concat
d_cd.columns.name = 'FEATURE'

d_cd.head()

FEATURE,CANCER_TYPE,HISTOLOGY,PRIMARY_SITE,GENDER
TUMOR_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
HCC78_LUNG,solid,carcinoma,lung,male
COLO800_SKIN,solid,malignant_melanoma,skin,unknown
SKMEL1_SKIN,solid,malignant_melanoma,skin,male
HT115_LARGE_INTESTINE,solid,carcinoma,large_intestine,unknown
ECC12_STOMACH,solid,carcinoma,stomach,unknown


## Merging

In [25]:
# Place the four wide frames side by side, aligned on the full tumor ID; each column
# gets a source prefix (CN/GE/MU/CL) so feature names stay distinguishable
prefixed_frames = [
    d_cn.add_prefix('CN:'),
    d_ge.add_prefix('GE:'),
    d_mu_mm.add_prefix('MU:'),
    d_cd.add_prefix('CL:'),
]
d_exp = pd.concat(prefixed_frames, axis=1)

# Known tumor IDs that would collide once the primary site suffix is removed
dupe_id = {'TT_OESOPHAGUS': 'TTOESOPH_OESOPHAGUS', 'TT_THYROID': 'TTTHYR_THYROID'}

# Apply the renames above, then keep only the part of the ID before the first
# underscore (i.e. drop the primary site)
d_exp.index = [
    dupe_id.get(full_id, full_id).split('_')[0]
    for full_id in d_exp.index.values
]
d_exp.index.name = 'TUMOR_ID'

# Fail loudly if shortening the IDs produced any collisions
idx_dupe = d_exp.index[d_exp.index.duplicated()]
assert idx_dupe.size == 0, 'Found duplicate tumor IDs in index: {}'.format(idx_dupe)

d_exp.info()

<class 'pandas.core.frame.DataFrame'>
Index: 1019 entries, 1321N1 to ZR7530
Columns: 107376 entries, CN:A1BG to CL:GENDER
dtypes: float64(107372), object(4)
memory usage: 834.8+ MB


In [26]:
# Spot-check the merge: non-null counts for the BRAF mutation type/impact columns and all clinical columns
d_exp.filter(regex='^MU:TYP:BRAF|^MU:IMP:BRAF|^CL:').info()

<class 'pandas.core.frame.DataFrame'>
Index: 1019 entries, 1321N1 to ZR7530
Data columns (total 13 columns):
MU:IMP:BRAF:high                 8 non-null float64
MU:IMP:BRAF:low                  73 non-null float64
MU:IMP:BRAF:med                  4 non-null float64
MU:IMP:BRAF:neutral              6 non-null float64
MU:IMP:BRAF:unknown              10 non-null float64
MU:TYP:BRAF:frame_shift_del      2 non-null float64
MU:TYP:BRAF:in_frame_ins         1 non-null float64
MU:TYP:BRAF:missense_mutation    93 non-null float64
MU:TYP:BRAF:splice_site          2 non-null float64
CL:CANCER_TYPE                   1019 non-null object
CL:HISTOLOGY                     1019 non-null object
CL:PRIMARY_SITE                  1019 non-null object
CL:GENDER                        1019 non-null object
dtypes: float64(9), object(4)
memory usage: 111.5+ KB


## Export

In [29]:
# Persist the merged feature matrix as 'features_raw' under 'prepared'
# (the log below shows it being written as a .pkl file)
data.save('prepared', 'features_raw', d_exp)

2016-07-19 14:57:11,017:DEBUG:research.project.manager: Saving data to location "/Users/eczech/data/research/musc_genomics/prepared/features_raw.pkl"


'/Users/eczech/data/research/musc_genomics/prepared/features_raw.pkl'

<hr>

## Analysis

In [30]:
# Count columns per data source, using the prefix before the first ':' in each column name
source_prefixes = [col.split(':', 1)[0] for col in d_exp]
pd.Series(source_prefixes).value_counts()

MU    70949
CN    20381
GE    16042
CL        4
dtype: int64

In [31]:
# Keep tumors with at least one BRAF mutation-type flag, then list the non-null
# BRAF and clinical features for a single example tumor
has_braf_mutation = d_exp.filter(regex='MU:TYP:BRAF').sum(axis=1) > 0
dt = d_exp[has_braf_mutation]
dt.loc['OUMS23'].filter(regex='CL:|:BRAF:').dropna()

FEATURE
MU:AAC:BRAF:V600E                              1
MU:AAC:BRAF:V600E,X287_splice                  1
MU:AAC:BRAF:X287_splice                        1
MU:IMP:BRAF:low                                1
MU:IMP:BRAF:unknown                            1
MU:TYP:BRAF:missense_mutation                  1
MU:TYP:BRAF:splice_site                        1
CL:CANCER_TYPE                             solid
CL:HISTOLOGY                           carcinoma
CL:PRIMARY_SITE                  large_intestine
CL:GENDER                                unknown
Name: OUMS23, dtype: object

In [52]:
# Trace the example tumor back to the prepared (un-split) mutation data for BRAF
is_braf = d_mu['GENE_ID'] == 'BRAF'
dt = d_mu[is_braf]
dt[dt['TUMOR_ID'] == 'OUMS23_LARGE_INTESTINE']

Unnamed: 0,GENE_ID,TUMOR_ID,VALUE
36904,BRAF,OUMS23_LARGE_INTESTINE,"V600E,X287_splice"


In [55]:
# Non-null BRAF features for the example tumor in the pivoted mutation matrix.
# NOTE(review): the output lists the combined value 'AAC:BRAF:V600E,X287_splice' as a
# feature of its own next to the two individual mutations - confirm this is intended.
d_mu_mm.loc['OUMS23_LARGE_INTESTINE'].filter(regex='BRAF').dropna()

FEATURE
AAC:BRAF:V600E                1.0
AAC:BRAF:V600E,X287_splice    1.0
AAC:BRAF:X287_splice          1.0
IMP:BRAF:low                  1.0
IMP:BRAF:unknown              1.0
TYP:BRAF:missense_mutation    1.0
TYP:BRAF:splice_site          1.0
Name: OUMS23_LARGE_INTESTINE, dtype: float64

In [69]:
# Cross-check against the source: fetch the BRAF mutation records via the cgds module
# (already imported in the first cell, so no re-import here) and show the rows for the
# example tumor. NOTE(review): presumably this issues a live CGDS API request - confirm.
dt = cgds.get_mutation_data(cgds.CCLE_CASE_LIST_ID, cgds.GEN_PROF_MUTATION, ['BRAF'], gene_id_batch_size=500)
dt[dt['case_id'] == 'OUMS23_LARGE_INTESTINE']

2016-07-16 06:13:29,179:INFO:musc_genomics.data_import.cgds: Processing batch 1 of 1


Unnamed: 0,entrez_gene_id,gene_symbol,case_id,sequencing_center,mutation_status,mutation_type,validation_status,amino_acid_change,functional_impact_score,xvar_link,...,chr,start_position,end_position,reference_allele,variant_allele,reference_read_count_tumor,variant_read_count_tumor,reference_read_count_normal,variant_read_count_normal,genetic_profile_id
80,673,BRAF,OUMS23_LARGE_INTESTINE,broad.mit.edu,,Missense_Mutation,,V600E,L,"getma.org/?cm=var&var=hg19,7,140453136,A,T&fts...",...,7,140453136,140453136,A,T,92,27,,,cellline_ccle_broad_mutations
96,673,BRAF,OUMS23_LARGE_INTESTINE,broad.mit.edu,,Splice_Site,,X287_splice,,,...,7,140500282,140500282,C,T,61,7,,,cellline_ccle_broad_mutations
