# CGDS Genetic Feature Data Assembly/Prep

In [1]:
%run -m ipy_startup
%run -m ipy_logging
%matplotlib inline
from musc_genomics.data_import import cgds, api, prep
from musc_genomics import data
pd.set_option('max_info_rows', int(1E9))

## Load All Raw Data

In [2]:
d_cn = data.load('materialized', 'cgds_genetic_cn')
d_ge = data.load('materialized', 'cgds_genetic_ge')
d_mu = data.load('materialized', 'cgds_genetic_mu')
d_mm = data.load('materialized', 'cgds_meta_mu')
d_cl = data.load('materialized', 'cgds_clinical')

2016-07-07 15:22:17,156:DEBUG:research.project.manager: Loading saved data from location "/Users/eczech/data/research/musc_genomics/materialized/cgds_genetic_cn.pkl"
2016-07-07 15:22:17,536:DEBUG:research.project.manager: Loading saved data from location "/Users/eczech/data/research/musc_genomics/materialized/cgds_genetic_ge.pkl"
2016-07-07 15:22:17,961:DEBUG:research.project.manager: Loading saved data from location "/Users/eczech/data/research/musc_genomics/materialized/cgds_genetic_mu.pkl"
2016-07-07 15:22:20,854:DEBUG:research.project.manager: Loading saved data from location "/Users/eczech/data/research/musc_genomics/materialized/cgds_meta_mu.pkl"
2016-07-07 15:22:20,946:DEBUG:research.project.manager: Loading saved data from location "/Users/eczech/data/research/musc_genomics/materialized/cgds_clinical.pkl"


## Copy Number Prep

In [3]:
d_cn = prep.prep_raw_cgds_genetic_data(d_cn).rename(columns={'GENE_ID': 'FEATURE'})
d_cn.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20279095 entries, 0 to 20279094
Data columns (total 3 columns):
FEATURE     20279095 non-null object
TUMOR_ID    20279095 non-null object
VALUE       20279095 non-null float64
dtypes: float64(1), object(2)
memory usage: 464.2+ MB


In [4]:
d_cn.head()

Unnamed: 0,FEATURE,TUMOR_ID,VALUE
0,A1BG,1321N1_CENTRAL_NERVOUS_SYSTEM,-0.1544
1,A1BG-AS1,1321N1_CENTRAL_NERVOUS_SYSTEM,-0.1544
2,A1CF,1321N1_CENTRAL_NERVOUS_SYSTEM,-0.0985
3,A2M,1321N1_CENTRAL_NERVOUS_SYSTEM,-0.1819
4,A2ML1,1321N1_CENTRAL_NERVOUS_SYSTEM,-0.1819


In [5]:
d_cn = d_cn.pivot_table(index='TUMOR_ID', columns='FEATURE', values='VALUE')
d_cn.iloc[:5, :10]

FEATURE,A1BG,A1BG-AS1,A1CF,A2M,A2ML1,A4GALT,A4GNT,AAAS,AACS,AACSP1
TUMOR_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
1321N1_CENTRAL_NERVOUS_SYSTEM,-0.1544,-0.1544,-0.0985,-0.1819,-0.1819,-0.1732,0.2575,-0.0404,-0.1284,0.1682
143B_BONE,-0.212,-0.212,0.158,-0.1753,-0.1753,-0.2536,0.206,-0.1445,-0.2757,0.215
22RV1_PROSTATE,-0.08084,-0.08084,-0.0025,0.4486,0.4486,-0.0574,-0.0303,0.4649,0.4406,-0.0518
2313287_STOMACH,-0.0331,-0.0331,-0.05348,-0.1917,-0.1917,-0.0512,-0.04608,-0.01474,-0.0516,-0.0514
42MGBA_CENTRAL_NERVOUS_SYSTEM,0.0322,0.0322,-0.0522,-0.3905,-0.3905,-0.0163,-0.2686,-0.3059,0.0857,0.2794


## Gene Expression Prep

In [6]:
d_ge = prep.prep_raw_cgds_genetic_data(d_ge).rename(columns={'GENE_ID': 'FEATURE'})
d_ge.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 15512614 entries, 0 to 15512613
Data columns (total 3 columns):
FEATURE     15512614 non-null object
TUMOR_ID    15512614 non-null object
VALUE       15512614 non-null float64
dtypes: float64(1), object(2)
memory usage: 355.1+ MB


In [7]:
d_ge.head()

Unnamed: 0,FEATURE,TUMOR_ID,VALUE
0,A1BG,1321N1_CENTRAL_NERVOUS_SYSTEM,-0.1004
1,A1CF,1321N1_CENTRAL_NERVOUS_SYSTEM,-0.4882
2,A2M,1321N1_CENTRAL_NERVOUS_SYSTEM,-0.2826
3,A2M-AS1,1321N1_CENTRAL_NERVOUS_SYSTEM,-0.7492
4,A2ML1,1321N1_CENTRAL_NERVOUS_SYSTEM,0.0017


In [8]:
d_ge = d_ge.pivot_table(index='TUMOR_ID', columns='FEATURE', values='VALUE')
d_ge.iloc[:5, :10]

FEATURE,A1BG,A1CF,A2M,A2M-AS1,A2ML1,A4GALT,A4GNT,AAAS,AACS,AADAC
TUMOR_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
1321N1_CENTRAL_NERVOUS_SYSTEM,-0.1004,-0.4882,-0.2826,-0.7492,0.0017,0.8475,0.0203,0.2452,-1.3409,-0.5405
143B_BONE,-0.4436,-0.3827,0.0548,-0.2432,-0.3059,-0.3778,0.6206,0.3606,-0.755,-0.4748
22RV1_PROSTATE,-0.9152,3.3331,-0.0267,1.5792,-0.441,-1.3142,-0.7317,1.6733,1.9962,-0.5656
2313287_STOMACH,-1.3652,1.927,-0.4622,0.0741,-0.356,0.3141,-0.622,-0.3155,0.043,-0.3193
42MGBA_CENTRAL_NERVOUS_SYSTEM,1.7218,-0.3239,-0.3587,-1.2605,0.2094,-1.2627,-1.3619,-0.1937,-0.3963,-0.5527


## Mutation Data Prep

In [9]:
d_mu = prep.prep_raw_cgds_genetic_data(d_mu)
d_mu.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 53541 entries, 0 to 53540
Data columns (total 3 columns):
GENE_ID     53541 non-null object
TUMOR_ID    53541 non-null object
VALUE       53541 non-null object
dtypes: object(3)
memory usage: 1.2+ MB


In [10]:
d_mu_ft = d_mu.groupby([c for c in d_mu if c != 'VALUE'])['VALUE']\
    .apply(prep.split_mutation_value).reset_index()
d_mu_ft.head()

Unnamed: 0,GENE_ID,TUMOR_ID,MUTATION,VALUE
0,AAK1,22RV1_PROSTATE,R634H,1
1,AAK1,CMLT1_HAEMATOPOIETIC_AND_LYMPHOID_TISSUE,Q545_Q546del,1
2,AAK1,HEC108_ENDOMETRIUM,Q533H,1
3,AAK1,HEC251_ENDOMETRIUM,N713S,1
4,AAK1,HEC59_ENDOMETRIUM,P461S,1


In [11]:
d_mu_all = prep.add_mutation_metadata(d_mu_ft, d_mm)
d_mu_all.head()

Unnamed: 0,GENE_ID,TUMOR_ID,MUTATION,VALUE,MUTATION_TYPE,IMPACT_SCORE
0,AAK1,22RV1_PROSTATE,R634H,1,missense_mutation,low
1,AAK1,CMLT1_HAEMATOPOIETIC_AND_LYMPHOID_TISSUE,Q545_Q546del,1,in_frame_del,unknown
2,AAK1,HEC108_ENDOMETRIUM,Q533H,1,missense_mutation,neutral
3,AAK1,HEC251_ENDOMETRIUM,N713S,1,missense_mutation,low
4,AAK1,HEC59_ENDOMETRIUM,P461S,1,missense_mutation,neutral


In [12]:
d_mu_lng = d_mu_all.groupby(['TUMOR_ID', 'GENE_ID']).apply(prep.get_gene_mutation_features).reset_index()
d_mu_lng.head()

Unnamed: 0,TUMOR_ID,GENE_ID,FEATURE,VALUE
0,22RV1_PROSTATE,AAK1,AAC:AAK1:R634H,1
1,22RV1_PROSTATE,AAK1,TYP:missense_mutation,1
2,22RV1_PROSTATE,AAK1,IMP:low,1
3,22RV1_PROSTATE,ABCC4,AAC:ABCC4:L883Wfs*7,1
4,22RV1_PROSTATE,ABCC4,TYP:frame_shift_del,1


In [13]:
d_mu_mm = d_mu_lng.pivot_table(index='TUMOR_ID', columns='FEATURE', values='VALUE', aggfunc='sum')
d_mu_mm.iloc[:5, :10]

FEATURE,AAC:AAK1:F312Lfs*26,AAC:AAK1:G15D,AAC:AAK1:H189R,AAC:AAK1:N713S,AAC:AAK1:P336T,AAC:AAK1:P461S,AAC:AAK1:P771R,AAC:AAK1:Q533H,AAC:AAK1:Q545_Q546del,AAC:AAK1:Q743*
TUMOR_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
22RV1_PROSTATE,,,,,,,,,,
2313287_STOMACH,,,,,,,,,,
253JBV_URINARY_TRACT,,,,,,,,,,
253J_URINARY_TRACT,,,,,,,,,,
42MGBA_CENTRAL_NERVOUS_SYSTEM,,,,,,,,,,


## Clinical Data Prep

In [34]:
# Pivot all clinical features into rows
cols = ['TUMOR_ID', 'CANCER_TYPE', 'HISTOLOGY', 'PRIMARY_SITE', 'GENDER']
d_cd = d_cl.rename(columns={'CASE_ID': 'TUMOR_ID'})[cols]\
    .set_index('TUMOR_ID')\
    .fillna('unknown')\
    .applymap(lambda x: None if pd.isnull(x) else x.lower())
d_cd.columns.name = 'FEATURE'
d_cd.head()

FEATURE,CANCER_TYPE,HISTOLOGY,PRIMARY_SITE,GENDER
TUMOR_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
HCC78_LUNG,solid,carcinoma,lung,male
COLO800_SKIN,solid,malignant_melanoma,skin,unknown
SKMEL1_SKIN,solid,malignant_melanoma,skin,male
HT115_LARGE_INTESTINE,solid,carcinoma,large_intestine,unknown
ECC12_STOMACH,solid,carcinoma,stomach,unknown


## Merging

In [39]:
d_sparse = pd.concat([
    d_cn.rename(columns=lambda c: 'CN:' + c), 
    d_ge.rename(columns=lambda c: 'GE:' + c), 
    d_mu_mm.rename(columns=lambda c: 'MU:' + c), 
    d_cd.rename(columns=lambda c: 'CL:' + c)
], axis=1)
d_sparse.info()

<class 'pandas.core.frame.DataFrame'>
Index: 1019 entries, 1321N1_CENTRAL_NERVOUS_SYSTEM to ZR7530_BREAST
Columns: 93981 entries, CN:A1BG to CL:GENDER
dtypes: float64(93977), object(4)
memory usage: 730.6+ MB


In [49]:
fill_map = {
    '^MU:AAC:': 0,
    '^MU:TYP:': 'unknown',
    '^MU:IMP:': 'unknown',
    '^CL:': 'unknown'
}
d_fill = d_sparse.copy()
for pattern in fill_map:
    cols = d_sparse.filter(regex=pattern).columns.tolist()
    d_fill[cols] = d_fill[cols].fillna(fill_map[pattern])

In [50]:
d_fill.filter(regex='^MU:TYP:|^MU:IMP:|^CL:').info()

<class 'pandas.core.frame.DataFrame'>
Index: 1019 entries, 1321N1_CENTRAL_NERVOUS_SYSTEM to ZR7530_BREAST
Data columns (total 19 columns):
MU:IMP:high                      1019 non-null object
MU:IMP:low                       1019 non-null object
MU:IMP:med                       1019 non-null object
MU:IMP:neutral                   1019 non-null object
MU:IMP:unknown                   1019 non-null object
MU:TYP:frame_shift_del           1019 non-null object
MU:TYP:frame_shift_ins           1019 non-null object
MU:TYP:in_frame_del              1019 non-null object
MU:TYP:in_frame_ins              1019 non-null object
MU:TYP:missense_mutation         1019 non-null object
MU:TYP:nonsense_mutation         1019 non-null object
MU:TYP:nonstop_mutation          1019 non-null object
MU:TYP:splice_site               1019 non-null object
MU:TYP:targeted_region           1019 non-null object
MU:TYP:translation_start_site    1019 non-null object
CL:CANCER_TYPE                   1019 non-null obje

## Export

In [51]:
data.save('prepared', 'features_raw_sparse', d_sparse)

2016-07-07 15:48:21,638:DEBUG:research.project.manager: Saving data to location "/Users/eczech/data/research/musc_genomics/prepared/features_raw_sparse.pkl"


'/Users/eczech/data/research/musc_genomics/prepared/features_raw_sparse.pkl'

In [52]:
data.save('prepared', 'features_raw_fill', d_fill)

2016-07-07 15:48:24,243:DEBUG:research.project.manager: Saving data to location "/Users/eczech/data/research/musc_genomics/prepared/features_raw_fill.pkl"


'/Users/eczech/data/research/musc_genomics/prepared/features_raw_fill.pkl'

<hr>

## Analysis

In [48]:
pd.Series([c.split(':')[0] for c in d_sparse]).value_counts()

MU    57554
CN    20381
GE    16042
CL        4
dtype: int64

In [47]:
pd.Series([c.split(':')[0] for c in d_full]).value_counts()

MU    57554
CN    20381
GE    16042
CL        4
dtype: int64

In [33]:
d.filter(regex='^MU:IMP').fillna(0).describe()

FEATURE,MU:IMP:high,MU:IMP:low,MU:IMP:med,MU:IMP:neutral,MU:IMP:unknown
count,1019.0,1019.0,1019.0,1019.0,1019.0
mean,2.843965,16.746811,15.954858,10.129539,15.23945
std,6.284996,31.57594,31.483459,17.550738,26.169907
min,0.0,0.0,0.0,0.0,0.0
25%,0.0,6.0,5.0,3.0,5.0
50%,1.0,10.0,9.0,6.0,9.0
75%,3.0,16.0,14.0,10.0,14.0
max,95.0,395.0,430.0,199.0,288.0


In [25]:
d.filter(regex='^MU:TYP').fillna(0).describe()

FEATURE,MU:TYP:frame_shift_del,MU:TYP:frame_shift_ins,MU:TYP:in_frame_del,MU:TYP:in_frame_ins,MU:TYP:missense_mutation,MU:TYP:nonsense_mutation,MU:TYP:nonstop_mutation,MU:TYP:splice_site,MU:TYP:targeted_region,MU:TYP:translation_start_site
count,1019.0,1019.0,1019.0,1019.0,1019.0,1019.0,1019.0,1019.0,1019.0,1019.0
mean,3.77527,1.578999,2.529931,1.027478,46.652601,3.441609,0.037291,1.820412,0.003925,0.047105
std,11.658936,5.586645,2.179525,1.134921,87.110508,10.135103,0.199663,3.303587,0.062561,0.261736
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.0,0.0,1.0,0.0,17.0,0.0,0.0,0.0,0.0,0.0
50%,1.0,1.0,2.0,1.0,28.0,1.0,0.0,1.0,0.0,0.0
75%,2.0,1.0,4.0,2.0,42.5,3.0,0.0,2.0,0.0,0.0
max,104.0,114.0,14.0,9.0,1130.0,181.0,2.0,36.0,1.0,4.0


In [32]:
d.filter(regex='^MU:TYP').fillna(0).loc['EN_ENDOMETRIUM']

FEATURE
MU:TYP:frame_shift_del            41.0
MU:TYP:frame_shift_ins           114.0
MU:TYP:in_frame_del                4.0
MU:TYP:in_frame_ins                1.0
MU:TYP:missense_mutation         431.0
MU:TYP:nonsense_mutation          12.0
MU:TYP:nonstop_mutation            0.0
MU:TYP:splice_site                27.0
MU:TYP:targeted_region             0.0
MU:TYP:translation_start_site      1.0
Name: EN_ENDOMETRIUM, dtype: float64

In [26]:
r = d[d['MU:TYP:frame_shift_ins'] == 114.]
r.iloc[:5, :10]

FEATURE,CN:A1BG,CN:A1BG-AS1,CN:A1CF,CN:A2M,CN:A2ML1,CN:A4GALT,CN:A4GNT,CN:AAAS,CN:AACS,CN:AACSP1
EN_ENDOMETRIUM,0.056,0.056,0.05902,0.0127,0.0127,0.03715,0.0545,0.04603,0.028,-0.00393


In [30]:
d_mu_all[d_mu_all['TUMOR_ID'] == 'EN_ENDOMETRIUM']

Unnamed: 0,GENE_ID,TUMOR_ID,MUTATION,VALUE,MUTATION_TYPE,IMPACT_SCORE
24,AATK,EN_ENDOMETRIUM,V250A,1,missense_mutation,low
61,ABCA3,EN_ENDOMETRIUM,L1584P,1,missense_mutation,med
191,ABCC4,EN_ENDOMETRIUM,G487E,1,missense_mutation,low
312,ABL2,EN_ENDOMETRIUM,G69R,1,missense_mutation,neutral
630,ACVR1B,EN_ENDOMETRIUM,X111_splice,1,splice_site,unknown
654,ACVR1C,EN_ENDOMETRIUM,E235G,1,missense_mutation,high
755,ACVRL1,EN_ENDOMETRIUM,Q343R,1,missense_mutation,neutral
804,ADAM12,EN_ENDOMETRIUM,H122R,1,missense_mutation,low
805,ADAM12,EN_ENDOMETRIUM,Y128C,1,missense_mutation,med
922,ADAM22,EN_ENDOMETRIUM,A388S,1,missense_mutation,neutral
