In [1]:
%run -m ipy_startup
%run -m ipy_logging
%matplotlib inline
from musc_genomics import data
pd.set_option('max_info_rows', 10000)

In [2]:
# Load the previously saved raw merged modeling dataset
# ('modeling_data_merge_raw') from the 'prepared' data location.
d_raw = data.load('prepared', 'modeling_data_merge_raw')

2016-07-19 15:01:08,237:DEBUG:research.project.manager: Loading saved data from location "/Users/eczech/data/research/musc_genomics/prepared/modeling_data_merge_raw.pkl"


In [3]:
# Summarize (dtype and non-null count) only the columns whose names start
# with 'RES:' or 'CL:'; all other columns are left out of the report.
d_raw.filter(regex='^RES:|^CL:').info()

<class 'pandas.core.frame.DataFrame'>
Index: 416 entries, 22RV1 to ZR7530
Data columns (total 12 columns):
CL:CANCER_TYPE        416 non-null object
CL:HISTOLOGY          416 non-null object
CL:PRIMARY_SITE       416 non-null object
CL:GENDER             416 non-null object
RES:CANCERTYPE        416 non-null object
RES:TISSUE            416 non-null object
RES:VAL:ABT-263       411 non-null float64
RES:VAL:AG-014699     409 non-null float64
RES:VAL:NUTLIN-3A     411 non-null float64
RES:VAL:PD-0332991    389 non-null float64
RES:VAL:PLX4720       411 non-null float64
RES:VAL:SB590885      396 non-null float64
dtypes: float64(6), object(6)
memory usage: 42.2+ KB


In [4]:
# Preview the first rows of the same 'RES:' / 'CL:' column subset.
d_raw.filter(regex='^RES:|^CL:').head()

FEATURE,CL:CANCER_TYPE,CL:HISTOLOGY,CL:PRIMARY_SITE,CL:GENDER,RES:CANCERTYPE,RES:TISSUE,RES:VAL:ABT-263,RES:VAL:AG-014699,RES:VAL:NUTLIN-3A,RES:VAL:PD-0332991,RES:VAL:PLX4720,RES:VAL:SB590885
TUMOR_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
22RV1,solid,carcinoma,prostate,male,urogenital_system,prostate,5.010086,2.676501,2.549906,0.517856,6.697773,4.639378
2313287,solid,carcinoma,stomach,male,digestive_system,stomach,3.649499,6.000943,3.039341,3.912658,6.753041,6.373648
5637,solid,carcinoma,urinary_tract,male,urogenital_system,bladder,1.283993,2.897053,6.412843,5.88713,2.998122,5.108025
639V,solid,carcinoma,urinary_tract,male,urogenital_system,bladder,3.378504,5.784257,3.779543,3.054728,6.816524,6.274532
647V,solid,carcinoma,urinary_tract,male,urogenital_system,bladder,4.51045,3.973549,6.739784,5.881208,6.842532,6.625275


In [5]:
# Count how many rows carry each distinct 'CL:HISTOLOGY' value.
d_raw['CL:HISTOLOGY'].value_counts()

carcinoma                                                     264
lymphoid_neoplasm                                              37
malignant_melanoma                                             30
glioma                                                         25
haematopoietic_neoplasm                                        21
neuroblastoma                                                   9
mesothelioma                                                    6
unknown                                                         4
osteosarcoma                                                    4
rhabdomyosarcoma                                                3
ewings_sarcoma-peripheral_primitive_neuroectodermal_tumour      2
sarcoma                                                         2
rhabdoid_tumour                                                 2
primitive_neuroectodermal_tumour-medulloblastoma                2
fibrosarcoma                                                    1
other     

In [6]:
# Identifier / metadata columns that will make up the row index
c_idx = ['TUMOR_ID', 'RES:CANCERTYPE', 'RES:TISSUE']

# Turn the current index back into ordinary columns so that it can be
# combined with the other metadata columns
d = d_raw.reset_index()

# Every index component must be populated before it is moved into the
# index (checked here as a single reduction over the boolean mask)
assert d.loc[:, c_idx].notnull().values.all()

# Re-index the frame by the metadata columns
d = d.set_index(keys=c_idx)

# Show the top-left corner of the result
d.iloc[0:5, 0:5]

Unnamed: 0_level_0,Unnamed: 1_level_0,FEATURE,CN:A1BG,CN:A1BG-AS1,CN:A1CF,CN:A2M,CN:A2ML1
TUMOR_ID,RES:CANCERTYPE,RES:TISSUE,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
22RV1,urogenital_system,prostate,-0.08084,-0.08084,-0.0025,0.4486,0.4486
2313287,digestive_system,stomach,-0.0331,-0.0331,-0.05348,-0.1917,-0.1917
5637,urogenital_system,bladder,-0.03514,-0.03514,-0.4935,-0.0657,-0.0657
639V,urogenital_system,bladder,0.0828,0.0828,0.059,0.3126,0.3126
647V,urogenital_system,bladder,0.2913,0.2913,-0.323,0.1603,0.1603


## Minimally Encoded Feature Set

In [7]:
from sklearn.preprocessing import LabelEncoder

# Encode on a copy so that the indexed frame `d` stays unchanged
d_encode_min = d.copy()

# Non-numeric columns that are replaced by label codes (stored as floats)
c_encode = ['CL:PRIMARY_SITE', 'CL:CANCER_TYPE', 'CL:HISTOLOGY', 'CL:GENDER']

# Fitted encoder per column name, kept so the codes can be related back to
# the original category values
encoders = {}
for c in c_encode:
    raw_values = d_encode_min[c]
    assert np.all(raw_values.notnull()), 'Categorical variable "{}" has null values'.format(c)

    fitted = LabelEncoder().fit(raw_values.values)
    encoders[c] = fitted
    d_encode_min[c] = fitted.transform(raw_values).astype(np.float64)

# Attach the encoders to the frame as a plain Python attribute.
# NOTE(review): this is an instance attribute, not a column; it may not
# survive DataFrame.copy() or serialization -- confirm that data.save keeps it.
d_encode_min.label_encoders = encoders

# After encoding, every column is expected to be float64
assert np.all(d_encode_min.dtypes == np.float64), 'Some features are still not floating point values'

In [8]:
# Summary statistics for the label-encoded 'CL:' columns, transposed so
# that each feature is shown as one row.
d_encode_min.filter(regex='^CL:').describe().T

Unnamed: 0_level_0,count,mean,std,min,25%,50%,75%,max
FEATURE,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
CL:CANCER_TYPE,416.0,0.870192,0.364015,0.0,1.0,1.0,1.0,2.0
CL:HISTOLOGY,416.0,3.487981,3.928867,0.0,1.0,1.0,5.0,18.0
CL:PRIMARY_SITE,416.0,9.1875,5.865023,0.0,5.0,9.0,12.0,22.0
CL:GENDER,416.0,0.704327,0.633659,0.0,0.0,1.0,1.0,2.0


In [9]:
# Persist the minimally encoded frame as 'encode_minimal' under the
# 'features' data location.
data.save('features', 'encode_minimal', d_encode_min)

2016-07-19 15:01:18,820:DEBUG:research.project.manager: Saving data to location "/Users/eczech/data/research/musc_genomics/features/encode_minimal.pkl"


'/Users/eczech/data/research/musc_genomics/features/encode_minimal.pkl'

## Maximally Encoded Feature Set

In [10]:
d_encode_max = d.copy()

# For each of the following non-numeric values, apply dummy (one-hot) encoding
c_encode = ['CL:PRIMARY_SITE', 'CL:CANCER_TYPE', 'CL:HISTOLOGY', 'CL:GENDER']

# Indicator frames are collected first and concatenated once at the end,
# instead of rebuilding the full (wide) frame on every iteration
dummy_frames = []
for c in c_encode:
    assert np.all(d_encode_max[c].notnull()), 'Categorical variable "{}" has null values'.format(c)

    # Newer pandas releases return integer or boolean indicator columns from
    # get_dummies; cast explicitly so the float64 check below holds regardless
    # of the installed pandas version
    dummy_frames.append(
        pd.get_dummies(d_encode_max[c], prefix_sep=':', prefix=c).astype(np.float64)
    )

# NOTE: the '<column>:unknown' indicator columns are kept; whether they
# should be dropped is still an open question.

# Replace the categorical columns with their indicator columns. The resulting
# column order is: all remaining features, followed by the indicators for
# each entry of c_encode in list order.
d_encode_max = pd.concat(
    [d_encode_max.drop(c_encode, axis=1)] + dummy_frames,
    axis=1
)

# Ensure that all features have been encoded in some numeric form at this point
assert np.all(d_encode_max.dtypes == np.float64), 'Some features are still not floating point values'

In [11]:
# Show the first three rows and first five columns of the dummy-encoded frame.
d_encode_max.iloc[:3, :5]

Unnamed: 0_level_0,Unnamed: 1_level_0,FEATURE,CN:A1BG,CN:A1BG-AS1,CN:A1CF,CN:A2M,CN:A2ML1
TUMOR_ID,RES:CANCERTYPE,RES:TISSUE,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
22RV1,urogenital_system,prostate,-0.08084,-0.08084,-0.0025,0.4486,0.4486
2313287,digestive_system,stomach,-0.0331,-0.0331,-0.05348,-0.1917,-0.1917
5637,urogenital_system,bladder,-0.03514,-0.03514,-0.4935,-0.0657,-0.0657


In [13]:
# Summary statistics for the 'CL:' indicator columns (one row per column
# after the transpose); only the first few rows are displayed.
d_encode_max.filter(regex='^CL:').describe().T.head()

Unnamed: 0_level_0,count,mean,std,min,25%,50%,75%,max
FEATURE,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
CL:PRIMARY_SITE:autonomic_ganglia,416.0,0.021635,0.145662,0.0,0.0,0.0,0.0,1.0
CL:PRIMARY_SITE:bone,416.0,0.014423,0.11937,0.0,0.0,0.0,0.0,1.0
CL:PRIMARY_SITE:breast,416.0,0.081731,0.274284,0.0,0.0,0.0,0.0,1.0
CL:PRIMARY_SITE:central_nervous_system,416.0,0.064904,0.246653,0.0,0.0,0.0,0.0,1.0
CL:PRIMARY_SITE:endometrium,416.0,0.021635,0.145662,0.0,0.0,0.0,0.0,1.0


In [14]:
# Persist the dummy-encoded frame as 'encode_maximal' under the
# 'features' data location.
data.save('features', 'encode_maximal', d_encode_max)

2016-07-19 15:01:31,819:DEBUG:research.project.manager: Saving data to location "/Users/eczech/data/research/musc_genomics/features/encode_maximal.pkl"


'/Users/eczech/data/research/musc_genomics/features/encode_maximal.pkl'