In [1]:
import pandas as pd
from sklearn.linear_model import ElasticNetCV
from sklearn.model_selection import KFold
from numpy.random import RandomState
from numpy import nonzero

In [2]:
##########################################
### Data Load
##########################################
input_directory = '/home/share/heeyeon/csbl/users/heeyeon/tcga_files/combined_dataset/age_methyl/'
raw_df = pd.read_table(input_directory + 'prostate_correlation.tsv', low_memory=False)

In [3]:
##########################################
### Data preprocessing
##########################################

# Drop p-value and r column
extracted_df = raw_df.drop(['p-value', 'r'], axis=1)

# Drop gender row (Drop last row)
extracted_df = extracted_df[:-1]

# Transpose for ElasticNet input form
input_df = extracted_df.transpose()

In [4]:
##########################################
### Machine learning input preparation
##########################################

# Prepare X(features: cg sites) and y(labels: age)
x_training_set = input_df.iloc[1:, :-1].astype('float64').values
y_training_set = input_df.iloc[1:, -1].astype('float64').values
feature_vector = input_df.iloc[0, :-1].values

In [5]:
##########################################
### Learning ElasticNet
##########################################

cv_option = KFold(n_splits=10, shuffle=True)
elastic_net = ElasticNetCV(cv=cv_option, verbose=1, n_jobs=-1, max_iter=100000, 
                           selection='random', random_state=RandomState(None))
elastic_net.fit(x_training_set, y_training_set)

...........................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................[Parallel(n_jobs=-1)]: Done   3 out of  10 | elapsed:    5.6s remaining:   13.1s
............................................................................................................................................................................................................................................................................................

ElasticNetCV(alphas=None, copy_X=True,
       cv=KFold(n_splits=10, random_state=None, shuffle=True), eps=0.001,
       fit_intercept=True, l1_ratio=0.5, max_iter=100000, n_alphas=100,
       n_jobs=-1, normalize=False, positive=False, precompute='auto',
       random_state=<mtrand.RandomState object at 0x7fd71449eb88>,
       selection='random', tol=0.0001, verbose=1)

In [16]:
##########################################
### Print information of total feature
##########################################

print("Total features")
print(feature_vector)
print("Num of total features {}".format(len(feature_vector)))

Total features
['cg00001245' 'cg00003298' 'cg00007466' ..., 'ch.9.75018133F' 'rs10774834'
 'rs348937']
Num of total features 8026


In [15]:
##########################################
### Print information of non-zero feature
##########################################

non_zero_features = feature_vector[nonzero(elastic_net.coef_)]
print("Non-zero features")
print(non_zero_features)
print("Num of non-zero features {}".format(len(non_zero_features)))

Non-zero features
['cg00078513' 'cg00178224' 'cg00228620' 'cg00301879' 'cg00320625'
 'cg00349361' 'cg00384707' 'cg00389619' 'cg00400259' 'cg00403874'
 'cg00437258' 'cg00460763' 'cg00460983' 'cg00477978' 'cg00557402'
 'cg00590039' 'cg00663575' 'cg00907200' 'cg00934746' 'cg00947319'
 'cg01004363' 'cg01066220' 'cg01069808' 'cg01126532' 'cg01228342'
 'cg01379846' 'cg01408932' 'cg01423883' 'cg01425746' 'cg01477379'
 'cg01544213' 'cg01555263' 'cg01569660' 'cg01572513' 'cg01618719'
 'cg01643712' 'cg01646795' 'cg01696605' 'cg01699298' 'cg01720616'
 'cg01758799' 'cg01768144' 'cg01786704' 'cg01792601' 'cg01813335'
 'cg01904776' 'cg01985201' 'cg02077237' 'cg02214414' 'cg02273436'
 'cg02286602' 'cg02394686' 'cg02471658' 'cg02719634' 'cg02757456'
 'cg02905426' 'cg02927042' 'cg03131829' 'cg03134147' 'cg03161190'
 'cg03207593' 'cg03650119' 'cg03719428' 'cg03750315' 'cg03771436'
 'cg03866831' 'cg03904042' 'cg04042248' 'cg04074004' 'cg04074536'
 'cg04167075' 'cg04183933' 'cg04303033' 'cg04524088' 'cg04