In [1]:
# One-time setup: the `import_ipynb` package imported in the next cell must be installed first.
# %pip install import-ipynb

In [5]:
import import_ipynb  # presumably what makes the notebook import below possible; keep it first

# NOTE(review): this star import is the only visible source of `pd`, `X`, `y` and `df1`,
# which later cells use without defining them — presumably they all come from Data_preprocessing; confirm.
from Data_preprocessing import *

In [19]:
# Lift every pandas display limit so frames are printed in full (no row/column/width truncation).
for option_name in ('display.max_rows',
                    'display.max_columns',
                    'display.width',
                    'display.max_colwidth'):
    pd.set_option(option_name, None)

from RENT2 import RENT, stability
import warnings
# NOTE(review): with no category given this ignores every warning in all later cells,
# which can hide real problems; consider narrowing it to the specific warnings that are expected.
warnings.filterwarnings("ignore")

from sklearn.model_selection import RepeatedKFold

import numpy as np
from sklearn.metrics import r2_score, mean_squared_error
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression
import seaborn as sns

# Feature selection based on the second approach

As described in chapter 5.2.3 of the thesis, RENT selects features following the second approach; the results of that selection are reported in this notebook.

## First block

### Including the two features RESPONS1 and TIMETOTRM1

In [33]:
# Load the "Form 3" register export (semicolon-separated CSV).
# It supplies the RESPONS1 and DATETRM1 columns used in the cells below.
df3 = pd.read_csv(
    'NNTG_NEC-retrospective register_LIVE_enc_Form 3.csv',
    sep=';',
)

In [34]:
# Parse the two date columns needed to derive TIMETOTRM1 (both are converted in place).
# NOTE(review): no explicit format/dayfirst is passed, so pandas infers the date layout — confirm
# that day-first dates, if any, are interpreted as intended.
for frame, date_col in ((df3, 'DATETRM1'), (df1, 'DATEDIAG')):
    frame[date_col] = pd.to_datetime(frame[date_col])

In [35]:
# TIMETOTRM1: whole days between DATEDIAG (df1) and DATETRM1 (df3), as a numeric Series.
# The subtraction pairs rows by index label, so df1 and df3 are assumed to share the same
# row index — TODO confirm. Rows where either date is missing yield NaN.
# `.dt.days` is the vectorized accessor replacing the per-element `.apply(lambda x: x.days)`.
TIMETOTRM1 = (df3['DATETRM1'] - df1['DATEDIAG']).dt.days

In [36]:
# Build a new dataframe: the original features X plus the two new features.
# The raw RESPONS1 column is joined only so that dropna() also removes samples whose response
# is missing (get_dummies encodes a missing value as all-zero dummies, which contain no NaN);
# it is dropped again once the incomplete rows are gone.
# The target y is joined as well so that it loses exactly the same rows as the features.
X_new = pd.concat(
    [
        X,
        df3['RESPONS1'],
        pd.get_dummies(df3['RESPONS1'], prefix='RESPONS1'),
        pd.DataFrame(y, columns=['y']),
        pd.DataFrame(TIMETOTRM1, columns=['TIMETOTRM1']),
    ],
    axis=1,
)

X_new = (
    X_new
    .dropna()                    # drop every sample with a missing value in any column
    .reset_index(drop=True)      # renumber rows 0..n-1 (replaces the in-place set_index on a range)
    .drop(columns=['RESPONS1'])  # helper column, already represented by the dummies
)

In [37]:
# Separate the target again now that features and target have lost the same rows.
# NOTE: this cell is not re-runnable on its own — after one run X_new no longer has a 'y' column.
y_new = X_new['y']
X_new = X_new.drop(columns='y')

In [38]:
# Shape (samples, features) of the extended feature matrix after incomplete rows were dropped.
X_new.shape

(71, 62)

In [39]:
# Number of samples removed by dropna() relative to the original feature matrix X.
len(X) - len(X_new)

9

In [40]:
X_new.to_pickle("X_new.pkl") # persist the extended feature matrix; it is reloaded below and used in the next approach

As we can see above, including the two new features costs 9 samples: they have missing values and are therefore dropped.

In [41]:
y_new.to_pickle("y_new.pkl") # persist the matching target (a Series, not a dataframe) for the cells below and the next files

## The code can be run from here (after the import cells at the top), using the saved pickle files:

### Repeated 4 fold cross-validation

In [42]:
# Reload the feature matrix and target saved by the to_pickle cells above,
# so the analysis can start from this cell without re-running the preprocessing.
X_new = pd.read_pickle("X_new.pkl")  # extended first-block feature matrix
y_new = pd.read_pickle("y_new.pkl")  # target with the same rows as X_new

In [43]:
# 4-fold cross-validation repeated 2 times; the fixed random_state makes the splits reproducible.
rkf = RepeatedKFold(n_splits=4, n_repeats=2, random_state=10)
rkf.get_n_splits(X_new, y_new)  # total number of train/test splits the loop below will run

8

### RENT feature selection

In [44]:
# Regularisation strengths and elastic-net mixing ratios handed to RENT.
my_C_params = [0.1, 0.3, 0.5, 0.7, 1, 10]
my_l1_ratios = [0, 0.5, 1]

# For every train/test split: run RENT feature selection on the training part only, then fit a
# plain linear regression on the selected features and score it on the held-out part.
for train_index, test_index in rkf.split(X_new, y_new):
    # rkf.split yields positional indices, so select by position (.iloc). Label-based selection
    # would only be correct while the index happens to be exactly 0..n-1.
    train_data, test_data = X_new.iloc[train_index], X_new.iloc[test_index]
    train_labels = y_new.iloc[train_index].to_numpy()
    test_labels = y_new.iloc[test_index]

    analysis = RENT.RENT_Regression(data=train_data,
                                    target=train_labels,
                                    feat_names=X_new.columns,
                                    C=my_C_params,
                                    l1_ratios=my_l1_ratios,
                                    autoEnetParSel=True,
                                    poly='OFF',
                                    testsize_range=(0.25, 0.25),
                                    K=700,  # number of models in the ensemble
                                    random_state=0,
                                    verbose=1)

    analysis.train()
    # Column positions of the features that pass the three selection cutoffs.
    selected_features = analysis.select_features(tau_1_cutoff=0.9, tau_2_cutoff=0.9, tau_3_cutoff=0.975)
    print('SELECTED FAETURES ARE : {}'.format(train_data.columns[selected_features]))

    if len(selected_features) == 0:
        # The scaler and the regression cannot be fitted on zero columns; report instead of crashing.
        print('No features passed the RENT cutoffs in this split; skipping the prediction model.')
    else:
        # Scale with statistics from the training part only, then apply them to the test part.
        sc = StandardScaler()
        train_data_1 = sc.fit_transform(train_data.iloc[:, selected_features])
        test_data_1 = sc.transform(test_data.iloc[:, selected_features])
        # Train the prediction model on the selected, scaled features.
        prediction_model = LinearRegression().fit(train_data_1, train_labels)
        test_predictions = prediction_model.predict(test_data_1)  # computed once, used for both metrics
        print("R2: ", r2_score(test_labels, test_predictions))
        print("RMSEP: ", np.sqrt(mean_squared_error(test_labels, test_predictions)))

    # Summary objects: per-sample test counts and average absolute error.
    print(analysis.get_summary_objects())

data dimension: (53, 62)  data type: <class 'pandas.core.frame.DataFrame'>
target dimension: (53,)
regularization parameters C: [0.1, 0.3, 0.5, 0.7, 1, 10]
elastic net l1_ratios: [0, 0.5, 1]
poly: OFF
number of models in ensemble: 700
random state: 0
verbose: 1
SELECTED FAETURES ARE : Index(['PRIMTUM_Colon', 'SURGMET', 'RESPONS1_Complete Response (CR)'], dtype='object')
R2:  0.2581374188342612
RMSEP:  712.5835976632957
    # test  average abs error
0    177.0         578.701776
1    188.0        1585.839818
5    186.0         490.336576
7    188.0         981.025865
8    213.0        1025.112748
9    180.0         887.089322
10   182.0         444.829267
11   182.0         615.482784
12   187.0         924.275083
13   175.0         528.738175
14   181.0         516.373883
15   178.0        1380.804725
16   162.0        1062.264180
17   176.0        1273.963581
18   205.0        1137.375294
22   175.0        1050.912216
23   188.0         794.552212
24   200.0         496.772711
25   17

SELECTED FAETURES ARE : Index(['RESPONS1_Complete Response (CR)'], dtype='object')
R2:  0.46115404289404505
RMSEP:  405.32369200152704
    # test  average abs error
0    177.0         873.220463
3    188.0        2102.111738
5    186.0         990.159662
6    188.0         826.441863
9    213.0         657.579279
11   180.0        1062.555629
12   182.0        1211.110433
14   182.0         679.706379
15   187.0        1586.547400
16   175.0         805.180165
17   181.0         978.713582
18   178.0        1098.410484
20   162.0        1230.992418
21   176.0        4346.011779
22   205.0         743.092404
23   175.0        1126.919106
24   188.0         407.796392
25   200.0         792.731352
26   172.0        1185.246715
27   188.0         655.841296
29   187.0        2361.650638
31   170.0         793.299970
32   170.0         514.944677
33   176.0         970.650959
34   190.0         717.252020
35   203.0        1221.138850
37   171.0         875.623236
40   201.0        2312.65

- Table 5.9 in the thesis summarizes the above results.
- The average results of all summary objects are also summarized in Table 5.11.

## Second block

### Removing the samples with missing RESPONS1 and TIMETOTRM1 from the second block

In [51]:
# Second-block feature matrix.
# NOTE(review): X2.pkl is not written in the visible part of this notebook — presumably produced
# by an earlier notebook; confirm it exists before running from here.
X2 = pd.read_pickle("X2.pkl")

In [52]:
# RESPONS1 and the target are joined only so that dropna() removes the same incomplete samples
# as in the first block. The response belongs to the first block and is removed again below.
# Its dummy columns are no longer created: they were added and then dropped through a hard-coded
# list of category names, which breaks (KeyError / leaked columns) if the categories ever differ.
# NOTE(review): TIMETOTRM1 is not joined here. Per the original author, every sample with a missing
# TIMETOTRM1 also has a missing RESPONS1; if that stops holding, rows will not line up with y_new.
X_2_new = pd.concat(
    [
        X2,
        df3['RESPONS1'],
        pd.DataFrame(y, columns=['y']),
    ],
    axis=1,
)
X_2_new = (
    X_2_new
    .dropna()                    # drop every sample with a missing value in any column
    .reset_index(drop=True)      # renumber rows 0..n-1
    .drop(columns=['RESPONS1'])  # response feature belongs to the first block
)

In [53]:
# The target was only joined to drop matching rows; keep the feature columns only.
X_2_new = X_2_new.drop(columns='y')

In [54]:
# Number of samples removed by dropna() relative to the original second-block matrix X2.
len(X2) - len(X_2_new)

9

In [55]:
X_2_new.to_pickle("X_2_new.pkl") # persist the reduced second-block feature matrix; it is reloaded below and used in the next approach

Same as before, 9 samples are lost because of missing values.

### Repeated 4 fold cross-validation

In [56]:
# Reload the second-block feature matrix saved by the to_pickle cell above.
X_2_new = pd.read_pickle("X_2_new.pkl")

In [57]:
# Same scheme as for the first block: 4 folds repeated 2 times with a fixed random_state.
# y_new is not reloaded here; it is the target left in memory by the first-block cells.
rkf = RepeatedKFold(n_splits=4, n_repeats=2, random_state=10)
rkf.get_n_splits(X_2_new, y_new)  # total number of train/test splits the loop below will run

8

### RENT feature selection

In [58]:
# Regularisation strengths and elastic-net mixing ratios handed to RENT.
my_C_params = [0.1, 0.3, 0.5, 0.7, 1, 10]
my_l1_ratios = [0, 0.5, 1]

# For every train/test split: run RENT feature selection on the training part only, then fit a
# plain linear regression on the selected features and score it on the held-out part.
# NOTE(review): X_2_new and y_new are paired purely by row position — this assumes both lost
# exactly the same samples during their respective dropna() steps; confirm.
for train_index, test_index in rkf.split(X_2_new, y_new):
    # rkf.split yields positional indices, so select by position (.iloc). Label-based selection
    # would only be correct while the index happens to be exactly 0..n-1.
    train_data, test_data = X_2_new.iloc[train_index], X_2_new.iloc[test_index]
    train_labels = y_new.iloc[train_index].to_numpy()
    test_labels = y_new.iloc[test_index]

    analysis = RENT.RENT_Regression(data=train_data,
                                    target=train_labels,
                                    feat_names=X_2_new.columns,
                                    C=my_C_params,
                                    l1_ratios=my_l1_ratios,
                                    autoEnetParSel=True,
                                    poly='OFF',
                                    testsize_range=(0.25, 0.25),
                                    K=700,  # number of models in the ensemble
                                    random_state=0,
                                    verbose=1)

    analysis.train()
    # Column positions of the features that pass the three selection cutoffs.
    selected_features = analysis.select_features(tau_1_cutoff=0.9, tau_2_cutoff=0.9, tau_3_cutoff=0.975)
    print('SELECTED FAETURES ARE : {}'.format(train_data.columns[selected_features]))

    if len(selected_features) == 0:
        # The scaler and the regression cannot be fitted on zero columns; report instead of crashing.
        print('No features passed the RENT cutoffs in this split; skipping the prediction model.')
    else:
        # Scale with statistics from the training part only, then apply them to the test part.
        sc = StandardScaler()
        train_data_1 = sc.fit_transform(train_data.iloc[:, selected_features])
        test_data_1 = sc.transform(test_data.iloc[:, selected_features])
        # Train the prediction model on the selected, scaled features.
        prediction_model = LinearRegression().fit(train_data_1, train_labels)
        test_predictions = prediction_model.predict(test_data_1)  # computed once, used for both metrics
        print("R2: ", r2_score(test_labels, test_predictions))
        print("RMSEP: ", np.sqrt(mean_squared_error(test_labels, test_predictions)))

    # Summary objects: per-sample test counts and average absolute error.
    print(analysis.get_summary_objects())

data dimension: (53, 27)  data type: <class 'pandas.core.frame.DataFrame'>
target dimension: (53,)
regularization parameters C: [0.1, 0.3, 0.5, 0.7, 1, 10]
elastic net l1_ratios: [0, 0.5, 1]
poly: OFF
number of models in ensemble: 700
random state: 0
verbose: 1
SELECTED FAETURES ARE : Index(['LACTDHDR_Not Done', 'CGA2_Normal'], dtype='object')
R2:  -0.32995389490809823
RMSEP:  954.0966724934027
    # test  average abs error
0    177.0         793.397737
1    188.0         613.664153
5    186.0        1199.241073
7    188.0         972.968460
8    213.0         410.634392
9    180.0         273.513090
10   182.0         540.522572
11   182.0        1371.897680
12   187.0         713.778708
13   175.0        1183.149846
14   181.0         642.573783
15   178.0         546.665476
16   162.0         762.808596
17   176.0         455.526392
18   205.0         320.314158
22   175.0        1171.177305
23   188.0         815.829033
24   200.0         564.802187
25   172.0         798.386248
27

SELECTED FAETURES ARE : Index(['CGA2_Normal'], dtype='object')
R2:  -0.9388155132593394
RMSEP:  768.8434494583072
    # test  average abs error
0    177.0        1428.810463
3    188.0         782.677700
5    186.0        1070.473132
6    188.0        1681.732512
9    213.0         312.956763
11   180.0         990.434127
12   182.0         618.979182
14   182.0         827.499744
15   187.0         823.968882
16   175.0        1004.344809
17   181.0         386.770857
18   178.0         465.618557
20   162.0        1039.309030
21   176.0        4042.399428
22   205.0         794.104642
23   175.0         407.824578
24   188.0         440.274953
25   200.0         799.243833
26   172.0         477.972591
27   188.0        1118.162710
29   187.0        3239.690487
31   170.0         633.925086
32   170.0        1001.326553
33   176.0         477.466385
34   190.0         746.776655
35   203.0         775.347416
37   171.0         979.283013
40   201.0        2788.285335
41   183.0      

- Table 5.10 in the thesis summarizes the above results.
- The average results of all summary objects are also summarized in Table 5.11.