# Regression analysis and training

In [1]:
import glob
import os
import os.path
import numpy as np
import pandas as pd
import pickle
import matplotlib.pyplot as plt
import logging
from sklearn.preprocessing import StandardScaler
from bokeh.io import output_notebook, show
from bokeh.plotting import figure
from bokeh.models.annotations import Title
output_notebook()

from config import mld_features_path, hinds_features_path, xls_path
from core import *

# Configure the root logger for the notebook.
# Only standard LogRecord attributes are used in the format: custom fields
# such as %(clientip)s / %(user)s are only available when every logging call
# passes them via `extra=...`, and otherwise each log call fails to format.
FORMAT = '%(asctime)-15s %(levelname)-8s %(message)s'
logging.basicConfig(format=FORMAT, level=logging.INFO)

0.9978143432822129


## Constructing the training dataframe

In [2]:
def _record_id(path):
    """Return the last `os.sep`-separated component of `path` without its final four characters."""
    # The four stripped characters are presumably the file extension -- TODO confirm
    return path.split(os.sep)[-1][:-4]


# Load the extracted feature tables and attach a record id derived from the file name
mld_features = pd.read_csv(mld_features_path)
mld_features['id'] = mld_features['filename'].map(_record_id)

hinds_features = pd.read_csv(hinds_features_path)
hinds_features['id'] = hinds_features['filename'].map(_record_id)

# Load the spreadsheet holding the measured values
xls_data = pd.read_excel(xls_path, engine='openpyxl')

# Build the ground-truth table: CT number left-padded with zeros to width 3,
# position as text, and both measurements as floats
ground_truth_columns = {
    'ct_num': xls_data['CT_num'].astype(str).str.zfill(3),
    'pos': xls_data['position'].astype(str),
    'mld': xls_data['MLD'].astype(float),
    'hinds': xls_data['left_hinds_fillet'].astype(float),
}
target = pd.DataFrame(data=ground_truth_columns)

# The id has the form '<ct_num>.mnc-<pos>' so that it can be matched against the feature ids
target['id'] = target['ct_num'] + '.mnc-' + target['pos']
target = target.sort_values('id').reset_index()

# Per-dataset views of the ground truth (id, measured value, position)
mld_target = target[['id', 'mld', 'pos']]
hinds_target = target[['id', 'hinds', 'pos']]

# Inner-join features with ground truth on the id and discard rows containing missing values
mld_data = mld_features.merge(mld_target, how='inner', on='id').dropna()
hinds_data = hinds_features.merge(hinds_target, how='inner', on='id').dropna()

In [3]:
# Split each merged table into the regression target and the feature matrix
mld_target = mld_data['mld']
hinds_target = hinds_data['hinds']
mld_features = mld_data.drop(columns=['filename', 'id', 'mld'])
hinds_features = hinds_data.drop(columns=['filename', 'id', 'hinds'])

# Encode the position letter as an integer code; an unknown letter raises KeyError
position_codes = {'a': 0, 'k': 1, 'f': 2}
mld_features['pos'] = mld_features['pos'].map(lambda letter: position_codes[letter])
hinds_features['pos'] = hinds_features['pos'].map(lambda letter: position_codes[letter])

## Model selection

In [4]:
# For every column name longer than 10 characters take the second '-'-separated token,
# then keep the distinct tokens that are exactly four characters long
second_tokens = (column.split('-')[1] for column in mld_features.columns if len(column) > 10)
masks = np.unique([token for token in second_tokens if len(token) == 4])

In [5]:
# Display the mask identifiers extracted from the MLD feature column names
masks

array(['203a', '203k', '204f', '206k', '208f'], dtype='<U4')

### MLD using all features

In [6]:
# Start the list of per-experiment results; the following cells append to it
# and the final cell concatenates it
results = []

# Model selection for MLD on the complete feature matrix
mld_all_result = model_selection(mld_features, mld_target, dataset='mld', type='all')
results.append(mld_all_result)

Objective KNNR_Objective:
0.9978143432822129
iterations: 3304
Number of used features: 40
Used features: ['num-203a-mld.nii-0.500000', 'sum-203a-mld.nii-0.500000', 'std-203a-mld.nii-0.500000', 'skew-203a-mld.nii-0.500000', 'hist-4-203a-mld.nii-0.500000', 'hist-14-203a-mld.nii-0.500000', 'hist-16-203a-mld.nii-0.500000', 'hist-18-203a-mld.nii-0.500000', 'num-203k-mld.nii-0.500000', 'hist-2-203k-mld.nii-0.500000', 'hist-3-203k-mld.nii-0.500000', 'hist-7-203k-mld.nii-0.500000', 'hist-15-203k-mld.nii-0.500000', 'num-204f-mld.nii-0.500000', 'sum-204f-mld.nii-0.500000', 'std-204f-mld.nii-0.500000', 'hist-4-204f-mld.nii-0.500000', 'hist-5-204f-mld.nii-0.500000', 'hist-6-204f-mld.nii-0.500000', 'hist-13-204f-mld.nii-0.500000', 'sum-206k-mld.nii-0.500000', 'num-208f-mld.nii-0.500000', 'sum-208f-mld.nii-0.500000', 'kurt-208f-mld.nii-0.500000', 'hist-5-208f-mld.nii-0.500000', 'hist-6-208f-mld.nii-0.500000', 'hist-8-208f-mld.nii-0.500000', 'hist-15-208f-mld.nii-0.500000', 'hist-18-208f-mld.nii-0.50

NameError: name 't' is not defined

### MLD using the mean mask features

In [7]:
# Restrict the MLD features to the mean-mask columns (a column literally named 'type' is kept too).
# NOTE(review): the encoded position column is named 'pos', not 'type', so it is
# not selected by this filter -- confirm whether 'pos' was meant here.
mld_mean_mask_columns = [name for name in mld_features.columns if name == 'type' or 'mean_mask' in name]
mld_mean_mask_result = model_selection(mld_features[mld_mean_mask_columns], mld_target,
                                       dataset='mld', type='mean_mask')
results.append(mld_mean_mask_result)

Objective KNNR_Objective:
0.9978143432822129
iterations: 1182
Number of used features: 3
Used features: ['num-0.500000-mean_mask', 'hist-5-0.500000-mean_mask', 'hist-12-0.500000-mean_mask'] 
Score: -0.7157476778800311
200
1 0.7254837047046641
Objective LinearRegression_Objective:
0.9978143432822129
iterations: 1040
Number of used features: 3
Used features: ['num-0.500000-mean_mask', 'hist-0-0.500000-mean_mask', 'hist-15-0.500000-mean_mask'] 
Score: -0.7832200489011554
200
1 0.7836888125775561
Objective LassoRegression_Objective:
0.9978143432822129
iterations: 1100
Number of used features: 3
Used features: ['num-0.500000-mean_mask', 'hist-0-0.500000-mean_mask', 'hist-17-0.500000-mean_mask'] 
Score: -0.7836574227513643
200
1 0.7836099073749718
Objective RidgeRegression_Objective:
0.9978143432822129
iterations: 1036
Number of used features: 3
Used features: ['num-0.500000-mean_mask', 'hist-0-0.500000-mean_mask', 'hist-17-0.500000-mean_mask'] 
Score: -0.7835949045248055
200
1 0.78360716145

### MLD using the features of the individual masks

In [8]:
# Run model selection for MLD once per mask, using only the columns whose name contains that mask id.
# NOTE(review): the filter also keeps a column named 'type'; the encoded position
# column is named 'pos' and is therefore not selected -- confirm intent.
for mask_name in masks:
    mld_mask_columns = [name for name in mld_features.columns if name == 'type' or mask_name in name]
    mld_mask_result = model_selection(mld_features[mld_mask_columns], mld_target,
                                      dataset='mld', type=mask_name)
    results.append(mld_mask_result)

Objective KNNR_Objective:
0.9978143432822129
iterations: 1930
Number of used features: 2
Used features: ['num-203a-mld.nii-0.500000', 'hist-5-203a-mld.nii-0.500000'] 
Score: -0.7284261593560039
200
1 0.7422861725658454
Objective LinearRegression_Objective:
0.9978143432822129
iterations: 1063
Number of used features: 4
Used features: ['num-203a-mld.nii-0.500000', 'hist-0-203a-mld.nii-0.500000', 'hist-14-203a-mld.nii-0.500000', 'hist-19-203a-mld.nii-0.500000'] 
Score: -0.7866667742605806
200
1 0.7875610728347573
Objective LassoRegression_Objective:
0.9978143432822129
iterations: 1090
Number of used features: 4
Used features: ['num-203a-mld.nii-0.500000', 'hist-0-203a-mld.nii-0.500000', 'hist-14-203a-mld.nii-0.500000', 'hist-19-203a-mld.nii-0.500000'] 
Score: -0.7863132395965321
200
1 0.7870948835837552
Objective RidgeRegression_Objective:
0.9978143432822129
iterations: 1467
Number of used features: 4
Used features: ['num-203a-mld.nii-0.500000', 'hist-0-203a-mld.nii-0.500000', 'hist-14-20

### Hinds using all features

In [9]:
# Model selection for the hinds target on the complete feature matrix
hinds_all_result = model_selection(hinds_features, hinds_target, dataset='hinds', type='all')
results.append(hinds_all_result)

Objective KNNR_Objective:
0.9978143432822129
iterations: 2584
Number of used features: 18
Used features: ['num-203a-hinds.nii-0.500000', 'num-203k-hinds.nii-0.500000', 'hist-6-203k-hinds.nii-0.500000', 'hist-10-203k-hinds.nii-0.500000', 'num-204f-hinds.nii-0.500000', 'hist-1-204f-hinds.nii-0.500000', 'hist-8-204f-hinds.nii-0.500000', 'num-206k-hinds.nii-0.500000', 'sum-206k-hinds.nii-0.500000', 'hist-1-206k-hinds.nii-0.500000', 'hist-6-206k-hinds.nii-0.500000', 'hist-15-206k-hinds.nii-0.500000', 'num-208f-hinds.nii-0.500000', 'hist-0-208f-hinds.nii-0.500000', 'hist-16-208f-hinds.nii-0.500000', 'hist-17-208f-hinds.nii-0.500000', 'num-0.500000-mean_mask', 'hist-15-0.500000-mean_mask'] 
Score: -0.8401056800246137
200
1 0.852743127985394
Objective LinearRegression_Objective:
0.9978143432822129
iterations: 1500
Number of used features: 12
Used features: ['num-203a-hinds.nii-0.500000', 'hist-1-203k-hinds.nii-0.500000', 'hist-8-204f-hinds.nii-0.500000', 'hist-14-206k-hinds.nii-0.500000', 'his

### Hinds using the mean mask features

In [10]:
# Restrict the hinds features to the mean-mask columns (a column literally named 'type' is kept too).
# NOTE(review): the encoded position column is named 'pos', not 'type', so it is
# not selected by this filter -- confirm whether 'pos' was meant here.
hinds_mean_mask_columns = [name for name in hinds_features.columns if name == 'type' or 'mean_mask' in name]
hinds_mean_mask_result = model_selection(hinds_features[hinds_mean_mask_columns], hinds_target,
                                         dataset='hinds', type='mean_mask')
results.append(hinds_mean_mask_result)

Objective KNNR_Objective:
0.9978143432822129
iterations: 1083
Number of used features: 2
Used features: ['num-0.500000-mean_mask', 'hist-12-0.500000-mean_mask'] 
Score: -0.816527745909563
200
1 0.8283618103348969
Objective LinearRegression_Objective:
0.9978143432822129
iterations: 1027
Number of used features: 3
Used features: ['num-0.500000-mean_mask', 'kurt-0.500000-mean_mask', 'hist-4-0.500000-mean_mask'] 
Score: -0.8763673050536803
200
1 0.8772721368152827
Objective LassoRegression_Objective:
0.9978143432822129
iterations: 1228
Number of used features: 4
Used features: ['num-0.500000-mean_mask', 'kurt-0.500000-mean_mask', 'hist-2-0.500000-mean_mask', 'hist-6-0.500000-mean_mask'] 
Score: -0.8792609423753571
200
1 0.8809713868131402
Objective RidgeRegression_Objective:
0.9978143432822129
iterations: 1170
Number of used features: 4
Used features: ['num-0.500000-mean_mask', 'kurt-0.500000-mean_mask', 'hist-2-0.500000-mean_mask', 'hist-6-0.500000-mean_mask'] 
Score: -0.8806936463577117


### Hinds using the features of the individual masks

In [11]:
# Run model selection for hinds once per mask, using only the columns whose name contains that mask id.
# NOTE(review): the filter also keeps a column named 'type'; the encoded position
# column is named 'pos' and is therefore not selected -- confirm intent.
for mask_name in masks:
    hinds_mask_columns = [name for name in hinds_features.columns if name == 'type' or mask_name in name]
    hinds_mask_result = model_selection(hinds_features[hinds_mask_columns], hinds_target,
                                        dataset='hinds', type=mask_name)
    results.append(hinds_mask_result)

Objective KNNR_Objective:
0.9978143432822129
iterations: 1128
Number of used features: 3
Used features: ['num-203a-hinds.nii-0.500000', 'hist-0-203a-hinds.nii-0.500000', 'hist-5-203a-hinds.nii-0.500000'] 
Score: -0.8136476852308019
200
1 0.8237987496068505
Objective LinearRegression_Objective:
0.9978143432822129
iterations: 1027
Number of used features: 2
Used features: ['num-203a-hinds.nii-0.500000', 'hist-4-203a-hinds.nii-0.500000'] 
Score: -0.875621945627535
200
1 0.876782685692848
Objective LassoRegression_Objective:
0.9978143432822129
iterations: 1042
Number of used features: 2
Used features: ['num-203a-hinds.nii-0.500000', 'hist-4-203a-hinds.nii-0.500000'] 
Score: -0.8751410948265079
200
1 0.8763055401341836
Objective RidgeRegression_Objective:
0.9978143432822129
iterations: 1157
Number of used features: 6
Used features: ['num-203a-hinds.nii-0.500000', 'hist-0-203a-hinds.nii-0.500000', 'hist-2-203a-hinds.nii-0.500000', 'hist-6-203a-hinds.nii-0.500000', 'hist-12-203a-hinds.nii-0.5

### Saving the results

In [12]:
# Concatenate the per-experiment results collected above into a single frame
results = pd.concat(results)

# Write the combined results as CSV (without the index column)
results.to_csv('results.csv', index=False)

# Write the same frame as a pickle. The context manager guarantees the file
# handle is flushed and closed, which a bare open(...) passed to pickle.dump does not.
# `pickle` is already imported in the notebook's first cell.
with open('results.pickle', 'wb') as pickle_file:
    pickle.dump(results, pickle_file)