# Regression analysis and training

In [1]:
import glob
import os
import os.path
import numpy as np
import pandas as pd
import pickle
import matplotlib.pyplot as plt
import logging
from sklearn.preprocessing import StandardScaler
from bokeh.io import output_notebook, show
from bokeh.plotting import figure
from bokeh.models.annotations import Title
import pickle
output_notebook()

from config import mld_features_path, hinds_features_path, xls_path
from core import *

# Configure the root logger's output format.
# Only attributes that every LogRecord carries are referenced here. Custom
# fields such as %(clientip)s or %(user)s exist only when each logging call
# supplies them through `extra=...`; for any other record the handler cannot
# format the line and reports a logging error instead of the message.
FORMAT = '%(asctime)-15s %(levelname)-8s %(message)s'
logging.basicConfig(format=FORMAT, level=logging.INFO)

Executables being used: /bin/elastix /bin/transformix


## Constructing the training dataframe

In [2]:
# Load the previously extracted feature tables (paths come from config)
mld_features = pd.read_csv(mld_features_path)
hinds_features = pd.read_csv(hinds_features_path)

# Build the sample id from the file name: keep the last path component and
# cut off its final four characters (presumably a ".xxx" extension — confirm)
mld_features['id'] = mld_features['filename'].apply(
    lambda path: path.rsplit(os.sep, 1)[-1][:-4])
hinds_features['id'] = hinds_features['filename'].apply(
    lambda path: path.rsplit(os.sep, 1)[-1][:-4])

# Load the spreadsheet that holds the measured values
xls_data = pd.read_excel(xls_path, engine='openpyxl')

# Assemble the ground-truth table. The id is "<CT number zero-padded to 3>.mnc-<position>",
# rows are ordered by id, and reset_index() keeps the previous index as an 'index' column.
target = (
    pd.DataFrame({
        'ct_num': xls_data['CT_num'].astype(str).str.zfill(3),
        'pos': xls_data['position'].astype(str),
        'mld': xls_data['MLD'].astype(float),
        'hinds': xls_data['left_hinds_fillet'].astype(float),
    })
    .assign(id=lambda frame: frame['ct_num'] + '.mnc-' + frame['pos'])
    .sort_values('id')
    .reset_index()
)

# One ground-truth view per regression target
mld_target = target[['id', 'mld', 'pos']]
hinds_target = target[['id', 'hinds', 'pos']]

# Inner-join features with ground truth on id, then discard every row that
# contains a missing value in any column
mld_data = mld_features.merge(mld_target, how='inner', on='id').dropna()
hinds_data = hinds_features.merge(hinds_target, how='inner', on='id').dropna()

In [3]:
# Split each merged table into the regression target (a single column) and
# the feature matrix (everything except file name, id and the target itself)
mld_target = mld_data['mld']
mld_features = mld_data.drop(columns=['filename', 'id', 'mld'])
hinds_target = hinds_data['hinds']
hinds_features = hinds_data.drop(columns=['filename', 'id', 'hinds'])

# Integer-encode the position letters; a letter missing from the table raises KeyError
_POS_CODES = {'a': 0, 'k': 1, 'f': 2}
mld_features['pos'] = mld_features['pos'].apply(lambda letter: _POS_CODES[letter])
hinds_features['pos'] = hinds_features['pos'].apply(lambda letter: _POS_CODES[letter])

## Model selection with feature selection

In [4]:
# Collect the distinct mask identifiers from the feature column names.
# Only names longer than 10 characters are inspected (this skips short columns
# such as 'pos'); the identifier is the second '-'-separated token and is kept
# only when it is exactly four characters long (e.g. '203a').
_second_tokens = (name.split('-')[1] for name in mld_features.columns if len(name) > 10)
masks = np.unique([token for token in _second_tokens if len(token) == 4])

In [5]:
# Show the mask identifiers detected in the previous cell
masks

array(['203a', '203k', '204f', '206k', '208f'], dtype='<U4')

In [6]:
# Accumulators for the model_selection outputs: one list for the runs with
# feature selection, one for the runs without it (two independent lists)
results, results_no_fs = [], []

### MLD using all features

In [7]:
# MLD target, complete feature matrix. model_selection is presumably provided
# by `from core import *`; its return value is collected for the later pd.concat.
results.append(
    model_selection(
        mld_features,
        mld_target,
        dataset='mld',
        type='all',
    )
)

  0%|          | 0/8000 [00:00<?, ?it/s]Objective KNNR_Objective:
 28%|██▊       | 2254/8000 [10:31<26:50,  3.57it/s]
58it [00:00, 577.79it/s]iterations: 2254
Number of used features: 41
Used features: ['num-203a-mld.nii-0.500000', 'sum-203a-mld.nii-0.500000', 'std-203a-mld.nii-0.500000', 'skew-203a-mld.nii-0.500000', 'hist-4-203a-mld.nii-0.500000', 'hist-14-203a-mld.nii-0.500000', 'hist-16-203a-mld.nii-0.500000', 'hist-18-203a-mld.nii-0.500000', 'num-203k-mld.nii-0.500000', 'hist-2-203k-mld.nii-0.500000', 'hist-3-203k-mld.nii-0.500000', 'hist-7-203k-mld.nii-0.500000', 'hist-15-203k-mld.nii-0.500000', 'num-204f-mld.nii-0.500000', 'sum-204f-mld.nii-0.500000', 'std-204f-mld.nii-0.500000', 'hist-4-204f-mld.nii-0.500000', 'hist-5-204f-mld.nii-0.500000', 'hist-6-204f-mld.nii-0.500000', 'hist-13-204f-mld.nii-0.500000', 'sum-206k-mld.nii-0.500000', 'num-208f-mld.nii-0.500000', 'sum-208f-mld.nii-0.500000', 'kurt-208f-mld.nii-0.500000', 'hist-5-208f-mld.nii-0.500000', 'hist-6-208f-mld.nii-0.500

### MLD using the mean mask features

In [8]:
# MLD target, restricted to the columns whose name contains 'mean_mask'.
# NOTE(review): the `== 'type'` clause looks dead — this notebook creates a
# 'pos' column but no 'type' column. Confirm whether 'pos' was intended.
mld_mean_mask_cols = [
    col for col in mld_features.columns
    if col == 'type' or 'mean_mask' in col
]
results.append(
    model_selection(
        mld_features[mld_mean_mask_cols],
        mld_target,
        dataset='mld',
        type='mean_mask',
    )
)

  0%|          | 0/8000 [00:00<?, ?it/s]Objective KNNR_Objective:
  9%|▊         | 682/8000 [01:31<16:18,  7.48it/s]
58it [00:00, 576.52it/s]iterations: 682
Number of used features: 3
Used features: ['num-0.500000-mean_mask', 'hist-5-0.500000-mean_mask', 'hist-12-0.500000-mean_mask'] 
Score: -0.7157476778800311
200it [00:00, 571.04it/s]
  0%|          | 0/8000 [00:00<?, ?it/s]200
1 0.7254837047046641
Objective LinearRegression_Objective:
  7%|▋         | 540/8000 [00:11<02:35, 47.96it/s] 
71it [00:00, 702.37it/s]iterations: 540
Number of used features: 3
Used features: ['num-0.500000-mean_mask', 'hist-0-0.500000-mean_mask', 'hist-15-0.500000-mean_mask'] 
Score: -0.7832200489011554
200it [00:00, 736.34it/s]
  0%|          | 0/8000 [00:00<?, ?it/s]200
1 0.7836888125775561
Objective LassoRegression_Objective:
  8%|▊         | 600/8000 [00:39<08:03, 15.31it/s]
62it [00:00, 613.39it/s]iterations: 600
Number of used features: 3
Used features: ['num-0.500000-mean_mask', 'hist-0-0.500000-mean_

### MLD using the features of the individual masks

In [9]:
# MLD target, one run per mask: keep only the columns whose name contains the
# mask identifier and label the run with that identifier.
# NOTE(review): the `== 'type'` clause looks dead — this notebook creates a
# 'pos' column but no 'type' column. Confirm whether 'pos' was intended.
for mask_id in masks:
    mask_cols = [
        col for col in mld_features.columns
        if col == 'type' or mask_id in col
    ]
    results.append(
        model_selection(
            mld_features[mask_cols],
            mld_target,
            dataset='mld',
            type=mask_id,
        )
    )

  0%|          | 0/8000 [00:00<?, ?it/s]Objective KNNR_Objective:
  9%|▊         | 682/8000 [01:33<16:41,  7.31it/s]
57it [00:00, 563.85it/s]iterations: 682
Number of used features: 3
Used features: ['num-203a-mld.nii-0.500000', 'std-203a-mld.nii-0.500000', 'hist-5-203a-mld.nii-0.500000'] 
Score: -0.7264921818915584
200it [00:00, 580.70it/s]
  0%|          | 0/8000 [00:00<?, ?it/s]200
1 0.7342859164645557
Objective LinearRegression_Objective:
  7%|▋         | 563/8000 [00:12<02:44, 45.08it/s] 
71it [00:00, 701.83it/s]iterations: 563
Number of used features: 4
Used features: ['num-203a-mld.nii-0.500000', 'hist-0-203a-mld.nii-0.500000', 'hist-14-203a-mld.nii-0.500000', 'hist-19-203a-mld.nii-0.500000'] 
Score: -0.7866667742605806
200it [00:00, 706.59it/s]
  0%|          | 0/8000 [00:00<?, ?it/s]200
1 0.7875610728347573
Objective LassoRegression_Objective:
  7%|▋         | 590/8000 [00:43<09:07, 13.53it/s]
60it [00:00, 599.66it/s]iterations: 590
Number of used features: 4
Used features: ['

### Hinds using all features

In [10]:
# Hinds target, complete feature matrix. model_selection is presumably provided
# by `from core import *`; its return value is collected for the later pd.concat.
results.append(
    model_selection(
        hinds_features,
        hinds_target,
        dataset='hinds',
        type='all',
    )
)

  0%|          | 0/8000 [00:00<?, ?it/s]Objective KNNR_Objective:
 26%|██▌       | 2084/8000 [07:15<20:35,  4.79it/s]
64it [00:00, 634.77it/s]iterations: 2084
Number of used features: 18
Used features: ['num-203a-hinds.nii-0.500000', 'num-203k-hinds.nii-0.500000', 'hist-6-203k-hinds.nii-0.500000', 'hist-10-203k-hinds.nii-0.500000', 'num-204f-hinds.nii-0.500000', 'hist-1-204f-hinds.nii-0.500000', 'hist-8-204f-hinds.nii-0.500000', 'num-206k-hinds.nii-0.500000', 'sum-206k-hinds.nii-0.500000', 'hist-1-206k-hinds.nii-0.500000', 'hist-6-206k-hinds.nii-0.500000', 'hist-15-206k-hinds.nii-0.500000', 'num-208f-hinds.nii-0.500000', 'hist-0-208f-hinds.nii-0.500000', 'hist-16-208f-hinds.nii-0.500000', 'hist-17-208f-hinds.nii-0.500000', 'num-0.500000-mean_mask', 'hist-15-0.500000-mean_mask'] 
Score: -0.8401056800246137
200it [00:00, 646.09it/s]
  0%|          | 0/8000 [00:00<?, ?it/s]200
1 0.852743127985394
Objective LinearRegression_Objective:
 12%|█▎        | 1000/8000 [01:28<10:22, 11.25it/s]
68i

### Hinds using the mean mask features

In [11]:
# Hinds target, restricted to the columns whose name contains 'mean_mask'.
# NOTE(review): the `== 'type'` clause looks dead — this notebook creates a
# 'pos' column but no 'type' column. Confirm whether 'pos' was intended.
hinds_mean_mask_cols = [
    col for col in hinds_features.columns
    if col == 'type' or 'mean_mask' in col
]
results.append(
    model_selection(
        hinds_features[hinds_mean_mask_cols],
        hinds_target,
        dataset='hinds',
        type='mean_mask',
    )
)

Objective KNNR_Objective:
  7%|▋         | 583/8000 [01:14<15:53,  7.78it/s]
55it [00:00, 546.74it/s]iterations: 583
Number of used features: 2
Used features: ['num-0.500000-mean_mask', 'hist-12-0.500000-mean_mask'] 
Score: -0.816527745909563
200it [00:00, 572.19it/s]
  0%|          | 0/8000 [00:00<?, ?it/s]200
1 0.8283618103348969
Objective LinearRegression_Objective:
  7%|▋         | 527/8000 [00:10<02:22, 52.30it/s] 
76it [00:00, 751.98it/s]iterations: 527
Number of used features: 3
Used features: ['num-0.500000-mean_mask', 'kurt-0.500000-mean_mask', 'hist-4-0.500000-mean_mask'] 
Score: -0.8763673050536803
200it [00:00, 760.18it/s]
  0%|          | 0/8000 [00:00<?, ?it/s]200
1 0.8772721368152827
Objective LassoRegression_Objective:
  8%|▊         | 650/8000 [00:48<09:09, 13.39it/s]
63it [00:00, 627.05it/s]iterations: 650
Number of used features: 4
Used features: ['num-0.500000-mean_mask', 'kurt-0.500000-mean_mask', 'hist-2-0.500000-mean_mask', 'hist-6-0.500000-mean_mask'] 
Score: -0

### Hinds using the features of the individual masks

In [12]:
# Hinds target, one run per mask: keep only the columns whose name contains the
# mask identifier and label the run with that identifier.
# NOTE(review): the `== 'type'` clause looks dead — this notebook creates a
# 'pos' column but no 'type' column. Confirm whether 'pos' was intended.
for mask_id in masks:
    mask_cols = [
        col for col in hinds_features.columns
        if col == 'type' or mask_id in col
    ]
    results.append(
        model_selection(
            hinds_features[mask_cols],
            hinds_target,
            dataset='hinds',
            type=mask_id,
        )
    )

  0%|          | 0/8000 [00:00<?, ?it/s]Objective KNNR_Objective:
  8%|▊         | 628/8000 [01:19<15:35,  7.88it/s]
58it [00:00, 577.33it/s]iterations: 628
Number of used features: 3
Used features: ['num-203a-hinds.nii-0.500000', 'hist-0-203a-hinds.nii-0.500000', 'hist-5-203a-hinds.nii-0.500000'] 
Score: -0.8136476852308019
200it [00:00, 590.46it/s]
  0%|          | 0/8000 [00:00<?, ?it/s]200
1 0.8237987496068505
Objective LinearRegression_Objective:
  7%|▋         | 527/8000 [00:09<02:14, 55.43it/s] 
77it [00:00, 766.10it/s]iterations: 527
Number of used features: 2
Used features: ['num-203a-hinds.nii-0.500000', 'hist-4-203a-hinds.nii-0.500000'] 
Score: -0.875621945627535
200it [00:00, 774.36it/s]
  0%|          | 0/8000 [00:00<?, ?it/s]200
1 0.876782685692848
Objective LassoRegression_Objective:
  7%|▋         | 585/8000 [00:39<08:25, 14.66it/s]
61it [00:00, 609.35it/s]iterations: 585
Number of used features: 3
Used features: ['num-203a-hinds.nii-0.500000', 'hist-1-203a-hinds.nii-0.

### Saving the results

In [13]:
# Stack the collected per-run outputs into a single table and persist it both
# as CSV (without the index) and as a pickle.
# Note: `results` is rebound from a list to the concatenated object here, so
# this cell is meant to run once after all the runs above have been appended.
results = pd.concat(results)
results.to_csv('results.csv', index=False)
# The context manager closes the file handle deterministically; previously the
# handle returned by open() was never closed explicitly.
with open('results.pickle', 'wb') as pickle_file:
    pickle.dump(results, pickle_file)

## Without feature selection

### MLD with all features

In [14]:
# MLD target, complete feature matrix, with disable_feature_selection=True
# (the "without feature selection" variant of the run)
results_no_fs.append(
    model_selection(
        mld_features,
        mld_target,
        dataset='mld',
        type='all',
        disable_feature_selection=True,
    )
)

Objective KNNR_Objective:
  7%|▋         | 559/8000 [01:54<25:20,  4.89it/s]
42it [00:00, 419.18it/s]iterations: 559
Number of used features: 157
Used features: ['num-203a-mld.nii-0.500000', 'sum-203a-mld.nii-0.500000', 'mean-203a-mld.nii-0.500000', 'std-203a-mld.nii-0.500000', 'skew-203a-mld.nii-0.500000', 'kurt-203a-mld.nii-0.500000', 'hist-0-203a-mld.nii-0.500000', 'hist-1-203a-mld.nii-0.500000', 'hist-2-203a-mld.nii-0.500000', 'hist-3-203a-mld.nii-0.500000', 'hist-4-203a-mld.nii-0.500000', 'hist-5-203a-mld.nii-0.500000', 'hist-6-203a-mld.nii-0.500000', 'hist-7-203a-mld.nii-0.500000', 'hist-8-203a-mld.nii-0.500000', 'hist-9-203a-mld.nii-0.500000', 'hist-10-203a-mld.nii-0.500000', 'hist-11-203a-mld.nii-0.500000', 'hist-12-203a-mld.nii-0.500000', 'hist-13-203a-mld.nii-0.500000', 'hist-14-203a-mld.nii-0.500000', 'hist-15-203a-mld.nii-0.500000', 'hist-16-203a-mld.nii-0.500000', 'hist-17-203a-mld.nii-0.500000', 'hist-18-203a-mld.nii-0.500000', 'hist-19-203a-mld.nii-0.500000', 'num-203k-m

### MLD mean mask

In [15]:
# MLD target, 'mean_mask' columns only, with disable_feature_selection=True.
# NOTE(review): the `== 'type'` clause looks dead — this notebook creates a
# 'pos' column but no 'type' column. Confirm whether 'pos' was intended.
mld_mean_mask_cols = [
    col for col in mld_features.columns
    if col == 'type' or 'mean_mask' in col
]
results_no_fs.append(
    model_selection(
        mld_features[mld_mean_mask_cols],
        mld_target,
        dataset='mld',
        type='mean_mask',
        disable_feature_selection=True,
    )
)

Objective KNNR_Objective:
  6%|▋         | 511/8000 [00:30<07:32, 16.56it/s]
63it [00:00, 620.36it/s]iterations: 511
Number of used features: 26
Used features: ['num-0.500000-mean_mask', 'sum-0.500000-mean_mask', 'mean-0.500000-mean_mask', 'std-0.500000-mean_mask', 'skew-0.500000-mean_mask', 'kurt-0.500000-mean_mask', 'hist-0-0.500000-mean_mask', 'hist-1-0.500000-mean_mask', 'hist-2-0.500000-mean_mask', 'hist-3-0.500000-mean_mask', 'hist-4-0.500000-mean_mask', 'hist-5-0.500000-mean_mask', 'hist-6-0.500000-mean_mask', 'hist-7-0.500000-mean_mask', 'hist-8-0.500000-mean_mask', 'hist-9-0.500000-mean_mask', 'hist-10-0.500000-mean_mask', 'hist-11-0.500000-mean_mask', 'hist-12-0.500000-mean_mask', 'hist-13-0.500000-mean_mask', 'hist-14-0.500000-mean_mask', 'hist-15-0.500000-mean_mask', 'hist-16-0.500000-mean_mask', 'hist-17-0.500000-mean_mask', 'hist-18-0.500000-mean_mask', 'hist-19-0.500000-mean_mask'] 
Score: -0.5863258267040994
200it [00:00, 541.63it/s]
  0%|          | 0/8000 [00:00<?, ?i

### MLD individual masks

In [16]:
# MLD target, one run per mask with disable_feature_selection=True: keep only
# the columns whose name contains the mask identifier.
# NOTE(review): the `== 'type'` clause looks dead — this notebook creates a
# 'pos' column but no 'type' column. Confirm whether 'pos' was intended.
for mask_id in masks:
    mask_cols = [
        col for col in mld_features.columns
        if col == 'type' or mask_id in col
    ]
    results_no_fs.append(
        model_selection(
            mld_features[mask_cols],
            mld_target,
            dataset='mld',
            type=mask_id,
            disable_feature_selection=True,
        )
    )

  0%|          | 0/8000 [00:00<?, ?it/s]Objective KNNR_Objective:
 12%|█▏        | 946/8000 [00:45<05:35, 21.00it/s]
33it [00:00, 329.86it/s]iterations: 946
Number of used features: 26
Used features: ['num-203a-mld.nii-0.500000', 'sum-203a-mld.nii-0.500000', 'mean-203a-mld.nii-0.500000', 'std-203a-mld.nii-0.500000', 'skew-203a-mld.nii-0.500000', 'kurt-203a-mld.nii-0.500000', 'hist-0-203a-mld.nii-0.500000', 'hist-1-203a-mld.nii-0.500000', 'hist-2-203a-mld.nii-0.500000', 'hist-3-203a-mld.nii-0.500000', 'hist-4-203a-mld.nii-0.500000', 'hist-5-203a-mld.nii-0.500000', 'hist-6-203a-mld.nii-0.500000', 'hist-7-203a-mld.nii-0.500000', 'hist-8-203a-mld.nii-0.500000', 'hist-9-203a-mld.nii-0.500000', 'hist-10-203a-mld.nii-0.500000', 'hist-11-203a-mld.nii-0.500000', 'hist-12-203a-mld.nii-0.500000', 'hist-13-203a-mld.nii-0.500000', 'hist-14-203a-mld.nii-0.500000', 'hist-15-203a-mld.nii-0.500000', 'hist-16-203a-mld.nii-0.500000', 'hist-17-203a-mld.nii-0.500000', 'hist-18-203a-mld.nii-0.500000', 'hist

### Hinds all features

In [17]:
# Hinds target, complete feature matrix, with disable_feature_selection=True
# (the "without feature selection" variant of the run)
results_no_fs.append(
    model_selection(
        hinds_features,
        hinds_target,
        dataset='hinds',
        type='all',
        disable_feature_selection=True,
    )
)

Objective KNNR_Objective:
  7%|▋         | 545/8000 [01:57<26:51,  4.63it/s]
44it [00:00, 437.71it/s]iterations: 545
Number of used features: 157
Used features: ['num-203a-hinds.nii-0.500000', 'sum-203a-hinds.nii-0.500000', 'mean-203a-hinds.nii-0.500000', 'std-203a-hinds.nii-0.500000', 'skew-203a-hinds.nii-0.500000', 'kurt-203a-hinds.nii-0.500000', 'hist-0-203a-hinds.nii-0.500000', 'hist-1-203a-hinds.nii-0.500000', 'hist-2-203a-hinds.nii-0.500000', 'hist-3-203a-hinds.nii-0.500000', 'hist-4-203a-hinds.nii-0.500000', 'hist-5-203a-hinds.nii-0.500000', 'hist-6-203a-hinds.nii-0.500000', 'hist-7-203a-hinds.nii-0.500000', 'hist-8-203a-hinds.nii-0.500000', 'hist-9-203a-hinds.nii-0.500000', 'hist-10-203a-hinds.nii-0.500000', 'hist-11-203a-hinds.nii-0.500000', 'hist-12-203a-hinds.nii-0.500000', 'hist-13-203a-hinds.nii-0.500000', 'hist-14-203a-hinds.nii-0.500000', 'hist-15-203a-hinds.nii-0.500000', 'hist-16-203a-hinds.nii-0.500000', 'hist-17-203a-hinds.nii-0.500000', 'hist-18-203a-hinds.nii-0.500

### Hinds mean mask

In [18]:
# Hinds target, 'mean_mask' columns only, with disable_feature_selection=True.
# NOTE(review): the `== 'type'` clause looks dead — this notebook creates a
# 'pos' column but no 'type' column. Confirm whether 'pos' was intended.
hinds_mean_mask_cols = [
    col for col in hinds_features.columns
    if col == 'type' or 'mean_mask' in col
]
results_no_fs.append(
    model_selection(
        hinds_features[hinds_mean_mask_cols],
        hinds_target,
        dataset='hinds',
        type='mean_mask',
        disable_feature_selection=True,
    )
)

Objective KNNR_Objective:
  7%|▋         | 529/8000 [00:31<07:29, 16.63it/s]
62it [00:00, 619.86it/s]iterations: 529
Number of used features: 26
Used features: ['num-0.500000-mean_mask', 'sum-0.500000-mean_mask', 'mean-0.500000-mean_mask', 'std-0.500000-mean_mask', 'skew-0.500000-mean_mask', 'kurt-0.500000-mean_mask', 'hist-0-0.500000-mean_mask', 'hist-1-0.500000-mean_mask', 'hist-2-0.500000-mean_mask', 'hist-3-0.500000-mean_mask', 'hist-4-0.500000-mean_mask', 'hist-5-0.500000-mean_mask', 'hist-6-0.500000-mean_mask', 'hist-7-0.500000-mean_mask', 'hist-8-0.500000-mean_mask', 'hist-9-0.500000-mean_mask', 'hist-10-0.500000-mean_mask', 'hist-11-0.500000-mean_mask', 'hist-12-0.500000-mean_mask', 'hist-13-0.500000-mean_mask', 'hist-14-0.500000-mean_mask', 'hist-15-0.500000-mean_mask', 'hist-16-0.500000-mean_mask', 'hist-17-0.500000-mean_mask', 'hist-18-0.500000-mean_mask', 'hist-19-0.500000-mean_mask'] 
Score: -0.6426118256846536
200it [00:00, 627.09it/s]
  0%|          | 0/8000 [00:00<?, ?i

### Hinds individual masks

In [19]:
# Hinds target, one run per mask with disable_feature_selection=True: keep only
# the columns whose name contains the mask identifier.
# NOTE(review): the `== 'type'` clause looks dead — this notebook creates a
# 'pos' column but no 'type' column. Confirm whether 'pos' was intended.
for mask_id in masks:
    mask_cols = [
        col for col in hinds_features.columns
        if col == 'type' or mask_id in col
    ]
    results_no_fs.append(
        model_selection(
            hinds_features[mask_cols],
            hinds_target,
            dataset='hinds',
            type=mask_id,
            disable_feature_selection=True,
        )
    )

  0%|          | 0/8000 [00:00<?, ?it/s]Objective KNNR_Objective:
  7%|▋         | 529/8000 [00:31<07:23, 16.86it/s]
64it [00:00, 638.15it/s]iterations: 529
Number of used features: 26
Used features: ['num-203a-hinds.nii-0.500000', 'sum-203a-hinds.nii-0.500000', 'mean-203a-hinds.nii-0.500000', 'std-203a-hinds.nii-0.500000', 'skew-203a-hinds.nii-0.500000', 'kurt-203a-hinds.nii-0.500000', 'hist-0-203a-hinds.nii-0.500000', 'hist-1-203a-hinds.nii-0.500000', 'hist-2-203a-hinds.nii-0.500000', 'hist-3-203a-hinds.nii-0.500000', 'hist-4-203a-hinds.nii-0.500000', 'hist-5-203a-hinds.nii-0.500000', 'hist-6-203a-hinds.nii-0.500000', 'hist-7-203a-hinds.nii-0.500000', 'hist-8-203a-hinds.nii-0.500000', 'hist-9-203a-hinds.nii-0.500000', 'hist-10-203a-hinds.nii-0.500000', 'hist-11-203a-hinds.nii-0.500000', 'hist-12-203a-hinds.nii-0.500000', 'hist-13-203a-hinds.nii-0.500000', 'hist-14-203a-hinds.nii-0.500000', 'hist-15-203a-hinds.nii-0.500000', 'hist-16-203a-hinds.nii-0.500000', 'hist-17-203a-hinds.nii-0

### Saving the results

In [20]:
# Stack the collected outputs of the runs without feature selection into a
# single table and persist it both as CSV (without the index) and as a pickle.
# Note: `results_no_fs` is rebound from a list to the concatenated object here,
# so this cell is meant to run once after all the runs above have been appended.
results_no_fs = pd.concat(results_no_fs)
results_no_fs.to_csv('results_no_fs.csv', index=False)
# The context manager closes the file handle deterministically; previously the
# handle returned by open() was never closed explicitly.
with open('results_no_fs.pickle', 'wb') as pickle_file:
    pickle.dump(results_no_fs, pickle_file)