# Exercise 4

Feature selection:

1. You need to load the UCI Arrhythmia dataset
https://archive.ics.uci.edu/ml/datasets/arrhythmia
* the problem is a multi-class classification

2. For comparing the result you need to use kNN (k=3), and compare based on the accuracy with the original dataset (with the all features).

2. Run three different types of feature selection methods (Univariate Statistics, model based, and Iterative Feature Selection).
* The original data set has 279 features.
* You need to run the methods for number_of_features= 20, 50, 100, 150, 200
* All the results should show in the same figure (At the end we have just one figure)

### Updating Environment

In [None]:
#Updating and restarting sklearn
!pip install --upgrade scikit-learn --quiet

In [None]:
import sklearn
sklearn.__version__

In [None]:
!pip install --upgrade seaborn==0.11.2

In [None]:
# First upgrade the environment.
import pip
from subprocess import run
# add what you will need
modules =[
    'matplotlib', 
    'numpy',
    'pandas',
    'prettytable',
    'plotly'
]
proc = run(f'pip install {" ".join(modules)} --upgrade --no-input', 
       shell=True, 
       text=True, 
       capture_output=True, 
       timeout=120) #a couple of minutes
print(proc.stderr)

## Collecting and importing needed libraries

In [1]:
import pandas as pd
import numpy as np
import seaborn as sns   
import matplotlib.pyplot as plt
from pandas import set_option
from prettytable import PrettyTable
import plotly.graph_objects as go
from pandas import set_option

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from sklearn.model_selection import cross_val_score, GridSearchCV
from sklearn.impute import KNNImputer
from sklearn.pipeline import Pipeline, make_pipeline

from sklearn.feature_selection import SelectKBest, SelectFromModel,SelectPercentile, RFE, f_classif, chi2, mutual_info_classif

from sklearn.svm import LinearSVC, SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import RidgeClassifier, RidgeClassifierCV
from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier
from sklearn.naive_bayes import MultinomialNB 

from sklearn.metrics import confusion_matrix, accuracy_score, f1_score

np.seterr(invalid='ignore')
np.set_printoptions(precision=2)

## Uploading the data and renaming columns

In [2]:
# Uploading the dataset
url = 'https://archive.ics.uci.edu/ml/machine-learning-databases/arrhythmia/'
file_name = 'arrhythmia.data'
data = pd.read_csv(url+file_name,header=None, na_values="?")

In [3]:
#Renaming column names
col_names = ['Age', 'Sex', 'Height', 'Weight', 'QRS_duration', 'PR_int', 'QT_int', 'T_int', 'P_int', 'QRS', 'T', 'P', 'QRST', 'J','Heart_rate',
             'DI_Q_wave','DI_R_wave', 'DI_S_wave','DI_R*_Wave', 'DI_S*_wave','DI_int_deflec', 'DI_ragged_R','DI_diphasic_R',
             'DI_ragged_P', 'DI_diphasic_P','DI_ragged_T', 'DI_diphasic_T',
             'DII_Q_wave','DII_R_wave', 'DII_S_wave','DII_R*_Wave', 'DII_S*_wave','DII_int_deflec', 'DII_ragged_R','DII_diphasic_R',
             'DII_ragged_P', 'DII_diphasic_P','DII_ragged_T', 'DII_diphasic_T',
             'DIII_Q_wave','DIII_R_wave', 'DIII_S_wave','DIII_R*_Wave', 'DIII_S*_wave','DIII_int_deflec', 'DIII_ragged_R','DIII_diphasic_R',
             'DIII_ragged_P', 'DIII_diphasic_P','DIII_ragged_T', 'DIII_diphasic_T',
             'AVR_Q_wave','AVR_R_wave', 'AVR_S_wave','AVR_R*_Wave', 'AVR_S*_wave','AVR_int_deflec', 'AVR_ragged_R','AVR_diphasic_R',
             'AVR_ragged_P', 'AVR_diphasic_P','AVR_ragged_T', 'AVR_diphasic_T',
             'AVL_Q_wave','AVL_R_wave', 'AVL_S_wave','AVL_R*_Wave', 'AVL_S*_wave','AVL_int_deflec', 'AVL_ragged_R','AVL_diphasic_R',
             'AVL_ragged_P', 'AVL_diphasic_P','AVL_ragged_T', 'AVL_diphasic_T',
             'AVF_Q_wave','AVF_R_wave', 'AVF_S_wave','AVF_R*_Wave', 'AVF_S*_wave','AVF_int_deflec', 'AVF_ragged_R','AVF_diphasic_R',
             'AVF_ragged_P', 'AVF_diphasic_P','AVF_ragged_T', 'AVF_diphasic_T',
             'V1_Q_wave','V1_R_wave', 'V1_S_wave','V1_R*_Wave', 'V1_S*_wave','V1_int_deflec', 'V1_ragged_R','V1_diphasic_R',
             'V1_ragged_P', 'V1_diphasic_P','V1_ragged_T', 'V1_diphasic_T',
             'V2_Q_wave','V2_R_wave', 'V2_S_wave','V2_R*_Wave', 'V2_S*_wave','V2_int_deflec', 'V2_ragged_R','V2_diphasic_R',
             'V2_ragged_P', 'V2_diphasic_P','V2_ragged_T', 'V2_diphasic_T',
             'V3_Q_wave','V3_R_wave', 'V3_S_wave','V3_R*_Wave', 'V3_S*_wave','V3_int_deflec', 'V3_ragged_R','V3_diphasic_R',
             'V3_ragged_P', 'V3_diphasic_P','V3_ragged_T', 'V3_diphasic_T',
             'V4_Q_wave','V4_R_wave', 'V4_S_wave','V4_R*_Wave', 'V4_S*_wave','V4_int_deflec', 'V4_ragged_R','V4_diphasic_R',
             'V4_ragged_P', 'V4_diphasic_P','V4_ragged_T', 'V4_diphasic_T',
             'V5_Q_wave','V5_R_wave', 'V5_S_wave','V5_R*_Wave', 'V5_S*_wave','V5_int_deflec', 'V5_ragged_R','V5_diphasic_R',
             'V5_ragged_P', 'V5_diphasic_P','V5_ragged_T', 'V5_diphasic_T',
             'V6_Q_wave','V6_R_wave', 'V6_S_wave','V6_R*_Wave', 'V6_S*_wave','V6_int_deflec', 'V6_ragged_R','V6_diphasic_R',
             'V6_ragged_P', 'V6_diphasic_P','V6_ragged_T', 'V6_diphasic_T',
             'DI_Amp_JJ','DI_Amp_Q','DI_Amp_R','DI_Amp_S','DI_Amp_R*','DI_Amp_S*','DI_Amp_P','DI_Amp_T', 'DI_Amp_QRSA','DI_Amp_QRSTA',
             'DII_Amp_JJ','DII_Amp_Q','DII_Amp_R','DII_Amp_S','DII_Amp_R*','DII_Amp_S*','DII_Amp_P','DII_Amp_T', 'DII_Amp_QRSA','DII_Amp_QRSTA',
             'DIII_Amp_JJ','DIII_Amp_Q','DIII_Amp_R','DIII_Amp_S','DIII_Amp_R*','DIII_Amp_S*','DIII_Amp_P','DIII_Amp_T', 'DIII_Amp_QRSA','DIII_Amp_QRSTA',
             'AVR_Amp_JJ','AVR_Amp_Q','AVR_Amp_R','AVR_Amp_S','AVR_Amp_R*','AVR_Amp_S*','AVR_Amp_P','AVR_Amp_T', 'AVR_Amp_QRSA','AVR_Amp_QRSTA',
             'AVL_Amp_JJ','AVL_Amp_Q','AVL_Amp_R','AVL_Amp_S','AVL_Amp_R*','AVL_Amp_S*','AVL_Amp_P','AVL_Amp_T', 'AVL_Amp_QRSA','AVL_Amp_QRSTA',
             'AVF_Amp_JJ','AVF_Amp_Q','AVF_Amp_R','AVF_Amp_S','AVF_Amp_R*','AVF_Amp_S*','AVF_Amp_P','AVF_Amp_T', 'AVF_Amp_QRSA','AVF_Amp_QRSTA',
             'V1_Amp_JJ','V1_Amp_Q','V1_Amp_R','V1_Amp_S','V1_Amp_R*','V1_Amp_S*','V1_Amp_P','V1_Amp_T', 'V1_Amp_QRSA','V1_Amp_QRSTA',
             'V2_Amp_JJ','V2_Amp_Q','V2_Amp_R','V2_Amp_S','V2_Amp_R*','V2_Amp_S*','V2_Amp_P','V2_Amp_T', 'V2_Amp_QRSA','V2_Amp_QRSTA',
             'V3_Amp_JJ','V3_Amp_Q','V3_Amp_R','V3_Amp_S','V3_Amp_R*','V3_Amp_S*','V3_Amp_P','V3_Amp_T', 'V3_Amp_QRSA','V3_Amp_QRSTA',
             'V4_Amp_JJ','V4_Amp_Q','V4_Amp_R','V4_Amp_S','V4_Amp_R*','V4_Amp_S*','V4_Amp_P','V4_Amp_T', 'V4_Amp_QRSA','V4_Amp_QRSTA',
             'V5_Amp_JJ','V5_Amp_Q','V5_Amp_R','V5_Amp_S','V5_Amp_R*','V5_Amp_S*','V5_Amp_P','V5_Amp_T', 'V5_Amp_QRSA','V5_Amp_QRSTA',
             'V6_Amp_JJ','V6_Amp_Q','V6_Amp_R','V6_Amp_S','V6_Amp_R*','V6_Amp_S*','V6_Amp_P','V6_Amp_T', 'V6_Amp_QRSA','V6_Amp_QRSTA',
             'Arrhythmia']
data.columns = col_names

### Splitting data into features/label, training/validation/test sets

In [4]:
#Splitting data into features and label
X_data = data.loc[:, data.columns != 'Arrhythmia']
y_data = data['Arrhythmia']

In [5]:
#Splitting data into training+validation and test
X_trainval, X_test, y_trainval, y_test = train_test_split(X_data, y_data, random_state=11)

#Splitting train+validation set into training and validation sets
X_train, X_valid, y_train, y_valid = train_test_split(X_trainval, y_trainval, random_state=10)

print("Size of training set: {} | Size of validation set: {} | Size of test set:" 
      " {}\n".format(X_train.shape[0], X_valid.shape[0], X_test.shape[0]))

Size of training set: 254 | Size of validation set: 85 | Size of test set: 113



### Creating Pipeline for scaling, Imputing missing values and classification

In [6]:
#Creating a function for running the pipeline for scaling, imputing missing values, selection and classification
def run_pipeline(sca,imp,sel,mod):
    scaler = sca
    imputer = imp
    selection = sel
    model = mod
    pipe = Pipeline([("scaler", scaler),('imputer', imputer),('select', selection),("model", model)])
    return pipe

### Running model in training and test datasets

In [7]:
#Running the classifier model on the training data using the pipeline
pipe1 = run_pipeline(sca=MinMaxScaler(),imp=KNNImputer(),sel=None,mod=KNeighborsClassifier(n_neighbors=3))
model_train = pipe1.fit(X_train, y_train)

#Scoring and printing the model using the validation data
val_score = model_train.score(X_valid, y_valid)
print("Validation score: {:.2f}".format(val_score))

Validation score: 0.60


In [8]:
#Building the model on training+validation set using the pipeline
model_trainval = pipe1.fit(X_trainval,y_trainval)

#Scoring the model using the test data
test_score = model_trainval.score(X_test, y_test)
print("Test score: {:.2f}".format(test_score))

Test score: 0.58


### Feature Selection : Univariate Statistics

In [9]:
#Defining scaler, imputer, selection and model for Univariate Statistics feature selection
scaler = MinMaxScaler()
imputer = KNNImputer()
selection = SelectKBest()
model = KNeighborsClassifier(n_neighbors=3)
number_features = [20,50,100,150,200]

#Running the pipeline
pipe2 = run_pipeline(sca=scaler,imp=imputer,sel=selection,mod=model)

In [10]:
#param_grid_uni = {'select__k': [7.168,17.92,35.84,53.76,71.68]}
param_grid_uni = {'select__k': number_features}

In [11]:
#Fitting classifier on training set using grid search for number of features = [20,50,100,150,200]
grid_uni_train = GridSearchCV(pipe2, param_grid=param_grid_uni, cv=10).fit(X_train, y_train)

#Printing results
print("Best params:\n{}\n".format(grid_uni_train.best_params_))
print("Best cross-validation score: {:.2f}".format(grid_uni_train.best_score_))

#Collecting training scores
train_score_uni = grid_uni_train.cv_results_['mean_test_score']

 156 157 164 204 264 274] are constant.
 151 156 157 164 204 264 274] are constant.
 151 156 157 164 204 264 274] are constant.
 143 145 151 153 156 157 164 204 244 264 274] are constant.
 141 143 145 151 156 157 164 194 204 264 274] are constant.
 141 143 145 151 156 157 164 174 204 264 274] are constant.
 145 151 156 157 164 204 264 274] are constant.
 143 145 151 156 157 164 204 254 264 274] are constant.
 145 151 156 157 164 204 234 264 274] are constant.
  f = msb / msw
 151 154 156 157 164 204 264 274] are constant.
 156 157 164 204 264 274] are constant.
 151 156 157 164 204 264 274] are constant.
 151 156 157 164 204 264 274] are constant.
 143 145 151 153 156 157 164 204 244 264 274] are constant.
 141 143 145 151 156 157 164 194 204 264 274] are constant.
 141 143 145 151 156 157 164 174 204 264 274] are constant.
 145 151 156 157 164 204 264 274] are constant.
 143 145 151 156 157 164 204 254 264 274] are constant.
 145 151 156 157 164 204 234 264 274] are constant.
  f = ms

Best params:
{'select__k': 20}

Best cross-validation score: 0.65


 143 145 151 156 157 164 204 254 264 274] are constant.
 145 151 156 157 164 204 234 264 274] are constant.
  f = msb / msw
 151 154 156 157 164 204 264 274] are constant.
 156 157 164 204 264 274] are constant.


In [12]:
train_score_uni

array([0.65, 0.62, 0.62, 0.58, 0.58])

In [13]:
#Fitting classifier on training and validation set using grid search for number of features = [20,50,100,150,200]
grid_uni_test = GridSearchCV(pipe2, param_grid=param_grid_uni, cv=10).fit(X_trainval, y_trainval)

#Printing results
print("Best params:\n{}\n".format(grid_uni_test.best_params_))
print("Best cross-validation score: {:.2f}".format(grid_uni_test.best_score_))
print("Test set score: {:.2f}".format(grid_uni_test.score(X_test, y_test)))

#Collecting test scores
test_score_uni = grid_uni_test.cv_results_['mean_test_score']

 164 204 224 264 274] are constant.
 244 264 274] are constant.
 194 204 264 274] are constant.
 274] are constant.
 274] are constant.
 164 204 254 264 274] are constant.
 164 204 224 264 274] are constant.
 244 264 274] are constant.
 194 204 264 274] are constant.
 274] are constant.
 274] are constant.
 164 204 254 264 274] are constant.
 164 204 224 264 274] are constant.
 244 264 274] are constant.
 194 204 264 274] are constant.
 274] are constant.
 274] are constant.
 164 204 254 264 274] are constant.
 164 204 224 264 274] are constant.
 244 264 274] are constant.
 194 204 264 274] are constant.
 274] are constant.
 274] are constant.
 164 204 254 264 274] are constant.
 164 204 224 264 274] are constant.
 244 264 274] are constant.
 194 204 264 274] are constant.
 274] are constant.
 274] are constant.
 164 204 254 264 274] are constant.


Best params:
{'select__k': 20}

Best cross-validation score: 0.68
Test set score: 0.65




In [14]:
test_score_uni

array([0.68, 0.67, 0.63, 0.61, 0.59])

### Feature Selection : Model based

In [15]:
#Setting selection method for Model-based feature selection
selection_mod = SelectFromModel(estimator=RidgeClassifier(),threshold=-np.inf)

#Running pipeline
pipe3 = run_pipeline(sca=scaler,imp=imputer,sel=selection_mod,mod=model)

In [16]:
#Setting the parameter grid
param_grid_mod = {'select__max_features': number_features}

In [28]:
#Fitting classifier on training set using grid search for number of features = [20,50,100,150,200]
grid_mod_train = GridSearchCV(pipe3, param_grid=param_grid_mod, cv=10).fit(X_train, y_train)

#Printing results
print("Best params:\n{}\n".format(grid_mod_train.best_params_))
print("Best cross-validation score: {:.2f}".format(grid_mod_train.best_score_))

#Collecting training scores
train_score_mod = grid_mod_train.cv_results_['mean_test_score']


The least populated class in y has only 1 members, which is less than n_splits=10.



Best params:
{'select__max_features': 20}

Best cross-validation score: 0.63


In [29]:
train_score_mod

array([0.63, 0.63, 0.59, 0.57, 0.56])

In [30]:
#Fitting classifier on training and validation set using grid search for number of features = [20,50,100,150,200]
grid_mod_test = GridSearchCV(pipe3, param_grid=param_grid_mod, cv=10).fit(X_trainval, y_trainval)

#Printing results
print("Best params:\n{}\n".format(grid_mod_test.best_params_))
print("Best cross-validation score: {:.2f}".format(grid_mod_test.best_score_))
print("Test set score: {:.2f}".format(grid_mod_test.score(X_test, y_test)))

#Collecting test scores
test_score_mod = grid_mod_test.cv_results_['mean_test_score']


The least populated class in y has only 2 members, which is less than n_splits=10.



Best params:
{'select__max_features': 50}

Best cross-validation score: 0.63
Test set score: 0.61


In [31]:
test_score_mod

array([0.62, 0.63, 0.6 , 0.6 , 0.59])

### Feature Selection : Iterative 

In [21]:
#Selection selection method for Iterative feature selection
selection_iter = RFE(estimator=RidgeClassifier(),step=2)

#Running the pipeline
pipe4 = run_pipeline(sca=scaler,imp=imputer,sel=selection_iter,mod=model)

In [22]:
#Setting the parameter grid
param_grid_iter = {'select__n_features_to_select': number_features}

In [32]:
#Fitting classifier on training set using grid search for number of features = [20,50,100,150,200]
grid_iter_train = GridSearchCV(pipe4, param_grid=param_grid_iter, cv=10).fit(X_train, y_train)

#Printing results
print("Best params:\n{}\n".format(grid_iter_train.best_params_))
print("Best cross-validation score: {:.2f}".format(grid_iter_train.best_score_))

#Collecting training scores
train_score_iter = grid_iter_train.cv_results_['mean_test_score']


The least populated class in y has only 1 members, which is less than n_splits=10.



Best params:
{'select__n_features_to_select': 20}

Best cross-validation score: 0.63


In [33]:
#Fitting classifier on training and validation set using grid search for number of features = [20,50,100,150,200]
grid_iter_test = GridSearchCV(pipe4, param_grid=param_grid_iter, cv=10).fit(X_trainval, y_trainval)

#Printing results
print("Best params:\n{}\n".format(grid_iter_test.best_params_))
print("Best cross-validation score: {:.2f}".format(grid_iter_test.best_score_))
print("Test set score: {:.2f}".format(grid_iter_test.score(X_test, y_test)))

#Collecting test scores
test_score_iter = grid_iter_test.cv_results_['mean_test_score']


The least populated class in y has only 2 members, which is less than n_splits=10.



Best params:
{'select__n_features_to_select': 20}

Best cross-validation score: 0.66
Test set score: 0.72


### Showing tables of scores

In [35]:
print("All features:")
print("train score: {:.4f}".format(val_score))
print("test score: {:.4f}".format(test_score))

headerColor = 'grey'
rowEvenColor = 'lightgrey'
rowOddColor = 'white'

fig = go.Figure(data=[go.Table(
  header=dict(
    values=['<b>Features</b>','<b>Univariate Train</b>','<b>Univariate Train_Val (Test) </b>','<b>Model-based Train</b>','<b>Model-based Train_Val (Test)</b>','<b>Iterative Train</b>','<b>Iterative Train_Val (Test)</b>'],
    line_color='darkslategray',
    fill_color=headerColor,
    align=['left','center'],
    font=dict(color='white', size=15)
  ),
  cells=dict(
    #Adding the data (results)
    values=[
      number_features,
      train_score_uni,
      test_score_uni,
      train_score_mod,
      test_score_mod,
      train_score_iter,
      test_score_iter],
    line_color='darkslategray',
    # 2-D list of colors for alternating rows
    fill_color = [[rowOddColor,rowEvenColor,rowOddColor, rowEvenColor,rowOddColor]*5],
    align = ['left', 'center'],
    font = dict(color = 'darkslategray', size = 12),
    format=[".0f",".4f"]
    ))
])

fig.show()

All features:
train score: 0.6000
test score: 0.5841


In [36]:
#Printing results
print("For Univariate Statistics Feature Selection")
print("The best model is with features equal to {}".format(grid_uni_train.best_params_))
print("Best score for Testing data: {:.4f}".format(grid_uni_train.best_score_))
print("Best score Testing_Validation data: {:.4f}".format(grid_uni_test.best_score_))
print("Test-set score: {:.4f} \n".format(grid_uni_test.score(X_test, y_test)))

#Printing results
print("For Model-based Feature Selection")
print("The best model is with features equal to {}".format(grid_mod_train.best_params_))
print("Best score for Testing data: {:.4f}".format(grid_mod_train.best_score_))
print("Best score Testing_Validation data: {:.4f}".format(grid_mod_test.best_score_))
print("Test-set score: {:.4f} \n".format(grid_mod_test.score(X_test, y_test)))

#Printing results
print("For Iterative Feature Selection")
print("The best model is with features equal to {}".format(grid_iter_train.best_params_))
print("Best score for Testing data: {:.4f}".format(grid_iter_train.best_score_))
print("Best score Testing_Validation data: {:.4f}".format(grid_iter_test.best_score_))
print("Test-set score: {:.4f} \n".format(grid_iter_test.score(X_test, y_test)))

For Univariate Statistics Feature Selection
The best model is with features equal to {'select__k': 20}
Best score for Testing data: 0.6494
Best score Testing_Validation data: 0.6816
Test-set score: 0.6460 

For Model-based Feature Selection
The best model is with features equal to {'select__max_features': 20}
Best score for Testing data: 0.6260
Best score Testing_Validation data: 0.6312
Test-set score: 0.6106 

For Iterative Feature Selection
The best model is with features equal to {'select__n_features_to_select': 20}
Best score for Testing data: 0.6338
Best score Testing_Validation data: 0.6605
Test-set score: 0.7168 

