# Machine Learning II: Introduction to Supervised Classification Methods

# FULL SCOPE EXERCISE

## Load Data

In [3]:
# Supervised Learning Class

from sklearn.datasets import load_breast_cancer
import pandas as pd

# Load the breast-cancer dataset and show which fields and class names it exposes.
cancer = load_breast_cancer()
print(f"cancer.keys(): \n{cancer.keys()}")
print(cancer['target_names'])

# Tabular views of the raw arrays: one named column per feature, plus the labels.
data = pd.DataFrame(data=cancer['data'], columns=cancer['feature_names'])
target = pd.DataFrame(data=cancer['target'], columns=['target'])
data.head()

cancer.keys(): 
dict_keys(['data', 'target', 'frame', 'target_names', 'DESCR', 'feature_names', 'filename', 'data_module'])
['malignant' 'benign']


Unnamed: 0,mean radius,mean texture,mean perimeter,mean area,mean smoothness,mean compactness,mean concavity,mean concave points,mean symmetry,mean fractal dimension,...,worst radius,worst texture,worst perimeter,worst area,worst smoothness,worst compactness,worst concavity,worst concave points,worst symmetry,worst fractal dimension
0,17.99,10.38,122.8,1001.0,0.1184,0.2776,0.3001,0.1471,0.2419,0.07871,...,25.38,17.33,184.6,2019.0,0.1622,0.6656,0.7119,0.2654,0.4601,0.1189
1,20.57,17.77,132.9,1326.0,0.08474,0.07864,0.0869,0.07017,0.1812,0.05667,...,24.99,23.41,158.8,1956.0,0.1238,0.1866,0.2416,0.186,0.275,0.08902
2,19.69,21.25,130.0,1203.0,0.1096,0.1599,0.1974,0.1279,0.2069,0.05999,...,23.57,25.53,152.5,1709.0,0.1444,0.4245,0.4504,0.243,0.3613,0.08758
3,11.42,20.38,77.58,386.1,0.1425,0.2839,0.2414,0.1052,0.2597,0.09744,...,14.91,26.5,98.87,567.7,0.2098,0.8663,0.6869,0.2575,0.6638,0.173
4,20.29,14.34,135.1,1297.0,0.1003,0.1328,0.198,0.1043,0.1809,0.05883,...,22.54,16.67,152.2,1575.0,0.1374,0.205,0.4,0.1625,0.2364,0.07678


In [19]:
# Class balance: count how many samples carry each distinct target label.
import numpy as np 

np.unique(cancer['target'], return_counts=True)

(array([0, 1]), array([212, 357]))

## Hyperparameters and choices:

In [4]:
# Feature Selection?
# Non-transformed vs Standardization?
# Tune the parameters of the models

## Split in Train and Test

In [5]:
# Split the raw feature matrix and labels into training and test parts.
# random_state is fixed so the same partition is produced on every run;
# no test_size is given, so train_test_split's default proportion applies.
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(cancer['data'], cancer['target'], random_state=0)

## Feature Selection

In [6]:
# Feature Selection
import seaborn as sns
#display(pd.DataFrame(X_train, columns = data.columns).corr())

# Training rows as a labelled frame, with the training labels as a Series.
features = pd.DataFrame(data=X_train, columns=data.columns)
target = pd.Series(y_train)

# Absolute correlation of every feature with the label, keyed by column name.
corr_dict = {
  name: abs(features[name].corr(target))
  for name in list(features.columns)
}

# Features whose |correlation| with the label is below 0.5 are marked for removal.
cols_to_remove = [name for name, strength in corr_dict.items() if strength < 0.5]

cols_to_remove

['mean texture',
 'mean smoothness',
 'mean symmetry',
 'mean fractal dimension',
 'texture error',
 'smoothness error',
 'compactness error',
 'concavity error',
 'concave points error',
 'symmetry error',
 'fractal dimension error',
 'worst texture',
 'worst smoothness',
 'worst symmetry',
 'worst fractal dimension']

In [7]:
# Build the "feature selected" train/test sets.
# The full-feature split is redone with the same arguments as before.
X_train, X_test, y_train, y_test = train_test_split(
    cancer['data'], cancer['target'], random_state=0)

# Full feature table minus the columns flagged by the correlation filter.
c_data = pd.DataFrame(cancer['data'], columns=cancer['feature_names']).drop(columns=cols_to_remove)

# Same labels and same random_state as above, so the row partition presumably
# matches the full-feature split (y_train / y_test are simply reassigned).
X_train_featsel, X_test_featsel, y_train, y_test = train_test_split(
    c_data, cancer['target'], random_state=0)

## Standardization

In [8]:
# Standardization
from sklearn.preprocessing import StandardScaler

##### X_train (all features) #########
# Fit the scaler on the training rows only, then apply it to both splits,
# so the test split never influences the scaling statistics.
scale = StandardScaler().fit(features) # "features" is just the training data

X_train_standard = pd.DataFrame(scale.transform(X_train), columns = features.columns)
X_test_standard = pd.DataFrame(scale.transform(X_test), columns = features.columns)

##### X_train_featsel (selected features only) #########
# The reduced feature set needs its own scaler: it must be fit on, and applied
# to, exactly the retained columns (c_data.columns), not the full feature table.
features_featsel = pd.DataFrame(X_train_featsel, columns = c_data.columns)
scale = StandardScaler().fit(features_featsel)

X_train_featsel_standard = pd.DataFrame(scale.transform(X_train_featsel), columns = c_data.columns)
X_test_featsel_standard = pd.DataFrame(scale.transform(X_test_featsel), columns = c_data.columns)

## Model Selection

In [20]:
# Model Selection
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import LinearSVC
from sklearn.model_selection import cross_validate

import warnings
warnings.filterwarnings('ignore')

def model_picker(model_name, parameter):
  """Build an unfitted classifier for the requested model family.

  Parameters:
    model_name: 'KNN' or 'SVM'.
    parameter: hyperparameter value, passed as `n_neighbors` for 'KNN'
      and as `C` for 'SVM'.

  Returns:
    A new KNeighborsClassifier ('KNN') or LinearSVC ('SVM') instance.

  Raises:
    ValueError: if `model_name` is neither 'KNN' nor 'SVM'.
  """
  if model_name == 'KNN':
    classifier = KNeighborsClassifier(n_neighbors = parameter)
  elif model_name == 'SVM':
    classifier = LinearSVC(C = parameter)
  else:
    raise ValueError("I dont know this model")
  return classifier

# One row per (dataset variant, model, hyperparameter) with its mean CV recall.
model_selection = []
cv_tpr_list = []

# The four candidate versions of the training features.
train_data_dict = {'normal':X_train, 
              'featsel':X_train_featsel,
              'standard': X_train_standard,
              'featsel_standard': X_train_featsel_standard}

# Model families and the hyperparameter grid tried for each of them.
model_list = ['KNN', 'SVM']
model_para = {'KNN':[1,3,5,7,9], 'SVM':[0.001, 0.01, 0.1, 1, 10]}

for train_data_type, train_data in train_data_dict.items():
  print(train_data_type)
  for model_name in model_list:
    for p in model_para[model_name]:
      model = model_picker(model_name, p)

      # No separate model.fit() here: cross_validate fits the estimator itself
      # on every fold, so a prior fit on the whole training set is wasted work.
      # NOTE(review): scoring='recall' presumably treats label 1 as the positive
      # class, which the target_names printout suggests is 'benign' - confirm
      # that this (and not recall on 'malignant') is the metric wanted.
      cv_tpr = cross_validate(model,
                              train_data, y_train, scoring='recall',
                              cv = 10)['test_score'].mean()

      model_selection.append([train_data_type, model_name, str(p), cv_tpr])
      cv_tpr_list.append(cv_tpr)

# Some configurations can come back as NaN (see the recorded results), so use
# the NaN-aware reductions; the built-in max/min depend on where NaN sits.
best_tpr = np.nanmax(cv_tpr_list)
print(best_tpr)
print(np.nanmin(cv_tpr_list))

print('\n')
display(model_selection)
print('\n')
# Report every configuration that reaches the best recall (ties included).
for best in model_selection:
  if best[3] == best_tpr:
    print('the best model is: ', best)

normal
featsel
standard
featsel_standard
0.9962962962962962
0.8321937321937322




[['normal', 'KNN', '1', 0.9480056980056981],
 ['normal', 'KNN', '3', 0.9515669515669515],
 ['normal', 'KNN', '5', 0.9514245014245015],
 ['normal', 'KNN', '7', 0.9589743589743589],
 ['normal', 'KNN', '9', 0.9588319088319087],
 ['normal', 'SVM', '0.001', 0.9628205128205127],
 ['normal', 'SVM', '0.01', 0.958119658119658],
 ['normal', 'SVM', '0.1', 0.9733618233618234],
 ['normal', 'SVM', '1', 0.9219373219373219],
 ['normal', 'SVM', '10', 0.962962962962963],
 ['featsel', 'KNN', '1', 0.9366096866096866],
 ['featsel', 'KNN', '3', 0.9401709401709402],
 ['featsel', 'KNN', '5', 0.9514245014245015],
 ['featsel', 'KNN', '7', 0.9589743589743589],
 ['featsel', 'KNN', '9', 0.9588319088319087],
 ['featsel', 'SVM', '0.001', 0.9662393162393161],
 ['featsel', 'SVM', '0.01', 0.8737891737891739],
 ['featsel', 'SVM', '0.1', 0.8321937321937322],
 ['featsel', 'SVM', '1', 0.9259259259259259],
 ['featsel', 'SVM', '10', 0.9353276353276353],
 ['standard', 'KNN', '1', nan],
 ['standard', 'KNN', '3', nan],
 ['stand



the best model is:  ['standard', 'SVM', '0.01', 0.9962962962962962]


In [10]:
# Evaluate the selected configuration (SVM, C = 0.01, standardized full feature
# set): train on the whole training split, then score the held-out test split.
# NOTE(review): selection above used recall, while `score` is presumably the
# estimator's default metric (accuracy for sklearn classifiers) - confirm which
# metric should be reported here.
model = model_picker('SVM', 0.01).fit(X_train_standard, y_train)
model.score(X_test_standard, y_test)

0.972027972027972

## How do I "share"/use what I just did?

In [11]:
# Supervised Learning Class

from sklearn.datasets import load_breast_cancer
import pandas as pd

# Reload the dataset for the sharing section and print its fields and class names.
cancer = load_breast_cancer()
print("cancer.keys(): \n{}".format(cancer.keys()))
print(cancer['target_names'])

# Feature table with named columns, and the labels as a one-column frame.
# (A bare `data.head()` here was removed: it is not the last expression of the
# cell, so it never displayed anything.)
data = pd.DataFrame(cancer['data'], columns = cancer['feature_names'])
target = pd.DataFrame(cancer['target'], columns = ['target'])

# Columns excluded by feature selection. This hard-coded list matches the output
# of the correlation filter earlier in the notebook (|corr with target| < 0.5 on
# the training split), so this section does not need to recompute it.
cols_to_remove = ['mean texture', 
                'mean smoothness', 
                'mean symmetry', 
                'mean fractal dimension', 
                'texture error', 
                'smoothness error', 
                'compactness error', 
                'concavity error', 
                'concave points error', 
                'symmetry error', 
                'fractal dimension error', 
                'worst texture', 
                'worst smoothness', 
                'worst symmetry', 
                'worst fractal dimension']

def model_picker(model_name, parameter):
  """Return a fresh, unfitted classifier selected by name.

  Parameters:
    model_name: 'KNN' or 'SVM'.
    parameter: value used as `n_neighbors` ('KNN') or as `C` ('SVM').

  Returns:
    KNeighborsClassifier for 'KNN', LinearSVC for 'SVM'.

  Raises:
    ValueError: for any other `model_name`.
  """
  if model_name == 'KNN':
    return KNeighborsClassifier(n_neighbors = parameter)
  # Anything that is not 'SVM' at this point is an unsupported model name.
  if model_name != 'SVM':
    raise ValueError("I dont know this model")
  return LinearSVC(C = parameter)

cancer.keys(): 
dict_keys(['data', 'target', 'frame', 'target_names', 'DESCR', 'feature_names', 'filename', 'data_module'])
['malignant' 'benign']


In [12]:
# Rebuild the final pipeline on the complete dataset (no train/test split here).
# NOTE(review): the model-selection output reported the 'standard' variant (all
# features) as best, whereas this pipeline also drops the feature-selected
# columns before scaling - confirm which variant is meant to be shipped.
target = pd.Series(cancer['target'])

# Step 1 - feature selection: report the excluded columns, then drop them.
print(f'The columns to remove are {cols_to_remove}')
features = pd.DataFrame(cancer['data'], columns=data.columns).drop(columns=cols_to_remove)

# Step 2 - standardization: refit the scaler on all rows of the kept columns.
scale = StandardScaler().fit(features)
print(f'The Standardizer we should use is: {scale}')

# Step 3 - model: train the chosen SVM (C = 0.01) on the standardized features.
# Its fitted internal parameters are what gets saved to a file afterwards.
features = pd.DataFrame(scale.transform(features), columns=features.columns)
model = LinearSVC(C=0.01)
model.fit(features, target)
print('The model we should use is: ', model)
# To reuse this later, load the three pieces above: column list, scaler, model.

######################################
## SAVE THIS FUNCTION IN A .py FILE  or do this with another student ##
import pandas as pd
import pickle
from sklearn.preprocessing import StandardScaler

def pre_processing(features, scaler_path='Scaler_for_cancer.sav'):
  """Apply the saved feature selection and scaling to a raw feature table.

  Parameters:
    features: pandas DataFrame that contains every column listed in
      `cols_to_remove` below; the remaining columns are handed to the scaler.
    scaler_path: path of the pickled scaler object. Defaults to
      'Scaler_for_cancer.sav'.

  Returns:
    A new DataFrame built from the scaler's `transform` output, labelled with
    the names of the retained columns. The input frame is not modified.

  Raises:
    KeyError: raised by `DataFrame.drop` if an expected column is missing.
    FileNotFoundError: if `scaler_path` does not exist.
  """

  # Feature Selection
  cols_to_remove = ['mean texture', 
                  'mean smoothness', 
                  'mean symmetry', 
                  'mean fractal dimension', 
                  'texture error', 
                  'smoothness error', 
                  'compactness error', 
                  'concavity error', 
                  'concave points error', 
                  'symmetry error', 
                  'fractal dimension error', 
                  'worst texture', 
                  'worst smoothness', 
                  'worst symmetry', 
                  'worst fractal dimension']
  
  features = features.drop(cols_to_remove, axis=1)

  # Feature Scaling
  # The context manager closes the file handle even if unpickling fails.
  # NOTE: unpickling can execute arbitrary code - only load scaler files
  # that come from a trusted source.
  with open(scaler_path, 'rb') as scaler_file:
    scale = pickle.load(scaler_file)
  return pd.DataFrame(scale.transform(features) , columns = features.columns)

######################################

The columns to remove are ['mean texture', 'mean smoothness', 'mean symmetry', 'mean fractal dimension', 'texture error', 'smoothness error', 'compactness error', 'concavity error', 'concave points error', 'symmetry error', 'fractal dimension error', 'worst texture', 'worst smoothness', 'worst symmetry', 'worst fractal dimension']
The Standardizer we should use is: StandardScaler()
The model we should use is:  LinearSVC(C=0.01)


In [13]:
import pickle

# save the Standardizer to disk
# (the `with` block flushes and closes the file as soon as the dump is done)
filename = 'Scaler_for_cancer.sav'
with open(filename, 'wb') as scaler_file:
  pickle.dump(scale, scaler_file)

# save the model to disk
filename = 'Model_for_cancer.sav'
with open(filename, 'wb') as model_file:
  pickle.dump(model, model_file)

With this, you have every element you need to run the pipeline again: the trained model (`Model_for_cancer.sav`), the fitted scaler (`Scaler_for_cancer.sav`), and the preprocessing code (`pre_processing`).