In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np

In [2]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score, cross_validate, GridSearchCV
from sklearn.preprocessing import MinMaxScaler
from sklearn.svm import SVC
from sklearn import metrics
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.neural_network import MLPClassifier

In [3]:
data = pd.read_csv(r'C:\Users\durga\Desktop\uOttawa\Term1\ML_HernaViktor\Assignments\Assignment2\heart_cleveland_upload.csv')

In [4]:
#convert numeric data to float datatype
data = data.astype(str).astype(float)
data.dtypes

age          float64
sex          float64
cp           float64
trestbps     float64
chol         float64
fbs          float64
restecg      float64
thalach      float64
exang        float64
oldpeak      float64
slope        float64
ca           float64
thal         float64
condition    float64
dtype: object

In [5]:
#feature selection - remove least variance
L = pd.DataFrame(data.drop(columns='condition').var())
L = L.sort_values(by=[0], ascending=False)
L

Unnamed: 0,0
chol,2703.748589
thalach,526.31527
trestbps,315.51729
age,81.897716
oldpeak,1.359842
restecg,0.989853
cp,0.930954
thal,0.915256
ca,0.881654
slope,0.382155


In [6]:
#dropping values with variance < 0.15(variance < 0.15 means the data is constant for more than 85% of the data)
data = data.drop(columns=['fbs'])
data

Unnamed: 0,age,sex,cp,trestbps,chol,restecg,thalach,exang,oldpeak,slope,ca,thal,condition
0,69.0,1.0,0.0,160.0,234.0,2.0,131.0,0.0,0.1,1.0,1.0,0.0,0.0
1,69.0,0.0,0.0,140.0,239.0,0.0,151.0,0.0,1.8,0.0,2.0,0.0,0.0
2,66.0,0.0,0.0,150.0,226.0,0.0,114.0,0.0,2.6,2.0,0.0,0.0,0.0
3,65.0,1.0,0.0,138.0,282.0,2.0,174.0,0.0,1.4,1.0,1.0,0.0,1.0
4,64.0,1.0,0.0,110.0,211.0,2.0,144.0,1.0,1.8,1.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
292,40.0,1.0,3.0,152.0,223.0,0.0,181.0,0.0,0.0,0.0,0.0,2.0,1.0
293,39.0,1.0,3.0,118.0,219.0,0.0,140.0,0.0,1.2,1.0,0.0,2.0,1.0
294,35.0,1.0,3.0,120.0,198.0,0.0,130.0,1.0,1.6,1.0,0.0,2.0,1.0
295,35.0,0.0,3.0,138.0,183.0,0.0,182.0,0.0,1.4,0.0,0.0,0.0,0.0


In [7]:
#feature scaling
scaler = MinMaxScaler()
scaler.fit(data)
D_transformed = scaler.transform(data)

In [8]:
D_transformed = pd.DataFrame(D_transformed, columns=list(data.columns.values))
D_transformed

Unnamed: 0,age,sex,cp,trestbps,chol,restecg,thalach,exang,oldpeak,slope,ca,thal,condition
0,0.833333,1.0,0.0,0.622642,0.246575,1.0,0.458015,0.0,0.016129,0.5,0.333333,0.0,0.0
1,0.833333,0.0,0.0,0.433962,0.257991,0.0,0.610687,0.0,0.290323,0.0,0.666667,0.0,0.0
2,0.770833,0.0,0.0,0.528302,0.228311,0.0,0.328244,0.0,0.419355,1.0,0.000000,0.0,0.0
3,0.750000,1.0,0.0,0.415094,0.356164,1.0,0.786260,0.0,0.225806,0.5,0.333333,0.0,1.0
4,0.729167,1.0,0.0,0.150943,0.194064,1.0,0.557252,1.0,0.290323,0.5,0.000000,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
292,0.229167,1.0,1.0,0.547170,0.221461,0.0,0.839695,0.0,0.000000,0.0,0.000000,1.0,1.0
293,0.208333,1.0,1.0,0.226415,0.212329,0.0,0.526718,0.0,0.193548,0.5,0.000000,1.0,1.0
294,0.125000,1.0,1.0,0.245283,0.164384,0.0,0.450382,1.0,0.258065,0.5,0.000000,1.0,1.0
295,0.125000,0.0,1.0,0.415094,0.130137,0.0,0.847328,0.0,0.225806,0.0,0.000000,0.0,0.0


In [9]:
X = D_transformed.drop(columns=['condition']).values
y = D_transformed['condition'].values

In [10]:
#SVM
#mentioning default params
clf = SVC(kernel="rbf", C=1,random_state=0, gamma='scale')
scores = cross_val_score(clf, X, y, cv=10)
scores
#finding default gamma
gamma = 1 / (X.shape[1] * X.var())
gamma

0.5859887049917147

In [11]:
#The mean score and the standard deviation are hence given by:
print("%0.2f accuracy with a standard deviation of %0.2f" % (scores.mean(), scores.std()))

0.81 accuracy with a standard deviation of 0.10


In [12]:
hparams = {"C": (0.0001, 10000), "gamma": (0.0001, 10000)}
def estimator(C, gamma):
    # initialize model
    model = SVC(C=C, gamma=gamma, degree=1, random_state=0)
    # set in cross-validation
    result = cross_validate(model, X, y, cv=10)
    # result is mean of test_score
    return np.mean(result['test_score'])
from bayes_opt import BayesianOptimization
# give model and hyperparameter to optmizer
svc_bayesopt = BayesianOptimization(estimator, hparams)

In [13]:
svc_bayesopt.maximize(init_points=5, n_iter=50, acq='ucb')
svc_bayesopt.max

|   iter    |  target   |     C     |   gamma   |
-------------------------------------------------
| [0m1        [0m | [0m0.5389   [0m | [0m5.744e+03[0m | [0m7.246e+03[0m |
| [0m2        [0m | [0m0.5389   [0m | [0m2.132e+03[0m | [0m1.008e+03[0m |
| [0m3        [0m | [0m0.5389   [0m | [0m7.288e+03[0m | [0m500.5    [0m |
| [0m4        [0m | [0m0.5389   [0m | [0m6.393e+03[0m | [0m9.643e+03[0m |
| [0m5        [0m | [0m0.5389   [0m | [0m9.611e+03[0m | [0m9.592e+03[0m |
| [0m6        [0m | [0m0.5389   [0m | [0m19.99    [0m | [0m9.872e+03[0m |
| [95m7        [0m | [95m0.5455   [0m | [95m9.994e+03[0m | [95m88.16    [0m |
| [0m8        [0m | [0m0.5389   [0m | [0m9.935e+03[0m | [0m213.9    [0m |
| [0m9        [0m | [0m0.5389   [0m | [0m9.98e+03 [0m | [0m214.4    [0m |
| [0m10       [0m | [0m0.5389   [0m | [0m4.396e+03[0m | [0m3.873e+03[0m |
| [0m11       [0m | [0m0.5455   [0m | [0m9.842e+03[0m | [0m110.5    

{'target': 0.7645977011494253,
 'params': {'C': 9703.943106249999, 'gamma': 0.6904447753469372}}

In [14]:
#SVM
#{'target': 0.8382758620689655,'params': {'C': 9.993231110534596, 'gamma': 0.025674348882233595}}
#inserting our optimized params
clf = SVC(kernel="rbf", C=9.993231110534596,random_state=0, gamma=0.025674348882233595)
scores = cross_val_score(clf, X, y, cv=10)
scores

array([0.56666667, 0.83333333, 0.93333333, 0.9       , 0.83333333,
       0.83333333, 1.        , 0.86206897, 0.82758621, 0.79310345])

In [15]:
#The mean score and the standard deviation are hence given by:
print("%0.2f accuracy with a standard deviation of %0.2f" % (scores.mean(), scores.std()))

0.84 accuracy with a standard deviation of 0.11


In [16]:
#DT
clf = DecisionTreeClassifier(criterion='gini',max_depth=4,random_state=0)
scores = cross_val_score(clf, X, y, cv=10)
scores

array([0.56666667, 0.5       , 0.63333333, 0.86666667, 0.76666667,
       0.93333333, 0.9       , 0.75862069, 0.48275862, 0.79310345])

In [17]:
#The mean score and the standard deviation are hence given by:
print("%0.2f accuracy with a standard deviation of %0.2f" % (scores.mean(), scores.std()))

0.72 accuracy with a standard deviation of 0.16


In [18]:
hparams = {"min_samples_leaf": (0.1, 0.5), "min_samples_split": (0.1, 1.0)}
def estimator(min_samples_leaf, min_samples_split):
    # initialize model
    model = DecisionTreeClassifier(min_samples_leaf=min_samples_leaf, min_samples_split=min_samples_split)
    # set in cross-validation
    result = cross_validate(model, X, y, cv=10)
    # result is mean of test_score
    return np.mean(result['test_score'])
from bayes_opt import BayesianOptimization
# give model and hyperparameter to optmizer
dt_bayesopt = BayesianOptimization(estimator, hparams)

In [19]:
dt_bayesopt.maximize(init_points=5, n_iter=50, acq='ucb')
dt_bayesopt.max

|   iter    |  target   | min_sa... | min_sa... |
-------------------------------------------------
| [0m1        [0m | [0m0.659    [0m | [0m0.2118   [0m | [0m0.3837   [0m |
| [0m2        [0m | [0m0.6323   [0m | [0m0.1394   [0m | [0m0.3609   [0m |
| [0m3        [0m | [0m0.6285   [0m | [0m0.4798   [0m | [0m0.8326   [0m |
| [0m4        [0m | [0m0.6356   [0m | [0m0.1451   [0m | [0m0.4924   [0m |
| [0m5        [0m | [0m0.659    [0m | [0m0.1452   [0m | [0m0.8123   [0m |
| [0m6        [0m | [0m0.659    [0m | [0m0.2803   [0m | [0m0.8668   [0m |
| [0m7        [0m | [0m0.659    [0m | [0m0.1797   [0m | [0m0.7348   [0m |
| [95m8        [0m | [95m0.7936   [0m | [95m0.4638   [0m | [95m0.4017   [0m |
| [0m9        [0m | [0m0.5526   [0m | [0m0.5      [0m | [0m0.2562   [0m |
| [0m10       [0m | [0m0.659    [0m | [0m0.2238   [0m | [0m0.4667   [0m |
| [0m11       [0m | [0m0.659    [0m | [0m0.3432   [0m | [0m0.928    

{'target': 0.8039080459770116,
 'params': {'min_samples_leaf': 0.4774639479283226,
  'min_samples_split': 0.436424870584057}}

In [20]:
#{'target': 0.8004597701149425,'params': {'min_samples_leaf': 0.47378765131749734,'min_samples_split': 0.4816994460771336}}
clf_t = DecisionTreeClassifier()
hparams = {"criterion": ["gini", "entropy", "log_loss"], 
           "max_depth": [1,2,8,16,32],
          "min_samples_split": [0.1,0.4816994460771336,0.5],
          "min_samples_leaf":[0.1,0.47378765131749734,0.5]}

In [21]:
GS_estimator = GridSearchCV(clf_t, hparams, cv=10, scoring="accuracy")
GS_estimator.fit(X, y)
GS_estimator.best_params_

{'criterion': 'gini',
 'max_depth': 1,
 'min_samples_leaf': 0.47378765131749734,
 'min_samples_split': 0.1}

In [22]:
#DT
#best params {"criterion": 'gini', "max_depth": 1,"min_samples_split": 0.1,"min_samples_leaf": 0.47378765131749734}
clf = DecisionTreeClassifier(criterion='gini',max_depth=1,random_state=0, min_samples_leaf=0.47378765131749734, min_samples_split=0.1,splitter='best')
scores = cross_val_score(clf, X, y, cv=10)
scores

array([0.36666667, 0.7       , 0.8       , 1.        , 1.        ,
       1.        , 1.        , 0.79310345, 0.68965517, 0.65517241])

In [23]:
#The mean score and the standard deviation are hence given by:
print("%0.2f accuracy with a standard deviation of %0.2f" % (scores.mean(), scores.std()))

0.80 accuracy with a standard deviation of 0.20


In [24]:
#k-neighbors
#default: K=5
clf = KNeighborsClassifier()
scores = cross_val_score(clf, X, y, cv=10)
scores

array([0.5       , 0.86666667, 0.93333333, 0.9       , 0.73333333,
       0.83333333, 0.93333333, 0.93103448, 0.68965517, 0.75862069])

In [25]:
#The mean score and the standard deviation are hence given by:
print("%0.2f accuracy with a standard deviation of %0.2f" % (scores.mean(), scores.std()))

0.81 accuracy with a standard deviation of 0.13


In [26]:
#List Hyperparameters that we want to tune.
leaf_size = [1,5,10,15,20,25,30,35,40,45,50]
n_neighbors = [1,5,10,15,20,25,30]
p=[1,2]
weights = ["uniform", "distance"]
algorithm = ["auto", "ball_tree", "kd_tree", "brute"]
#Convert to dictionary
hyperparameters = dict(leaf_size=leaf_size, n_neighbors=n_neighbors, p=p, weights=weights, algorithm=algorithm)
#Create new KNN object
knn_2 = KNeighborsClassifier()
#Use GridSearch
clf_t = GridSearchCV(knn_2, hyperparameters, cv=10)

In [27]:
#Fit the model
best_model = clf_t.fit(X,y)

In [32]:
print("leaf-size:", best_model.best_estimator_.get_params()['leaf_size'],
     "\np:", best_model.best_estimator_.get_params()['p'],
     "\nn_neighbors:", best_model.best_estimator_.get_params()['n_neighbors'],
     "\nweights: ", best_model.best_estimator_.get_params()['weights'],
     "\nalgorithm: ",best_model.best_estimator_.get_params()['algorithm']) 

leaf-size: 1 
p: 1 
n_neighbors: 20 
weights:  distance 
algorithm:  auto


In [33]:
#k-neighbors
#best params: leaf_size = 1, n_neighbors = 20, p=1, weights = distance", algorithm = "auto"
clf = KNeighborsClassifier(leaf_size=1, p=1, n_neighbors=20, weights='distance', algorithm='auto')
scores = cross_val_score(clf, X, y, cv=10)
scores

array([0.5       , 0.86666667, 0.9       , 0.9       , 0.73333333,
       0.86666667, 0.96666667, 0.93103448, 0.79310345, 0.82758621])

In [34]:
#The mean score and the standard deviation are hence given by:
print("%0.2f accuracy with a standard deviation of %0.2f" % (scores.mean(), scores.std()))

0.83 accuracy with a standard deviation of 0.13


In [45]:
#Random forest
clf = RandomForestClassifier()
scores = cross_val_score(clf, X, y, cv=10)
scores

array([0.6       , 0.66666667, 0.83333333, 0.86666667, 0.8       ,
       0.93333333, 0.96666667, 0.82758621, 0.55172414, 0.82758621])

In [46]:
#The mean score and the standard deviation are hence given by:
print("%0.2f accuracy with a standard deviation of %0.2f" % (scores.mean(), scores.std()))

0.79 accuracy with a standard deviation of 0.13


In [74]:
hparams = {"min_samples_leaf": (0.1, 0.5), "min_samples_split": (0.1, 1.0)}
def estimator(min_samples_leaf, min_samples_split):
    # initialize model
    model = RandomForestClassifier(min_samples_leaf=min_samples_leaf, min_samples_split=min_samples_split)
    # set in cross-validation
    result = cross_validate(model, X, y, cv=10)
    # result is mean of test_score
    return np.mean(result['test_score'])
from bayes_opt import BayesianOptimization
# give model and hyperparameter to optmizer
rf_bayesopt = BayesianOptimization(estimator, hparams)

In [75]:
rf_bayesopt.maximize(init_points=5, n_iter=50, acq='ucb')
rf_bayesopt.max

|   iter    |  target   | min_sa... | min_sa... |
-------------------------------------------------
| [0m1        [0m | [0m0.7872   [0m | [0m0.2336   [0m | [0m0.1047   [0m |
| [0m2        [0m | [0m0.7739   [0m | [0m0.2985   [0m | [0m0.5174   [0m |
| [0m3        [0m | [0m0.5389   [0m | [0m0.4463   [0m | [0m0.346    [0m |
| [95m4        [0m | [95m0.8075   [0m | [95m0.2075   [0m | [95m0.1518   [0m |
| [0m5        [0m | [0m0.5389   [0m | [0m0.2569   [0m | [0m0.76     [0m |
| [0m6        [0m | [0m0.8007   [0m | [0m0.1      [0m | [0m0.1      [0m |
| [95m7        [0m | [95m0.818    [0m | [95m0.1      [0m | [95m0.3227   [0m |
| [0m8        [0m | [0m0.8011   [0m | [0m0.1      [0m | [0m0.4979   [0m |
| [0m9        [0m | [0m0.5389   [0m | [0m0.5      [0m | [0m1.0      [0m |
| [0m10       [0m | [0m0.8077   [0m | [0m0.2063   [0m | [0m0.3969   [0m |
| [0m11       [0m | [0m0.5389   [0m | [0m0.1      [0m | [0m1.0   

{'target': 0.8314942528735634,
 'params': {'min_samples_leaf': 0.10154279341986147,
  'min_samples_split': 0.5592278993918031}}

In [76]:
#{'target': 0.8348275862068967,'params': {'min_samples_leaf': 0.23626468633236386, 'min_samples_split': 0.5174498346335343}}
clf_rf = RandomForestClassifier()
hparams_rf = {"criterion": ["gini", "entropy", "log_loss"], 
           "max_depth": [1,2,8,16,32],
          "min_samples_split": [0.1,0.5174498346335343,0.5],
          "min_samples_leaf":[0.1,0.23626468633236386,0.5]}

In [77]:
GS_estimator = GridSearchCV(clf_rf, hparams_rf, cv=10, scoring="accuracy")
GS_estimator.fit(X, y)
print(GS_estimator.best_params_)

{'criterion': 'entropy', 'max_depth': 1, 'min_samples_leaf': 0.1, 'min_samples_split': 0.1}


In [78]:
#Random forest: 
#{'criterion': 'log_loss', 'max_depth': 8, 'min_samples_leaf': 0.1, 'min_samples_split': 0.5} - 82
clf = RandomForestClassifier(criterion='log_loss',max_depth=8,min_samples_leaf=0.1, min_samples_split=0.5)
scores = cross_val_score(clf, X, y, cv=10)
scores

array([0.6       , 0.63333333, 0.83333333, 0.96666667, 0.83333333,
       0.9       , 1.        , 0.82758621, 0.72413793, 0.75862069])

In [79]:
#The mean score and the standard deviation are hence given by:
print("%0.2f accuracy with a standard deviation of %0.2f" % (scores.mean(), scores.std()))

0.81 accuracy with a standard deviation of 0.12


In [60]:
#MLP
clf = MLPClassifier(random_state=1, max_iter=3000)
scores = cross_val_score(clf, X, y, cv=10)
scores

array([0.53333333, 0.7       , 0.83333333, 0.8       , 0.76666667,
       0.86666667, 0.83333333, 0.82758621, 0.72413793, 0.86206897])

In [61]:
#The mean score and the standard deviation are hence given by:
print("%0.2f accuracy with a standard deviation of %0.2f" % (scores.mean(), scores.std()))

0.77 accuracy with a standard deviation of 0.10


In [62]:
mlp = MLPClassifier(max_iter=1000)
parameter_space = {
    'activation': ['logistic','tanh', 'relu'],
    'solver': ['lbfgs','sgd', 'adam'],
    'alpha': [0.0001, 0.05],
    'learning_rate': ['constant','adaptive'],
}
clf_t = GridSearchCV(mlp, parameter_space, n_jobs=-1, cv=10)
clf_t.fit(X, y)
clf_t.best_params_

{'activation': 'tanh',
 'alpha': 0.0001,
 'learning_rate': 'adaptive',
 'solver': 'adam'}

In [67]:
#MLP
#best params: 'activation': tanh','solver': 'sgd','alpha': 0.05,'learning_rate': 'adaptive'
clf = MLPClassifier(max_iter=3000, activation='tanh', alpha=0.05, learning_rate='adaptive',solver='sgd')
scores = cross_val_score(clf, X, y, cv=10)
scores

array([0.46666667, 0.83333333, 0.9       , 0.86666667, 0.8       ,
       0.86666667, 1.        , 0.86206897, 0.82758621, 0.82758621])

In [68]:
#The mean score and the standard deviation are hence given by:
print("%0.2f accuracy with a standard deviation of %0.2f" % (scores.mean(), scores.std()))

0.83 accuracy with a standard deviation of 0.13


In [69]:
#GradientBoostingClassifier
clf = GradientBoostingClassifier(n_estimators=100, learning_rate=1.0,max_depth=1, random_state=0)
scores = cross_val_score(clf, X, y, cv=10)
scores

array([0.56666667, 0.7       , 0.76666667, 0.76666667, 0.8       ,
       0.96666667, 0.86666667, 0.82758621, 0.48275862, 0.79310345])

In [70]:
#The mean score and the standard deviation are hence given by:
print("%0.2f accuracy with a standard deviation of %0.2f" % (scores.mean(), scores.std()))

0.75 accuracy with a standard deviation of 0.13


In [71]:
gbc = GradientBoostingClassifier()
parameters = {
    "n_estimators":[5,50,250,500],
    "max_depth":[1,3,5,7,9],
    "learning_rate":[0.01,0.1,1,10,100]
}
clf_t = GridSearchCV(gbc,parameters,cv=10)
clf_t.fit(X,y)
clf_t.best_params_

{'learning_rate': 0.1, 'max_depth': 1, 'n_estimators': 50}

In [72]:
#GradientBoostingClassifier
#best parameters = {"n_estimators": 50,"max_depth":1,"learning_rate":0.1}
clf = GradientBoostingClassifier(n_estimators=50, learning_rate=0.1,max_depth=1, random_state=0)
scores = cross_val_score(clf, X, y, cv=10)
scores

array([0.63333333, 0.76666667, 0.9       , 0.9       , 0.83333333,
       0.9       , 1.        , 0.82758621, 0.79310345, 0.86206897])

In [73]:
#The mean score and the standard deviation are hence given by:
print("%0.2f accuracy with a standard deviation of %0.2f" % (scores.mean(), scores.std()))

0.84 accuracy with a standard deviation of 0.09
