### Hyperparameter Tuning = external configuration variables
### Model score chance increase just because we use hyperparameter tuning
### Bayesian optimization
### Grid search
### Random search


### increase accuracy

In [11]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [12]:
df = pd.read_csv('heart.csv')
df.shape

(303, 14)

In [13]:
df.columns

Index(['age', 'sex', 'cp', 'trestbps', 'chol', 'fbs', 'restecg', 'thalach',
       'exang', 'oldpeak', 'slope', 'ca', 'thal', 'target'],
      dtype='object')

In [14]:
df.head(10)

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,target
0,63,1,3,145,233,1,0,150,0,2.3,0,0,1,1
1,37,1,2,130,250,0,1,187,0,3.5,0,0,2,1
2,41,0,1,130,204,0,0,172,0,1.4,2,0,2,1
3,56,1,1,120,236,0,1,178,0,0.8,2,0,2,1
4,57,0,0,120,354,0,1,163,1,0.6,2,0,2,1
5,57,1,0,140,192,0,1,148,0,0.4,1,0,1,1
6,56,0,1,140,294,0,0,153,0,1.3,1,0,2,1
7,44,1,1,120,263,0,1,173,0,0.0,2,0,3,1
8,52,1,2,172,199,1,1,162,0,0.5,2,0,3,1
9,57,1,2,150,168,0,1,174,0,1.6,2,0,2,1


In [15]:
x = df.iloc[:,0:-1]
y = df.iloc[:,-1]

In [16]:
y

0      1
1      1
2      1
3      1
4      1
      ..
298    0
299    0
300    0
301    0
302    0
Name: target, Length: 303, dtype: int64

In [17]:
from sklearn.model_selection import train_test_split

In [18]:
x_train , x_test , y_train , y_test = train_test_split(x,y,test_size =0.2,random_state=59)

In [19]:
len(x_train)


242

In [20]:
len(x_test)

61

In [21]:
from sklearn.ensemble import RandomForestClassifier ,GradientBoostingClassifier 
from sklearn.metrics import accuracy_score 

from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression

In [22]:
rf = RandomForestClassifier()
gb = GradientBoostingClassifier()
svm = SVC()
lr = LogisticRegression()

In [23]:
rf

In [24]:
rf.fit(x_train , y_train)
y_pred = rf.predict(x_test)
accuracy_score(y_test,y_pred)

0.8688524590163934

In [25]:
y_test

107    1
136    1
185    0
3      1
238    0
      ..
29     1
211    0
1      1
82     1
202    0
Name: target, Length: 61, dtype: int64

In [37]:
y_pred

array([1, 1, 0, 1, 0, 1, 0, 0, 1, 0, 1, 1, 1, 0, 0, 1, 0, 0, 0, 1, 1, 1,
       1, 0, 0, 0, 1, 1, 1, 0, 1, 0, 1, 1, 1, 0, 0, 0, 0, 1, 1, 1, 0, 1,
       1, 0, 1, 1, 1, 1, 0, 1, 1, 1, 1, 0, 1, 0, 0, 1, 0])

In [26]:
gb.fit(x_train , y_train)
y_pred = gb.predict(x_test)
accuracy_score(y_test,y_pred)

0.8032786885245902

In [27]:
svm.fit(x_train , y_train)
y_pred = svm.predict(x_test)
accuracy_score(y_test,y_pred)

0.6065573770491803

In [28]:
lr.fit(x_train , y_train)
y_pred = lr.predict(x_test)
accuracy_score(y_test,y_pred)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


0.8852459016393442

In [29]:
rf = RandomForestClassifier(max_samples = 0.75, random_state=42)
rf.fit(x_train , y_train)
y_pred = rf.predict(x_test)
accuracy_score(y_test,y_pred)

0.8852459016393442

In [30]:
from sklearn.model_selection import cross_val_score

In [31]:
np.mean(cross_val_score(RandomForestClassifier() , x , y , cv=5 , scoring='accuracy'))

np.float64(0.8150819672131149)

In [32]:
np.mean(cross_val_score(RandomForestClassifier(max_samples=0.75) , x , y , cv=5 , scoring='accuracy'))

np.float64(0.8248087431693989)

In [33]:
n_estimators = [20,60,100,120] # numbers of trees 
max_features = [0.2,0.6,1.0] # number of featues consider at every split ( columns )
max_depth = [2,8,None] # maximumNumber of levels in tree
max_samples = [0.5,0.75,1.0] # Number of samples

# 108 ( 4x3x3x3) random forest train

In [34]:
param_grid = {
    'n_estimators':n_estimators,
    'max_features':max_features,
    'max_depth':max_depth,
    'max_samples':max_samples
}
print(param_grid)


{'n_estimators': [20, 60, 100, 120], 'max_features': [0.2, 0.6, 1.0], 'max_depth': [2, 8, None], 'max_samples': [0.5, 0.75, 1.0]}


In [35]:
from sklearn.model_selection import GridSearchCV

In [36]:
rf_g = GridSearchCV(estimator = rf , 
                                param_grid = param_grid,
                                cv=5,
                                verbose=2,
                                n_jobs=-1)

In [85]:
rf_g.fit(x_train,y_train)

Fitting 5 folds for each of 108 candidates, totalling 540 fits


In [86]:
rf_g.best_params_

{'max_depth': 8, 'max_features': 0.2, 'max_samples': 0.5, 'n_estimators': 60}

In [87]:
from sklearn.model_selection import RandomizedSearchCV

In [90]:
rf_g = RandomizedSearchCV(estimator = rf , 
                                param_distributions = param_grid,
                                cv=5,
                                verbose=2,
                                n_jobs=-1)

In [92]:
rf_g.fit(x_train , y_train)

Fitting 5 folds for each of 10 candidates, totalling 50 fits


In [93]:
rf_g.best_params_

{'n_estimators': 100, 'max_samples': 1.0, 'max_features': 0.2, 'max_depth': 2}

In [8]:
X_test.shape

NameError: name 'X_test' is not defined