In [1]:
import pandas as pd
from sklearn import datasets
from sklearn.model_selection import train_test_split

In [6]:
iris = datasets.load_iris()
X = iris.data[:, :2]  # we only take the first two features.
y = iris.target
x_train, x_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)

In [3]:
import itertools
import numpy as np
import matplotlib.pyplot as plt
from sklearn.metrics import confusion_matrix
plt.rcParams["font.family"] = 'DejaVu Sans'

def plot_confusion_matrix(cm, classes,
                          normalize=False,
                          title='Confusion matrix',
                          cmap=plt.cm.Blues):
    if normalize:
        cm = cm.astype('float') / cm.sum(axis=1)[:, np.newaxis]

    plt.imshow(cm, interpolation='nearest', cmap=cmap)
    plt.title(title)
    plt.colorbar()
    tick_marks = np.arange(len(classes))
    plt.xticks(tick_marks, classes, rotation=90)
    plt.yticks(tick_marks, classes)

    fmt = '.2f' if normalize else 'd'
    thresh = cm.max() / 2.
    for i, j in itertools.product(range(cm.shape[0]), range(cm.shape[1])):
        plt.text(j, i, format(cm[i, j], fmt),
                 horizontalalignment="center",
                 color="white" if cm[i, j] > thresh else "black")

    plt.tight_layout()
    plt.ylabel('True label')
    plt.xlabel('Predicted label')

In [4]:
#Model1
from datetime import datetime
def perform_model(model, X_train, y_train, X_test, y_test, class_labels, cm_normalize=True, \
                 print_cm=True, cm_cmap=plt.cm.Greens):
    
    
    # to store results at various phases
    results = dict()
    
    # time at which model starts training 
    train_start_time = datetime.now()
    print('training the model..')
    model.fit(X_train, y_train)
    print('Done \n \n')
    train_end_time = datetime.now()
    results['training_time'] =  train_end_time - train_start_time
    print('training_time(HH:MM:SS.ms) - {}\n\n'.format(results['training_time']))
    
    
    # predict test data
    print('Predicting test data')
    test_start_time = datetime.now()
    y_pred = model.predict(X_test)
    test_end_time = datetime.now()
    print('Done \n \n')
    results['testing_time'] = test_end_time - test_start_time
    print('testing time(HH:MM:SS:ms) - {}\n\n'.format(results['testing_time']))
    results['predicted'] = y_pred
   

    # calculate overall accuracty of the model
    accuracy = metrics.accuracy_score(y_true=y_test, y_pred=y_pred)
    # store accuracy in results
    results['accuracy'] = accuracy
    print('---------------------')
    print('|      Accuracy      |')
    print('---------------------')
    print('\n    {}\n\n'.format(accuracy))
    
    
    # confusion matrix
    cm = metrics.confusion_matrix(y_test, y_pred)
    results['confusion_matrix'] = cm
    if print_cm: 
        print('--------------------')
        print('| Confusion Matrix |')
        print('--------------------')
        print('\n {}'.format(cm))
        
    # plot confusin matrix
    #plt.figure(figsize=(8,8))
    #plt.grid(b=False)
    #plot_confusion_matrix(cm, classes=class_labels, normalize=True, title='Normalized confusion matrix', cmap = cm_cmap)
    #plt.show()
    
    # get classification report
    print('-------------------------')
    print('| Classifiction Report |')
    print('-------------------------')
    classification_report = metrics.classification_report(y_test, y_pred)
    # store report in results
    results['classification_report'] = classification_report
    print(classification_report)
    
    # add the trained  model to the results
    results['model'] = model
    
    return results
    
    

In [5]:
def print_grid_search_attributes(model):
    # Estimator that gave highest score among all the estimators formed in GridSearch
    print('--------------------------')
    print('|      Best Estimator     |')
    print('--------------------------')
    print('\n\t{}\n'.format(model.best_estimator_))


    # parameters that gave best results while performing grid search
    print('--------------------------')
    print('|     Best parameters     |')
    print('--------------------------')
    print('\tParameters of best estimator : \n\n\t{}\n'.format(model.best_params_))


    #  number of cross validation splits
    print('---------------------------------')
    print('|   No of CrossValidation sets   |')
    print('--------------------------------')
    print('\n\tTotal numbre of cross validation sets: {}\n'.format(model.n_splits_))


    # Average cross validated score of the best estimator, from the Grid Search 
    print('--------------------------')
    print('|        Best Score       |')
    print('--------------------------')
    print('\n\tAverage Cross Validate scores of best estimator : \n\n\t{}\n'.format(model.best_score_))

    
    

## 1. Logistic Regression with Grid Search

In [7]:
from sklearn import linear_model
from sklearn import metrics

from sklearn.model_selection import GridSearchCV

In [9]:

# start Grid search
parameters = {'C':[0.01, 0.1, 1, 10, 20, 30], 'penalty':['l2','l1']}
log_reg = linear_model.LogisticRegression()
log_reg_grid = GridSearchCV(log_reg, param_grid=parameters, cv=3, verbose=1, n_jobs=-1)
log_reg_grid_results =  perform_model(log_reg_grid, x_train, y_train, x_test, y_test, class_labels=y)



training the model..
Fitting 3 folds for each of 12 candidates, totalling 36 fits
Done 
 

training_time(HH:MM:SS.ms) - 0:00:01.994345


Predicting test data
Done 
 

testing time(HH:MM:SS:ms) - 0:00:00.000238


---------------------
|      Accuracy      |
---------------------

    0.82


--------------------
| Confusion Matrix |
--------------------

 [[19  0  0]
 [ 0  8  7]
 [ 0  2 14]]
-------------------------
| Classifiction Report |
-------------------------
              precision    recall  f1-score   support

           0       1.00      1.00      1.00        19
           1       0.80      0.53      0.64        15
           2       0.67      0.88      0.76        16

    accuracy                           0.82        50
   macro avg       0.82      0.80      0.80        50
weighted avg       0.83      0.82      0.81        50



 0.79976233        nan 0.79976233        nan 0.79976233        nan]


In [10]:
#plt.figure(figsize=(8,8))
#plt.grid(b=False)
#plot_confusion_matrix(log_reg_grid_results['confusion_matrix'], classes=labels, cmap=plt.cm.Greens, )
#plt.show()

## 2. Linear SVC with GridSearch

In [11]:
from sklearn.svm import LinearSVC

In [12]:
parameters = {'C':[0.125, 0.5, 1, 2, 8, 16]}
lr_svc = LinearSVC(tol=0.00005)
lr_svc_grid = GridSearchCV(lr_svc, param_grid=parameters, n_jobs=-1, verbose=1)
lr_svc_grid_results = perform_model(lr_svc_grid, x_train, y_train, x_test, y_test, class_labels=y)

training the model..
Fitting 5 folds for each of 6 candidates, totalling 30 fits
Done 
 

training_time(HH:MM:SS.ms) - 0:00:00.087121


Predicting test data
Done 
 

testing time(HH:MM:SS:ms) - 0:00:00.000176


---------------------
|      Accuracy      |
---------------------

    0.78


--------------------
| Confusion Matrix |
--------------------

 [[19  0  0]
 [ 1  6  8]
 [ 0  2 14]]
-------------------------
| Classifiction Report |
-------------------------
              precision    recall  f1-score   support

           0       0.95      1.00      0.97        19
           1       0.75      0.40      0.52        15
           2       0.64      0.88      0.74        16

    accuracy                           0.78        50
   macro avg       0.78      0.76      0.74        50
weighted avg       0.79      0.78      0.76        50





In [13]:
print_grid_search_attributes(lr_svc_grid_results['model'])

--------------------------
|      Best Estimator     |
--------------------------

	LinearSVC(C=1, tol=5e-05)

--------------------------
|     Best parameters     |
--------------------------
	Parameters of best estimator : 

	{'C': 1}

---------------------------------
|   No of CrossValidation sets   |
--------------------------------

	Total numbre of cross validation sets: 5

--------------------------
|        Best Score       |
--------------------------

	Average Cross Validate scores of best estimator : 

	0.7699999999999999



## 3. Kernel SVM with GridSearch

In [14]:
from sklearn.svm import SVC
parameters = {'C':[2,8,16],\
              'gamma': [ 0.0078125, 0.125, 2]}
rbf_svm = SVC(kernel='rbf')
rbf_svm_grid = GridSearchCV(rbf_svm,param_grid=parameters, n_jobs=-1)
rbf_svm_grid_results = perform_model(rbf_svm_grid, x_train, y_train, x_test, y_test, class_labels=y)

training the model..
Done 
 

training_time(HH:MM:SS.ms) - 0:00:00.035258


Predicting test data
Done 
 

testing time(HH:MM:SS:ms) - 0:00:00.000636


---------------------
|      Accuracy      |
---------------------

    0.78


--------------------
| Confusion Matrix |
--------------------

 [[19  0  0]
 [ 0  8  7]
 [ 0  4 12]]
-------------------------
| Classifiction Report |
-------------------------
              precision    recall  f1-score   support

           0       1.00      1.00      1.00        19
           1       0.67      0.53      0.59        15
           2       0.63      0.75      0.69        16

    accuracy                           0.78        50
   macro avg       0.77      0.76      0.76        50
weighted avg       0.78      0.78      0.78        50



In [15]:
print_grid_search_attributes(rbf_svm_grid_results['model'])

--------------------------
|      Best Estimator     |
--------------------------

	SVC(C=2, gamma=0.125)

--------------------------
|     Best parameters     |
--------------------------
	Parameters of best estimator : 

	{'C': 2, 'gamma': 0.125}

---------------------------------
|   No of CrossValidation sets   |
--------------------------------

	Total numbre of cross validation sets: 5

--------------------------
|        Best Score       |
--------------------------

	Average Cross Validate scores of best estimator : 

	0.8099999999999999



## 4. Decision Trees with GridSearchCV

In [16]:
from sklearn.tree import DecisionTreeClassifier
parameters = {'max_depth':np.arange(3,10,2)}
dt = DecisionTreeClassifier()
dt_grid = GridSearchCV(dt,param_grid=parameters, n_jobs=-1)
dt_grid_results = perform_model(dt_grid, x_train, y_train, x_test, y_test, class_labels=y)
print_grid_search_attributes(dt_grid_results['model'])

training the model..
Done 
 

training_time(HH:MM:SS.ms) - 0:00:00.078046


Predicting test data
Done 
 

testing time(HH:MM:SS:ms) - 0:00:00.000341


---------------------
|      Accuracy      |
---------------------

    0.76


--------------------
| Confusion Matrix |
--------------------

 [[18  1  0]
 [ 0  9  6]
 [ 0  5 11]]
-------------------------
| Classifiction Report |
-------------------------
              precision    recall  f1-score   support

           0       1.00      0.95      0.97        19
           1       0.60      0.60      0.60        15
           2       0.65      0.69      0.67        16

    accuracy                           0.76        50
   macro avg       0.75      0.74      0.75        50
weighted avg       0.77      0.76      0.76        50

--------------------------
|      Best Estimator     |
--------------------------

	DecisionTreeClassifier(max_depth=3)

--------------------------
|     Best parameters     |
--------------------------
	Parame

## 5. Random Forest Classifier with GridSearch

In [17]:
from sklearn.ensemble import RandomForestClassifier
params = {'n_estimators': np.arange(10,201,20), 'max_depth':np.arange(3,15,2)}
rfc = RandomForestClassifier()
rfc_grid = GridSearchCV(rfc, param_grid=params, n_jobs=-1)
rfc_grid_results = perform_model(rfc_grid, x_train, y_train, x_test, y_test, class_labels=y)
print_grid_search_attributes(rfc_grid_results['model'])

training the model..
Done 
 

training_time(HH:MM:SS.ms) - 0:00:05.348872


Predicting test data
Done 
 

testing time(HH:MM:SS:ms) - 0:00:00.003486


---------------------
|      Accuracy      |
---------------------

    0.74


--------------------
| Confusion Matrix |
--------------------

 [[19  0  0]
 [ 0  8  7]
 [ 0  6 10]]
-------------------------
| Classifiction Report |
-------------------------
              precision    recall  f1-score   support

           0       1.00      1.00      1.00        19
           1       0.57      0.53      0.55        15
           2       0.59      0.62      0.61        16

    accuracy                           0.74        50
   macro avg       0.72      0.72      0.72        50
weighted avg       0.74      0.74      0.74        50

--------------------------
|      Best Estimator     |
--------------------------

	RandomForestClassifier(max_depth=5, n_estimators=30)

--------------------------
|     Best parameters     |
-----------------

## 6. Gradient Boosted Decision Trees With GridSearch

In [18]:
from sklearn.ensemble import GradientBoostingClassifier
param_grid = {'max_depth': np.arange(5,8,1), \
             'n_estimators':np.arange(130,170,10)}
gbdt = GradientBoostingClassifier()
gbdt_grid = GridSearchCV(gbdt, param_grid=param_grid, n_jobs=-1)
gbdt_grid_results = perform_model(gbdt_grid, x_train, y_train, x_test, y_test, class_labels=y)
print_grid_search_attributes(gbdt_grid_results['model'])

training the model..
Done 
 

training_time(HH:MM:SS.ms) - 0:00:04.710234


Predicting test data
Done 
 

testing time(HH:MM:SS:ms) - 0:00:00.001361


---------------------
|      Accuracy      |
---------------------

    0.76


--------------------
| Confusion Matrix |
--------------------

 [[19  0  0]
 [ 0  8  7]
 [ 0  5 11]]
-------------------------
| Classifiction Report |
-------------------------
              precision    recall  f1-score   support

           0       1.00      1.00      1.00        19
           1       0.62      0.53      0.57        15
           2       0.61      0.69      0.65        16

    accuracy                           0.76        50
   macro avg       0.74      0.74      0.74        50
weighted avg       0.76      0.76      0.76        50

--------------------------
|      Best Estimator     |
--------------------------

	GradientBoostingClassifier(max_depth=6, n_estimators=150)

--------------------------
|     Best parameters     |
------------

## 7. Comparing all models

In [19]:
print('\n                     Accuracy     Error')
print('                     ----------   --------')
print('Logistic Regression : {:.04}%       {:.04}%'.format(log_reg_grid_results['accuracy'] * 100,\
                                                  100-(log_reg_grid_results['accuracy'] * 100)))

print('Linear SVC          : {:.04}%       {:.04}% '.format(lr_svc_grid_results['accuracy'] * 100,\
                                                        100-(lr_svc_grid_results['accuracy'] * 100)))

print('rbf SVM classifier  : {:.04}%      {:.04}% '.format(rbf_svm_grid_results['accuracy'] * 100,\
                                                          100-(rbf_svm_grid_results['accuracy'] * 100)))

print('DecisionTree        : {:.04}%      {:.04}% '.format(dt_grid_results['accuracy'] * 100,\
                                                        100-(dt_grid_results['accuracy'] * 100)))

print('Random Forest       : {:.04}%      {:.04}% '.format(rfc_grid_results['accuracy'] * 100,\
                                                           100-(rfc_grid_results['accuracy'] * 100)))
print('GradientBoosting DT : {:.04}%      {:.04}% '.format(rfc_grid_results['accuracy'] * 100,\
                                                        100-(rfc_grid_results['accuracy'] * 100)))


                     Accuracy     Error
                     ----------   --------
Logistic Regression : 82.0%       18.0%
Linear SVC          : 78.0%       22.0% 
rbf SVM classifier  : 78.0%      22.0% 
DecisionTree        : 76.0%      24.0% 
Random Forest       : 74.0%      26.0% 
GradientBoosting DT : 74.0%      26.0% 
