In [1]:
# Imports and data loading
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder, LabelBinarizer
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import accuracy_score, classification_report
import datetime

# Suppress the countless warnings that would otherwise make the output completely unreadable
import warnings
from sklearn.exceptions import DataConversionWarning, ConvergenceWarning
# Register one "ignore" filter per noisy warning category, in the same order as before.
# NOTE(review): silencing FutureWarning/DeprecationWarning also hides notices about
# library defaults that are scheduled to change.
for ignored_category in (FutureWarning, DataConversionWarning, DeprecationWarning, ConvergenceWarning):
    warnings.filterwarnings("ignore", category=ignored_category)

# Column labels handed to read_csv: four measurements followed by the class label.
feature_names = ['sepal length', 'sepal width', 'petal length', 'petal width', 'species']

# Load the raw table, then separate the class label from the four measurement columns.
iris_data = pd.read_csv('../input/iris.data', names=feature_names)
target = iris_data.loc[:, 'species']
data = iris_data.loc[:, 'sepal length':'petal width']

# Encode the species strings as integer class ids and show the mapping.
label_encoder = LabelEncoder().fit(target)
target_numeric = label_encoder.transform(target)
print('Target classes: {}'.format(label_encoder.classes_))
print('Target class numbers: {}'.format(label_encoder.transform(label_encoder.classes_)))

Target classes: ['Iris-setosa' 'Iris-versicolor' 'Iris-virginica']
Target class numbers: [0 1 2]


EDA takeaways: 
- Feature data are numeric, continuous, ratio-scale measurements and are roughly normally distributed.
- Two of the three classes are not linearly separable from each other.

In [2]:
# Train/test split: 70% of the rows for training, 30% held out; random_state fixes the shuffle.
# NOTE(review): no `stratify=` argument is passed, so class proportions may differ
# between the two splits.
x_train, x_test, y_train, y_test = train_test_split(
    data,
    target_numeric,
    train_size=0.7,
    test_size=0.3,
    random_state=4,
)

# Sanity check: report the shape of each split
print("x_train.shape: %s" % (x_train.shape,))
print("x_test.shape: %s" % (x_test.shape,))
print("y_train.shape: %s" % (y_train.shape,))
print("y_test.shape: %s" % (y_test.shape,))


# Standardisation: the scaler is fitted on the training split only and then
# applied unchanged to both splits.
scaler = StandardScaler()
scaler.fit(x_train)
standardised_x_train = scaler.transform(x_train)
standardised_x_test = scaler.transform(x_test)

# Sanity check: scaling must not change the shapes
print("standardised_x_train.shape: %s" % (standardised_x_train.shape,))
print("standardised_x_test.shape: %s" % (standardised_x_test.shape,))

x_train.shape: (105, 4)
x_test.shape: (45, 4)
y_train.shape: (105,)
y_test.shape: (45,)
standardised_x_train.shape: (105, 4)
standardised_x_test.shape: (45, 4)


In [3]:
# OOB Logistic Regression
# ("OOB" presumably means "out-of-the-box": the estimator is built with default
# hyperparameters, fitted on the standardised training split and scored on the test split.)
lr = LogisticRegression(random_state=4)
lr.fit(standardised_x_train, y_train)
lr_pred = lr.predict(standardised_x_test)
print('OOB Accuracy Score: %f' % accuracy_score(y_test, lr_pred))
print('OOB Classification Report:')
print(classification_report(y_test, lr_pred))

# Logistic Regression with Grid Search
# Searches the stopping tolerance (`tol`) and inverse regularisation strength (`C`).
lr_gs_params = {'tol': [0.000003, 0.00001, 0.00003, 0.0001, 0.0003, 0.001, 0.003],
                'C': [0.03, 0.1, 0.3, 1, 3, 10, 30]}
# cv is pinned explicitly: the implicit default was 3-fold in the scikit-learn release
# these outputs come from and became 5-fold in 0.22. The FutureWarning announcing that
# change is silenced in this notebook, so the scores would otherwise drift unnoticed.
lr_grid = GridSearchCV(estimator=LogisticRegression(random_state=4),
                       param_grid=lr_gs_params,
                       cv=3)
lr_grid.fit(standardised_x_train, y_train)
# best_score_ is a cross-validation score on the training split, not a test-set score.
print('GridSearchCV best score: %f' % lr_grid.best_score_)
print('GridSearchCV best estimator: %s' % lr_grid.best_estimator_)

OOB Accuracy Score: 0.933333
OOB Classification Report:
              precision    recall  f1-score   support

           0       1.00      0.95      0.98        21
           1       0.82      0.90      0.86        10
           2       0.93      0.93      0.93        14

   micro avg       0.93      0.93      0.93        45
   macro avg       0.92      0.93      0.92        45
weighted avg       0.94      0.93      0.93        45

GridSearchCV best score: 0.942857
GridSearchCV best estimator: LogisticRegression(C=30, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='warn',
          n_jobs=None, penalty='l2', random_state=4, solver='warn',
          tol=3e-06, verbose=0, warm_start=False)


In [4]:
# OOB Random Forest
# ("OOB" here presumably means "out-of-the-box" defaults, not out-of-bag scoring:
# no oob_score option is set.) Tree models are fitted on the unscaled features.
rf = RandomForestClassifier(random_state=4)
rf.fit(x_train, y_train)
rf_pred = rf.predict(x_test)
print('OOB Accuracy Score: %f' % accuracy_score(y_test, rf_pred))
print('OOB Classification Report:')
print(classification_report(y_test, rf_pred))

# Random Forest with Grid Search
# Searches forest size (5..15 trees) and the minimum samples required to split a node.
rf_gs_params = {'n_estimators': np.arange(5, 16),
                'min_samples_split': [2, 10, 20, 40, 80, 120]}
# cv is pinned explicitly: the implicit default was 3-fold in the scikit-learn release
# these outputs come from and became 5-fold in 0.22. The FutureWarning announcing that
# change is silenced in this notebook, so the scores would otherwise drift unnoticed.
rf_grid = GridSearchCV(estimator=RandomForestClassifier(random_state=4),
                       param_grid=rf_gs_params,
                       cv=3)
rf_grid.fit(x_train, y_train)
# best_score_ is a cross-validation score on the training split, not a test-set score.
print('GridSearchCV best score: %f' % rf_grid.best_score_)
print('GridSearchCV best estimator: %s' % rf_grid.best_estimator_)

OOB Accuracy Score: 0.955556
OOB Classification Report:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00        21
           1       0.90      0.90      0.90        10
           2       0.93      0.93      0.93        14

   micro avg       0.96      0.96      0.96        45
   macro avg       0.94      0.94      0.94        45
weighted avg       0.96      0.96      0.96        45

GridSearchCV best score: 0.952381
GridSearchCV best estimator: RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=5, n_jobs=None,
            oob_score=False, random_state=4, verbose=0, warm_start=False)


In [5]:
# OOB K Nearest Neighbors
# ("OOB" presumably means "out-of-the-box": default hyperparameters.) The classifier
# is fitted and evaluated on the standardised features.
knn = KNeighborsClassifier()
knn.fit(standardised_x_train, y_train)
knn_pred = knn.predict(standardised_x_test)
print('OOB Accuracy Score: %f' % accuracy_score(y_test, knn_pred))
print('OOB Classification Report:')
print(classification_report(y_test, knn_pred))

# K Nearest Neighbors with Grid Search
# Searches the neighbourhood size (1..15) and the vote weighting scheme.
knn_gs_params = {'n_neighbors': np.arange(1, 16),
                 'weights': ['uniform', 'distance']}
# cv is pinned explicitly: the implicit default was 3-fold in the scikit-learn release
# these outputs come from and became 5-fold in 0.22. The FutureWarning announcing that
# change is silenced in this notebook, so the scores would otherwise drift unnoticed.
knn_grid = GridSearchCV(estimator=KNeighborsClassifier(),
                        param_grid=knn_gs_params,
                        cv=3)
knn_grid.fit(standardised_x_train, y_train)
# best_score_ is a cross-validation score on the training split, not a test-set score.
print('GridSearchCV best score: %f' % knn_grid.best_score_)
print('GridSearchCV best estimator: %s' % knn_grid.best_estimator_)

OOB Accuracy Score: 0.955556
OOB Classification Report:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00        21
           1       0.90      0.90      0.90        10
           2       0.93      0.93      0.93        14

   micro avg       0.96      0.96      0.96        45
   macro avg       0.94      0.94      0.94        45
weighted avg       0.96      0.96      0.96        45

GridSearchCV best score: 0.971429
GridSearchCV best estimator: KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=None, n_neighbors=4, p=2,
           weights='uniform')


In [6]:
# OOB Multi-layer Perceptron
# ("OOB" presumably means "out-of-the-box": default hyperparameters.)
# Wall-clock timestamps are printed before and after the cell's work to show how long it takes.
print(datetime.datetime.now())
mlp = MLPClassifier(random_state=4)
mlp.fit(standardised_x_train, y_train)
mlp_pred = mlp.predict(standardised_x_test)
print('OOB Accuracy Score: %f' % accuracy_score(y_test, mlp_pred))
print('OOB Classification Report:')
print(classification_report(y_test, mlp_pred))

# Multi-layer Perceptron with Grid Search
# 3 architectures x 4 activations x 3 solvers x 7 alpha values = 252 candidates.
mlp_gs_params = {'hidden_layer_sizes': [(100,), (100, 100), (130, 80, 40)],
                 'activation': ['identity', 'logistic', 'tanh', 'relu'],
                 'solver': ['lbfgs', 'sgd', 'adam'],
                 'alpha': [0.000003, 0.00001, 0.00003, 0.0001, 0.0003, 0.001, 0.003]}
# cv is pinned explicitly: the implicit default was 3-fold in the scikit-learn release
# these outputs come from and became 5-fold in 0.22. The FutureWarning announcing that
# change is silenced in this notebook, so the scores would otherwise drift unnoticed.
mlp_grid = GridSearchCV(estimator=MLPClassifier(random_state=4),
                        param_grid=mlp_gs_params,
                        cv=3)
mlp_grid.fit(standardised_x_train, y_train)
# best_score_ is a cross-validation score on the training split, not a test-set score.
print('GridSearchCV best score: %f' % mlp_grid.best_score_)
print('GridSearchCV best estimator: %s' % mlp_grid.best_estimator_)
print(datetime.datetime.now())

2018-10-04 12:37:00.753662
OOB Accuracy Score: 0.933333
OOB Classification Report:
              precision    recall  f1-score   support

           0       1.00      0.95      0.98        21
           1       0.82      0.90      0.86        10
           2       0.93      0.93      0.93        14

   micro avg       0.93      0.93      0.93        45
   macro avg       0.92      0.93      0.92        45
weighted avg       0.94      0.93      0.93        45

GridSearchCV best score: 0.971429
GridSearchCV best estimator: MLPClassifier(activation='logistic', alpha=3e-06, batch_size='auto',
       beta_1=0.9, beta_2=0.999, early_stopping=False, epsilon=1e-08,
       hidden_layer_sizes=(100, 100), learning_rate='constant',
       learning_rate_init=0.001, max_iter=200, momentum=0.9,
       n_iter_no_change=10, nesterovs_momentum=True, power_t=0.5,
       random_state=4, shuffle=True, solver='adam', tol=0.0001,
       validation_fraction=0.1, verbose=False, warm_start=False)
2018-10-04 12:

In [7]:
# OOB Multi-layer Perceptron max_iter=500
# Repeats the previous MLP experiment with the iteration cap raised to 500
# (presumably to give the optimiser more room to converge; ConvergenceWarning is silenced
# in this notebook). NOTE: this cell rebinds `mlp`, `mlp_pred`, `mlp_gs_params` and
# `mlp_grid`, replacing the objects created by the previous MLP cell.
print(datetime.datetime.now())
mlp = MLPClassifier(max_iter=500, random_state=4)
mlp.fit(standardised_x_train, y_train)
mlp_pred = mlp.predict(standardised_x_test)
print('OOB Accuracy Score: %f' % accuracy_score(y_test, mlp_pred))
print('OOB Classification Report:')
print(classification_report(y_test, mlp_pred))

# Multi-layer Perceptron with Grid Search
# Same 252-candidate grid as the previous MLP cell; only max_iter differs.
mlp_gs_params = {'hidden_layer_sizes': [(100,), (100, 100), (130, 80, 40)],
                 'activation': ['identity', 'logistic', 'tanh', 'relu'],
                 'solver': ['lbfgs', 'sgd', 'adam'],
                 'alpha': [0.000003, 0.00001, 0.00003, 0.0001, 0.0003, 0.001, 0.003]}
# cv is pinned explicitly: the implicit default was 3-fold in the scikit-learn release
# these outputs come from and became 5-fold in 0.22. The FutureWarning announcing that
# change is silenced in this notebook, so the scores would otherwise drift unnoticed.
mlp_grid = GridSearchCV(estimator=MLPClassifier(max_iter=500, random_state=4),
                        param_grid=mlp_gs_params,
                        cv=3)
mlp_grid.fit(standardised_x_train, y_train)
# best_score_ is a cross-validation score on the training split, not a test-set score.
print('GridSearchCV best score: %f' % mlp_grid.best_score_)
print('GridSearchCV best estimator: %s' % mlp_grid.best_estimator_)
print(datetime.datetime.now())

2018-10-04 12:39:12.686094
OOB Accuracy Score: 0.933333
OOB Classification Report:
              precision    recall  f1-score   support

           0       1.00      0.95      0.98        21
           1       0.82      0.90      0.86        10
           2       0.93      0.93      0.93        14

   micro avg       0.93      0.93      0.93        45
   macro avg       0.92      0.93      0.92        45
weighted avg       0.94      0.93      0.93        45

GridSearchCV best score: 0.971429
GridSearchCV best estimator: MLPClassifier(activation='logistic', alpha=3e-06, batch_size='auto',
       beta_1=0.9, beta_2=0.999, early_stopping=False, epsilon=1e-08,
       hidden_layer_sizes=(100,), learning_rate='constant',
       learning_rate_init=0.001, max_iter=500, momentum=0.9,
       n_iter_no_change=10, nesterovs_momentum=True, power_t=0.5,
       random_state=4, shuffle=True, solver='adam', tol=0.0001,
       validation_fraction=0.1, verbose=False, warm_start=False)
2018-10-04 12:42:5

In [8]:
# OOB Support Vector Machine
# ("OOB" presumably means "out-of-the-box": default hyperparameters.) The classifier
# is fitted and evaluated on the standardised features.
svm = SVC(random_state=4)
svm.fit(standardised_x_train, y_train)
svm_pred = svm.predict(standardised_x_test)
print('OOB Accuracy Score: %f' % accuracy_score(y_test, svm_pred))
print('OOB Classification Report:')
print(classification_report(y_test, svm_pred))

# Support Vector Machine with Grid Search
# Searches the penalty parameter C and three kernel types.
svm_gs_params = {'C': [0.03, 0.1, 0.3, 1, 3, 10, 30],
                 'kernel': ['rbf', 'poly', 'sigmoid']}
# cv is pinned explicitly: the implicit default was 3-fold in the scikit-learn release
# these outputs come from and became 5-fold in 0.22. The FutureWarning announcing that
# change is silenced in this notebook, so the scores would otherwise drift unnoticed.
svm_grid = GridSearchCV(estimator=SVC(random_state=4),
                        param_grid=svm_gs_params,
                        cv=3)
svm_grid.fit(standardised_x_train, y_train)
# best_score_ is a cross-validation score on the training split, not a test-set score.
print('GridSearchCV best score: %f' % svm_grid.best_score_)
print('GridSearchCV best estimator: %s' % svm_grid.best_estimator_)

OOB Accuracy Score: 0.955556
OOB Classification Report:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00        21
           1       0.90      0.90      0.90        10
           2       0.93      0.93      0.93        14

   micro avg       0.96      0.96      0.96        45
   macro avg       0.94      0.94      0.94        45
weighted avg       0.96      0.96      0.96        45

GridSearchCV best score: 0.971429
GridSearchCV best estimator: SVC(C=1, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto_deprecated',
  kernel='rbf', max_iter=-1, probability=False, random_state=4,
  shrinking=True, tol=0.001, verbose=False)


In [9]:
# OOB Decision Tree
# ("OOB" presumably means "out-of-the-box": default hyperparameters.) The tree is
# fitted on the unscaled features.
dtc = DecisionTreeClassifier(random_state=4)
dtc.fit(x_train, y_train)
dtc_pred = dtc.predict(x_test)
print('OOB Accuracy Score: %f' % accuracy_score(y_test, dtc_pred))
print('OOB Classification Report:')
print(classification_report(y_test, dtc_pred))

# Decision Tree with Grid Search
# Only the split criterion is searched.
dtc_gs_params = {'criterion': ['gini', 'entropy']}
# cv is pinned explicitly: the implicit default was 3-fold in the scikit-learn release
# these outputs come from and became 5-fold in 0.22. The FutureWarning announcing that
# change is silenced in this notebook, so the scores would otherwise drift unnoticed.
dtc_grid = GridSearchCV(estimator=DecisionTreeClassifier(random_state=4),
                        param_grid=dtc_gs_params,
                        cv=3)
dtc_grid.fit(x_train, y_train)
# best_score_ is a cross-validation score on the training split, not a test-set score.
print('GridSearchCV best score: %f' % dtc_grid.best_score_)
print('GridSearchCV best estimator: %s' % dtc_grid.best_estimator_)

OOB Accuracy Score: 0.977778
OOB Classification Report:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00        21
           1       1.00      0.90      0.95        10
           2       0.93      1.00      0.97        14

   micro avg       0.98      0.98      0.98        45
   macro avg       0.98      0.97      0.97        45
weighted avg       0.98      0.98      0.98        45

GridSearchCV best score: 0.952381
GridSearchCV best estimator: DecisionTreeClassifier(class_weight=None, criterion='entropy', max_depth=None,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=4,
            splitter='best')


In [10]:
# OOB Naive Bayes
# Gaussian naive Bayes with default settings, trained on the standardised
# training split and scored on the standardised test split.
gnb = GaussianNB().fit(standardised_x_train, y_train)
gnb_pred = gnb.predict(standardised_x_test)
print('OOB Accuracy Score: %f' % (accuracy_score(y_test, gnb_pred),))
print('OOB Classification Report:')
print(classification_report(y_test, gnb_pred))

# Naive Bayes with Grid Search
# There were no simple parameters to search through, so no grid search was performed for GaussianNB

OOB Accuracy Score: 0.977778
OOB Classification Report:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00        21
           1       0.91      1.00      0.95        10
           2       1.00      0.93      0.96        14

   micro avg       0.98      0.98      0.98        45
   macro avg       0.97      0.98      0.97        45
weighted avg       0.98      0.98      0.98        45

