In [2]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn import metrics
import seaborn as sn

# Training

In [3]:
# Read the labelled training set, then separate the four measurement columns
# (features) from the Species column (target).
training_data = pd.read_csv("./new_iris_training.csv")
X = training_data.loc[:, ['SepalLengthCm', 'SepalWidthCm', 'PetalLengthCm', 'PetalWidthCm']]
y = training_data['Species']
# Hold-out split is currently disabled; the model is tuned on the full training set.
#X_train, X_validation, y_train, y_validation = train_test_split(X, y, test_size = 0.3, random_state = 20)

     SepalLengthCm  SepalWidthCm  PetalLengthCm  PetalWidthCm
0            4.927         3.696          1.514         0.247
1            5.524         2.828          4.129         1.184
2            5.114         3.521          1.287         0.262
3            5.947         2.607          5.228         2.038
4            4.835         3.440          1.345         0.190
5            4.893         3.335          1.447         0.225
6            6.347         2.805          5.602         2.226
7            6.469         3.106          5.510         1.942
8            5.500         2.300          4.000         1.300
9            5.919         2.777          4.266         1.312
10           7.700         3.800          6.700         2.200
11           5.061         3.315          1.501         0.215
12           6.381         2.572          3.711         1.501
13           5.867         3.024          4.019         1.525
14           5.723         2.521          4.063         1.451
15      

# Tuning

In [4]:
from sklearn.model_selection import RandomizedSearchCV  # randomized hyperparameter search
from pprint import pprint

# Candidate values for each random-forest hyperparameter.

# Number of trees in the forest: 10 evenly spaced counts from 200 to 2000.
n_estimators = np.linspace(start=200, stop=2000, num=10).astype(int).tolist()
# Number of features to consider at every split.
# NOTE(review): 'auto' is kept to preserve the original search space; newer
# scikit-learn releases no longer accept it for random forests -- confirm the
# target version before re-running.
max_features = ['auto', 'sqrt']
# Maximum number of levels in a tree: 10, 20, ..., 110, followed by None.
max_depth = [*np.linspace(10, 110, num=11).astype(int).tolist(), None]
# Minimum number of samples required to split a node.
min_samples_split = [2, 5, 10]
# Minimum number of samples required at each leaf node.
min_samples_leaf = [1, 2, 4]
# Method of selecting samples for training each tree.
bootstrap = [True, False]

# Search space keyed by estimator parameter name.
random_grid = dict(
    n_estimators=n_estimators,
    max_features=max_features,
    max_depth=max_depth,
    min_samples_split=min_samples_split,
    min_samples_leaf=min_samples_leaf,
    bootstrap=bootstrap,
)

pprint(random_grid)

{'bootstrap': [True, False],
 'max_depth': [10, 20, 30, 40, 50, 60, 70, 80, 90, 100, 110, None],
 'max_features': ['auto', 'sqrt'],
 'min_samples_leaf': [1, 2, 4],
 'min_samples_split': [2, 5, 10],
 'n_estimators': [200, 400, 600, 800, 1000, 1200, 1400, 1600, 1800, 2000]}


In [5]:
# Use the random grid to search for the best hyperparameters.
# Base model to tune. It is seeded so that, together with the seeded search
# below, re-running the notebook gives the same forests and the same result.
rf = RandomForestClassifier(random_state=42)
# Random search of parameters: 100 sampled combinations, each scored with
# 10-fold cross validation, using all available cores (n_jobs=-1).
rf_random = RandomizedSearchCV(
    estimator=rf,
    param_distributions=random_grid,
    n_iter=100,
    cv=10,
    verbose=2,
    random_state=42,
    n_jobs=-1,
)
# Fit the random search model on the full training set.
rf_random.fit(X, y)
#y_pred = clf.predict(X_validation)

Fitting 10 folds for each of 100 candidates, totalling 1000 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  33 tasks      | elapsed:   15.0s
[Parallel(n_jobs=-1)]: Done 154 tasks      | elapsed:  1.2min
[Parallel(n_jobs=-1)]: Done 357 tasks      | elapsed:  2.9min
[Parallel(n_jobs=-1)]: Done 640 tasks      | elapsed:  4.9min
[Parallel(n_jobs=-1)]: Done 1000 out of 1000 | elapsed:  7.7min finished


RandomizedSearchCV(cv=10, error_score='raise-deprecating',
                   estimator=RandomForestClassifier(bootstrap=True,
                                                    class_weight=None,
                                                    criterion='gini',
                                                    max_depth=None,
                                                    max_features='auto',
                                                    max_leaf_nodes=None,
                                                    min_impurity_decrease=0.0,
                                                    min_impurity_split=None,
                                                    min_samples_leaf=1,
                                                    min_samples_split=2,
                                                    min_weight_fraction_leaf=0.0,
                                                    n_estimators='warn',
                                                    n_jobs=Non

In [6]:
# Display the hyperparameter combination selected by the randomized search.
rf_random.best_params_

{'n_estimators': 800,
 'min_samples_split': 5,
 'min_samples_leaf': 4,
 'max_features': 'sqrt',
 'max_depth': 30,
 'bootstrap': False}

# Grid Search

In [None]:
from sklearn.model_selection import GridSearchCV

# Parameter grid for the follow-up grid search. Every hyperparameter is pinned
# to a single value, so the grid contains exactly one candidate.
# NOTE(review): these values are hard-coded rather than read from
# rf_random.best_params_ -- confirm they reflect the intended search result.
param_grid = dict(
    n_estimators=[2000],
    min_samples_split=[10],
    min_samples_leaf=[1],
    max_features=['sqrt'],
    max_depth=[10],
    bootstrap=[False],
)

# Create a base model.
rf = RandomForestClassifier()
# Instantiate the grid search model (3-fold cross validation, all cores).
# It is only constructed in this cell; nothing here fits it.
grid_search = GridSearchCV(
    estimator=rf,
    param_grid=param_grid,
    cv=3,
    n_jobs=-1,
    verbose=2,
)

# Confusion Matrix

In [None]:
# Disabled validation report: it needs y_validation and y_pred, which are only
# produced by the train/validation split and predict lines that are themselves
# commented out earlier in this notebook.
#confusion_matrix = pd.crosstab(y_validation, y_pred, rownames=['Actual'], colnames=['Predicted'])
#sn.heatmap(confusion_matrix, annot=True)
#print('Accuracy Score',  metrics.accuracy_score(y_validation, y_pred))

# Predicting

In [7]:
# Read the unlabelled test set, preview it, and keep the same four measurement
# columns that the model was trained on.
data_test = pd.read_csv('./new_iris_test.csv')
print(data_test.head())
X_test = data_test.loc[:, ['SepalLengthCm', 'SepalWidthCm', 'PetalLengthCm', 'PetalWidthCm']]

   SepalLengthCm  SepalWidthCm  PetalLengthCm  PetalWidthCm  Species
0          5.933         2.770          4.244         1.310      NaN
1          6.637         2.736          5.725         1.981      NaN
2          5.434         2.747          4.239         1.373      NaN
3          5.099         3.358          1.515         0.222      NaN
4          5.011         3.431          1.461         0.256      NaN


In [8]:
# Predict a species label for every test row using the fitted randomized search object.
test_pred = rf_random.predict(X_test)

In [9]:
# Fill the submission template with the predicted species.
# NOTE(review): assumes the template rows are in the same order as
# new_iris_test.csv -- confirm.
submission = pd.read_csv("./iris_submission.csv")
# Replace the whole column instead of writing through `.loc[:, 'Species']`.
# The `.loc` form writes into the existing column, and recent pandas versions
# warn when the new string labels do not fit that column's dtype.
# NOTE(review): assumes the template's Species column may be empty or
# non-string -- confirm.
submission['Species'] = test_pred
print(submission.head(20))

    Id     Species
0    1  versicolor
1    2   virginica
2    3  versicolor
3    4      setosa
4    5      setosa
5    6   virginica
6    7      setosa
7    8   virginica
8    9      setosa
9   10  versicolor
10  11      setosa
11  12   virginica
12  13  versicolor
13  14      setosa
14  15      setosa
15  16  versicolor
16  17   virginica
17  18      setosa
18  19   virginica
19  20  versicolor


In [10]:
# Write the finished submission without the row index and with a header row.
# `index` is a boolean flag, so pass False explicitly rather than relying on
# None being falsy.
submission.to_csv("./191018_3_finished_iris_submission.csv", index=False, header=True)