In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.svm import SVC
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import f1_score
from sklearn import metrics
from sklearn.model_selection import cross_validate
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import train_test_split
from sklearn.model_selection import learning_curve
from sklearn.model_selection import GridSearchCV
from IPython.display import display, Image
import graphviz
import sys

# Data Preparation

In [None]:
# Load the dataset from covtype.csv; the path is relative to the notebook's working directory.
df = pd.read_csv("covtype.csv")

In [None]:
# Print a summary of the loaded frame (columns, dtypes, non-null counts).
df.info()

In [None]:
# Count the rows for each original Cover_Type label to see the class distribution.
df.groupby('Cover_Type').size()

In [None]:
# Binary target: 1 when Cover_Type is 4 or 5, 0 for every other cover type.
is_positive_class = df['Cover_Type'].isin([4, 5])
df['y'] = np.where(is_positive_class, 1, 0)

# Show how many rows fall into each of the two classes.
df.groupby('y').size()

In [None]:
# Separate predictors from labels: both the raw Cover_Type column and the
# derived binary target y are removed from the feature matrix.
X = df.drop(columns=['Cover_Type', 'y'])
Y = df['y']

# Seed NumPy's global random generator (the split itself uses its own random_state).
np.random.seed(0)

# Stratified split on Y. With test_size=0.95 only 5% of the rows are kept for
# training and cross-validation; the remaining 95% are held out.
Xtrain, Xtest, Ytrain, Ytest = train_test_split(
    X,
    Y,
    test_size=0.95,
    stratify=Y,
    random_state=999,
)

# User-Defined Helper Functions

In [None]:
def plot_learning_curve(train_sizes, train_scores, valid_scores, 
                        score, title="Learning Curve"):
    """Draw a learning curve: mean score against training-set size.

    Args:
        train_sizes: x-axis values, one per evaluated training-set size.
        train_scores: 2-D array of training scores; it is reduced along
            axis 1 to a mean and a standard deviation per training size.
        valid_scores: 2-D array of validation scores, reduced the same way.
        score: text used as the y-axis label.
        title: figure title.

    The figure is drawn on the current pyplot figure and shown with
    ``plt.show()``; nothing is returned.
    """
    line_color, band_color = "#111111", "#DDDDDD"

    # (mean, std) per training size, first for the training scores and then
    # for the validation scores.
    stats = [(np.mean(fold_scores, axis=1), np.std(fold_scores, axis=1))
             for fold_scores in (train_scores, valid_scores)]
    (train_mean, _), (valid_mean, _) = stats

    # Mean curves: dashed for training, solid for cross-validation.
    plt.plot(train_sizes, train_mean, '--', color=line_color, label="Training score")
    plt.plot(train_sizes, valid_mean, color=line_color, label="Cross-validation score")

    # Shaded bands spanning one standard deviation either side of each mean.
    for mean, std in stats:
        plt.fill_between(train_sizes, mean - std, mean + std, color=band_color)

    # Labels, legend and layout.
    plt.title(title)
    plt.xlabel("Training Set Size")
    plt.ylabel(score)
    plt.legend(loc="best")
    plt.tight_layout()
    plt.show()

In [None]:
def plot_single_parameter_search_curve(params, train_mean, valid_mean, train_std, valid_std, 
                                       param_name, score, title='Hyper Parameter Tuning'):
    """Plot training and cross-validation scores against one hyper-parameter.

    Args:
        params: x-axis values, one per candidate hyper-parameter value.
        train_mean: mean training score for each candidate.
        valid_mean: mean cross-validation score for each candidate.
        train_std: standard deviation of the training score for each candidate.
        valid_std: standard deviation of the cross-validation score for each
            candidate.
        param_name: text used as the x-axis label.
        score: text used as the y-axis label.
        title: figure title.

    The mean/std arguments may be NumPy arrays or plain sequences (lists,
    tuples). The figure is drawn on the current pyplot figure and shown with
    ``plt.show()``; nothing is returned.
    """
    # Coerce to arrays so the band arithmetic below is element-wise even when
    # the caller passes plain Python lists (list - list raises TypeError).
    train_mean, valid_mean = np.asarray(train_mean), np.asarray(valid_mean)
    train_std, valid_std = np.asarray(train_std), np.asarray(valid_std)

    # Draw lines: dashed for training, solid for cross-validation.
    plt.plot(params, train_mean, '--', color="#111111",  label="Training score")
    plt.plot(params, valid_mean, color="#111111", label="Cross-validation score")

    # Draw bands spanning one standard deviation either side of each mean.
    plt.fill_between(params, train_mean - train_std, train_mean + train_std, color="#DDDDDD")
    plt.fill_between(params, valid_mean - valid_std, valid_mean + valid_std, color="#DDDDDD")

    # Create plot
    plt.title(title)
    plt.xlabel(param_name)
    plt.ylabel(score)
    plt.legend(loc="best")
    plt.tight_layout()
    plt.show()

# Algorithm Implementation

## 1. Decision Tree

### 1.1 No pruning

In [None]:
# Baseline tree for the "no pruning" experiment: constructed with default
# arguments, i.e. no hyper-parameter is overridden here.
dt = DecisionTreeClassifier()

In [None]:
# Evaluate the unpruned tree with 5-fold cross-validation on the training
# split, keeping the per-fold training scores so over-fitting is visible.
scores_dt = cross_validate(dt, Xtrain, Ytrain, scoring= ['f1'],
                           cv=5, return_train_score=True)

# The scorer is 'f1' (binary F1), not the macro average, so the labels say
# "F1". The reported spread is two standard deviations across the folds.
print("Validate F1: %0.2f (+/- %0.2f)" % (scores_dt['test_f1'].mean(), scores_dt['test_f1'].std() * 2)) 
print("Train F1: %0.2f (+/- %0.2f)" % (scores_dt['train_f1'].mean(), scores_dt['train_f1'].std() * 2)) 

### 1.2 Pre-pruning

In [None]:
# List every hyper-parameter of the tree with its current value, as a
# reference for choosing which ones to tune in the pre-pruning searches.
dt.get_params()

#### 1.2.1 Max depth

In [None]:
# Try tree depths 1 through 24 with 5-fold cross-validation, scoring each
# candidate by F1 and refitting the best one on the whole training split.
param_grid = [dict(max_depth=range(1, 25))]
tuned_dt = GridSearchCV(
    estimator=dt,
    param_grid=param_grid,
    scoring=['f1'],
    refit='f1',
    cv=5,
    return_train_score=True,
)
tuned_dt.fit(Xtrain, Ytrain)

In [None]:
# Parameter setting with the best mean cross-validated score in the search above.
tuned_dt.best_params_

In [None]:
# Train vs. cross-validation F1 (mean with a one-std band) for every depth tried.
depth_results = tuned_dt.cv_results_
plot_single_parameter_search_curve(
    params=range(1, 25),
    train_mean=depth_results['mean_train_f1'],
    valid_mean=depth_results['mean_test_f1'],
    train_std=depth_results['std_train_f1'],
    valid_std=depth_results['std_test_f1'],
    param_name='max_depth',
    score='F1',
    title='Cover Type Hyper Parameter Tuning',
)

#### 1.2.2 min_samples_leaf

In [None]:
# Candidate leaf sizes: 1, 6, 11, ..., 96.
param_grid = [dict(min_samples_leaf=range(1, 100, 5))]

# Start from a fresh default tree so only min_samples_leaf varies, then run a
# 5-fold grid search scored by F1.
dt = DecisionTreeClassifier()
tuned_dt = GridSearchCV(
    estimator=dt,
    param_grid=param_grid,
    scoring=['f1'],
    refit='f1',
    cv=5,
    return_train_score=True,
)
tuned_dt.fit(Xtrain, Ytrain)

In [None]:
# Parameter setting with the best mean cross-validated score in the search above.
tuned_dt.best_params_

In [None]:
# Train vs. cross-validation F1 (mean with a one-std band) for every leaf size tried.
leaf_results = tuned_dt.cv_results_
plot_single_parameter_search_curve(
    params=range(1, 100, 5),
    train_mean=leaf_results['mean_train_f1'],
    valid_mean=leaf_results['mean_test_f1'],
    train_std=leaf_results['std_train_f1'],
    valid_std=leaf_results['std_test_f1'],
    param_name='min_samples_leaf',
    score='F1',
    title='Cover Type Hyper Parameter Tuning',
)

#### 1.2.3 min_impurity_decrease

In [None]:
# Twenty evenly spaced impurity thresholds between 0.0 and 0.05.
param_grid = [dict(min_impurity_decrease=np.linspace(0.0, 0.05, 20))]

# Fresh default tree, 5-fold grid search.
# NOTE(review): this is the only search in the notebook scored with 'f1_macro';
# the others use 'f1', so the numbers are not directly comparable - confirm intent.
dt = DecisionTreeClassifier()
tuned_dt = GridSearchCV(
    estimator=dt,
    param_grid=param_grid,
    scoring=['f1_macro'],
    refit='f1_macro',
    cv=5,
    return_train_score=True,
)
tuned_dt.fit(Xtrain, Ytrain)

In [None]:
# Parameter setting with the best mean cross-validated score in the search above.
tuned_dt.best_params_

In [None]:
# Train vs. cross-validation macro-F1 (mean with a one-std band) for every threshold tried.
impurity_results = tuned_dt.cv_results_
plot_single_parameter_search_curve(
    params=np.linspace(0.0, 0.05, 20),
    train_mean=impurity_results['mean_train_f1_macro'],
    valid_mean=impurity_results['mean_test_f1_macro'],
    train_std=impurity_results['std_train_f1_macro'],
    valid_std=impurity_results['std_test_f1_macro'],
    param_name='min_impurity_decrease',
    score='F1 macro',
    title='Hyper Parameter Tuning',
)

#### 1.2.4 Search across all hyper parameters for the best set

### 1.3 Post-pruning

### 1.4 Learning Curve

In [None]:
# Tree with the hyper-parameter values selected for the final model; the values
# are written out here rather than read from a search result.
dt = DecisionTreeClassifier(max_depth=20, min_samples_leaf=1, min_impurity_decrease=0.0)

# Score the tree with 5-fold CV on ten training-set fractions (10% ... 100%).
size_fractions = np.linspace(0.1, 1.0, 10)
train_sizes, train_scores, valid_scores = learning_curve(
    dt,
    Xtrain,
    Ytrain,
    train_sizes=size_fractions,
    cv=5,
    scoring='f1',
)
plot_learning_curve(train_sizes, train_scores, valid_scores, "F1",
                    title="Decision Tree Learning Curve")

## 2. K-NN

### 2.1 Selection of K 

In [None]:
# Coarse search over the neighbourhood size: k = 1, 4, 7, ..., 19.
# NOTE(review): the features reach this distance-based model unscaled, while
# StandardScaler is imported at the top of the notebook but not used in the
# cells visible here - confirm this is intended.
k_values = range(1, 22, 3)
neigh = KNeighborsClassifier()
param_grid = [{'n_neighbors': k_values}]

# 5-fold grid search scored by F1; the best k is refit on the training split.
tuned_neigh = GridSearchCV(
    estimator=neigh,
    param_grid=param_grid,
    scoring=['f1'],
    refit='f1',
    cv=5,
    return_train_score=True,
)
tuned_neigh.fit(Xtrain, Ytrain)

print(tuned_neigh.best_params_)

# Train vs. cross-validation F1 for every k tried.
knn_results = tuned_neigh.cv_results_
plot_single_parameter_search_curve(
    params=k_values,
    train_mean=knn_results['mean_train_f1'],
    valid_mean=knn_results['mean_test_f1'],
    train_std=knn_results['std_train_f1'],
    valid_std=knn_results['std_test_f1'],
    param_name='n_neighbors',
    score='F1',
    title='Hyper Parameter Tuning',
)

In [None]:
# Fine search over the smallest neighbourhood sizes only: k = 1, 2, 3.
k_values = range(1, 4)
neigh = KNeighborsClassifier()
param_grid = [{'n_neighbors': k_values}]

# 5-fold grid search scored by F1; the best k is refit on the training split.
tuned_neigh = GridSearchCV(
    estimator=neigh,
    param_grid=param_grid,
    scoring=['f1'],
    refit='f1',
    cv=5,
    return_train_score=True,
)
tuned_neigh.fit(Xtrain, Ytrain)

print(tuned_neigh.best_params_)

# Train vs. cross-validation F1 for every k tried.
knn_results = tuned_neigh.cv_results_
plot_single_parameter_search_curve(
    params=k_values,
    train_mean=knn_results['mean_train_f1'],
    valid_mean=knn_results['mean_test_f1'],
    train_std=knn_results['std_train_f1'],
    valid_std=knn_results['std_test_f1'],
    param_name='n_neighbors',
    score='F1',
    title='Hyper Parameter Tuning',
)

### 2.2 Learning Curve

In [None]:
# KNN with k fixed at 3; the value is written out here rather than read from
# the search result.
neigh = KNeighborsClassifier(n_neighbors=3)

# Score the model with 5-fold CV on ten training-set fractions (10% ... 100%).
size_fractions = np.linspace(0.1, 1.0, 10)
train_sizes, train_scores, valid_scores = learning_curve(
    neigh,
    Xtrain,
    Ytrain,
    train_sizes=size_fractions,
    cv=5,
    scoring='f1',
)
plot_learning_curve(train_sizes, train_scores, valid_scores, "F1",
                    title="KNN Learning Curve")

## 3. Gradient Boosting

In [None]:
# Gradient boosting classifier constructed with default arguments; this
# instance is the base estimator for the grid searches in the cells below.
grd = GradientBoostingClassifier()
# Display the full hyper-parameter dict to see which settings can be tuned.
grd.get_params()

In [None]:
# Candidate ensemble sizes: 1, 6, 11, ..., 196 boosting stages.
param_grid = [dict(n_estimators=range(1, 201, 5))]

# 5-fold grid search scored by F1; the best setting is refit on the training split.
tuned_grd = GridSearchCV(
    estimator=grd,
    param_grid=param_grid,
    scoring=['f1'],
    refit='f1',
    cv=5,
    return_train_score=True,
)
tuned_grd.fit(Xtrain, Ytrain)

print(tuned_grd.best_params_)


In [None]:
# Plot train vs. cross-validation F1 for every n_estimators value tried.
# The x-axis values are the n_estimators grid from the search above, so the
# axis is labelled 'n_estimators' (it was previously mislabelled 'learning_rate').
plot_single_parameter_search_curve(range(1, 201, 5),
                                   tuned_grd.cv_results_['mean_train_f1'], 
                                   tuned_grd.cv_results_['mean_test_f1'], 
                                   tuned_grd.cv_results_['std_train_f1'], 
                                   tuned_grd.cv_results_['std_test_f1'], 
                                   'n_estimators', 
                                   'F1', 
                                   'Cover Type Hyper Parameter Tuning')

In [None]:
# Candidate depths for the individual trees: 1, 3, 5, ..., 19.
depth_values = range(1, 20, 2)
param_grid = [{'max_depth': depth_values}]

# 5-fold grid search scored by F1; the best setting is refit on the training split.
tuned_grd = GridSearchCV(
    estimator=grd,
    param_grid=param_grid,
    scoring=['f1'],
    refit='f1',
    cv=5,
    return_train_score=True,
)
tuned_grd.fit(Xtrain, Ytrain)

print(tuned_grd.best_params_)

# Train vs. cross-validation F1 for every depth tried.
grd_depth_results = tuned_grd.cv_results_
plot_single_parameter_search_curve(
    params=depth_values,
    train_mean=grd_depth_results['mean_train_f1'],
    valid_mean=grd_depth_results['mean_test_f1'],
    train_std=grd_depth_results['std_train_f1'],
    valid_std=grd_depth_results['std_test_f1'],
    param_name='max_depth',
    score='F1',
    title='Cover Type Hyper Parameter Tuning',
)

In [None]:
# Re-draw the max_depth curve from the most recent gradient-boosting search.
# NOTE(review): this repeats the plot call at the end of the preceding cell
# with identical arguments - confirm whether both are needed.
grd_depth_results = tuned_grd.cv_results_
plot_single_parameter_search_curve(
    params=range(1, 20, 2),
    train_mean=grd_depth_results['mean_train_f1'],
    valid_mean=grd_depth_results['mean_test_f1'],
    train_std=grd_depth_results['std_train_f1'],
    valid_std=grd_depth_results['std_test_f1'],
    param_name='max_depth',
    score='F1',
    title='Cover Type Hyper Parameter Tuning',
)

## 4. SVM

In [None]:
# SVC base estimator for the kernel search below, with random_state fixed at 0.
# degree=2 sets the polynomial degree; per the scikit-learn documentation it is
# only used by the 'poly' kernel.
svm = SVC(random_state=0, degree=2)
# Display the full hyper-parameter dict to see which settings can be tuned.
svm.get_params()

In [None]:
# Compare the linear kernel against the polynomial kernel.
kernel_names = ['linear', 'poly']
param_grid = [{'kernel': kernel_names}]

# Grid search scored by F1 with only 2 folds and verbose progress output.
tuned_svm = GridSearchCV(
    estimator=svm,
    param_grid=param_grid,
    scoring=['f1'],
    refit='f1',
    cv=2,
    return_train_score=True,
    verbose=50,
)
tuned_svm.fit(Xtrain, Ytrain)

print(tuned_svm.best_params_)

# Train vs. cross-validation F1 for each kernel (categorical x-axis).
svm_results = tuned_svm.cv_results_
plot_single_parameter_search_curve(
    params=kernel_names,
    train_mean=svm_results['mean_train_f1'],
    valid_mean=svm_results['mean_test_f1'],
    train_std=svm_results['std_train_f1'],
    valid_std=svm_results['std_test_f1'],
    param_name='kernel',
    score='F1',
    title='Hyper Parameter Tuning',
)

## 5. Neural Network

In [None]:
# Multi-layer perceptron constructed with default arguments (no random_state
# is passed); this instance is the base estimator for the searches below.
nn = MLPClassifier()

# Display the full hyper-parameter dict to see which settings can be tuned.
nn.get_params()

In [None]:
# Hidden-layer layouts explored in earlier runs (kept for reference, disabled):
#   [(3,), (6,), (9,), (12,), (3, 2), (6, 3), (9, 3), (12, 6)]
#   [(20,), (40,), (60,), (20, 10), (40, 20)]
#   [(100,), (150,), (100, 50), (60, 30)]
# Active grid: a single two-layer layout with 300 and 150 units.
param_grid = [dict(hidden_layer_sizes=[(300, 150)])]

# 5-fold cross-validation scored by F1, run on all available cores with
# verbose progress output.
tuned_nn = GridSearchCV(
    estimator=nn,
    param_grid=param_grid,
    scoring=['f1'],
    refit='f1',
    cv=5,
    return_train_score=True,
    verbose=50,
    n_jobs=-1,
)
tuned_nn.fit(Xtrain, Ytrain)

print(tuned_nn.best_params_)

In [None]:
# Inspect the raw cross-validation results (scores and timings) of the search above.
tuned_nn.cv_results_

In [None]:
# Keep the architecture fixed at (200, 100) and vary only the iteration budget
# (10, 60, ..., 460). The range is defined once so the search grid and the
# plot's x-axis cannot drift apart.
max_iter_values = range(10, 500, 50)
param_grid = [{'hidden_layer_sizes': [(200,100)],
               'max_iter': max_iter_values
              }]
# Search parameters with 5-fold cross-validation, scored by F1, on all cores.
tuned_nn = GridSearchCV(nn, param_grid, cv=5,
                         scoring= ['f1'], refit='f1', 
                         return_train_score=True, verbose=50, n_jobs=-1)
tuned_nn.fit(Xtrain, Ytrain)

print(tuned_nn.best_params_)
# Plot train vs. cross-validation F1 for every max_iter value tried. The varied
# parameter is max_iter, so the x-axis is labelled 'max_iter' (it was
# previously mislabelled 'kernel').
plot_single_parameter_search_curve(max_iter_values,
                                   tuned_nn.cv_results_['mean_train_f1'], 
                                   tuned_nn.cv_results_['mean_test_f1'], 
                                   tuned_nn.cv_results_['std_train_f1'], 
                                   tuned_nn.cv_results_['std_test_f1'], 
                                   'max_iter', 
                                   'F1', 
                                   'Hyper Parameter Tuning')