# ML multi-class classification template

- metric: F1 score (weighted average over the classes)

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
#ensure that plots are displayed inside the notebook
import math
import re

from sklearn.model_selection import train_test_split
from sklearn import metrics
from sklearn.metrics import plot_confusion_matrix
from sklearn.metrics import roc_curve, auc
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import StandardScaler

from sklearn.linear_model import LogisticRegression
from sklearn.feature_selection import RFE

from sklearn.ensemble import RandomForestClassifier
import lightgbm as lgb
from sklearn.neighbors import KNeighborsClassifier

## <u>Preprocessing</u>

In [None]:
def preprocess(filename, target, random_s, proportion):
    """Read a CSV file and split it into train and test sets.

    Every character other than ASCII letters, digits and underscores is
    stripped from the column names. The same clean-up is applied to
    ``target`` so that a target name containing such characters (for example
    a space) still matches its renamed column.

    The data is returned as read from the file: no feature scaling is applied.

    Parameters
    ----------
    filename : path of the CSV file, passed to ``pd.read_csv``.
    target : name of the column holding the class labels.
    random_s : seed passed to ``train_test_split`` as ``random_state``.
    proportion : value passed to ``train_test_split`` as ``test_size``.

    Returns
    -------
    The ``X_train, X_test, y_train, y_test`` sequence produced by
    ``train_test_split``.

    Raises
    ------
    ValueError
        If the (cleaned) target name is not one of the columns.
    """
    def _clean(name):
        # keep only the characters that are allowed in the column names
        return re.sub('[^A-Za-z0-9_]+', '', name)

    # parse the data in a dataframe
    df = pd.read_csv(filename)
    df = df.rename(columns=_clean)

    # the columns were renamed above, so look the target up by its cleaned name
    target = _clean(target)
    if target not in df.columns:
        raise ValueError(
            'target column {!r} not found in {!r}'.format(target, filename)
        )

    # set the target and explanatory variables
    features = [name for name in df.columns if name != target]
    y = df[target]
    X = df[features]

    # split the data in train and test set
    return train_test_split(X, y, test_size=proportion, random_state=random_s)

In [None]:
# Template configuration - edit these values for your own dataset.
# CSV file to load and name of the column holding the class labels
file, tar = 'data.csv', 'target'
# seed used for the train/test split and proportion of rows kept for testing
ran_state, prop = 1, 0.3

## <u>Classification models</u>

### 1.1) Logistic Regression 

In [None]:
X_train, X_test, y_train, y_test = preprocess(file, tar, ran_state, prop)
features = list(X_train.columns)

In [None]:
# create the Logistic Regression model
lr = LogisticRegression()

In [None]:
# fit the train data to the model
lr.fit(X_train, y_train)

In [None]:
# compute F1 score
y_pred = lr.predict(X_test) 
f1_1 = metrics.f1_score(y_test, y_pred, average = 'weighted')
print("F1 score:", f1_1)

In [None]:
# plot the confusion matrix
plot_confusion_matrix(lr, X_test, y_test, cmap='Reds')  
plt.title("Confusion Matrix: Logistic Regression")
plt.show()  

In [None]:
# compute feature importance
for feature, importance in zip(features, list(lr.coef_[0])):
    print('Feature: ', feature, '\t','Importance: ', importance)

In [None]:
# use the model to make classification
d =  # dataframe with explanatory features
# e.g. d = X_test.tail(3)
# display(d)
classified_labels = lr.predict(d)
print("The classifications are: ", classified_labels)

### 1.2) Logistic Regression with Recursive Feature Elimination (feature selection)

In [None]:
X_train, X_test, y_train, y_test = preprocess(file, tar, ran_state, prop)
features = list(X_train.columns)

In [None]:
# select the best n_features_to_select with Recursive Feature Elimination
rfe = RFE(estimator=LogisticRegression(), n_features_to_select=5)
lr_ = rfe.fit(X_train, y_train)
selected_features_RFE = list(X_train.columns[list(lr_.support_)])
# print(selected_features_RFE)

In [None]:
# create the Logistic Regression model
lr2 = LogisticRegression()

In [None]:
# fit the train data to the model
lr2.fit(X_train[selected_features_RFE],y_train)

In [None]:
# compute F1 score
y_pred = lr2.predict(X_test[selected_features_RFE]) 
f1_2 = metrics.f1_score(y_test, y_pred, average = 'weighted')
print("F1 score:", f1_2)

In [None]:
# plot the confusion matrix
plot_confusion_matrix(lr2, X_test[selected_features_RFE], y_test, cmap='Reds')  
plt.title("Confusion Matrix: Logistic Regression with Recursive Feature Elimination")
plt.show()  

In [None]:
# compute feature importance
for feature, importance in zip(selected_features_RFE, list(lr2.coef_[0])):
    print('Feature: ', feature, '\t','Importance: ', importance)

In [None]:
# use the model to make classification
d =  # dataframe with Recursive Feature Elimination selected features
# e.g. d = X_test[selected_features_RFE].tail(3)
# display(d)
classified_labels = lr2.predict(d)
print("The classifications are: ", classified_labels)

### 2.1) Random Forest

In [None]:
X_train, X_test, y_train, y_test = preprocess(file, tar, ran_state, prop)
features = list(X_train.columns)

In [None]:
# create the Random Forest model with 1000 trees in the forest
rf = RandomForestClassifier(n_estimators = 1000) 

In [None]:
# fit the train data to the model
rf.fit(X_train, y_train)

In [None]:
# compute F1 score
y_pred = rf.predict(X_test) 
f1_3 = metrics.f1_score(y_test, y_pred, average = 'weighted')
print("F1 score:", f1_3)

In [None]:
# plot the confusion matrix
plot_confusion_matrix(rf, X_test, y_test, cmap='Reds')  
plt.title("Confusion Matrix: Random Forest")
plt.show()  

In [None]:
# compute feature importance
for feature, importance in zip(features, rf.feature_importances_):
    print('Feature: ', feature, '\t','Importance: ', importance)

In [None]:
# use the model to make classification
d =  # dataframe with explanatory features
# e.g. d = X_test.tail(3)
# display(d)
classified_labels = rf.predict(d)
print("The classifications are: ", classified_labels)

### 2.2) Random Forest with Random Search (hyperparameter tuning)

In [None]:
X_train, X_test, y_train, y_test = preprocess(file, tar, ran_state, prop)
features = list(X_train.columns)

In [None]:
# tuning hyperparameters by randomly sampling from given parameters

# define values for hyperparameters
n_estimators = [int(x) for x in np.linspace(start = 200, stop = 2000, num = 200)]
max_features = ['auto', 'sqrt']
max_depth = [int(x) for x in np.linspace(10, 110, num = 11)]
max_depth.append(None)
min_samples_split = [2, 5, 10]
min_samples_leaf = [1, 2, 4]
bootstrap = [True, False]

# create the random grid
random_grid = {'n_estimators': n_estimators,
               'max_features': max_features,
               'max_depth': max_depth,
               'min_samples_split': min_samples_split,
               'min_samples_leaf': min_samples_leaf,
               'bootstrap': bootstrap}

# create the Random Forest model 
rf_ = RandomForestClassifier()
# Random Search of parameters using 5-fold Cross Validation
scorer = metrics.make_scorer(metrics.f1_score, average = 'weighted')
rf_random = RandomizedSearchCV(estimator = rf_, param_distributions = random_grid, scoring=scorer, n_iter = 10, cv = 5, random_state = 42, n_jobs = -1)
# fit the train data to the model
rf_random.fit(X_train, y_train)

In [None]:
# create the Random Forest model with the best hyperparameters after Random Search
d = rf_random.best_params_
rf2 = RandomForestClassifier(n_estimators = d['n_estimators'], min_samples_split = d['min_samples_split'], min_samples_leaf = d['min_samples_leaf'], max_features = d['max_features'], max_depth = d['max_depth'], bootstrap = d['bootstrap'])

In [None]:
# fit the train data to the model
rf2.fit(X_train,y_train)

In [None]:
# compute F1 score
y_pred = rf2.predict(X_test) 
f1_4 = metrics.f1_score(y_test, y_pred, average = 'weighted')
print("F1 score:", f1_4)

In [None]:
# plot the confusion matrix
plot_confusion_matrix(rf2, X_test, y_test, cmap='Reds')  
plt.title("Confusion Matrix: Random Forest with Random Search")
plt.show()  

In [None]:
# compute feature importance
for feature, importance in zip(features, rf2.feature_importances_):
    print('Feature: ', feature, '\t','Importance: ', importance)

In [None]:
# use the model to make classification
d =  # dataframe with explanatory features
# e.g. d = X_test.tail(3)
# display(d)
classified_labels = rf2.predict(d)
print("The classifications are: ", classified_labels)

### 3.1) Gradient Boosting

In [None]:
X_train, X_test, y_train, y_test = preprocess(file, tar, ran_state, prop)
features = list(X_train.columns)

In [None]:
# create the Gradient Boositing model with the following hyperparameters 
params = {
    'learning_rate': 0.05,
    "num_leaves": 1000,  
    "n_estimators": 1000
}
gbm = lgb.LGBMClassifier(**params)

In [None]:
# fit the train data to the model
gbm.fit(X_train, y_train);

In [None]:
# compute F1 score
y_pred = gbm.predict(X_test, num_iterations = 1000)
f1_5 = metrics.f1_score(y_test, y_pred, average = 'weighted')
print("F1 score:", f1_5)

In [None]:
# plot the confusion matrix
plot_confusion_matrix(gbm, X_test, y_test, cmap='Reds')  
plt.title("Confusion Matrix: Gradient Boosting")
plt.show()  

In [None]:
# compute feature importance
for feature, importance in zip(features, gbm.feature_importances_):
    print('Feature: ', feature, '\t','Importance: ', importance)

In [None]:
# use the model to make classification
d =  # dataframe with explanatory features
# e.g. d = X_test.tail(3)
# display(d)
classified_labels = gbm.predict(d)
print("The classifications are: ", classified_labels)

### 3.2) Gradient Boosting with Random Search (hyperparameter tuning)

In [None]:
X_train, X_test, y_train, y_test = preprocess(file, tar, ran_state, prop)
features = list(X_train.columns)

In [None]:
# tuning hyperparameters by randomly sampling from given parameters.

# define values for hyperparameters
learning_rate = [x for x in np.linspace(0.01, 1, num = 100)]
num_leaves = [int(x) for x in np.linspace(start = 200, stop = 2000, num = 200)]
n_estimators = [int(x) for x in np.linspace(start = 200, stop = 2000, num = 200)]
max_depth = [int(x) for x in np.linspace(10, 110, num = 100)]
max_depth.append(None)

# create the random grid
random_grid = {'n_estimators': n_estimators,
               'num_leaves': num_leaves,
               'max_depth': max_depth,
               'learning_rate': learning_rate}

# create the Gradient Boosting model
gb = lgb.LGBMClassifier()
# Random Search of parameters using 5-fold Cross Validation
scorer = metrics.make_scorer(metrics.f1_score, average = 'weighted')
gb_random = RandomizedSearchCV(estimator = gb, param_distributions = random_grid, scoring=scorer, n_iter = 10, cv = 5, random_state = 42, n_jobs = -1);
# fit the train data to the model
gb_random.fit(X_train, y_train);

In [None]:
# create the Gradient Boosting model with the best hyperparameters after Random Search
d = gb_random.best_params_
gbm2 = lgb.LGBMClassifier(num_leaves = d['num_leaves'],n_estimators = d['n_estimators'],max_depth = d['max_depth'], learning_rate = d['learning_rate'])

In [None]:
# fit the train data to the model
gbm2.fit(X_train,y_train);

In [None]:
# compute F1 score
y_pred = gbm2.predict(X_test)
f1_6 = metrics.f1_score(y_test, y_pred, average = 'weighted')
print("F1 score:", f1_6)

In [None]:
# plot the confusion matrix
plot_confusion_matrix(gbm2, X_test, y_test, cmap='Reds')  
plt.title("Confusion Matrix: Gradient Boosting with Random Search")
plt.show()  

In [None]:
# compute feature importance
for feature, importance in zip(features, gbm2.feature_importances_):
    print('Feature: ', feature, '\t','Importance: ', importance)

In [None]:
# use the model to make classification
d =  # dataframe with explanatory features
# e.g. d = X_test.tail(3)
# display(d)
classified_labels = gbm2.predict(d)
print("The classifications are: ", classified_labels)

### 4.1) K-Nearest Neighbors

In [None]:
X_train, X_test, y_train, y_test = preprocess(file, tar, ran_state, prop)
features = list(X_train.columns)

In [None]:
# create the K-Nearest Neighbors model with 10 neighbors
knn = KNeighborsClassifier(10)

In [None]:
# fit the train data to the model
knn.fit(X_train, y_train)

In [None]:
# compute F1 score
y_pred = knn.predict(X_test)
f1_7 = metrics.f1_score(y_test, y_pred, average = 'weighted')
print("F1 score:", f1_7)

In [None]:
# plot the confusion matrix
plot_confusion_matrix(knn, X_test, y_test, cmap='Reds')  
plt.title("Confusion Matrix: K-Nearest Neighbors")
plt.show()  

In [None]:
# use the model to make classification
d =  # dataframe with explanatory features
# e.g. d = X_test.tail(3)
# display(d)
classified_labels = knn.predict(d)
print("The classifications are: ", classified_labels)

### 4.2) K-Nearest Neighbors with Grid Search (hyperparameter tuning)

In [None]:
X_train, X_test, y_train, y_test = preprocess(file, tar, ran_state, prop)
features = list(X_train.columns)

In [None]:
# tuning hyperparameters by randomly sampling from given parameters.

# define values for hyperparameters
n_neighbors = [int(x) for x in np.linspace(2, 20, 19)]
# create the  grid
param_grid = {'n_neighbors': n_neighbors}

# create the K-Nearest Neighbors model
knn_ = KNeighborsClassifier()
# Grid Search of parameters using 5-fold Cross Validation
scorer = metrics.make_scorer(metrics.f1_score, average = 'weighted')
knn_grid = GridSearchCV(estimator=knn_, param_grid=param_grid, n_jobs=-1, cv=5, scoring=scorer)
# fit the train data to the model
grid_result = knn_grid.fit(X_train, y_train)

In [None]:
# create the K-Nearest Neighbors model with the best hyperparameters after Random Search
d = grid_result.best_params_
knn2 = KNeighborsClassifier(n_neighbors = d['n_neighbors'])

In [None]:
# fit the train data to the model
knn2.fit(X_train, y_train);

In [None]:
# compute F1 score
y_pred = knn2.predict(X_test);
f1_8 = metrics.f1_score(y_test, y_pred, average = 'weighted')
print("F1 score:", f1_8)

In [None]:
# plot the confusion matrix
plot_confusion_matrix(knn2, X_test, y_test, cmap='Reds')  
plt.title("Confusion Matrix: K-Nearest Neighbors with Grid Search")
plt.show()  

In [None]:
# use the model to make classification
d =  # dataframe with explanatory features
# e.g. d = X_test.tail(3)
# display(d)
classified_labels = knn2.predict(d)
print("The classifications are: ", classified_labels)

## <u>Results</u>

In [None]:
# Summary of the F1 scores computed above, grouped by model family.
# Each entry pairs a fixed-width label with the score printed next to it.
score_groups = (
    (
        ('Logistic Regression:                             ', f1_1),
        ('Logistic Regression with RFE:                    ', f1_2),
    ),
    (
        ('Random Forest:                                   ', f1_3),
        ('Random Forest with Random Search:                ', f1_4),
    ),
    (
        ('Gradient Boosting:                               ', f1_5),
        ('Gradient Boosting with Random Search:            ', f1_6),
    ),
    (
        ('K-Nearest Neighbors:                             ', f1_7),
        ('K-Nearest Neighbors with Grid Search:            ', f1_8),
    ),
)
for group_index, score_group in enumerate(score_groups):
    # an empty line separates consecutive model families
    if group_index > 0:
        print()
    for label, f1_value in score_group:
        print(label, f1_value)