<b>Data mining Project - 2021/22</b><br/>
<span>
<b>Authors:</b> Mariagiovanna Rotundo (560765), Nunzio Lopardo (600005) and Renato Eschini (203021)<br/>
<b>Group:</b>3<br/>
<b>Release date:</b> 26/12/2021
</span>

# Classification task

In this notebook we use different classifiers for the classification task and we evaluate their performance. The dataset used is the player dataset created in the preparation notebook. 
Since the train and test datasets are imbalanced, 3 different approaches are evaluated for the classification task:
1. the classification is done on the train and test sets without considering the imbalance
2. the classification is done on the train and test sets giving different weights to the 2 classes (high rank and low rank)
3. the classification is done on the oversampled train and test sets using SMOTE

**Import libraries**

In [None]:
import math
import numpy as np
import matplotlib.pyplot as plt
import collections
import pydotplus 
import statistics 
import pandas as pd
import os
from datetime import date
from tqdm.notebook import tqdm
from IPython.display import Image  
import scikitplot as skplt
import wittgenstein as lw
from imblearn.over_sampling import SMOTE

from scipy.stats.stats import pearsonr
from scipy.spatial.distance import pdist,  squareform
import scipy.stats as stats
from scipy.stats import randint as sp_randint

from sklearn.svm import SVC
from sklearn.decomposition import PCA
from sklearn.metrics import silhouette_score, pairwise_distances, classification_report, plot_confusion_matrix, confusion_matrix, ConfusionMatrixDisplay # For Model evaluation
from sklearn.metrics import make_scorer, accuracy_score, precision_score, recall_score
from sklearn.neighbors import NearestNeighbors
from sklearn.model_selection import train_test_split, cross_validate, RandomizedSearchCV, GridSearchCV, RepeatedStratifiedKFold
from sklearn import tree, metrics
from sklearn.datasets import make_blobs
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import roc_curve
from sklearn.metrics import auc

from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import RandomForestClassifier

import seaborn as sns
import re

**Loading the dataset**

In [None]:
# Load the players dataset produced by the preparation notebook.
# NOTE(review): DATASET_DIR is defined here but the read below does not use it — confirm where players.csv is expected to live.
DATASET_DIR = "dataset" + os.path.sep
# index_col=0 uses the first CSV column as the DataFrame index
df_players = pd.read_csv('players.csv', sep=',', index_col=0) 

In [None]:
df_players.info()

## Functions

In this section are defined the functions used in the notebook. 

**function to discretize categorical data**

In [None]:
def discretize_data(dataset, variables):
    """Add an integer-coded ``<name>_num`` column for each categorical column.

    For every column name in ``variables`` the distinct values are sorted and
    numbered from 0; the codes are written to a new column called
    ``<name>_num``. The columns are added to ``dataset`` itself, which is also
    returned.
    """
    for column in variables:
        # sorted distinct values -> consecutive integer codes starting at 0
        distinct_values = sorted(dataset[column].unique())
        codes = {value: code for code, value in enumerate(distinct_values)}

        # store the numeric representation next to the original column
        dataset[column + '_num'] = dataset[column].map(codes).astype(int)
    return dataset

In [None]:
#metrics computed on the test set
def report_scores(test_label, test_pred):
    """Print the classification report for the given true and predicted labels.

    The two classes are reported under the names 'low' and 'high'.
    """
    class_names = ['low', 'high']
    report = classification_report(test_label, test_pred, target_names=class_names)
    print(report)

In [None]:
#metrics computed on the test set
def compare_scores(models,test_set, test_label):
    """Print the classification report of every model in ``models``.

    ``models`` maps a display name to a fitted classifier; each classifier
    predicts on ``test_set`` and is scored against ``test_label``.
    """
    for name, classifier in models.items():
        print('\t\t\t' + name)
        report_scores(test_label, classifier.predict(test_set))

In [None]:
def spot_errors(test_label, test_pred):
    """Return one colour name per sample: 'darkred' if misclassified, else 'darkgray'.

    ``test_label`` is read positionally through its ``.array`` attribute and
    compared with the element of ``test_pred`` at the same position.
    """
    return [
        'darkred' if actual != test_pred[position] else 'darkgray'
        for position, actual in enumerate(test_label.array)
    ]

**Plot the neural network training history**

In [None]:
def plot_nn_training_history(history):
    """Plot the accuracy and loss trends recorded during neural network training.

    Parameters
    ----------
    history : object whose ``history`` attribute is a dict holding the
        per-epoch sequences 'accuracy', 'val_accuracy', 'loss' and 'val_loss'
        (e.g. the history returned by ``train_nn_model``).

    Two stacked panels are drawn: training/validation accuracy on top and
    training/validation loss below. Both y-axes are limited to [0, 1].
    """
    fig, (acc_plot, loss_plot) = plt.subplots(2, figsize=(15, 6), dpi=240)
    fig.suptitle('Accuracy and Loss trends')

    records = history.history
    acc, val_acc = records['accuracy'], records['val_accuracy']
    loss, val_loss = records['loss'], records['val_loss']
    epochs = range(1, len(acc) + 1)  # epochs are numbered from 1 on the x-axis

    acc_plot.plot(epochs, acc, label='Training Acc')
    acc_plot.plot(epochs, val_acc, label='Validation Acc')
    acc_plot.legend(loc='best')
    acc_plot.set_ylabel('Accuracy')
    acc_plot.set_ylim([0,1])
    acc_plot.grid(True)

    loss_plot.plot(epochs, loss, label='Training Loss')
    loss_plot.plot(epochs, val_loss, label='Validation Loss')
    loss_plot.legend(loc='best')
    loss_plot.set_ylabel('Loss')
    loss_plot.set_ylim([0,1])
    loss_plot.set_xlabel('Epochs')
    loss_plot.grid(True)

**Scatter plot comparison real/classified/misclassified data**

In [None]:
def scatter_pred_data(test_set, test_label, test_pred, classifier_name, x, y):
    """Draw three scatter plots comparing true labels, predictions and errors.

    Parameters
    ----------
    test_set : DataFrame containing the columns ``x`` and ``y``.
    test_label : Series with the true class of each row of ``test_set``.
    test_pred : predicted class of each row, in the same order.
    classifier_name : text used as prefix of the figure title.
    x, y : names of the columns plotted on the horizontal / vertical axis.

    The first panel is coloured by true label, the second by predicted label
    and the third highlights misclassified samples in dark red.
    """
    fig, (test_true_plt, test_pred_plt, errors_plt) = plt.subplots(1,3, figsize=(18,6), sharey=True)
    plt.suptitle(classifier_name + ' | Real vs Predicted labels')

    x_values = test_set[x].values
    y_values = test_set[y].values

    test_true_plt.set_title('True Label')
    test_true_plt.scatter(x_values, y_values, c=test_label.values, s=25, cmap='viridis')

    test_pred_plt.set_title('Predicted Label')
    test_pred_plt.scatter(x_values, y_values, c=test_pred, s=25, cmap='viridis')

    # The error colours are explicit colour names, so no colormap is passed here:
    # matplotlib ignores cmap when c is not numeric data.
    errors_plt.set_title('Misclassification')
    errors_plt.scatter(x_values, y_values, c=spot_errors(test_label, test_pred), s=25)
    plt.show()

**Print the dataset composition**

In [None]:
def print_dataset_composition(train_set, train_labels, test_set, test_labels):
    """Print how many samples of each class are in the train and test labels.

    Only the label collections are inspected; ``train_set`` and ``test_set``
    are accepted but not read. Class 0 is reported as "Low Rank" and class 1
    as "High Rank".
    """
    class_names = ((0, "Low Rank"), (1, "High Rank"))
    splits = (("", "training", train_labels), ("\n", "test", test_labels))
    for prefix, split_name, split_labels in splits:
        print(f"{prefix}{len(split_labels)} {split_name} samples:")
        for class_value, class_name in class_names:
            class_count = len(split_labels[split_labels == class_value])
            print(f"\t- {class_count} samples for the class {class_name}")

**Plot the ROC curve and compute the AUC**

In [None]:
def roc_curve_plot(model, test_set, test_label, test_pred, classifier_name):
    """Plot the ROC curve of ``model`` on the test set and report its AUC.

    Parameters
    ----------
    model : fitted model whose ``predict`` accepts an array of shape
        (n_samples, 1, n_features) and returns one score per sample.
    test_set : DataFrame with the test features.
    test_label : true class of each test sample.
    test_pred : thresholded predictions; kept only for backward compatibility
        and not used, because a ROC curve needs the continuous scores.
    classifier_name : text shown in the legend before the AUC value.
    """
    x_test = np.reshape(test_set.values, (len(test_set), 1, len(test_set.columns)))
    # Use the raw model scores: with hard 0/1 predictions the curve collapses
    # to a single operating point and the AUC is not informative.
    y_score = model.predict(x_test).ravel()
    fpr, tpr, _ = roc_curve(test_label, y_score)
    auc_area = auc(fpr, tpr)
    label_name = classifier_name + '(area = {:.3f})'.format(auc_area)
    plt.plot([0, 1], [0, 1], 'k--')  # chance-level diagonal
    plt.plot(fpr, tpr, label=label_name)
    plt.xlabel('False positive rate')
    plt.ylabel('True positive rate')
    plt.title('ROC curve')
    plt.legend(loc='best')
    plt.show()

**Plot used to compare multiple ROC curves and AUC**

In [None]:
def compare_roc_curves(models, test_set, test_label):
    """Plot the ROC curves of several classifiers on one figure, with their AUC.

    Parameters
    ----------
    models : dict mapping a display name to a fitted classifier. Models whose
        name contains "knn" are evaluated without the 'sex_num' and 'hand_num'
        columns.
    test_set : DataFrame with the test features.
    test_label : true class of each test sample.

    The curves are computed from the output of each model's ``predict``.
    """
    curves = []
    for name, classifier in models.items():
        if "knn" in name:
            features = test_set.drop(columns=['sex_num', 'hand_num'])
        else:
            features = test_set
        pred = classifier.predict(features)

        fpr, tpr, _ = roc_curve(test_label, pred)
        # Collected in a plain list: DataFrame.append no longer exists in pandas 2.x
        curves.append((name, fpr, tpr, auc(fpr, tpr)))

    plt.figure(figsize=(8,6))

    for name, fpr, tpr, auc_score in curves:
        plt.plot(fpr, tpr, label="{}, AUC={:.3f}".format(name, auc_score))

    # chance-level diagonal
    plt.plot([0,1], [0,1], color='orange', linestyle='--')

    plt.xticks(np.arange(0.0, 1.1, step=0.1))
    plt.xlabel("False Positive Rate", fontsize=15)

    plt.yticks(np.arange(0.0, 1.1, step=0.1))
    plt.ylabel("True Positive Rate", fontsize=15)

    plt.title('ROC Curve Analysis', fontweight='bold', fontsize=15)
    plt.legend(loc='center left', bbox_to_anchor=(1, 0.5))

    plt.show()

**Plot confusion matrix**

In [None]:
def plot_confusion_mx(test_label, test_pred):
    """Plot the confusion matrix of the predictions with 'low'/'high' axis labels.

    Parameters
    ----------
    test_label : Series with the true classes (assumed to contain both classes).
    test_pred : predicted classes, in the same order.
    """
    # Sort the class values so that the matrix rows/columns are always ordered
    # (low, high): unique() alone returns them in order of first appearance,
    # which could silently swap the displayed labels.
    class_values = sorted(test_label.unique())
    cm = confusion_matrix(test_label, test_pred, labels=class_values)
    disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels= ['low', 'high'])
    disp.plot()
    plt.show()

**Plot multiple confusion matrix**

Given a dictionary of trained classifiers ({classifier_name: model}) plots the confusion matrix for all of them.

In [None]:
def compare_models(models_list, classifier_name, test_set, test_label):
    """Plot the confusion matrices of several classifiers side by side.

    Parameters
    ----------
    models_list : dict mapping a display name to a fitted classifier.
    classifier_name : text used as prefix of the figure title.
    test_set : features passed to each classifier's ``predict``.
    test_label : true class of each test sample.
    """
    # squeeze=False keeps axs two-dimensional, so the loop below also works
    # when a single model is given.
    fig, axs = plt.subplots(nrows=1, ncols=len(models_list), figsize=(18,6),
                            sharey=True, squeeze=False)
    plt.suptitle(classifier_name + ' | Confusion Matrix comparison')
    for ax, (name, classifier) in zip(axs[0], models_list.items()):
        test_pred = classifier.predict(test_set)
        cm = confusion_matrix(test_label, test_pred)
        # fmt='g' avoids exponential notation in the annotated counts
        sns.heatmap(cm, ax=ax, annot=True, cmap=plt.cm.Blues, fmt='g')
        ax.set_title(name)

**Function to normalize a dataframe**

In [None]:
def normalize_dataset(df):
    """Return a copy of ``df`` with the numeric player statistics min-max scaled.

    The input frame is left untouched: the scaled values are written to a copy,
    so the caller keeps its original data. Columns not listed in
    ``cols_to_norm`` are copied unchanged.

    NOTE: a new MinMaxScaler is fitted on every call, so each frame is scaled
    with its own minima and maxima.
    """
    cols_to_norm = ['best_rank_points', 'w_tourney', 'tot_minutes', 'sv1st', 'sv1st_win', 'sv2nd_win', 
           'df', 'ace', 'bpS', 'wmatch', 'lmatch', 'nmatch', 'n_tourney']
    normalized = df.copy()
    normalized[cols_to_norm] = MinMaxScaler().fit_transform(df[cols_to_norm])
    return normalized

## Data for classification

Selection of the columns that will be used to define the train and test sets and to define the labels. To establish if a player is high or low ranked we use the "best_rank" column

In [None]:
# Take an explicit copy of the selected columns: df_filtered gets new columns added
# later on, and writing into a slice of df_players raises SettingWithCopyWarning.
df_filtered = df_players[['sex', 'hand','best_rank','best_rank_points', 'w_tourney', 'tot_minutes', 'sv1st', 'sv1st_win', 'sv2nd_win', 
           'df', 'ace', 'bpS', 'wmatch', 'lmatch', 'nmatch', 'n_tourney']].copy()

Discretization of sex and hand values since they are categorical values

In [None]:
variables = ['sex', 'hand']
df_filtered = discretize_data(df_filtered, variables)

In [None]:
df_filtered = df_filtered.drop(columns=['sex', 'hand'])

In [None]:
df_filtered

We consider as high ranked the players whose best rank is within the first 50 positions

In [None]:
# Players with a valid (positive) best rank are labelled 1 (high) when the
# rank is within the threshold and 0 (low) otherwise.
threshold = 50
has_rank = df_filtered['best_rank'] > 0
is_high = df_filtered['best_rank'] <= threshold
is_low = df_filtered['best_rank'] > threshold
df_filtered.loc[has_rank & is_high, 'ranked'] = 1 #high
df_filtered.loc[has_rank & is_low, 'ranked'] = 0 #low

In [None]:
df_filtered.loc[((df_filtered['best_rank']>0) & (df_filtered['best_rank']<=threshold))].shape[0] #number of high

In [None]:
df_filtered.loc[((df_filtered['best_rank']>0) & (df_filtered['best_rank']>threshold))].shape[0] #number of low

255 players are considered high ranked and 4192 are considered low ranked. For the classification task we do not consider the players for which we cannot establish the rank using the best_rank value

In [None]:
df_classification = df_filtered[df_filtered['best_rank']>0]

In [None]:
df_classification = df_classification.drop(columns=['best_rank'])

In [None]:
df_classification

Creation of the training set and test set and their labels. To create these sets we maintain the same proportions between the 2 classes and we use a test set smaller than the training set (test_size=0.30)

In [None]:
# 'ranked' is removed from the features and used as the target
label = df_classification.pop('ranked')
# A fixed random_state makes the stratified split, and every score computed on it, reproducible across runs
train_set, test_set, train_label, test_label = train_test_split(df_classification, label, stratify = label, test_size=0.30, random_state=42)

**Dataset normalization**

In [None]:
# Pass copies so that train_set/test_set keep their original (unscaled) values
# for the classifiers that are meant to use the raw data.
norm_train_set = normalize_dataset(train_set.copy())
norm_test_set = normalize_dataset(test_set.copy())

**Dataset composition**

In [None]:
print_dataset_composition(train_set, train_label, test_set, test_label)

# Classification

### Decision tree

In [None]:
dt_models = {}

**creation and fit**

In [None]:
dt = tree.DecisionTreeClassifier(criterion='gini', splitter='best', 
                                  max_depth=5, 
                                  min_samples_split=3, min_samples_leaf=4)
dt = dt.fit(train_set, train_label)
dt_models['dt_original'] = dt

In [None]:
dot_data = tree.export_graphviz(dt, out_file=None, 
                         feature_names=list(train_set.columns),  
                         class_names=['low', 'high'],  #[0, 1]
                         filled=True, rounded=True)  
graph = pydotplus.graph_from_dot_data(dot_data)  
Image(graph.create_png())

**prediction on train and test sets**

In [None]:
train_pred_dt = dt.predict(train_set)
test_pred_dt = dt.predict(test_set)

#### evaluation

In [None]:
# scores on the training set
report_scores(train_label, train_pred_dt)

In [None]:
# scores on the test set
report_scores(test_label, test_pred_dt)
# Accuracy is a good indicator; it is meaningful when it is higher than the accuracy of the majority class.
# With unbalanced classes, precision and recall also help to understand how many errors we make.

Confusion matrix

In [None]:
#plot the confusion matrix 
plot_confusion_mx(test_label, test_pred_dt)

**Example of plot with classification results**

In [None]:
#true labels - different colors for different class
scatter_pred_data(test_set, test_label, test_pred_dt, 'Decision tree', 'nmatch', 'best_rank_points')

### SVM

In [None]:
svm_models = {}

**creation and fit**

In [None]:
svm = SVC(kernel='sigmoid', C=0.5, gamma='scale', probability=True)
svm.fit(train_set, train_label)
svm_models['svm_original'] = svm 

**prediction**

In [None]:
train_pred_svm = svm.predict(train_set)

In [None]:
test_pred_svm = svm.predict(test_set)

**evaluation**

In [None]:
report_scores(train_label, train_pred_svm)

In [None]:
#compute the performance of the model
report_scores(test_label, test_pred_svm)

We can notice that this model classifies all the players as low ranked

In [None]:
plot_confusion_mx(test_label, test_pred_svm)

In [None]:
scatter_pred_data(test_set, test_label, test_pred_svm, 'SVM', 'nmatch', 'best_rank_points')

### Rule based

The used rule-based method is RIPPER

In [None]:
rb_models = {}

**creation and fit**

In [None]:
#we run a grid search to find the best configuration of parameters' values
ripper = lw.RIPPER()
param_grid = {"prune_size": [0.5, 0.6], "k": [1, 3, 5]}
grid_search = GridSearchCV(estimator=ripper, param_grid=param_grid)

In [None]:
grid_result = grid_search.fit(train_set, train_label, pos_class=1)
grid_result.best_params_

In [None]:
#define and fit the rule-based model
#this function requires only one dataset with the labels. 
#To do so, we concatenate the train_set and the train_label
ripper = lw.RIPPER(k=grid_result.best_params_['k'], prune_size=grid_result.best_params_['prune_size'])
datas = pd.concat([train_set, train_label], axis=1)
ripper.fit(datas, class_feat='ranked', pos_class=1)

**obtained rules**

In [None]:
#in this case the model is a set of rules
ripper.out_model()

In [None]:
rb_models['rb_original'] = ripper

**evaluation**

In [None]:
ripper_pred_train = ripper.predict(train_set)
report_scores(train_label, ripper_pred_train)

In [None]:
ripper_pred = ripper.predict(test_set)
report_scores(test_label, ripper_pred)

In [None]:
#evaluation of the performance of the classifier (test set)
print('Accuracy ', ripper.score(test_set, test_label))
print('Precision ', ripper.score(test_set, test_label, precision_score))
print('Recall ', ripper.score(test_set, test_label, recall_score))

In [None]:
plot_confusion_mx(test_label, ripper_pred)

In [None]:
scatter_pred_data(test_set, test_label, ripper_pred, 'Rule based', 'nmatch', 'best_rank_points')

**prediction obtaining the used rules**

In [None]:
ripper_pred_reasons = ripper.predict(test_set, give_reasons=True)

In [None]:
# positions of the test samples predicted as positive (high rank)
indexes = [position for position, predicted in enumerate(ripper_pred_reasons[0]) if predicted == True]
# reasons (rules) reported for each of those positive predictions
rules_used = [ripper_pred_reasons[1][position] for position in indexes]

In [None]:
len(rules_used) #high predictions

In [None]:
rules_used

### Gaussian Naive Bayes

**Importing libraries**

In [None]:
from sklearn.naive_bayes import GaussianNB

In [None]:
gnb_models = {}

**Define the model**

In [None]:
gnb_model = GaussianNB()

**Train the Gaussian Naive Bayes classifier**

In [None]:
gnb_model.fit(train_set, train_label)
gnb_models['gnb_original'] = gnb_model

In [None]:
test_pred = gnb_model.predict(test_set)
print(classification_report(test_label, test_pred, target_names = ['low','high']))

The performance report reveals the low capacity of the GNB classifier to correctly classify the high rank players. This is due to the highly imbalanced dataset.

Let's plot the confusion matrix

In [None]:
plot_confusion_mx(test_label, test_pred)

### AdaBoost
Adaboost is an ensemble learning algorithm that uses the boosting method.

In [None]:
# Set the variables to make them easier to use (plain arrays extracted from the train/test sets and labels)
X_train, X_test, y_train, y_test = train_set.values, test_set.values, train_label.values, test_label.values

##### Using DecisionTreeClassifier (default) as Base Learners

In [None]:
# Create adaboost classifer object
abc = AdaBoostClassifier(n_estimators=50,learning_rate=1)
# Train Adaboost Classifer
model = abc.fit(X_train, y_train)

In [None]:
#Predict the response for test dataset
y_pred = model.predict(X_test)

In [None]:
# Model Accuracy, how often is the classifier correct?
print("Accuracy:",metrics.accuracy_score(y_test, y_pred))

In [None]:
# Report score on test label
report_scores(test_label, y_pred)

In [None]:
#Predict the response for train dataset
X_pred = model.predict(X_train)

In [None]:
# Report score on train label
report_scores(train_label, X_pred)

##### Using Support Vector Classifier as Base Learners

In [None]:
svc=SVC(probability=True, kernel='linear')

# Create adaboost classifer object
abc =AdaBoostClassifier(n_estimators=50, base_estimator=svc,learning_rate=1)

# Train Adaboost Classifer
model = abc.fit(X_train, y_train)

# Predict the response for test dataset
y_pred = model.predict(X_test)

# Model Accuracy, how often is the classifier correct?
print("Accuracy:",metrics.accuracy_score(y_test, y_pred))

In [None]:
# Report score on test label
report_scores(test_label, y_pred)

In [None]:
# Predict the response for train dataset
X_pred = model.predict(X_train)

In [None]:
# Report score on train label
report_scores(train_label, X_pred)

### Random Forest
Is a class of ensemble methods specifically designed for decision trees. It combines the predictions made by multiple decision trees and outputs the class that is the mode of the class's output by individual trees.

In [None]:
clf=RandomForestClassifier(n_estimators=100)
# Train the model using the training set
clf.fit(train_set,train_label)

In [None]:
# Predict on test set
y_pred=clf.predict(test_set)

In [None]:
# Model Accuracy, how often is the classifier correct?
print("Accuracy:",metrics.accuracy_score(test_label, y_pred))

In [None]:
# Report score on test label
report_scores(test_label, y_pred)

In [None]:
# Predict the response for train dataset with the Random Forest itself
# (`model` still refers to the AdaBoost classifier fitted in the previous section)
X_pred = clf.predict(train_set)

In [None]:
# Report score on train label
report_scores(train_label, X_pred)

### Neural Networks

**Importing libraries**

In [None]:
import tensorflow as tf

In the following list we will save all the trained Neural Network models. 

In [None]:
nn_models = {}

**Define and compile the neural network model.**

Function that returns the default neural network model with initial weights.

In [None]:
def base_nn_model(optimizer = 'adam', activation='relu', dropout_rate=0.15, neurons=15, loss='binary_crossentropy'):
    """Build and compile the default feed-forward network.

    The hyper-parameters are exposed as arguments so that a grid search can
    vary them. The network flattens an input of shape (1, n_features), where
    n_features is the number of columns of the global ``train_set``, then
    applies two Dense+Dropout stages and a single sigmoid output unit.
    """
    layers = tf.keras.layers
    stack = [layers.Flatten(input_shape=(1, len(train_set.columns)))]
    # two identical hidden stages: Dense followed by Dropout
    for _ in range(2):
        stack.append(layers.Dense(neurons, activation=activation))
        stack.append(layers.Dropout(dropout_rate))
    stack.append(layers.Dense(1, activation='sigmoid'))

    network = tf.keras.models.Sequential(stack)
    network.compile(optimizer=optimizer, loss=loss, metrics=['accuracy'])
    return network

Given a model and a training set, this function trains the neural network model.

Best: 0.973970 using {'activation': 'relu', 'batch_size': 500, 'dropout_rate': 0.2, 'epochs': 100, 'loss': 'binary_crossentropy', 'neurons': 13, 'optimizer': 'adam'}

In [None]:
def train_nn_model(model, train_set, train_label, epochs=60, batch_size=128,validation_split=0.2, verbose=False, class_weights=None):
    """Fit ``model`` on the training data and return it with its training history.

    The features are reshaped to (n_samples, 1, n_features) before fitting.
    ``class_weights`` is forwarded to ``model.fit`` as ``class_weight`` only
    when it is not None.

    Returns
    -------
    (model, history) : the same model object and the value returned by ``model.fit``.
    """
    n_samples, n_features = len(train_set), len(train_set.columns)
    x_train = np.reshape(train_set.values, (n_samples, 1, n_features))

    fit_options = {
        'epochs': epochs,
        'batch_size': batch_size,
        'validation_split': validation_split,
        'verbose': verbose,
    }
    if class_weights is not None:
        fit_options['class_weight'] = class_weights

    history = model.fit(x_train, train_label, **fit_options)
    return model, history

In [None]:
nn_model = base_nn_model()
nn_model, history = train_nn_model(nn_model, norm_train_set, train_label)
nn_models['NN_original'] = nn_model

Train the model using the original and normalized dataset.

In [None]:
nn_model.summary()

In [None]:
plot_nn_training_history(history)

In [None]:
x_train_test = np.reshape(norm_train_set.values, (len(norm_train_set), 1, len(train_set.columns)))
train_pred = (nn_model.predict(x_train_test) > 0.5).astype("int32")

In [None]:
report_scores(train_label, train_pred)

In [None]:
x_test = np.reshape(norm_test_set.values, (len(norm_test_set), 1, len(norm_test_set.columns)))
test_pred = (nn_model.predict(x_test) > 0.5).astype("int32")

In [None]:
report_scores(test_label, test_pred)

Given the trained NN model, let's look at the confusion matrix on the test set

In [None]:
cm=confusion_matrix(test_label,test_pred)
# the fmt parameter avoids the exponential notation of the numbers
sns.heatmap(cm, annot=True,cmap=plt.cm.Blues, fmt='g')
plt.tight_layout()
plt.ylabel('True label')
plt.xlabel('Predicted label')
plt.show()

In [None]:
scatter_pred_data(norm_test_set, test_label, test_pred, 'Neural Network', 'nmatch', 'best_rank_points')

In [None]:
roc_curve_plot(nn_model, norm_test_set, test_label, test_pred, 'NN')

### KNN (K-Nearest Neighbors)

In [None]:
from sklearn.neighbors import KNeighborsClassifier

In [None]:
knn_models = {}

Fit and score the classifier using the *GridSearchCV* function from sklearn, which automatically finds the best combination of parameters for the model training. Below we create the sets of parameter values that the function will use for the KNN training.

For the KNN we need to remove the categorical attributes.

In [None]:
knn_train_set = train_set.drop(columns=['sex_num', 'hand_num'])
knn_test_set = test_set.drop(columns=['sex_num', 'hand_num'])

In [None]:
k_range = range(1,31)

In [None]:
knn_metrics = ['euclidean', 'manhattan']

In [None]:
knn_weights = ['uniform', 'distance']

In [None]:
knn_algorithms = ['ball_tree', 'kd_tree', 'brute']

In [None]:
knn_param_grid = {
    'n_neighbors': k_range,
    'metric': knn_metrics,
    'algorithm': knn_algorithms,
    'weights': knn_weights
            }

In [None]:
knn_grid = GridSearchCV(KNeighborsClassifier(), knn_param_grid, cv=10, scoring='accuracy')
knn_grid.fit(knn_train_set, train_label)

In [None]:
print('Accuracy: ' + str(knn_grid.best_score_))
print('Parameters: ' + str(knn_grid.best_params_))

In [None]:
knn = KNeighborsClassifier(**knn_grid.best_params_).fit(knn_train_set, train_label)

In [None]:
knn_models['knn_original'] = knn

In [None]:
train_pred = knn.predict(knn_train_set)
report_scores(train_label,train_pred)

In [None]:
test_pred = knn.predict(knn_test_set)
report_scores(test_label,test_pred)

In [None]:
scatter_pred_data(test_set, test_label, test_pred, 'KNN', 'nmatch', 'best_rank_points')

As we know from the theory, nearest neighbor classifiers can be biased by noise points with oversized data values, which can mislead the classification task. The solution to this problem is normalization: in the following lines of code a normalized dataset is created using the *MinMaxScaler*

In [None]:
knn_grid = GridSearchCV(KNeighborsClassifier(), knn_param_grid, cv=10, scoring='accuracy')
norm_knn_train_set = normalize_dataset(knn_train_set)
norm_knn_test_set = normalize_dataset(knn_test_set)
knn_grid.fit(norm_knn_train_set, train_label)

In [None]:
print('Accuracy: ' + str(knn_grid.best_score_))
print('Parameters: ' + str(knn_grid.best_params_))

In [None]:
knn = KNeighborsClassifier(**knn_grid.best_params_).fit(norm_knn_train_set, train_label)

In [None]:
train_pred = knn.predict(norm_knn_train_set)
report_scores(train_label,train_pred)

In [None]:
test_pred = knn.predict(norm_knn_test_set)
report_scores(test_label,test_pred)

In [None]:
plot_confusion_mx(test_label, test_pred)

# Classification with weights

Weights associated with classes in the form {class_label: weight}.

The “balanced” mode uses the values of y to automatically adjust weights inversely proportional to class frequencies in the input data as:

$\frac{n_{samples}}{(n_{classes}\  *\  np.bincount(y))}$

Using weights we can say that the examples of a given class are more important than examples of the other class. <br>
To find the best weights for each classifier we do a grid search on a given list of weights.

In [None]:
#set weights
weights = {0:1.0, 1:100.0} #0=low, 1 = high
balance = [{0:0.8,1:3.5}, {0:1,1:5}, {0:1,1:10}, {0:1,1:15}, {0:1,1:20}, {0:1,1:50}, {0:1,1:100}, 'balanced']

### Decision tree

**creation of the model**

In [None]:
dt = tree.DecisionTreeClassifier(criterion='gini', splitter='best', 
                                  max_depth=5, class_weight=weights,
                                  min_samples_split=3, min_samples_leaf=4)
dt = dt.fit(train_set, train_label)

#### choice of weights

In [None]:
cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1)
param_grid = dict(class_weight=balance)
grid_search = GridSearchCV(estimator=dt, param_grid=param_grid, n_jobs=-1, cv=cv, scoring='roc_auc')

In [None]:
# Tune the class weights with cross-validation on the training data only:
# the test set must stay unseen until the final evaluation.
grid_result = grid_search.fit(train_set, train_label)

In [None]:
print("Best: %f using %s" % (grid_result.best_score_, grid_result.best_params_))
# report all configurations
means = grid_result.cv_results_['mean_test_score']
stds = grid_result.cv_results_['std_test_score']
params = grid_result.cv_results_['params']
for mean, stdev, param in zip(means, stds, params):
    print("%f (%f) with: %r" % (mean, stdev, param))

In [None]:
dt = tree.DecisionTreeClassifier(criterion='gini', splitter='best', 
                                  max_depth=5, class_weight=grid_result.best_params_['class_weight'],
                                  min_samples_split=3, min_samples_leaf=4)
dt = dt.fit(train_set, train_label)
dt_models['dt_weighted'] = dt

In [None]:
dot_data = tree.export_graphviz(dt, out_file=None, 
                         feature_names=list(train_set.columns),  
                         class_names=['low', 'high'],  #[0, 1]
                         filled=True, rounded=True)  
graph = pydotplus.graph_from_dot_data(dot_data)  
Image(graph.create_png())

**prediction**

In [None]:
train_pred_dt = dt.predict(train_set)
test_pred_dt = dt.predict(test_set)

**evaluation**

In [None]:
report_scores(train_label, train_pred_dt)

In [None]:
report_scores(test_label, test_pred_dt)

In [None]:
plot_confusion_mx(test_label, test_pred_dt)

In [None]:
scatter_pred_data(test_set, test_label, test_pred_dt, 'Decision Tree', 'nmatch', 'best_rank_points')

### SVM

**creation of the model with fixed weights**

In [None]:
svm = SVC(gamma='scale', class_weight=weights)
svm.fit(train_set, train_label)

In [None]:
train_pred = svm.predict(train_set)
report_scores(train_label, train_pred)

In [None]:
test_pred = svm.predict(test_set)

In [None]:
#compute the performance of the model
report_scores(test_label, test_pred)

#### choice of weights

In [None]:
cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1)
param_grid = dict(class_weight=balance)
grid_search = GridSearchCV(estimator=svm, param_grid=param_grid, n_jobs=-1, cv=cv, scoring='roc_auc')

In [None]:
# Tune the class weights with cross-validation on the training data only:
# the test set must stay unseen until the final evaluation.
grid_result = grid_search.fit(train_set, train_label)

In [None]:
print("Best: %f using %s" % (grid_result.best_score_, grid_result.best_params_))
# report all configurations
means = grid_result.cv_results_['mean_test_score']
stds = grid_result.cv_results_['std_test_score']
params = grid_result.cv_results_['params']
for mean, stdev, param in zip(means, stds, params):
    print("%f (%f) with: %r" % (mean, stdev, param))

In [None]:
svm = SVC(gamma='scale', class_weight=grid_result.best_params_['class_weight'])
svm.fit(train_set, train_label)
svm_models['svm_weighted'] = svm

**prediction and evaluation**

In [None]:
train_pred = svm.predict(train_set)
report_scores(train_label, train_pred)

In [None]:
test_pred = svm.predict(test_set)
report_scores(test_label, test_pred)

In [None]:
plot_confusion_mx(test_label, test_pred)

In [None]:
scatter_pred_data(test_set, test_label, test_pred, 'SVM', 'nmatch', 'best_rank_points')

### Rule based

**creation of the model and choice of weights and parameters**

In [None]:
# Grid search over prune_size, k and the candidate class weights to find the best configuration.
# NOTE(review): confirm that lw.RIPPER actually honours a `class_weight` parameter — nothing in this
# notebook shows that it does; if it is ignored, every entry of `balance` yields the same model.
ripper = lw.RIPPER()
param_grid = {"prune_size": [0.5, 0.6], "k": [1, 3, 5], "class_weight": balance}
grid_search = GridSearchCV(estimator=ripper, param_grid=param_grid)

In [None]:
grid_result = grid_search.fit(train_set, train_label, pos_class=1)
grid_result.best_params_

In [None]:
ripper = lw.RIPPER(k=grid_result.best_params_['k'], prune_size=grid_result.best_params_['prune_size'])
datas = pd.concat([train_set, train_label], axis=1)
ripper.fit(datas, class_feat='ranked', pos_class=1, class_weight = grid_result.best_params_['class_weight'])

**get model (rules)**

In [None]:
#in this case the model is a set of rules
ripper.out_model()

In [None]:
rb_models['rb_weighted'] = ripper

**prediction and evaluation**

In [None]:
ripper_pred_train = ripper.predict(train_set)
report_scores(train_label, ripper_pred_train)

In [None]:
ripper_pred = ripper.predict(test_set)
report_scores(test_label, ripper_pred)

In [None]:
#evaluation of the performance of the classifier
print('Accuracy ', ripper.score(test_set, test_label))
print('Precision ', ripper.score(test_set, test_label, precision_score))
print('Recall ', ripper.score(test_set, test_label, recall_score))

In [None]:
# confusion matrix of the rule-based predictions (test_pred still holds the SVM output)
plot_confusion_mx(test_label, ripper_pred)

In [None]:
scatter_pred_data(test_set, test_label, ripper_pred, 'Rule Based', 'nmatch', 'best_rank_points')

**rules used for predictions**

In [None]:
ripper_pred_reasons = ripper.predict(test_set, give_reasons=True)

In [None]:
# positions of the test samples predicted as positive (high rank)
indexes = [position for position, predicted in enumerate(ripper_pred_reasons[0]) if predicted == True]
# reasons (rules) reported for each of those positive predictions
rules_used = [ripper_pred_reasons[1][position] for position in indexes]
rules_used

### Neural Networks

Now let's re-run the neural network classifier using the weighted classes.

#### choice of weights

In [None]:
from sklearn.utils import class_weight
# Manually chosen per-class weights: class 1 is weighted more heavily than class 0.
weights_nn = {0: 0.75, 1: 3.5}
# Given a dict, compute_class_weight returns the weights as an array ordered
# like `classes`.
class_weights = class_weight.compute_class_weight(class_weight = weights_nn,
                                                 classes = np.unique(train_label),
                                                 y = train_label)
# Re-key the array by position.
# NOTE(review): position equals the class label only if the labels are
# exactly 0..n-1 - confirm.
class_weights = dict(enumerate(class_weights))
print(class_weights)

In [None]:
# Build and train the network intended to use the weighted classes, then
# register it under 'NN_weighted' for the final comparison.
# NOTE(review): `class_weights` (computed in the previous cell) is not passed
# to train_nn_model here; unless the helper reads it from the global scope,
# this model is trained without any class weighting - confirm.
nn_model_w = base_nn_model()
nn_model_w, history = train_nn_model(nn_model_w, norm_train_set, train_label)
nn_models['NN_weighted'] = nn_model_w

Train the model using the original and normalized dataset.

In [None]:
nn_model_w.summary()

In [None]:
plot_nn_training_history(history)

In [None]:
x_train_test = np.reshape(norm_train_set.values, (len(norm_train_set), 1, len(norm_train_set.columns)))
train_pred = (nn_model_w.predict(x_train_test) > 0.5).astype("int32")

In [None]:
report_scores(train_label, train_pred)

In [None]:
x_test = np.reshape(norm_test_set.values, (len(norm_test_set), 1, len(norm_test_set.columns)))
test_pred = (nn_model_w.predict(x_test) > 0.5).astype("int32")

In [None]:
report_scores(test_label, test_pred)

Given the trained NN model, let's look at the confusion matrix on the test set.

In [None]:
# Confusion matrix of the weighted NN predictions on the test set, drawn as a heatmap.
cm=confusion_matrix(test_label,test_pred)
# fmt='g' avoids exponential notation for the annotated numbers
sns.heatmap(cm, annot=True,cmap=plt.cm.Blues, fmt='g')
plt.tight_layout()
plt.ylabel('True label')
plt.xlabel('Predicted label')
plt.show()

In [None]:
scatter_pred_data(norm_test_set, test_label, test_pred, 'Neural Network', 'nmatch', 'best_rank_points')

In [None]:
roc_curve_plot(nn_model_w, norm_test_set, test_label, test_pred, 'NN')

# Oversampling with SMOTE

We use SMOTE to apply oversampling to the 2 classes in the training set.

In [None]:
# Oversample the training data with SMOTE. With a float sampling_strategy,
# imbalanced-learn targets that minority/majority ratio after resampling (0.3 here).
# Only the training data is resampled; test_set is left untouched.
# NOTE(review): no random_state is given, so the synthetic samples may differ
# between runs - confirm whether reproducibility is required.
oversample = SMOTE(sampling_strategy=0.3)
training, labels = oversample.fit_resample(train_set, train_label)

**Original Dataset**

In [None]:
print_dataset_composition(train_set, train_label, test_set, test_label)

**Dataset after oversampling**

In [None]:
print_dataset_composition(training, labels, test_set, test_label)

### Decision tree

**new model**

In [None]:
dt = tree.DecisionTreeClassifier(criterion='gini', splitter='best', 
                                  max_depth=5, 
                                  min_samples_split=3, min_samples_leaf=4)
dt = dt.fit(training, labels)
dt_models['dt_ov'] = dt

**prediction**

In [None]:
train_pred = dt.predict(train_set)
test_pred_dt = dt.predict(test_set)

**evaluation**

In [None]:
report_scores(train_label, train_pred)

In [None]:
report_scores(test_label, test_pred_dt)

In [None]:
# Confusion matrix of the oversampled decision tree on the test set.
# `test_pred_dt` is used because `test_pred` still holds the predictions of
# the weighted neural network from the previous section.
plot_confusion_mx(test_label, test_pred_dt)

In [None]:
scatter_pred_data(test_set, test_label, test_pred_dt, 'Decision Tree', 'nmatch', 'best_rank_points')

### SVM

**new model**

In [None]:
svm = SVC(kernel='sigmoid', gamma='scale')
svm.fit(training, labels)
svm_models['svm_ov'] = svm 

**prediction**

In [None]:
train_pred = svm.predict(train_set)
test_pred = svm.predict(test_set)

**evaluation**

In [None]:
report_scores(train_label, train_pred)

In [None]:
report_scores(test_label, test_pred)

In [None]:
plot_confusion_mx(test_label, test_pred)

In [None]:
scatter_pred_data(test_set, test_label, test_pred, 'SVM', 'nmatch', 'best_rank_points')

### Rule based

**new model**

In [None]:
#we run a grid search to find the best configuration of parameters' values
ripper = lw.RIPPER()
param_grid = {"prune_size": [0.5, 0.6], "k": [1, 3, 5]}
grid_search = GridSearchCV(estimator=ripper, param_grid=param_grid)

In [None]:
grid_result = grid_search.fit(training, labels, pos_class=1)
grid_result.best_params_

In [None]:
ripper = lw.RIPPER(k=grid_result.best_params_['k'], prune_size=grid_result.best_params_['prune_size'])
datas = pd.concat([training, labels], axis=1)
ripper.fit(datas, class_feat='ranked', pos_class=1)

**model (rules)**

In [None]:
#in this case the model is a set of rules
ripper.out_model()

In [None]:
rb_models['rb_ov'] = ripper

**prediction and evaluation**

In [None]:
ripper_pred_train = ripper.predict(train_set)
report_scores(train_label, ripper_pred_train)

In [None]:
ripper_pred = ripper.predict(test_set)
report_scores(test_label, ripper_pred)

In [None]:
#evaluation of the performance of the classifier
print('Accuracy ', ripper.score(test_set, test_label))
print('Precision ', ripper.score(test_set, test_label, precision_score))
print('Recall ', ripper.score(test_set, test_label, recall_score))

In [None]:
# Confusion matrix of the rule-based (RIPPER) model trained on the oversampled data.
# `ripper_pred` is used because `test_pred` still holds the SVM predictions
# computed in the previous section.
plot_confusion_mx(test_label, ripper_pred)

In [None]:
scatter_pred_data(test_set, test_label, ripper_pred, 'Rule based', 'nmatch', 'best_rank_points')

### Gaussian Naive Bayes

Gaussian Naive Bayes using the oversampled dataset.

**Define the model**

In [None]:
gnb_model = GaussianNB()

**Train the Gaussian Naive Bayes classifier**

In [None]:
gnb_model.fit(training, labels)
gnb_models['gnb_ov'] = gnb_model

In [None]:
test_pred = gnb_model.predict(test_set)
print(classification_report(test_label, test_pred, target_names = ['low','high']))

The performance report reveals the low capacity of the GNB classifier to correctly classify the high-rank players. This is due to the highly imbalanced dataset.

Let's plot the confusion matrix

In [None]:
plot_confusion_mx(test_label,test_pred)
plt.show() 

### Neural Networks

In [None]:
norm_ov_train_set = normalize_dataset(training)

In [None]:
nn_model_ov = base_nn_model()
nn_model_ov, history = train_nn_model(nn_model_ov, norm_ov_train_set, labels)
nn_models['NN_smote'] = nn_model_ov

In [None]:
nn_model_ov.summary()

In [None]:
plot_nn_training_history(history)

In [None]:
x_test = np.reshape(norm_test_set.values, (len(norm_test_set), 1, len(norm_test_set.columns)))
test_pred = (nn_model_ov.predict(x_test) > 0.5).astype("int32")

In [None]:
report_scores(test_label, test_pred)

In [None]:
# Confusion matrix of the SMOTE-trained NN predictions on the test set, drawn as a heatmap.
cm=confusion_matrix(test_label,test_pred)
# fmt='g' avoids exponential notation for the annotated numbers
sns.heatmap(cm, annot=True,cmap=plt.cm.Blues, fmt='g')
plt.tight_layout()
plt.ylabel('True label')
plt.xlabel('Predicted label')
plt.show()

In [None]:
scatter_pred_data(test_set, test_label, test_pred, 'Neural Network', 'nmatch', 'best_rank_points')

In [None]:
roc_curve_plot(nn_model_ov, norm_test_set, test_label, test_pred, 'NN')

### KNN (K-Nearest Neighbors)

Let's test the KNN on oversampled dataset.

For the KNN we need to remove the categorical attributes.

In [None]:
knn_ov_train_set = training.drop(columns=['sex_num', 'hand_num'])

In [None]:
k_range = range(1,31)

In [None]:
knn_metrics = ['euclidean', 'manhattan']

In [None]:
knn_weights = ['uniform', 'distance']

In [None]:
knn_algorithms = ['ball_tree', 'kd_tree', 'brute']

In [None]:
knn_param_grid = {
    'n_neighbors': k_range,
    'metric': knn_metrics,
    'algorithm': knn_algorithms,
    'weights': knn_weights
            }

In [None]:
knn_grid = GridSearchCV(KNeighborsClassifier(), knn_param_grid, cv=10, scoring='accuracy')
knn_grid.fit(knn_ov_train_set, labels)

In [None]:
print('Accuracy: ' + str(knn_grid.best_score_))
print('Parameters: ' + str(knn_grid.best_params_))

In [None]:
knn = KNeighborsClassifier(**knn_grid.best_params_).fit(knn_ov_train_set, labels)
knn_models['knn_ov'] = knn

In [None]:
train_pred = knn.predict(knn_ov_train_set)
report_scores(labels,train_pred)

In [None]:
test_pred = knn.predict(knn_test_set)
report_scores(test_label,test_pred)

In [None]:
plot_confusion_mx(test_label, test_pred)

In [None]:
scatter_pred_data(knn_test_set, test_label, test_pred, 'KNN','nmatch', 'best_rank_points')

Train the KNN model on the normalized oversampled dataset.

In [None]:
# Repeat the KNN grid search (10-fold CV, accuracy) on normalized features.
knn_grid = GridSearchCV(KNeighborsClassifier(), knn_param_grid, cv=10, scoring='accuracy')
# NOTE(review): train and test sets are normalized by two separate calls; if
# normalize_dataset fits its scaler on whatever it receives, the two sets are
# scaled with different statistics - confirm against the helper.
norm_knn_train_set = normalize_dataset(knn_ov_train_set)
norm_knn_test_set = normalize_dataset(knn_test_set)
knn_grid.fit(norm_knn_train_set, labels)

In [None]:
print('Accuracy: ' + str(knn_grid.best_score_))
print('Parameters: ' + str(knn_grid.best_params_))

In [None]:
knn = KNeighborsClassifier(**knn_grid.best_params_).fit(norm_knn_train_set, labels)

In [None]:
train_pred = knn.predict(norm_knn_train_set)
report_scores(labels,train_pred)

In [None]:
test_pred = knn.predict(norm_knn_test_set)
report_scores(test_label,test_pred)

In [None]:
plot_confusion_mx(test_label, test_pred)

# Summary: Classifiers Comparison

In [None]:
# Merge every per-family model dictionary into a single lookup; on duplicate
# keys the later family overrides the earlier one.
# NOTE(review): the 'knn_ov' model was fit on data without 'sex_num' and
# 'hand_num'; confirm that consumers of this dict feed it a matching feature set.
classifiers = {
    **dt_models,
    **svm_models,
    **rb_models,
    **gnb_models,
    **knn_models,
}

In this section, the performances of all the analyzed classifiers are compared on the three datasets.

In [None]:
compare_roc_curves(classifiers, test_set, test_label)

### Decision tree versions comparison

In [None]:
compare_models(dt_models, 'Decision Tree', test_set, test_label)

In [None]:
print('\t\t\t TRAINING SCORES')
compare_scores(dt_models,train_set, train_label)

In [None]:
print('\t\t\t TEST SCORES')
compare_scores(dt_models,test_set, test_label)

In [None]:
compare_roc_curves(dt_models,test_set, test_label)

### SVM versions comparison

In [None]:
compare_models(svm_models, 'SVM', test_set, test_label)

In [None]:
print('\t\t\t TRAINING SCORES')
compare_scores(svm_models,train_set, train_label)

In [None]:
print('\t\t\t TEST SCORES')
compare_scores(svm_models,test_set, test_label)

### Rule based versions comparison

In [None]:
compare_models(rb_models, 'Rule Based', test_set, test_label)

In [None]:
print('\t\t\t TRAINING SCORES')
compare_scores(rb_models,train_set, train_label)

In [None]:
print('\t\t\t TEST SCORES')
compare_scores(rb_models,test_set, test_label)

### Gaussian Naive Bayes versions comparison

In [None]:
compare_models(gnb_models, 'Gaussain Naive Bayes', test_set, test_label)

In [None]:
print('\t\t\t TRAINING SCORES')
compare_scores(gnb_models,train_set, train_label)

In [None]:
print('\t\t\t TEST SCORES')
compare_scores(gnb_models,test_set, test_label)

### Neural Network versions comparison

In [None]:
def compare_nn_models(models_list, classifier_name, test_set, test_label):
    """Plot the test-set confusion matrices of several neural networks side by side.

    Parameters
    ----------
    models_list : dict
        Maps a display name to a trained model. Each model's ``predict``
        output is thresholded at 0.5 to obtain the class labels.
    classifier_name : str
        Prefix used for the figure title.
    test_set : pandas.DataFrame
        Feature matrix. It is reshaped to ``(n_samples, 1, n_features)``
        before being passed to ``predict``.
    test_label : array-like
        True labels of ``test_set``.

    Returns
    -------
    None
        The heatmaps are drawn on a new matplotlib figure. Nothing is drawn
        when ``models_list`` is empty.
    """
    if not models_list:
        return

    # Take the feature count from the data actually being evaluated rather
    # than from a global training frame.
    n_samples, n_features = test_set.shape
    # The network input is the same for every model, so build it once.
    x_test = np.reshape(test_set.values, (n_samples, 1, n_features))

    # squeeze=False keeps `axs` two-dimensional even with a single model.
    fig, axs = plt.subplots(nrows=1, ncols=len(models_list), figsize=(18, 6),
                            sharey=True, squeeze=False)
    plt.suptitle(classifier_name + ' | Confusion Matrix comparison')

    for ax, (name, model) in zip(axs[0], models_list.items()):
        test_pred = (model.predict(x_test) > 0.5).astype("int32")
        cm = confusion_matrix(test_label, test_pred)
        # fmt='g' avoids exponential notation in the cell annotations
        sns.heatmap(cm, ax=ax, annot=True, cmap=plt.cm.Blues, fmt='g')
        ax.set_title(name)

In [None]:
compare_nn_models(nn_models, 'Neural Network',norm_test_set, test_label)

### KNN versions comparison

In [None]:
compare_models(knn_models, 'k-Nearest Neighbors', knn_test_set, test_label)

In [None]:
print('\t\t\t TRAINING SCORES')
compare_scores(knn_models,knn_train_set, train_label)

In [None]:
print('\t\t\t TEST SCORES')
compare_scores(knn_models,knn_test_set, test_label)