<b>Data mining Project - 2021/22</b><br/>
<span>
<b>Authors:</b> Mariagiovanna Rotundo (560765), Nunzio Lopardo (600005) and Renato Eschini (203021)<br/>
<b>Group:</b>3<br/>
<b>Release date:</b> 26/12/2021
</span>

# Classification task

In this notebook we use different classifiers for the classification task and we evaluate their performance.

**Import libraries**

In [None]:
import math
import numpy as np
import matplotlib.pyplot as plt
import collections
import pydotplus 
import statistics 
import pandas as pd
import os
from datetime import date
from tqdm.notebook import tqdm
from IPython.display import Image  
import scikitplot as skplt
import wittgenstein as lw
from imblearn.over_sampling import SMOTE

from scipy.stats.stats import pearsonr
from scipy.spatial.distance import pdist,  squareform
import scipy.stats as stats
from scipy.stats import randint as sp_randint

from sklearn.svm import SVC
from sklearn.decomposition import PCA
from sklearn.metrics import silhouette_score, pairwise_distances, classification_report, plot_confusion_matrix, confusion_matrix, ConfusionMatrixDisplay # For Model evaluation
from sklearn.metrics import make_scorer, accuracy_score, precision_score, recall_score
from sklearn.neighbors import NearestNeighbors
from sklearn.model_selection import train_test_split, cross_validate, RandomizedSearchCV, GridSearchCV, RepeatedStratifiedKFold
from sklearn import tree, metrics
from sklearn.datasets import make_blobs
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import roc_curve
from sklearn.metrics import auc

from sklearn.ensemble import AdaBoostClassifier

import seaborn as sns
import re

**Loading the dataset**

In [None]:
# Load the players dataset.
# NOTE(review): DATASET_DIR is defined here but is not used by the read below,
# which looks for 'players.csv' relative to the current working directory.
DATASET_DIR = "dataset" + os.path.sep
# index_col=0 uses the first column of the CSV as the DataFrame index
df_players = pd.read_csv('players.csv', sep=',', index_col=0) 

In [None]:
df_players.info()

## Functions

This section defines the functions used in the notebook. 

**function to discretize categorical data**

In [None]:
def discretize_data(dataset, variables):
    """Add an integer-coded ``<name>_num`` column for each categorical column.

    The code assigned to a value is its position among the column's distinct
    values taken in sorted order (0 for the first, 1 for the second, ...).

    Args:
        dataset: pandas DataFrame containing the columns listed in
            ``variables``. It is modified in place.
        variables: iterable of column names to encode.

    Returns:
        The same ``dataset`` object, with the new ``*_num`` columns added.
    """
    for variable in variables:
        # distinct values in sorted order, so the codes are reproducible
        distinct_values = sorted(dataset[variable].unique())

        # value -> integer code (its rank among the sorted distinct values)
        mapping = {value: code for code, value in enumerate(distinct_values)}

        # add a new column with the numeric representation of the variable
        dataset[variable+'_num'] = dataset[variable].map(mapping).astype(int)
    return dataset

In [None]:
#metrics computed on the test set
def report_scores(test_label, test_pred):
    """Print the sklearn classification report for a set of predictions.

    Args:
        test_label: true class labels.
        test_pred: predicted class labels, in the same order as ``test_label``.

    The two classes are reported under the names 'low' and 'high'.
    """
    class_names = ['low', 'high']
    report = classification_report(test_label, test_pred, target_names=class_names)
    print(report)

In [None]:
def spot_errors(test_label, test_pred):
    """Return one colour name per sample, marking the misclassified ones.

    Args:
        test_label: pandas Series with the true labels (read positionally
            through its ``.array``).
        test_pred: positionally indexable sequence of predictions.

    Returns:
        A list with 'darkred' where the prediction differs from the true
        label and 'darkgray' where it matches.
    """
    return [
        'darkred' if actual != test_pred[position] else 'darkgray'
        for position, actual in enumerate(test_label.array)
    ]

**Plot the neural network training history**

In [None]:
def plot_nn_training_history(history):
    """Plot the accuracy and loss trends recorded while training a network.

    Args:
        history: object whose ``history`` attribute is a mapping holding the
            per-epoch series 'accuracy', 'val_accuracy', 'loss' and
            'val_loss' (in this notebook, the value returned by
            ``model.fit``).

    Two stacked panels are drawn: accuracy on top, loss below. Both y axes
    are clipped to [0, 1].
    """
    fig, (acc_plot, loss_plot) = plt.subplots(2, figsize=(15, 6), dpi=240)
    fig.suptitle('Accuracy and Loss trends')
    acc = history.history['accuracy']
    val_acc = history.history['val_accuracy']
    loss = history.history['loss']
    val_loss = history.history['val_loss']
    # epochs are numbered from 1 on the x axis
    epochs = range(1, len(acc) + 1)

    acc_plot.plot(epochs, acc, label='Training Acc')
    acc_plot.plot(epochs, val_acc, label='Validation Acc')
    acc_plot.legend(loc='best')
    acc_plot.set_ylabel('Accuracy')
    acc_plot.set_ylim([0,1])
    acc_plot.grid(True)

    loss_plot.plot(epochs, loss, label='Training Loss')
    loss_plot.plot(epochs, val_loss, label='Validation Loss')
    loss_plot.legend(loc='best')
    loss_plot.set_ylabel('Loss')
    loss_plot.set_ylim([0,1])
    loss_plot.set_xlabel('Epochs')
    loss_plot.grid(True)

**Scatter plot comparison real/classified/misclassified data**

In [None]:
def scatter_pred_data(test_set, test_label, test_pred, classifier_name, x, y):
    """Show true labels, predicted labels and misclassifications side by side.

    Args:
        test_set: pandas DataFrame with the test features.
        test_label: pandas Series with the true labels.
        test_pred: predicted labels, one per row of ``test_set``.
        classifier_name: text used as prefix of the figure title.
        x: name of the column plotted on the horizontal axis.
        y: name of the column plotted on the vertical axis.
    """
    x_values = test_set[x].values
    y_values = test_set[y].values
    fig, axes = plt.subplots(1,3, figsize=(18,6), sharey=True)
    plt.suptitle(classifier_name + ' | Real vs Predicted labels')
    # one (title, point colours) pair per panel, from left to right
    panels = [
        ('True Label', test_label.values),
        ('Predicted Label', test_pred),
        ('Misclassification', spot_errors(test_label, test_pred)),
    ]
    for axis, (panel_title, colours) in zip(axes, panels):
        axis.set_title(panel_title)
        axis.scatter(x_values, y_values, c=colours, s=25, cmap='viridis')
    plt.show()

**Print the dataset composition**

In [None]:
def print_dataset_composition(train_set, train_labels, test_set, test_labels):
    """Print how many samples of each class are in the train and test labels.

    Args:
        train_set: not used by this function.
        train_labels: training labels (0 = Low Rank, 1 = High Rank); must
            support boolean-mask indexing.
        test_set: not used by this function.
        test_labels: test labels, same encoding as ``train_labels``.
    """
    train_low = len(train_labels[train_labels == 0])
    train_high = len(train_labels[train_labels == 1])
    test_low = len(test_labels[test_labels == 0])
    test_high = len(test_labels[test_labels == 1])
    summary = [
        f"{len(train_labels)} training samples:",
        f"\t- {train_low} samples for the class Low Rank",
        f"\t- {train_high} samples for the class High Rank",
        f"\n{len(test_labels)} test samples:",
        f"\t- {test_low} samples for the class Low Rank",
        f"\t- {test_high} samples for the class High Rank",
    ]
    print("\n".join(summary))

**Plot the ROC curve and compute the AUC**

In [None]:
def roc_curve_plot(model, test_set, test_label, test_pred, classifier_name):
    """Plot the ROC curve of a neural-network model and report its AUC.

    Args:
        model: trained model whose ``predict`` returns one score per sample
            for inputs shaped (n_samples, 1, n_features).
        test_set: pandas DataFrame with the test features.
        test_label: true binary labels of the test samples.
        test_pred: thresholded predictions. Kept for backward compatibility
            only: the curve is built from the model's continuous scores,
            because hard 0/1 predictions yield a single operating point
            instead of a curve.
        classifier_name: text used in the legend entry.
    """
    x_test = np.reshape(test_set.values, (len(test_set), 1, len(test_set.columns)))
    # continuous scores give one ROC point per distinct threshold
    y_score = model.predict(x_test).ravel()
    fpr, tpr, _ = roc_curve(test_label, y_score)
    auc_area = auc(fpr, tpr)
    label_name = classifier_name + '(area = {:.3f})'.format(auc_area)
    # diagonal = performance of a random classifier
    plt.plot([0, 1], [0, 1], 'k--')
    plt.plot(fpr, tpr, label=label_name)
    plt.xlabel('False positive rate')
    plt.ylabel('True positive rate')
    plt.title('ROC curve')
    plt.legend(loc='best')
    plt.show()

**Plot confusion matrix**

In [None]:
def plot_confusion_mx(test_label, test_pred):
    """Plot the confusion matrix of a set of predictions.

    Args:
        test_label: true labels (0 = low, 1 = high).
        test_pred: predicted labels, in the same order as ``test_label``.
    """
    # Fix the class order explicitly: the order in which the labels happen to
    # appear in test_label is arbitrary and would not necessarily match the
    # ['low', 'high'] display labels below.
    cm = confusion_matrix(test_label, test_pred, labels=[0, 1])
    disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels= ['low', 'high'])
    disp.plot()
    plt.show()

**Plot multiple confusion matrix**

Given a dictionary of trained classifiers ({classifier_name: model}) plots the confusion matrix for all of them.

In [None]:
def compare_models(models_list, classifier_name, test_set, test_label):
    """Plot the test-set confusion matrices of several models side by side.

    Args:
        models_list: dictionary {model_name: trained model}; every model must
            expose ``predict``.
        classifier_name: text used as prefix of the figure title.
        test_set: test features passed to each model's ``predict``.
        test_label: true labels of the test samples.
    """
    # squeeze=False always returns a 2-D array of axes, so a dictionary with
    # a single model is handled like any other
    fig, axs = plt.subplots(nrows=1, ncols=len(models_list), figsize=(18,6),
                            sharey=True, squeeze=False)
    plt.suptitle(classifier_name + ' | Confusion Matrix comparison')
    for ax, (model_name, model) in zip(axs[0], models_list.items()):
        test_pred = model.predict(test_set)
        cm = confusion_matrix(test_label, test_pred)
        # fmt='g' avoids the exponential notation in the annotations
        sns.heatmap(cm, ax=ax, annot=True, cmap=plt.cm.Blues, fmt='g')
        ax.set_title(model_name)

**Function to normalize a dataframe**

In [None]:
def normalize_dataset(df, scaler=None):
    """Rescale the numeric player statistics of ``df`` in place.

    Args:
        df: pandas DataFrame containing the columns listed in
            ``cols_to_norm``. It is modified in place.
        scaler: optional, already fitted scaler exposing ``transform`` and
            fitted on the same columns. When given it is applied as is, so
            that e.g. a test set can be scaled with the statistics learned on
            the training set. When omitted, a new ``MinMaxScaler`` is fitted
            on ``df`` itself.

    Returns:
        The same ``df`` object, with the listed columns rescaled.
    """
    cols_to_norm = ['best_rank_points', 'w_tourney', 'tot_minutes', 'sv1st', 'sv1st_win', 'sv2nd_win', 
           'df', 'ace', 'bpS', 'wmatch', 'lmatch', 'nmatch', 'n_tourney']
    if scaler is None:
        df[cols_to_norm] = MinMaxScaler().fit_transform(df[cols_to_norm])
    else:
        df[cols_to_norm] = scaler.transform(df[cols_to_norm])
    return df

## Data for classification

In [None]:
# Take an explicit copy: columns are added to df_filtered in later cells, and
# assigning into a slice of df_players triggers pandas' SettingWithCopyWarning.
df_filtered = df_players[['sex', 'hand','best_rank','best_rank_points', 'w_tourney', 'tot_minutes', 'sv1st', 'sv1st_win', 'sv2nd_win', 
           'df', 'ace', 'bpS', 'wmatch', 'lmatch', 'nmatch', 'n_tourney']].copy()

In [None]:
# n_match = 1

# df_filtered = df_players[
#     (df_players['best_rank']>0) & 
#     (df_players['best_rank_points']>=0) & 
#     (df_players['tot_minutes']>0) & 
#     (df_players['ace']>=0) & 
#     (df_players['bpS']>=0)][[
# 'best_rank', 
# 'best_rank_points',                            
# 'tot_minutes',
# 'sv1st',
# 'sv1st_win', 
# 'sv2nd_win', 
# 'df', 
# 'ace', 
# 'bpS', 
# 'nmatch',
# 'wmatch',
# 'lmatch',
# 'n_tourney',
# 'w_tourney']]
# df_filtered = df_filtered.loc[df_filtered['nmatch'] > n_match]

# df_filtered

In [None]:
variables = ['sex', 'hand']
df_filtered = discretize_data(df_filtered, variables)

In [None]:
df_filtered = df_filtered.drop(columns=['sex', 'hand'])

In [None]:
df_filtered

In [None]:
threshold = 50
df_filtered.loc[((df_filtered['best_rank']>0) & (df_filtered['best_rank']<=threshold)), 'ranked'] = 1 #high
df_filtered.loc[((df_filtered['best_rank']>0) & (df_filtered['best_rank']>threshold)), 'ranked'] = 0 #low

In [None]:
df_filtered.loc[((df_filtered['best_rank']>0) & (df_filtered['best_rank']<=threshold))].shape[0] #number of high

In [None]:
df_filtered.loc[((df_filtered['best_rank']>0) & (df_filtered['best_rank']>threshold))].shape[0] #number of low

In [None]:
df_classification = df_filtered[df_filtered['best_rank']>0]

In [None]:
df_classification = df_classification.drop(columns=['best_rank'])

In [None]:
df_classification

In [None]:
label = df_classification.pop('ranked')
train_set, test_set, train_label, test_label = train_test_split(df_classification, label, stratify = label, test_size=0.30)

**Dataset normalization**

In [None]:
# NOTE(review): normalize_dataset rescales its argument in place and returns
# the same object, so norm_train_set / norm_test_set are aliases of
# train_set / test_set. Each call here also fits its own MinMaxScaler, i.e.
# the test set is scaled with its own min/max rather than the training ones.
norm_train_set = normalize_dataset(train_set)
norm_test_set = normalize_dataset(test_set)

**Dataset composition**

In [None]:
print_dataset_composition(train_set, train_label, test_set, test_label)

# Classification

### Decision tree

In [None]:
dt = tree.DecisionTreeClassifier(criterion='gini', splitter='best', 
                                  max_depth=5, 
                                  min_samples_split=3, min_samples_leaf=4)
dt = dt.fit(train_set, train_label)

In [None]:
dot_data = tree.export_graphviz(dt, out_file=None, 
                         feature_names=list(train_set.columns),  
                         class_names=['low', 'high'],  #[0, 1]
                         filled=True, rounded=True)  
graph = pydotplus.graph_from_dot_data(dot_data)  
Image(graph.create_png())

In [None]:
train_pred_dt = dt.predict(train_set)
test_pred_dt = dt.predict(test_set)

#### evaluation

In [None]:
dt.predict_proba(train_set)

In [None]:
#evaulate the accuracy on the train set and the test set
#metrics also contains precision, recall, f1 and the support
print('Accuracy train set ', metrics.accuracy_score(train_label, train_pred_dt))
print('Accuracy test set ', metrics.accuracy_score(test_label, test_pred_dt))
print('Precision train set ', metrics.precision_score(train_label, train_pred_dt, average='weighted'))
print('Recall train set ', metrics.recall_score(train_label, train_pred_dt, average='weighted'))
print('F1 score train set ', metrics.f1_score(train_label, train_pred_dt, average='weighted'))
print('Support train set ', metrics.precision_recall_fscore_support(train_label, train_pred_dt))

In [None]:
#for the training set
report_scores(train_label, train_pred_dt)

In [None]:
#for the test set
report_scores(test_label, test_pred_dt)
#accuracy is a good indicator; it is meaningful when it is higher than the accuracy of the majority class. with
#unbalanced classes, precision and recall also help to understand how many errors we make

In [None]:
### cross validation

In [None]:
scores = cross_validate(dt, train_set, train_label, cv=3, return_train_score= True)
print('Fit time ', statistics.mean(scores['fit_time']))
print('Score time ', statistics.mean(scores['score_time']))
print('Test score ', statistics.mean(scores['test_score']))
print('Train score ', statistics.mean(scores['train_score']))

In [None]:
#compute confusion matrix
cm = confusion_matrix(test_label, test_pred_dt)
cm

In [None]:
#it is possible to plot the confusion matrix 
plot_confusion_matrix(dt, test_set, test_label)
plt.show() 

In [None]:
#test_result = test_set
#test_result['ranked'] = test_label
#print classification for pairs of attributes/columns
#sns.pairplot(data = test_result, hue = 'ranked', palette = "Accent")

In [None]:
#true labels - different colors for different class
plt.scatter(test_set['best_rank_points'].values, test_set['sv1st'].values , c=test_label, s=20);
plt.show()
plt.scatter(test_set['nmatch'].values, test_set['sv1st'].values , c=test_label, s=20);

### SVM

In [None]:
svm_models = {}

In [None]:
svm = SVC(kernel='sigmoid', C=0.5, gamma='scale', probability=True)
svm.fit(train_set, train_label)
svm_models['svm_original'] = svm 

In [None]:
train_pred_svm = svm.predict(train_set)

In [None]:
report_scores(train_label, train_pred_svm)

In [None]:
#prediction on the test test
test_pred_proba_svm = svm.predict_proba(test_set)
test_pred_proba_svm

In [None]:
test_pred_svm = svm.predict(test_set)

In [None]:
#compute the performance of the model
report_scores(test_label, test_pred_svm)

In [None]:
plot_confusion_matrix(svm, test_set, test_label)
plt.show() 

### Rule based

In [None]:
rb_models = {}

In [None]:
#we run a grid search to find the best configuration of parameters' values
ripper = lw.RIPPER()
param_grid = {"prune_size": [0.5, 0.6], "k": [1, 3, 5]}
grid_search = GridSearchCV(estimator=ripper, param_grid=param_grid)
grid_search.fit(train_set, train_label, pos_class=1)

In [None]:
# best_params_ is the winning configuration; cv_results_['params'][0] is only
# the first candidate that was tried
print('Best parameters setting ', grid_search.best_params_)

In [None]:
#define and fit the rule-based model
#this function requires only one dataset with the labels. 
#To do so, we concatenate the train_set and the train_label
ripper = lw.RIPPER(k=1, prune_size=0.50)
datas = pd.concat([train_set, train_label], axis=1)
ripper.fit(datas, class_feat='ranked', pos_class=1)

In [None]:
#in this case the model is a set of rules
ripper.out_model()

In [None]:
rb_models['rb_original'] = ripper

In [None]:
ripper_pred_train = ripper.predict(train_set)
report_scores(train_label, ripper_pred_train)

In [None]:
ripper_pred = ripper.predict(test_set)
report_scores(test_label, ripper_pred)

In [None]:
#evaluation of the performance of the classifier
print('Accuracy ', ripper.score(test_set, test_label))
print('Precision ', ripper.score(test_set, test_label, precision_score))
print('Recall ', ripper.score(test_set, test_label, recall_score))

In [None]:
plot_confusion_matrix(ripper, test_set, test_label)
plt.show() 

In [None]:
ripper_pred_reasons = ripper.predict(test_set, give_reasons=True)

In [None]:
indexes = [i for i,elem in enumerate(ripper_pred_reasons[0]) if elem == True]
rules_used = [ripper_pred_reasons[1][elem] for i,elem in enumerate(indexes)]

In [None]:
rules_used

### Gaussian Naive Bayes

**Importing libraries**

In [None]:
from sklearn.naive_bayes import GaussianNB

In [None]:
gnb_models = {}

**Define the model**

In [None]:
gnb_model = GaussianNB()

**Train the Gaussian Naive Bayes classifier**

In [None]:
gnb_model.fit(train_set, train_label)
gnb_models['GNB_original'] = gnb_model

In [None]:
test_pred = gnb_model.predict(test_set)
print(classification_report(test_label, test_pred, target_names = ['low','high']))

The performance report reveals the low capacity of the GNB classifier to correctly classify the high-rank players. This is due to the highly imbalanced dataset.

Let's plot the confusion matrix

In [None]:
# gnb_model.classes_ is sorted (0 = low, 1 = high), so the display labels
# must follow the same order
cm = confusion_matrix(test_label, test_pred, labels=gnb_model.classes_)
disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels= ['low', 'high'])
disp.plot()
plt.show()

### AdaBoost

In [None]:
X_train, X_test, y_train, y_test = train_set.values, test_set.values, train_label.values, test_label.values

In [None]:
# Using DecisionTreeClassifier (default) as Base Learners

# Create adaboost classifer object
abc = AdaBoostClassifier(n_estimators=50,learning_rate=1)
# Train Adaboost Classifer
model = abc.fit(X_train, y_train)

#Predict the response for test dataset
y_pred = model.predict(X_test)

In [None]:
# Model Accuracy, how often is the classifier correct?
print("Accuracy:",metrics.accuracy_score(y_test, y_pred))

In [None]:
report_scores(test_label, y_pred)

In [None]:
#Predict the response for train dataset
X_pred = model.predict(X_train)

In [None]:
report_scores(train_label, X_pred)

In [None]:
# Using Support Vector Classifier as Base Learners
svc=SVC(probability=True, kernel='linear')

# Create adaboost classifer object
abc =AdaBoostClassifier(n_estimators=50, base_estimator=svc,learning_rate=1)

# Train Adaboost Classifer
model = abc.fit(X_train, y_train)

#Predict the response for test dataset
y_pred = model.predict(X_test)


# Model Accuracy, how often is the classifier correct?
print("Accuracy:",metrics.accuracy_score(y_test, y_pred))

In [None]:
report_scores(test_label, y_pred)

In [None]:
#Predict the response for train dataset
X_pred = model.predict(X_train)

In [None]:
report_scores(train_label, X_pred)

### Random Forest

In [None]:
#Import Random Forest Model
from sklearn.ensemble import RandomForestClassifier
clf=RandomForestClassifier(n_estimators=100)
#Train the model using the training sets y_pred=clf.predict(X_test)
clf.fit(train_set,train_label)
y_pred=clf.predict(test_set)

In [None]:
#Import scikit-learn metrics module for accuracy calculation
from sklearn import metrics
# Model Accuracy, how often is the classifier correct?
print("Accuracy:",metrics.accuracy_score(test_label, y_pred))

In [None]:
report_scores(test_label, y_pred)

In [None]:
#Predict the response for train dataset with the random forest (clf), not
#with the AdaBoost `model` left over from the previous section
X_pred = clf.predict(train_set)

In [None]:
report_scores(train_label, X_pred)

### Neural Networks

**Importing libraries**

In [None]:
import tensorflow as tf

In the following dictionary we will save all the trained Neural Network models. 

In [None]:
nn_models = {}

Define and compile the neural network model.

In [None]:
def base_nn_model(optimizer = 'adam', activation='relu', dropout_rate=0.2, neurons=20, loss='binary_crossentropy'): #specify parameters so that we can do grid search
    """Build and compile the feed-forward network used for the binary task.

    The input is flattened from (1, n_features), where n_features is the
    number of columns of the notebook-level ``train_set``; it is followed by
    two Dense+Dropout stages and a single sigmoid output unit.

    Args:
        optimizer: optimizer passed to ``model.compile``.
        activation: activation of the two hidden Dense layers.
        dropout_rate: rate of the Dropout layer following each hidden layer.
        neurons: number of units of each hidden Dense layer.
        loss: loss passed to ``model.compile``.

    Returns:
        The compiled model, tracking the 'accuracy' metric.
    """
    layers = tf.keras.layers
    n_features = len(train_set.columns)

    # create model
    model = tf.keras.models.Sequential()
    model.add(layers.Flatten(input_shape=(1, n_features)))
    for _ in range(2):
        model.add(layers.Dense(neurons, activation=activation))
        model.add(layers.Dropout(dropout_rate))
    model.add(layers.Dense(1, activation='sigmoid'))

    # Compile model
    model.compile(optimizer=optimizer, loss=loss, metrics=['accuracy'])
    return model

In [None]:
def train_nn_model(model, train_set, train_label, epochs=60, batch_size=256,validation_split=0.2, verbose=False, class_wieghts=None, class_weights=None):
    """Fit ``model`` on the given training data and return it with its history.

    Args:
        model: object exposing a Keras-style ``fit`` method.
        train_set: pandas DataFrame with the training features.
        train_label: labels aligned with the rows of ``train_set``.
        epochs: forwarded to ``model.fit``.
        batch_size: forwarded to ``model.fit``.
        validation_split: forwarded to ``model.fit``.
        verbose: forwarded to ``model.fit``.
        class_wieghts: misspelled legacy name of ``class_weights``, kept so
            that existing callers keep working.
        class_weights: optional mapping {class: weight}, forwarded to
            ``model.fit`` as ``class_weight``. Takes precedence over
            ``class_wieghts``. Weights are never read from global state:
            callers that want them must pass them explicitly.

    Returns:
        Tuple ``(model, history)`` where ``history`` is the value returned by
        ``model.fit``.
    """
    if class_weights is None:
        class_weights = class_wieghts
    # each sample is reshaped to (1, n_features)
    x_train = np.reshape(train_set.values, (len(train_set), 1, len(train_set.columns)))
    fit_kwargs = dict(epochs=epochs,
                      batch_size=batch_size,
                      validation_split=validation_split,
                      verbose=verbose)
    if class_weights:
        fit_kwargs['class_weight'] = class_weights
    # always fit on the labels received as argument
    history = model.fit(x_train, train_label, **fit_kwargs)
    return model, history

In [None]:
nn_model = base_nn_model()
nn_model, history = train_nn_model(nn_model, norm_train_set, train_label)
nn_models['NN_original'] = nn_model

Train the model using the original and normalized dataset.

In [None]:
nn_model.summary()

In [None]:
plot_nn_training_history(history)

In [None]:
x_train_test = np.reshape(norm_train_set.values, (len(norm_train_set), 1, len(train_set.columns)))
train_pred = (nn_model.predict(x_train_test) > 0.5).astype("int32")

In [None]:
report_scores(train_label, train_pred)

In [None]:
# col_count is not defined at notebook level: take the feature count from the data
x_test = np.reshape(norm_test_set.values, (len(norm_test_set), 1, len(norm_test_set.columns)))
test_pred = (nn_model.predict(x_test) > 0.5).astype("int32")

In [None]:
report_scores(test_label, test_pred)

Given the trained NN model, let's look at the confusion matrix on the test set

In [None]:
cm=confusion_matrix(test_label,test_pred)
#the fmt parameter avoids the exponential notation of the numbers
sns.heatmap(cm, annot=True,cmap=plt.cm.Blues, fmt='g')
plt.tight_layout()
plt.ylabel('True label')
plt.xlabel('Predicted label')
plt.show()

In [None]:
# norm_test_set holds the same rows as test_set, so its labels are test_label
# (norm_test_label is not defined until the KNN section)
scatter_pred_data(norm_test_set, test_label, test_pred, 'Neural Network', 'nmatch', 'best_rank_points')

In [None]:
# norm_test_set holds the same rows as test_set, so its labels are test_label
# (norm_test_label is not defined until the KNN section)
roc_curve_plot(nn_model, norm_test_set, test_label, test_pred, 'NN')

### KNN (K-Nearest Neighbors)

In [None]:
from sklearn.neighbors import KNeighborsClassifier

Fit and score the classifier using sklearn's *GridSearchCV* function, which automatically finds the best combination of parameters for training the model. The cells below define the sets of KNN parameter values that the function will explore.

In [None]:
k_range = range(1,31)

In [None]:
knn_metrics = ['euclidean', 'manhattan']

In [None]:
knn_weights = ['uniform', 'distance']

In [None]:
knn_algorithms = ['ball_tree', 'kd_tree', 'brute']

In [None]:
knn_param_grid = {
    'n_neighbors': k_range,
    'metric': knn_metrics,
    'algorithm': knn_algorithms,
    'weights': knn_weights
            }

In [None]:
knn_grid = GridSearchCV(KNeighborsClassifier(), knn_param_grid, cv=10, scoring='accuracy')
knn_grid.fit(train_set, train_label)

In [None]:
print('Accuracy: ' + str(knn_grid.best_score_))
print('Parameters: ' + str(knn_grid.best_params_))

In [None]:
knn = KNeighborsClassifier(**knn_grid.best_params_).fit(train_set, train_label)

In [None]:
test_pred_knn = knn.predict(test_set)

In [None]:
report_scores(test_label,test_pred_knn)

In [None]:
# scatter_pred_data also needs the two columns to plot; use the same pair as
# the other scatter plots of the notebook
scatter_pred_data(test_set, test_label, test_pred_knn, 'KNN', 'nmatch', 'best_rank_points')

As we know from the theory, nearest neighbor classifiers can be biased by noise points with oversized feature values, which can mislead the classification task. The solution to this problem is normalization; in the following lines of code a normalized dataset is created using the *MinMaxScaler*

In [None]:
from sklearn.preprocessing import MinMaxScaler
# Fit the scaler on the training split only and reuse the existing train/test
# split: scaling the whole dataset before splitting leaks test statistics
# into training, and a new unseeded split would no longer line up with
# train_label / test_label.
minmax_scaler = MinMaxScaler().fit(train_set)
norm_train_set = pd.DataFrame(minmax_scaler.transform(train_set),
                              columns=train_set.columns, index=train_set.index)
norm_test_set = pd.DataFrame(minmax_scaler.transform(test_set),
                             columns=test_set.columns, index=test_set.index)
norm_train_label, norm_test_label = train_label, test_label

In [None]:
knn_grid = GridSearchCV(KNeighborsClassifier(), knn_param_grid, cv=10, scoring='accuracy')
knn_grid.fit(norm_train_set, norm_train_label)

In [None]:
print('Accuracy: ' + str(knn_grid.best_score_))
print('Parameters: ' + str(knn_grid.best_params_))

In [None]:
knn = KNeighborsClassifier(**knn_grid.best_params_).fit(norm_train_set, norm_train_label)

In [None]:
test_pred_knn = knn.predict(norm_test_set)

In [None]:
# evaluate the KNN predictions against the labels of the normalized split
report_scores(norm_test_label, test_pred_knn)

In [None]:
# confusion matrix of the KNN predictions on the normalized split
plot_confusion_mx(norm_test_label, test_pred_knn)

# Classification with weights

In [None]:
#set weights
weights = {0:1.0, 1:100.0} #0=low, 1 = high
balance = [{0:1,1:100}, {0:1,1:50}, {0:1,1:10}, {0:1,1:1}, 'balanced']

### Decision tree

In [None]:
dt = tree.DecisionTreeClassifier(criterion='gini', splitter='best', 
                                  max_depth=5, class_weight=weights,
                                  min_samples_split=3, min_samples_leaf=4)
dt = dt.fit(train_set, train_label)

#### choice of weights

In [None]:
cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1)
param_grid = dict(class_weight=balance)
grid_search = GridSearchCV(estimator=dt, param_grid=param_grid, n_jobs=-1, cv=cv, scoring='roc_auc')

In [None]:
# select the class weights by cross-validation on the training data only;
# the test set must stay unseen until the final evaluation
grid_result = grid_search.fit(train_set, train_label)

In [None]:
print("Best: %f using %s" % (grid_result.best_score_, grid_result.best_params_))
# report all configurations
means = grid_result.cv_results_['mean_test_score']
stds = grid_result.cv_results_['std_test_score']
params = grid_result.cv_results_['params']
for mean, stdev, param in zip(means, stds, params):
    print("%f (%f) with: %r" % (mean, stdev, param))

In [None]:
dt = tree.DecisionTreeClassifier(criterion='gini', splitter='best', 
                                  max_depth=5, class_weight={0: 1, 1: 50},
                                  min_samples_split=3, min_samples_leaf=4)
dt = dt.fit(train_set, train_label)

In [None]:
dot_data = tree.export_graphviz(dt, out_file=None, 
                         feature_names=list(train_set.columns),  
                         class_names=['low', 'high'],  #[0, 1]
                         filled=True, rounded=True)  
graph = pydotplus.graph_from_dot_data(dot_data)  
Image(graph.create_png())

In [None]:
train_pred_dt = dt.predict(train_set)
test_pred_dt = dt.predict(test_set)

In [None]:
report_scores(train_label, train_pred_dt)

In [None]:
report_scores(test_label, test_pred_dt)

In [None]:
plot_confusion_matrix(dt, test_set, test_label)
plt.show() 

### SVM

In [None]:
svm = SVC(gamma='scale', class_weight=weights)
svm.fit(train_set, train_label)

In [None]:
train_pred = svm.predict(train_set)
report_scores(train_label, train_pred)

In [None]:
test_pred = svm.predict(test_set)

In [None]:
#compute the performance of the model
report_scores(test_label, test_pred)

#### choice of weights

In [None]:
cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1)
param_grid = dict(class_weight=balance)
grid_search = GridSearchCV(estimator=svm, param_grid=param_grid, n_jobs=-1, cv=cv, scoring='roc_auc')

In [None]:
# select the class weights by cross-validation on the training data only;
# the test set must stay unseen until the final evaluation
grid_result = grid_search.fit(train_set, train_label)

In [None]:
print("Best: %f using %s" % (grid_result.best_score_, grid_result.best_params_))
# report all configurations
means = grid_result.cv_results_['mean_test_score']
stds = grid_result.cv_results_['std_test_score']
params = grid_result.cv_results_['params']
for mean, stdev, param in zip(means, stds, params):
    print("%f (%f) with: %r" % (mean, stdev, param))

In [None]:
svm = SVC(gamma='scale', class_weight={0: 1, 1: 10})
svm.fit(train_set, train_label)
svm_models['svm_weighted'] = svm

In [None]:
test_pred = svm.predict(test_set)
report_scores(test_label, test_pred)

In [None]:
train_pred = svm.predict(train_set)
report_scores(train_label, train_pred)

In [None]:
plot_confusion_matrix(svm, test_set, test_label)
plt.show() 

### Rule based

In [None]:
#we run a grid search to find the best configuration of parameters' values
ripper = lw.RIPPER()
param_grid = {"prune_size": [0.5, 0.6], "k": [1, 3, 5], "class_weight": balance}
grid_search = GridSearchCV(estimator=ripper, param_grid=param_grid)
grid_search.fit(train_set, train_label, pos_class=1)

In [None]:
# best_params_ is the winning configuration; cv_results_['params'][0] is only
# the first candidate that was tried
print('Best parameters setting ', grid_search.best_params_)

In [None]:
ripper = lw.RIPPER(k=1, prune_size=0.50)
datas = pd.concat([train_set, train_label], axis=1)
ripper.fit(datas, class_feat='ranked', pos_class=1, class_weight = {0: 1, 1: 100})

In [None]:
#in this case the model is a set of rules
ripper.out_model()

In [None]:
rb_models['rb_weighted'] = ripper

In [None]:
ripper_pred_train = ripper.predict(train_set)
report_scores(train_label, ripper_pred_train)

In [None]:
ripper_pred = ripper.predict(test_set)
report_scores(test_label, ripper_pred)

In [None]:
#evaluation of the performance of the classifier
print('Accuracy ', ripper.score(test_set, test_label))
print('Precision ', ripper.score(test_set, test_label, precision_score))
print('Recall ', ripper.score(test_set, test_label, recall_score))

In [None]:
plot_confusion_matrix(ripper, test_set, test_label)
plt.show() 

In [None]:
ripper_pred_reasons = ripper.predict(test_set, give_reasons=True)

In [None]:
indexes = [i for i,elem in enumerate(ripper_pred_reasons[0]) if elem == True]
rules_used = [ripper_pred_reasons[1][elem] for i,elem in enumerate(indexes)]
rules_used

### Neural Networks

Now let's re-run the neural network classifier using the weighted classes.

#### choice of weights

In [None]:
from sklearn.utils import class_weight
weights_nn = {0: 0.75, 1: 2.8}
class_weights = class_weight.compute_class_weight(class_weight = weights_nn,
                                                 classes = np.unique(train_label),
                                                 y = train_label)
class_weights = dict(enumerate(class_weights))
print(class_weights)

In [None]:
nn_model_w = base_nn_model()
# hand the class weights computed above to the training function explicitly
# (the keyword is spelled as in train_nn_model's signature)
nn_model_w, history = train_nn_model(nn_model_w, norm_train_set, train_label, class_wieghts=class_weights)
nn_models['NN_weighted'] = nn_model_w

Train the model using the original and normalized dataset.

In [None]:
nn_model_w.summary()

In [None]:
plot_nn_training_history(history)

In [None]:
x_train_test = np.reshape(norm_train_set.values, (len(norm_train_set), 1, len(norm_train_set.columns)))
train_pred = (nn_model_w.predict(x_train_test) > 0.5).astype("int32")

In [None]:
report_scores(train_label, train_pred)

In [None]:
# col_count is not defined at notebook level: take the feature count from the data
x_test = np.reshape(norm_test_set.values, (len(norm_test_set), 1, len(norm_test_set.columns)))
test_pred = (nn_model_w.predict(x_test) > 0.5).astype("int32")

In [None]:
report_scores(test_label, test_pred)

Given the trained NN model, let's look at the confusion matrix on the test set

In [None]:
cm=confusion_matrix(test_label,test_pred)
#the fmt parameter avoids the exponential notation of the numbers
sns.heatmap(cm, annot=True,cmap=plt.cm.Blues, fmt='g')
plt.tight_layout()
plt.ylabel('True label')
plt.xlabel('Predicted label')
plt.show()

In [None]:
scatter_pred_data(norm_test_set, test_label, test_pred, 'Neural Network', 'nmatch', 'best_rank_points')

In [None]:
roc_curve_plot(nn_model_w, norm_test_set, test_label, test_pred, 'NN')

# Oversampling with SMOTE

In [None]:
oversample = SMOTE(sampling_strategy=0.3)
training, labels = oversample.fit_resample(train_set, train_label)

**Original Dataset**

In [None]:
print_dataset_composition(train_set, train_label, test_set, test_label)

**Dataset after oversampling**

In [None]:
print_dataset_composition(training, labels, test_set, test_label)

### Decision tree

In [None]:
dt = tree.DecisionTreeClassifier(criterion='gini', splitter='best', 
                                  max_depth=5, 
                                  min_samples_split=3, min_samples_leaf=4)
dt = dt.fit(training, labels)

In [None]:
train_pred = dt.predict(train_set)
report_scores(train_label, train_pred)

In [None]:
test_pred_dt = dt.predict(test_set)

In [None]:
report_scores(test_label, test_pred_dt)

### SVM

In [None]:
svm = SVC(kernel='sigmoid', gamma='scale')
svm.fit(training, labels)
svm_models['svm_ov'] = svm 

In [None]:
train_pred = svm.predict(train_set)
report_scores(train_label, train_pred)

In [None]:
test_pred = svm.predict(test_set)

In [None]:
report_scores(test_label, test_pred)

### Rule based

In [None]:
#we run a grid search to find the best configuration of parameters' values
ripper = lw.RIPPER()
param_grid = {"prune_size": [0.5, 0.6], "k": [1, 3, 5]}
grid_search = GridSearchCV(estimator=ripper, param_grid=param_grid)
grid_search.fit(training, labels, pos_class=1)

In [None]:
# best_params_ is the winning configuration; cv_results_['params'][0] is only
# the first candidate that was tried
print('Best parameters setting ', grid_search.best_params_)

In [None]:
ripper = lw.RIPPER(k=1, prune_size=0.50)
datas = pd.concat([training, labels], axis=1)
ripper.fit(datas, class_feat='ranked', pos_class=1)

In [None]:
#in this case the model is a set of rules
ripper.out_model()

In [None]:
rb_models['rb_ov'] = ripper

In [None]:
ripper_pred_train = ripper.predict(train_set)
report_scores(train_label, ripper_pred_train)

In [None]:
ripper_pred = ripper.predict(test_set)
report_scores(test_label, ripper_pred)

In [None]:
#evaluation of the performance of the classifier
print('Accuracy ', ripper.score(test_set, test_label))
print('Precision ', ripper.score(test_set, test_label, precision_score))
print('Recall ', ripper.score(test_set, test_label, recall_score))

In [None]:
# plot the rule-based predictions (test_pred still holds the SVM ones)
plot_confusion_mx(test_label, ripper_pred)

### Gaussian Naive Bayes

Gaussian Naive Bayes using the oversampled dataset.

**Define the model**

In [None]:
gnb_model = GaussianNB()

**Train the Gaussian Naive Bayes classifier**

In [None]:
gnb_model.fit(training, labels)
gnb_models['GNB_ov'] = gnb_model

In [None]:
test_pred = gnb_model.predict(test_set)
print(classification_report(test_label, test_pred, target_names = ['low','high']))

The performance report reveals the low capacity of the GNB classifier to correctly classify the high-rank players. This is due to the highly imbalanced dataset.

Let's plot the confusion matrix

In [None]:
# this section evaluates the Gaussian Naive Bayes model, not the rule-based one
plot_confusion_matrix(gnb_model, test_set, test_label)
plt.show()

### Neural Networks

In [None]:
norm_ov_train_set = normalize_dataset(training)

In [None]:
nn_model_ov = base_nn_model()
nn_model_ov, history = train_nn_model(nn_model_ov, norm_ov_train_set, labels)
nn_models['NN_smote'] = nn_model_ov

In [None]:
nn_model_ov.summary()

In [None]:
plot_nn_training_history(history)

In [None]:
# col_count is not defined at notebook level: take the feature count from the data
x_test = np.reshape(norm_test_set.values, (len(norm_test_set), 1, len(norm_test_set.columns)))
test_pred = (nn_model_ov.predict(x_test) > 0.5).astype("int32")

In [None]:
report_scores(test_label, test_pred)

In [None]:
cm=confusion_matrix(test_label,test_pred)
#the fmt parameter avoids the exponential notation of the numbers
sns.heatmap(cm, annot=True,cmap=plt.cm.Blues, fmt='g')
plt.tight_layout()
plt.ylabel('True label')
plt.xlabel('Predicted label')
plt.show()

In [None]:
scatter_pred_data(test_set, test_label, test_pred, 'Neural Network', 'nmatch', 'best_rank_points')

In [None]:
roc_curve_plot(nn_model_ov, norm_test_set, norm_test_label, test_pred, 'NN')

# Classifiers Comparison

## Neural Network versions comparison

In [None]:
def compare_nn_models(models_list, classifier_name, test_set, test_label):
    """Plot the test-set confusion matrices of several neural networks.

    Args:
        models_list: dictionary {model_name: trained model}; each model's
            ``predict`` must accept inputs shaped (n_samples, 1, n_features)
            and return one score per sample.
        classifier_name: text used as prefix of the figure title.
        test_set: 2-D table of test features (pandas DataFrame or array).
        test_label: true labels of the test samples.

    Scores above 0.5 are counted as class 1.
    """
    # the input is the same for every model: reshape it once, taking the
    # feature count from the data actually passed in
    features = np.asarray(test_set)
    x_test = features.reshape(len(features), 1, features.shape[1])
    # squeeze=False always returns a 2-D array of axes, so a dictionary with
    # a single model is handled like any other
    fig, axs = plt.subplots(nrows=1, ncols=len(models_list), figsize=(18,6),
                            sharey=True, squeeze=False)
    plt.suptitle(classifier_name + ' | Confusion Matrix comparison')
    for ax, (model_name, model) in zip(axs[0], models_list.items()):
        test_pred = (model.predict(x_test) > 0.5).astype("int32")
        cm = confusion_matrix(test_label, test_pred)
        # fmt='g' avoids the exponential notation in the annotations
        sns.heatmap(cm, ax=ax, annot=True, cmap=plt.cm.Blues, fmt='g')
        ax.set_title(model_name)

In [None]:
compare_nn_models(nn_models, 'Neural Network',norm_test_set, test_label)

In [None]:
compare_models(gnb_models, 'Gaussain Naive Bayes', test_set, test_label)

### SVM versions comparison

In [None]:
compare_models(svm_models, 'SVM', test_set, test_label)

### Rule based versions comparison

In [None]:
compare_models(rb_models, 'Rule based', test_set, test_label)