# Classification task

In [None]:
import math
import numpy as np
import matplotlib.pyplot as plt
import collections
import pydotplus 
import statistics 
import pandas as pd
import os
from datetime import date
from tqdm.notebook import tqdm
from IPython.display import Image  

from scipy.stats.stats import pearsonr
from scipy.spatial.distance import pdist,  squareform
import scipy.stats as stats

from sklearn.decomposition import PCA
from sklearn.metrics import silhouette_score, pairwise_distances, classification_report, confusion_matrix, plot_confusion_matrix # For Model evaluation
from sklearn.neighbors import NearestNeighbors
from sklearn.model_selection import train_test_split, cross_validate
from sklearn import tree, metrics
from sklearn.datasets import make_blobs

import seaborn as sns
import re

In [None]:
# Load the players table; index_col=0 makes the first CSV column the row index.
# NOTE(review): DATASET_DIR is defined here but the read below uses a bare
# relative file name - confirm which location is intended.
DATASET_DIR = f"dataset{os.path.sep}"
df_players = pd.read_csv('players.csv', index_col=0)

In [None]:
# Print a summary of df_players (columns, non-null counts, dtypes).
df_players.info()

In [None]:
# Preview the subset of df_players columns used for the classification task.
df_players[['sex', 'hand', 'best_rank','best_rank_points', 'best_of_3_match', 'best_of_5_match',
           'best_of_3_wins', 'best_of_5_wins', 'w_finals', 'tot_minutes', 'sv1st', 'sv1st_win', 'sv2nd_win', 
           'df', 'ace_perc', 'bpS_perc', 'wmatch', 'lmatch', 'nmatch', 'n_tourney']]

In [None]:
def discretize_data(dataset, variables):
    """Add an integer-coded ``<name>_num`` column for each categorical column.

    For every column name in ``variables`` the distinct values are sorted and
    numbered from 0 in that order; the resulting codes are stored in a new
    column called ``<name>_num``. ``dataset`` is modified in place and is also
    returned.
    """
    for column in variables:
        # Sorting the distinct values makes the value -> code assignment
        # deterministic.
        levels = sorted(dataset[column].unique())
        codes = {level: code for code, level in enumerate(levels)}

        dataset[column + '_num'] = dataset[column].map(codes).astype(int)
    return dataset

In [None]:
# Working table for the classification task: only the columns listed here are kept.
# The explicit .copy() detaches df_p from df_players, so the columns that are
# added to, dropped from and assigned on df_p afterwards modify an independent
# frame instead of a selection of df_players (avoids SettingWithCopyWarning).
df_p = df_players[['sex', 'hand', 'best_rank', 'best_rank_points', 'best_of_3_match', 'best_of_5_match',
                   'best_of_3_wins', 'best_of_5_wins', 'w_finals', 'tot_minutes', 'sv1st', 'sv1st_win', 'sv2nd_win',
                   'df', 'ace_perc', 'bpS_perc', 'wmatch', 'lmatch', 'nmatch', 'n_tourney']].copy()

In [None]:
# Encode the categorical columns as integers: discretize_data adds a
# '<name>_num' column for each entry of `variables`.
variables = ['sex', 'hand']
df_p = discretize_data(df_p, variables)

In [None]:
# Remove the original string columns, leaving only their integer-coded versions.
# `columns=` already selects the column axis, so no axis argument is needed.
df_p.drop(columns=['sex', 'hand'], inplace=True)

In [None]:
# Display the prepared table.
df_p

In [None]:
# Build the target column 'ranked' from best_rank: positive ranks up to the
# threshold are 'high', positive ranks above it are 'low'. Rows whose best_rank
# is not positive receive no label.
threshold = 50
best_rank = df_p['best_rank']
df_p.loc[(best_rank > 0) & (best_rank <= threshold), 'ranked'] = 'high'
df_p.loc[(best_rank > 0) & (best_rank > threshold), 'ranked'] = 'low'

In [None]:
# Labelled data: the rows with best_rank > 0, i.e. those that received a 'ranked' label.
# .copy() detaches the selection from df_p so that the in-place column removals
# applied to df_classif afterwards act on an independent frame and do not
# trigger pandas' SettingWithCopyWarning.
df_classif = df_p[df_p['best_rank'] > 0].copy()

In [None]:
# The 'ranked' label is computed from best_rank, so best_rank is removed from the features.
# `columns=` already selects the column axis, so no axis argument is needed.
df_classif.drop(columns=['best_rank'], inplace=True)

### Decision tree

In [None]:
# Separate the target column from the features (pop removes 'ranked' from
# df_classif), then hold out 40% of the rows as test set, stratified on the label.
# NOTE(review): no random_state is passed, so the split changes between runs.
label = df_classif.pop('ranked')
train_set, test_set, train_label, test_label = train_test_split(
    df_classif,
    label,
    test_size=0.40,
    stratify=label,
)

In [None]:
# Display the training features.
train_set

In [None]:
# Build a depth-limited Gini decision tree and fit it on the training split
# (fit returns the estimator itself, so the calls can be chained).
dt = tree.DecisionTreeClassifier(
    criterion='gini',
    splitter='best',
    max_depth=5,
    min_samples_split=3,
    min_samples_leaf=4,
).fit(train_set, train_label)

In [None]:
# Export the fitted tree to Graphviz DOT text and render it as a PNG in the notebook.
# class_names are given in ascending label order ('high' before 'low').
dot_data = tree.export_graphviz(
    dt,
    out_file=None,
    feature_names=train_set.columns.tolist(),
    class_names=['high', 'low'],
    filled=True,
    rounded=True,
)
graph = pydotplus.graph_from_dot_data(dot_data)
Image(graph.create_png())

In [None]:
# Predicted classes for the training and the test split, in that order.
train_pred_dt, test_pred_dt = map(dt.predict, (train_set, test_set))

#### evaluation

In [None]:
# Class probabilities predicted by the tree for each training row.
dt.predict_proba(train_set)

In [None]:
# Evaluate the accuracy on the train set and the test set, then the weighted
# precision, recall and F1 on the train set.
# The last line prints the whole result of precision_recall_fscore_support
# (called without `average`), not only the support.
print('Accuracy train set ', metrics.accuracy_score(train_label, train_pred_dt))
print('Accuracy test set ', metrics.accuracy_score(test_label, test_pred_dt))
print('Precision train set ', metrics.precision_score(train_label, train_pred_dt, average='weighted'))
print('Recall train set ', metrics.recall_score(train_label, train_pred_dt, average='weighted'))
print('F1 score train set ', metrics.f1_score(train_label, train_pred_dt, average='weighted'))
print('Support train set ', metrics.precision_recall_fscore_support(train_label, train_pred_dt))

In [None]:
# Per-class metrics (precision, recall, F1, support) for a set of predictions
def report_scores(test_label, test_pred):
    """Print scikit-learn's classification report for the two rank classes.

    Parameters
    ----------
    test_label : array-like of str
        True class of each sample ('high' or 'low').
    test_pred : array-like of str
        Predicted class of each sample, aligned with ``test_label``.
    """
    # classification_report pairs target_names positionally with the labels,
    # which default to sorted order ('high' before 'low'). Passing the same
    # list as both `labels` and `target_names` keeps every row of the report
    # attached to the class it actually describes.
    class_names = ['high', 'low']
    print(classification_report(test_label,
                                test_pred,
                                labels=class_names,
                                target_names=class_names))

In [None]:
# For the training set
report_scores(train_label, train_pred_dt)

In [None]:
# For the test set
report_scores(test_label, test_pred_dt)
# Accuracy is a good indicator; it is meaningful when it is higher than the accuracy of
# the majority class. With unbalanced classes, precision and recall also help to
# understand how many errors are made.

In [None]:
### cross validation

In [None]:
# 3-fold cross-validation of the tree on the training split; print the mean
# fit time, score time, and test/train score across the folds.
scores = cross_validate(dt, train_set, train_label, cv=3, return_train_score= True)
print('Fit time ', statistics.mean(scores['fit_time']))
print('Score time ', statistics.mean(scores['score_time']))
print('Test score ', statistics.mean(scores['test_score']))
print('Train score ', statistics.mean(scores['train_score']))

In [None]:
# Compute the confusion matrix of the test-set predictions and display it
cm = confusion_matrix(test_label, test_pred_dt)
cm

In [None]:
# Plot the confusion matrix of the fitted tree on the test set.
# NOTE(review): plot_confusion_matrix was removed in scikit-learn 1.2 in favour of
# ConfusionMatrixDisplay.from_estimator - confirm the installed version.
plot_confusion_matrix(dt, test_set, test_label)
plt.show() 

In [None]:
# Scatter plot of two test-set features (positions 5 and 2 of test_set),
# coloured by the true class: red for 'low', green otherwise.
color = ['green' if rank_class != 'low' else 'red' for rank_class in test_label]
plt.scatter(
    test_set.iloc[:, 5].values,
    test_set.iloc[:, 2].values,
    s=25,
    c=color,
);

#### Excluded rows

In [None]:
# Rows left out of training (best_rank == 0), reduced to the feature columns.
# A single non-inplace drop followed by .copy() yields an independent frame, so the
# 'ranked' column assigned to df_to_rank afterwards does not write into a
# selection of df_p (avoids SettingWithCopyWarning).
df_to_rank = df_p[df_p['best_rank'] == 0].drop(columns=['best_rank', 'ranked']).copy()

In [None]:
# Predict a class for the rows that were left out of training (best_rank == 0).
unknown_pred_dt = dt.predict(df_to_rank)

In [None]:
# Store the predicted class in the 'ranked' column.
df_to_rank['ranked'] = unknown_pred_dt

In [None]:
# Display the previously unlabelled rows together with their predicted class.
df_to_rank

### Rule based

### Naive Bayes

### SVM

### Bagging

### Boosting

### AdaBoost

### Random Forest

### Neural Networks

### KNN