# Lab IS&A
## Bagattin Enrico - Alessandro Doretto

In [None]:
# Import
import pandas as pd
import numpy as np
import glob

# Modules
from utilities import *
from dataPreparation import *

In [None]:
# Load one spreadsheet per season and stack them into a single frame
years = [2017, 2018, 2019, 2020]
yearsForFeatures = [2016, 2017, 2018, 2019, 2020]
paths = ['matches/' + str(season) + '.xlsx' for season in years]
availablePaths = list(glob.glob("matches/20*.xlsx"))
matches = list(map(pd.read_excel, paths))
# The season preceding the first loaded one is read into its own frame
yearZeroForFeatures = pd.read_excel('matches/' + str(years[0]-1) + '.xlsx')
# TODO: Load matches based on the number of past years chosen
df = pd.concat(matches, ignore_index=True, sort=False)
df.info()

In [None]:
# Preview the first rows, transposed so that each column is displayed as a row
df.head().T

In [None]:
# Summary statistics for every column (include='all' covers non-numeric ones too),
# transposed; percentiles=[] asks for no additional quantile rows
df.describe(include='all', percentiles=[]).T

# Cleaning and preparing data

## Remove Winner/Loser reference
All columns with a Winner/Loser reference will be replaced by Player0/Player1 columns.

In [None]:
# Run the project helper on both the main frame and the previous-season frame so
# they share the same column naming (per the section text: Winner/Loser -> Player0/Player1)
df = removeWinnerLoserReference(df)
yearZeroForFeatures = removeWinnerLoserReference(yearZeroForFeatures)

## Filling null:
* Rank: take the max rank plus one
* Pts: set default zero	
* Avg odd: take mode of matches with same (or similar) players rank
* B365, PS: fill with avg

In [None]:
# Missing ranks get a value one worse than the worst rank observed; missing points become 0
rankDefault = max(df[col].max() for col in ('Rank0', 'Rank1')) + 1
fillValues = {'Rank0': rankDefault, 'Rank1': rankDefault, 'Pts0': 0, 'Pts1': 0}
df.fillna(fillValues, inplace=True)

# Rows with at least one missing odd among the six odds columns
oddsColumns = ['B3650', 'B3651', 'PS0', 'PS1', 'Avg0', 'Avg1']
nullOddsDf = df[df[oddsColumns].isna().any(axis=1)]

# Bookmaker column -> average-odd column used as its replacement when missing
fallbackFor = (('B3650', 'Avg0'), ('B3651', 'Avg1'), ('PS0', 'Avg0'), ('PS1', 'Avg1'))

for index, row in nullOddsDf.iterrows():
    if any(pd.isnull(row[col]) for col in ('Avg0', 'Avg1')):
        # The candidate set is rebuilt on every pass, so it also contains
        # the averages written by earlier iterations of this loop
        candidates = df.dropna(subset=['Avg0', 'Avg1'])
        foundAvg0, foundAvg1 = findOddsForRow(row, candidates)
        df.at[index, 'Avg0'] = foundAvg0
        df.at[index, 'Avg1'] = foundAvg1
        # Keep the local row in sync: the fallbacks below read from it
        row['Avg0'] = foundAvg0
        row['Avg1'] = foundAvg1
    for bookmakerCol, avgCol in fallbackFor:
        if pd.isnull(row[bookmakerCol]):
            df.at[index, bookmakerCol] = row[avgCol]

df.dropna(subset=['Avg0', 'Avg1'], inplace=True) # Drop rows that have no similar-rank matches
df.info()

## Handle Round ????????

In [None]:
# X['Round'].value_counts()

In [None]:
# you might change this according to a notion of weight
# X['Round'] = X['Round'].map ({  '1st Round'    : 1, 
#                                 '2nd Round'    : 2, 
#                                 '3rd Round'    : 4,
#                                 '4th Round'    : 8,
#                                 'Quarterfinals': 16,
#                                 'Round Robin'  : 32,
#                                 'Semifinals'   : 32,
#                                 'The Final'    : 64})

## New features
* [Elo rating](https://en.wikipedia.org/wiki/Elo_rating_system): a method for calculating the relative skill levels of players in zero-sum games
* Number of matches played during the last year
* Percentage of matches won during the last year
* Injuries: number of matches in which the player retired or gave a walkover in the past year
* Winning streak: current sequence of won games

In [None]:
# Build the feature frame X from df with the project helper
# (per the section text, this adds the Elo rating feature)
X = addEloRatingFeature(df)

In [None]:
# Extend X with the project helper, giving it the previous-season frame and the list of seasons
# (per the section text: matches played / percentage won during the last year)
X = addMatchesPlayedAndWonFeatures(X, yearZeroForFeatures, yearsForFeatures)

In [None]:
# Extend X with the project helper, giving it the previous-season frame and the list of seasons
# (per the section text: injuries in the past year and current winning streak)
X = addInjuriesAndWinningStreakFeatures(X, yearZeroForFeatures, yearsForFeatures)

In [None]:
# Checkpoint: write the feature frame to disk (without the index column)
X.to_csv('generated/beforeDuplication.csv', index=False)

## Row duplication
To use both match outcomes for our prediction models we will duplicate each row. We can do it by switching all the player features for each duplicated row and adding a Winner column for the match result

In [None]:
# Display the column labels of X
X.columns

In [None]:
# Build the mirrored copy of every match by exchanging the trailing 0/1 of the
# per-player columns (Player0 <-> Player1, Rank0 <-> Rank1, B3650 <-> B3651, ...).
# The swapped labels are derived from X.columns instead of a hand-written list,
# so the swap stays correct if feature columns are added, removed or reordered.
swappedSuffix = {'0': '1', '1': '0'}
duplication = X.copy()
duplication.columns = [
    name[:-1] + swappedSuffix[name[-1]] if name.endswith(('0', '1')) else name
    for name in X.columns
]

# Add the winner column
X = X.assign(Winner=np.zeros(len(X))) # Player 0 always win
duplication = duplication.assign(Winner=np.ones(len(duplication))) # Player 1 always win

# concat aligns on column labels, so the relabelled copy lands in the right columns
X = pd.concat([X, duplication])
X.reset_index(inplace=True)
# A stable sort keeps each original row (Winner=0) directly ahead of its
# mirror (Winner=1); the default quicksort leaves that order arbitrary
X.sort_values(by='index', kind='mergesort', inplace=True)
X.drop(columns=['Date', 'Comment', 'index'], inplace=True)

## One hot encoding
* Location
* Tournament
* Series
* Court
* Surface
* Round
* Players

In [None]:
# Replace the categorical columns with indicator (dummy) columns
X = pd.get_dummies(X)
columnCount = X.shape[1]
print('Total number of columns:', columnCount)

# Persist the fully encoded dataset (no index column)
X.to_csv('generated/finalDataset.csv', index=False)


## Dataset subdivision: Train, Validation, Test

Train 60%, Validation 20%, Test 20% (taking as test the last part of the dataset)

In [None]:
import pandas as pd

# Reload the encoded dataset written by the one-hot encoding step
# (presumably so the modelling cells can run without redoing the preparation)
X = pd.read_csv('generated/finalDataset.csv')

# Preview the first rows
X.head()

In [None]:
from sklearn.model_selection import train_test_split

# Separate the target from the features
y = X.Winner.values
X.drop(columns='Winner', inplace=True)

# Hold out the last fifth of the rows as the test set. Slicing from an explicit
# start position (rather than with -test_size) stays correct when test_size is 0:
# X[-0:] would select every row and X[:-0] none.
test_size = len(X)//5
split_at = len(X) - test_size
X_test     = X.iloc[split_at:]
y_test     = y[split_at:]
X_train_80 = X.iloc[:split_at]
y_train_80 = y[:split_at]

# Random split for training and validation (25% of the remaining 80% = 20% overall)
# NOTE(review): rows come in mirrored pairs from the duplication step, so this
# random split (and an odd split_at) can put the two mirrors of one match on
# different sides — confirm this is acceptable.
X_train, X_valid, y_train, y_valid = train_test_split(X_train_80, y_train_80, test_size=0.25, random_state=42)

# Prediction models
We start by measuring how powerful the bookmakers' algorithms are, then we create and tune our own models and compare the results.
## Baseline
Our first goal is to beat the average bookmaker accuracy

In [None]:
from sklearn.metrics import accuracy_score

# The baseline follows the bookmakers: predict 1 (Player1 wins) when
# Player1's average odd is strictly lower than Player0's, otherwise 0
player1Favoured = X_test['Avg1'] < X_test['Avg0']
baseline = player1Favoured.astype(int)
baseline_test_acc = accuracy_score(y_true=y_test, y_pred=baseline)
print ("Test Accuracy: {:.2f}".format(baseline_test_acc*100), "%")

## k-Nearest-Neighbor Classifier

In [None]:
from sklearn.preprocessing import StandardScaler

# The scaler is fitted on the training split only and then applied,
# unchanged, to every other partition
scaler = StandardScaler().fit(X_train)
scaled_train, scaled_valid, scaled_train_80, scaled_test = (
    scaler.transform(part) for part in (X_train, X_valid, X_train_80, X_test)
)

In [None]:
# Warning: kNN is very slow with this dataset

from sklearn import neighbors
from sklearn.metrics import accuracy_score

accuracies = []

# Try every neighbourhood size from 1 to 15 and record its validation accuracy
for k in range(1, 16):
    kNN = neighbors.KNeighborsClassifier(n_neighbors=k).fit(scaled_train, y_train)
    y_pred = kNN.predict(scaled_valid)
    valid_acc = accuracy_score(y_true=y_valid, y_pred=y_pred)
    print ("k: {:2d} | Validation Accuracy: {:.3f}".format(k, valid_acc))
    accuracies.append([valid_acc, k])

In [None]:
# Hard-coded [validation accuracy, k] pairs for k = 1..15, presumably copied from a
# previous run of the kNN search so that the slow search cell need not be re-executed

accuracies =  [[0.5272562083585706, 1],
                [0.5399757722592369, 2],
                [0.5590551181102362, 3],
                [0.5632949727437916, 4],
                [0.5793458509993943, 5],
                [0.5838885523924894, 6],
                [0.5990308903694731, 7],
                [0.5896426408237432, 8],
                [0.6005451241671714, 9],
                [0.595396729254997, 10],
                [0.6053906723198061, 11],
                [0.5993337371290127, 12],
                [0.6035735917625682, 13],
                [0.5999394306480921, 14],
                [0.6078134463961236, 15]]

In [None]:
# Lists compare element by element, so max() returns the pair with the
# highest accuracy (on a tie, the one with the larger k)
best_accuracy, best_k = max(accuracies)
print ( "Best K", best_k )

# Refit on training + validation together to exploit the most data
kNN = neighbors.KNeighborsClassifier(n_neighbors=best_k)
kNN.fit(scaled_train_80, y_train_80)

# Finally evaluate on test
test_pred = kNN.predict(scaled_test)
test_acc = accuracy_score(y_true=y_test, y_pred=test_pred)
print ("Test Accuracy: {:.3f}".format(test_acc))

## Naive Bayes

In [None]:
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import accuracy_score

# Fit a Gaussian Naive Bayes model on the (unscaled) training split
gnb = GaussianNB().fit(X_train, y_train)

# Accuracy on the training and validation splits
train_pred = gnb.predict(X_train)
valid_pred = gnb.predict(X_valid)
train_acc = accuracy_score(y_true=y_train, y_pred=train_pred)
valid_acc = accuracy_score(y_true=y_valid, y_pred=valid_pred)
print ("Train Accuracy: {:.3f} - Validation Accuracy: {:.3f}".format(train_acc, valid_acc))

# Refit on training + validation together before the final evaluation
gnb.fit(X_train_80,y_train_80)

# Finally evaluate on test
test_pred = gnb.predict(X_test)
test_acc = accuracy_score(y_true=y_test, y_pred=test_pred)
print ("Test Accuracy: {:.3f}".format(test_acc))

## Decision Tree

In [None]:
from sklearn import tree
from sklearn.metrics import accuracy_score

accuracies = []

# Search over the leaf budget: 5, 10, ..., 100
for max_leaves in range(5, 101, 5):
    # Fit a tree limited to max_leaves leaves
    dt = tree.DecisionTreeClassifier(max_leaf_nodes=max_leaves).fit(X_train, y_train)

    # Accuracy on the training and validation splits
    train_pred = dt.predict(X_train)
    valid_pred = dt.predict(X_valid)
    train_acc = accuracy_score(y_true=y_train, y_pred=train_pred)
    valid_acc = accuracy_score(y_true=y_valid, y_pred=valid_pred)
    print ("Leaves: {:2d} - Train Accuracy: {:.3f} - Validation Accuracy: {:.3f}".format(
        max_leaves,  train_acc, valid_acc) )

    accuracies.append([valid_acc, max_leaves])

# Highest validation accuracy wins (on a tie, the larger leaf budget)
best_accuracy, best_max_leaves = max(accuracies)
print ( "Best Max Leaves", best_max_leaves )

# Refit on training + validation together to exploit the most data
dt = tree.DecisionTreeClassifier(max_leaf_nodes=best_max_leaves)
dt.fit(X_train_80,y_train_80)

# Finally evaluate on test
test_pred = dt.predict(X_test)
test_acc = accuracy_score(y_true=y_test, y_pred=test_pred)
print ("Test Accuracy: {:.3f}".format(test_acc))

## Random Forest

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

accuracies = []

# Search over the number of trees in the forest: 1..50
for estimators in range(1, 51):
    # train a random forest classifier with this many trees
    rf = RandomForestClassifier(n_estimators=estimators)
    rf.fit(X_train, y_train)

    # compute Accuracy
    train_acc = accuracy_score(y_true=y_train, y_pred=rf.predict(X_train))
    valid_acc = accuracy_score(y_true=y_valid, y_pred=rf.predict(X_valid))
    print ("Estimators: {:2d} - Train Accuracy: {:.3f} - Validation Accuracy: {:.3f}".format(
        estimators,  train_acc, valid_acc) )

    accuracies += [ [valid_acc, estimators] ]

# Highest validation accuracy wins (on a tie, the larger number of trees)
best_accuracy, best_estimators = max(accuracies)
print ( "Best Estimators", best_estimators )

# here we are using both training and validation,
# to exploit the most data.
# The final model must be a random forest with the selected number of trees;
# the previous code fitted a single decision tree with max_leaf_nodes set to
# that number, so the reported test accuracy was not the forest's.
rf = RandomForestClassifier(n_estimators=best_estimators)
rf.fit(X_train_80,y_train_80)

# Finally evaluate on test
test_acc = accuracy_score(y_true=y_test, y_pred=rf.predict(X_test))
print ("Test Accuracy: {:.3f}".format(test_acc))

In [None]:
from sklearn.ensemble import AdaBoostRegressor