# Australian Open 2020: Winner Predictor
## Web Intelligence Course, Ca' Foscari University, A.Y. 2019/2020
#### Diego Chinellato, 867637 - Giorgia Campardo, 867928

In [187]:
%matplotlib notebook

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import accuracy_score, classification_report
from time import time

In [94]:
data_types = {
    'ATP': int,
    'Location': object,
    'Tournament': object,
    'Series': object,
    'Court': object,
    'Surface': object,
    'Round': object,
    'Best of': int,
    'Winner': object,
    'Loser': object,
    'WRank': float,
    'LRank': float,
    'WPts': float,
    'LPts': float,
    'W1': float,
    'L1': float,
    'W2': float,
    'L2': float,
    'W3': float,
    'L3': float,
    'W4': float,
    'L4': float,
    'W5': float,
    'L5': float,
    'Wsets': float,
    'Lsets': float,
    'Comment': object,
    'B365W': float,
    'B365L': float,
    'EXW': object,
    'EXL': float,
    'LBW': float,
    'LBL': float,
    'PSW': float,
    'PSL': float,
    'SJW': float,
    'SJL': float,
    'MaxW': float,
    'MaxL': float,
    'AvgW': float,
    'AvgL': float,
    'WElo': float,
    'WSurfElo': float,
    'WHand': object,
    'WBHand': float,
    'LElo': float,
    'LSurfElo': float,
    'LHand': object,
    'LBHand': float
}
x = pd.read_csv('data/dataset.csv', 
                encoding='utf-8-sig', 
                dtype=data_types,
                parse_dates=['Date', 'WBD', 'LBD'])

In [95]:
print(x['WHand'].unique())
print(x['LHand'].unique())
print(x['WBHand'].unique())
print(x['LBHand'].unique())

['L' 'R' nan]
[nan 'R' 'L']
[ 1.  2. nan]
[nan  2.  1.]


In [78]:
def compute_elo_rankings(data):
    """
    Given the list on matches in chronological order, for each match, computes 
    the elo ranking of the 2 players at the beginning of the match
    """
    print("Elo rankings computing...")
    players=list(pd.Series(list(data.Winner)+list(data.Loser)).value_counts().index)
    elo=pd.Series(np.ones(len(players))*1500,index=players)
    ranking_elo=[(1500,1500)]
    for i in range(1,len(data)):
        w=data.iloc[i-1,:].Winner
        l=data.iloc[i-1,:].Loser
        elow=elo[w]
        elol=elo[l]
        pwin=1 / (1 + 10 ** ((elol - elow) / 400))    
        K_win=32
        K_los=32
        new_elow=elow+K_win*(1-pwin)
        new_elol=elol-K_los*(1-pwin)
        elo[w]=new_elow
        elo[l]=new_elol
        ranking_elo.append((elo[data.iloc[i,:].Winner],elo[data.iloc[i,:].Loser])) 
    ranking_elo=pd.DataFrame(ranking_elo,columns=["elo_winner","elo_loser"])    
    ranking_elo["proba_elo"]=1 / (1 + 10 ** ((ranking_elo["elo_loser"] - ranking_elo["elo_winner"]) / 400))   
    return ranking_elo

In [162]:
def preprocess_data(max_date=2014,
                    features_to_drop=[], 
                    missing_values="drop", 
                    drop_first=False):
    """
    Processes raw data and returns a tuple (X, Y) where X is the cleaned dataset and Y is the array of labels.
    """
    if max_date > 2019 or max_date < 2011:
        raise ValueError("Wrong date parameter")
    df = pd.read_csv("data/" + str(max_date) + ".csv", encoding='utf-8-sig')
    for year in range (max_date + 1, 2020):
        filename = "data/" + str(year) + ".csv"
        df = pd.concat((df, pd.read_csv(filename, encoding='utf-8-sig', dtype=data_types)))
    
    # Sort by date to calculate ELO
    X = df.sort_values(by='Date')
    
    # Drop unuseful columns
    features_to_drop += ['ATP', 'Location', 'Tournament', 'Date', 'Comment', 
                         'Winner', 'Loser','WPts', 'LPts', 'Wsets', 'Lsets', 
                         'W1', 'L1', 'W2', 'L2', 'W3', 'L3', 'W4', 'L4', 'W5', 'L5', 
                         'B365W', 'B365L', 'EXW', 'EXL', 'LBW', 'LBL', 'PSW', 'PSL', 'SJW', 'SJL',
                         'WBD', 'LBD']
    X = X.drop(columns=features_to_drop)
    
    # Deal with missing values
    X['WRank'] = X['WRank'].fillna(value=X['WRank'].max()+100).astype(int)
    X['LRank'] = X['LRank'].fillna(value=X['LRank'].max()+100).astype(int)

    if missing_values == 'drop':
        X = X.dropna()
    elif missing_values == 'custom':
        pass
    else:
        raise ValueError('Wrong parameter: missing_values')

    # Convert ordinal features to int (higher value means more important)
    series = ['ATP250', 'ATP500', 'Masters 1000', 'Masters Cup', 'Grand Slam']
    series2int = {s: i for i, s in enumerate(series)}
    rounds2int = {'1st Round': 0,
                  '2nd Round': 1,
                  '3rd Round': 2,
                  '4th Round': 3,
                  'Round Robin': 4,
                  'Quarterfinals': 5,
                  'Semifinals': 6,
                  'The Final': 7,
                 }
    X = X.replace({'Round': rounds2int, 'Series': series2int})
    
    # Convert categorical (binary) fields to int
    X = X.replace({'Court': {'Outdoor': 0, 'Indoor': 1}, 
                   'WHand': {'R': 0, 'L': 1}, 
                   'LHand': {'R': 0, 'L': 1}})
    X.astype({'WBHand': int, 'LBHand': int})
    
    # One hot encode categorical features into binary features
    X = pd.get_dummies(X, prefix=['Surface_'], columns=['Surface'], drop_first=drop_first)
    
    # Rename columns
    X = X.rename(columns={'WRank':'P1Rank', 'LRank':'P2Rank', 
                          'MaxW':'MaxP1', 'MaxL':'MaxP2', 
                          'AvgW':'AvgP1', 'AvgL':'AvgP2'})
    
    # Generate labels
    Y = np.concatenate([np.ones(X.shape[0], dtype=int), np.zeros(X.shape[0], dtype=int)])
    # Swap columns and concatenate to data
    tmp = X.copy()
    cols_to_swap = ['P1Rank', 'P2Rank', 'MaxP1', 'MaxP2',  'AvgP1',  'AvgP2',
                    'WElo', 'LElo', 'WSurfElo', 'LSurfElo', 'WHand', 'LHand', 'WBHand', 'LBHand']
    cols_swapped = ['P2Rank', 'P1Rank', 'MaxP2', 'MaxP1',  'AvgP2',  'AvgP1', 
                    'LElo', 'WElo', 'LSurfElo', 'WSurfElo', 'LHand', 'WHand', 'LBHand', 'WBHand']

    tmp[cols_to_swap] = tmp[cols_swapped]
    tmp.index = np.array(range(X.shape[0] + 1, X.shape[0] * 2 + 1))
    X = pd.concat((X, tmp))
    
    return X, Y

In [163]:
X, Y = preprocess_data()
X.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 16902 entries, 0 to 16902
Data columns (total 21 columns):
Series            16902 non-null int64
Court             16902 non-null int64
Round             16902 non-null int64
Best of           16902 non-null int64
P1Rank            16902 non-null int32
P2Rank            16902 non-null int32
MaxP1             16902 non-null float64
MaxP2             16902 non-null float64
AvgP1             16902 non-null float64
AvgP2             16902 non-null float64
WElo              16902 non-null float64
WSurfElo          16902 non-null float64
WHand             16902 non-null int64
WBHand            16902 non-null float64
LElo              16902 non-null float64
LSurfElo          16902 non-null float64
LHand             16902 non-null int64
LBHand            16902 non-null float64
Surface__Clay     16902 non-null uint8
Surface__Grass    16902 non-null uint8
Surface__Hard     16902 non-null uint8
dtypes: float64(10), int32(2), int64(6), uint8(3)
me

In [164]:
X.describe()

Unnamed: 0,Series,Court,Round,Best of,P1Rank,P2Rank,MaxP1,MaxP2,AvgP1,AvgP2,...,WSurfElo,WHand,WBHand,LElo,LSurfElo,LHand,LBHand,Surface__Clay,Surface__Grass,Surface__Hard
count,16902.0,16902.0,16902.0,16902.0,16902.0,16902.0,16902.0,16902.0,16902.0,16902.0,...,16902.0,16902.0,16902.0,16902.0,16902.0,16902.0,16902.0,16902.0,16902.0,16902.0
mean,1.480062,0.167317,1.62679,3.410839,63.712519,63.712519,8.126537,8.126537,2.673033,2.673033,...,1722.765158,0.148207,1.791208,1773.112141,1722.765158,0.148207,1.791208,0.287895,0.12957,0.582535
std,1.504953,0.37327,2.122314,0.80804,85.755495,85.755495,460.776937,460.776937,2.468632,2.468632,...,162.677812,0.355316,0.406457,169.911279,162.677812,0.355316,0.406457,0.452795,0.33584,0.493156
min,0.0,0.0,0.0,3.0,1.0,1.0,1.01,1.01,1.01,1.01,...,1222.9,0.0,1.0,1273.3,1222.9,0.0,1.0,0.0,0.0,0.0
25%,0.0,0.0,0.0,3.0,21.0,21.0,1.47,1.47,1.41,1.41,...,1615.2,0.0,2.0,1662.8,1615.2,0.0,2.0,0.0,0.0,0.0
50%,1.0,0.0,1.0,3.0,46.0,46.0,2.0,2.0,1.89,1.89,...,1724.7,0.0,2.0,1782.0,1724.7,0.0,2.0,0.0,0.0,1.0
75%,2.0,0.0,2.0,3.0,82.0,82.0,3.14,3.14,2.87,2.87,...,1804.6,0.0,2.0,1850.6,1804.6,0.0,2.0,1.0,0.0,1.0
max,4.0,1.0,7.0,5.0,2259.0,2259.0,42586.0,42586.0,32.55,32.55,...,2163.5,1.0,2.0,2203.4,2163.5,1.0,2.0,1.0,1.0,1.0


Models construction

In [189]:
def timeit(fun):
    def timed(*args, **kwargs):
        start = time()
        result = fun(*args, **kwargs)
        end = time()
        print('Execution took {:.2f} min'.format((end-start)/60))
        return result
    return timed

In [172]:
def baseline_model(X, Y):
    # This model will always predict the winner as the player with the highest rank.
    # It's the lower bound on accuracy that we wish to improve
    y_pred = (X['P1Rank'] > X['P2Rank']).astype(int)
    accuracy = round((y_pred == Y).sum()/len(Y), 2)
    return accuracy
    
print('Accuracy for the baseline model is:', baseline_model(X, Y))

Accuracy for the baseline model is: 0.34


In [191]:
@timeit
def build_decision_tree(X_train, Y_train, X_valid, Y_valid):
    from sklearn.tree import DecisionTreeClassifier
    # Builds a decision tree and performs automatic hyper-parameters tuning
    scores = []
    for criterion in ('gini', 'entropy'):
        for depth in range(5, 50, 5):
            for leaves in range(10, 201, 30):
                dt = DecisionTreeClassifier(max_leaf_nodes=leaves,
                                            criterion=criterion,
                                            max_depth=depth)
                dt.fit(X_train, Y_train)
                valid_acc = round(accuracy_score(y_true=Y_valid, y_pred=dt.predict(X_valid)), 3)
                scores += [(valid_acc, criterion, depth, leaves)]
    best = max(scores)
    acc, criterion, depth, leaves = best
    print('Max accuracy on validation set:', acc)
    print('Criterion:', criterion)
    print('Max depth:', depth)
    print('Max leaves:', leaves)
    dt = DecisionTreeClassifier(max_leaf_nodes=leaves,
                                 criterion=criterion,
                                 max_depth=depth)
    dt.fit(pd.concat([X_train, X_valid]), np.concatenate([Y_train, Y_valid]))
    return dt, best

In [174]:
@timeit
def build_bagging_classifier(X_train, Y_train, X_valid, Y_valid):
    from sklearn.tree import DecisionTreeClassifier
    from sklearn.ensemble import BaggingClassifier
    scores = []
    for bootstrap in (True, False):
        for n_est in range(10, 201, 20):
            for max_samples in (0.25, 0.50, 0.75, 1.0):
                for criterion in ('gini', 'entropy'):
                    dt = DecisionTreeClassifier(criterion=criterion)
                    bagged_dt = BaggingClassifier(dt, bootstrap=bootstrap,
                                                  n_estimators=n_est,
                                                  max_samples=max_samples,
                                                  n_jobs=-1)
                    bagged_dt.fit(X_train, Y_train)
                    valid_acc = round(accuracy_score(y_true=Y_valid, y_pred=bagged_dt.predict(X_valid)), 3)
                    scores += [(valid_acc, bootstrap, n_est, max_samples, criterion)]
    best = max(scores)
    acc, bootsrap, n_est, max_samples, criterion = best
    print('Max accuracy on validation set:', acc)
    print('Boostrap:', bootsrap)
    print('N. estimators:', n_est)
    print('Max samples:', max_samples)
    print('Tree criterion:', criterion)
    bagged_dt = BaggingClassifier(dt, 
                                  bootstrap=bootstrap,
                                  n_estimators=n_est, 
                                  max_samples=max_samples)
    bagged_dt.fit(pd.concat([X_train, X_valid]), np.concatenate([Y_train, Y_valid]))
    return bagged_dt, best

In [175]:
@timeit
def build_adaboost(X_train, Y_train, X_valid, Y_valid):
    from sklearn.tree import DecisionTreeClassifier
    from sklearn.ensemble import AdaBoostClassifier
    scores = []
    for n_est in range(50, 301, 50):
        for learning_rate in (0.50, 0.75, 1.0, 1.5):
            for criterion in ('gini', 'entropy'):
                for depth in range(5, 21, 10):
                    for leaves in range(5, 100, 25):
                        dt = DecisionTreeClassifier(max_leaf_nodes=leaves,
                                                    criterion=criterion,
                                                    max_depth=depth)
                        boosted_dt = AdaBoostClassifier(dt,
                                                        n_estimators=n_est,
                                                        learning_rate=learning_rate)
                        boosted_dt.fit(X_train, Y_train)
                        valid_acc = round(accuracy_score(y_true=Y_valid, y_pred=bagged_dt.predict(X_valid)), 3)
                        scores += [(valid_acc, n_est, learning_rate, leaves, criterion, depth)]
    best = max(scores)
    acc, n_est, learning_rate, leaves, criterion, depth = best
    print('Max accuracy on validation set:', acc)
    print('N. estimators:', n_est)
    print('Learning rate:', learning_rate)
    print('Tree max leaves:', leaves)
    print('Tree max depth:', depth)
    print('Tree criterion:', criterion)
    dt = DecisionTreeClassifier(max_leaf_nodes=leaves,
                                criterion=criterion,
                                max_depth=depth)
    boosted_dt = AdaBoostClassifier(dt,
                                    n_estimators=n_est,
                                    learning_rate=learning_rate)
    boosted_dt.fit(pd.concat([X_train, X_valid]), np.concatenate([Y_train, Y_valid]))
    return boosted_dt, best

In [176]:
@timeit
def build_random_forest(X_train, Y_train, X_valid, Y_valid):
    from sklearn.ensemble import RandomForestClassifier
    scores = []
    for n_est in range(50, 501, 50):
        for criterion in ('gini', 'entropy'):
            for bootstrap in (True, False):
                for n_features in (None, 'sqrt', 'log2'):
                    rf = RandomForestClassifier(n_estimators=n_est,
                                                bootstrap=bootstrap,
                                                criterion=criterion,
                                                max_features=n_features,
                                                n_jobs=-1)
                    rf.fit(X_train, Y_train)
                    valid_acc = round(accuracy_score(y_true=Y_valid, y_pred=rf.predict(X_valid)), 3)
                    scores += [(valid_acc, n_est, criterion, bootstrap, n_features)]
    best = max(scores)
    acc, n_est, criterion, bootstrap, features = best
    print('Max accuracy on validation set:', acc)
    print('N. estimators:', n_est)
    print('Criterion:', criterion)
    print('Bootstrap:', bootstrap)
    print('Features criterion (None means all features):', features)
    rf = RandomForestClassifier(n_estimators=n_est,
                               bootstrap=bootstrap,
                               criterion=criterion,
                               n_jobs=-1)
    rf.fit(pd.concat([X_train, X_valid]), np.concatenate([Y_train, Y_valid]))
    return rf, best

Let's split the data into train set, validation set and test set

In [177]:
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.20)
X_train, X_valid, Y_train, Y_valid = train_test_split(X_train, Y_train, test_size=0.25)

In [192]:
dt, dt_params = build_decision_tree(X_train, Y_train, X_valid, Y_valid)

Max accuracy on validation set: 0.695
Criterion: entropy
Max depth: 5
Max leaves: 190
Execution took 0.08 min


In [179]:
bagged_dt, bagged_params = build_bagging_classifier(X_train, Y_train, X_valid, Y_valid)

Max accuracy on validation set: 0.699
Boostrap: True
N. estimators: 130
Max samples: 0.25
Tree criterion: gini


In [180]:
boosted_dt, boosted_params = build_adaboost(X_train, Y_train, X_valid, Y_valid)

Max accuracy on validation set: 0.867
N. estimators: 300
Learning rate: 1.5
Tree max leaves: 80
Tree max depth: 15
Tree criterion: gini


In [181]:
rf, rf_params = build_random_forest(X_train, Y_train, X_valid, Y_valid)

Max accuracy on validation set: 0.695
N. estimators: 400
Criterion: entropy
Bootstrap: True
Features criterion (None means all features): sqrt


In [182]:
def report(X, Y, models):
    for model in models:
        print('Algorithm:', str(type(model)).split('.')[-1][:-2])
        rep = classification_report(y_true=Y, y_pred=model.predict(X))
        print(rep)
        print()

In [183]:
report(X_test, Y_test, [dt, bagged_dt, boosted_dt, rf])

Algorithm: DecisionTreeClassifier
              precision    recall  f1-score   support

           0       0.71      0.63      0.66      1716
           1       0.66      0.73      0.69      1665

    accuracy                           0.68      3381
   macro avg       0.68      0.68      0.68      3381
weighted avg       0.68      0.68      0.68      3381


Algorithm: BaggingClassifier
              precision    recall  f1-score   support

           0       0.68      0.69      0.69      1716
           1       0.68      0.67      0.68      1665

    accuracy                           0.68      3381
   macro avg       0.68      0.68      0.68      3381
weighted avg       0.68      0.68      0.68      3381


Algorithm: AdaBoostClassifier
              precision    recall  f1-score   support

           0       0.62      0.62      0.62      1716
           1       0.60      0.60      0.60      1665

    accuracy                           0.61      3381
   macro avg       0.61      0.61