# Open 2020: Winner Predictor
## Diego Chinellato - 867637
## Giorgia Campardo - 867928
### Web Intelligence Course

In [1]:
%matplotlib notebook

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import accuracy_score
from sklearn import tree

In [2]:
df = pd.read_excel('2019.xlsx')
df_ext = pd.read_excel('2019+.xlsx')

In [3]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2234 entries, 0 to 2233
Data columns (total 36 columns):
ATP           2234 non-null int64
Location      2234 non-null object
Tournament    2234 non-null object
Date          2234 non-null datetime64[ns]
Series        2234 non-null object
Court         2234 non-null object
Surface       2234 non-null object
Round         2234 non-null object
Best of       2234 non-null int64
Winner        2234 non-null object
Loser         2234 non-null object
WRank         2230 non-null float64
LRank         2221 non-null float64
WPts          2231 non-null float64
LPts          2221 non-null float64
W1            2215 non-null float64
L1            2215 non-null float64
W2            2205 non-null float64
L2            2205 non-null float64
W3            1119 non-null float64
L3            1119 non-null float64
W4            265 non-null float64
L4            265 non-null float64
W5            96 non-null float64
L5            96 non-null float64
Wset

In [4]:
df_ext.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 23634 entries, 0 to 23633
Data columns (total 42 columns):
ATP           23634 non-null int64
Location      23634 non-null object
Tournament    23634 non-null object
Date          23634 non-null datetime64[ns]
Series        23634 non-null object
Court         23634 non-null object
Surface       23634 non-null object
Round         23634 non-null object
Best of       23634 non-null int64
Winner        23634 non-null object
Loser         23634 non-null object
WRank         23624 non-null float64
LRank         23586 non-null float64
WPts          23626 non-null float64
LPts          23587 non-null float64
W1            23483 non-null float64
L1            23485 non-null float64
W2            23260 non-null float64
L2            23260 non-null float64
W3            11173 non-null float64
L3            11173 non-null float64
W4            2200 non-null float64
L4            2200 non-null float64
W5            816 non-null float64
L5          

In [6]:
df.describe()

Unnamed: 0,ATP,Best of,WRank,LRank,WPts,LPts,W1,L1,W2,L2,...,Wsets,Lsets,B365W,B365L,PSW,PSL,MaxW,MaxL,AvgW,AvgL
count,2234.0,2234.0,2230.0,2221.0,2231.0,2221.0,2215.0,2215.0,2205.0,2205.0,...,2215.0,2215.0,2225.0,2225.0,2225.0,2225.0,2234.0,2234.0,2234.0,2234.0
mean,28.635184,3.453894,57.94843,79.944169,1723.479606,1104.742008,5.823928,4.190971,5.822676,4.043537,...,2.200451,0.445147,1.850387,3.222966,1.937683,3.526445,2.002936,3.743693,1.871629,3.186124
std,15.50522,0.837904,54.876823,76.409609,2022.360691,1145.908936,1.199468,1.828982,1.22949,1.840488,...,0.455321,0.577775,0.917769,3.228215,1.0518,3.723625,1.12587,4.409477,0.933585,2.769056
min,1.0,3.0,1.0,1.0,17.0,3.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1.002,1.07,1.005,1.07,1.01,1.08,1.01,1.06
25%,17.0,3.0,20.0,36.0,707.0,577.0,6.0,3.0,6.0,3.0,...,2.0,0.0,1.3,1.66,1.33,1.76,1.36,1.82,1.31,1.72
50%,29.0,3.0,48.0,64.0,983.0,830.0,6.0,4.0,6.0,4.0,...,2.0,0.0,1.57,2.3,1.64,2.44,1.67,2.51,1.6,2.34
75%,41.0,3.0,78.0,98.0,1740.0,1205.0,6.0,6.0,6.0,6.0,...,2.0,1.0,2.1,3.5,2.19,3.65,2.27,3.8,2.12,3.4775
max,54.0,5.0,455.0,1415.0,12415.0,12355.0,7.0,7.0,7.0,7.0,...,3.0,2.0,9.0,41.0,11.73,37.8,12.22,67.0,9.64,28.49


In [7]:
def compute_elo_rankings(data):
    """
    Given the list on matches in chronological order, for each match, computes 
    the elo ranking of the 2 players at the beginning of the match
    """
    print("Elo rankings computing...")
    players=list(pd.Series(list(data.Winner)+list(data.Loser)).value_counts().index)
    elo=pd.Series(np.ones(len(players))*1500,index=players)
    ranking_elo=[(1500,1500)]
    for i in range(1,len(data)):
        w=data.iloc[i-1,:].Winner
        l=data.iloc[i-1,:].Loser
        elow=elo[w]
        elol=elo[l]
        pwin=1 / (1 + 10 ** ((elol - elow) / 400))    
        K_win=32
        K_los=32
        new_elow=elow+K_win*(1-pwin)
        new_elol=elol-K_los*(1-pwin)
        elo[w]=new_elow
        elo[l]=new_elol
        ranking_elo.append((elo[data.iloc[i,:].Winner],elo[data.iloc[i,:].Loser])) 
    ranking_elo=pd.DataFrame(ranking_elo,columns=["elo_winner","elo_loser"])    
    ranking_elo["proba_elo"]=1 / (1 + 10 ** ((ranking_elo["elo_loser"] - ranking_elo["elo_winner"]) / 400))   
    return ranking_elo

In [8]:
def preprocess_data(df, 
                    features_to_drop=[], 
                    missing_values="drop", 
                    drop_first=False):
    """
    Processes raw data and returns a tuple (X, Y) where X is the cleaned dataset and Y is the array of labels.
    """
    # Sort by date to calculate ELO
    X = df.sort_values(by='Date')
    
    # Drop unuseful columns
    features_to_drop += ['ATP', 'Location', 'Tournament', 'Date', 'Comment', 
                         'WPts', 'LPts', 'Wsets', 'Lsets', 
                         'W1', 'L1', 'W2', 'L2', 'W3', 'L3', 'W4', 'L4', 'W5', 'L5', 
                         'B365W', 'B365L', 'EXW', 'EXL', 'LBW', 'LBL', 'PSW', 'PSL', 'SJW', 'SJL']
    X = X.drop(columns=features_to_drop)
    
    # Deal with missing values
    X['WRank'] = X['WRank'].fillna(value=X['WRank'].max()+1).astype(int)
    X['LRank'] = X['LRank'].fillna(value=X['LRank'].max()+1).astype(int)
    
    if missing_values == 'drop':
        X = X.dropna()
    elif missing_values == 'mean':
        from sklearn.impute import SimpleImputer
        pass
    elif missing_values == 'custom':
        pass
    else:
        raise ValueError('Wrong parameter: missing_values')

    # Convert ordinal features to int (higher value means more important)
    series = ['ATP250', 'ATP500', 'Masters 1000', 'Masters Cup', 'Grand Slam']
    series2int = {s: i for i, s in enumerate(series)}
    rounds2int = {'1st Round': 0,
                  '2nd Round': 1,
                  '3rd Round': 2,
                  '4th Round': 3,
                  'Round Robin': 4,
                  'Quarterfinals': 5,
                  'Semifinals': 6,
                  'The Final': 7,
                 }
    X = X.replace({'Round': rounds2int, 'Series': series2int})
    
    # Convert court to binary
    X = X.replace({'Court': {'Outdoor': 0, 'Indoor': 1}})
    
    # One hot encode categorical features into binary features
    X = pd.get_dummies(X, prefix=['Surface_'], columns=['Surface'], drop_first=drop_first)
    
    # Convert players to numeric ?
    players = set(X['Winner']) | set(X['Loser'])
    players_to_id = {}
    for i, player in enumerate(players):
        players_to_id[player] = i
    X = X.replace({'Winner': players_to_id, 'Loser': players_to_id})
    X = X.astype({'Winner':int, 'Loser':int})

    X = X.rename(columns={'Winner':'1st Player', 'Loser':'2nd Player', 
                          'WRank':'P1Rank', 'LRank':'P2Rank', 
                          'MaxW':'MaxP1', 'MaxL':'MaxP2', 
                          'AvgW':'AvgP1', 'AvgL':'AvgP2'})
    
    # Generate labels
    Y = np.concatenate([np.ones(X.shape[0], dtype=int), np.zeros(X.shape[0], dtype=int)])
    # Swap columns and concatenate to data
    tmp = X.copy()
    cols_to_swap = ['1st Player', '2nd Player', 'P1Rank', 'P2Rank', 'MaxP1', 'MaxP2',  'AvgP1',  'AvgP2']
    cols_swapped = ['2nd Player', '1st Player', 'P2Rank', 'P1Rank', 'MaxP2', 'MaxP1',  'AvgP2',  'AvgP1']
    tmp[cols_to_swap] = tmp[cols_swapped]
    tmp.index = np.array(range(X.shape[0] + 1, X.shape[0] * 2 + 1))
    X = pd.concat((X, tmp))
    
    return X, Y

In [9]:
data, labels = preprocess_data(df_ext)

In [10]:
print(all(type(t) == np.int32 for t in labels))

True


In [11]:
data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 47218 entries, 220 to 47218
Data columns (total 15 columns):
Series            47218 non-null int64
Court             47218 non-null int64
Round             47218 non-null int64
Best of           47218 non-null int64
1st Player        47218 non-null int32
2nd Player        47218 non-null int32
P1Rank            47218 non-null int32
P2Rank            47218 non-null int32
MaxP1             47218 non-null float64
MaxP2             47218 non-null float64
AvgP1             47218 non-null float64
AvgP2             47218 non-null float64
Surface__Clay     47218 non-null uint8
Surface__Grass    47218 non-null uint8
Surface__Hard     47218 non-null uint8
dtypes: float64(4), int32(4), int64(4), uint8(3)
memory usage: 4.1 MB


In [12]:
def foo(X, Y):
    # Division of dataset in training set and test set.
    # Validation set not required since we're going to use cross-validation
    # by GIORGIA: ma dove usi i parametri x e y? 
    X_train, X_test, Y_train, Y_test = train_test_split(data, labels, test_size=0.25)
    # do we need to preserve date ordering?
    # test with decision trees
    print("{:10} - {:10} - {:10} - {:10}".format('Leaves', 'Train acc', 'Test acc', 'k-valid'))
    for leaves in range(2, 50):
        dt = tree.DecisionTreeClassifier(class_weight="balanced",
                                         max_leaf_nodes=leaves)
        # 1st method, no validation set
        # not sure if it's right the way it's done
        dt.fit(X_train, Y_train)
        train_acc = accuracy_score(y_true=Y_train, y_pred=dt.predict(X_train))
        test_acc = accuracy_score(y_true=Y_test, y_pred=dt.predict(X_test))
        
        # 2nd method using cross validation, should perform better
        dt = tree.DecisionTreeClassifier(class_weight="balanced",
                                         max_leaf_nodes=leaves)
        scores = cross_val_score(dt, X_train, Y_train, 
                                 cv=5, scoring='accuracy')
        
        print(leaves, train_acc,test_acc, scores.mean())

In [13]:
foo(data, labels)

Leaves     - Train acc  - Test acc   - k-valid   
2 0.7008160844887471 0.6998729351969505 0.7000537277207106
3 0.7008160844887471 0.6998729351969505 0.7000537277207106
4 0.7008160844887471 0.6998729351969505 0.7000537277207106
5 0.7008160844887471 0.6998729351969505 0.7000537277207106
6 0.7008160844887471 0.6998729351969505 0.7000537277207106
7 0.7008160844887471 0.6998729351969505 0.7000537277207106
8 0.7008160844887471 0.6998729351969505 0.7000537277207106
9 0.7008160844887471 0.6998729351969505 0.7000537277207106
10 0.7008160844887471 0.6998729351969505 0.7000537277207106
11 0.7008160844887471 0.6998729351969505 0.7000537277207106
12 0.7008160844887471 0.6998729351969505 0.7000537277207106
13 0.7008160844887471 0.6998729351969505 0.7000537277207106
14 0.7008160844887471 0.6998729351969505 0.6999125446062111
15 0.7008160844887471 0.6998729351969505 0.6999125446062111
16 0.7008160844887471 0.6998729351969505 0.6999125446062111
17 0.7008160844887471 0.6998729351969505 0.699912544606211

In [14]:
def dowload_ao2020_data():
    """
    To get the data for the tournament? if we need to scrape these are the links of the matches
    https://ausopen.com/tournament-schedule
    https://ausopen.com/schedule#!8071
    """
    
def get_elo():
    from requests import get
    

# Primo Test con k-Nearest-Neighbor Classifiers


In [48]:
from sklearn import neighbors
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn.model_selection import train_test_split

def kNN_fun(X, Y) : 
    
    X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.25, random_state=42)
    
    for k in range(1,30):

        kNN = neighbors.KNeighborsClassifier(n_neighbors=k)
        kNN.fit(X_train,Y_train)

        Y_pred = kNN.predict(X_test)

        # compute Accuracy
        print ("k:", k," | Accuracy:", accuracy_score(y_true=Y_test, y_pred=Y_pred) )

In [49]:
kNN_fun(data, labels)

k: 1  | Accuracy: 0.5766200762388818
k: 2  | Accuracy: 0.5757729775518848
k: 3  | Accuracy: 0.6033036848792884
k: 4  | Accuracy: 0.6071156289707751
k: 5  | Accuracy: 0.623718763235917
k: 6  | Accuracy: 0.6195679796696315
k: 7  | Accuracy: 0.6264294790343075
k: 8  | Accuracy: 0.6271918678526048
k: 9  | Accuracy: 0.6315120711562897
k: 10  | Accuracy: 0.6324438797119865
k: 11  | Accuracy: 0.6342227869546803
k: 12  | Accuracy: 0.6354934349851757
k: 13  | Accuracy: 0.6356628547225752
k: 14  | Accuracy: 0.634561626429479
k: 15  | Accuracy: 0.6375264718339687
k: 16  | Accuracy: 0.6365099534095722
k: 17  | Accuracy: 0.6388818297331639
k: 18  | Accuracy: 0.6409148665819568
k: 19  | Accuracy: 0.6425243540872512
k: 20  | Accuracy: 0.6436255823803473
k: 21  | Accuracy: 0.6415078356628547
k: 22  | Accuracy: 0.6462515883100381
k: 23  | Accuracy: 0.6433714527742482
k: 24  | Accuracy: 0.6451503600169419
k: 25  | Accuracy: 0.6442185514612452
k: 26  | Accuracy: 0.6453197797543414
k: 27  | Accuracy: 0.64

### Otteniamo al più una precisione del 64% con k = 27

In [52]:
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import StandardScaler

def kNN_fun_MinMaxScaler(X, Y) : 
    
    X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.25, random_state=42)
    
    scaler = MinMaxScaler()
    scaler.fit(X_train)
    
    for k in range(1,20):

        kNN = neighbors.KNeighborsClassifier(n_neighbors=k)
        kNN.fit(scaler.transform(X_train),Y_train)

        Y_pred = kNN.predict(scaler.transform(X_test))

        # compute Accuracy
        print ("k:", k," | Accuracy:", accuracy_score(y_true=Y_test, y_pred=Y_pred) )

def kNN_fun_StandardScaler(X, Y) : 
    
    X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.25, random_state=42)
    
    scaler = StandardScaler()
    scaler.fit(X_train)
    
    for k in range(1,20):

        kNN = neighbors.KNeighborsClassifier(n_neighbors=k)
        kNN.fit(scaler.transform(X_train),Y_train)

        Y_pred = kNN.predict(scaler.transform(X_test))

        # compute Accuracy
        print ("k:", k," | Accuracy:", accuracy_score(y_true=Y_test, y_pred=Y_pred) )

In [53]:
print ("kNN MinMax Scaler")
kNN_fun_MinMaxScaler(data, labels)
print ("kNN Standard Scaler")
kNN_fun_StandardScaler(data, labels)

kNN MinMax Scaler
k: 1  | Accuracy: 0.5839898348157561
k: 2  | Accuracy: 0.5802626005929691
k: 3  | Accuracy: 0.5996611605252011
k: 4  | Accuracy: 0.5958492164337146
k: 5  | Accuracy: 0.6033036848792884
k: 6  | Accuracy: 0.6042354934349852
k: 7  | Accuracy: 0.6088098263447692
k: 8  | Accuracy: 0.6058449809402795
k: 9  | Accuracy: 0.607708598051673
k: 10  | Accuracy: 0.6064379500211775
k: 11  | Accuracy: 0.6144853875476493
k: 12  | Accuracy: 0.6094027954256671
k: 13  | Accuracy: 0.6131300296484541
k: 14  | Accuracy: 0.6120288013553579
k: 15  | Accuracy: 0.6108428631935621
k: 16  | Accuracy: 0.6121135112240575
k: 17  | Accuracy: 0.613977128335451
k: 18  | Accuracy: 0.6104193138500635
k: 19  | Accuracy: 0.6156713257094452
kNN Standard Scaler
k: 1  | Accuracy: 0.5987293519695045
k: 2  | Accuracy: 0.6027954256670902
k: 3  | Accuracy: 0.629140194832698
k: 4  | Accuracy: 0.6309191020753918
k: 5  | Accuracy: 0.642270224481152
k: 6  | Accuracy: 0.6398136382888606
k: 7  | Accuracy: 0.64642100804

## Test size : 0.33
### Con kNN MinMax Scaler non andiamo oltre il 62%
### Con kNN Standard Scaler abbiamo risultati migliori ma comunque non superiamo il 66%
## Test size : 0.25
### Con kNN MinMax Scaler non andiamo oltre il 61%
### Con kNN Standard Scaler abbiamo risultati migliori ma comunque non superiamo il 66%