# Open 2020: Winner Predictor
## Diego Chinellato - 867637
## Giorgia Campardo - 867928
### Web Intelligence Course

TODO: valutare LightGBM.

In [1]:
%matplotlib notebook

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import accuracy_score
from sklearn import tree

In [2]:
# Load the raw match dataset from the Excel workbook (path is relative to the working directory).
df_ext = pd.read_excel('2019+.xlsx')

In [4]:
# Summarise column dtypes and non-null counts to see which columns have missing values.
df_ext.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 23634 entries, 0 to 23633
Data columns (total 42 columns):
ATP           23634 non-null int64
Location      23634 non-null object
Tournament    23634 non-null object
Date          23634 non-null datetime64[ns]
Series        23634 non-null object
Court         23634 non-null object
Surface       23634 non-null object
Round         23634 non-null object
Best of       23634 non-null int64
Winner        23634 non-null object
Loser         23634 non-null object
WRank         23624 non-null float64
LRank         23586 non-null float64
WPts          23626 non-null float64
LPts          23587 non-null float64
W1            23483 non-null float64
L1            23485 non-null float64
W2            23260 non-null float64
L2            23260 non-null float64
W3            11173 non-null float64
L3            11173 non-null float64
W4            2200 non-null float64
L4            2200 non-null float64
W5            816 non-null float64
L5          

In [5]:
# Index labels of the matches played before 2014-01-01, i.e. the rows that
# preprocess_data discards with its default max_date.
df_ext[df_ext['Date'] < pd.Timestamp(2014, 1, 1)].index

Int64Index([  135,   136,   137,   138,   139,   140,   141,   142,   143,
              144,
            ...
            23470, 23471, 23472, 23473, 23474, 23475, 23476, 23477, 23478,
            23479],
           dtype='int64', length=7953)

In [6]:
def compute_elo_rankings(data):
    """
    Compute the Elo rating both players hold at the start of every match.

    Parameters
    ----------
    data : DataFrame with ``Winner`` and ``Loser`` columns, one row per match,
        already sorted in chronological order.

    Returns
    -------
    DataFrame with one row per match (in the same order as ``data`` but with a
    fresh 0..n-1 index) and the columns:
      * ``elo_winner`` / ``elo_loser``: ratings before the match is played;
      * ``proba_elo``: win probability of the eventual winner implied by
        those two ratings.
    An empty ``data`` yields an empty frame.

    Every player starts at 1500 and ratings are updated with a constant
    K-factor of 32 for both the winner and the loser.
    """
    print("Elo rankings computing...")
    start_elo = 1500.0
    K_win = 32
    K_los = 32
    elo = {}          # current rating of every player seen so far
    ranking_elo = []  # (winner rating, loser rating) before each match
    # Iterate over the two columns directly: materialising a row with .iloc
    # for every match is needlessly slow on tens of thousands of rows.
    for w, l in zip(data.Winner, data.Loser):
        elow = elo.get(w, start_elo)
        elol = elo.get(l, start_elo)
        # Record the ratings *before* this match updates them.
        ranking_elo.append((elow, elol))
        pwin = 1 / (1 + 10 ** ((elol - elow) / 400))
        elo[w] = elow + K_win * (1 - pwin)
        elo[l] = elol - K_los * (1 - pwin)
    ranking_elo = pd.DataFrame(ranking_elo,
                               columns=["elo_winner", "elo_loser"],
                               dtype=float)
    ranking_elo["proba_elo"] = 1 / (1 + 10 ** ((ranking_elo["elo_loser"] - ranking_elo["elo_winner"]) / 400))
    return ranking_elo

In [7]:
def preprocess_data(df,
                    max_date=2014,
                    features_to_drop=None,
                    missing_values="drop",
                    drop_first=False):
    """
    Processes raw data and returns a tuple (X, Y) where X is the cleaned dataset and Y is the array of labels.

    Parameters
    ----------
    df : raw matches DataFrame (one row per match). It is not modified.
    max_date : int, first year to keep. Despite the name this is a lower
        bound: matches dated before January 1st of this year are discarded.
    features_to_drop : optional iterable of extra column names to drop on top
        of the built-in list (identifiers, set scores, bookmaker odds). The
        argument is never modified.
    missing_values : how to treat values still missing after the ranks have
        been filled:
          * 'drop'   - drop every row containing a missing value;
          * 'mean'   - fill numeric columns with their column mean;
          * 'custom' - placeholder, currently leaves the data untouched.
    drop_first : forwarded to ``pd.get_dummies`` for the 'Surface' encoding.

    Returns
    -------
    X : DataFrame with 2 * n rows and a fresh 0..2n-1 index. The first n rows
        are the matches with the winner as '1st Player'; the last n rows are
        the same matches with the two players (and their rank / odds columns)
        swapped.
    Y : int array of length 2 * n, 1 when '1st Player' won (first half) and 0
        otherwise (second half).

    Raises
    ------
    ValueError : unknown ``missing_values``.
    KeyError : a column to drop is absent, or only one column of a
        winner/loser pair is left so the mirrored rows cannot be built.
    """
    # Sort by date to calculate ELO
    X = df.sort_values(by='Date')

    # Drop old data (rows whose date is missing are kept)
    X = X[~(X['Date'] < pd.Timestamp(max_date, 1, 1))]

    # Drop unuseful columns. A new list is built so that neither the caller's
    # list nor a shared default is extended from one call to the next.
    columns_to_drop = list(features_to_drop or []) + [
        'ATP', 'Location', 'Tournament', 'Date', 'Comment',
        'WPts', 'LPts', 'Wsets', 'Lsets',
        'W1', 'L1', 'W2', 'L2', 'W3', 'L3', 'W4', 'L4', 'W5', 'L5',
        'B365W', 'B365L', 'EXW', 'EXL', 'LBW', 'LBL', 'PSW', 'PSL', 'SJW', 'SJL']
    X = X.drop(columns=columns_to_drop)

    # Deal with missing values: a missing rank is replaced by a rank one
    # position worse than the worst rank observed in that column.
    X['WRank'] = X['WRank'].fillna(value=X['WRank'].max()+1).astype(int)
    X['LRank'] = X['LRank'].fillna(value=X['LRank'].max()+1).astype(int)

    if missing_values == 'drop':
        X = X.dropna()
    elif missing_values == 'mean':
        # Mean imputation of the numeric columns; non-numeric ones are untouched.
        X = X.fillna(X.mean(numeric_only=True))
    elif missing_values == 'custom':
        # NOTE(review): no custom strategy has been implemented yet; the data
        # is deliberately left as-is.
        pass
    else:
        raise ValueError('Wrong parameter: missing_values')

    # Convert ordinal features to int (higher value means more important)
    series = ['ATP250', 'ATP500', 'Masters 1000', 'Masters Cup', 'Grand Slam']
    series2int = {s: i for i, s in enumerate(series)}
    rounds2int = {'1st Round': 0,
                  '2nd Round': 1,
                  '3rd Round': 2,
                  '4th Round': 3,
                  'Round Robin': 4,
                  'Quarterfinals': 5,
                  'Semifinals': 6,
                  'The Final': 7,
                 }
    # Court becomes a binary flag.
    court2int = {'Outdoor': 0, 'Indoor': 1}
    encodings = {'Round': rounds2int, 'Series': series2int, 'Court': court2int}
    for column, mapping in encodings.items():
        # Columns the caller asked to drop are skipped; values without a
        # code pass through unchanged.
        if column in X.columns:
            X[column] = X[column].map(lambda value, m=mapping: m.get(value, value))

    # One hot encode categorical features into binary features.
    # The prefix (and the resulting 'Surface__<name>' columns) is kept as it
    # was so that existing column names do not change.
    X = pd.get_dummies(X, prefix=['Surface_'], columns=['Surface'], drop_first=drop_first)

    # Convert players to numeric ids. Names are sorted so that the ids are
    # the same on every run (set iteration order is not stable for strings).
    players = sorted(set(X['Winner']) | set(X['Loser']))
    players_to_id = {player: i for i, player in enumerate(players)}
    X['Winner'] = X['Winner'].map(players_to_id)
    X['Loser'] = X['Loser'].map(players_to_id)
    X = X.astype({'Winner':int, 'Loser':int})

    X = X.rename(columns={'Winner':'1st Player', 'Loser':'2nd Player',
                          'WRank':'P1Rank', 'LRank':'P2Rank',
                          'MaxW':'MaxP1', 'MaxL':'MaxP2',
                          'AvgW':'AvgP1', 'AvgL':'AvgP2'})

    # Generate labels: first half = player 1 won, second half = player 1 lost
    n_matches = X.shape[0]
    Y = np.concatenate([np.ones(n_matches, dtype=int), np.zeros(n_matches, dtype=int)])

    # Build the mirrored copy by swapping every player-1 / player-2 column pair.
    swap_pairs = [('1st Player', '2nd Player'), ('P1Rank', 'P2Rank'),
                  ('MaxP1', 'MaxP2'), ('AvgP1', 'AvgP2')]
    swap_names = {}
    for first, second in swap_pairs:
        has_first, has_second = first in X.columns, second in X.columns
        if has_first and has_second:
            swap_names[first] = second
            swap_names[second] = first
        elif has_first or has_second:
            # Half a pair cannot be mirrored without mislabelling the feature.
            raise KeyError("Cannot swap '%s'/'%s': only one of the two columns is present"
                           % (first, second))
    mirrored = X.rename(columns=swap_names)[X.columns]

    # ignore_index gives the result a clean 0..2n-1 index, so the mirrored
    # rows can never share a label with the original rows.
    X = pd.concat((X, mirrored), ignore_index=True)

    return X, Y

In [8]:
# Build the feature matrix and the labels with the default settings
# (matches from 2014 onwards, rows with missing values dropped).
data, labels = preprocess_data(df_ext)

Let's split the data into train set, validation set and test set

In [20]:
# 60% train / 20% validation / 20% test: the second split takes 25% of the
# remaining 80%. The seed is fixed so that the split, and therefore every
# score computed from it, is reproducible across kernel restarts.
# NOTE(review): preprocess_data emits every match twice (original and
# mirrored), so a purely random split can place the two copies of the same
# match in different sets; consider splitting by match before mirroring.
X_train, X_test, Y_train, Y_test = train_test_split(data, labels, test_size=0.20, random_state=42)
X_train, X_valid, Y_train, Y_valid = train_test_split(X_train, Y_train, test_size=0.25, random_state=42)

In [37]:
def build_decision_tree(X_train, Y_train, X_valid, Y_valid, random_state=None):
    """
    Build a decision tree, tuning its hyper-parameters on the validation set.

    Every combination of split criterion ('gini', 'entropy'), max_depth
    (5..49) and max_leaf_nodes (5..19) is fitted on the training set and
    scored by validation accuracy rounded to 3 decimals.

    Parameters
    ----------
    X_train, Y_train : training features and labels.
    X_valid, Y_valid : validation features and labels used for model selection.
    random_state : optional seed forwarded to every DecisionTreeClassifier so
        that the search is repeatable. The default (None) keeps scikit-learn's
        unseeded behaviour.

    Returns
    -------
    (dt, best) where ``dt`` is the fitted tree that achieved the best score and
    ``best`` is the tuple (valid_acc, criterion, depth, leaves). Ties on
    accuracy are resolved by comparing the rest of the tuple, exactly as
    ``max()`` over the tuples would. ``best`` is also printed.
    """
    best = None
    best_tree = None
    for criterion in ('gini', 'entropy'):
        for depth in range(5, 50):
            for leaves in range(5, 20):
                candidate_tree = tree.DecisionTreeClassifier(max_leaf_nodes=leaves,
                                                             criterion=criterion,
                                                             max_depth=depth,
                                                             random_state=random_state)
                candidate_tree.fit(X_train, Y_train)
                valid_acc = round(accuracy_score(y_true=Y_valid,
                                                 y_pred=candidate_tree.predict(X_valid)), 3)
                candidate = (valid_acc, criterion, depth, leaves)
                # Keep the fitted estimator itself: refitting the winning
                # configuration afterwards costs an extra fit and, without a
                # seed, is not guaranteed to reproduce the tree that was scored.
                if best is None or candidate > best:
                    best = candidate
                    best_tree = candidate_tree
    print(best)
    return best_tree, best

In [38]:
# Tune the tree on the validation set; `params` is the winning
# (validation accuracy, criterion, max_depth, max_leaf_nodes) tuple.
dt, params = build_decision_tree(X_train, Y_train, X_valid, Y_valid)

(0.69758, 'gini', 49, 13)


# Primo Test con k-Nearest-Neighbor Classifiers


In [12]:
from sklearn import neighbors
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn.model_selection import train_test_split

def kNN_fun(X, Y) : 
    """
    Print the hold-out accuracy of a k-NN classifier for k = 1..29.

    X and Y are split 75% / 25% with a fixed seed (42); each classifier is
    fitted on the larger part, using the features as they are (no scaling),
    and scored on the smaller one. Nothing is returned.
    """
    X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.25, random_state=42)

    for n_neighbors in range(1, 30):
        classifier = neighbors.KNeighborsClassifier(n_neighbors=n_neighbors)
        classifier.fit(X_train, Y_train)
        accuracy = accuracy_score(y_true=Y_test, y_pred=classifier.predict(X_test))
        print ("k:", n_neighbors," | Accuracy:", accuracy )

In [13]:
# Baseline: k-NN on the unscaled features, one accuracy line per k.
kNN_fun(data, labels)

k: 1  | Accuracy: 0.5758039816232772
k: 2  | Accuracy: 0.5678917815211842
k: 3  | Accuracy: 0.597753956100051
k: 4  | Accuracy: 0.59315977539561
k: 5  | Accuracy: 0.6089841755997958
k: 6  | Accuracy: 0.6086013272077591
k: 7  | Accuracy: 0.6126850433894845
k: 8  | Accuracy: 0.613323124042879
k: 9  | Accuracy: 0.622766717713119
k: 10  | Accuracy: 0.6177896886166412
k: 11  | Accuracy: 0.6294027565084227
k: 12  | Accuracy: 0.6230219499744768
k: 13  | Accuracy: 0.6271056661562021
k: 14  | Accuracy: 0.6237876467585503
k: 15  | Accuracy: 0.6282542113323124
k: 16  | Accuracy: 0.6248085758039816
k: 17  | Accuracy: 0.6332312404287902
k: 18  | Accuracy: 0.6309341500765697
k: 19  | Accuracy: 0.6339969372128637
k: 20  | Accuracy: 0.6354007146503318
k: 21  | Accuracy: 0.6383358856559469
k: 22  | Accuracy: 0.6373149566105155
k: 23  | Accuracy: 0.6385911179173047
k: 24  | Accuracy: 0.6387187340479836
k: 25  | Accuracy: 0.6406329760081675
k: 26  | Accuracy: 0.6406329760081675
k: 27  | Accuracy: 0.63910

### Otteniamo al più un'accuratezza di circa il 64% (massimo nell'output riportato: k = 25 e k = 26)

In [14]:
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import StandardScaler

def _kNN_scaled_sweep(X, Y, scaler):
    """
    Print the hold-out accuracy of a k-NN classifier for k = 1..19 after
    rescaling the features with ``scaler``.

    X and Y are split 75% / 25% with a fixed seed (42). The scaler is fitted
    on the training part only and then applied to both parts, so no
    statistics of the test part reach the model.
    """
    X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.25, random_state=42)

    scaler.fit(X_train)
    # The scaled matrices do not depend on k: compute them once, not per iteration.
    X_train_scaled = scaler.transform(X_train)
    X_test_scaled = scaler.transform(X_test)

    for k in range(1,20):

        kNN = neighbors.KNeighborsClassifier(n_neighbors=k)
        kNN.fit(X_train_scaled,Y_train)

        Y_pred = kNN.predict(X_test_scaled)

        # compute Accuracy
        print ("k:", k," | Accuracy:", accuracy_score(y_true=Y_test, y_pred=Y_pred) )

def kNN_fun_MinMaxScaler(X, Y) : 
    """Print k-NN accuracies (k = 1..19) with features rescaled by MinMaxScaler."""
    _kNN_scaled_sweep(X, Y, MinMaxScaler())

def kNN_fun_StandardScaler(X, Y) : 
    """Print k-NN accuracies (k = 1..19) with features standardised by StandardScaler."""
    _kNN_scaled_sweep(X, Y, StandardScaler())

In [15]:
# Repeat the k sweep with each of the two feature scalings to compare them
# against the unscaled baseline.
print ("kNN MinMax Scaler")
kNN_fun_MinMaxScaler(data, labels)
print ("kNN Standard Scaler")
kNN_fun_StandardScaler(data, labels)

kNN MinMax Scaler
k: 1  | Accuracy: 0.5844818785094436
k: 2  | Accuracy: 0.5793772332822869
k: 3  | Accuracy: 0.5972434915773354
k: 4  | Accuracy: 0.5935426237876468
k: 5  | Accuracy: 0.6010719754977029
k: 6  | Accuracy: 0.6004338948443083
k: 7  | Accuracy: 0.6027309851965288
k: 8  | Accuracy: 0.5957120980091883
k: 9  | Accuracy: 0.6033690658499234
k: 10  | Accuracy: 0.6004338948443083
k: 11  | Accuracy: 0.6047728432873916
k: 12  | Accuracy: 0.6027309851965288
k: 13  | Accuracy: 0.6092394078611536
k: 14  | Accuracy: 0.6063042368555386
k: 15  | Accuracy: 0.6126850433894845
k: 16  | Accuracy: 0.6074527820316488
k: 17  | Accuracy: 0.6123021949974476
k: 18  | Accuracy: 0.6108984175599795
k: 19  | Accuracy: 0.6097498723838694
kNN Standard Scaler
k: 1  | Accuracy: 0.5960949464012251
k: 2  | Accuracy: 0.5950740173557938
k: 3  | Accuracy: 0.6230219499744768
k: 4  | Accuracy: 0.6145992853496682
k: 5  | Accuracy: 0.6360387953037264
k: 6  | Accuracy: 0.6282542113323124
k: 7  | Accuracy: 0.6360387

## Test size : 0.33
### Con kNN MinMax Scaler non andiamo oltre il 62%
### Con kNN Standard Scaler abbiamo risultati migliori ma comunque non superiamo il 66%
## Test size : 0.25
### Con kNN MinMax Scaler non andiamo oltre il 61%
### Con kNN Standard Scaler abbiamo risultati migliori ma comunque non superiamo il 66%