In [1]:
# libraries
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from time import sleep
from bs4 import BeautifulSoup
from requests import get
# our modules
from preprocessing import *

In [47]:
# Build the model
def build_best_model(X,Y):
    rf = RandomForestClassifier(n_estimators=400,
                            bootstrap=True,
                            criterion='gini',
                            max_features=None,
                            max_depth=20,
                            n_jobs=-1)
    rf.fit(X, Y)
    return rf

def scraping_rank_points(url):
    r =  get(url)
    page = BeautifulSoup(r.content)
    table = page.find(id='matchs_info')
    allTr = table.find_all('tr')
    for tr in allTr[1:]:
        rank = tr.contents[0].contents[0]
        points = tr.contents[3].contents[0]
        name = uniform_name(tr.contents[1].contents[0].contents[0])
        if name == 'Busta C.P.':
            name = 'Carreno-Busta P.'
        elif name == 'Vinolas R.A.':
            name = 'Ramos-Vinolas A.'
        elif name == 'Monteiro M.T.':
            name = 'Monteiro T.'
        elif name == 'Clar M.A.J.':
            name = 'Munar J.'
        elif name == 'Potro D.M.J.':
            name = 'Del Potro J.M.'
        elif name == 'McDonald M.':
            name = 'Mcdonald M.'
        elif name == 'Galan E.D.':
            name = 'Galan D.E.'
        elif name == 'Lee D.':
            name = 'Lee D.H.'
        elif name == 'Kubler M.J.':
            name = 'Kubler J.'
        elif name == 'Kwiatkowski T.':
            name = 'Kwiatkowski T.S.'
        elif name == 'Fernandez H.J.':
            name = 'Hernandez-Fernandez J.'
        elif name == 'Silva D.D.R.':
            name = 'Dutra Silva R.'
        else: 
            rank_points.update({name : {'Rank': rank, 'Points': points}})

# Scraping players of Australian Open 2020
def scraping_players():
    driver = webdriver.Chrome()
    driver.get('https://ausopen.com/draws#!mens-singles')
    sleep(5)
    round1 = driver.find_element_by_class_name('columm-1')
    matches = round1.find_elements_by_class_name('match-teams')
    firstRound = []
    
    for m in matches:
        players = m.find_elements_by_class_name('team-detail__players')
        firstRound.append((
            uniform_name(players[0].find_element_by_class_name('player-full-name').get_attribute('innerHTML')) ,
            uniform_name(players[1].find_element_by_class_name('player-full-name').get_attribute('innerHTML')) ))
    driver.close()
    # Correzioni manuali a causa del non perfetto funzionamento della uniform_name()
    firstRound[3] = ('Kovalik J.', 'Carreno-Busta P.')
    firstRound[11] = ('Gulbis E.', 'Auger-Aliassime F.')
    firstRound[14] = ('Bolt A.', 'Ramos-Vinolas A.')
    firstRound[19] = ('Popyrin A.', 'Tsonga J.W.')
    firstRound[21] = ('Tabilo A.', 'Galan D.E.')
    firstRound[25] = ('Herbert P.H.', 'Norrie C.')
    firstRound[28] = ('Basilashvili N.', 'Kwon S.W.')
    firstRound[34] = ('Carballes Baena R.', 'Berankis R.')
    firstRound[36] = ('Pella G.', 'Smith J.P.')
    firstRound[43] = ('Londero J.I.', 'Dimitrov G.')
    firstRound[52] = ('Paire B.', 'Stebe C.M.')
    firstRound[55] = ('Lopez F.', 'Bautista Agut R.')
    firstRound[57] = ('Davidovich Fokina A.', 'Gombos N.')
    firstRound[60] = ('Evans D.', 'Mcdonald M.')
    firstRound[63] = ('Struff J.L.', 'Djokovic N.')
    
    return firstRound

def build_dataset(playersInRound, nRound):
    dataSet = {'Series': [], 'Court': [], 'Surface': [], 'Round': [], 'Best of': [], 'Winner': [], 'Loser': [],
               'WRank': [], 'LRank': [], 'WPts': [], 'LPts': [], 'WBD': [], 'WHand': [], 'WBHand': [], 'LBD': [],
               'LHand': [], 'LBHand': [], 'WEloCalc': [], 'LEloCalc': [], 'ProbaElo': [], 'ATP': [], 'Location': [], 
               'Tournament': [], 'Date': [], 'Comment': [], 'Wsets': [], 'Lsets': [], 'W1': [], 'L1': [], 'W2': [], 
               'L2': [], 'W3': [], 'L3': [], 'W4': [], 'L4': [], 'W5': [], 'L5': [], 'B365W': [], 'B365L': [], 
               'EXW': [], 'EXL': [], 'LBW': [], 'LBL': [], 'PSW': [], 'PSL': [], 'SJW': [], 'SJL': [], 
               'MaxW': [], 'MaxL': [], 'AvgW': [], 'AvgL': [] }
    for p1, p2 in playersInRound:
        dataSet['Series'].append('Grand Slam')
        dataSet['Court'].append('Outdoor')
        dataSet['Surface'].append('Hard')
        dataSet['Round'].append(nRound)
        dataSet['Best of'].append(5)
        dataSet['Winner'].append(p1)
        dataSet['Loser'].append(p2)

        try:
            dataSet['WRank'].append(int(rank_points[p1]['Rank']))
        except KeyError:
            dataSet['WRank'].append(np.nan)
        try:
            dataSet['LRank'].append(int(rank_points[p2]['Rank']))
        except KeyError:
            dataSet['LRank'].append(np.nan)
        try:
            dataSet['WPts'].append(int(rank_points[p1]['Points']))
        except KeyError:
            dataSet['WPts'].append(np.nan)
        try:
            dataSet['LPts'].append(int(rank_points[p2]['Points']))
        except KeyError:
            dataSet['LPts'].append(np.nan)

        dataSet['WBD'].append(playersdata.loc[p1]['BirthDate'])
        dataSet['WHand'].append(playersdata.loc[p1]['Hand'])
        dataSet['WBHand'].append(playersdata.loc[p1]['BackHand'])
        
        dataSet['LBD'].append(playersdata.loc[p2]['BirthDate'])
        dataSet['LHand'].append(playersdata.loc[p2]['Hand'])
        dataSet['LBHand'].append(playersdata.loc[p2]['BackHand'])

        try:
            p1elo = playersElo[p1]
        except KeyError:
            p1elo = 1500
        dataSet['WEloCalc'].append(p1elo)

        try:    
            p2elo = playersElo[p2]
        except KeyError:
            p2elo = 1500
        dataSet['LEloCalc'].append(p2elo)  

        dataSet['ProbaElo'].append(compute_probability_elo(p1elo, p2elo))

        # colonne inutili (drop in unify_data())
        dataSet['ATP'].append(0)
        dataSet['Location'].append('Melbourne')
        dataSet['Tournament'].append('Australian Open')
        dataSet['Date'].append(pd.to_datetime('2020-01-20'))
        dataSet['Comment'].append('')
        dataSet['Wsets'].append(0)
        dataSet['Lsets'].append(0)
        dataSet['W1'].append(0)
        dataSet['L1'].append(0)
        dataSet['W2'].append(0)
        dataSet['L2'].append(0)
        dataSet['W3'].append(0)
        dataSet['L3'].append(0)
        dataSet['W4'].append(0)
        dataSet['L4'].append(0)
        dataSet['W5'].append(0)
        dataSet['L5'].append(0)
        dataSet['B365W'].append(0)
        dataSet['B365L'].append(0)
        dataSet['EXW'].append(0)
        dataSet['EXL'].append(0)
        dataSet['LBW'].append(0)
        dataSet['LBL'].append(0)
        dataSet['PSW'].append(0)
        dataSet['PSL'].append(0)
        dataSet['SJW'].append(0)
        dataSet['SJL'].append(0)
        dataSet['MaxW'].append(0)
        dataSet['MaxL'].append(0)
        dataSet['AvgW'].append(0)
        dataSet['AvgL'].append(0)
    return pd.DataFrame(data=dataSet)

def simulation(model,
               next_round, 
               features_to_drop=[]):
    rounds = ['1st Round', '2nd Round', '3rd Round', '4th Round', 'Quarterfinals', 'Semifinals', 'The Final']
    
    for r in rounds:
        #costruire il dataSet dati le coppie di giocatori e il round 
        dfRound = build_dataset(next_round, r)
        #unify_data
        round_to_test = unify_data(dfRound, features_to_drop)
        if 'Surface' not in features_to_drop:
            round_to_test['Surface__Clay'] = np.zeros(round_to_test.shape[0], dtype=int)
            round_to_test['Surface__Grass'] = np.zeros(round_to_test.shape[0], dtype=int)
        
        #prediction: prediction = model.predict(tournament_to_test[tournament_to_test['Round'] == i])
        prediction = model.predict(round_to_test)
        #print
        i = 0
        this_round = [(p1,p2) for p1,p2 in next_round]
        print()
        print(r)
        for p1, p2 in this_round:
            print('P1: ' + p1, 'P2: '+ p2, 'Wins:', 'P1' if prediction[i] else 'P2')
            i+=1
        #da prediction ricostruire le coppie di giocatori per il prossimo match 
        i = 0
        next_round = []
        for m1, m2 in zip(this_round[::2], this_round[1::2]):
            p1, p2 = m1
            p3, p4 = m2
            next_round.append((p1 if prediction[i] else p2, p3 if prediction[i+1] else p4))
            i+=2 

In [3]:
features_to_drop = ['Best of', 'Surface', 'Court', 'WHand', 'LHand', 'WBHand', 'LBHand']
X, Y, playersElo = preprocess_data(max_date = 2020, features_to_drop = features_to_drop, returnElo=True)

In [4]:
X.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 36848 entries, 0 to 36848
Data columns (total 16 columns):
Series      36848 non-null int64
Round       36848 non-null int64
P1Rank      36848 non-null int32
P2Rank      36848 non-null int32
P1Pts       36848 non-null int32
P2Pts       36848 non-null int32
WElo        36848 non-null float64
LElo        36848 non-null float64
ProbaElo    36848 non-null float64
P1Age       36848 non-null float64
P2Age       36848 non-null float64
AgeDiff     36848 non-null float64
RankDiff    36848 non-null int32
PtsDiff     36848 non-null int32
Top10P1     36848 non-null int32
Top10P2     36848 non-null int32
dtypes: float64(6), int32(8), int64(2)
memory usage: 3.7 MB


In [5]:
model = build_best_model(X,Y)

In [14]:
# Dati giocatori per riempire il dataSet
playersdata = pd.read_csv("data/playersdata.csv", 
                          encoding='utf-8-sig',  
                         parse_dates=['BirthDate'])
playersdata.index = playersdata['Player']
#playersdata.drop(columns='Player', inplace=True)
playersdata.info()

rank_points = {k:{'Rank': 0, 'Points': 0} for k in playersdata.index}

<class 'pandas.core.frame.DataFrame'>
Index: 746 entries, Harris A. to Dutra Da Silva R.
Data columns (total 4 columns):
Player       746 non-null object
BirthDate    540 non-null datetime64[ns]
Hand         540 non-null object
BackHand     467 non-null float64
dtypes: datetime64[ns](1), float64(1), object(2)
memory usage: 29.1+ KB


In [27]:
playersdata.iloc[332]

Player              Gojowczyk P.
BirthDate    1989-07-15 00:00:00
Hand                           R
BackHand                       2
Name: Gojowczyk P., dtype: object

In [7]:
scraping_rank_points('http://www.stevegtennis.com/men-atp-rankings/2020-01-13/1/all/')
scraping_rank_points('http://www.stevegtennis.com/men-atp-rankings/2020-01-13/301/all/')
scraping_rank_points('http://www.stevegtennis.com/men-atp-rankings/2020-01-13/601/all/')

In [8]:
firstRound = scraping_players()

In [48]:
simulation(model, firstRound, features_to_drop)

[('Nadal R.', 'Dellien H.'), ('Delbonis F.', 'Sousa J.'), ('Eubanks C.', 'Gojowczyk P.'), ('Kovalik J.', 'Carreno-Busta P.'), ('Kyrgios N.', 'Sonego L.'), ('Cuevas P.', 'Simon G.'), ('Uchiyama Y.', 'Ymer M.'), ('Martinez V.M.', 'Khachanov K.'), ('Monfils G.', 'Lu H.Y.'), ('Karlovic I.', 'Pospisil V.'), ('Duckworth J.', 'Bedene A.'), ('Gulbis E.', 'Auger-Aliassime F.'), ('Fritz T.', 'Griekspoor T.'), ('Ivashka I.', 'Anderson K.'), ('Bolt A.', 'Ramos-Vinolas A.'), ('Mannarino A.', 'Thiem D.'), ('Medvedev D.', 'Tiafoe F.'), ('Koepfer D.', 'Martinez P.'), ('Gaston H.', 'Munar J.'), ('Popyrin A.', 'Tsonga J.W.'), ('Isner J.', 'Monteiro T.'), ('Tabilo A.', 'Galan D.E.'), ('Kecmanovic M.', 'Seppi A.'), ('Dzumhur D.', 'Wawrinka S.'), ('Goffin D.', 'Chardy J.'), ('Herbert P.H.', 'Norrie C.'), ('Sugita Y.', 'Benchetrit E.'), ("O'Connell C.", 'Rublev A.'), ('Basilashvili N.', 'Kwon S.W.'), ('Verdasco F.', 'Donskoy E.'), ('Ruud C.', 'Gerasimov E.'), ('Cecchinato M.', 'Zverev A.'), ('Berrettini M.'


1st Round
P1: Nadal R. P2: Dellien H. Wins: P1
P1: Delbonis F. P2: Sousa J. Wins: P1
P1: Eubanks C. P2: Gojowczyk P. Wins: P1
P1: Kovalik J. P2: Carreno-Busta P. Wins: P1
P1: Kyrgios N. P2: Sonego L. Wins: P1
P1: Cuevas P. P2: Simon G. Wins: P1
P1: Uchiyama Y. P2: Ymer M. Wins: P1
P1: Martinez V.M. P2: Khachanov K. Wins: P1
P1: Monfils G. P2: Lu H.Y. Wins: P1
P1: Karlovic I. P2: Pospisil V. Wins: P1
P1: Duckworth J. P2: Bedene A. Wins: P1
P1: Gulbis E. P2: Auger-Aliassime F. Wins: P1
P1: Fritz T. P2: Griekspoor T. Wins: P1
P1: Ivashka I. P2: Anderson K. Wins: P1
P1: Bolt A. P2: Ramos-Vinolas A. Wins: P1
P1: Mannarino A. P2: Thiem D. Wins: P1
P1: Medvedev D. P2: Tiafoe F. Wins: P1
P1: Koepfer D. P2: Martinez P. Wins: P1
P1: Gaston H. P2: Munar J. Wins: P1
P1: Popyrin A. P2: Tsonga J.W. Wins: P1
P1: Isner J. P2: Monteiro T. Wins: P1
P1: Tabilo A. P2: Galan D.E. Wins: P1
P1: Kecmanovic M. P2: Seppi A. Wins: P1
P1: Dzumhur D. P2: Wawrinka S. Wins: P1
P1: Goffin D. P2: Chardy J. Wins: P1
P

IndexError: index 63 is out of bounds for axis 0 with size 63