In [1]:
# libraries
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from time import sleep
from bs4 import BeautifulSoup
from requests import get
# our modules
from preprocessing import *

In [2]:
# Feature names forwarded to preprocess_data() and simulation() further down;
# presumably the columns to leave out of the model input — confirm in the
# preprocessing module.
features_to_drop = []

In [3]:
# Build the training data (the 2011 and 2020 arguments are presumably the first
# and last season to load) and keep the per-player Elo ratings that
# build_dataset() reads when it assembles a simulated round.
X, Y, playersElo = preprocess_data(2011, 2020, features_to_drop, returnElo=True)
# Keep the fitted model: the last cell calls simulation(rf, ...), so the return
# value must be bound to `rf` instead of being discarded.
# NOTE: build_best_model() is defined in the following cell, so that cell has to
# be executed before this line can run.
rf = build_best_model(X, Y)

KeyboardInterrupt: 

In [None]:
# Build the model
def build_best_model(X, Y, random_state=None):
    """Fit and return the random-forest classifier used by the simulation.

    Parameters
    ----------
    X : array-like
        Training features, passed unchanged to ``RandomForestClassifier.fit``.
    Y : array-like
        Training targets, passed unchanged to ``RandomForestClassifier.fit``.
    random_state : int, RandomState instance or None, optional
        Seed forwarded to the forest. The default (``None``) keeps the previous
        unseeded behaviour; pass an int to make the fit reproducible.

    Returns
    -------
    RandomForestClassifier
        The fitted estimator.
    """
    # Hyper-parameters are hard-coded; the function name suggests they come from
    # a tuning step that is not part of this notebook.
    rf = RandomForestClassifier(n_estimators=250,
                                bootstrap=True,
                                criterion='entropy',
                                max_features='log2',
                                max_depth=15,
                                n_jobs=-1,
                                random_state=random_state)
    rf.fit(X, Y)
    return rf

def scraping_rank_points(url):
    """Scrape one ranking page and record each player's rank and points.

    Downloads ``url``, walks every row after the first one of the table whose
    id is ``matchs_info`` and stores ``{'Rank': ..., 'Points': ...}`` in the
    notebook-level ``rank_points`` dict under the player's normalised name.
    Rank and points are stored exactly as scraped; no numeric conversion is
    done here.

    Parameters
    ----------
    url : str
        Address of the ranking page to download.

    Returns
    -------
    None
        The function only updates ``rank_points`` in place.
    """
    # Spellings produced by uniform_name() for some players, mapped to the
    # spelling that should be used as key (presumably the one found in the
    # playersdata index — verify against the data file).
    name_fixes = {
        'Busta C.P.': 'Carreno-Busta P.',
        'Vinolas R.A.': 'Ramos-Vinolas A.',
        'Monteiro M.T.': 'Monteiro T.',
        'Clar M.A.J.': 'Munar J.',
        'Potro D.M.J.': 'Del Potro J.M.',
        'McDonald M.': 'Mcdonald M.',
        'Galan E.D.': 'Galan D.E.',
        'Lee D.': 'Lee D.H.',
        'Kubler M.J.': 'Kubler J.',
        'Kwiatkowski T.': 'Kwiatkowski T.S.',
        'Fernandez H.J.': 'Hernandez-Fernandez J.',
        'Silva D.D.R.': 'Dutra Silva R.',
    }

    r = get(url)
    page = BeautifulSoup(r.content)
    table = page.find(id='matchs_info')
    # The first row is skipped — presumably the table header.
    for tr in table.find_all('tr')[1:]:
        rank = tr.contents[0].contents[0]
        points = tr.contents[3].contents[0]
        name = uniform_name(tr.contents[1].contents[0].contents[0])
        name = name_fixes.get(name, name)
        # Store the entry for every row. The store used to live in the `else`
        # branch of the rename chain, so players whose name had just been
        # corrected were never written to rank_points.
        rank_points[name] = {'Rank': rank, 'Points': points}

# Scraping players of Australian Open 2020
def scraping_players():
    """Scrape the men's singles first-round pairings from the draw page.

    Opens the Australian Open draw page in Chrome, reads every match of the
    first-round column and returns the two player names of each match after
    passing them through ``uniform_name()``. A fixed set of entries is then
    overwritten with hand-written pairs.

    Returns
    -------
    list of tuple
        One ``(player, player)`` pair per first-round match, in page order.

    Raises
    ------
    IndexError
        If fewer matches were scraped than the hard-coded corrections expect.
    """
    # Local import keeps the block self-contained; selenium is already a
    # dependency of this notebook.
    from selenium.webdriver.common.by import By

    driver = webdriver.Chrome()
    try:
        driver.get('https://ausopen.com/draws#!mens-singles')
        # Fixed pause before reading the page — presumably to let the draw
        # render client-side.
        sleep(5)
        # NOTE(review): 'columm-1' looks like a typo but may well be the class
        # name used by the site's markup; left unchanged.
        round1 = driver.find_element(By.CLASS_NAME, 'columm-1')
        matches = round1.find_elements(By.CLASS_NAME, 'match-teams')
        firstRound = []

        for m in matches:
            players = m.find_elements(By.CLASS_NAME, 'team-detail__players')
            firstRound.append((
                uniform_name(players[0].find_element(By.CLASS_NAME, 'player-full-name').get_attribute('innerHTML')),
                uniform_name(players[1].find_element(By.CLASS_NAME, 'player-full-name').get_attribute('innerHTML'))))
    finally:
        # Always shut the browser session down, even when a lookup above fails.
        driver.quit()

    # Manual corrections, needed because uniform_name() does not handle every
    # name correctly. Keys are positions in the scraped first-round list.
    manual_fixes = {
        3: ('Kovalik J.', 'Carreno-Busta P.'),
        11: ('Gulbis E.', 'Auger-Aliassime F.'),
        14: ('Bolt A.', 'Ramos-Vinolas A.'),
        19: ('Popyrin A.', 'Tsonga J.W.'),
        21: ('Tabilo A.', 'Galan D.E.'),
        25: ('Herbert P.H.', 'Norrie C.'),
        28: ('Basilashvili N.', 'Kwon S.W.'),
        34: ('Carballes Baena R.', 'Berankis R.'),
        36: ('Pella G.', 'Smith J.P.'),
        43: ('Londero J.I.', 'Dimitrov G.'),
        52: ('Paire B.', 'Stebe C.M.'),
        55: ('Lopez F.', 'Bautista Agut R.'),
        57: ('Davidovich Fokina A.', 'Gombos N.'),
        60: ('Evans D.', 'Mcdonald M.'),
        63: ('Struff J.L.', 'Djokovic N.'),
    }
    for position, pairing in manual_fixes.items():
        firstRound[position] = pairing

    return firstRound

def build_dataset(playersInRound, nRound):
    """Assemble a match table for one simulated round.

    Each ``(p1, p2)`` pair becomes one row: ``p1`` is written to the 'Winner'
    column and ``p2`` to the 'Loser' column, together with their rank, points,
    personal data and Elo rating. The remaining columns are filled with
    placeholder values.

    Parameters
    ----------
    playersInRound : iterable of (str, str)
        The pairings of the round.
    nRound : str
        Label written to the 'Round' column of every row.

    Returns
    -------
    pandas.DataFrame
        One row per pairing, 51 columns in a fixed order.

    Notes
    -----
    Reads the notebook-level ``rank_points``, ``playersdata`` and
    ``playersElo`` objects and calls ``compute_probability_elo``.
    """
    columns = (
        'Series', 'Court', 'Surface', 'Round', 'Best of', 'Winner', 'Loser',
        'WRank', 'LRank', 'WPts', 'LPts', 'WBD', 'WHand', 'WBHand', 'LBD',
        'LHand', 'LBHand', 'WEloCalc', 'LEloCalc', 'ProbaElo', 'ATP', 'Location',
        'Tournament', 'Date', 'Comment', 'Wsets', 'Lsets', 'W1', 'L1', 'W2',
        'L2', 'W3', 'L3', 'W4', 'L4', 'W5', 'L5', 'B365W', 'B365L',
        'EXW', 'EXL', 'LBW', 'LBL', 'PSW', 'PSL', 'SJW', 'SJL',
        'MaxW', 'MaxL', 'AvgW', 'AvgL',
    )
    # Placeholder-only columns (the original comment says they are dropped
    # later in unify_data()).
    blank_columns = ('Location', 'Tournament', 'Comment')
    zero_columns = (
        'ATP', 'Wsets', 'Lsets', 'W1', 'L1', 'W2', 'L2', 'W3', 'L3', 'W4', 'L4',
        'W5', 'L5', 'B365W', 'B365L', 'EXW', 'EXL', 'LBW', 'LBL', 'PSW', 'PSL',
        'SJW', 'SJL', 'MaxW', 'MaxL', 'AvgW', 'AvgL',
    )

    def ranking_value(player, field):
        # Missing ranking information becomes NaN.
        try:
            return rank_points[player][field]
        except KeyError:
            return np.nan

    def elo_of(player):
        # Players without a computed Elo get 1500.
        try:
            return playersElo[player]
        except KeyError:
            return 1500

    table = {column: [] for column in columns}

    for first, second in playersInRound:
        row = {
            'Series': 'Grand Slam',
            'Court': 'Outdoor',
            'Surface': 'Hard',
            'Round': nRound,
            'Best of': 5,
            'Winner': first,
            'Loser': second,
            'WRank': ranking_value(first, 'Rank'),
            'LRank': ranking_value(second, 'Rank'),
            'WPts': ranking_value(first, 'Points'),
            'LPts': ranking_value(second, 'Points'),
        }

        # Personal data; a player missing from playersdata raises KeyError.
        for side, player in (('W', first), ('L', second)):
            profile = playersdata.loc[player]
            row[side + 'BD'] = profile['BirthDate']
            row[side + 'Hand'] = profile['Hand']
            row[side + 'BHand'] = profile['BackHand']

        first_elo = elo_of(first)
        second_elo = elo_of(second)
        row['WEloCalc'] = first_elo
        row['LEloCalc'] = second_elo
        row['ProbaElo'] = compute_probability_elo(first_elo, second_elo)

        row['Date'] = pd.to_datetime('2020-01-20')
        for column in blank_columns:
            row[column] = ''
        for column in zero_columns:
            row[column] = 0

        for column in columns:
            table[column].append(row[column])

    return pd.DataFrame(table)

def simulation(model,
               next_round,
               features_to_drop=None):
    """Play a knockout draw round by round and print the predicted winners.

    For each round the pairings are turned into a data set with
    ``build_dataset()``, prepared with ``unify_data()`` and fed to
    ``model.predict``. A truthy prediction means the first player of the pair
    advances, otherwise the second one. Winners of consecutive matches are
    paired up for the next round.

    Parameters
    ----------
    model : object with a ``predict`` method
        Fitted classifier; must return one prediction per match.
    next_round : iterable of (str, str)
        Pairings of the first round to simulate.
    features_to_drop : list, optional
        Passed unchanged to ``unify_data()``. Defaults to an empty list.

    Returns
    -------
    None
        Results are printed, not returned.
    """
    # A fresh list per call instead of a shared mutable default.
    if features_to_drop is None:
        features_to_drop = []

    rounds = ['1st Round', '2nd Round', '3rd Round', '4th Round', 'Quarterfinals', 'Semifinals', 'The Final']

    for r in rounds:
        this_round = [(p1, p2) for p1, p2 in next_round]
        # Stop once no pairing is left (empty input or a draw smaller than
        # seven rounds) instead of predicting on an empty data set.
        if not this_round:
            break

        # Build the data set from the pairings and the round label.
        dfRound = build_dataset(this_round, r)
        # Same preparation step as for the training data.
        round_to_test = unify_data(dfRound, features_to_drop)
        round_to_test.info()
        prediction = model.predict(round_to_test)

        print(r)
        winners = []
        for i, (p1, p2) in enumerate(this_round):
            print('P1: ' + p1, 'P2: '+ p2, 'Wins:', 'P1' if prediction[i] else 'P2')
            winners.append(p1 if prediction[i] else p2)

        # Pair the winners of matches 0/1, 2/3, ... for the next round; with an
        # odd number of matches the last winner has no opponent and is dropped
        # (this is what ends the loop after the final).
        next_round = list(zip(winners[::2], winners[1::2]))

In [None]:
# Player reference data (birth date, hand, backhand) used to fill the data set
playersdata = pd.read_csv(
    "data/playersdata.csv",
    encoding='utf-8-sig',
    parse_dates=['BirthDate'],
)
playersdata.info()
# Index the rows by player name; the 'Player' column itself is kept and the
# index is left unnamed.
playersdata = playersdata.set_index('Player', drop=False)
playersdata.index.name = None

# Every known player starts with rank 0 and 0 points; scraping_rank_points()
# replaces these entries with the scraped values.
rank_points = {player: {'Rank': 0, 'Points': 0} for player in playersdata.index}

In [None]:
# Ranking pages dated 2020-01-13; the number in each path (1, 301, 601) is
# presumably the first rank listed on that page.
for ranking_page in (
    'http://www.stevegtennis.com/men-atp-rankings/2020-01-13/1/all/',
    'http://www.stevegtennis.com/men-atp-rankings/2020-01-13/301/all/',
    'http://www.stevegtennis.com/men-atp-rankings/2020-01-13/601/all/',
):
    scraping_rank_points(ranking_page)

In [None]:
# Scrape the first-round pairings of the draw (opens a Chrome window through
# Selenium and needs network access).
firstRound = scraping_players()

In [None]:
# Play out the draw round by round, printing the predicted winner of each match.
# NOTE(review): `rf` must already hold a fitted model in the notebook namespace
# (presumably the return value of build_best_model) — confirm it is assigned
# before running this cell.
simulation(rf, firstRound, features_to_drop)