# 3. Building model
Here we build a model for each of the two players in a match and use those models to predict the match result.

In [1]:
import numpy as np
import pandas as pd

### Loading list of players

In [2]:
# pd.Series.from_csv was deprecated in pandas 0.21 and removed in pandas 1.0.
# read_csv with from_csv's old defaults (no header row, first column as index,
# index parsed for dates) followed by taking the single data column gives the
# same Series; the names are cleared exactly as from_csv did for header=None.
playersList = pd.read_csv(
    "Data/Players/PlayersList.csv",
    index_col=0,
    header=None,
    parse_dates=True,
).iloc[:, 0]
playersList.name = None
playersList.index.name = None

### Loading shortened history of games

In [3]:
# Read the raw match history
games = pd.read_csv("Data/Raw.csv", encoding="ISO-8859-1", low_memory=False)

# The trailing 29 columns hold betting odds data; they are discarded for now
gamesNrCol = len(games.columns)
bettingOddsColumns = games.columns[np.arange(gamesNrCol - 29, gamesNrCol)]
games = games.drop(bettingOddsColumns, axis=1)

# Missing values become 0
games = games.fillna(0)

# Report how many columns are left
gamesNrCol = len(games.columns)
print("# of columns {}".format(gamesNrCol))

# Reverse the row order, then keep the first 250 rows of the reversed table
games = games.iloc[::-1].iloc[:250]

games.Date = pd.to_datetime(games.Date, format='%d/%m/%Y')

# of columns 25


## Building a model for a specific player, starting from a specific match

In [4]:
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, ExtraTreesClassifier
from sklearn.tree import DecisionTreeClassifier


def playerBuildModel(player, startsWithIndex, random_state=None):
    """Fit a random-forest classifier on one player's stored match dataset.

    Parameters
    ----------
    player : str
        Player name, inserted verbatim into the path ``Data/Players/{player}csv``.
        NOTE(review): there is no "." before "csv"; this presumably relies on
        player names ending with a dot (e.g. "Zverev A.") - confirm.
    startsWithIndex : int
        Index label of the match of interest. Rows are kept from the label
        ``startsWithIndex - 1`` to the end of the frame.
    random_state : int, RandomState instance or None, optional
        Passed to ``RandomForestClassifier``. The default (None) keeps the
        original unseeded behaviour; pass an int for reproducible models.

    Returns
    -------
    tuple
        ``(clf, playerDataset, matchesPlayed)``: the fitted classifier, the
        filtered dataset it was trained on and the number of rows in it.

    Notes
    -----
    The printed "accuracy" is ``clf.score`` on the very rows the model was
    fitted on, i.e. training accuracy, not a held-out estimate.
    """
    playerDataset = pd.read_csv("Data/Players/{}csv".format(player), index_col=0)
    playerDataset.Date = pd.to_datetime(playerDataset.Date, format='%Y-%m-%d')

    # encode court / surface labels as numbers
    playerDataset = prepareDataset(playerDataset)

    # keep only the rows from label (startsWithIndex - 1) onward
    # NOTE(review): this is meant to keep only matches played before the match
    # of interest, which depends on how the stored index is ordered - confirm.
    playerDataset = playerDataset.loc[(startsWithIndex-1):]

    # how many matches are left for this player
    matchesPlayed = len(playerDataset)

    columns = playerDataset.columns

    # features: everything except the first two columns and the last one;
    # target: the last column
    X = playerDataset[columns[2:-1]]
    y = playerDataset[columns[-1]]

    # An AdaBoostClassifier over depth-1 decision trees (algorithm="SAMME",
    # n_estimators=200) was tried here as an alternative.
    clf = RandomForestClassifier(random_state=random_state)
    clf.fit(X, y)

    print("Model accuracy for {} is: {} with {} matches".format(player, clf.score(X,y), matchesPlayed))

    return (clf, playerDataset, matchesPlayed)

In [5]:
def prepareDataset(dataset):
    """Replace court and surface labels with integer codes, in place.

    Court labels: 'Indoor' -> 0, 'Outdoor' -> 1.
    Surface labels: 'Clay' -> 0, 'Carpet' -> 1, 'Grass' -> 2, 'Hard' -> 3.

    The replacement is applied to the whole frame (any cell equal to one of
    the labels is replaced). The same, mutated ``dataset`` object is returned.
    """
    courtLabels = ('Indoor', 'Outdoor')
    surfaceLabels = ('Clay', 'Carpet', 'Grass', 'Hard')

    # the position of a label inside its tuple is its numeric code
    for labels in (courtLabels, surfaceLabels):
        for code, label in enumerate(labels):
            dataset.replace(label, code, inplace=True)

    return dataset

# Encode the court / surface labels of the full match table as well.
# prepareDataset replaces the labels in place, so from here on any cell of
# `games` that held one of those labels holds its integer code instead.
games=prepareDataset(games)

In [6]:
# Label -> code tables; same codes as the ones used by prepareDataset.
_COURT_CODES = {'Indoor': 0, 'Outdoor': 1}
_SURFACE_CODES = {'Clay': 0, 'Carpet': 1, 'Grass': 2, 'Hard': 3}


def _encodeCategory(value, codes, default):
    """Map a label to its code; pass an already-encoded valid code through."""
    if isinstance(value, str):
        return codes.get(value, default)
    if value in codes.values():
        return int(value)
    return default


def _safeRatio(numerator, denominator):
    """Return numerator / denominator, or 0 when the ratio is undefined."""
    if not denominator or np.isnan(denominator):
        return 0
    ratio = numerator / denominator
    return 0 if np.isnan(ratio) else ratio


def buildInputDataset(match, playerName, playerGames, predict = False):
    """Build one feature row describing `match` from `playerName`'s point of view.

    Parameters
    ----------
    match : row-like (e.g. pandas.Series)
        Must expose ATP, Date, Winner, WRank, LRank, Court and Surface.
        Court / Surface may be either the string labels or the integer codes
        produced by prepareDataset.
    playerName : str
        The player the row is built for.
    playerGames : pandas.DataFrame
        The player's earlier games; the first five rows are used as the
        "last five games". With ``predict=False`` the rows are raw games
        (Winner, Wsets, Lsets, W1..W5, L1..L5, Date); with ``predict=True``
        they are rows of an already built player dataset
        ("{i}LastGame", "{i}LastGameDiff", "{i}LastGameGamesDiff",
        "{i}LastGameNrOfGames", Date).
    predict : bool, optional
        False: the row starts with ATP and Date and ends with the result
        label (1 = player won, 0 = player lost).
        True: those three entries are omitted, leaving model input only.

    Returns
    -------
    list
        [ATP, Date,] rank, court, surface, then for each of the five games
        (days since that game, result, sets ratio, games ratio, number of
        games), then the number of matches in this tournament [, result].

    Raises
    ------
    IndexError
        If `playerGames` has fewer than five rows.
    """
    toReturn = []
    if not(predict):
        toReturn.append(match.ATP)
        toReturn.append(match.Date)

    # player rank at this match
    rank = match.WRank if match.Winner == playerName else match.LRank
    toReturn.append(rank)

    # Court and surface. The values may already be numeric codes (the notebook
    # runs prepareDataset on `games`), in which case comparing against the
    # string labels would always fall through to the default code.
    toReturn.append(_encodeCategory(match.Court, _COURT_CODES, 1))
    toReturn.append(_encodeCategory(match.Surface, _SURFACE_CODES, 3))

    # features of the last 5 games
    matchDate = match.Date
    for i in range(5):
        # i-th most recent game
        game = playerGames.iloc[i]

        if not(predict):
            # result of that game
            lastGameResult= 1 if game.Winner == playerName else 0

            # share of sets won by the player (0 when no sets were played)
            sets = game.Wsets + game.Lsets
            wonSets = game.Wsets if game.Winner == playerName else game.Lsets
            setsDiff = _safeRatio(wonSets, sets)

            # share of games won by the player (0 when no games were played)
            winnerGames = game.W1 + game.W2 + game.W3 + game.W4 + game.W5
            loserGames = game.L1 + game.L2 + game.L3 + game.L4 + game.L5
            nrOfGames = winnerGames + loserGames
            nrOfWonGames = winnerGames if game.Winner == playerName else loserGames
            gameDiff = _safeRatio(nrOfWonGames, nrOfGames)

        else:
            lastGameResult = game["{}LastGame".format(i+1)]
            setsDiff = game["{}LastGameDiff".format(i+1)]
            gameDiff = game["{}LastGameGamesDiff".format(i+1)]
            nrOfGames = game["{}LastGameNrOfGames".format(i+1)]

        lastGameDate = game.Date
        daysFree = matchDate - lastGameDate

        # aggregate the history
        toReturn.append(daysFree.days)
        toReturn.append(lastGameResult)
        toReturn.append(setsDiff)
        toReturn.append(gameDiff)
        toReturn.append(nrOfGames)

        # the next gap is measured from this game
        matchDate = lastGameDate

    # number of the player's games in the same tournament (ATP) since 1 January
    # of the match year
    matchYear = match.Date.year
    Tournament = playerGames[ (playerGames['ATP'] == match.ATP) & (playerGames['Date'] >= '{}-1-1'.format(matchYear)) ]
    matchesInTour = len(Tournament)
    toReturn.append(matchesInTour)

    # when building a player dataset (not predicting) append the result label
    if not(predict):
        # player win = 1, lose = 0
        toReturn.append( int(1) if match.Winner == playerName else int(0))

    return toReturn

## Testing the algorithm

In [7]:
def testMatchResults(games, testSample=50, probDiff=0.1, minMatches=100):
    """Predict the first `testSample` matches of `games` and collect the outcomes.

    For every match a separate model is built for each of the two players
    (playerBuildModel), each model estimates its player's probability of
    winning, and the player with the higher probability is the predicted
    winner. A match is only recorded when both players are in `playersList`,
    both have at least `minMatches` rows of history and the two probabilities
    differ by more than `probDiff`.

    Parameters
    ----------
    games : pandas.DataFrame
        Matches to test, one per row, with Winner and Loser columns plus
        everything buildInputDataset reads from a match.
    testSample : int, optional
        Number of leading rows of `games` to test (capped at ``len(games)``).
    probDiff : float, optional
        Minimum absolute difference between the two win probabilities.
    minMatches : int, optional
        Minimum number of history rows required for each player.

    Returns
    -------
    tuple of lists
        ``(matchResults, matchPredResults, matchProb)``.
        matchResults holds the true outcome and is always 0, because the first
        listed player is the actual winner; matchPredResults holds 0 when the
        actual winner was predicted and 1 otherwise; matchProb holds the
        ``[winner probability, loser probability]`` pair of each recorded match.
    """
    # never read past the end of `games`
    nrMatches = min(testSample, len(games))

    # players we have a stored dataset for
    knownPlayers = playersList.values

    # known results of matches
    matchResults = []
    # predicted results of matches
    matchPredResults = []
    # predicted probabilities
    matchProb = []

    for i in range(nrMatches):
        match = games.iloc[i]
        players= [match.Winner, match.Loser]

        print('='*80)
        print("Match: {} vs. {}".format(players[0], players[1]))
        print()
        # skip the match when we have no data about one of the two players
        if players[0] not in knownPlayers or players[1] not in knownPlayers:
            continue

        # probability of winning, one entry per player
        playersProbab = []
        matches =[]
        for player in players:

            # build a model for this player from the matches before this one
            playerModel, playerDataset, matchesPlayed = playerBuildModel(player, match.name)
            matches.append(matchesPlayed)
            # model input for this match
            inputData = buildInputDataset(match, player, playerDataset,predict =True)

            prob = playerModel.predict_proba([inputData])

            # predict_proba has one column per class the model was fitted on,
            # ordered as in classes_. A player whose history contains only
            # wins or only losses yields a single column, so look the "win"
            # class (1) up instead of assuming it sits at position 1.
            classes = list(playerModel.classes_)
            winProb = prob[0][classes.index(1)] if 1 in classes else 0.0

            playersProbab.append(winProb)

        if(matches[0] < minMatches or matches[1] < minMatches):
            continue

        # compare the two players' probabilities of winning
        diff = playersProbab[0] - playersProbab[1]
        if abs(diff) > probDiff:
            # actual result: players[0] is the match winner
            matchResults.append(0)
            print("Winner is {}".format(players[0]))

            if diff > 0:
                matchPredResults.append(0)
                print("Predicted winner is {}".format(players[0]))
            else:
                matchPredResults.append(1)
                print("Predicted winner is {}".format(players[1]))

            matchProb.append(playersProbab)

    return matchResults, matchPredResults, matchProb

## Analysis of test results

In [8]:
from sklearn import metrics

# Evaluate the first 150 matches; only matches where both players have at
# least 100 rows of history and the win probabilities differ by more than 0.2
# are recorded.
matchResults, matchPredResults, matchProb = testMatchResults(
    games,
    testSample=150,
    probDiff=0.2,
    minMatches=100,
)

# Every recorded true label is 0 (the first listed player is the actual
# winner), so this is the share of recorded matches whose winner was predicted.
test_score = metrics.accuracy_score(matchResults, matchPredResults)
print(test_score)

# number of matches that passed the filters
print(len(matchResults))

Match: Zverev A. vs. Wawrinka S.

Model accuracy for Zverev A. is: 0.978494623655914 with 93 matches
Model accuracy for Wawrinka S. is: 0.9890453834115805 with 639 matches
Match: Wawrinka S. vs. Bautista Agut R.

Model accuracy for Wawrinka S. is: 0.9890282131661442 with 638 matches
Model accuracy for Bautista Agut R. is: 0.96 with 25 matches
Match: Zverev A. vs. Berdych T.

Model accuracy for Zverev A. is: 0.9782608695652174 with 92 matches
Model accuracy for Berdych T. is: 0.9912390488110138 with 799 matches
Match: Wawrinka S. vs. Troicki V.

Model accuracy for Wawrinka S. is: 0.9921507064364207 with 637 matches
Model accuracy for Troicki V. is: 0.992822966507177 with 418 matches
Winner is Wawrinka S.
Predicted winner is Wawrinka S.
Match: Zverev A. vs. Youzhny M.

Model accuracy for Zverev A. is: 0.9560439560439561 with 91 matches
Model accuracy for Youzhny M. is: 0.9863692688971499 with 807 matches
Match: Bautista Agut R. vs. Sousa J.

Model accuracy for Bautista Agut R. is: 1.0 wi

## Finding the 98% accuracy parameters

In [9]:
# clear_output is part of the public IPython.display API; importing it from
# IPython.core.display relies on a deprecated internal path.
from IPython.display import clear_output

# parameter grid: probability-difference thresholds and minimum history sizes
probabDiffs = np.arange(0, 0.3, 0.02)
minMatchess = range(50, 250, 10)

x=[]  # probDiff of each run
y=[]  # minMatches of each run
z=[]  # accuracy of each run
k=[]  # number of matches recorded in each run
for i in probabDiffs:
    for j in minMatchess:
        # NOTE: every run rebuilds the per-player models for all 200 matches,
        # so this cell is very slow.
        matchResults, matchPredResults, matchProb = testMatchResults(games, testSample=200, probDiff=i, minMatches=j)
        test_score = metrics.accuracy_score(matchResults, matchPredResults)
        x.append(i)
        y.append(j)
        z.append(test_score)
        k.append(len(matchResults))
        # drop the per-match log printed by testMatchResults
        clear_output()
        

In [18]:

from mpl_toolkits.mplot3d import Axes3D
import matplotlib.pyplot as plt
%matplotlib qt

# Accuracy over the (probDiff, minMatches) grid collected in x, y and z
fig = plt.figure()
ax = fig.add_subplot(111, projection='3d')
ax.plot_trisurf(x, y, z)
ax.set_xlabel("probDiff")
ax.set_ylabel("minMatches")
ax.set_zlabel("accuracy")
ax.set_title("Prediction accuracy by probDiff and minMatches")

plt.show()

<mpl_toolkits.mplot3d.art3d.Poly3DCollection at 0x7f37098d2978>

In [21]:
# Number of recorded matches over the (probDiff, minMatches) grid (x, y, k)
fig = plt.figure()
ax = fig.add_subplot(111, projection='3d')
ax.plot_trisurf(x, y, k)
ax.set_xlabel("probDiff")
ax.set_ylabel("minMatches")
ax.set_zlabel("recorded matches")
ax.set_title("Recorded matches by probDiff and minMatches")
plt.show()