# 1. Define a problem

Based on the datasets available at http://www.tennis-data.co.uk/data.php, I will try to predict the winner of a particular match.

# 2. Prepare data

The raw data structure is described in RawMetaData.txt. From it I will create a dataset in which each row describes what happened before one target event (a match). The dataset has the following features (listed in the "Creating players history" section below):


## Loading raw dataset

In [1]:
#loading clear_output, used later to refresh the progress printout
from IPython.core.display import clear_output
#importing numpy and pandas
import numpy as np
import pandas as pd
#reading the raw match data (the file is decoded as ISO-8859-1)
raw = pd.read_csv("Data/Raw.csv",encoding = "ISO-8859-1", low_memory=False)

## Data overview

In [2]:
# The trailing 29 columns hold betting odds data, which is not used for now.
raw_nr_col = len(raw.columns)
raw.drop(columns=raw.columns[np.arange(raw_nr_col - 29, raw_nr_col)], inplace=True)

# Replace every missing value with 0.
raw.fillna(0, inplace=True)

# Report how many columns are left after the drop.
nr_col = len(raw.columns)
print("# of columns {}".format(nr_col))

# Flip the row order of the dataset.
raw = raw.iloc[::-1]

# Parse the day/month/year strings in Date into pandas datetimes.
raw["Date"] = pd.to_datetime(raw["Date"], format='%d/%m/%Y')

# Games per set (W1..W5, L1..L5) and set totals are stored as integers.
intRawCol = [
    "W1", "W2", "W3", "W4", "W5",
    "L1", "L2", "L3", "L4", "L5",
    "Wsets", "Lsets",
]
raw[intRawCol] = raw[intRawCol].astype(int)

# of columns 25


## Building player dataset

In [3]:
def _safeRatio(numerator, denominator):
    """Return numerator / denominator, or NaN when the denominator is 0."""
    if denominator == 0:
        return float("nan")
    return numerator / denominator


def buildInputDataset(match, playerName, playerGames, predict = False):
    """Build the feature row describing one match from a player's point of view.

    Parameters
    ----------
    match : row-like object
        The target match. Read attributes: ATP, Date, Winner, WRank, LRank,
        Court, Surface.
    playerName : str
        Player for whom the row is built.
    playerGames : pandas.DataFrame
        The player's earlier games. The first five rows (by position) are used
        as the "last 5 games", so at least five rows are required.
        NOTE(review): rows are presumably ordered most recent first - confirm
        against the caller.
    predict : bool, default False
        When False, per-game features are computed from the raw score columns
        (Winner, Wsets, Lsets, W1..W5, L1..L5) and the row is wrapped with
        ATP/Date at the front and the match result at the end.
        When True, per-game features are read from the "{n}LastGame*" columns
        of `playerGames` and ATP, Date and the result are left out.
        NOTE(review): in that mode `playerGames` is presumably an already
        built player dataset - confirm against the caller.

    Returns
    -------
    list
        [ATP, Date,] rank, court, surface, then for each of the 5 games:
        days since that game, result, sets ratio, games ratio, number of
        games; then matches in tournament [, result].
        The sets/games ratios are NaN for a game with no recorded sets/games.
    """
    toReturn = []
    if not predict:
        toReturn.append(match.ATP)
        toReturn.append(match.Date)

    #player rank, at the specific match
    toReturn.append(match.WRank if match.Winner == playerName else match.LRank)

    #type of court: Indoor=0, anything else=1
    toReturn.append(0 if match.Court == 'Indoor' else 1)

    #type of surface: Clay=0, Carpet=1, Grass=2, anything else=3
    if match.Surface == 'Clay':
        surface = 0
    elif match.Surface == 'Carpet':
        surface = 1
    elif match.Surface == 'Grass':
        surface = 2
    else:
        surface = 3
    toReturn.append(surface)

    #setting results for the last 5 games
    matchDate = match.Date
    for i in range(5):
        #result from last i-th game
        game = playerGames.iloc[i]

        if not predict:
            playerWon = game.Winner == playerName

            #match result
            lastGameResult = 1 if playerWon else 0

            #number of sets, number of won sets, sets difficulty
            sets = game.Wsets + game.Lsets
            wonSets = game.Wsets if playerWon else game.Lsets
            #a game without any recorded set would divide by zero
            setsDiff = _safeRatio(wonSets, sets)

            #number of games, number of won games, games difficulty
            winnerGames = game.W1 + game.W2 + game.W3 + game.W4 + game.W5
            loserGames = game.L1 + game.L2 + game.L3 + game.L4 + game.L5
            nrOfGames = winnerGames + loserGames
            nrOfWonGames = winnerGames if playerWon else loserGames
            gameDiff = _safeRatio(nrOfWonGames, nrOfGames)

        else:
            lastGameResult = game["{}LastGame".format(i+1)]
            setsDiff = game["{}LastGameDiff".format(i+1)]
            gameDiff = game["{}LastGameGamesDiff".format(i+1)]
            nrOfGames = game["{}LastGameNrOfGames".format(i+1)]

        lastGameDate = game.Date
        daysFree = matchDate - lastGameDate

        #aggregating history
        toReturn.append(daysFree.days)
        toReturn.append(lastGameResult)
        toReturn.append(setsDiff)
        toReturn.append(gameDiff)
        toReturn.append(int(nrOfGames))

        #new reference date: the next gap is measured from this game
        matchDate = lastGameDate

    #number of earlier games in the same tournament (same ATP id) since
    #January 1st of the match year
    matchYear = match.Date.year
    Tournament = playerGames[ (playerGames['ATP'] == match.ATP) & (playerGames['Date'] >= '{}-1-1'.format(matchYear)) ]
    toReturn.append(len(Tournament))

    #the result is only known when building the player dataset
    if not predict:
        #game result -> Player win=1 lose=0
        toReturn.append(int(1) if match.Winner == playerName else int(0))

    return toReturn

In [4]:
numMatchesThreshold = 20 #a dataset is built only for players with more matches than this
numMatchesHistoryCollectionThreashold = 5 #rows are built while more matches than this remain

#building dataset for specific player, based on his history
def createPlayerDataset(playerName, data):
    """Build the per-match feature dataset for one player.

    Parameters
    ----------
    playerName : str
        Player whose matches are selected (as Winner or as Loser).
    data : pandas.DataFrame
        Match data; rows are consumed from the first to the last, and for
        every row the rows after it are passed to buildInputDataset as the
        player's history.

    Returns
    -------
    pandas.DataFrame or bool
        False when the player has numMatchesThreshold matches or fewer.
        Otherwise a DataFrame with one row per processed match, indexed by
        the label of that match in `data`. The last
        numMatchesHistoryCollectionThreashold matches only serve as history
        and get no row of their own.
    """
    #player matches
    playerMatches = data.loc[(data.Winner == playerName) | (data.Loser == playerName)]

    #not enough history for this player
    if len(playerMatches) <= numMatchesThreshold:
        return False

    #columns in new Dataset; buildInputDataset emits 5 values for each of
    #the last 5 games, in this order
    columns = ["ATP", "Date", "Rank", "Court", "Surface"]
    intTypes = ["Rank", "Court", "Surface"]
    for n in range(1, 6):
        columns += [
            "DaysOffFromMatch{}".format(n),
            "{}LastGame".format(n),
            "{}LastGameDiff".format(n),
            "{}LastGameGamesDiff".format(n),
            "{}LastGameNrOfGames".format(n),
        ]
        intTypes += [
            "DaysOffFromMatch{}".format(n),
            "{}LastGame".format(n),
            "{}LastGameNrOfGames".format(n),
        ]
    columns += ["MatchesInTour", "Result"]
    intTypes += ["MatchesInTour", "Result"]

    playerDataset = pd.DataFrame(columns=columns)
    playerDataset[intTypes] = playerDataset[intTypes].astype(int)

    #building dataset for a specific player
    while len(playerMatches) > numMatchesHistoryCollectionThreashold:
        row = playerMatches.iloc[0]
        history = playerMatches.iloc[1:]

        #build one dataset row from the current match and the history after it
        playerDataset.loc[row.name] = buildInputDataset(row, playerName, history)

        #pruning history by position: this always drops exactly the processed
        #row, whatever the index labels or their ordering are
        playerMatches = history

    return playerDataset

### Creating players history
Columns:
<ol>
<li>ATP</li>
<li>MatchDate</li>
<li>Player rank</li>
<li>Type of court</li>
<li>Type of surface</li>

<li>Days between the match and the last game</li>
<li>Last game result - win=1, lose=0</li>
<li>Last game difficulty - number of won sets / total sets</li>
<li>Last game, games difficulty - number of won games / total games</li>
<li>Last game, total number of games played</li>

<li>Days between the last game and the 2nd last game</li>
<li>2nd last game result - win=1, lose=0</li>
<li>2nd last game difficulty - number of won sets / total sets</li>
<li>2nd last game, games difficulty - number of won games / total games</li>
<li>2nd last game, total number of games played</li>

<li>Days between the 2nd last game and the 3rd last game</li>
<li>3rd last game result - win=1, lose=0</li>
<li>3rd last game difficulty - number of won sets / total sets</li>
<li>3rd last game, games difficulty - number of won games / total games</li>
<li>3rd last game, total number of games played</li>

<li>Days between the 3rd last game and the 4th last game</li>
<li>4th last game result - win=1, lose=0</li>
<li>4th last game difficulty - number of won sets / total sets</li>
<li>4th last game, games difficulty - number of won games / total games</li>
<li>4th last game, total number of games played</li>

<li>Days between the 4th last game and the 5th last game</li>
<li>5th last game result - win=1, lose=0</li>
<li>5th last game difficulty - number of won sets / total sets</li>
<li>5th last game, games difficulty - number of won games / total games</li>
<li>5th last game, total number of games played</li>

<li>Match in tournament - number of matches the player has already played in this tournament in the same year (0 = 1st round, 1 = 2nd round, ...)</li>
<li>Match result - win=1, lose=0</li>
</ol>

In [5]:
#creating base player datasets: one CSV file per player with enough history
players= set(raw.Winner.values) | set(raw.Loser.values)
totalPlayers = len(players)

players_data = []     #players for whom a dataset was built and saved
refuse_players = []   #players for whom createPlayerDataset returned False

import time
t1 = time.time()
for i, player in enumerate(players):
    #building dataset for specific player
    playerDataset = createPlayerDataset(player, raw)

    #createPlayerDataset returns a bool (False) when it refuses a player
    if isinstance(playerDataset, bool):
        refuse_players.append(player)
    else:
        #saving to file
        #NOTE(review): the pattern has no "." before "csv"; presumably player
        #names already end with "." - confirm before changing it, since other
        #code may rely on these paths
        playerDataset.to_csv("Data/Players/{}csv".format(player))
        players_data.append(player)

    #clearing output and printing progress; counting finished players avoids
    #a division by zero when there is only one player and still ends at 100%
    clear_output()
    progress = ((i + 1) * 100.) / totalPlayers
    print("Progress: {}%".format(progress))

t2 = time.time()
print("Time=%s" % (t2 - t1))

Progress: 100.0%
Time=1767.7727136611938


In [6]:
#saving names of players for whom we have information
pd.Series(players_data).to_csv("Data/Players/PlayersList.csv")
#saving names of players that were refused (no dataset was built for them)
pd.Series(refuse_players).to_csv("Data/Players/PlayersListRefused.csv")

#number of players with a dataset
print( len(players_data) )

468


## Extending the player dataset with information about opponents

Here we are going to extend the players' datasets with information about their opponents.