In [1]:
import numpy as np
import pandas as pd
import glob, os 
import datetime

## Initial data import
Use data starting in 2000

In [2]:
pathname = "./tennis_atp/atp_matches_20??.csv"
df = pd.concat((pd.read_csv(f) for f in glob.iglob(pathname, recursive=True)), ignore_index=True)
# Filter matches without data
df = df[list(map(lambda x: not np.isnan(x), df.loc[:]["minutes"]))]

df['tourney_date'] = pd.to_datetime(df['tourney_date'], format='%Y%m%d')
df.head()

Unnamed: 0,tourney_id,tourney_name,surface,draw_size,tourney_level,tourney_date,match_num,winner_id,winner_seed,winner_entry,...,w_bpFaced,l_ace,l_df,l_svpt,l_1stIn,l_1stWon,l_2ndWon,l_SvGms,l_bpSaved,l_bpFaced
0,2004-360,Casablanca,Clay,32.0,A,2004-05-17,1.0,103103.0,1.0,,...,6.0,1.0,4.0,63.0,28.0,17.0,15.0,8.0,5.0,9.0
1,2004-360,Casablanca,Clay,32.0,A,2004-05-17,2.0,102231.0,,,...,11.0,5.0,7.0,97.0,59.0,35.0,14.0,15.0,6.0,13.0
2,2004-360,Casablanca,Clay,32.0,A,2004-05-17,3.0,103700.0,,,...,3.0,0.0,5.0,52.0,23.0,15.0,7.0,8.0,3.0,8.0
3,2004-360,Casablanca,Clay,32.0,A,2004-05-17,4.0,103169.0,8.0,,...,4.0,4.0,6.0,87.0,36.0,25.0,23.0,10.0,17.0,20.0
4,2004-360,Casablanca,Clay,32.0,A,2004-05-17,5.0,103898.0,,,...,7.0,2.0,6.0,68.0,33.0,20.0,16.0,9.0,4.0,8.0


In [3]:
df.columns

Index(['tourney_id', 'tourney_name', 'surface', 'draw_size', 'tourney_level',
       'tourney_date', 'match_num', 'winner_id', 'winner_seed', 'winner_entry',
       'winner_name', 'winner_hand', 'winner_ht', 'winner_ioc', 'winner_age',
       'winner_rank', 'winner_rank_points', 'loser_id', 'loser_seed',
       'loser_entry', 'loser_name', 'loser_hand', 'loser_ht', 'loser_ioc',
       'loser_age', 'loser_rank', 'loser_rank_points', 'score', 'best_of',
       'round', 'minutes', 'w_ace', 'w_df', 'w_svpt', 'w_1stIn', 'w_1stWon',
       'w_2ndWon', 'w_SvGms', 'w_bpSaved', 'w_bpFaced', 'l_ace', 'l_df',
       'l_svpt', 'l_1stIn', 'l_1stWon', 'l_2ndWon', 'l_SvGms', 'l_bpSaved',
       'l_bpFaced'],
      dtype='object')

## Match only

In [4]:
games = df[['tourney_id', 'winner_id', 'winner_rank', 'loser_id', 'loser_rank', 'tourney_date']].copy()
def label_data(winner_id, winner_rank, loser_id, loser_rank):
    if winner_rank < loser_rank:
        top_player_rank = winner_rank
        top_player_id = winner_id
        low_player_rank = loser_rank
        low_player_id = loser_id
        top_player_won = 1.
    else:
        top_player_rank = loser_rank
        top_player_id = loser_id
        low_player_rank = winner_rank
        low_player_id = winner_id
        top_player_won = 0.
    return top_player_id, top_player_rank, low_player_id, low_player_rank, top_player_won

games['top_player_id'], games['top_player_rank'], games['low_player_id'], games['low_player_rank'], games['top_player_won'] = \
np.vectorize(label_data)(games['winner_id'], games['winner_rank'], games['loser_id'], games['loser_rank'])
games.drop(['winner_id', 'winner_rank', 'loser_id', 'loser_rank'], axis=1, inplace=True)
games.head()
        

Unnamed: 0,tourney_id,tourney_date,top_player_id,top_player_rank,low_player_id,low_player_rank,top_player_won
0,2004-360,2004-05-17,103103.0,32.0,104259.0,149.0,1.0
1,2004-360,2004-05-17,103153.0,101.0,102231.0,150.0,0.0
2,2004-360,2004-05-17,103700.0,136.0,102558.0,165.0,1.0
3,2004-360,2004-05-17,103169.0,90.0,103007.0,451.0,1.0
4,2004-360,2004-05-17,103105.0,80.0,103898.0,109.0,0.0


### Database by player

In [5]:
winners = (df[list(df.columns[:7]) + [column for column in df.columns if column[0]=='w'] + ['l_bpFaced', 'l_bpSaved']]
               .rename(columns={'l_bpFaced': '_break_points_for', 'l_bpSaved':'_break_points_missed'})
          )

for column in winners.columns[7:]:
    index = column.find('_') + 1
    winners.rename(columns={column: column[index:]}, inplace = True)
    
losers = (df[list(df.columns[:7]) + [column for column in df.columns if column[0]=='l'] + ['w_bpFaced', 'w_bpSaved']]
               .rename(columns={'w_bpFaced': '_break_points_for', 'w_bpSaved':'_break_points_missed'})
          )
for column in losers.columns[7:]:
    index = column.find('_') + 1
    losers.rename(columns={column: column[index:]}, inplace = True)

players = pd.DataFrame.append(winners, losers)
players['id'] = players['id'].astype('int')

players['%1st_serve_in'] = players['1stIn']/players['svpt']
players['%1st_serve_won'] = players['1stWon']/players['1stIn']
players['%2nd_serve_won'] = players['2ndWon']/(players['svpt'] - players['1stIn'])
players['%break_points_saved'] = players['bpSaved']/players['bpFaced']
players['%break_points_converted'] = 1. - players['break_points_missed']/players['break_points_for']

players.rename(columns={
    'df': 'double_fault',
    'bpFaced': 'break_points_against'
}, inplace=True)

#players.set_index(['id','tourney_date'], inplace=True)

players.head()

Unnamed: 0,tourney_id,tourney_name,surface,draw_size,tourney_level,tourney_date,match_num,id,seed,entry,...,SvGms,bpSaved,break_points_against,break_points_for,break_points_missed,%1st_serve_in,%1st_serve_won,%2nd_serve_won,%break_points_saved,%break_points_converted
0,2004-360,Casablanca,Clay,32.0,A,2004-05-17,1.0,103103,1.0,,...,8.0,6.0,6.0,9.0,5.0,0.571429,0.84375,0.458333,1.0,0.444444
1,2004-360,Casablanca,Clay,32.0,A,2004-05-17,2.0,102231,,,...,14.0,6.0,11.0,13.0,6.0,0.710843,0.559322,0.5,0.545455,0.538462
2,2004-360,Casablanca,Clay,32.0,A,2004-05-17,3.0,103700,,,...,7.0,3.0,3.0,8.0,3.0,0.52,0.769231,0.541667,1.0,0.625
3,2004-360,Casablanca,Clay,32.0,A,2004-05-17,4.0,103169,8.0,,...,10.0,3.0,4.0,20.0,17.0,0.65,0.692308,0.619048,0.75,0.15
4,2004-360,Casablanca,Clay,32.0,A,2004-05-17,5.0,103898,,,...,9.0,6.0,7.0,8.0,4.0,0.589041,0.674419,0.533333,0.857143,0.5


## Statistics

In [6]:
stats = ['ace', 'double_fault', '%1st_serve_in', '%1st_serve_won', '%2nd_serve_won', '%break_points_saved', 'break_points_against', '%break_points_converted']

### Life to date

In [7]:
def life_to_date_stats(df, player_id, tdate):
    tdate = pd.to_datetime(tdate)
    column_names = [stat + '_ltd' for stat in stats]
    results = (df.loc[(df['id'] == player_id) & (df['tourney_date'] < tdate)]
                    .groupby('id')
                    .agg('mean')[stats]
                    #.set_index('id')
                    .rename(columns=dict(zip(stats, column_names)))
              )
    return [results[column] for column in column_names] + [tdate]

#life_to_date_stats(players, 104735, '2016-01-04')
np.vectorize(life_to_date_stats, excluded=['df'])(df=players, player_id=players['id'], tdate=players['tourney_date'])


array([list([id
103103    5.444882
Name: ace_ltd, dtype: float64, id
103103    3.992126
Name: double_fault_ltd, dtype: float64, id
103103    0.572798
Name: %1st_serve_in_ltd, dtype: float64, id
103103    0.721495
Name: %1st_serve_won_ltd, dtype: float64, id
103103    0.506247
Name: %2nd_serve_won_ltd, dtype: float64, id
103103    0.579465
Name: %break_points_saved_ltd, dtype: float64, id
103103    7.051181
Name: break_points_against_ltd, dtype: float64, id
103103    0.429896
Name: %break_points_converted_ltd, dtype: float64, Timestamp('2004-05-17 00:00:00')]),
       list([id
102231    6.0
Name: ace_ltd, dtype: float64, id
102231    1.826087
Name: double_fault_ltd, dtype: float64, id
102231    0.577988
Name: %1st_serve_in_ltd, dtype: float64, id
102231    0.712575
Name: %1st_serve_won_ltd, dtype: float64, id
102231    0.486431
Name: %2nd_serve_won_ltd, dtype: float64, id
102231    0.605624
Name: %break_points_saved_ltd, dtype: float64, id
102231    7.456522
Name: break_points_against_l

### Last x games

In [50]:
def last_x_games_avg(df, player_id, tdate, x):
    
    column_names = ['last_' + str(x) + '_games_' + stat for stat in stats]
    results = (df.loc[(df['id'] == player_id) & (df['tourney_date'] < tdate)]
                    .sort_values(by=['tourney_date'], ascending=False)
                    .head(x)
                    .agg('mean')[stats]
                    .rename(columns=dict(zip(stats, column_names)))
              )
    return results

#last_x_games_avg(players, 104735, '2016-01-04', 10)
np.vectorize(last_x_games_avg, excluded=['df', 'x'])(df=players, player_id=players['id'], tdate=players['tourney_date'], x=10)


KeyboardInterrupt: 

### Current year

In [1]:
players.head()

NameError: name 'players' is not defined

### Face to face

In [None]:
# Add number of wins to stats