In [65]:
import numpy as np
import pandas as pd
import glob, os 

## Initial data import
Use data starting in 2000

In [66]:
pathname = "./tennis_atp/atp_matches_20??.csv"
df = pd.concat((pd.read_csv(f) for f in glob.iglob(pathname, recursive=True)), ignore_index=True)
# Filter matches without data
df = df[list(map(lambda x: not np.isnan(x), df.loc[:]["minutes"]))]

df['tourney_date'] = pd.to_datetime(df['tourney_date'], format='%Y%m%d')
df.head()

Unnamed: 0,tourney_id,tourney_name,surface,draw_size,tourney_level,tourney_date,match_num,winner_id,winner_seed,winner_entry,...,w_bpFaced,l_ace,l_df,l_svpt,l_1stIn,l_1stWon,l_2ndWon,l_SvGms,l_bpSaved,l_bpFaced
0,2000-717,Orlando,Clay,32.0,A,2000-05-01,1.0,102179.0,,,...,15.0,13.0,4.0,110.0,59.0,49.0,31.0,17.0,4.0,4.0
1,2000-717,Orlando,Clay,32.0,A,2000-05-01,2.0,103602.0,,Q,...,6.0,0.0,0.0,57.0,24.0,13.0,17.0,10.0,4.0,9.0
2,2000-717,Orlando,Clay,32.0,A,2000-05-01,3.0,103387.0,,,...,0.0,2.0,2.0,65.0,39.0,22.0,10.0,8.0,6.0,10.0
3,2000-717,Orlando,Clay,32.0,A,2000-05-01,4.0,101733.0,,,...,12.0,4.0,6.0,104.0,57.0,35.0,24.0,15.0,6.0,11.0
4,2000-717,Orlando,Clay,32.0,A,2000-05-01,5.0,101727.0,4.0,,...,1.0,0.0,3.0,47.0,28.0,17.0,10.0,8.0,3.0,6.0


In [67]:
df.columns

Index(['tourney_id', 'tourney_name', 'surface', 'draw_size', 'tourney_level',
       'tourney_date', 'match_num', 'winner_id', 'winner_seed', 'winner_entry',
       'winner_name', 'winner_hand', 'winner_ht', 'winner_ioc', 'winner_age',
       'winner_rank', 'winner_rank_points', 'loser_id', 'loser_seed',
       'loser_entry', 'loser_name', 'loser_hand', 'loser_ht', 'loser_ioc',
       'loser_age', 'loser_rank', 'loser_rank_points', 'score', 'best_of',
       'round', 'minutes', 'w_ace', 'w_df', 'w_svpt', 'w_1stIn', 'w_1stWon',
       'w_2ndWon', 'w_SvGms', 'w_bpSaved', 'w_bpFaced', 'l_ace', 'l_df',
       'l_svpt', 'l_1stIn', 'l_1stWon', 'l_2ndWon', 'l_SvGms', 'l_bpSaved',
       'l_bpFaced'],
      dtype='object')

## Match only

In [68]:
games = df[['tourney_id', 'winner_id', 'loser_id', 'tourney_date']]

### Merge winner and losers

In [69]:
winners = (df[list(df.columns[:7]) + [column for column in df.columns if column[0]=='w'] + ['l_bpFaced', 'l_bpSaved']]
               .rename(columns={'l_bpFaced': 'bpOcc', 'l_bpSaved':'bpMissed'})
          )

for column in winners.columns[7:]:
    index = column.find('_') + 1
    winners.rename(columns={column: column[index:]}, inplace = True)
    
losers = (df[list(df.columns[:7]) + [column for column in df.columns if column[0]=='l'] + ['w_bpFaced', 'w_bpSaved']]
               .rename(columns={'w_bpFaced': 'bpOcc', 'w_bpSaved':'bpMissed'})
          )
for column in losers.columns[7:]:
    index = column.find('_') + 1
    losers.rename(columns={column: column[index:]}, inplace = True)

players = pd.DataFrame.append(winners, losers)
players['id'] = players['id'].astype('int')
#players.set_index(['id'], inplace=True)
players.head()

Unnamed: 0,tourney_id,tourney_name,surface,draw_size,tourney_level,tourney_date,match_num,id,seed,entry,...,df,svpt,1stIn,1stWon,2ndWon,SvGms,bpSaved,bpFaced,bpOcc,bpMissed
0,2000-717,Orlando,Clay,32.0,A,2000-05-01,1.0,102179,,,...,1.0,126.0,76.0,56.0,29.0,16.0,14.0,15.0,4.0,4.0
1,2000-717,Orlando,Clay,32.0,A,2000-05-01,2.0,103602,,Q,...,2.0,67.0,35.0,25.0,16.0,10.0,4.0,6.0,9.0,4.0
2,2000-717,Orlando,Clay,32.0,A,2000-05-01,3.0,103387,,,...,1.0,46.0,29.0,23.0,11.0,8.0,0.0,0.0,10.0,6.0
3,2000-717,Orlando,Clay,32.0,A,2000-05-01,4.0,101733,,,...,6.0,109.0,56.0,43.0,21.0,15.0,9.0,12.0,11.0,6.0
4,2000-717,Orlando,Clay,32.0,A,2000-05-01,5.0,101727,4.0,,...,0.0,50.0,27.0,22.0,16.0,9.0,1.0,1.0,6.0,3.0


## Statistics

In [70]:
stats = ['ace', 'df', 'svpt', '%1stIn', '%1stWon', '%2ndWon', '%bpSaved', 'bpFaced', '%bpConverted']

### Lifetime

In [71]:
players['%1stIn'] = players['1stIn']/players['svpt']
players['%1stWon'] = players['1stWon']/players['1stIn']
players['%2ndWon'] = players['2ndWon']/(players['svpt'] - players['1stIn'])
players['%bpSaved'] = players['bpSaved']/players['bpFaced']
players['%bpConverted'] = 1. - players['bpMissed']/players['bpOcc']
players.groupby(['id', 'tourney_level', 'surface']).agg('mean')[stats].head()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,ace,df,svpt,%1stIn,%1stWon,%2ndWon,%bpSaved,bpFaced,%bpConverted
id,tourney_level,surface,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
100644,A,Clay,5.058824,3.529412,73.088235,0.611158,0.650078,0.526503,0.504844,6.176471,0.440921
100644,A,Grass,8.357143,2.071429,64.928571,0.680107,0.761056,0.493371,0.638571,4.071429,0.566728
100644,A,Hard,6.516667,3.45,72.933333,0.623039,0.746237,0.514597,0.610666,5.483333,0.477384
100644,D,Clay,4.5,4.0,76.0,0.615893,0.563507,0.497619,0.208333,7.0,0.5
100644,D,Hard,15.333333,5.5,119.833333,0.633422,0.722764,0.564025,0.715385,8.5,0.396122


### Last x games

In [78]:
def last_x_games_avg(df, player_id, tdate, x):
    column_names = ['last_' + str(x) + '_games_' + stat for stat in stats]
    results = (df.loc[(df['id'] == player_id) & (df['tourney_date'] < tdate)]
                    .sort_values(by=['tourney_date'], ascending=False)
                    .head(x)
                    .agg('mean')[stats]
                    .rename(columns=dict(zip(stats, column_names)))
              )
    return results

#last_x_games_avg(players, 104735, '2016-01-04', 10)
np.vectorize(last_x_games_avg, excluded=['df', 'x'])(df=players, player_id=players['id'], tdate=players['tourney_date'], x=10)


TypeError: <class 'int'> type object 957139200000000000

### Current year

Unnamed: 0,tourney_id,tourney_name,surface,draw_size,tourney_level,tourney_date,match_num,id,seed,entry,...,SvGms,bpSaved,bpFaced,bpOcc,bpMissed,%1stIn,%1stWon,%2ndWon,%bpSaved,%bpConverted
0,2000-717,Orlando,Clay,32.0,A,2000-05-01,1.0,102179,,,...,16.0,14.0,15.0,4.0,4.0,0.603175,0.736842,0.58,0.933333,0.0
1,2000-717,Orlando,Clay,32.0,A,2000-05-01,2.0,103602,,Q,...,10.0,4.0,6.0,9.0,4.0,0.522388,0.714286,0.5,0.666667,0.555556
2,2000-717,Orlando,Clay,32.0,A,2000-05-01,3.0,103387,,,...,8.0,0.0,0.0,10.0,6.0,0.630435,0.793103,0.647059,,0.4
3,2000-717,Orlando,Clay,32.0,A,2000-05-01,4.0,101733,,,...,15.0,9.0,12.0,11.0,6.0,0.513761,0.767857,0.396226,0.75,0.454545
4,2000-717,Orlando,Clay,32.0,A,2000-05-01,5.0,101727,4.0,,...,9.0,1.0,1.0,6.0,3.0,0.54,0.814815,0.695652,1.0,0.5


### Face to face

In [None]:
# Add number of wins to stats