In [1]:
from collections import defaultdict
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import pickle
%matplotlib inline

In [2]:
pd.options.display.max_columns = 50

In [3]:
filestem = 'https://raw.githubusercontent.com/JeffSackmann/tennis_atp/master/atp_matches_{}.csv'
c = ['tourney_id', 'tourney_name', 'surface', 'draw_size', 'tourney_level',
       'tourney_date', 'match_num', 'winner_id', 'winner_seed', 'winner_entry',
       'winner_name', 'winner_hand', 'winner_ht', 'winner_ioc', 'winner_age',
       'winner_rank', 'winner_rank_points', 'loser_id', 'loser_seed',
       'loser_entry', 'loser_name', 'loser_hand', 'loser_ht', 'loser_ioc',
       'loser_age', 'loser_rank', 'loser_rank_points', 'score', 'best_of',
       'round', 'minutes', 'w_ace', 'w_df', 'w_svpt', 'w_1stIn', 'w_1stWon',
       'w_2ndWon', 'w_SvGms', 'w_bpSaved', 'w_bpFaced', 'l_ace', 'l_df',
       'l_svpt', 'l_1stIn', 'l_1stWon', 'l_2ndWon', 'l_SvGms', 'l_bpSaved',
       'l_bpFaced']
dlist = []


for i in range(1991, 2018):
    filename = filestem.format(i)
    d = pd.read_csv(filename)
    if i == 2017:
        d = d.drop(['l_bpSaved', 'l_bpFaced'], axis=1).reset_index()
        d.columns = c            
    dlist.append(d)
df = pd.concat(dlist)
df = df.reset_index(drop=True)
df = df.drop(['draw_size', 'best_of', 'match_num', 'minutes', 'winner_id', 'loser_id'], axis=1)
df.tourney_date = pd.to_datetime(df.tourney_date, format='%Y%M%d');

In [133]:
player_dict = {}

winner_stats = ['w_ace', 'w_df', 'w_svpt', 'w_1stIn', 'w_1stWon',
       'w_2ndWon', 'w_SvGms', 'w_bpSaved', 'w_bpFaced']

loser_stats = ['l_ace', 'l_df', 'l_svpt', 'l_1stIn', 'l_1stWon',
               'l_2ndWon', 'l_SvGms', 'l_bpSaved', 'l_bpFaced']

player_stats = ['ace', 'df', 'svpt', '1stIn', '1stWon',
       '2ndWon', 'SvGms', 'bpSaved', 'bpFaced']

test = df[(df.winner_name == 'Rafael Nadal') | (df.loser_name == 'Rafael Nadal')]

for index, row in df.iterrows():
    
    player_dict[(row['winner_name'], index)] = {row[stat] for stat in winner_stats}
    player_dict[(row['loser_name'], index)] = {row[stat] for stat in loser_stats}

In [134]:
player_df = pd.DataFrame.from_dict(player_dict, orient='index')
player_df.index = pd.MultiIndex.from_tuples(player_df.index)
player_df = player_df.reset_index()
player_df.columns = ['player_name','game_index'] + player_stats

In [135]:
def add_expanding_windows(df, column):
    """
    Create new feature using pandas expanding window function.
    Basically a cumulative average.
    """
    new_name = str(column) + "_expw"
    new_column = df.groupby('player_name')[column].expanding().mean()
    df[new_name] = new_column.reset_index(level=0, drop=True)
    return df

In [136]:
def add_moving_averages(df, column, window=20):
    """
    Create new feautre using pandas rolling window function.
    Basially create a moving average.
    """
    new_name = str(column) + "_mavg"
    new_column = df.groupby('player_name')[column].rolling(window=20, min_periods=2).mean()
    df[new_name] = new_column.reset_index(level=0, drop=True)
    return df

In [137]:
for stat in player_stats:
    player_df = add_expanding_windows(player_df, stat)
    player_df = add_moving_averages(player_df, stat)

In [138]:
player_df

Unnamed: 0,player_name,game_index,ace,df,svpt,1stIn,1stWon,2ndWon,SvGms,bpSaved,bpFaced,ace_expw,ace_mavg,df_expw,df_mavg,svpt_expw,svpt_mavg,1stIn_expw,1stIn_mavg,1stWon_expw,1stWon_mavg,2ndWon_expw,2ndWon_mavg,SvGms_expw,SvGms_mavg,bpSaved_expw,bpSaved_mavg,bpFaced_expw,bpFaced_mavg
0,Guillermo Perez Roldan,0,0.0,1.0,34.0,4.0,7.0,41.0,25.0,,,0.000000,,1.000000,,34.000000,,4.000000,,7.000000,,41.000000,,25.000000,,,,,
1,Diego Perez,0,0.0,2.0,6.0,7.0,40.0,9.0,12.0,22.0,,0.000000,,2.000000,,6.000000,,7.000000,,40.000000,,9.000000,,12.000000,,22.000000,,,
2,Menno Oosting,1,1.0,2.0,3.0,7.0,43.0,49.0,30.0,,,1.000000,,2.000000,,3.000000,,7.000000,,43.000000,,49.000000,,30.000000,,,,,
3,Olli Rahnasto,1,1.0,34.0,3.0,6.0,17.0,,,,,1.000000,,34.000000,,3.000000,,6.000000,,17.000000,,,,,,,,,
4,Libor Nemecek,2,0.0,3.0,5.0,6.0,7.0,40.0,49.0,21.0,,0.000000,,3.000000,,5.000000,,6.000000,,7.000000,,40.000000,,49.000000,,21.000000,,,
5,Fernando Luna,2,0.0,4.0,6.0,8.0,42.0,13.0,15.0,54.0,,0.000000,,4.000000,,6.000000,,8.000000,,42.000000,,13.000000,,15.000000,,54.000000,,,
6,Carlos Costa,3,1.0,2.0,4.0,7.0,74.0,11.0,44.0,12.0,31.0,1.000000,,2.000000,,4.000000,,7.000000,,74.000000,,11.000000,,44.000000,,12.000000,,31.000000,
7,Jose Francisco Altur,3,0.0,3.0,4.0,69.0,7.0,9.0,11.0,46.0,29.0,0.000000,,3.000000,,4.000000,,69.000000,,7.000000,,9.000000,,11.000000,,46.000000,,29.000000,
8,Franco Davin,4,0.0,1.0,2.0,7.0,43.0,23.0,29.0,,,0.000000,,1.000000,,2.000000,,7.000000,,43.000000,,23.000000,,29.000000,,,,,
9,Massimo Ardinghi,4,0.0,2.0,6.0,7.0,11.0,44.0,18.0,,,0.000000,,2.000000,,6.000000,,7.000000,,11.000000,,44.000000,,18.000000,,,,,


In [139]:
expw_stats = ['ace_expw', 'df_expw', 'svpt_expw', '1stIn_expw', '1stWon_expw',
'2ndWon_expw', 'SvGms_expw', 'bpSaved_expw', 'bpFaced_expw']

mavg_stats = ['ace_mavg', 'df_mavg', 'svpt_mavg', '1stIn_mavg', '1stWon_mavg',
'2ndWon_mavg', 'SvGms_mavg', 'bpSaved_mavg', 'bpFaced_mavg']

In [None]:
"""
Player DF is complete.
Each player has raw stats, expw stats, and mavg stats
that are computed looking backwards.
Game index coresponds to index of game_df.
"""