In [None]:
from collections import defaultdict
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import pickle
%matplotlib inline

In [None]:
# Show up to 50 columns when a DataFrame is displayed, instead of eliding
# the middle ones; the frames built below are wide.
pd.set_option('display.max_columns', 50)

In [None]:
def add_moving_averages(df, column, window=20):
    """
    Add a per-player moving average of ``column`` to ``df``.

    Rows are grouped by ``player_name``; within each group the mean of the
    most recent ``window`` rows (in the DataFrame's current row order, the
    current row included) is computed with the pandas rolling-window
    function.  The result is stored in a new column named
    ``"<column>_mavg"``.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a ``player_name`` column and ``column``.  It is
        modified in place.
    column : label
        Name of the column to average.
    window : int, default 20
        Number of rows in the moving window.  Must be at least 2, because
        a value is only produced once two observations are available
        (``min_periods=2``); earlier rows get NaN.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, with the new column added.
    """
    new_name = str(column) + "_mavg"
    # Honour the caller's window size instead of a hard-coded 20.
    rolled = (
        df.groupby('player_name')[column]
        .rolling(window=window, min_periods=2)
        .mean()
    )
    # The groupby prepends player_name as an outer index level; drop it so
    # the values align with df's own row index on assignment.
    df[new_name] = rolled.reset_index(level=0, drop=True)
    return df

In [None]:
def add_expanding_windows(df, column):
    """
    Attach a per-player cumulative average of ``column`` to ``df``.

    Rows are grouped by ``player_name`` and, within each group, the mean of
    all rows up to and including the current one is computed with the
    pandas expanding-window function.  The result is written to a new
    column named ``"<column>_expw"``.  ``df`` is modified in place and also
    returned.
    """
    per_player = df.groupby('player_name')[column]
    running_mean = per_player.expanding().mean()
    # Remove the player_name index level added by the groupby so that the
    # values line up with df's original row index.
    aligned = running_mean.reset_index(level=0, drop=True)
    df[str(column) + "_expw"] = aligned
    return df

In [None]:
def _load_matches(start_year, end_year):
    """Download the yearly ATP match files and return them as one DataFrame."""
    if start_year > end_year:
        raise ValueError("start_year must not be greater than end_year")

    filestem = 'https://raw.githubusercontent.com/JeffSackmann/tennis_atp/master/atp_matches_{}.csv'
    expected_columns = [
        'tourney_id', 'tourney_name', 'surface', 'draw_size', 'tourney_level',
        'tourney_date', 'match_num', 'winner_id', 'winner_seed', 'winner_entry',
        'winner_name', 'winner_hand', 'winner_ht', 'winner_ioc', 'winner_age',
        'winner_rank', 'winner_rank_points', 'loser_id', 'loser_seed',
        'loser_entry', 'loser_name', 'loser_hand', 'loser_ht', 'loser_ioc',
        'loser_age', 'loser_rank', 'loser_rank_points', 'score', 'best_of',
        'round', 'minutes', 'w_ace', 'w_df', 'w_svpt', 'w_1stIn', 'w_1stWon',
        'w_2ndWon', 'w_SvGms', 'w_bpSaved', 'w_bpFaced', 'l_ace', 'l_df',
        'l_svpt', 'l_1stIn', 'l_1stWon', 'l_2ndWon', 'l_SvGms', 'l_bpSaved',
        'l_bpFaced',
    ]

    yearly = []
    for year in range(start_year, end_year + 1):
        matches = pd.read_csv(filestem.format(year))
        if year == 2017:
            # NOTE(review): presumably this works around a header/data
            # misalignment in the 2017 file as it existed when this was
            # written (the index is turned back into columns and the
            # expected header is re-applied).  Confirm it is still needed
            # against the current upstream file.
            matches = matches.drop(['l_bpSaved', 'l_bpFaced'], axis=1).reset_index()
            matches.columns = expected_columns
        yearly.append(matches)

    # ignore_index gives every match a unique 0..n-1 row label, which is
    # later used as the game index.
    df = pd.concat(yearly, ignore_index=True)
    df = df.drop(['draw_size', 'best_of', 'match_num', 'minutes', 'winner_id', 'loser_id'], axis=1)
    # %m is the month; %M would be parsed as minutes.
    df['tourney_date'] = pd.to_datetime(df['tourney_date'], format='%Y%m%d')
    return df


def _build_player_games(df):
    """Reshape match rows into one row per (player, game) with raw stats."""
    player_stats = ['ace', 'df', 'svpt', '1stIn', '1stWon',
                    '2ndWon', 'SvGms', 'bpSaved', 'bpFaced']
    sides = []
    for prefix, name_column in (('w_', 'winner_name'), ('l_', 'loser_name')):
        stat_columns = [prefix + stat for stat in player_stats]
        side = df[[name_column] + stat_columns].copy()
        side.columns = ['player_name'] + player_stats
        side.insert(1, 'game_index', df.index)
        sides.append(side)

    player_df = pd.concat(sides)
    # A stable sort keeps the winner's row ahead of the loser's row within
    # each game and keeps games in their original order.
    player_df = player_df.sort_values('game_index', kind='mergesort')
    return player_df.reset_index(drop=True), player_stats


def create_player_df(start_year=1991, end_year=2017):
    """
    Build player_df, a DataFrame of player stats over time.

    Each row is a "player-game": a single player's in-game stats for one
    game in the data.  Each game is therefore represented by two rows, one
    for each player (winner first, then loser).

    The DataFrame contains the raw stats as well as "stats over time": two
    evolving averages of each stat, one computed with a moving window
    (size 20, column suffix ``_mavg``) and the other computed with an
    expanding window (column suffix ``_expw``).  The aim is to use these
    computed stats as an estimate of a player's ability, and ultimately as
    predictors in the model, because in-game stats cannot be known ex-ante
    and are very noisy for a given player anyway.

    NOTE(review): both averages include the current game's own stats.  If
    they are meant to be known before the game is played they presumably
    need to be shifted by one game per player -- confirm against the
    downstream model.

    Possible extension: add a parameter for the moving-average window size.

    Parameters
    ----------
    start_year : int, default 1991
        First season to download (inclusive).
    end_year : int, default 2017
        Last season to download (inclusive).

    Returns
    -------
    pandas.DataFrame
        Columns ``player_name``, ``game_index``, the nine raw stats, and a
        ``_expw`` and ``_mavg`` column for each stat.

    Raises
    ------
    ValueError
        If ``start_year`` is greater than ``end_year``.
    """
    # Grab the data, read it into a DataFrame, handle some data issues and
    # convert tourney_date to datetime.
    df = _load_matches(start_year, end_year)

    # At this point every row of df is a match.  Reshape so that every row
    # represents a "player-game": columns are the stats, values are that
    # player's in-game stats for the particular game.  For example, game 0
    # has two rows: one for each player that participated in game 0.
    player_df, player_stats = _build_player_games(df)

    # Calculate moving-average and expanding-window stats with the helper
    # functions defined above.  These are intended to serve as predictive
    # features in place of the raw in-game stats.
    for stat in player_stats:
        player_df = add_expanding_windows(player_df, stat)
        player_df = add_moving_averages(player_df, stat)

    return player_df