# Refactored Code: `game_df` and `player_df`

This code is all nice and refactored.  

Calling `create_game_df()` creates the DataFrame to feed into sklearn classifier models.  

Calling `create_player_df()` creates `player_df`, a database of player-matches and their corresponding stats.

Last thing that needs to be done is to fill in some function docstrings.

In [1]:
import pandas as pd
import numpy as np
import pickle
%matplotlib inline

In [2]:
# Raise the pandas display limit to 300 columns so that wide
# DataFrames are shown in full instead of being truncated.
pd.set_option('display.max_columns', 300)

In [3]:
def add_moving_averages(df, column, window=20):
    """
    Create a new feature using the pandas rolling window function.
    Basically a per-player moving average.

    Parameters
    ----------
    df : pandas DataFrame containing a 'player_name' column and `column`.
    column : name of the column to average.
    window : integer number of trailing rows (per player) that go into
        each average.  Default is 20.

    Returns
    -------
    The same DataFrame (modified in place) with one extra column named
    "<column>_mavg".  An average is only produced once a player has at
    least two observations in the window (one if `window` is 1);
    earlier rows are NaN.
    """
    new_name = str(column) + "_mavg"

    # Use the caller's window size (it used to be hard-coded to 20, so the
    # argument was silently ignored).  min_periods may not exceed the
    # window, hence the clamp for window == 1.
    min_periods = min(2, window)
    new_column = (
        df.groupby('player_name')[column]
        .rolling(window=window, min_periods=min_periods)
        .mean()
    )

    # The grouped result carries player_name as an outer index level;
    # drop it so the values realign with df's own index on assignment.
    df[new_name] = new_column.reset_index(level=0, drop=True)
    return df

In [4]:
def add_expanding_windows(df, column):
    """
    Append a per-player cumulative average of `column` to df.

    The averages come from the pandas expanding window function,
    evaluated separately for each 'player_name' group.  The result
    is stored in a new column named "<column>_expw" and the
    (modified) DataFrame is returned.
    """
    per_player = df.groupby('player_name')[column]
    running_mean = per_player.expanding().mean()

    # The grouped result has player_name as an extra outer index level;
    # strip it so the values line up with df's own index.
    running_mean = running_mean.reset_index(level=0, drop=True)

    df["{}_expw".format(column)] = running_mean
    return df

In [5]:
def read_data_into_df(beginning_year=1991, ending_year=2017):
    """
    This function reads the data from the GitHub repo.
    Returns a pandas DataFrame containing the raw data.
    Accepts a beginning year and ending year as inputs:
    default values are 1991 and 2017, as those are the
    max boundaries of the data that is suitable for the
    project.

    One CSV file (atp_matches_<year>.csv) is downloaded per
    year in the inclusive range, and the yearly frames are
    concatenated under a fresh 0..n-1 index.  The columns
    draw_size, best_of, match_num, minutes, winner_id and
    loser_id are dropped, and tourney_date is converted from
    YYYYMMDD to datetime.

    The data for year 2017 is misformatted; this function
    handles that if data from the 2017 year is requested.
    """

    # Initialize some variables.
    filestem = 'https://raw.githubusercontent.com/JeffSackmann/tennis_atp/master/atp_matches_{}.csv'
    c = ['tourney_id', 'tourney_name', 'surface', 'draw_size', 'tourney_level',
           'tourney_date', 'match_num', 'winner_id', 'winner_seed', 'winner_entry',
           'winner_name', 'winner_hand', 'winner_ht', 'winner_ioc', 'winner_age',
           'winner_rank', 'winner_rank_points', 'loser_id', 'loser_seed',
           'loser_entry', 'loser_name', 'loser_hand', 'loser_ht', 'loser_ioc',
           'loser_age', 'loser_rank', 'loser_rank_points', 'score', 'best_of',
           'round', 'minutes', 'w_ace', 'w_df', 'w_svpt', 'w_1stIn', 'w_1stWon',
           'w_2ndWon', 'w_SvGms', 'w_bpSaved', 'w_bpFaced', 'l_ace', 'l_df',
           'l_svpt', 'l_1stIn', 'l_1stWon', 'l_2ndWon', 'l_SvGms', 'l_bpSaved',
           'l_bpFaced']
    dlist = []    # An empty list to hold DataFrames

    # Each year's data is stored in a separate CSV file.
    # Iteratively call for each year's file,
    # read into a DataFrame, and concat all resulting
    # DataFrames.
    for i in range(beginning_year, ending_year + 1):
        filename = filestem.format(i)
        d = pd.read_csv(filename)
        if i == 2017:
            # Repair the 2017 file: drop the last two parsed columns,
            # move the index back into the columns, then re-apply the
            # canonical header.
            # NOTE(review): this presumes the 2017 CSV still parses with
            # its leading fields in the index -- confirm against the
            # current upstream file.
            d = d.drop(['l_bpSaved', 'l_bpFaced'], axis=1).reset_index()
            d.columns = c
        dlist.append(d)
    df = pd.concat(dlist)
    df = df.reset_index(drop=True)
    df = df.drop(['draw_size', 'best_of', 'match_num', 'minutes', 'winner_id', 'loser_id'], axis=1)

    # Dates are stored as YYYYMMDD.  '%m' is the month directive; the
    # previous '%M' means *minutes*, which parsed every date into January
    # and stored the month digits as minutes.
    df.tourney_date = pd.to_datetime(df.tourney_date, format='%Y%m%d')

    return df

In [6]:
def create_player_dictionary(df):
    """
    This function creates a dictionary of player-matches.
    Each key in the dictionary is a 2-tuple containing
    player's name and match index.  The values in the
    dictionary are the player's in-match stats that
    were recorded by that player in that match.
    Therefore, every match played is represented by
    two key-value pairs: one for each player that
    participated in the match.

    Importantly, this dictionary contains player
    stats for every match, regardless of whether or not
    the player won the match.

    Each value is a list of 14 entries in a fixed order:
    seed, ht, age, rank, rank_points, ace, df, svpt, 1stIn,
    1stWon, 2ndWon, SvGms, bpSaved, bpFaced.  The winner's
    entries are read from the winner_*/w_* columns and the
    loser's from the loser_*/l_* columns of the same row.
    """

    # Initialize some items.
    player_dict = {}
    winner_stats = [
                    'winner_seed', 'winner_ht', 'winner_age', 'winner_rank', 'winner_rank_points',
                    'w_ace', 'w_df', 'w_svpt', 'w_1stIn', 'w_1stWon',
                   'w_2ndWon', 'w_SvGms', 'w_bpSaved', 'w_bpFaced'
                    ]
    loser_stats = [
                    'loser_seed', 'loser_ht', 'loser_age', 'loser_rank', 'loser_rank_points',
                    'l_ace', 'l_df', 'l_svpt', 'l_1stIn', 'l_1stWon',
                   'l_2ndWon', 'l_SvGms', 'l_bpSaved', 'l_bpFaced'
                    ]

    # Iterate through the rows of the DataFrame and
    # insert information into player_dict.
    # The values must be *lists*: a set would discard the stat order and
    # collapse equal values (e.g. aces == double faults), so the positions
    # would no longer correspond to the stat names.
    for index, row in df.iterrows():
        player_dict[(row['winner_name'], index)] = [row[stat] for stat in winner_stats]
        player_dict[(row['loser_name'], index)] = [row[stat] for stat in loser_stats]
    return player_dict

In [7]:
def dictionary_to_df(player_dict, window=20):
    """
    Turn a dictionary of player-match stats into a DataFrame.

    The dictionary keys are (player_name, game_index) tuples; they
    become the first two columns of the result.  The values become
    the raw stat columns.  For every in-match stat (everything except
    seed, ht, age, rank and rank_points) an expanding-window column
    and a moving-average column are appended as well; `window` is
    passed on to the moving-average helper.
    """

    # Stats that describe the player rather than the match; these get
    # no expanding-window / moving-average columns.
    descriptive_stats = ['seed', 'ht', 'age', 'rank', 'rank_points']
    # Stats recorded during the match itself.
    in_match_stats = ['ace', 'df', 'svpt', '1stIn', '1stWon',
                      '2ndWon', 'SvGms', 'bpSaved', 'bpFaced']

    # One row per dictionary entry; split the tuple keys into two
    # index levels and then move those levels into ordinary columns.
    player_df = pd.DataFrame.from_dict(player_dict, orient='index')
    player_df.index = pd.MultiIndex.from_tuples(player_df.index)
    player_df = player_df.reset_index()
    player_df.columns = (['player_name', 'game_index']
                         + descriptive_stats + in_match_stats)

    # Add the two "stats over time" columns for each in-match stat.
    for stat in in_match_stats:
        player_df = add_expanding_windows(player_df, stat)
        player_df = add_moving_averages(player_df, stat, window=window)

    return player_df

In [8]:
def create_player_df(beginning_year=1991, ending_year=2017, window=20):
    """
    THIS IS A MAIN FUNCTION.

    This function creates player_df.
    This is a pandas DataFrame which contains player stats over time.
    Each row in the DataFrame is a "player-game," corresponding to a
    single player's in-game stats for every game in the data.
    Each game therefore is represented by two rows, one for each player.
    The DataFrame contains raw stats as well as "stats over time."
    "Stats over time" includes two evolving "averages" of each stat,
    one computed with a moving window (size `window`, default 20) and
    the other computed with an expanding window.  My aim is to use
    these computed stats as an estimation of a player's ability, and
    ultimately as predictors in the model, because in-game stats cannot
    be known ex-ante, and they are very noisy for a given player anyways.

    Parameters
    ----------
    beginning_year, ending_year : first and last year (inclusive) of
        match data to load.
    window : size of the moving-average window.
    """

    # Grab the data, read into DF, handle some data issues and convert tourney_date to datetime.
    # Pass the requested years through: they used to be hard-coded to
    # 1991-2017, which ignored the arguments and produced game indices that
    # do not match a frame loaded for any other year range.
    df = read_data_into_df(beginning_year=beginning_year, ending_year=ending_year)

    # Create a dictionary of players' in-game stats, indexed by (player_name, game_id)
    player_dict = create_player_dictionary(df)

    # Feed dictionary into a new pandas DataFrame, player_df.
    player_df = dictionary_to_df(player_dict, window=window)

    return player_df

In [9]:
def create_labels(df):
    """
    Draw one random 0/1 label per row of df.

    Returns a single-column DataFrame with the same length as df.
    A label of 1 marks a match in which player 1 is the winner;
    a label of 0 marks the opposite.  The draws come from numpy's
    global random state.
    """

    n_matches = len(df)
    draws = np.random.choice(2, n_matches)
    return pd.DataFrame(draws)

In [10]:
def create_win_df(df, labels):
    """
    Pair every match winner with that match's label.

    Takes the 'winner_name' column of df, places `labels` beside
    it, and moves the index into a column.  Returns a DataFrame
    with the columns game_index, player_name and label.
    """

    winners = pd.concat([df['winner_name'], labels], axis=1).reset_index()
    winners.columns = ['game_index', 'player_name', 'label']

    return winners

In [11]:
def create_lose_df(df, labels):
    """
    Pair every match loser with the opposite of that match's label.

    Takes the 'loser_name' column of df and places the flipped
    labels beside it (1 where `labels` is 0, and 0 elsewhere), then
    moves the index into a column.  Returns a DataFrame with the
    columns game_index, player_name and label.
    """

    # Start from all zeros and switch on the positions where the
    # winner's label is 0.
    flipped = pd.DataFrame(np.zeros(len(labels)))
    winner_has_zero = labels == 0
    flipped[winner_has_zero] = 1
    flipped = flipped.astype(int)

    losers = pd.concat([df['loser_name'], flipped], axis=1).reset_index()
    losers.columns = ['game_index', 'player_name', 'label']

    return losers

In [12]:
def players_stack(df, labels, beginning_year, ending_year, window):
    """
    Build the long DataFrame of players: winners stacked on top of losers.

    Each half starts as (game_index, player_name, label) and is then
    left-joined with player_df on player_name and game_index, which
    attaches the in-match stats together with the moving-average and
    expanding-window stats.  The stacked result gets a fresh
    0..n-1 index.

    beginning_year, ending_year and window are forwarded to
    create_player_df.
    """

    # Winners first, losers second; the losers carry the reversed labels.
    halves = [create_win_df(df, labels), create_lose_df(df, labels)]
    player_df = create_player_df(beginning_year, ending_year, window)

    # Attach every player's stats for the match in question.
    join_keys = ['player_name', 'game_index']
    with_stats = [
        pd.merge(half, player_df, how='left', on=join_keys)
        for half in halves
    ]

    players = pd.concat(with_stats, axis=0)
    return players.reset_index(drop=True)

In [13]:
def create_player1_df(players):
    """
    Extract the "player 1" side of every match.

    Keeps the rows of `players` whose label is 1, drops the label
    column, renames the remaining 34 columns (game_index stays as
    is; everything else gets a "player1_" prefix), and returns the
    rows ordered by game_index under a fresh 0..n-1 index.
    """

    descriptive = ['seed', 'ht', 'age', 'rank', 'rank_points']
    in_match = ['ace', 'df', 'svpt', '1stIn', '1stWon',
                '2ndWon', 'SvGms', 'bpSaved', 'bpFaced']
    # Each in-match stat is followed by its expanding-window column and
    # then its moving-average column.
    over_time = [stat + tag for stat in in_match for tag in ('_expw', '_mavg')]

    new_names = ['game_index'] + [
        'player1_' + field
        for field in ['name'] + descriptive + in_match + over_time
    ]

    is_player1 = players['label'] == 1
    player1_df = players[is_player1].drop('label', axis=1).reset_index(drop=True)
    player1_df.columns = new_names

    return player1_df.sort_values('game_index').reset_index(drop=True)

In [14]:
def create_player2_df(players):
    """
    Extract the "player 2" side of every match.

    Keeps the rows of `players` whose label is 0, drops the label
    column, and renames the remaining columns with a "player2_"
    prefix.  The rows are ordered by game_index, after which the
    game_index column itself is removed and the index is reset to
    0..n-1.
    """

    descriptive = ['seed', 'ht', 'age', 'rank', 'rank_points']
    in_match = ['ace', 'df', 'svpt', '1stIn', '1stWon',
                '2ndWon', 'SvGms', 'bpSaved', 'bpFaced']
    # Each in-match stat is followed by its expanding-window column and
    # then its moving-average column.
    over_time = [stat + tag for stat in in_match for tag in ('_expw', '_mavg')]

    new_names = ['game_index'] + [
        'player2_' + field
        for field in ['name'] + descriptive + in_match + over_time
    ]

    is_player2 = players['label'] == 0
    player2_df = players[is_player2].drop('label', axis=1).reset_index(drop=True)
    player2_df.columns = new_names

    # game_index is only needed to put the rows in match order.
    player2_df = player2_df.sort_values('game_index').drop('game_index', axis=1)

    return player2_df.reset_index(drop=True)

In [15]:
def grow_game_df(game_df, player1_df, player2_df, labels):
    """
    Assemble one wide row per match.

    Places game_df, player1_df and player2_df side by side (in that
    order), adds `labels` as a column named 'label', and returns the
    combined DataFrame.
    """

    side_by_side = [game_df, player1_df, player2_df]
    combined = pd.concat(side_by_side, axis=1)
    combined['label'] = labels

    return combined

In [16]:
def add_delta_stats(df):
    """
    Add "delta" features: player1's value minus player2's value.

    A difference column is added for seed, ht, age, rank and
    rank_points, then for the moving-average version of every
    in-match stat, then for the expanding-window version.  Each new
    column is named "delta_<stat>", e.g. delta_rank or
    delta_ace_mavg.  df is modified in place and returned.
    """

    in_match = ['ace', 'df', 'svpt', '1stIn', '1stWon',
                '2ndWon', 'SvGms', 'bpSaved', 'bpFaced']

    # Stat names shared by both players, without the player prefix.
    shared_stats = ['seed', 'ht', 'age', 'rank', 'rank_points']
    shared_stats += [stat + '_mavg' for stat in in_match]
    shared_stats += [stat + '_expw' for stat in in_match]

    for stat in shared_stats:
        df['delta_' + stat] = df['player1_' + stat] - df['player2_' + stat]

    return df

In [17]:
def create_game_df(beginning_year=1991, ending_year=2017, window=20):
    """
    THIS IS A MAIN FUNCTION.

    Build the DataFrame of matches, one row per match.
    Every match is randomly assigned a label: 1 means the
    match was won by player 1, 0 means it was won by player 2.

    Columns include match-level data (surface, tourney_level,
    tourney_date, tourney_name), each player's in-match,
    moving-average and expanding-window stats, the label, and
    the player1-minus-player2 "delta" stats.

    This is the DataFrame that will ultimately be fed
    into the sklearn classification models for fitting.
    """

    # Load the raw match data for the requested years.
    matches = read_data_into_df(beginning_year=beginning_year,
                                ending_year=ending_year)

    # One random label per match.  The same labels decide which player
    # is "player 1" and become the final label column.
    labels = create_labels(matches)

    # Long DataFrame of players: winners stacked on top of losers.
    players = players_stack(matches, labels, beginning_year, ending_year, window)

    # Start from the match-level columns and attach the player 1 and
    # player 2 columns plus the label.
    match_columns = ['surface', 'tourney_level', 'tourney_date', 'tourney_name']
    game_df = grow_game_df(
        matches[match_columns],
        create_player1_df(players),
        create_player2_df(players),
        labels,
    )

    # Finally add the player1-minus-player2 difference columns.
    return add_delta_stats(game_df)

In [18]:
# Build the full game-level DataFrame using the default arguments
# (years 1991 through 2017, moving-average window of 20).
game_df = create_game_df()

In [19]:
# picklefile = 'tennis_data.pkl'
# with open(picklefile, 'wb') as f_obj:
#     pickle.dump(game_df, f_obj)