In [19]:
import pandas as pd
import numpy as np
import pickle
%matplotlib inline

This code is a revised version of the earlier one that creates the game dataframe.

This does the same thing, but pulls in a player's mavg and expw stats intelligently --
the earlier version had the problem of artificially splitting winners and losers.

The big function at the end returns game_df and stats_database.

game_df is a df of all the games, with the two players randomly assigned to player1 and player2, plus the label and all corresponding stats.

stats_database is a df with about 17k player-rows, two rows for each game.

In [20]:
# Let wide frames show up to 300 columns when displayed in the notebook.
pd.options.display.max_columns = 300

In [21]:
def add_moving_averages(df, column, window=20):
    """
    Add a per-player moving average of ``column`` to ``df``.

    Rows are grouped by ``player_name`` and a rolling mean is computed over
    each player's rows, in the order they appear in ``df``.  The result is
    stored in a new column named ``<column>_mavg``.  A value is only
    produced once a player has at least two observations in the window
    (one observation when ``window`` is 1); earlier rows are NaN.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a ``player_name`` column and ``column``.
        The frame is modified in place.
    column : str
        Name of the stat column to average.
    window : int, default 20
        Number of most recent rows (per player) included in the average.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, with the ``<column>_mavg`` column added.
    """
    new_name = str(column) + "_mavg"
    # pandas rejects min_periods > window, so cap it for a window of 1.
    min_periods = min(2, window)
    new_column = (
        df.groupby('player_name')[column]
        .rolling(window=window, min_periods=min_periods)
        .mean()
    )
    # The grouped result is indexed by (player_name, original row label);
    # dropping the player level lets the assignment align on df's own index.
    df[new_name] = new_column.reset_index(level=0, drop=True)
    return df

In [22]:
def add_expanding_windows(df, column):
    """
    Attach a per-player cumulative average of ``column`` to ``df``.

    For every row, the new ``<column>_expw`` value is the mean of that
    player's ``column`` values from their first row up to and including
    the current row.  ``df`` is modified in place and also returned.
    """
    per_player = df.groupby('player_name')[column]
    running_mean = per_player.expanding().mean()
    # Strip the player_name index level so values line up with df's rows.
    aligned = running_mean.reset_index(level=0, drop=True)
    df["{}_expw".format(column)] = aligned
    return df

In [23]:
def create_player_df():
    """
    Build ``player_df``, a DataFrame of per-player, per-game stats.

    Each row is a "player-game": one player's in-game stats for one game,
    so every game contributes two rows (winner and loser).  Besides the raw
    stats, two evolving averages are added for each stat:

    * ``<stat>_expw`` -- expanding-window (cumulative) mean, and
    * ``<stat>_mavg`` -- moving-window mean (default window of 20).

    The aim is to use these computed stats as an estimate of a player's
    ability, and ultimately as predictors in the model, because in-game
    stats cannot be known ex-ante and are very noisy for a given player.

    The match files for 1991-2017 are downloaded from the JeffSackmann
    ``tennis_atp`` GitHub repository, so network access is required.

    Possible extensions: add parameters for data years and moving-average
    window size.

    Returns
    -------
    pandas.DataFrame
        Columns ``player_name``, ``game_index``, the nine raw stats, and
        the ``_expw`` / ``_mavg`` columns for each stat.
    """

    # Grab the data, read into DF, handle some data issues and convert tourney_date to datetime.

    filestem = 'https://raw.githubusercontent.com/JeffSackmann/tennis_atp/master/atp_matches_{}.csv'
    c = ['tourney_id', 'tourney_name', 'surface', 'draw_size', 'tourney_level',
           'tourney_date', 'match_num', 'winner_id', 'winner_seed', 'winner_entry',
           'winner_name', 'winner_hand', 'winner_ht', 'winner_ioc', 'winner_age',
           'winner_rank', 'winner_rank_points', 'loser_id', 'loser_seed',
           'loser_entry', 'loser_name', 'loser_hand', 'loser_ht', 'loser_ioc',
           'loser_age', 'loser_rank', 'loser_rank_points', 'score', 'best_of',
           'round', 'minutes', 'w_ace', 'w_df', 'w_svpt', 'w_1stIn', 'w_1stWon',
           'w_2ndWon', 'w_SvGms', 'w_bpSaved', 'w_bpFaced', 'l_ace', 'l_df',
           'l_svpt', 'l_1stIn', 'l_1stWon', 'l_2ndWon', 'l_SvGms', 'l_bpSaved',
           'l_bpFaced']
    dlist = []


    for i in range(1991, 2018):
        filename = filestem.format(i)
        d = pd.read_csv(filename)
        if i == 2017:
            # NOTE(review): presumably a workaround for a misaligned header in
            # the 2017 file at the time of writing -- confirm it is still
            # needed against the current upstream CSV.
            d = d.drop(['l_bpSaved', 'l_bpFaced'], axis=1).reset_index()
            d.columns = c
        dlist.append(d)
    df = pd.concat(dlist)
    # Give every game a unique integer index across all years.
    df = df.reset_index(drop=True)
    df = df.drop(['draw_size', 'best_of', 'match_num', 'minutes', 'winner_id', 'loser_id'], axis=1)
    # %m is the month directive; %M would parse the middle digits as minutes.
    df['tourney_date'] = pd.to_datetime(df['tourney_date'], format='%Y%m%d')


    # Create a dictionary of players' in-game stats, indexed by (player_name, game_id)

    player_dict = {}

    winner_stats = ['w_ace', 'w_df', 'w_svpt', 'w_1stIn', 'w_1stWon',
           'w_2ndWon', 'w_SvGms', 'w_bpSaved', 'w_bpFaced']

    loser_stats = ['l_ace', 'l_df', 'l_svpt', 'l_1stIn', 'l_1stWon',
                   'l_2ndWon', 'l_SvGms', 'l_bpSaved', 'l_bpFaced']

    player_stats = ['ace', 'df', 'svpt', '1stIn', '1stWon',
           '2ndWon', 'SvGms', 'bpSaved', 'bpFaced']

    for index, row in df.iterrows():
        # Lists (not sets) keep one value per stat, in the same order as
        # player_stats, so the columns assigned below line up correctly.
        player_dict[(row['winner_name'], index)] = [row[stat] for stat in winner_stats]
        player_dict[(row['loser_name'], index)] = [row[stat] for stat in loser_stats]


    # Feed dictionary into a new pandas DataFrame, player_df.

    player_df = pd.DataFrame.from_dict(player_dict, orient='index')
    player_df.index = pd.MultiIndex.from_tuples(player_df.index)
    player_df = player_df.reset_index()
    player_df.columns = ['player_name','game_index'] + player_stats


    # At this point, every row in the DataFrame represents a "player-game."
    # Columns are the stats.
    # Values in each row correspond to a player's in-game stats for the particular game.
    # For example, game 0 has two rows: one for each player that participated in game 0.
    # Next step is to calculate moving-average and expanding-window stats.
    # These are computed using the helper functions defined above.
    # Because we can't use in-game stats to predict a future game, the moving_average
    # and expanding_window stats are (hopefully) a good estimation of a player's talent.
    # These stats will ultimately serve as my predictive features since they can be computed
    # in advance of any given game.

    for stat in player_stats:
        player_df = add_expanding_windows(player_df, stat)
        player_df = add_moving_averages(player_df, stat)

    return player_df

In [28]:
def _select_side(player_df, label, prefix):
    """Return the rows of player_df carrying `label`, with stat columns prefixed, ordered by game."""
    side = player_df[player_df.label == label].drop('label', axis=1)
    new_names = {
        col: prefix + '_' + col
        for col in side.columns
        if col not in ('game_index', 'player_name')
    }
    new_names['player_name'] = prefix + '_name'
    side = side.rename(columns=new_names)
    # One row per game on each side, so sorting by game_index lines the
    # two sides (and the game-level frame) up row for row.
    return side.sort_values('game_index').reset_index(drop=True)


def create_game_df_and_player_df(seed=None):
    """
    Build the game-level table ``game_df`` and the player-level ``stat_database``.

    The 1991-2017 match files are downloaded from the JeffSackmann
    ``tennis_atp`` GitHub repository (network access required).  For every
    game, the winner is randomly assigned to be either player1 or player2,
    and the loser takes the other slot.  Each player's stats for that game
    are then pulled from ``create_player_df()`` by (player_name, game_index).

    Note that ``create_player_df()`` downloads the same files again.

    Parameters
    ----------
    seed : int or None, default None
        When given, the player1/player2 assignment is drawn from a
        ``numpy.random.RandomState(seed)`` so the result is reproducible.
        When None, the global ``numpy.random`` state is used.

    Returns
    -------
    game_df : pandas.DataFrame
        One row per game with ``surface``, ``tourney_level``,
        ``tourney_date`` (as read from the CSV), ``game_index``, the
        ``player1_*`` and ``player2_*`` columns, and ``label``
        (1 when player1 is the match winner, 0 otherwise).
    stat_database : pandas.DataFrame
        The frame returned by ``create_player_df()``.
    """
    filestem = 'https://raw.githubusercontent.com/JeffSackmann/tennis_atp/master/atp_matches_{}.csv'
    c = ['tourney_id', 'tourney_name', 'surface', 'draw_size', 'tourney_level',
               'tourney_date', 'match_num', 'winner_id', 'winner_seed', 'winner_entry',
               'winner_name', 'winner_hand', 'winner_ht', 'winner_ioc', 'winner_age',
               'winner_rank', 'winner_rank_points', 'loser_id', 'loser_seed',
               'loser_entry', 'loser_name', 'loser_hand', 'loser_ht', 'loser_ioc',
               'loser_age', 'loser_rank', 'loser_rank_points', 'score', 'best_of',
               'round', 'minutes', 'w_ace', 'w_df', 'w_svpt', 'w_1stIn', 'w_1stWon',
               'w_2ndWon', 'w_SvGms', 'w_bpSaved', 'w_bpFaced', 'l_ace', 'l_df',
               'l_svpt', 'l_1stIn', 'l_1stWon', 'l_2ndWon', 'l_SvGms', 'l_bpSaved',
               'l_bpFaced']
    dlist = []

    for i in range(1991, 2018):
        filename = filestem.format(i)
        d = pd.read_csv(filename)
        if i == 2017:
            # NOTE(review): presumably a workaround for a misaligned header in
            # the 2017 file at the time of writing -- confirm it is still
            # needed against the current upstream CSV.
            d = d.drop(['l_bpSaved', 'l_bpFaced'], axis=1).reset_index()
            d.columns = c
        dlist.append(d)
    df = pd.concat(dlist)
    # Same 0..n-1 game index that create_player_df() uses as game_index.
    df = df.reset_index(drop=True)

    # Coin flip per game: 1 puts the winner in the player1 slot, 0 in player2.
    rng = np.random if seed is None else np.random.RandomState(seed)
    labels = pd.DataFrame(rng.choice(2, len(df)))

    win_df = pd.concat([df['winner_name'], labels], axis=1)
    win_df.columns = ['winner_name', 'label']

    # The loser always gets the opposite label of the winner.
    reverse_labels = 1 - labels
    lose_df = pd.concat([df['loser_name'], reverse_labels], axis=1)
    lose_df.columns = ['loser_name', 'label']

    stat_database = create_player_df()

    # Expose the row index as game_index so it can be used as a merge key.
    win_df = win_df.reset_index()
    win_df.columns = ['game_index', 'player_name', 'label']
    win_df = pd.merge(win_df, stat_database, how='left', on=['player_name', 'game_index'])

    lose_df = lose_df.reset_index()
    lose_df.columns = ['game_index', 'player_name', 'label']
    lose_df = pd.merge(lose_df, stat_database, how='left', on=['player_name', 'game_index'])

    player_df = pd.concat([win_df, lose_df], axis=0)
    player_df = player_df.reset_index(drop=True)

    game_stats = ['surface', 'tourney_level', 'tourney_date']
    game_df = df[game_stats]

    # Label 1 rows become player1, label 0 rows become player2.
    player1_df = _select_side(player_df, 1, 'player1')
    player2_df = _select_side(player_df, 0, 'player2')
    # game_index is already carried by the player1 side.
    player2_df = player2_df.drop('game_index', axis=1)

    game_df = pd.concat([game_df, player1_df, player2_df], axis=1)
    game_df['label'] = labels[0]

    return game_df, stat_database

In [29]:
# Build both tables in one call. This downloads the yearly match CSVs,
# so it needs network access.
game_df, stats_database = create_game_df_and_player_df()

In [30]:
# picklefile = 'game_df_oct_18.pkl'
# with open(picklefile, 'wb') as f_obj:
#     pickle.dump(game_df, f_obj)