In [1]:
import pandas as pd
import numpy as np
import pickle
%matplotlib inline

In [2]:
# Helper Function to add moving avg column to DF

def add_moving_averages(df, column, window=20):
    """
    Create new feautre using pandas rolling window function.
    Basially create a moving average.
    """
    new_name = str(column) + "_mavg"
    if new_name[0] == 'w':
        new_column = df.groupby('winner_name')[column].rolling(window=20, min_periods=2).mean()
        df[new_name] = new_column.reset_index(level=0, drop=True)
    elif new_name[0] == 'l':
        new_column = df.groupby('loser_name')[column].rolling(window=20, min_periods=2).mean()
        df[new_name] = new_column.reset_index(level=0, drop=True)
    return df

In [3]:
# Helper Function to add cumulative avg column to DF

def add_expanding_windows(df, column):
    """
    Create new feature using pandas expanding window function.
    Basically a cumulative average.
    """
    new_name = str(column) + "_expw"
    if new_name[0] == 'w':
        new_column = df.groupby('winner_name')[column].expanding().mean()
        df[new_name] = new_column.reset_index(level=0, drop=True)
    elif new_name[0] == 'l':
        new_column = df.groupby('loser_name')[column].expanding().mean()
        df[new_name] = new_column.reset_index(level=0, drop=True)
    return df

In [4]:
def create_dataframe(mavg_window):
    """
    Create a dataframe using the passed window value for the mavg attributes.
    """
    
    # Create initial DF by pulling data.

    filestem = 'https://raw.githubusercontent.com/JeffSackmann/tennis_atp/master/atp_matches_{}.csv'
    c = ['tourney_id', 'tourney_name', 'surface', 'draw_size', 'tourney_level',
           'tourney_date', 'match_num', 'winner_id', 'winner_seed', 'winner_entry',
           'winner_name', 'winner_hand', 'winner_ht', 'winner_ioc', 'winner_age',
           'winner_rank', 'winner_rank_points', 'loser_id', 'loser_seed',
           'loser_entry', 'loser_name', 'loser_hand', 'loser_ht', 'loser_ioc',
           'loser_age', 'loser_rank', 'loser_rank_points', 'score', 'best_of',
           'round', 'minutes', 'w_ace', 'w_df', 'w_svpt', 'w_1stIn', 'w_1stWon',
           'w_2ndWon', 'w_SvGms', 'w_bpSaved', 'w_bpFaced', 'l_ace', 'l_df',
           'l_svpt', 'l_1stIn', 'l_1stWon', 'l_2ndWon', 'l_SvGms', 'l_bpSaved',
           'l_bpFaced']
    dlist = []


    for i in range(2016, 2018):
        filename = filestem.format(i)
        d = pd.read_csv(filename)
        if i == 2017:
            d = d.drop(['l_bpSaved', 'l_bpFaced'], axis=1).reset_index()
            d.columns = c            
        dlist.append(d)
    df = pd.concat(dlist)
    df = df.reset_index(drop=True)

    # Enlarge DF Feature space by adding moving averages to all match-level stats.

    match_stats = ['w_ace', 'w_df', 'w_svpt', 'w_1stIn', 'w_1stWon',
           'w_2ndWon', 'w_SvGms', 'w_bpSaved', 'w_bpFaced', 'l_ace', 'l_df',
           'l_svpt', 'l_1stIn', 'l_1stWon', 'l_2ndWon', 'l_SvGms', 'l_bpSaved',
           'l_bpFaced']
    for stat in match_stats:
        df = add_moving_averages(df, stat, mavg_window)
        df = add_expanding_windows(df, stat)

    # Randomly choose labels from {0, 1}
    labels = np.random.choice(2, len(df))
    labels

    winner_stats = ['winner_seed', 'winner_name', 'winner_hand', 'winner_ht', 'winner_age',
           'winner_rank', 'winner_rank_points', 'w_ace', 'w_df', 'w_svpt', 'w_1stIn', 'w_1stWon',
           'w_2ndWon', 'w_SvGms', 'w_bpSaved', 'w_bpFaced']

    loser_stats = [ 'loser_seed', 'loser_name', 'loser_hand', 'loser_ht',
           'loser_age', 'loser_rank', 'loser_rank_points', 'l_ace', 'l_df',
           'l_svpt', 'l_1stIn', 'l_1stWon', 'l_2ndWon', 'l_SvGms', 'l_bpSaved',
           'l_bpFaced']

    global_stats = ['surface', 'tourney_level', 'tourney_date']

    # Create main_df, the starting df.
    main_df = df.copy(deep=True)

    # Create random labels, to be used later.
    labels = pd.DataFrame(np.random.choice(2, len(main_df)))

    # Create win_df containing just the index, winners, and winner stats.

    # List of all columns to go in the win_df
    winner_stats = ['winner_seed', 'winner_name', 'winner_hand', 'winner_ht', 'winner_age',
           'winner_rank', 'winner_rank_points', 'w_ace', 'w_df', 'w_svpt', 'w_1stIn', 'w_1stWon',
           'w_2ndWon', 'w_SvGms', 'w_bpSaved', 'w_bpFaced', 'w_ace_mavg', 'w_df_mavg', 'w_svpt_mavg',
            'w_1stIn_mavg', 'w_1stWon_mavg', 'w_2ndWon_mavg', 'w_SvGms_mavg', 'w_bpSaved_mavg',
           'w_bpFaced_mavg']

    win_df = main_df[winner_stats]
    win_df = pd.concat([win_df, labels], axis=1)
    win_df.columns = winner_stats + ['label']

    # Create lose_df containing just the index, losers, and loser stats.

    # List of all columns to go in the lose_df.
    loser_stats = [ 'loser_seed', 'loser_name', 'loser_hand', 'loser_ht',
           'loser_age', 'loser_rank', 'loser_rank_points', 'l_ace', 'l_df',
           'l_svpt', 'l_1stIn', 'l_1stWon', 'l_2ndWon', 'l_SvGms', 'l_bpSaved',
           'l_bpFaced', 'l_ace_mavg', 'l_df_mavg', 'l_svpt_mavg',
           'l_1stIn_mavg', 'l_1stWon_mavg', 'l_2ndWon_mavg', 'l_SvGms_mavg',
           'l_bpSaved_mavg', 'l_bpFaced_mavg']

    lose_df = main_df[loser_stats]

    # Create "reverse labels" to concat onto lose_df.

    reverse_labels = pd.DataFrame(np.zeros(len(labels)))
    reverse_labels[labels == 0] = 1
    reverse_labels = reverse_labels.astype(int)

    # Concat to finish creating lose_df.
    lose_df = pd.concat([lose_df, reverse_labels], axis=1)
    lose_df.columns = winner_stats + ['label']

    # Create player_df, with wins stacked atop losses.
    player_df = pd.concat([lose_df, win_df], axis=0)

    # Create games_df, in its starting incarnation.
    # Just index of games and game information.

    # Game stats to go into game_df.
    game_stats = ['surface', 'tourney_level', 'tourney_date']

    game_df = main_df[game_stats]

    # Create player1_df, just the subset of player_df with label == 1
    player1_df = player_df[player_df.label == 1]
    player1_df = player1_df.drop('label', axis=1)
    player1_df = player1_df.reset_index(drop=True)

    player1_stats = ['player1_seed', 'player1_name', 'player1_hand', 'player1_ht', 'player1_age',
           'player1_rank', 'player1_rank_points', 'player1_ace', 'player1_df', 'player1_svpt', 'player1_1stIn', 'player1_1stWon',
           'player1_2ndWon', 'player1_SvGms', 'player1_bpSaved', 'player1_bpFaced', 'player1_ace_mavg', 'player1_df_mavg', 'player1_svpt_mavg',
            'player1_1stIn_mavg', 'player1_1stWon_mavg', 'player1_2ndWon_mavg', 'player1_SvGms_mavg', 'player1_bpSaved_mavg',
           'player1_bpFaced_mavg']
    player1_df.columns = player1_stats

    # Create player2_df, just the subset of player_df with label == 0
    player2_df = player_df[player_df.label == 0]
    player2_df = player2_df.drop('label', axis=1)
    player2_df = player2_df.reset_index(drop=True)

    player2_stats = ['player2_seed', 'player2_name', 'player2_hand', 'player2_ht', 'player2_age',
           'player2_rank', 'player2_rank_points', 'player2_ace', 'player2_df', 'player2_svpt', 'player2_1stIn', 'player2_1stWon',
           'player2_2ndWon', 'player2_SvGms', 'player2_bpSaved', 'player2_bpFaced', 'player2_ace_mavg', 'player2_df_mavg', 'player2_svpt_mavg',
            'player2_1stIn_mavg', 'player2_1stWon_mavg', 'player2_2ndWon_mavg', 'player2_SvGms_mavg', 'player2_bpSaved_mavg',
           'player2_bpFaced_mavg']
    player2_df.columns = player2_stats

    # Create final df by merging onto game_df.
    game_df = pd.concat([game_df, player1_df, player2_df], axis=1)

    game_df['label'] = labels

    return game_df

In [5]:
a = create_dataframe(1)

In [6]:
a.head()

Unnamed: 0,surface,tourney_level,tourney_date,player1_seed,player1_name,player1_hand,player1_ht,player1_age,player1_rank,player1_rank_points,...,player2_ace_mavg,player2_df_mavg,player2_svpt_mavg,player2_1stIn_mavg,player2_1stWon_mavg,player2_2ndWon_mavg,player2_SvGms_mavg,player2_bpSaved_mavg,player2_bpFaced_mavg,label
0,Hard,A,20160104,1.0,Roger Federer,R,185.0,34.406571,3.0,8265.0,...,,,,,,,,,,0
1,Hard,A,20160104,8.0,Dominic Thiem,R,,22.335387,20.0,1600.0,...,,,,,,,,,,0
2,Hard,A,20160104,3.0,Marin Cilic,R,198.0,27.266256,13.0,2405.0,...,,,,,,,,,,1
3,Hard,A,20160104,,Denis Kudla,R,180.0,23.381246,69.0,719.0,...,,,,,,,,,,1
4,Hard,A,20160104,6.0,David Goffin,R,163.0,25.075975,16.0,1880.0,...,,,,,,,,,,0


In [7]:
# picklefile = '2016_2017_season.pkl'
# with open(picklefile, 'wb') as f_obj:
#     pickle.dump(a, f_obj)