In [1]:
import pandas as pd
import numpy as np
from sklearn.svm import SVC
import matplotlib.pyplot as plt
import pickle
%matplotlib inline

In [2]:
pd.options.display.max_columns = 50

In [3]:
# Clean up the DF: column headers and rows are misaligned for some reason.

filename = 'https://raw.githubusercontent.com/JeffSackmann/tennis_atp/master/atp_matches_2017.csv'
df = pd.read_csv(filename)
c = ['tourney_id', 'tourney_name', 'surface', 'draw_size', 'tourney_level',
       'tourney_date', 'match_num', 'winner_id', 'winner_seed', 'winner_entry',
       'winner_name', 'winner_hand', 'winner_ht', 'winner_ioc', 'winner_age',
       'winner_rank', 'winner_rank_points', 'loser_id', 'loser_seed',
       'loser_entry', 'loser_name', 'loser_hand', 'loser_ht', 'loser_ioc',
       'loser_age', 'loser_rank', 'loser_rank_points', 'score', 'best_of',
       'round', 'minutes', 'w_ace', 'w_df', 'w_svpt', 'w_1stIn', 'w_1stWon',
       'w_2ndWon', 'w_SvGms', 'w_bpSaved', 'w_bpFaced', 'l_ace', 'l_df',
       'l_svpt', 'l_1stIn', 'l_1stWon', 'l_2ndWon', 'l_SvGms', 'l_bpSaved',
       'l_bpFaced']
df = df.drop(['l_bpSaved', 'l_bpFaced'], axis=1).reset_index()
df.columns = c
df.shape

(2400, 49)

In [4]:
# Find which rows have the index problem in the big dataframe.

bad_indices = []
good_indices = []
for i in df.index:
    try:
        int(i)
    except:
        bad_indices.append(i)
    else:
        good_indices.append(i)
        
len(bad_indices)

0

In [5]:
filestem = 'https://raw.githubusercontent.com/JeffSackmann/tennis_atp/master/atp_matches_{}.csv'
c = ['tourney_id', 'tourney_name', 'surface', 'draw_size', 'tourney_level',
       'tourney_date', 'match_num', 'winner_id', 'winner_seed', 'winner_entry',
       'winner_name', 'winner_hand', 'winner_ht', 'winner_ioc', 'winner_age',
       'winner_rank', 'winner_rank_points', 'loser_id', 'loser_seed',
       'loser_entry', 'loser_name', 'loser_hand', 'loser_ht', 'loser_ioc',
       'loser_age', 'loser_rank', 'loser_rank_points', 'score', 'best_of',
       'round', 'minutes', 'w_ace', 'w_df', 'w_svpt', 'w_1stIn', 'w_1stWon',
       'w_2ndWon', 'w_SvGms', 'w_bpSaved', 'w_bpFaced', 'l_ace', 'l_df',
       'l_svpt', 'l_1stIn', 'l_1stWon', 'l_2ndWon', 'l_SvGms', 'l_bpSaved',
       'l_bpFaced']
dlist = []


for i in range(1991, 2018):
    filename = filestem.format(i)
    d = pd.read_csv(filename)
    if i == 2017:
        d = d.drop(['l_bpSaved', 'l_bpFaced'], axis=1).reset_index()
        d.columns = c            
    dlist.append(d)
df = pd.concat(dlist)
df = df.reset_index(drop=True);

In [6]:
df_copy = df.copy(deep=True)

In [7]:
new_column = df_copy.groupby('winner_name').w_ace.rolling(window=20, min_periods=2).mean()
df_copy['w_ace_mavg'] = new_column.reset_index(level=0, drop=True)

In [8]:
# Helper Functions to add moving avg column to DF

def add_moving_averages(df, column, window=20):
    new_name = str(column) + "_mavg"
    if new_name[0] == 'w':
        new_column = df.groupby('winner_name')[column].rolling(window=20, min_periods=2).mean()
        df[new_name] = new_column.reset_index(level=0, drop=True)
    elif new_name[0] == 'l':
        new_column = df.groupby('loser_name')[column].rolling(window=20, min_periods=2).mean()
        df[new_name] = new_column.reset_index(level=0, drop=True)
    return df

def add_expanding_windows(df, column):
    """
    Create new feature using pandas expanding window function.
    Basically a cumulative average.
    """
    new_name = str(column) + "_expw"
    if new_name[0] == 'w':
        new_column = df.groupby('winner_name')[column].expanding().mean()
        df[new_name] = new_column.reset_index(level=0, drop=True)
    elif new_name[0] == 'l':
        new_column = df.groupby('loser_name')[column].expanding().mean()
        df[new_name] = new_column.reset_index(level=0, drop=True)
    return df

In [9]:
# Enlarge DF Feature space by adding moving averages to all match-level stats.

match_stats = ['w_ace', 'w_df', 'w_svpt', 'w_1stIn', 'w_1stWon',
       'w_2ndWon', 'w_SvGms', 'w_bpSaved', 'w_bpFaced', 'l_ace', 'l_df',
       'l_svpt', 'l_1stIn', 'l_1stWon', 'l_2ndWon', 'l_SvGms', 'l_bpSaved',
       'l_bpFaced']
for stat in match_stats:
    df = add_moving_averages(df, stat)
    df = add_expanding_windows(df, stat)

In [10]:
# At this point, ready to create new DF that contains the labels to predict.
# Label=1 if Player 1 won, 0 otherwise.

In [11]:
# Randomly choose labels from {0, 1}
labels = np.random.choice(2, len(df))
labels

array([1, 1, 0, ..., 1, 0, 1])

In [12]:
df.columns

Index(['tourney_id', 'tourney_name', 'surface', 'draw_size', 'tourney_level',
       'tourney_date', 'match_num', 'winner_id', 'winner_seed', 'winner_entry',
       'winner_name', 'winner_hand', 'winner_ht', 'winner_ioc', 'winner_age',
       'winner_rank', 'winner_rank_points', 'loser_id', 'loser_seed',
       'loser_entry', 'loser_name', 'loser_hand', 'loser_ht', 'loser_ioc',
       'loser_age', 'loser_rank', 'loser_rank_points', 'score', 'best_of',
       'round', 'minutes', 'w_ace', 'w_df', 'w_svpt', 'w_1stIn', 'w_1stWon',
       'w_2ndWon', 'w_SvGms', 'w_bpSaved', 'w_bpFaced', 'l_ace', 'l_df',
       'l_svpt', 'l_1stIn', 'l_1stWon', 'l_2ndWon', 'l_SvGms', 'l_bpSaved',
       'l_bpFaced', 'w_ace_mavg', 'w_ace_expw', 'w_df_mavg', 'w_df_expw',
       'w_svpt_mavg', 'w_svpt_expw', 'w_1stIn_mavg', 'w_1stIn_expw',
       'w_1stWon_mavg', 'w_1stWon_expw', 'w_2ndWon_mavg', 'w_2ndWon_expw',
       'w_SvGms_mavg', 'w_SvGms_expw', 'w_bpSaved_mavg', 'w_bpSaved_expw',
       'w_bpFaced_mavg', 

# Plan that Sophie helped me come up with

7 iterations of DFs.
- main_df is a deep copy of the starting DF, from above.
- win_df is a df of just the winners, with true label.
- lose_df is a df of just the losers, with **reverse** label.
- player_df is a concat of win_df and lose_df, with win stacked atop lose.
- game_df is the stump DF that things will be appended to, which will ultimately form the final df.
- player_1 df is a df of all player 1's, selected based on random label.
- player_2 df is a df of all player 2's, selected based on random label.
- final_df results from first merging player_1 df to game_df, then merging player_2 df to game_df.

In [13]:
# Create main_df, the starting df.
main_df = df.copy(deep=True)

# Create random labels, to be used later.
labels = pd.DataFrame(np.random.choice(2, len(main_df)))

In [14]:
# Create win_df containing just the index, winners, and winner stats.

# List of all columns to go in the win_df
winner_stats = ['winner_seed', 'winner_name', 'winner_hand', 'winner_ht', 'winner_age',
       'winner_rank', 'winner_rank_points', 'w_ace', 'w_df', 'w_svpt', 'w_1stIn', 'w_1stWon',
       'w_2ndWon', 'w_SvGms', 'w_bpSaved', 'w_bpFaced', 'w_ace_mavg', 'w_df_mavg', 'w_svpt_mavg',
        'w_1stIn_mavg', 'w_1stWon_mavg', 'w_2ndWon_mavg', 'w_SvGms_mavg', 'w_bpSaved_mavg',
       'w_bpFaced_mavg', 'w_ace_expw', 'w_df_expw', 'w_svpt_expw',
        'w_1stIn_expw', 'w_1stWon_expw', 'w_2ndWon_expw', 'w_SvGms_expw', 'w_bpSaved_expw',
       'w_bpFaced_expw']

win_df = main_df[winner_stats]
win_df = pd.concat([win_df, labels], axis=1)
win_df.columns = winner_stats + ['label']

In [15]:
# Create lose_df containing just the index, losers, and loser stats.

# List of all columns to go in the lose_df.
loser_stats = [ 'loser_seed', 'loser_name', 'loser_hand', 'loser_ht',
       'loser_age', 'loser_rank', 'loser_rank_points', 'l_ace', 'l_df',
       'l_svpt', 'l_1stIn', 'l_1stWon', 'l_2ndWon', 'l_SvGms', 'l_bpSaved',
       'l_bpFaced', 'l_ace_mavg', 'l_df_mavg', 'l_svpt_mavg',
       'l_1stIn_mavg', 'l_1stWon_mavg', 'l_2ndWon_mavg', 'l_SvGms_mavg',
       'l_bpSaved_mavg', 'l_bpFaced_mavg', 'l_ace_expw', 'l_df_expw', 'l_svpt_expw',
        'l_1stIn_expw', 'l_1stWon_expw', 'l_2ndWon_expw', 'l_SvGms_expw', 'l_bpSaved_expw',
       'l_bpFaced_expw']

lose_df = main_df[loser_stats]

# Create "reverse labels" to concat onto lose_df.

reverse_labels = pd.DataFrame(np.zeros(len(labels)))
reverse_labels[labels == 0] = 1
reverse_labels = reverse_labels.astype(int)

# Concat to finish creating lose_df.
lose_df = pd.concat([lose_df, reverse_labels], axis=1)
lose_df.columns = winner_stats + ['label']

In [16]:
# Create player_df, with wins stacked atop losses.
player_df = pd.concat([lose_df, win_df], axis=0)

In [17]:
# Create games_df, in its starting incarnation.
# Just index of games and game information.

# Game stats to go into game_df.
game_stats = ['surface', 'tourney_level', 'tourney_date']

game_df = main_df[game_stats]

In [18]:
# Create player1_df, just the subset of player_df with label == 1
player1_df = player_df[player_df.label == 1]
player1_df = player1_df.drop('label', axis=1)
player1_df = player1_df.reset_index(drop=True)

player1_stats = ['player1_seed', 'player1_name', 'player1_hand', 'player1_ht', 'player1_age',
       'player1_rank', 'player1_rank_points', 'player1_ace', 'player1_df', 'player1_svpt', 'player1_1stIn', 'player1_1stWon',
       'player1_2ndWon', 'player1_SvGms', 'player1_bpSaved', 'player1_bpFaced', 'player1_ace_mavg', 'player1_df_mavg', 'player1_svpt_mavg',
        'player1_1stIn_mavg', 'player1_1stWon_mavg', 'player1_2ndWon_mavg', 'player1_SvGms_mavg', 'player1_bpSaved_mavg',
       'player1_bpFaced_mavg', 'player1_ace_expw', 'player1_df_expw', 'player1_svpt_expw',
        'player1_1stIn_expw', 'player1_1stWon_expw', 'player1_2ndWon_expw', 'player1_SvGms_expw', 'player1_bpSaved_expw',
       'player1_bpFaced_expw']
player1_df.columns = player1_stats

In [19]:
# Create player2_df, just the subset of player_df with label == 0
player2_df = player_df[player_df.label == 0]
player2_df = player2_df.drop('label', axis=1)
player2_df = player2_df.reset_index(drop=True)

player2_stats = ['player2_seed', 'player2_name', 'player2_hand', 'player2_ht', 'player2_age',
       'player2_rank', 'player2_rank_points', 'player2_ace', 'player2_df', 'player2_svpt', 'player2_1stIn', 'player2_1stWon',
       'player2_2ndWon', 'player2_SvGms', 'player2_bpSaved', 'player2_bpFaced', 'player2_ace_mavg', 'player2_df_mavg', 'player2_svpt_mavg',
        'player2_1stIn_mavg', 'player2_1stWon_mavg', 'player2_2ndWon_mavg', 'player2_SvGms_mavg', 'player2_bpSaved_mavg',
       'player2_bpFaced_mavg', 'player2_ace_expw', 'player2_df_expw', 'player2_svpt_expw',
        'player2_1stIn_expw', 'player2_1stWon_expw', 'player2_2ndWon_expw', 'player2_SvGms_expw', 'player2_bpSaved_expw',
       'player2_bpFaced_expw']
player2_df.columns = player2_stats

In [20]:
# Create final df by merging onto game_df.
game_df = pd.concat([game_df, player1_df, player2_df], axis=1)

In [21]:
# IT IS ALIVE
game_df.columns

Index(['surface', 'tourney_level', 'tourney_date', 'player1_seed',
       'player1_name', 'player1_hand', 'player1_ht', 'player1_age',
       'player1_rank', 'player1_rank_points', 'player1_ace', 'player1_df',
       'player1_svpt', 'player1_1stIn', 'player1_1stWon', 'player1_2ndWon',
       'player1_SvGms', 'player1_bpSaved', 'player1_bpFaced',
       'player1_ace_mavg', 'player1_df_mavg', 'player1_svpt_mavg',
       'player1_1stIn_mavg', 'player1_1stWon_mavg', 'player1_2ndWon_mavg',
       'player1_SvGms_mavg', 'player1_bpSaved_mavg', 'player1_bpFaced_mavg',
       'player1_ace_expw', 'player1_df_expw', 'player1_svpt_expw',
       'player1_1stIn_expw', 'player1_1stWon_expw', 'player1_2ndWon_expw',
       'player1_SvGms_expw', 'player1_bpSaved_expw', 'player1_bpFaced_expw',
       'player2_seed', 'player2_name', 'player2_hand', 'player2_ht',
       'player2_age', 'player2_rank', 'player2_rank_points', 'player2_ace',
       'player2_df', 'player2_svpt', 'player2_1stIn', 'player2_1stWon

In [22]:
# now just need to prune columns, take differences of stats/mavgs.

game_df.head()

Unnamed: 0,surface,tourney_level,tourney_date,player1_seed,player1_name,player1_hand,player1_ht,player1_age,player1_rank,player1_rank_points,player1_ace,player1_df,player1_svpt,player1_1stIn,player1_1stWon,player1_2ndWon,player1_SvGms,player1_bpSaved,player1_bpFaced,player1_ace_mavg,player1_df_mavg,player1_svpt_mavg,player1_1stIn_mavg,player1_1stWon_mavg,player1_2ndWon_mavg,...,player2_svpt,player2_1stIn,player2_1stWon,player2_2ndWon,player2_SvGms,player2_bpSaved,player2_bpFaced,player2_ace_mavg,player2_df_mavg,player2_svpt_mavg,player2_1stIn_mavg,player2_1stWon_mavg,player2_2ndWon_mavg,player2_SvGms_mavg,player2_bpSaved_mavg,player2_bpFaced_mavg,player2_ace_expw,player2_df_expw,player2_svpt_expw,player2_1stIn_expw,player2_1stWon_expw,player2_2ndWon_expw,player2_SvGms_expw,player2_bpSaved_expw,player2_bpFaced_expw
0,Clay,A,19910729,,Diego Perez,R,178.0,29.46475,201.0,153.0,0.0,2.0,40.0,22.0,9.0,6.0,6.0,7.0,12.0,,,,,,,...,34.0,17.0,6.0,6.0,6.0,1.0,6.0,,,,,,,,,,1.0,3.0,34.0,17.0,6.0,6.0,6.0,1.0,6.0
1,Clay,A,19910729,,Massimo Ardinghi,R,175.0,20.396988,257.0,92.0,0.0,2.0,44.0,18.0,6.0,11.0,7.0,2.0,7.0,,,,,,,...,54.0,42.0,15.0,4.0,8.0,6.0,13.0,,,,,,,,,,0.0,0.0,54.0,42.0,15.0,4.0,8.0,6.0,13.0
2,Clay,A,19910729,,Joao Cunha Silva,R,173.0,23.66872,141.0,253.0,1.0,4.0,67.0,35.0,24.0,11.0,10.0,2.0,6.0,,,,,,,...,69.0,46.0,29.0,9.0,11.0,3.0,7.0,,,,,,,,,,0.0,4.0,69.0,46.0,29.0,9.0,11.0,3.0,7.0
3,Clay,A,19910729,,Slava Dosedel,R,183.0,20.95551,140.0,259.0,3.0,10.0,102.0,60.0,38.0,18.0,15.0,4.0,9.0,,,,,,,...,49.0,21.0,12.0,11.0,8.0,1.0,6.0,,,,,,,,,,0.0,2.0,49.0,21.0,12.0,11.0,8.0,1.0,6.0
4,Clay,A,19910729,,Christian Miniussi,R,185.0,24.065708,161.0,206.0,1.0,6.0,59.0,25.0,13.0,12.0,8.0,4.0,10.0,,,,,,,...,55.0,31.0,12.0,9.0,7.0,5.0,11.0,,,,,,,,,,0.0,0.0,55.0,31.0,12.0,9.0,7.0,5.0,11.0


In [23]:
game_df['label'] = labels

In [24]:
picklefile = 'expw_oct17.pkl'
with open(picklefile, 'wb') as f_obj:
    pickle.dump(game_df, f_obj)

In [25]:
# Pickled at 12:42 Tuesday October 17th.