# TODO: 12/2

1. Fix Francis' matplotlib
2. Set up private github repository
3. Set up github workflow
4. Handle missing data
5. Add in glicko2 column
6. Switch the tuples of h2h_all and h2h_rec columns to reflect the perspectives of player1 and player2 (currently based on winner's point of view)
7. Ensemble of the lone Elo regression, lone Trueskill regression, lone glicko2 regression, likely using constrained non-negative coefficients: http://maggotroot.blogspot.com/2013/11/constrained-linear-least-squares-in.html
8. Make ROC/AUC plots
9. Bring in Jacob's point-by-point model.
10. Website
11. Explanations in ipython Notebook

# Data Clean-up

In [248]:
!pip install git+https://github.com/sublee/elo
!pip install git+https://github.com/sublee/glicko2

Collecting git+https://github.com/sublee/elo
  Cloning https://github.com/sublee/elo to /var/folders/v3/q2d976pd1rncj7rg5x7411pc0000gn/T/pip-yH_5zu-build
Installing collected packages: elo
  Running setup.py install for elo
Successfully installed elo-0.1.dev0


In [226]:
# Imports necessary libraries

import pandas as pd
import numpy as np
import datetime
from sklearn import linear_model
import math
from operator import itemgetter, attrgetter
import elo
import glicko2
import trueskill


import matplotlib as mpl
import matplotlib.cm as cm
import seaborn as sns
import matplotlib.pyplot as plt

ImportError: cannot import name unpack_labeled_data

In [2]:
''' Combines CSV 1999-2015 complete match files and constructs 1 dataframe.
    Choose 1999 as starting year because that's when Federer starts.
    We can change this if we want more data.
'''
atp_year_list = []
for i in xrange(1999,2016):
    atp_year_list.append(pd.read_csv("atp_matches_{0}.csv".format(i)))
atp_all_matches = pd.concat(atp_year_list, ignore_index = True)

# Set up the pbp (point by point) column that will contain the pbp information for matches we have info on
atp_all_matches['pbp'] = [None]*atp_all_matches.shape[0]
atp_all_matches.shape

(53054, 50)

In [3]:
''' More dataframe preparation for combining dateframes.
    Matches detailed match info for select matches to correct row in complete match list
    If date is within 30 days of each other is correct, players are correct,
    result is same, and first set score is same, then it's the same match. '''

# Get score into the same string format (a couple scores are mis-recorded),
# so that's why I don't match on the entire score string
atp_all_matches['score'] = atp_all_matches['score'].astype(str)

# Get dates into the same format
atp_all_matches['tourney_date'].apply(lambda x: (str(x)[8:16]))
atp_all_matches['tourney_date1'] = atp_all_matches['tourney_date'].apply(lambda x: datetime.datetime.strptime(str(x), "%Y%m%d"))
atp_all_matches['match_year'] = atp_all_matches['tourney_date1'].apply(lambda x: x.year)
atp_all_matches['match_month'] = atp_all_matches['tourney_date1'].apply(lambda x: x.month)
atp_all_matches['score_start'] = atp_all_matches['score'].apply(lambda x: x[:3])
atp_all_matches = atp_all_matches.sort(['tourney_date1'], ascending=1).reset_index()
atp_all_matches.head()

Unnamed: 0,index,tourney_id,tourney_name,surface,draw_size,tourney_level,tourney_date,match_num,winner_id,winner_seed,...,l_1stWon,l_2ndWon,l_SvGms,l_bpSaved,l_bpFaced,pbp,tourney_date1,match_year,match_month,score_start
0,2954,1999-451,Doha,Hard,32,A,19990104,20,102223,,...,38,19,16,2,7,,1999-01-04,1999,1,6-2
1,2937,1999-451,Doha,Hard,32,A,19990104,3,102950,,...,16,7,7,2,6,,1999-01-04,1999,1,6-1
2,2938,1999-451,Doha,Hard,32,A,19990104,4,101543,,...,22,10,9,4,7,,1999-01-04,1999,1,6-3
3,2939,1999-451,Doha,Hard,32,A,19990104,5,102338,3.0,...,38,17,13,11,16,,1999-01-04,1999,1,2-6
4,2940,1999-451,Doha,Hard,32,A,19990104,6,102271,,...,31,30,14,11,16,,1999-01-04,1999,1,1-6


In [4]:
''' Combine all the matches that have pbp (point by point) information into one dataframe
    and clean up columns in preparation for matching with the all_atp_matches dataframe.
'''

pbp_matches_archive = pd.read_csv("pbp_matches_atp_main_archive.csv")
pbp_matches_current = pd.read_csv("pbp_matches_atp_main_current.csv")
pbp_matches = [pbp_matches_archive,pbp_matches_current]
pbp_matches = pd.concat(pbp_matches)
pbp_matches.winner = pbp_matches.winner - 1
pbp_matches['winner_name'] = np.where(pbp_matches['winner'] == 0, pbp_matches['server1'], pbp_matches['server2'])
pbp_matches['loser_name'] = np.where(pbp_matches['winner'] == 0, pbp_matches['server2'], pbp_matches['server1'])
pbp_matches['date'] = pd.to_datetime(pbp_matches['date'])
pbp_matches['match_year'] = pbp_matches['date'].apply(lambda x: x.year)
pbp_matches['match_month'] = pbp_matches['date'].apply(lambda x: x.month)
pbp_matches['score_start'] = pbp_matches['score'].apply(lambda x: x[:3])

# Makes pbp_matches dataframe only contain the matching columns and the new column we want "pbp"
pbp_matches = pbp_matches.iloc[:,[7,10,11,12,13,14]]

''' Here, we actually add the 'pbp' column to the correct row in the other dataframe,
    based on criteria I chose that should be the same match,
    ie. correct winner and loser, same first set score, and match dates
    are within 30 days of one another.
'''
pbp_matches1 = pd.merge(atp_all_matches, pbp_matches, \
                        on=['winner_name', 'loser_name', 'match_year', 'score_start', 'match_month'], how='left')

In [5]:
print pbp_matches.shape
print len(set(pbp_matches['pbp'].values))

(11663, 6)

### Useful commands for looking at data

In [None]:
#atp_all_matches.tourney_date1[1].values
#t = ['1999-09-26T20:00:00.000000000-0400','2004-05-16T20:00:00.000000000-0400']
#atp_all_matches.loc[atp_all_matches.tourney_date1.isin(t)]
len(set(pbp_matches1['pbp_y'].values))
# Useful Pandas operations to play around with

# Look at a couple of matches
atp_all_matches.iloc[25001:25010,:]

# A way for extracting specific elements
atp_all_matches.iloc[2]['winner_id']

# conditional create new column
# http://stackoverflow.com/questions/19913659/pandas-conditional-creation-of-a-series-dataframe-column

# merge dataframes based on indices or columns
# http://pandas.pydata.org/pandas-docs/stable/merging.html

# Combines Grand Slam Point by Point CSV files via similar method as cell above
grandslam_names = ["ausopen", "frenchopen", "frenchopen", "wimbledon"]
for year in xrange(2011-2016):
    
# Pandas Practice
x = pd.DataFrame({'col1':[1,2,3,4,5], 'col2':[10,9,8,7,6], 'col3':pd.Series([1,2,3,4,5]), \
                 'col4':np.random.choice(6,5,replace=True), 'col5': ['c','a','b','d','e']})
y = x.sort_index(axis=0, ascending=False)

#x.loc[index[0]]
x.loc[:,['col1','col5']]
x.iloc[3]

### SEE FEATURE CREATION SECTION FOR GENERALIZED VERSION ###

matches = atp_all_matches[0:1000]
h2h_list = []
for i, row in matches.iterrows():
    h2h_list.append(row['winner_name'])
h2h_list

def get_h2h_for_match(matches = atp_all_matches, recent = False):
    new_list = []
    h2h_matches = matches[((matches['winner_name'] == 'Roger Federer') & (matches['loser_name'] == 'Marin Cilic')) | \
                      ((matches['winner_name'] == 'Marin Cilic') & (matches['loser_name'] == 'Roger Federer'))]
    for i, row in h2h_matches.iterrows():
        match_date = pd.Timestamp(row['tourney_date1']).to_pydatetime()
        all_time = [datetime.datetime.strptime('1999-01-01', '%Y-%m-%d'), match_date]
        h2h_all_prev_matches = h2h_matches[(h2h_matches.tourney_date1 > all_time[0]) & (h2h_matches.tourney_date1 < all_time[1])]
        print (sum(h2h_all_prev_matches['winner_name'] == 'Roger Federer'),sum(h2h_all_prev_matches['winner_name'] == 'Marin Cilic'))
    
get_h2h_for_match()



## Feature Creation

#### Functions for Adding in Columns

In [8]:
# Functions for adding in columns. Will get applied to each row. Returns for both players
def what_round(draw_size, match_number):
    pass

''' Returns 2 lists of head to head scores to be concatenated to the dataframe '''
def h2h(match_dataframe):
    matches = match_dataframe
    h2h_list_all, h2h_list_recent  = [], []
    for i, row in matches.iterrows():
        h2h_matches = matches[((matches['winner_name'] == row['winner_name']) & (matches['loser_name'] == row['loser_name'])) | \
                          ((matches['winner_name'] == row['loser_name']) & (matches['loser_name'] == row['winner_name']))]

        match_date = pd.Timestamp(row['tourney_date1']).to_pydatetime()
        one_year_date = match_date + datetime.timedelta(weeks=-52)
        begin_date = datetime.datetime.strptime('1999-01-01', '%Y-%m-%d')
        
        h2h_all_prev_matches = h2h_matches[(h2h_matches.tourney_date1 >= begin_date) & (h2h_matches.tourney_date1 < match_date)]
        h2h_recent_matches = h2h_matches[(h2h_matches.tourney_date1 >= one_year_date) & (h2h_matches.tourney_date1 < match_date)]

        player1_perspective_all = (sum(h2h_all_prev_matches['winner_name'] == row['winner_name']),sum(h2h_all_prev_matches['winner_name'] == row['loser_name']))
        player1_perspective_rec = (sum(h2h_recent_matches['winner_name'] == row['winner_name']),sum(h2h_recent_matches['winner_name'] == row['loser_name']))
        h2h_list_all.append(player1_perspective_all)
        h2h_list_recent.append(player1_perspective_rec)
    return h2h_list_all, h2h_list_recent

def momentum(player1, player2):
    pass

def prior_tournament_result(player1, player2):
    pass

''' Returns player records for that match's surface '''
def twelve_month_record(player1, player2, surface):
    
    for i, row in matches.iterrows():
        # check surface
        one_year_range = [match_date + datetime.timedelta(weeks=-52), match_date]

def top10_player(player1, player2):
    pass

def is_clay_player(player1, player2):
    pass



#### Add columns to dataframe

In [9]:
# What kind of Grandslam is it?
atp_all_matches['grandslam_type'] = np.where(atp_all_matches['tourney_name'] == \
                                        ('US Open' or 'Wimbledon' or 'Australian Open' or 'Roland Garros'), \
                                        atp_all_matches['tourney_name'],'No') 
# Is it a Davis Cup Match?
atp_all_matches['is_davis_cup'] = np.where(atp_all_matches['tourney_name'].str.contains('Davis Cup'), 'Yes', 'No')


In [10]:
# Add Head to Head columns
#temp_matches = atp_all_matches.iloc[40000:, :].copy()
%%time
a, b = h2h(atp_all_matches)
atp_all_matches['h2h_all'] = pd.Series(a, index=atp_all_matches.index)
atp_all_matches['h2h_recent'] =  pd.Series(b, index=atp_all_matches.index)

# Is it Roger, Rafa, Novak, Andy, or No?
# Do we even need this column?

#print set(atp_all_matches['tourney_name'].values)

In [32]:
# shuffles winner and loser and constructs player1 and player2 columns for everything
atp_all_matches.columns = [x.replace('winner', 'w') for x in atp_all_matches.columns]
atp_all_matches.columns = [x.replace('loser', 'l') for x in atp_all_matches.columns]
p1_columns = ['p1_rank', 'p1_seed','p1_rank_points','p1_ace', 'p1_df', 'p1_svpt', 'p1_1stWon', \
              'p1_2ndWon', 'p1_SvGms', 'p1_bpSaved', 'p1_bpFaced']
p2_columns = [x.replace('p1','p2') for x in p1_columns]
pw_columns = [x.replace('p1','w') for x in p1_columns]
pl_columns = [x.replace('p1','l') for x in p1_columns]

atp_all_matches['player1'] = atp_all_matches.apply(lambda x: np.random.choice([x['w_name'],x['l_name']],1, replace = False)[0], axis = 1)
atp_all_matches['player2'] = atp_all_matches.apply(lambda x: x['w_name'] if x['player1'] == x['l_name'] else x['l_name'], axis = 1)

for i, column_name in enumerate(p1_columns):
    atp_all_matches[column_name] = atp_all_matches.apply(lambda x: x[pw_columns[i]] if (x['player1'] == x['w_name']) else x[pl_columns[i]], axis = 1)
for i, column_name in enumerate(p2_columns):
    atp_all_matches[column_name] = atp_all_matches.apply(lambda x: x[pw_columns[i]] if (x['player2'] == x['w_name']) else x[pl_columns[i]], axis = 1)
    
atp_all_matches['player1_wins'] = atp_all_matches.apply(lambda x: 1 if (x['w_name'] == x['player1']) else 0, axis = 1)

## Cell Below doesn't work yet.  I want to re-arrange the tuple of h2h to reflect player1,player2, but that wasn't working so I was going to make it into separate columns and then recombine.

In [45]:

atp_all_matches['h2h_p1_persp_all'] = atp_all_matches.apply(lambda x: (zip(x['h2h_all'][0],x['h2h_all'])[1])  if (x['player1_wins'] == 1) else zip(x['h2h_all'][1],x['h2h_all'][0]), axis = 1)
#h2h_p1_persp_all_win = atp_all_matches.apply(lambda x: (x['h2h_all'])[0] if (x['player1_wins'] == 1) else (x['h2h_all'])[1], axis = 1)
#h2h_p1_persp_all_loss = atp_all_matches.apply(lambda x: (x['h2h_all'])[1] if (x['player1_wins'] == 1) else (x['h2h_all'])[0], axis = 1)
#h2h_p1_persp_rec_win = atp_all_matches.apply(lambda x: (x['h2h_recent'])[0] if (x['player1_wins'] == 1) else (x['h2h_recent'])[1], axis = 1)
#h2h_p1_persp_rec_loss = atp_all_matches.apply(lambda x: (x['h2h_recent'])[1] if (x['player1_wins'] == 1) else (x['h2h_recent'])[0], axis = 1)


TypeError: ('zip argument #1 must support iteration', u'occurred at index 0')

# Exploratory Analysis

### Player Analysis

#### Overall

In [None]:
# Plot of elo over time

# Plot of win percentage

# Plot of ranking

# Plot of average win spread



#### Roger Federer

In [None]:
# Plot against others

# Clutchness (breakpoints?)

# 

### Principle Component Analysis

In [220]:
# Proving problematic because of all the missing values
atp_all_matches.columns[27:28]

Index([u'score'], dtype='object')

In [206]:
atp_all_matches.iloc[1000:2500,10:25]

Unnamed: 0,winner_name,winner_hand,winner_ht,winner_ioc,winner_age,winner_rank,winner_rank_points,loser_id,loser_seed,loser_entry,loser_name,loser_hand,loser_ht,loser_ioc,loser_age
1000,Juan Carlos Ferrero,R,183,ESP,19.104723,,,102223,4,,Karim Alami,R,185,MAR,25.826146
1001,Alberto Martin,R,175,ESP,20.585900,,,101997,,,Marzio Martelli,R,180,ITA,27.268994
1002,Oliver Gross,R,185,GER,25.760438,,,103202,,,Oscar Serrano,R,173,ESP,20.824093
1003,Jens Knippschild,R,190,GER,24.095825,,,102154,,,Marcos Ondruska,R,183,RSA,26.255989
1004,Fernando Meligeni,L,180,BRA,27.942505,,,102375,,,Dinu Pescariu,R,185,ROU,24.941821
1005,Johan Van Herck,R,185,BEL,24.826831,,,103084,,,Guillermo Canas,R,185,ARG,21.319644
1006,Markus Hipfl,R,178,AUT,20.903491,,,101947,,,Vincenzo Santopadre,L,183,ITA,27.611225
1007,Fernando Vicente,R,180,ESP,22.036961,,,102519,,,Marcio Carlsson,R,175,BRA,24.156057
1008,Stefan Koubek,L,175,AUT,22.214921,,,102723,,,Eduardo Medica,R,180,ARG,23.110198
1009,Albert Portas,R,188,ESP,25.347023,,,102231,,,Alex Calatrava,R,190,ESP,25.768652


In [224]:
#http://sebastianraschka.com/Articles/2014_pca_step_by_step.html
#https://github.com/cs109/2015lab5/blob/master/Classification.ipynb
from sklearn.decomposition import PCA
pca = PCA(n_components=60)
X = pca.fit_transform(atp_all_matches.iloc[-1000:,14:16])


ValueError: Input contains NaN, infinity or a value too large for dtype('float64').

## Rating Systems

### Elo
http://gobase.org/studying/articles/elo/
http://www.gamefaqs.com/boards/610657-dota-2/67994646
https://github.com/sublee/elo/blob/master/elo.py
http://stephenwan.net/thoughts/2012/10/02/elo-rating-system.html

In [164]:
''' Make a dict of players with elo, glicko2, and trueskill ratings'''
players_list = np.union1d(atp_all_matches.w_name.values, atp_all_matches.l_name.values)

players = dict(zip(list(set(players_list)), [elo.Rating(1200)] *len(players_list)))
players

#,glicko2.Rating(),trueskill.Rating()]
#list(set(players_list))

{'Sam Barry': elo.Rating(1200.000),
 'Marc Rosset': elo.Rating(1200.000),
 'Bob Borella': elo.Rating(1200.000),
 'Philippos Tsangaridis': elo.Rating(1200.000),
 'Andy Zingman': elo.Rating(1200.000),
 'Musaad Al Jazzaf': elo.Rating(1200.000),
 'Marcus Sarstrand': elo.Rating(1200.000),
 'Raoul Behr': elo.Rating(1200.000),
 'Nikos Rovas': elo.Rating(1200.000),
 'Florian Mayer': elo.Rating(1200.000),
 'Davide Sanguinetti': elo.Rating(1200.000),
 'Aljaz Bedene': elo.Rating(1200.000),
 'Ben Fatael': elo.Rating(1200.000),
 'Adelo Abadia': elo.Rating(1200.000),
 'Andrey Dernovskiy': elo.Rating(1200.000),
 'Hady Habib': elo.Rating(1200.000),
 'Rohan Bopanna': elo.Rating(1200.000),
 'Igor Kunitsyn': elo.Rating(1200.000),
 'Yari Bernardo': elo.Rating(1200.000),
 'Ricardo Mena': elo.Rating(1200.000),
 'Alvaro Raposo De Oliveira': elo.Rating(1200.000),
 'Julien Benneteau': elo.Rating(1200.000),
 'Valentin Rahmine': elo.Rating(1200.000),
 'Sunil Kumar': elo.Rating(1200.000),
 'Andrea Arnaboldi': elo

In [178]:
# EXAMPLE to understand ELO #

r1,r2 = elo.rate_1vs1(elo.Rating(1200), elo.Rating(1200))
print r1, r2
print elo.Rating(r1), elo.Rating(r2)
players['Roger Federer'] = elo.Rating(r1)
players['Rafael Nadal'] = elo.Rating(r2)
print  players['Roger Federer'], players['Rafael Nadal']
print 
r3,r4 = elo.rate_1vs1(players['Rafael Nadal'], players['Roger Federer'])
print r3, r4
#(players['Roger Federer'].value - players['Todd Martin'].value)


1205.0 1195.0
elo.Rating(1205.000) elo.Rating(1195.000)
elo.Rating(1205.000) elo.Rating(1195.000)
1200.14387184 1199.85612816


In [165]:
mean_elo, diff_elo = [], []

for i, row in atp_all_matches.iterrows():
    # Find Elo rating in dictionary
    elo1 = players[str(row['player1'])]
    elo2 = players[str(row['player2'])]
    
    mean_elo.append((elo1.value + elo2.value)/2)
    diff_elo.append(float(elo1.value) - float(elo2.value))
    
    # Calculate the new Elo ratings and add to list
    if row['player1_wins'] == 1:
        rating1, rating2 = elo.rate_1vs1(elo1,elo2)
    else:
        rating2, rating1 = elo.rate_1vs1(elo2, elo1)
    # Update the new Elo ratings in dictionary
    players[str(row['player1'])] = elo.Rating(rating1)
    players[str(row['player2'])] = elo.Rating(rating2)


#Add columns

atp_all_matches['mean_elo'] = pd.Series(mean_elo, index = atp_all_matches.index)
atp_all_matches['diff_elo'] = pd.Series(diff_elo, index = atp_all_matches.index)

In [169]:
print atp_all_matches['mean_elo']

0        1200.000000
1        1200.000000
2        1200.000000
3        1200.000000
4        1200.000000
5        1197.500000
6        1202.500000
7        1200.000000
8        1200.000000
9        1200.000000
10       1200.000000
11       1200.000000
12       1200.000000
13       1200.000000
14       1200.000000
15       1200.000000
16       1205.000000
17       1205.000000
18       1200.000000
19       1205.000000
20       1205.000000
21       1205.000000
22       1205.000000
23       1207.500000
24       1209.964025
25       1210.000000
26       1210.000000
27       1212.500518
28       1197.500000
29       1215.000000
            ...     
53024    1210.693061
53025    1263.666716
53026    1275.519107
53027    1299.018645
53028    1307.370286
53029    1202.944005
53030    1519.502469
53031    1269.878904
53032    1452.567570
53033    1389.816078
53034    1209.522994
53035    1231.230128
53036    1229.886089
53037    1216.500888
53038    1211.004884
53039    1188.294090
53040    1240

### Glicko

In [None]:
#https://deltadata.wordpress.com/2014/01/11/glicko-2-for-tennis-part-2-the-model/comment-page-1/#comment-192
#https://github.com/sublee/glicko2/blob/master/glicko2.py


### TrueSkill

In [176]:
players = dict(zip(list(set(players_list)), [trueskill.Rating()] *len(players_list)))
players1 = dict(zip(list(set(players_list)), [[elo.Rating(), trueskill.Rating()]] *len(players_list)))
players1

{'Sam Barry': [elo.Rating(1200.000), trueskill.Rating(mu=25.000, sigma=8.333)],
 'Marc Rosset': [elo.Rating(1200.000),
  trueskill.Rating(mu=25.000, sigma=8.333)],
 'Bob Borella': [elo.Rating(1200.000),
  trueskill.Rating(mu=25.000, sigma=8.333)],
 'Philippos Tsangaridis': [elo.Rating(1200.000),
  trueskill.Rating(mu=25.000, sigma=8.333)],
 'Andy Zingman': [elo.Rating(1200.000),
  trueskill.Rating(mu=25.000, sigma=8.333)],
 'Musaad Al Jazzaf': [elo.Rating(1200.000),
  trueskill.Rating(mu=25.000, sigma=8.333)],
 'Marcus Sarstrand': [elo.Rating(1200.000),
  trueskill.Rating(mu=25.000, sigma=8.333)],
 'Raoul Behr': [elo.Rating(1200.000),
  trueskill.Rating(mu=25.000, sigma=8.333)],
 'Nikos Rovas': [elo.Rating(1200.000),
  trueskill.Rating(mu=25.000, sigma=8.333)],
 'Florian Mayer': [elo.Rating(1200.000),
  trueskill.Rating(mu=25.000, sigma=8.333)],
 'Davide Sanguinetti': [elo.Rating(1200.000),
  trueskill.Rating(mu=25.000, sigma=8.333)],
 'Aljaz Bedene': [elo.Rating(1200.000),
  trueskill

In [171]:
# http://trueskill.org/, https://pypi.python.org/pypi/trueskill
mean_ts, diff_ts = [], []
for i, row in atp_all_matches.iterrows():
    # Find TS rating in dictionary
    ts1 = players[row['player1']]
    ts2 = players[row['player2']]
    
    mean_ts.append((ts1.mu + ts2.mu)/2)
    diff_ts.append(ts1.mu - ts2.mu)
    
    # Calculate the new TS ratings and add to list
    if row['w_name'] == row['player1']:
        rating1, rating2 = trueskill.rate_1vs1(ts1, ts2)
    else:
        rating2, rating1 = trueskill.rate_1vs1(ts2, ts1)
    
    
    # Update the new TS ratings in dictionary
    players[row['player1']] = rating1
    players[row['player2']] = rating2

#Add columns
atp_all_matches['mean_ts'] = pd.Series(mean_ts, index = atp_all_matches.index)
atp_all_matches['diff_ts'] = pd.Series(diff_ts, index = atp_all_matches.index)


In [173]:
print atp_all_matches['diff_ts']

0         0.000000
1         0.000000
2         0.000000
3         0.000000
4         0.000000
5        -4.395832
6        -4.395832
7         0.000000
8         0.000000
9         0.000000
10        0.000000
11        0.000000
12        0.000000
13        0.000000
14        0.000000
15        0.000000
16        0.000000
17        0.000000
18        0.000000
19        0.000000
20        0.000000
21        0.000000
22        0.000000
23        3.676850
24       -1.115501
25        0.000000
26        0.000000
27       -1.206015
28       -4.395832
29        0.000000
           ...    
53024    -8.464342
53025     0.229498
53026    -4.625823
53027     0.665644
53028    -2.440286
53029     0.879858
53030    -7.563112
53031    -4.018878
53032    10.260165
53033    -6.577643
53034     2.695101
53035    -2.620987
53036    -1.682251
53037    -3.062027
53038     2.111510
53039    -0.038051
53040     3.430552
53041     1.773201
53042     2.537483
53043     1.418642
53044    -1.746453
53045     1.

# Prediction

### Baseline Predictor

In [None]:
# Head to Head score


# Recent Head to head score

In [184]:
atp_all_matches.columns['player1']

Index([u'index', u'tourney_id', u'tourney_name', u'surface', u'draw_size',
       u'tourney_level', u'tourney_date', u'match_num', u'w_id', u'w_seed',
       u'w_entry', u'w_name', u'w_hand', u'w_ht', u'w_ioc', u'w_age',
       u'w_rank', u'w_rank_points', u'l_id', u'l_seed', u'l_entry', u'l_name',
       u'l_hand', u'l_ht', u'l_ioc', u'l_age', u'l_rank', u'l_rank_points',
       u'score', u'best_of', u'round', u'minutes', u'w_ace', u'w_df',
       u'w_svpt', u'w_1stIn', u'w_1stWon', u'w_2ndWon', u'w_SvGms',
       u'w_bpSaved', u'w_bpFaced', u'l_ace', u'l_df', u'l_svpt', u'l_1stIn',
       u'l_1stWon', u'l_2ndWon', u'l_SvGms', u'l_bpSaved', u'l_bpFaced',
       u'pbp', u'tourney_date1', u'match_year', u'match_month', u'score_start',
       u'grandslam_type', u'is_davis_cup', u'h2h_all', u'h2h_recent',
       u'player1', u'player2', u'p1_rank', u'p1_seed', u'p1_rank_points',
       u'p1_ace', u'p1_df', u'p1_svpt', u'p1_1stWon', u'p1_2ndWon',
       u'p1_SvGms', u'p1_bpSaved', u'p1_bpFa

In [240]:
from sklearn.cross_validation import train_test_split
from sklearn.svm import LinearSVC
from sklearn.grid_search import GridSearchCV

itrain, itest = train_test_split(xrange(atp_all_matches.shape[0]), train_size=0.7)

mask=np.ones(atp_all_matches.shape[0], dtype='int')
mask[itrain]=1
mask[itest]=0
mask = (mask==1)

elocols = ['mean_elo', 'diff_elo',]
tscols = ['mean_ts', 'diff_ts']

clfsvm=LinearSVC(loss="hinge")
Cs=[0.001, 0.01, 0.1, 1.0, 10.0, 100.0]
Xmatrix=atp_all_matches[elocols].values
Yresp=atp_all_matches['player1_wins'].values

Xmatrix_train=Xmatrix[mask]
Xmatrix_test=Xmatrix[~mask]
Yresp_train=Yresp[mask]
Yresp_test=Yresp[~mask]

In [241]:
grid = GridSearchCV(clfsvm, param_grid=dict(C=Cs), cv = 5, scoring = None)
grid.fit(Xmatrix_train, Yresp_train)
print "BEST", grid.best_params_, 'hi', grid.best_score_, 'hi2', grid.grid_scores_
best = grid.best_estimator_

#calculate the accuracy here
training_accuracy = grid.score(Xmatrix_train, Yresp_train)

BEST {'C': 0.01} hi 0.592212618144 hi2 [mean: 0.56927, std: 0.05356, params: {'C': 0.001}, mean: 0.59221, std: 0.07575, params: {'C': 0.01}, mean: 0.53618, std: 0.04020, params: {'C': 0.1}, mean: 0.51081, std: 0.00947, params: {'C': 1.0}, mean: 0.50831, std: 0.01010, params: {'C': 10.0}, mean: 0.55756, std: 0.06114, params: {'C': 100.0}]


In [242]:
%%time
clfsvm, Xtrain, ytrain, Xtest, ytest = do_classify(LinearSVC(loss="hinge"), {"C": [0.001, 0.01, 0.1, 1.0, 10.0, 100.0]}, atp_all_matches,elocols, 'player1_wins',1, mask=mask)

using mask
BEST {'C': 0.1} 0.549721302205 [mean: 0.53615, std: 0.06600, params: {'C': 0.001}, mean: 0.51202, std: 0.01743, params: {'C': 0.01}, mean: 0.54972, std: 0.04775, params: {'C': 0.1}, mean: 0.50190, std: 0.00477, params: {'C': 1.0}, mean: 0.52557, std: 0.04407, params: {'C': 10.0}, mean: 0.50933, std: 0.01133, params: {'C': 100.0}]
############# based on standard predict ################
Accuracy on training data: 0.55
Accuracy on test data:     0.55
[[7844  139]
 [7100  834]]
########################################################
CPU times: user 2min 12s, sys: 318 ms, total: 2min 12s
Wall time: 2min 14s




In [243]:
reuse_split=dict(Xtrain=Xtrain, Xtest=Xtest, ytrain=ytrain, ytest=ytest)

In [244]:
clflogELO, Xtrain, ytrain, Xtest, ytest = do_classify(clf=linear_model.LogisticRegression(penalty="l1"),\
                                                   parameters = {"C": [0.001, 0.01, 0.1, 1.0, 10.0, 100.0]}, \
                                                   indf=atp_all_matches, featurenames=elocols, targetname = 'player1_wins', \
                                                   target1val = 1, mask=mask, reuse_split= reuse_split)

using mask
using reuse split
BEST {'C': 0.01} 0.652395185395 [mean: 0.65186, std: 0.02262, params: {'C': 0.001}, mean: 0.65240, std: 0.02288, params: {'C': 0.01}, mean: 0.65234, std: 0.02306, params: {'C': 0.1}, mean: 0.65218, std: 0.02293, params: {'C': 1.0}, mean: 0.65223, std: 0.02287, params: {'C': 10.0}, mean: 0.65229, std: 0.02290, params: {'C': 100.0}]
############# based on standard predict ################
Accuracy on training data: 0.65
Accuracy on test data:     0.65
[[5276 2707]
 [2800 5134]]
########################################################




In [235]:
with sns.color_palette("hls", 2):
    ax = make_roc("logistic-with-lasso",clflog, ytest, Xtest, labe=200, skip=50)
    ax = make_roc("svm-all-features", clfsvm, ytest, Xtest, labe=200, skip=50, ax = ax, proba = False)

NameError: name 'sns' is not defined

### Logistic Regression

Probability player 1 wins = ? given (for example) head-head, clay player or not, surface, momentum, grandslam, grandslam_player, relative elo, etc.

### Random Forest

### Support Vector Machine

### Bayes Point-by-Point Probability Model

In [None]:
# https://github.com/CamDavidsonPilon/Probabilistic-Programming-and-Bayesian-Methods-for-Hackers/blob/master/Chapter6_Priorities/Chapter6.ipynb

# Subset the dataframe only for rows that we have point-by-point data on
atp_point_by_point_matches = atp_all_matches.iloc['pbp' == None]

# Establish prior probabilities for each given the players involved, or perhaps based on the other models

# Is Markov chain approach more appropriate?

# 

### Ensemble Model

In [None]:
# See HW4 part 4
train, intermediate_set = cross_validation.train_test_split(input_set, train_size=0.6, test_size=0.4)
cv, test = cross_validation.train_test_split(intermediate_set, train_size=0.5, test_size=0.5)

dfensemble=pd.DataFrame.from_dict({'knn_r':predictions_valid['knn_r'],
                                   'svd':predictions_valid['svd'], 
                                   'baseline_r':predictions_valid['baseline_r'], 'y':validatedf.stars.values})

from sklearn import linear_model
valreg = linear_model.LinearRegression()
valreg.fit(dfensemble[['knn_r','svd', 'baseline_r']], dfensemble['y'])

dfensembletest = pd.DataFrame.from_dict({'knn_r':predictions['knn_r'],
                                   'svd':predictions['svd'], 
                                   'baseline_r':predictions['baseline_r'], 'y':testdf.stars.values})
epreds = valreg.predict(dfensembletest[['knn_r','svd', 'baseline_r']])
testactual = dfensembletest['y'].values

In [195]:
def cv_optimize(clf, parameters, X, y, n_folds=5, score_func=None):
    if score_func:
        gs = GridSearchCV(clf, param_grid=parameters, cv=n_folds, n_jobs=n_jobs, scoring=score_func)
    else:
        gs = GridSearchCV(clf, param_grid=parameters, cv=n_folds)
    gs.fit(X, y)
    print "BEST", gs.best_params_, gs.best_score_, gs.grid_scores_
    best = gs.best_estimator_
    return best

from sklearn.metrics import confusion_matrix
def do_classify(clf, parameters, indf, featurenames, targetname, target1val, mask=None, reuse_split=None, score_func=None, n_folds=5):
    subdf=indf[featurenames]
    X=subdf.values
    y=(indf[targetname].values==target1val)*1
    if mask !=None:
        print "using mask"
        Xtrain, Xtest, ytrain, ytest = X[mask], X[~mask], y[mask], y[~mask]
    if reuse_split !=None:
        print "using reuse split"
        Xtrain, Xtest, ytrain, ytest = reuse_split['Xtrain'], reuse_split['Xtest'], reuse_split['ytrain'], reuse_split['ytest']
    if parameters:
        clf = cv_optimize(clf, parameters, Xtrain, ytrain, n_folds=n_folds, score_func=score_func)
    clf=clf.fit(Xtrain, ytrain)
    training_accuracy = clf.score(Xtrain, ytrain)
    test_accuracy = clf.score(Xtest, ytest)
    print "############# based on standard predict ################"
    print "Accuracy on training data: %0.2f" % (training_accuracy)
    print "Accuracy on test data:     %0.2f" % (test_accuracy)
    print confusion_matrix(ytest, clf.predict(Xtest))
    print "########################################################"
    return clf, Xtrain, ytrain, Xtest, ytest

from sklearn.metrics import roc_curve, auc
def make_roc(name, clf, ytest, xtest, ax=None, labe=5, proba=True, skip=0):
    initial=False
    if not ax:
        ax=plt.gca()
        initial=True
    if proba:#for stuff like logistic regression
        fpr, tpr, thresholds=roc_curve(ytest, clf.predict_proba(xtest)[:,1])
    else:#for stuff like SVM
        fpr, tpr, thresholds=roc_curve(ytest, clf.decision_function(xtest))
    roc_auc = auc(fpr, tpr)
    if skip:
        l=fpr.shape[0]
        ax.plot(fpr[0:l:skip], tpr[0:l:skip], '.-', alpha=0.3, label='ROC curve for %s (area = %0.2f)' % (name, roc_auc))
    else:
        ax.plot(fpr, tpr, '.-', alpha=0.3, label='ROC curve for %s (area = %0.2f)' % (name, roc_auc))
    label_kwargs = {}
    label_kwargs['bbox'] = dict(
        boxstyle='round,pad=0.3', alpha=0.2,
    )
    if labe!=None:
        for k in xrange(0, fpr.shape[0],labe):
            #from https://gist.github.com/podshumok/c1d1c9394335d86255b8
            threshold = str(np.round(thresholds[k], 2))
            ax.annotate(threshold, (fpr[k], tpr[k]), **label_kwargs)
    if initial:
        ax.plot([0, 1], [0, 1], 'k--')
        ax.set_xlim([0.0, 1.0])
        ax.set_ylim([0.0, 1.05])
        ax.set_xlabel('False Positive Rate')
        ax.set_ylabel('True Positive Rate')
        ax.set_title('ROC')
    ax.legend(loc="lower right")
    return ax