In [1]:
import pandas as pd, matplotlib.pyplot as plt, numpy as np, seaborn as sns, sklearn as sk
import csv

### Tournament Dataframe

In [2]:
# Load the raw tournaments CSV. The file is read with header=None (no header row),
# so the column names are listed here and attached to the frame after reading.
tournamentscolumns = [
    'tourney_year', 'tourney_order', 'tourney_name', 'tourney_id',
    'tourney_slug', 'tourney_location', 'tourney_dates', 'tourney_month',
    'tourney_day', 'tourney_singles_draw', 'tourney_doubles_draw',
    'tourney_conditions', 'tourney_surface', 'tourney_fin_commit',
    'tourney_url_suffix', 'singles_winner_name', 'singles_winner_url',
    'singles_winner_player_slug', 'singles_winner_player_id',
    'doubles_winner_1_name', 'doubles_winner_1_url',
    'doubles_winner_1_player_slug', 'doubles_winner_1_player_id',
    'doubles_winner_2_name', 'doubles_winner_2_url',
    'doubles_winner_2_player_slug', 'doubles_winner_2_player_id',
    'tourney_year_id',
]
tournaments = pd.read_csv(
    'data/tournaments_1877-2017_UNINDEXED.csv',
    header=None,
)
tournaments.columns = tournamentscolumns

In [3]:
# Reduce tournaments to 'tourney_dates' and 'tourney_year_id' by dropping every
# other column. The id is the key used later to attach tournament dates to matches.
tournaments = tournaments.drop(columns=[
    'tourney_year', 'tourney_order', 'tourney_name', 'tourney_id',
    'tourney_slug', 'tourney_location', 'tourney_month', 'tourney_day',
    'tourney_singles_draw', 'tourney_doubles_draw', 'tourney_conditions',
    'tourney_surface', 'tourney_fin_commit', 'tourney_url_suffix',
    'singles_winner_name', 'singles_winner_url',
    'singles_winner_player_slug', 'singles_winner_player_id',
    'doubles_winner_1_name', 'doubles_winner_1_url',
    'doubles_winner_1_player_slug', 'doubles_winner_1_player_id',
    'doubles_winner_2_name', 'doubles_winner_2_url',
    'doubles_winner_2_player_slug', 'doubles_winner_2_player_id',
])

In [4]:
# Parse the 'tourney_dates' strings into datetime values (replaces the column in place).
tournaments = tournaments.assign(
    tourney_dates=pd.to_datetime(tournaments['tourney_dates'])
)

In [5]:
# Keep only tournaments whose date falls in 1990 or later.
from_1990 = tournaments['tourney_dates'].dt.year >= 1990
tournaments = tournaments.loc[from_1990]

In [6]:
# Print a summary of the filtered tournaments frame (row count, columns, dtypes).
tournaments.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2054 entries, 2060 to 4113
Data columns (total 2 columns):
tourney_dates      2054 non-null datetime64[ns]
tourney_year_id    2054 non-null object
dtypes: datetime64[ns](1), object(1)
memory usage: 48.1+ KB


### Scores Dataframe

In [7]:
# Load the raw match-scores CSV. It is read with header=None (no header row),
# so the column names are declared here and attached after reading.
scorescolumns = [
    'tourney_year_id', 'tourney_order', 'tourney_slug', 'tourney_url_suffix',
    'tourney_round_name', 'round_order', 'match_order',
    'winner_name', 'winner_player_id', 'winner_slug',
    'loser_name', 'loser_player_id', 'loser_slug',
    'winner_seed', 'loser_seed', 'match_score_tiebreaks',
    'winner_sets_won', 'loser_sets_won',
    'winner_games_won', 'loser_games_won',
    'winner_tiebreaks_won', 'loser_tiebreaks_won',
    'match_id', 'match_stats_url_suffix',
]
scores = pd.read_csv(
    'data/match_scores_1991-2016_unindexed.csv',
    header=None,
)
scores.columns = scorescolumns

In [8]:
# Remove descriptive columns (names, slugs, seeds, round info, URLs) from scores,
# leaving the ids, the score string and the sets/games/tiebreak counts.
scores = scores.drop(columns=[
    'tourney_order', 'tourney_slug', 'tourney_url_suffix',
    'tourney_round_name', 'round_order', 'match_order',
    'winner_name', 'winner_slug', 'loser_name', 'loser_slug',
    'winner_seed', 'loser_seed', 'match_stats_url_suffix',
])

In [9]:
# Discard score rows that have no 'match_score_tiebreaks' value.
scores = scores.dropna(subset=['match_score_tiebreaks'])

### Winner and Loser Dataframes

In [10]:
# Winner-side view of every match: tournament id, winner id, score string,
# the winner's sets/games/tiebreaks won, and the match id.
# Columns are selected directly from scores so that each keeps its own dtype;
# building the frame from a list of Series and transposing it forces every
# column to object dtype. .copy() makes this an independent frame.
winner = scores[[
    'tourney_year_id', 'winner_player_id', 'match_score_tiebreaks',
    'winner_sets_won', 'winner_games_won', 'winner_tiebreaks_won', 'match_id',
]].copy()

In [11]:
# Loser-side view of every match: tournament id, loser id, score string,
# the loser's sets/games/tiebreaks won, and the match id.
# Columns are selected directly from scores so that each keeps its own dtype;
# building the frame from a list of Series and transposing it forces every
# column to object dtype. .copy() makes this an independent frame.
loser = scores[[
    'tourney_year_id', 'loser_player_id', 'match_score_tiebreaks',
    'loser_sets_won', 'loser_games_won', 'loser_tiebreaks_won', 'match_id',
]].copy()

In [12]:
# Give both frames the same 'player_id' key column so that later joins on
# player id can be written identically for winners and losers.
winner, loser = (
    frame.rename(columns={side + '_player_id': 'player_id'})
    for side, frame in (('winner', winner), ('loser', loser))
)

In [13]:
# Attach each tournament's date to the winner rows (left join on 'tourney_year_id').
winner = winner.merge(tournaments, how='left', on='tourney_year_id')

In [14]:
# Attach each tournament's date to the loser rows (left join on 'tourney_year_id').
loser = loser.merge(tournaments, how='left', on='tourney_year_id')

In [15]:
# Preview the first rows of winner after the tournament-date join.
winner.head()

Unnamed: 0,tourney_year_id,player_id,match_score_tiebreaks,winner_sets_won,winner_games_won,winner_tiebreaks_won,match_id,tourney_dates
0,1991-7308,k181,63 16 62,2,13,0,1991-7308-k181-s351,1990-12-31
1,1991-7308,s351,64 76(6),2,13,1,1991-7308-s351-c243,1990-12-31
2,1991-7308,k181,75 64,2,13,0,1991-7308-k181-l206,1990-12-31
3,1991-7308,c243,76(3) 63,2,13,1,1991-7308-c243-s367,1990-12-31
4,1991-7308,s351,36 63 63,2,15,0,1991-7308-s351-a031,1990-12-31


### Rankings Dataframe

In [16]:
# Load the rankings CSV; its first row supplies the column names (header=0).
# low_memory=False makes pandas read the file in one pass rather than in chunks.
rankings = pd.read_csv(
    'data/rankings_1973-2017_csv.csv',
    header=0,
    low_memory=False,
)

In [17]:
# Remove ranking rows whose 'week_year' is earlier than 1990.
# Written as a negated "< 1990" test so rows with a missing year are kept.
before_1990 = rankings['week_year'] < 1990
rankings = rankings.loc[~before_1990]

In [18]:
# Remove ranking columns that are not used further on in this notebook.
rankings = rankings.drop(columns=[
    'move_positions', 'move_direction', 'player_age',
    'tourneys_played', 'player_url', 'player_slug',
])

In [19]:
# Parse the 'week_title' strings into datetime values (replaces the column in place).
rankings = rankings.assign(week_title=pd.to_datetime(rankings['week_title']))

In [20]:
# Duplicate 'week_title' under the name 'date'; this is the key used when the
# rankings are joined to the calendar frame.
rankings = rankings.assign(date=rankings['week_title'])

In [21]:
# Order the rankings chronologically by their 'week_title' date.
rankings = rankings.sort_values(by='week_title')

### Create dataframe to round dates.

In [22]:
# Calendar frame with one row per day from Dec 15, 1990 through Dec 31, 2017 (inclusive).
all_days = pd.date_range(start='1990-12-15', end='2017-12-31', freq='D')
df = pd.DataFrame({'date': all_days})

In [23]:
# Map every calendar day to the Monday that starts its week.
# dt.weekday is 0 for Monday .. 6 for Sunday, so subtracting that many days
# leaves Mondays unchanged and moves every other day back to its week's Monday.
# This is fully vectorized, unlike applying DateOffset(weekday=...) row by row.
df['BeginWeek'] = df['date'] - pd.to_timedelta(df['date'].dt.weekday, unit='D')



### Add rounding dates to model dataframes for merging.

In [24]:
# Add the week-start date ('BeginWeek') to each ranking row via its 'date'.
rankings = rankings.merge(df, how='left', on='date')

In [25]:
# Add the week-start date ('BeginWeek') to each winner row via the tournament date.
winner = winner.merge(df, how='left', left_on='tourney_dates', right_on='date')

In [26]:
# Add the week-start date ('BeginWeek') to each loser row via the tournament date.
loser = loser.merge(df, how='left', left_on='tourney_dates', right_on='date')

### Clean rankings/winner/loser of redundant date columns.

In [27]:
# With 'BeginWeek' attached, remove the other date columns from all three frames.
rankings = rankings.drop(
    columns=['week_title', 'date', 'week_year', 'week_month', 'week_day']
)
match_date_columns = ['tourney_dates', 'date']
winner = winner.drop(columns=match_date_columns)
loser = loser.drop(columns=match_date_columns)

### Merge rankings into winner and loser dataframes.

In [28]:
# Look up each winner's ranking for the week of the match
# (left join on player id and week-start date).
winner1 = winner.merge(rankings, how='left', on=['player_id', 'BeginWeek'])

In [29]:
# Look up each loser's ranking for the week of the match
# (left join on player id and week-start date).
loser1 = loser.merge(rankings, how='left', on=['player_id', 'BeginWeek'])

In [30]:
# Prefix the player id and ranking columns with 'winner_' / 'loser_' so the two
# sides remain distinguishable once both frames are joined into one.
winner1 = winner1.rename(columns={
    'player_id': 'winner_player_id',
    'rank_text': 'winner_rank_text',
    'rank_number': 'winner_rank_number',
    'ranking_points': 'winner_ranking_points',
})
loser1 = loser1.rename(columns={
    'player_id': 'loser_player_id',
    'rank_text': 'loser_rank_text',
    'rank_number': 'loser_rank_number',
    'ranking_points': 'loser_ranking_points',
})

In [31]:
# Preview the first rows of winner1 after the rankings join and column renames.
winner1.head()

Unnamed: 0,tourney_year_id,winner_player_id,match_score_tiebreaks,winner_sets_won,winner_games_won,winner_tiebreaks_won,match_id,BeginWeek,winner_rank_text,winner_rank_number,winner_ranking_points
0,1991-7308,k181,63 16 62,2,13,0,1991-7308-k181-s351,1990-12-31,51,51.0,0.0
1,1991-7308,s351,64 76(6),2,13,1,1991-7308-s351-c243,1990-12-31,42,42.0,0.0
2,1991-7308,k181,75 64,2,13,0,1991-7308-k181-l206,1990-12-31,51,51.0,0.0
3,1991-7308,c243,76(3) 63,2,13,1,1991-7308-c243-s367,1990-12-31,25,25.0,0.0
4,1991-7308,s351,36 63 63,2,15,0,1991-7308-s351-a031,1990-12-31,42,42.0,0.0


In [32]:
# Discard rows for which the rankings join found no ranking
# (the '<side>_rank_text' column is missing).
winner1 = winner1.dropna(subset=['winner_rank_text'])
loser1 = loser1.dropna(subset=['loser_rank_text'])

In [33]:
# Count the remaining missing values in each winner1 column.
winner1.isna().sum()

tourney_year_id          0
winner_player_id         0
match_score_tiebreaks    0
winner_sets_won          0
winner_games_won         0
winner_tiebreaks_won     0
match_id                 0
BeginWeek                0
winner_rank_text         0
winner_rank_number       0
winner_ranking_points    0
dtype: int64

In [34]:
# Count the remaining missing values in each loser1 column.
loser1.isna().sum()

tourney_year_id          0
loser_player_id          0
match_score_tiebreaks    0
loser_sets_won           0
loser_games_won          0
loser_tiebreaks_won      0
match_id                 0
BeginWeek                0
loser_rank_text          0
loser_rank_number        0
loser_ranking_points     0
dtype: int64

### Match Statistics Dataframe

In [35]:
# Load the raw match-statistics CSV (read with header=None, so names are attached here).
# The layout is five match-level fields followed by the same 24 per-player
# statistics, first for the winner and then for the loser.
match_fields = [
    'tourney_order', 'match_id', 'match_stats_url_suffix',
    'match_time', 'match_duration',
]
player_fields = [
    'aces', 'double_faults',
    'first_serves_in', 'first_serves_total',
    'first_serve_points_won', 'first_serve_points_total',
    'second_serve_points_won', 'second_serve_points_total',
    'break_points_saved', 'break_points_serve_total',
    'service_points_won', 'service_points_total',
    'first_serve_return_won', 'first_serve_return_total',
    'second_serve_return_won', 'second_serve_return_total',
    'break_points_converted', 'break_points_return_total',
    'service_games_played', 'return_games_played',
    'return_points_won', 'return_points_total',
    'total_points_won', 'total_points_total',
]
matchstatscolumns = (
    match_fields
    + ['winner_' + field for field in player_fields]
    + ['loser_' + field for field in player_fields]
)
matchstats = pd.read_csv(
    'data/match_stats_1991-2016_unindexed.csv',
    header=None,
)
matchstats.columns = matchstatscolumns
# matchstats.head()

In [36]:
# Discard match-stat rows that have no 'match_time' value.
matchstats = matchstats.dropna(subset=['match_time'])

In [37]:
# Discard matches in which any of these totals is zero: the winner's total
# points, the winner's service points, or the loser's service points.
no_points_recorded = (
    (matchstats['winner_total_points_total'] == 0)
    | (matchstats['winner_service_points_total'] == 0)
    | (matchstats['loser_service_points_total'] == 0)
)
matchstats = matchstats.loc[~no_points_recorded]

In [38]:
# Discard matches where either player has fewer than 6 service games or fewer
# than 6 return games recorded (treated here as an incomplete match).
games_played_columns = [
    'winner_service_games_played', 'loser_service_games_played',
    'winner_return_games_played', 'loser_return_games_played',
]
too_few_games = (matchstats[games_played_columns] < 6).any(axis=1)
matchstats = matchstats.loc[~too_few_games]

In [39]:
# Discard matches whose 'match_duration' exceeds 360 (six hours, if the unit is minutes).
over_six_hours = matchstats['match_duration'] > 360
matchstats = matchstats.loc[~over_six_hours]

In [40]:
# Remove the two match-stat columns that are not used further on.
matchstats = matchstats.drop(columns=['tourney_order', 'match_stats_url_suffix'])

In [41]:
# matchstats.isnull().sum()

### Merge the rankings data from the winner1 and loser1 dataframes into the matchstats dataframe.

In [42]:
# Keep only matches present in both the stats and the winner frame,
# adding the winner's score and ranking columns (inner join on 'match_id').
matchstats1 = matchstats.merge(winner1, how='inner', on='match_id')

In [43]:
# Add the loser's score and ranking columns. The columns shared by both sides
# are all used as join keys so they are not duplicated in the result.
matchstats1 = matchstats1.merge(
    loser1,
    how='inner',
    on=['match_id', 'BeginWeek', 'tourney_year_id', 'match_score_tiebreaks'],
)

In [44]:
# Remove the text form of the rankings; the numeric rank columns are kept.
matchstats1 = matchstats1.drop(columns=['winner_rank_text', 'loser_rank_text'])

In [45]:
# Cap both players' rank numbers at 300.
# Rationale from the original author: rankings outside the top 300 are generally
# not very relevant because the ranking-point differences there are very small.
for _rank_column in ('winner_rank_number', 'loser_rank_number'):
    matchstats1[_rank_column] = matchstats1[_rank_column].clip(upper=300)

In [46]:
# Repair inconsistent winner break-point counts: where more break points are
# recorded as converted than were available, cap the converted count at the
# available total. The right-hand side must be that single column's values for
# the same rows; assigning a whole DataFrame slice to one column is an error.
too_many_converted = (
    matchstats1['winner_break_points_converted']
    > matchstats1['winner_break_points_return_total']
)
matchstats1.loc[too_many_converted, 'winner_break_points_converted'] = (
    matchstats1.loc[too_many_converted, 'winner_break_points_return_total']
)

### Create EDA Variables

In [47]:
# Ace percentage per player: aces divided by the sum of first- and
# second-serve points played, expressed as a percentage.
for _side in ('winner', 'loser'):
    _serve_points = (
        matchstats1[_side + '_first_serve_points_total']
        + matchstats1[_side + '_second_serve_points_total']
    )
    matchstats1[_side + '_ace_pct'] = matchstats1[_side + '_aces'] / _serve_points * 100

In [48]:
# Double-fault percentage per player: double faults divided by the sum of
# first- and second-serve points played, expressed as a percentage.
for _side in ('winner', 'loser'):
    _serve_points = (
        matchstats1[_side + '_first_serve_points_total']
        + matchstats1[_side + '_second_serve_points_total']
    )
    matchstats1[_side + '_df_pct'] = matchstats1[_side + '_double_faults'] / _serve_points * 100

In [49]:
# First-serve-in percentage per player: first serves in / first serves attempted.
for _side in ('winner', 'loser'):
    _made = matchstats1[_side + '_first_serves_in']
    _attempted = matchstats1[_side + '_first_serves_total']
    matchstats1[_side + '_firstsrv_in_pct'] = (_made / _attempted) * 100

In [50]:
# First-serve points won percentage per player: points won / points played on first serve.
for _side in ('winner', 'loser'):
    _won = matchstats1[_side + '_first_serve_points_won']
    _played = matchstats1[_side + '_first_serve_points_total']
    matchstats1[_side + '_firstsrv_won_pct'] = (_won / _played) * 100

In [51]:
# Second-serve points won percentage per player: points won / points played on second serve.
for _side in ('winner', 'loser'):
    _won = matchstats1[_side + '_second_serve_points_won']
    _played = matchstats1[_side + '_second_serve_points_total']
    matchstats1[_side + '_secondsrv_won_pct'] = (_won / _played) * 100

In [52]:
# Service points won percentage per player: service points won / service points played.
for _side in ('winner', 'loser'):
    _won = matchstats1[_side + '_service_points_won']
    _played = matchstats1[_side + '_service_points_total']
    matchstats1[_side + '_srv_pts_pct'] = (_won / _played) * 100

In [53]:
# Return points won percentage per player: return points won / return points played.
for _side in ('winner', 'loser'):
    _won = matchstats1[_side + '_return_points_won']
    _played = matchstats1[_side + '_return_points_total']
    matchstats1[_side + '_rtn_pts_pct'] = (_won / _played) * 100

In [54]:
# Break-point conversion percentage per player: break points converted / break points available.
# A match with zero break-point chances yields a non-finite value here.
for _side in ('winner', 'loser'):
    _converted = matchstats1[_side + '_break_points_converted']
    _chances = matchstats1[_side + '_break_points_return_total']
    matchstats1[_side + '_brk_pts_pct'] = (_converted / _chances) * 100

In [55]:
# Share of all points won by the winner, as a percentage; the loser's share is
# defined as the remainder so the two always sum to 100.
_winner_share = (
    matchstats1['winner_total_points_won'] / matchstats1['winner_total_points_total']
) * 100
matchstats1['winner_points_won_pct'] = _winner_share
matchstats1['loser_points_won_pct'] = 100 - matchstats1['winner_points_won_pct']

In [56]:
# Replace missing percentages with 0 in the three columns below (a ratio is
# missing when both its numerator and denominator are zero).
# The filled columns are assigned back to the frame explicitly: calling
# fillna(inplace=True) on a column selected from the frame is chained
# assignment, which is deprecated and does not update the frame under
# pandas Copy-on-Write.
_pct_columns_to_fill = ['loser_rtn_pts_pct', 'winner_brk_pts_pct', 'loser_brk_pts_pct']
matchstats1[_pct_columns_to_fill] = matchstats1[_pct_columns_to_fill].fillna(0)

In [57]:
# Cap both break-point conversion percentages at 100; values above 100 come
# from rows where more break points were recorded as converted than available.
for _pct_column in ('winner_brk_pts_pct', 'loser_brk_pts_pct'):
    matchstats1[_pct_column] = matchstats1[_pct_column].clip(upper=100)

In [58]:
# Discard rows where the loser's points-won percentage could not be computed,
# then show the per-column count of remaining missing values.
matchstats1 = matchstats1.dropna(subset=['loser_points_won_pct'])
matchstats1.isna().sum()

match_id                 0
match_time               0
match_duration           0
winner_aces              0
winner_double_faults     0
                        ..
loser_rtn_pts_pct        0
winner_brk_pts_pct       0
loser_brk_pts_pct        0
winner_points_won_pct    0
loser_points_won_pct     0
Length: 84, dtype: int64

In [59]:
# Winner-minus-loser differences for each percentage statistic.
# For every name below, '<name>_dif' = 'winner_<name>_pct' - 'loser_<name>_pct'.
for _stat in ('ace', 'df', 'firstsrv_in', 'firstsrv_won',
              'secondsrv_won', 'srv_pts', 'rtn_pts', 'brk_pts'):
    matchstats1[_stat + '_dif'] = (
        matchstats1['winner_' + _stat + '_pct'] - matchstats1['loser_' + _stat + '_pct']
    )

# Difference in share of total points won (winner minus loser).
matchstats1['total_points_pct_dif'] = (
    matchstats1['winner_points_won_pct'] - matchstats1['loser_points_won_pct']
)

# Rank difference is loser minus winner, so it is positive when the winner
# holds the lower (i.e. better) rank number.
matchstats1['rank_dif'] = (
    matchstats1['loser_rank_number'] - matchstats1['winner_rank_number']
)

In [60]:
# Treat durations below 60 as data errors and replace them with the mean
# duration, truncated to an integer.
# The mean is taken over all rows, including the short ones being replaced.
_mean_duration = int(matchstats1['match_duration'].mean())
_too_short = matchstats1['match_duration'] < 60
matchstats1.loc[_too_short, 'match_duration'] = _mean_duration

In [61]:
# matchstats1[['ace_dif','df_dif','srv_pts_dif','rtn_pts_dif','brk_pts_dif','total_points_pct_dif','rank_dif']].describe().T

## Miscellaneous Cleanup

In [62]:
# Derive 'Date' (week start with the time set to midnight) and its year/month/day
# parts from 'BeginWeek', sort chronologically, then remove the columns that are
# no longer needed.
_week_start = matchstats1['BeginWeek']
matchstats1['Date'] = _week_start.dt.normalize()
matchstats1['year'] = _week_start.dt.year
matchstats1['month'] = _week_start.dt.month
matchstats1['day'] = _week_start.dt.day
matchstats1 = (
    matchstats1
    .sort_values(by='BeginWeek')
    .drop(columns=['match_time', 'BeginWeek'])
)

In [63]:
# Correct winner break-point counts: where more break points are recorded as
# converted than were available, cap the converted count at the available total.
_available = matchstats1['winner_break_points_return_total']
_converted = matchstats1['winner_break_points_converted']
matchstats1['winner_break_points_converted'] = _converted.mask(
    _converted > _available, _available
)

In [64]:
# Final ordering: by date, then match id, with a fresh 0..n-1 index.
matchstats1 = (
    matchstats1
    .sort_values(by=['Date', 'match_id'])
    .reset_index(drop=True)
)

In [65]:
# df2.loc[startrow:endrow, startcolumn:endcolumn] # matchstats1.iloc[1200:1220,0:10]

### Create Result Dataframe for Inferential Statistics

In [66]:
# Build the winner half of the long-format results table: one row per match
# from the winner's perspective, with the 'winner_' prefix removed from the
# per-player columns and an outcome flag win = 1.
shared_columns = ['match_id', 'match_duration', 'rank_dif', 'Date', 'year', 'month', 'day']
player_stats = [
    'aces', 'double_faults', 'first_serves_in', 'first_serves_total',
    'first_serve_points_won', 'first_serve_points_total',
    'second_serve_points_won', 'second_serve_points_total',
    'break_points_saved', 'break_points_serve_total',
    'service_points_won', 'service_points_total',
    'first_serve_return_won', 'first_serve_return_total',
    'second_serve_return_won', 'second_serve_return_total',
    'break_points_converted', 'break_points_return_total',
    'service_games_played', 'return_games_played',
    'return_points_won', 'return_points_total',
    'total_points_won', 'total_points_total',
    'sets_won', 'games_won', 'tiebreaks_won',
    'rank_number', 'ranking_points',
    'ace_pct', 'df_pct', 'firstsrv_in_pct', 'firstsrv_won_pct',
    'secondsrv_won_pct', 'srv_pts_pct', 'rtn_pts_pct', 'brk_pts_pct',
    'points_won_pct',
]
winner_columns = shared_columns + ['winner_' + stat for stat in player_stats]

winner_df = matchstats1[winner_columns].copy()
winner_df['win'] = 1

# Side-neutral column names: 'Date' becomes 'date', prefixes are removed, 'win' is last.
columns = (
    ['match_id', 'match_duration', 'rank_dif', 'date', 'year', 'month', 'day']
    + player_stats
    + ['win']
)
winner_df.columns = columns

In [67]:
# Build the loser half of the long-format results table: one row per match
# from the loser's perspective, with the 'loser_' prefix removed from the
# per-player columns and an outcome flag win = 0.
shared_columns = ['match_id', 'match_duration', 'rank_dif', 'Date', 'year', 'month', 'day']
player_stats = [
    'aces', 'double_faults', 'first_serves_in', 'first_serves_total',
    'first_serve_points_won', 'first_serve_points_total',
    'second_serve_points_won', 'second_serve_points_total',
    'break_points_saved', 'break_points_serve_total',
    'service_points_won', 'service_points_total',
    'first_serve_return_won', 'first_serve_return_total',
    'second_serve_return_won', 'second_serve_return_total',
    'break_points_converted', 'break_points_return_total',
    'service_games_played', 'return_games_played',
    'return_points_won', 'return_points_total',
    'total_points_won', 'total_points_total',
    'sets_won', 'games_won', 'tiebreaks_won',
    'rank_number', 'ranking_points',
    'ace_pct', 'df_pct', 'firstsrv_in_pct', 'firstsrv_won_pct',
    'secondsrv_won_pct', 'srv_pts_pct', 'rtn_pts_pct', 'brk_pts_pct',
    'points_won_pct',
]
loser_columns = shared_columns + ['loser_' + stat for stat in player_stats]

loser_df = matchstats1[loser_columns].copy()
loser_df['win'] = 0
# 'rank_dif' in matchstats1 is loser rank minus winner rank; flip its sign so
# it is expressed from the loser's side.
loser_df['rank_dif'] = -loser_df['rank_dif']

# Side-neutral column names: 'Date' becomes 'date', prefixes are removed, 'win' is last.
columns = (
    ['match_id', 'match_duration', 'rank_dif', 'date', 'year', 'month', 'day']
    + player_stats
    + ['win']
)
loser_df.columns = columns

In [68]:
# Stack the winner and loser rows into one table (two rows per match),
# order it by date and match id with a fresh index, and preview it.
results = (
    pd.concat([winner_df, loser_df], sort=False)
    .sort_values(by=['date', 'match_id'], ascending=True)
    .reset_index(drop=True)
)
results.head()

Unnamed: 0,match_id,match_duration,rank_dif,date,year,month,day,aces,double_faults,first_serves_in,...,ace_pct,df_pct,firstsrv_in_pct,firstsrv_won_pct,secondsrv_won_pct,srv_pts_pct,rtn_pts_pct,brk_pts_pct,points_won_pct,win
0,1991-354-a028-c113,66.0,51.0,1990-12-31,1990,12,31,2.0,1.0,29.0,...,4.166667,2.083333,60.416667,89.655172,63.157895,79.166667,41.071429,25.0,58.653846,1
1,1991-354-a028-c113,66.0,-51.0,1990-12-31,1990,12,31,3.0,3.0,42.0,...,5.357143,5.357143,75.0,64.285714,42.857143,58.928571,20.833333,0.0,41.346154,0
2,1991-354-b040-c260,147.0,-59.0,1990-12-31,1990,12,31,3.0,2.0,56.0,...,3.191489,2.12766,59.574468,66.071429,55.263158,61.702128,39.622642,20.0,50.0,1
3,1991-354-b040-c260,147.0,59.0,1990-12-31,1990,12,31,4.0,1.0,72.0,...,3.773585,0.943396,67.924528,58.333333,64.705882,60.377358,38.297872,50.0,50.0,0
4,1991-354-b040-m048,68.0,40.0,1990-12-31,1990,12,31,2.0,1.0,37.0,...,4.081633,2.040816,75.510204,67.567568,50.0,63.265306,54.545455,54.545455,58.653846,1


In [69]:
# Summary statistics for matchstats1, transposed so each described column is a row.
matchstats1.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
match_duration,88206.0,104.614777,33.616448,60.000000,80.000000,99.000000,121.000000,353.000000
winner_aces,88206.0,6.236208,5.080645,0.000000,3.000000,5.000000,9.000000,61.000000
winner_double_faults,88206.0,2.728306,2.340481,0.000000,1.000000,2.000000,4.000000,23.000000
winner_first_serves_in,88206.0,45.306260,20.206875,0.000000,32.000000,43.000000,56.000000,194.000000
winner_first_serves_total,88206.0,77.114323,28.263841,22.000000,56.000000,72.000000,93.000000,258.000000
...,...,...,...,...,...,...,...,...
total_points_pct_dif,88206.0,10.935045,8.231343,-39.393939,5.050505,9.638554,15.463918,65.517241
rank_dif,88206.0,29.128869,87.778525,-298.000000,-18.000000,22.000000,70.000000,299.000000
year,88206.0,2004.223012,7.684788,1990.000000,1997.000000,2005.000000,2011.000000,2016.000000
month,88206.0,5.662925,2.985387,1.000000,3.000000,6.000000,8.000000,12.000000


In [70]:
# Sanity check: list any rows where the winner is still recorded as converting
# more break points than were available (none are expected after the correction).
_still_inconsistent = matchstats1['winner_break_points_converted'].gt(
    matchstats1['winner_break_points_return_total']
)
breakpointerrors = matchstats1.loc[_still_inconsistent]
breakpointdf = pd.DataFrame(breakpointerrors)
breakpointdf.head()

Unnamed: 0,match_id,match_duration,winner_aces,winner_double_faults,winner_first_serves_in,winner_first_serves_total,winner_first_serve_points_won,winner_first_serve_points_total,winner_second_serve_points_won,winner_second_serve_points_total,...,secondsrv_won_dif,srv_pts_dif,rtn_pts_dif,brk_pts_dif,total_points_pct_dif,rank_dif,Date,year,month,day


In [71]:
# Sanity check: show any rows whose break-point percentage difference exceeds 100.
matchstats1.loc[matchstats1['brk_pts_dif'].gt(100)]

Unnamed: 0,match_id,match_duration,winner_aces,winner_double_faults,winner_first_serves_in,winner_first_serves_total,winner_first_serve_points_won,winner_first_serve_points_total,winner_second_serve_points_won,winner_second_serve_points_total,...,secondsrv_won_dif,srv_pts_dif,rtn_pts_dif,brk_pts_dif,total_points_pct_dif,rank_dif,Date,year,month,day


### Export Completed File to csv format for use in EDA

In [72]:
# Write the per-match (wide) table to CSV without the index column.
matchstats1.to_csv(r'data/matchstats.csv', index=False)

In [73]:
# Write the per-player (long) results table to CSV without the index column.
results.to_csv(r'data/result.csv',index=False)