# Analysis of the Age of Grand Slam Tennis Finalists
An exercise in manipulating pandas DataFrames and visualizing data  

## 1) Data Manipulation
Load the raw data into a DataFrame that can be used for plotting.

In [1]:
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import datetime
import math
import mpld3
from mpld3 import plugins, fig_to_html
import ipdb


def _initial_plus_surname(full_name):
    """Return the first character of ``full_name`` followed by its last space-separated word.

    A name without any space yields its first character followed by the whole name.
    """
    return full_name[0] + full_name.rsplit(' ', 1)[-1]


# Finals results (one row per final) and the player reference table.
gsWinners = pd.read_csv('GrandSlamWinners2.csv', encoding='latin1')
playersList = pd.read_csv('TennisPlayers.csv', encoding='latin')

# Build a common key -- first initial plus surname -- so that names can be
# compared across the two DataFrames.
playersList['NameShortened'] = playersList['Name'].map(_initial_plus_surname)
for sourceColumn, keyColumn in (('WINNER', 'WinnerShortened'),
                                ('RUNNER-UP', 'RUShortened')):
    gsWinners[keyColumn] = gsWinners[sourceColumn].map(_initial_plus_surname)

# Attach birth years to every final by joining the results on the shortened
# player name, once for the winner and once for the runner-up.
#
# The shortened name is NOT a unique key (e.g. Sam Smith and Sidney Smith
# collide), so a join can turn one final into several candidate rows.
# Each final is therefore tagged with a temporary id before joining and the
# candidates are collapsed back to one row per final once ages are known.
MIN_FINALIST_AGE = 15
MAX_FINALIST_AGE = 50


def _plausible_age(ages):
    """Blank out (NaN) ages outside [MIN_FINALIST_AGE, MAX_FINALIST_AGE].

    Ages outside that range are treated as the product of a name mix-up.
    """
    return ages.where(ages.between(MIN_FINALIST_AGE, MAX_FINALIST_AGE))


playersList = playersList.set_index('NameShortened')

finalCount = len(gsWinners)
gsWinners = gsWinners.assign(_finalId=np.arange(finalCount))

# Winner: adds WinnerBirthYear plus the matched full name for manual checking.
gsWinners = (gsWinners.set_index('WinnerShortened')
             .join(playersList)
             .rename(columns={'Birth': 'WinnerBirthYear', 'Name': 'WinnerNameCheck'}))
# Runner-up: same lookup, keyed on the runner-up's shortened name.
gsWinners = (gsWinners.set_index('RUShortened')
             .join(playersList)
             .rename(columns={'Birth': 'RUBirthYear', 'Name': 'RUNameCheck'})
             .reset_index())

# Age in whole years at the time of the final; implausible values become NaN.
gsWinners['WinnerAge'] = _plausible_age(gsWinners['YEAR'] - gsWinners['WinnerBirthYear'])
gsWinners['RUAge'] = _plausible_age(gsWinners['YEAR'] - gsWinners['RUBirthYear'])

# Collapse candidate rows: for each final keep the candidate with the most
# plausible ages. The multi-column sort is stable, so remaining ties fall back
# to the order in which the players appear in playersList.
# NOTE(review): if two same-keyed players both give plausible ages the choice
# is arbitrary -- inspect WinnerNameCheck / RUNameCheck for those finals.
gsWinners = (gsWinners
             .assign(_knownAges=gsWinners[['WinnerAge', 'RUAge']].notna().sum(axis=1))
             .sort_values(['_finalId', '_knownAges'], ascending=[True, False])
             .drop_duplicates(subset='_finalId', keep='first')
             .drop(columns=['_finalId', '_knownAges'])
             .reset_index(drop=True))

# The left joins never drop a final, so exactly one row per final must remain.
assert len(gsWinners) == finalCount, 'joining birth years changed the number of finals'


#Get year plus an approximate fraction of the year as a decimal for plotting.
#Each tournament is mapped to a hand-picked fraction so that the finals of
#one season sort and plot in order.
fractionalDates = {'Wimbledon':0.6,
                   'U.S. Open': 0.85,
                   'French Open': 0.35,
                   'Australian Open': 0.1,
                   'Australian Open (Jan)': 0.1,
                   'Australian Open (Dec)': 0.99}

# Dictionary lookup in float64 (no float32 rounding of the fraction).
yearFraction = gsWinners['TOURNAMENT'].map(fractionalDates)

# A tournament name missing from the dictionary would otherwise silently
# become NaN; fail with a message naming the offending value(s) instead.
unmapped = gsWinners.loc[yearFraction.isna() & gsWinners['TOURNAMENT'].notna(), 'TOURNAMENT']
if not unmapped.empty:
    raise ValueError('No calendar fraction defined for tournament(s): '
                     + ', '.join(sorted(map(str, unmapped.unique()))))

gsWinners['YearExact'] = gsWinners['YEAR'] + yearFraction
# Stable sort keeps the existing relative order of rows with equal YearExact.
gsWinners = gsWinners.sort_values('YearExact', kind='stable')

# Smooth out final-to-final variation with a trailing average over 8 rows
# (the current row and the 7 before it). min_periods=1 means an average is
# produced as soon as the window contains at least one non-NaN age.
for ageColumn, smoothedColumn in (('WinnerAge', 'Winner (8 Slam Rolling Mean)'),
                                  ('RUAge', 'Runner-Up (8 Slam Rolling Mean)')):
    trailingWindow = gsWinners[ageColumn].rolling(window=8, min_periods=1, center=False)
    gsWinners[smoothedColumn] = trailingWindow.mean()


## 2) Plotting

In [9]:
# One figure with a single axes, 8 x 4 inches.
fig, ax = plt.subplots(figsize=(8, 4))

# Stylesheet handed to the mpld3 HTML tooltip plugin further down this cell.
# The first rules style table / th / td elements; the last two switch pointer
# events off for paths inside the mpld3 figure and back on for elements with
# the mpld3-path class.
# NOTE(review): presumably so that only the scatter points react to the
# mouse -- confirm against the rendered figure.
css = '''
table
{
  border-collapse: collapse;
}
th
{
  color: #ffffff;
  background-color: #000000;
}
td
{
  background-color: #cccccc;
}
table, th, td
{
  font-family:Arial;
  border: 1px solid black;
  text-align: right;
}
.mpld3-figure path {
    pointer-events: none;
}
.mpld3-figure path.mpld3-path {
    pointer-events: auto;
}
'''

# Marker colour for each tournament (all Australian Open variants share one).
scatterColours = {'Wimbledon':'green',
                  'U.S. Open':'black',
                  'French Open': 'orange',
                  'Australian Open':'blue', 
                  'Australian Open (Jan)': 'blue', 
                  'Australian Open (Dec)': 'blue'}

gsWinners['scatterColours'] = gsWinners['TOURNAMENT'].replace(scatterColours)

# Hover text, e.g. "<tournament> <year>: <winner> beat <runner-up>".
gsWinners['ToolTip'] = (gsWinners['TOURNAMENT'] + ' '
                        + gsWinners['YEAR'].astype(str) + ': '
                        + gsWinners['WINNER'] + ' beat '
                        + gsWinners['RUNNER-UP'])

# The scatter and its tooltips must be built from exactly the same rows,
# otherwise the tooltips do not line up with the points. Filter a COPY so
# that gsWinners keeps every final: dropping rows in place would also remove
# points from the runner-up line and silently change what later (or re-run)
# cells see.
scatterData = gsWinners.dropna(subset=['WinnerAge'])
scatter = ax.scatter(scatterData['YearExact'], scatterData['WinnerAge'],
                     alpha=0.2, color=scatterData['scatterColours'].tolist())

# Scatter plot shows ages of winners; the tooltip is attached to its points.
tooltip = plugins.PointHTMLTooltip(scatter, scatterData['ToolTip'].tolist(),
                                   voffset=10, hoffset=10, css=css)
plugins.connect(fig, tooltip)

# Line plots of the rolling mean ages of winners and runners-up.
gsWinners.plot(ax=ax, x='YearExact', y='Winner (8 Slam Rolling Mean)')
gsWinners.plot(ax=ax, x='YearExact', y='Runner-Up (8 Slam Rolling Mean)')

# Vertical marker at 1968.3 with its caption.
ax.plot([1968.3, 1968.3], [0, 40])
ax.text(1963, 40.5, 'Open Era Begins')

ax.set_xlim([1960, 2020])
ax.set_ylim([15, 45])
ax.set_xlabel('Year')
ax.set_ylabel('Age')
ax.set_title('Age of Grand Slam Tennis Finalists')
# Uncomment to export the interactive figure as a standalone HTML page:
#mpld3.save_html(fig, 'GSWinnersAge.html')
# Pass the figure explicitly rather than relying on the "current figure".
mpld3.display(fig)


## 3) Does the older or younger player win more often?

In [3]:
# Positive when the winner is older than the runner-up, negative when younger.
gsWinners['AgeDifference'] = gsWinners['WinnerAge'] - gsWinners['RUAge']

# Work on a filtered copy: finals with an unknown age are ignored here but
# stay in gsWinners, so re-running the plotting cell afterwards still sees
# every final.
knownGaps = gsWinners['AgeDifference'].dropna()

# +1 = older player won, -1 = younger player won. Finals whose (whole-year)
# age difference is zero are left out of the comparison.
outcome = np.sign(knownGaps.astype(int))
isOlder = outcome[outcome != 0].value_counts()

decidedFinals = int(isOlder.sum())
# .get() avoids a KeyError when no final was won by the older player.
olderWins = int(isOlder.get(1, 0))
# With no comparable finals the fraction is undefined rather than an error.
olderFraction = olderWins / decidedFinals if decidedFinals else float('nan')

print("Fraction of grandslam titles won by the older player:")
print(round(olderFraction, 2))



Fraction of grandslam titles won by the older player:
0.48
