In [1]:
import matplotlib.pyplot as plt
from datetime import datetime
from elopy import *
import pandas as pd
import numpy as np
from scipy.optimize import curve_fit
from elopy import Implementation

In [2]:
# Import & drop duplicates for ttElite
dfElite = pd.read_csv('./ttennisData/TTElite.csv')
dfElite = dfElite.drop_duplicates(keep='last')
dfElite.to_csv('./ttennisData/TTElite.csv', index=False)

# Import & drop duplicates for ttCup
df = pd.read_csv("./ttennisData/TTCup.csv")
df = df.drop_duplicates(keep='last')
df.to_csv('./ttennisData/TTCup.csv', index=False)

print(f"{df.shape} {dfElite.shape}")
df = pd.concat([df, dfElite])
print(f"{df.shape}")


  df = pd.read_csv("./ttennisData/TTCup.csv")


(122721, 20) (187781, 20)
(310502, 20)


In [3]:
# df = pd.read_csv("./ttennisData/TTCup.csv")
print(df.shape)
df = df.fillna(0) # G4+ have 

# ------- Create new columns -------
# Combine P1 and P2 scores for each game
for i in range(1, 6):
    df[f'G{i}_Total'] = df[f'P1_G{i}'] + df[f'P2_G{i}']
    df[f'H_g{i}'] = df[f'P1_G{i}'] > df[f'P2_G{i}']

# Player 1 total score from G1 to G5
df['Total_P1'] = df[[f'P1_G{i}' for i in range(1, 6)]].sum(axis=1)
df['Total_Avg_P1'] = df['Total_P1']

# Player 2 total score from G1 to G5
df['Total_P2'] = df[[f'P2_G{i}' for i in range(1, 6)]].sum(axis=1)
df['Total_Avg_P2'] = df['Total_P2']

# Sum of combined scores from G1 to G5
df['Total_Score'] = df[[f'G{i}_Total' for i in range(1, 6)]].sum(axis=1)
df['Over_74'] = df['Total_Score'] > 74
df['G5'] = df['G5_Total'] > 0
df['G4'] = (df['G4_Total'] > 0) & (df['G5_Total'] == 0)
df['G45'] = (df['G4_Total'] > 0) | (df['G5_Total'] > 0)
df['G34'] = (df['G5_Total'] == 0)

for i in range(1,6):
    df.rename(columns={f'P1_G{i}': f'G{i}_P1', f'P2_G{i}': f'G{i}_P2'}, inplace=True)

df.rename(columns={'P1_Total': 'Total_P1', 'P2_Total': 'Total_P2'}, inplace=True)

# df['No_Odds'] = (df['Odds_P1'] + df['Odds_P2'] == 0)
# print(df['No_Odds'].sum())

df['Total_Allowed_P1'] = df['Total_P2']
df['Total_Allowed_P2'] = df['Total_P1']

df['Sets_Allowed_P1'] = df['Sets_P2']
df['Sets_Allowed_P2'] = df['Sets_P1']

df['Win_P1'] = df['Total_P1'] > df['Total_P2']
df['Win_P2'] = df['Total_P2'] > df['Total_P1']

# This is just for bill james pythagoren
#df['G34_P1'] = df['G34']
#df['G34_P2'] = df['G34']
df['H_won'] = df['Total_P1'] > df['Total_P2']
df = df.sort_values(by='Date')
df.info()

(310502, 20)
<class 'pandas.core.frame.DataFrame'>
Index: 310502 entries, 122689 to 187780
Data columns (total 47 columns):
 #   Column            Non-Null Count   Dtype  
---  ------            --------------   -----  
 0   Date              310502 non-null  float64
 1   Player1           310502 non-null  float64
 2   Player2           310502 non-null  float64
 3   Player1_Name      310502 non-null  object 
 4   Player2_Name      310502 non-null  object 
 5   Sets_P1           310502 non-null  float64
 6   Sets_P2           310502 non-null  float64
 7   Odds_P1           310502 non-null  float64
 8   Odds_P2           310502 non-null  float64
 9   G1_P1             310502 non-null  float64
 10  G1_P2             310502 non-null  float64
 11  G2_P1             310502 non-null  float64
 12  G2_P2             310502 non-null  float64
 13  G3_P1             310502 non-null  float64
 14  G3_P2             310502 non-null  float64
 15  G4_P1             310502 non-null  float64
 16  G4_P2  

In [4]:
rows = df.shape[0]

# If one player wins first 2 games, how often do they win the whole match
df['first_two_games'] = df['H_g1'] == df['H_g2']
winFirstTwo = df['first_two_games'].sum()
print(f"Either player wins first two games: {winFirstTwo / rows * 100}%")
df['first_two_and_lose'] = (df['first_two_games'] == 1) & (df['H_g1'] != df['H_won'])
firstTwoAndLose = df['first_two_and_lose'].sum()
print(f"Winner of first 2 games lose match: {firstTwoAndLose / rows * 100}%\n\n")


# How often are 5 games played
fiveGamesPlayed = df['G5'].sum()
print(f"How often are 5 games played: {fiveGamesPlayed / rows * 100}%")
fourGamesPlayed = df['G4'].sum()
print(f"How often are 4 games played: {fourGamesPlayed / rows * 100}%\n\n")

print(f"How often are 4 or 5 games played: {(fourGamesPlayed + fiveGamesPlayed) / rows * 100}%\n\n")

# How accurate are the odds
df['H_odds'] = df['Odds_P1'] < df['Odds_P2']
oddsPredResult = (df['H_odds'] == df['H_won']).sum()
print(f"How often did odds predict result: {oddsPredResult / rows * 100}%\n\n")



Either player wins first two games: 53.996753644098916%
Winner of first 2 games lose match: 4.4569761225370526%


How often are 5 games played: 29.27259727795634%
How often are 4 games played: 38.431636511198%


How often are 4 or 5 games played: 67.70423378915433%


How often did odds predict result: 60.218935787853226%


