In [1]:
import os
import pandas as pd
import numpy as np


directory = 'tennis_datav2'


if not os.path.exists(directory):
    os.makedirs(directory)

# years range
years = range(2000, 2025)
dfs = []
# save the file
for year in years:
    file_path = os.path.join(directory, f'atp_matches_{year}.csv')


    if os.path.exists(file_path):
        df = pd.read_csv(file_path)
    else:
        url = f'https://raw.githubusercontent.com/JeffSackmann/tennis_atp/master/atp_matches_{year}.csv'
        df = pd.read_csv(url)
        df.to_csv(file_path, index=False)

    dfs.append(df)

# single dataframe
atp_matches = pd.concat(dfs, ignore_index=True)

atp_matches['tourney_date'] = pd.to_datetime(atp_matches['tourney_date'], format='%Y%m%d')

# chronological order
atp_matches = atp_matches.sort_values(by='tourney_date').reset_index(drop=True)

# split the data
total_matches = len(atp_matches)
initial_50 = atp_matches.iloc[:total_matches//2]
next_25 = atp_matches.iloc[total_matches//2:total_matches*3//4]
final_25 = atp_matches.iloc[total_matches*3//4:]

# empty players dictionary
players = {}

# player class
class Player:
    def __init__(self, name):
        self.name = name
        self.total_matches = 0
        self.total_wins = 0
        self.surface_matches = {'Hard': 0, 'Clay': 0, 'Grass': 0, 'Carpet': 0, 'Unknown': 0}
        self.surface_wins = {'Hard': 0, 'Clay': 0, 'Grass': 0, 'Carpet': 0, 'Unknown': 0}
        self.ranks = []
        self.heights = []
        self.hands = []
        self.seeds = []
        self.ages = []
        self.last_5_matches = []
        self.last_10_matches = []
        self.head_to_head = {}
        self.matches_against_right_handers = 0
        self.wins_against_right_handers = 0
        self.matches_against_left_handers = 0
        self.wins_against_left_handers = 0
        self.win_streak = 0
        self.loss_streak = 0
        self.tourney_performance = {}
        self.surface_performance = {'Hard': [], 'Clay': [], 'Grass': [], 'Carpet': [], 'Unknown': []}

    def update_stats(self, opponent, opponent_hand, is_winner, surface, rank, height, hand, seed, age, tourney_id):
        self.total_matches += 1
        self.ranks.append(rank)
        self.heights.append(height)
        self.hands.append(hand)
        self.seeds.append(seed)
        self.ages.append(age)

        self.last_5_matches.append(is_winner)
        if len(self.last_5_matches) > 5:
            self.last_5_matches.pop(0)

        self.last_10_matches.append(is_winner)
        if len(self.last_10_matches) > 10:
            self.last_10_matches.pop(0)

        if opponent not in self.head_to_head:
            self.head_to_head[opponent] = {'matches': 0, 'wins': 0}
        self.head_to_head[opponent]['matches'] += 1
        if is_winner:
            self.total_wins += 1
            self.surface_wins[surface] += 1
            self.head_to_head[opponent]['wins'] += 1
            self.win_streak += 1
            self.loss_streak = 0
        else:
            self.win_streak = 0
            self.loss_streak += 1

        if opponent_hand == 'R':
            self.matches_against_right_handers += 1
            if is_winner:
                self.wins_against_right_handers += 1
        elif opponent_hand == 'L':
            self.matches_against_left_handers += 1
            if is_winner:
                self.wins_against_left_handers += 1

        self.surface_matches[surface] += 1
        self.surface_performance[surface].append(is_winner)
        if len(self.surface_performance[surface]) > 10:
            self.surface_performance[surface].pop(0)

        if tourney_id not in self.tourney_performance:
            self.tourney_performance[tourney_id] = {'matches': 0, 'wins': 0}
        self.tourney_performance[tourney_id]['matches'] += 1
        if is_winner:
            self.tourney_performance[tourney_id]['wins'] += 1

    def win_percentage(self, surface):
        if self.surface_matches[surface] == 0:
            return 0
        return self.surface_wins[surface] / self.surface_matches[surface]

    def overall_win_percentage(self):
        if self.total_matches == 0:
            return 0
        return self.total_wins / self.total_matches

    def last_5_win_percentage(self):
        if len(self.last_5_matches) == 0:
            return 0
        return sum(self.last_5_matches) / len(self.last_5_matches)

    def last_10_win_percentage(self):
        if len(self.last_10_matches) == 0:
            return 0
        return sum(self.last_10_matches) / len(self.last_10_matches)

    def surface_last_10_win_percentage(self, surface):
        surface_mapping = {0: 'Clay', 1: 'Grass', 2: 'Hard'}  # encode the surface
        if isinstance(surface, int):
            surface = surface_mapping.get(surface, 'Unknown')

        if surface not in self.surface_performance:
            return 0
        if len(self.surface_performance[surface]) == 0:
            return 0
        return sum(self.surface_performance[surface]) / len(self.surface_performance[surface])

    def head_to_head_stats(self, opponent):
        if opponent not in self.head_to_head:
            return {'matches': 0, 'wins': 0}
        return self.head_to_head[opponent]

    def win_percentage_against_right_handers(self):
        if self.matches_against_right_handers == 0:
            return 0
        return self.wins_against_right_handers / self.matches_against_right_handers

    def win_percentage_against_left_handers(self):
        if self.matches_against_left_handers == 0:
            return 0
        return self.wins_against_left_handers / self.matches_against_left_handers

    def is_top_10(self, rank):
        return rank <= 10 if rank else False

    def tourney_win_percentage(self, tourney_id):
        if self.tourney_performance[tourney_id]['matches'] == 0:
            return 0
        return self.tourney_performance[tourney_id]['wins'] / self.tourney_performance[tourney_id]['matches']

    def preferred_surface(self):
        best_surface = max(self.surface_wins, key=lambda x: self.win_percentage(x))
        return best_surface


def update_player_stats(row, is_winner):
    name = row['winner_name'] if is_winner else row['loser_name']
    opponent = row['loser_name'] if is_winner else row['winner_name']
    opponent_hand = row['loser_hand'] if is_winner else row['winner_hand']
    surface = row['surface'] if pd.notna(row['surface']) else 'Unknown'
    rank = row['winner_rank'] if is_winner else row['loser_rank']
    height = row['winner_ht'] if is_winner else row['loser_ht']
    hand = row['winner_hand'] if is_winner else row['loser_hand']
    seed = row['winner_seed'] if is_winner else row['loser_seed']
    age = row['winner_age'] if is_winner else row['loser_age']
    tourney_id = row['tourney_id']

    if name not in players:
        players[name] = Player(name)

    players[name].update_stats(
        opponent=opponent,
        opponent_hand=opponent_hand,
        is_winner=is_winner,
        surface=surface,
        rank=rank,
        height=height,
        hand=hand,
        seed=seed,
        age=age,
        tourney_id=tourney_id
    )


# input initial data
for index, row in initial_50.iterrows():
    update_player_stats(row, is_winner=True)
    update_player_stats(row, is_winner=False)

# append data row by row
for index, row in next_25.iterrows():
    update_player_stats(row, is_winner=True)
    update_player_stats(row, is_winner=False)







In [2]:
player_name = 'Novak Djokovic'
if player_name in players:
    player = players[player_name]
    print(f'{player.name} win percentage on Hard: {player.win_percentage("Hard"):.2f}')
    print(f'{player.name} win percentage on Clay: {player.win_percentage("Clay"):.2f}')
    print(f'{player.name} win percentage on Grass: {player.win_percentage("Grass"):.2f}')
    print(f'{player.name} overall win percentage: {player.overall_win_percentage():.2f}')
    print(f'{player.name} last 5 match win percentage: {player.last_5_win_percentage():.2f}')
    print(f'{player.name} win percentage against right-handers: {player.win_percentage_against_right_handers():.2f}')
    print(f'{player.name} win percentage against left-handers: {player.win_percentage_against_left_handers():.2f}')
    print(f'{player.name} is top 10: {player.is_top_10(player.ranks[-1])}')
    print(f'{player.name} current win streak: {player.win_streak}')
    print(f'{player.name} current loss streak: {player.loss_streak}')
    print(f'{player.name} ranks: {player.ranks}')
    print(f'{player.name} heights: {player.heights}')
    print(f'{player.name} hands: {player.hands}')
    print(f'{player.name} seeds: {player.seeds}')
    print(f'{player.name} ages: {player.ages}')
    print(f'{player.name} best surface: {player.preferred_surface()}')
else:
    print(f'Player {player_name} not found.')


Novak Djokovic win percentage on Hard: 0.84
Novak Djokovic win percentage on Clay: 0.80
Novak Djokovic win percentage on Grass: 0.82
Novak Djokovic overall win percentage: 0.83
Novak Djokovic last 5 match win percentage: 0.80
Novak Djokovic win percentage against right-handers: 0.84
Novak Djokovic win percentage against left-handers: 0.75
Novak Djokovic is top 10: True
Novak Djokovic current win streak: 4
Novak Djokovic current loss streak: 0
Novak Djokovic ranks: [606.0, 368.0, 272.0, 272.0, 248.0, 188.0, 160.0, 160.0, 160.0, 142.0, 142.0, 153.0, 153.0, 128.0, 128.0, 128.0, 97.0, 97.0, 97.0, 97.0, 97.0, 97.0, 88.0, 88.0, 85.0, 85.0, 85.0, 76.0, 81.0, 81.0, 81.0, 81.0, 70.0, 70.0, 72.0, 72.0, 72.0, 67.0, 66.0, 66.0, 64.0, 64.0, 67.0, 66.0, 71.0, 71.0, 63.0, 63.0, 63.0, 63.0, 63.0, 40.0, 40.0, 39.0, 39.0, 39.0, 39.0, 36.0, 36.0, 36.0, 36.0, 36.0, 28.0, 28.0, 28.0, 28.0, 28.0, 24.0, 24.0, 23.0, 23.0, 23.0, 21.0, 21.0, 22.0, 22.0, 22.0, 22.0, 22.0, 16.0, 16.0, 17.0, 17.0, 17.0, 16.0, 16.0

In [3]:
def filter_players_with_min_matches(df, min_matches=10):
    player_match_count = df['winner_name'].value_counts().add(df['loser_name'].value_counts(), fill_value=0)
    eligible_players = player_match_count[player_match_count >= min_matches].index

    df = df[df['winner_name'].isin(eligible_players) & df['loser_name'].isin(eligible_players)]
    return df
next_25 = filter_players_with_min_matches(next_25)
final_25 = filter_players_with_min_matches(final_25)

In [4]:
def create_player_vs_player_dataset(df):
    df_new = df.copy()
    swap = np.random.rand(len(df)) < 0.5  # shuffling the dataset

    # assign player1 and player 2
    df_new['player1'] = np.where(swap, df_new['winner_name'], df_new['loser_name'])
    df_new['player2'] = np.where(swap, df_new['loser_name'], df_new['winner_name'])

    df_new['player1_seed'] = np.where(swap, df_new['winner_seed'], df_new['loser_seed'])
    df_new['player2_seed'] = np.where(swap, df_new['loser_seed'], df_new['winner_seed'])

    df_new['player1_rank'] = np.where(swap, df_new['winner_rank'], df_new['loser_rank'])
    df_new['player2_rank'] = np.where(swap, df_new['loser_rank'], df_new['winner_rank'])

    df_new['player1_age'] = np.where(swap, df_new['winner_age'], df_new['loser_age'])
    df_new['player2_age'] = np.where(swap, df_new['loser_age'], df_new['winner_age'])

    df_new['player1_seed'] = np.where(swap, df_new['winner_seed'], df_new['loser_seed'])
    df_new['player2_seed'] = np.where(swap, df_new['loser_seed'], df_new['winner_seed'])

    df_new['player1_ht'] = np.where(swap, df_new['winner_ht'], df_new['loser_ht'])
    df_new['player2_ht'] = np.where(swap, df_new['loser_ht'], df_new['winner_ht'])

    df_new['player1_hand'] = np.where(swap, df_new['winner_hand'], df_new['loser_hand'])
    df_new['player2_hand'] = np.where(swap, df_new['loser_hand'], df_new['winner_hand'])

    df_new['result'] = np.where(swap, 1, 2)  # player1 wins if not swapped, otherwise player2 wins
    return df_new
next_25 = create_player_vs_player_dataset(next_25)
final_25 = create_player_vs_player_dataset(final_25)
initial_50 = create_player_vs_player_dataset(initial_50)

In [5]:
import numpy as np
from sklearn.preprocessing import LabelEncoder


def create_features(df, encoder=None):
    surface_mapping = {
        'Hard': 'Hard', 'hard': 'Hard', 'HARD': 'Hard',
        'Clay': 'Clay', 'clay': 'Clay', 'CLAY': 'Clay',
        'Grass': 'Grass', 'grass': 'Grass', 'GRASS': 'Grass'
    }

    df['surface'] = df['surface'].map(surface_mapping).fillna('Unknown')

    # encode surface
    if encoder is None:
        encoder = LabelEncoder()
        surfaces_with_unknown = pd.concat([df['surface'], pd.Series(['Unknown'])])
        encoder.fit(surfaces_with_unknown)

    df['surface_encoded'] = encoder.transform(df['surface'])

    # feature engineering
    df['rank_diff'] = df['player1_rank'] - df['player2_rank']
    df['log_rank_diff'] = np.log1p(np.abs(df['player1_rank'] - df['player2_rank']))
    df['top_10_vs_not'] = ((df['player1_rank'] <= 10) & (df['player2_rank'] > 10)) | ((df['player2_rank'] <= 10) & (df['player1_rank'] > 10))

    df['age_diff'] = df['player1_age'] - df['player2_age']
    df['young_vs_old'] = ((df['player1_age'] < 25) & (df['player2_age'] > 30)) | ((df['player2_age'] < 25) & (df['player1_age'] > 30))

    df['player1_seed'] = pd.to_numeric(df['player1_seed'], errors='coerce')
    df['player2_seed'] = pd.to_numeric(df['player2_seed'], errors='coerce')

    df['seed_diff'] = df['player1_seed'] - df['player2_seed']
    df['seeded_vs_unseeded'] = (df['player1_seed'].notna() & df['player2_seed'].isna()) | (df['player2_seed'].notna() & df['player1_seed'].isna())

    df['height_diff'] = df['player1_ht'] - df['player2_ht']
    df['tall_vs_short'] = ((df['player1_ht'] > 190) & (df['player2_ht'] < 180)) | ((df['player2_ht'] > 190) & (df['player1_ht'] < 180))

    df['same_hand'] = df['player1_hand'] == df['player2_hand']
    df['left_vs_right'] = ((df['player1_hand'] == 'L') & (df['player2_hand'] == 'R')) | ((df['player2_hand'] == 'L') & (df['player1_hand'] == 'R'))

    # win percentages
    df['player1_last_5_win_percentage'] = df.apply(lambda row: players.get(row['player1'], Player(row['player1'])).last_5_win_percentage(), axis=1)
    df['player2_last_5_win_percentage'] = df.apply(lambda row: players.get(row['player2'], Player(row['player2'])).last_5_win_percentage(), axis=1)
    df['player1_last_10_win_percentage'] = df.apply(lambda row: players.get(row['player1'], Player(row['player1'])).last_10_win_percentage(), axis=1)
    df['player2_last_10_win_percentage'] = df.apply(lambda row: players.get(row['player2'], Player(row['player2'])).last_10_win_percentage(), axis=1)

    df['player1_surface_last_10_win_percentage'] = df.apply(lambda row: players.get(row['player1'], Player(row['player1'])).surface_last_10_win_percentage(row['surface']), axis=1)
    df['player2_surface_last_10_win_percentage'] = df.apply(lambda row: players.get(row['player2'], Player(row['player2'])).surface_last_10_win_percentage(row['surface']), axis=1)

    # preferred surface
    df['player1_preferred_surface'] = df.apply(lambda row: players.get(row['player1'], Player(row['player1'])).preferred_surface(), axis=1)
    df['player2_preferred_surface'] = df.apply(lambda row: players.get(row['player2'], Player(row['player2'])).preferred_surface(), axis=1)

    # head to head stats
    df['head_to_head_wins_p1'] = df.apply(lambda row: players.get(row['player1'], Player(row['player1'])).head_to_head_stats(row['player2']).get('wins', 0), axis=1)
    df['head_to_head_wins_p2'] = df.apply(lambda row: players.get(row['player2'], Player(row['player2'])).head_to_head_stats(row['player1']).get('wins', 0), axis=1)

    # match surface vs preferred surface
    df['player1_surface_match'] = df.apply(lambda row: row['surface'] == players.get(row['player1'], Player(row['player1'])).preferred_surface(), axis=1)
    df['player2_surface_match'] = df.apply(lambda row: row['surface'] == players.get(row['player2'], Player(row['player2'])).preferred_surface(), axis=1)
    df['surface_preference_diff'] = df['player1_surface_match'].astype(int) - df['player2_surface_match'].astype(int)

    return df
# apply the function
next_25 = create_features(next_25)
final_25 = create_features(final_25)




In [None]:
print(next_25.columns.tolist())

['tourney_id', 'tourney_name', 'surface', 'draw_size', 'tourney_level', 'tourney_date', 'match_num', 'winner_id', 'winner_seed', 'winner_entry', 'winner_name', 'winner_hand', 'winner_ht', 'winner_ioc', 'winner_age', 'loser_id', 'loser_seed', 'loser_entry', 'loser_name', 'loser_hand', 'loser_ht', 'loser_ioc', 'loser_age', 'score', 'best_of', 'round', 'minutes', 'w_ace', 'w_df', 'w_svpt', 'w_1stIn', 'w_1stWon', 'w_2ndWon', 'w_SvGms', 'w_bpSaved', 'w_bpFaced', 'l_ace', 'l_df', 'l_svpt', 'l_1stIn', 'l_1stWon', 'l_2ndWon', 'l_SvGms', 'l_bpSaved', 'l_bpFaced', 'winner_rank', 'winner_rank_points', 'loser_rank', 'loser_rank_points', 'player1', 'player2', 'player1_seed', 'player2_seed', 'player1_rank', 'player2_rank', 'player1_age', 'player2_age', 'player1_ht', 'player2_ht', 'player1_hand', 'player2_hand', 'result', 'surface_encoded', 'rank_diff', 'log_rank_diff', 'top_10_vs_not', 'age_diff', 'young_vs_old', 'seed_diff', 'seeded_vs_unseeded', 'height_diff', 'tall_vs_short', 'same_hand', 'left_v

In [6]:
import pandas as pd
import pandas as pd
import numpy as np

def fill_na_values(df):
    # fill missing values
    df['player1'].fillna('Unknown', inplace=True)
    df['player2'].fillna('Unknown', inplace=True)
    df['player1_hand'].fillna('U', inplace=True)
    df['player2_hand'].fillna('U', inplace=True)
    df['result'].fillna(-1, inplace=True) # -1 is unknown result
    df['surface_encoded'].fillna(-1, inplace=True)

    df['player1_rank'].fillna(999, inplace=True)
    df['player2_rank'].fillna(999, inplace=True)

    df['player1_age'].fillna(df['player1_age'].median(), inplace=True)
    df['player2_age'].fillna(df['player2_age'].median(), inplace=True)

    df['player1_seed'] = pd.to_numeric(df['player1_seed'], errors='coerce')
    df['player2_seed'] = pd.to_numeric(df['player2_seed'], errors='coerce')
    df['player1_seed'].fillna(-1, inplace=True)
    df['player2_seed'].fillna(-1, inplace=True)

    df['player1_ht'].fillna(df['player1_ht'].median(), inplace=True)
    df['player2_ht'].fillna(df['player2_ht'].median(), inplace=True)

    df['rank_diff'].fillna(df['rank_diff'].median(), inplace=True)
    df['log_rank_diff'].fillna(df['log_rank_diff'].median(), inplace=True)
    df['top_10_vs_not'].fillna(0, inplace=True)
    df['age_diff'].fillna(df['age_diff'].median(), inplace=True)
    df['young_vs_old'].fillna(0, inplace=True)
    df['seed_diff'].fillna(df['seed_diff'].median(), inplace=True)
    df['seeded_vs_unseeded'].fillna(0, inplace=True)
    df['height_diff'].fillna(df['height_diff'].median(), inplace=True)
    df['tall_vs_short'].fillna(0, inplace=True)
    df['same_hand'].fillna(False, inplace=True)
    df['left_vs_right'].fillna(0, inplace=True)
    df['player1_last_5_win_percentage'].fillna(df['player1_last_5_win_percentage'].median(), inplace=True)
    df['player2_last_5_win_percentage'].fillna(df['player2_last_5_win_percentage'].median(), inplace=True)
    df['player1_last_10_win_percentage'].fillna(df['player1_last_10_win_percentage'].median(), inplace=True)
    df['player2_last_10_win_percentage'].fillna(df['player2_last_10_win_percentage'].median(), inplace=True)
    df['player1_surface_last_10_win_percentage'].fillna(df['player1_surface_last_10_win_percentage'].median(), inplace=True)
    df['player2_surface_last_10_win_percentage'].fillna(df['player2_surface_last_10_win_percentage'].median(), inplace=True)
    df['player1_preferred_surface'].fillna('Unknown', inplace=True)
    df['player2_preferred_surface'].fillna('Unknown', inplace=True)
    df['head_to_head_wins_p1'].fillna(0, inplace=True)
    df['head_to_head_wins_p2'].fillna(0, inplace=True)
    df['player1_surface_match'].fillna(False, inplace=True)
    df['player2_surface_match'].fillna(False, inplace=True)
    df['surface_preference_diff'].fillna(0, inplace=True)

    return df


# apply the function
next_25 = fill_na_values(next_25)
final_25 = fill_na_values(final_25)


In [7]:
from sklearn.preprocessing import LabelEncoder

# fitting encoder on surface types
def fit_surface_encoder(df):

    surfaces = ['Hard', 'Clay', 'Grass', 'Unknown']
    encoder = LabelEncoder()
    encoder.fit(surfaces)
    return encoder

def apply_surface_encoding(df, encoder):


    surface_mapping = {
        'Hard': 'Hard', 'hard': 'Hard', 'HARD': 'Hard',
        'Clay': 'Clay', 'clay': 'Clay', 'CLAY': 'Clay',
        'Grass': 'Grass', 'grass': 'Grass', 'GRASS': 'Grass'
    }

    # map the surfaces
    df['player1_preferred_surface'] = df['player1_preferred_surface'].astype(str).map(surface_mapping).fillna('Unknown')
    df['player2_preferred_surface'] = df['player2_preferred_surface'].astype(str).map(surface_mapping).fillna('Unknown')

    # no unseen surfaces
    unique_surfaces = pd.concat([df['player1_preferred_surface'], df['player2_preferred_surface']]).unique()
    unseen_surfaces = set(unique_surfaces) - set(encoder.classes_)
    if unseen_surfaces:
        raise ValueError(f"Unseen surface types found: {unseen_surfaces}")

    # apply encoder
    df['player1_preferred_surface'] = encoder.transform(df['player1_preferred_surface'])
    df['player2_preferred_surface'] = encoder.transform(df['player2_preferred_surface'])

    return df

# encoder on surface types
le_surface = fit_surface_encoder(next_25)

# apply the function
next_25 = apply_surface_encoding(next_25, le_surface)
final_25 = apply_surface_encoding(final_25, le_surface)







In [8]:
from sklearn.preprocessing import LabelEncoder
from sklearn.impute import SimpleImputer
import pandas as pd

# convert numerical cols to float
numerical_columns = [
    'rank_diff',
    'log_rank_diff',
    'top_10_vs_not',
    'age_diff',
    'young_vs_old',
    'seed_diff',
    'seeded_vs_unseeded',
    'height_diff',
    'tall_vs_short',
    'player1_last_5_win_percentage',
    'player2_last_5_win_percentage',
    'player1_last_10_win_percentage',
    'player2_last_10_win_percentage',
    'player1_surface_last_10_win_percentage',
    'player2_surface_last_10_win_percentage',
    'player1_surface_match',
    'player2_surface_match',
    'surface_preference_diff',
    'head_to_head_wins_p1',
    'head_to_head_wins_p2',
    'surface_encoded'
]


for col in numerical_columns:
    next_25[col] = pd.to_numeric(next_25[col], errors='coerce')
    final_25[col] = pd.to_numeric(final_25[col], errors='coerce')

imputer = SimpleImputer(strategy='median')
next_25[numerical_columns] = imputer.fit_transform(next_25[numerical_columns])
final_25[numerical_columns] = imputer.transform(final_25[numerical_columns])

# Combine both datasets to fit LabelEncoder on all categories
combined = pd.concat([next_25, final_25])

# Process categorical columns
categorical_columns = combined.select_dtypes(include=['object']).columns
for col in categorical_columns:
    le = LabelEncoder()
    combined[col] = combined[col].astype(str)
    combined[col] = le.fit_transform(combined[col])

# Split the combined data back into next_25 and final_25
next_25 = combined.iloc[:len(next_25)].copy()
final_25 = combined.iloc[len(next_25):].copy()

# Now next_25 and final_25 have the categorical columns processed correctly



In [9]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.ensemble import HistGradientBoostingClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, recall_score, precision_score, f1_score
from sklearn.impute import SimpleImputer


valid_surfaces = ['Hard', 'Clay', 'Grass']


# update player stats
for index, row in next_25.iterrows():
    if row['surface'] not in valid_surfaces:
        row['surface'] = 'Unknown'
    update_player_stats(row, is_winner=True)
    update_player_stats(row, is_winner=False)

# missing values
imputer = SimpleImputer(strategy='median')
next_25[numerical_columns] = imputer.fit_transform(next_25[numerical_columns])

next_25['player1_seed'].fillna('Not Seeded', inplace=True)
next_25['player2_seed'].fillna('Not Seeded', inplace=True)

for df in [next_25, final_25]:
    df['player1_preferred_surface'] = df['player1_preferred_surface'].astype(str)
    df['player2_preferred_surface'] = df['player2_preferred_surface'].astype(str)

    le_surface = LabelEncoder()
    df['player1_preferred_surface'] = le_surface.fit_transform(df['player1_preferred_surface'])
    df['player2_preferred_surface'] = le_surface.transform(df['player2_preferred_surface'])

# encode categorical columns
categorical_columns = next_25.select_dtypes(include=['object']).columns
label_encoders = {}
for col in categorical_columns:
    le = LabelEncoder()
    next_25[col] = next_25[col].astype(str)
    next_25[col] = le.fit_transform(next_25[col])
    label_encoders[col] = le

# scale features
scaler = StandardScaler()
next_25[numerical_columns] = scaler.fit_transform(next_25[numerical_columns])

# split data into players in top 10 vs everyone else
top_10_data = next_25[(next_25['player1_rank'] <= 10) | (next_25['player2_rank'] <= 10)]
other_data = next_25[(next_25['player1_rank'] > 10) & (next_25['player2_rank'] > 10)]

# Features and target
features = [
    'rank_diff',
    'log_rank_diff',
    'top_10_vs_not',
    'age_diff',
    'young_vs_old',
    'seed_diff',
    'seeded_vs_unseeded',
    'height_diff',
    'tall_vs_short',
    'player1_last_5_win_percentage',
    'player2_last_5_win_percentage',
    'player1_last_10_win_percentage',
    'player2_last_10_win_percentage',
    'player1_surface_last_10_win_percentage',
    'player2_surface_last_10_win_percentage',
    'player1_preferred_surface',
    'player2_preferred_surface',
    'player1_surface_match',
    'player2_surface_match',
    'surface_preference_diff',
    'head_to_head_wins_p1',
    'head_to_head_wins_p2',
    'surface',
    'surface_encoded'
]

target = 'result'

# split the data
X_top_10_train, X_top_10_test, y_top_10_train, y_top_10_test = train_test_split(top_10_data[features], top_10_data[target], test_size=0.2, random_state=42)
X_other_train, X_other_test, y_other_train, y_other_test = train_test_split(other_data[features], other_data[target], test_size=0.2, random_state=42)

# train top 10 model using histgradientboostingclassifier
model_top_10 = HistGradientBoostingClassifier(random_state=42)
model_top_10.fit(X_top_10_train, y_top_10_train)

# train other model using histgradientboostingclassifier
model_other = HistGradientBoostingClassifier(random_state=42)
model_other.fit(X_other_train, y_other_train)

# evalulate
y_pred_top_10 = model_top_10.predict(X_top_10_test)
print("Top 10 Players Model Evaluation")
print("Accuracy:", accuracy_score(y_top_10_test, y_pred_top_10))
print("Precision:", precision_score(y_top_10_test, y_pred_top_10, average='weighted'))
print("Recall:", recall_score(y_top_10_test, y_pred_top_10, average='weighted'))
print("F1 Score:", f1_score(y_top_10_test, y_pred_top_10, average='weighted'))

y_pred_other = model_other.predict(X_other_test)
print("Other Players Model Evaluation")
print("Accuracy:", accuracy_score(y_other_test, y_pred_other))
print("Precision:", precision_score(y_other_test, y_pred_other, average='weighted'))
print("Recall:", recall_score(y_other_test, y_pred_other, average='weighted'))
print("F1 Score:", f1_score(y_other_test, y_pred_other, average='weighted'))









Top 10 Players Model Evaluation
Accuracy: 0.8359580052493438
Precision: 0.8362065093425821
Recall: 0.8359580052493438
F1 Score: 0.8359656333355913
Other Players Model Evaluation
Accuracy: 0.8342391304347826
Precision: 0.8343833775415811
Recall: 0.8342391304347826
F1 Score: 0.8342195687117409


In [10]:
# loop for missing players
for index, row in final_25.iterrows():
    player1 = row['player1']
    player2 = row['player2']


    if player1 not in players:
        print(f"Adding missing player: {player1}")
        players[player1] = Player(player1)


    if player2 not in players:
        print(f"Adding missing player: {player2}")
        players[player2] = Player(player2)


surface_map = {0: 'Clay', 1: 'Grass', 2: 'Hard'}

# update stats
for index, row in final_25.iterrows():

    row['surface'] = surface_map.get(row['surface_encoded'], 'Unknown')

    update_player_stats(row, is_winner=True)
    update_player_stats(row, is_winner=False)

Adding missing player: 375
Adding missing player: 459
Adding missing player: 206
Adding missing player: 434
Adding missing player: 338
Adding missing player: 3
Adding missing player: 303
Adding missing player: 455
Adding missing player: 503
Adding missing player: 156
Adding missing player: 96
Adding missing player: 235
Adding missing player: 87
Adding missing player: 183
Adding missing player: 473
Adding missing player: 513
Adding missing player: 267
Adding missing player: 439
Adding missing player: 125
Adding missing player: 508
Adding missing player: 75
Adding missing player: 84
Adding missing player: 271
Adding missing player: 18
Adding missing player: 77
Adding missing player: 251
Adding missing player: 19
Adding missing player: 123
Adding missing player: 293
Adding missing player: 506
Adding missing player: 518
Adding missing player: 295
Adding missing player: 516
Adding missing player: 510
Adding missing player: 275
Adding missing player: 23
Adding missing player: 14
Adding missi

In [11]:
# Ensure the players dictionary is initialized
  # Assuming players dictionary is already populated elsewhere

# Surface mapping: mapping string to integers
surface_mapping = {
    'Clay': 0,
    'Grass': 1,
    'Hard': 2
}
import numpy as np

def create_featuresv2(player1, player2, surface):

    rank_diff = player1.ranks[-1] - player2.ranks[-1]
    log_rank_diff = np.log(abs(rank_diff) + 1)


    top_10_vs_not = int(player1.is_top_10(player1.ranks[-1]) and not player2.is_top_10(player2.ranks[-1]))


    age_diff = player1.ages[-1] - player2.ages[-1]
    young_vs_old = int(player1.ages[-1] < player2.ages[-1])


    seed_diff = (player1.seeds[-1] or 999) - (player2.seeds[-1] or 999)
    seeded_vs_unseeded = int((player1.seeds[-1] is not None) and (player2.seeds[-1] is None))


    height_diff = player1.heights[-1] - player2.heights[-1]
    tall_vs_short = int(player1.heights[-1] > player2.heights[-1])


    player1_last_5_win_percentage = player1.last_5_win_percentage()
    player2_last_5_win_percentage = player2.last_5_win_percentage()
    player1_last_10_win_percentage = player1.last_10_win_percentage()
    player2_last_10_win_percentage = player2.last_10_win_percentage()


    player1_surface_last_10_win_percentage = player1.surface_last_10_win_percentage(surface)
    player2_surface_last_10_win_percentage = player2.surface_last_10_win_percentage(surface)


    player1_preferred_surface = int(player1.preferred_surface() == surface)
    player2_preferred_surface = int(player2.preferred_surface() == surface)

    player1_surface_match = int(player1_preferred_surface)
    player2_surface_match = int(player2_preferred_surface)


    surface_preference_diff = player1_preferred_surface - player2_preferred_surface


    head_to_head_wins_p1 = player1.head_to_head_stats(player2.name)['wins']
    head_to_head_wins_p2 = player2.head_to_head_stats(player1.name)['wins']


    surface_encoded = surface


    features = [
        rank_diff, log_rank_diff, top_10_vs_not,
        age_diff, young_vs_old,
        seed_diff, seeded_vs_unseeded,
        height_diff, tall_vs_short,
        player1_last_5_win_percentage, player2_last_5_win_percentage,
        player1_last_10_win_percentage, player2_last_10_win_percentage,
        player1_surface_last_10_win_percentage, player2_surface_last_10_win_percentage,
        player1_preferred_surface, player2_preferred_surface,
        player1_surface_match, player2_surface_match,
        surface_preference_diff, head_to_head_wins_p1, head_to_head_wins_p2,
        surface_encoded, surface
    ]

    return features




In [12]:
from sklearn.impute import SimpleImputer

def predict_match(player1_name, player2_name, surface, model_top_10, model_other):
    """
    Predicts the outcome of a match between two players on a specific surface.

    Args:
    - player1_name: Name of the first player
    - player2_name: Name of the second player
    - surface: Encoded integer representing the surface (0 for Clay, 1 for Grass, 2 for Hard)
    - model_top_10: Model used if either player is in the top 10
    - model_other: Model used if neither player is in the top 10

    Returns:
    - result: Prediction of whether player1 (1) or player2 (2) wins
    """

    player1 = players.get(player1_name)
    player2 = players.get(player2_name)

    if not player1 or not player2:
        raise ValueError("Player not found in the player dictionary")

    # determines model to use
    if player1.is_top_10(player1.ranks[-1]) or player2.is_top_10(player2.ranks[-1]):
        model = model_top_10
    else:
        model = model_other

    # generate features
    features = create_featuresv2(player1, player2, surface)

    features = [features]

    # make prediction
    prediction = model.predict(features)

    # result
    result = 1 if prediction == 1 else 2

    return result


# example match
player1_name = 'Borna Coric'
player2_name = 'Dominic Thiem'
surface_string = 'Hard'  # or 'Grass', 'Hard'
surface_encoded = surface_mapping[surface_string]

#add players from final_25 and update stats
if player1_name not in players:
    print(f"Adding missing player: {player1_name}")
    players[player1_name] = Player(player1_name)

if player2_name not in players:
    print(f"Adding missing player: {player2_name}")
    players[player2_name] = Player(player2_name)

for index, row in final_25.iterrows():

    row['surface'] = surface_map.get(row['surface_encoded'], 'Unknown')


    if row['player1'] == player1_name or row['player2'] == player1_name:
        update_player_stats(row, is_winner=(row['player1'] == player1_name))

    if row['player1'] == player2_name or row['player2'] == player2_name:
        update_player_stats(row, is_winner=(row['player1'] == player2_name))
#print(f"Stats for {player1_name}: {players[player1_name].__dict__}")
#print(f"Stats for {player2_name}: {players[player2_name].__dict__}")
# apply the function
result = predict_match(player1_name, player2_name, surface_encoded, model_top_10, model_other)
print("Prediction:", result)


Prediction: 2


