In [12]:
import pandas as pd
import os
import glob


In [1]:
path_to_files = "/Users/bowmannovey/Desktop/Transformer_Tennis/tennis_predictor/Github_Data"

In [5]:
match_charting_repo_path = os.path.join(path_to_files, 'tennis_MatchChartingProject')
atp_repo_path = os.path.join(path_to_files, 'tennis_atp')

points2020_df = pd.read_csv(os.path.join(match_charting_repo_path, 'charting-m-points-2020s.csv'))
points2010_df = pd.read_csv(os.path.join(match_charting_repo_path, 'charting-m-points-2010s.csv'))
points2000s_df = pd.read_csv(os.path.join(match_charting_repo_path, 'charting-m-points-to-2009.csv'))
matches_df = pd.read_csv(os.path.join(match_charting_repo_path, 'charting-m-matches.csv'))
names_df = pd.read_csv(os.path.join(atp_repo_path, 'atp_players.csv'))
rankings20s_df = pd.read_csv(os.path.join(atp_repo_path, 'atp_rankings_20s.csv'))
rankings10s_df = pd.read_csv(os.path.join(atp_repo_path, 'atp_rankings_10s.csv'))
rankings00s_df = pd.read_csv(os.path.join(atp_repo_path, 'atp_rankings_00s.csv'))

  points2020_df = pd.read_csv(os.path.join(match_charting_repo_path, 'charting-m-points-2020s.csv'))
  names_df = pd.read_csv(os.path.join(atp_repo_path, 'atp_players.csv'))


In [6]:
# Function to encode one match's point sequence with extended context
def process_match(match_id, match_data, score_to_idx_map):
    # Keep only relevant columns and drop rows without score
    match_data = match_data[["Svr", "Pts", "1st", "2nd", "PtWinner", "Gm1", "Gm2", "Set1", "Set2"]].copy()
    match_data = match_data[match_data["Pts"].notna()]

    # Encode features
    match_data["score_encoded"] = match_data["Pts"].map(score_to_idx_map)
    match_data["has_1st_serve"] = match_data["1st"].notna().astype(int)
    match_data["has_2nd_serve"] = match_data["2nd"].notna().astype(int)
    match_data["server_encoded"] = match_data["Svr"].apply(lambda x: 0 if x == 1 else 1)

    # Fill NaNs in game/set columns with 0
    match_data["Gm1"] = match_data["Gm1"].fillna(0).astype(int)
    match_data["Gm2"] = match_data["Gm2"].fillna(0).astype(int)
    match_data["Set1"] = match_data["Set1"].fillna(0).astype(int)
    match_data["Set2"] = match_data["Set2"].fillna(0).astype(int)

    # Create the point-level feature vectors
    feature_cols = ["server_encoded", "score_encoded", "has_1st_serve", "has_2nd_serve", "Gm1", "Gm2", "Set1", "Set2"]
    point_sequence = match_data[feature_cols].values.tolist()

    # Determine match winner (based on majority of points won)
    pt_counts = match_data["PtWinner"].value_counts()
    match_winner = 0 if pt_counts.get(1, 0) > pt_counts.get(2, 0) else 1

    return {
        "match_id": match_id,
        "point_sequence": point_sequence,
        "match_winner": match_winner
    }

# List of dataframes to process
dataframes_to_process = [points2020_df, points2010_df, points2000s_df]
processed_dataframes = []

for df in dataframes_to_process:
    # Group by match_id
    grouped_matches = df.groupby("match_id")

    # Use a smaller sample to avoid long processing times (e.g., first 5 matches)
    sample_match_ids = df["match_id"].drop_duplicates()

    # Create a score-to-index mapping
    all_scores = df["Pts"].dropna().unique()
    score_to_idx = {score: i for i, score in enumerate(all_scores)}

    # Process all sample matches
    processed = [process_match(mid, grouped_matches.get_group(mid), score_to_idx) for mid in sample_match_ids]

    # Convert to a match-level DataFrame
    processed_df = pd.DataFrame(processed)

    processed_df['match_id'] = processed_df['match_id'].astype(str)

    processed_dataframes.append(processed_df)

# Concatenate the processed dataframes if needed
# all_processed_df = pd.concat(processed_dataframes, ignore_index=True)

# Assign the processed dataframes back to their original variable names if desired
points2020s_processed = processed_dataframes[0]
points2010s_processed = processed_dataframes[1]
points2000s_processed = processed_dataframes[2]

all_points_processed = pd.concat([points2020s_processed, points2010s_processed, points2000s_processed], ignore_index=True)
all_points_processed.head()

Unnamed: 0,match_id,point_sequence,match_winner
0,20250610-M-ITF_Martos-Q2-Preston_Stearns-Aleja...,"[[0, 0, 1, 0, 0, 0, 0, 0], [0, 1, 1, 1, 0, 0, ...",0
1,20250609-M-Stuttgart-R32-Benjamin_Bonzi-Jiri_L...,"[[0, 0, 1, 0, 0, 0, 0, 0], [0, 1, 1, 1, 0, 0, ...",1
2,20250608-M-Roland_Garros-F-Jannik_Sinner-Carlo...,"[[0, 2, 1, 0, 4, 2, 1, 0], [0, 11, 1, 1, 4, 2,...",0
3,20250606-M-Roland_Garros-SF-Novak_Djokovic-Jan...,"[[1, 11, 1, 1, 4, 5, 0, 1], [1, 12, 1, 1, 4, 5...",1
4,20250604-M-Roland_Garros-QF-Novak_Djokovic-Ale...,"[[0, 0, 1, 1, 0, 0, 0, 0], [0, 9, 1, 1, 0, 0, ...",0


In [7]:
features_df = all_points_processed.join(matches_df.set_index('match_id'), on='match_id')
features_df = features_df.drop(columns = ["Tournament", "Time", "Court", "Umpire", "Charted by", "Final TB?"])
features_df['match_date'] = features_df['match_id'].str.extract(r'(\d{8})')
features_df['match_date'] = pd.to_datetime(features_df['match_date'], format='%Y%m%d')

# Normalize player names
features_df['Player 1'] = features_df['Player 1'].str.lower().str.strip()
features_df['Player 2'] = features_df['Player 2'].str.lower().str.strip()

names_df['full_name'] = (names_df['name_first'] + ' ' + names_df['name_last']).str.lower().str.strip()
names_df.head()

features_df = features_df.merge(names_df[['player_id', 'full_name']], how='left',
                               left_on='Player 1', right_on='full_name')
features_df = features_df.rename(columns={'player_id': 'player1_id'})
features_df = features_df.merge(names_df[['player_id', 'full_name']], how='left',
                               left_on='Player 2', right_on='full_name')
features_df = features_df.rename(columns={'player_id': 'player2_id'})

features_df.head()

Unnamed: 0,match_id,point_sequence,match_winner,Player 1,Player 2,Pl 1 hand,Pl 2 hand,Date,Round,Surface,Best of,match_date,player1_id,full_name_x,player2_id,full_name_y
0,20250610-M-ITF_Martos-Q2-Preston_Stearns-Aleja...,"[[0, 0, 1, 0, 0, 0, 0, 0], [0, 1, 1, 1, 0, 0, ...",0,preston stearns,alejandro lopez escribano,L,L,20250610,Q2,Hard,3,2025-06-10,210475.0,preston stearns,213075.0,alejandro lopez escribano
1,20250609-M-Stuttgart-R32-Benjamin_Bonzi-Jiri_L...,"[[0, 0, 1, 0, 0, 0, 0, 0], [0, 1, 1, 1, 0, 0, ...",1,benjamin bonzi,jiri lehecka,R,R,20250609,R32,Grass,3,2025-06-09,126127.0,benjamin bonzi,208103.0,jiri lehecka
2,20250608-M-Roland_Garros-F-Jannik_Sinner-Carlo...,"[[0, 2, 1, 0, 4, 2, 1, 0], [0, 11, 1, 1, 4, 2,...",0,jannik sinner,carlos alcaraz,R,R,20250608,F,Clay,5,2025-06-08,206173.0,jannik sinner,207989.0,carlos alcaraz
3,20250606-M-Roland_Garros-SF-Novak_Djokovic-Jan...,"[[1, 11, 1, 1, 4, 5, 0, 1], [1, 12, 1, 1, 4, 5...",1,novak djokovic,jannik sinner,R,R,20250606,SF,Clay,5,2025-06-06,104925.0,novak djokovic,206173.0,jannik sinner
4,20250604-M-Roland_Garros-QF-Novak_Djokovic-Ale...,"[[0, 0, 1, 1, 0, 0, 0, 0], [0, 9, 1, 1, 0, 0, ...",0,novak djokovic,alexander zverev,R,R,20250604,QF,Clay,5,2025-06-04,104925.0,novak djokovic,100644.0,alexander zverev


In [8]:
def get_latest_prior_ranking(player_id, match_date, df_rankings):
    rankings = df_rankings[(df_rankings['player'] == player_id) &
                           (df_rankings['ranking_date'] <= match_date)]
    if rankings.empty:
        return None
    return rankings.sort_values('ranking_date', ascending=False).iloc[0]['rank']

# create rankings for all rank data
rankings_df = pd.concat([rankings20s_df, rankings10s_df, rankings00s_df], ignore_index=True)

rankings_df['ranking_date'] = pd.to_datetime(rankings_df['ranking_date'], format='%Y%m%d')
features_df['rank_p1'] = features_df.apply(
    lambda row: get_latest_prior_ranking(row['player1_id'], row['match_date'], rankings_df), axis=1)

features_df['rank_p2'] = features_df.apply(
    lambda row: get_latest_prior_ranking(row['player2_id'], row['match_date'], rankings_df), axis=1)

In [25]:
features_df

Unnamed: 0,match_id,point_sequence,match_winner,Player 1,Player 2,Pl 1 hand,Pl 2 hand,Date,Round,Surface,Best of,match_date,player1_id,full_name_x,player2_id,full_name_y,rank_p1,rank_p2
0,20250610-M-ITF_Martos-Q2-Preston_Stearns-Aleja...,"[[0, 0, 1, 0, 0, 0, 0, 0], [0, 1, 1, 1, 0, 0, ...",0,preston stearns,alejandro lopez escribano,L,L,20250610,Q2,Hard,3,2025-06-10,210475.0,preston stearns,213075.0,alejandro lopez escribano,1380.0,
1,20250609-M-Stuttgart-R32-Benjamin_Bonzi-Jiri_L...,"[[0, 0, 1, 0, 0, 0, 0, 0], [0, 1, 1, 1, 0, 0, ...",1,benjamin bonzi,jiri lehecka,R,R,20250609,R32,Grass,3,2025-06-09,126127.0,benjamin bonzi,208103.0,jiri lehecka,73.0,31.0
2,20250608-M-Roland_Garros-F-Jannik_Sinner-Carlo...,"[[0, 2, 1, 0, 4, 2, 1, 0], [0, 11, 1, 1, 4, 2,...",0,jannik sinner,carlos alcaraz,R,R,20250608,F,Clay,5,2025-06-08,206173.0,jannik sinner,207989.0,carlos alcaraz,4.0,2.0
3,20250606-M-Roland_Garros-SF-Novak_Djokovic-Jan...,"[[1, 11, 1, 1, 4, 5, 0, 1], [1, 12, 1, 1, 4, 5...",1,novak djokovic,jannik sinner,R,R,20250606,SF,Clay,5,2025-06-06,104925.0,novak djokovic,206173.0,jannik sinner,1.0,4.0
4,20250604-M-Roland_Garros-QF-Novak_Djokovic-Ale...,"[[0, 0, 1, 1, 0, 0, 0, 0], [0, 9, 1, 1, 0, 0, ...",0,novak djokovic,alexander zverev,R,R,20250604,QF,Clay,5,2025-06-04,104925.0,novak djokovic,100644.0,alexander zverev,1.0,7.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6645,19690907-M-US_Open-F-Rod_Laver-Tony_Roche,"[[0, 0, 1, 1, 0, 0, 0, 0], [0, 6, 1, 0, 0, 0, ...",0,rod laver,tony roche,L,L,19690907,F,Grass,5,1969-09-07,100029.0,rod laver,100100.0,tony roche,,
6646,19690706-M-Wimbledon-F-John_Newcombe-Rod_Laver,"[[0, 0, 1, 0, 0, 0, 0, 0], [0, 1, 1, 1, 0, 0, ...",1,john newcombe,rod laver,R,L,19690706,F,Grass,5,1969-07-06,100087.0,john newcombe,100029.0,rod laver,,
6647,19690703-M-Wimbledon-SF-Rod_Laver-Arthur_Ashe,"[[0, 0, 1, 0, 0, 0, 0, 0], [0, 1, 1, 0, 0, 0, ...",0,rod laver,arthur ashe,L,R,19690703,SF,Grass,5,1969-07-03,100029.0,rod laver,100074.0,arthur ashe,,
6648,19600704-M-Wimbledon-F-Rod_Laver-Neale_Fraser,"[[0, 0, 1, 0, 0, 0, 0, 0], [0, 1, 1, 0, 0, 0, ...",1,rod laver,neale fraser,L,L,19600704,F,Grass,5,1960-07-04,100029.0,rod laver,100013.0,neale fraser,,


## To get Elo

In [None]:

# Get all CSV files that match the pattern
csv_files = glob.glob(atp_repo_path + '/atp_matches_*.csv')

# Combine them
all_matches = pd.concat([pd.read_csv(file) for file in csv_files], ignore_index=True)
all_matches.columns



  all_matches = pd.concat([pd.read_csv(file) for file in csv_files], ignore_index=True)


Index(['tourney_id', 'tourney_name', 'surface', 'draw_size', 'tourney_level',
       'tourney_date', 'match_num', 'winner_id', 'winner_seed', 'winner_entry',
       'winner_name', 'winner_hand', 'winner_ht', 'winner_ioc', 'winner_age',
       'loser_id', 'loser_seed', 'loser_entry', 'loser_name', 'loser_hand',
       'loser_ht', 'loser_ioc', 'loser_age', 'score', 'best_of', 'round',
       'minutes', 'w_ace', 'w_df', 'w_svpt', 'w_1stIn', 'w_1stWon', 'w_2ndWon',
       'w_SvGms', 'w_bpSaved', 'w_bpFaced', 'l_ace', 'l_df', 'l_svpt',
       'l_1stIn', 'l_1stWon', 'l_2ndWon', 'l_SvGms', 'l_bpSaved', 'l_bpFaced',
       'winner_rank', 'winner_rank_points', 'loser_rank', 'loser_rank_points',
       'winner1_id', 'winner2_id', 'loser1_id', 'loser2_id', 'winner1_name',
       'winner1_hand', 'winner1_ht', 'winner1_ioc', 'winner1_age',
       'winner2_name', 'winner2_hand', 'winner2_ht', 'winner2_ioc',
       'winner2_age', 'loser1_name', 'loser1_hand', 'loser1_ht', 'loser1_ioc',
       'loser1

In [17]:
matches = all_matches

In [15]:
elos_tracker = names_df[["player_id", "name_first", "name_last"]]
elos_tracker["elo"] = 1500
elos_tracker["date"] = 0000
elos_tracker["num_matches"] = 0

elos_tracker_clay = names_df[["player_id", "name_first", "name_last"]]
elos_tracker_clay["elo"] = 1500
elos_tracker_clay["date"] = 0000
elos_tracker_clay["num_matches"] = 0

elos_tracker_hard = names_df[["player_id", "name_first", "name_last"]]
elos_tracker_hard["elo"] = 1500
elos_tracker_hard["date"] = 0000
elos_tracker_hard["num_matches"] = 0

elos_tracker_grass = names_df[["player_id", "name_first", "name_last"]]
elos_tracker_grass["elo"] = 1500
elos_tracker_grass["date"] = 0000
elos_tracker_grass["num_matches"] = 0

elos_tracker_carpet = names_df[["player_id", "name_first", "name_last"]]
elos_tracker_carpet["elo"] = 1500
elos_tracker_carpet["date"] = 0000
elos_tracker_carpet["num_matches"] = 0

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  elos_tracker["elo"] = 1500
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  elos_tracker["date"] = 0000
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  elos_tracker["num_matches"] = 0
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value ins

In [18]:
matches = matches[matches["tourney_date"] > 10000000.0]
matches = matches[["surface", "tourney_level", "tourney_date", "winner_id", "loser_id"]]
matches = matches.dropna(subset=["winner_id", "loser_id"])
matches["winner_id"] = matches["winner_id"].astype(int)
matches["loser_id"] = matches["loser_id"].astype(int)

matches_clay = matches[matches["surface"] == "Clay"]
matches_hard = matches[matches["surface"] == "Hard"]
matches_grass = matches[matches["surface"] == "Grass"]
matches_carpet = matches[matches["surface"] == "Carpet"]

In [20]:
def update_elo(p1_elo, p2_elo, p1_matches, p2_matches, level="A"):
    # Compute dynamic K values
    kA = 250 / ((p1_matches + 5) ** 0.4)
    kB = 250 / ((p2_matches + 5) ** 0.4)

    # Tournament weight boost for Grand Slams
    k = 1.1 if level == "G" else 1.0

    # Expected score
    expected_win = 1 / (1 + 10 ** ((p2_elo - p1_elo) / 400))
    expected_loss = 1 - expected_win

    # Updated Elo ratings
    new_p1_elo = p1_elo + (k * kA) * (1 - expected_win)
    new_p2_elo = p2_elo + (k * kB) * (0 - expected_loss)

    return round(new_p1_elo, 2), round(new_p2_elo, 2)

def apply_all_elos(matches_df, elos_tracker_df):
    # Step 1: Get latest known Elo & match count for each player
    latest_entries = elos_tracker_df.sort_values("date").drop_duplicates("player_id", keep="last")
    latest_dict = {
        row.player_id: {
            "elo": row.elo,
            "num_matches": row.num_matches,
            "name_first": row.name_first,
            "name_last": row.name_last
        }
        for row in latest_entries.itertuples(index=False)
    }

    new_rows = []

    # Step 2: Loop through matches and update elos
    for row in matches_df.itertuples(index=False):
        winner = row.winner_id
        loser = row.loser_id
        level = row.tourney_level
        date = row.tourney_date

        # If a player is missing, skip
        if winner not in latest_dict or loser not in latest_dict:
            continue

        # Extract current data
        p1 = latest_dict[winner]
        p2 = latest_dict[loser]

        # Update Elo
        new_winner_elo, new_loser_elo = update_elo(
            p1_elo=p1["elo"],
            p2_elo=p2["elo"],
            p1_matches=p1["num_matches"],
            p2_matches=p2["num_matches"],
            level=level
        )

        # Update internal tracker
        latest_dict[winner]["elo"] = new_winner_elo
        latest_dict[winner]["num_matches"] += 1
        latest_dict[loser]["elo"] = new_loser_elo
        latest_dict[loser]["num_matches"] += 1

        # Add new rows to list
        new_rows.append({
            "player_id": winner,
            "name_first": p1["name_first"],
            "name_last": p1["name_last"],
            "elo": new_winner_elo,
            "date": date,
            "num_matches": latest_dict[winner]["num_matches"]
        })

        new_rows.append({
            "player_id": loser,
            "name_first": p2["name_first"],
            "name_last": p2["name_last"],
            "elo": new_loser_elo,
            "date": date,
            "num_matches": latest_dict[loser]["num_matches"]
        })

    # Step 3: Append all new rows to existing Elo tracker
    updated_elos_tracker = pd.concat([elos_tracker_df, pd.DataFrame(new_rows)], ignore_index=True)
    return updated_elos_tracker

In [21]:
elo_df = apply_all_elos(matches, elos_tracker)
elos_clay = apply_all_elos(matches_clay, elos_tracker_clay)
elos_hard = apply_all_elos(matches_hard, elos_tracker_hard)
elos_grass = apply_all_elos(matches_grass, elos_tracker_grass)
elos_carpet = apply_all_elos(matches_carpet, elos_tracker_carpet)

In [22]:
elo_df = elo_df[elo_df["date"] > 10000000]
elo_df = elo_df.sort_values(by="date", ascending=False)
elos_clay = elos_clay[elos_clay["date"] > 10000000]
elos_clay = elos_clay.sort_values(by="date", ascending=False)
elos_hard = elos_hard[elos_hard["date"] > 10000000]
elos_hard = elos_hard.sort_values(by="date", ascending=False)
elos_grass = elos_grass[elos_grass["date"] > 10000000]
elos_grass = elos_grass.sort_values(by="date", ascending=False)
elos_carpet = elos_carpet[elos_carpet["date"] > 10000000]
elos_carpet = elos_carpet.sort_values(by="date", ascending=False)

In [38]:
features_df['Date'] = pd.to_datetime(features_df['Date'], errors='coerce')
features_df["Date"]

0      2025-06-10
1      2025-06-09
2      2025-06-08
3      2025-06-06
4      2025-06-04
          ...    
6645   1969-09-07
6646   1969-07-06
6647   1969-07-03
6648   1960-07-04
6649   1960-05-29
Name: Date, Length: 6650, dtype: datetime64[ns]

In [39]:
def get_latest_prior_elo(player_id, match_date, df_elo):
    elo = df_elo[(df_elo['player_id'] == player_id) &
                           (df_elo['date'] <= match_date)]
    if elo.empty:
        return None
    return elo.sort_values('date', ascending=False).iloc[0]['elo']

features_df['Date'] = pd.to_datetime(features_df['Date'], errors='coerce')

elo_df['date'] = pd.to_datetime(elo_df['date'], format='%Y%m%d', errors='coerce')

features_df['elo_p1_overall'] = features_df.apply(
    lambda row: get_latest_prior_elo(row['player1_id'], row['Date'], elo_df), axis=1)

features_df['elo_p2_overall'] = features_df.apply(
    lambda row: get_latest_prior_elo(row['player2_id'], row['Date'], elo_df), axis=1)

In [46]:
elos_carpet['date'] = pd.to_datetime(elos_carpet['date'], format='%Y%m%d', errors='coerce')
elos_clay['date'] = pd.to_datetime(elos_clay['date'], format='%Y%m%d', errors='coerce')
elos_grass['date'] = pd.to_datetime(elos_grass['date'], format='%Y%m%d', errors='coerce')
elos_hard['date'] = pd.to_datetime(elos_hard['date'], format='%Y%m%d', errors='coerce')

def get_surface_specific_elo(player_id, match_date, surface, elos_carpet, elos_clay, elos_grass, elos_hard):
    """
    Retrieves the latest Elo rating for a player prior to a given match date, specific to the match surface.

    Parameters:
        player_id (int): The ID of the player.
        match_date (datetime): The date of the match.
        surface (str): The surface of the match (e.g., 'carpet', 'clay', 'grass', 'hard').
        elos_carpet, elos_clay, elos_grass, elos_hard (pd.DataFrame): Elo dataframes for each surface.

    Returns:
        float or None: The latest Elo rating prior to the match date for the given surface, or None if no data exists.
    """
    if surface == 'Carpet':
        df_elo = elos_carpet
    elif surface == 'Clay':
        df_elo = elos_clay
    elif surface == 'Grass':
        df_elo = elos_grass
    elif surface == 'Hard':
        df_elo = elos_hard
    else:
        return None  # Return None if surface is invalid or not recognized

    elo = df_elo[(df_elo['player_id'] == player_id) & (df_elo['date'] <= match_date)]
    if elo.empty:
        return None
    return elo.sort_values('date', ascending=False).iloc[0]['elo']
# Apply the function to calculate surface-specific Elo ratings for player 1 and player 2
features_df['elo_p1_surface'] = features_df.apply(
    lambda row: get_surface_specific_elo(row['player1_id'], row['Date'], row['Surface'], elos_carpet, elos_clay, elos_grass, elos_hard),
    axis=1
)

features_df['elo_p2_surface'] = features_df.apply(
    lambda row: get_surface_specific_elo(row['player2_id'], row['Date'], row['Surface'], elos_carpet, elos_clay, elos_grass, elos_hard),
    axis=1
)

## Making final DF

In [101]:
final_features_df = (features_df.drop(columns = [
    "Player 1", "Player 2", "full_name_x", "full_name_y", "player1_id", "player2_id", "match_date", "Date", "match_id", "rank_diff", "elo_diff", "Round"]))
final_features_df = final_features_df.dropna(subset=['match_winner', 'Surface'])
final_features_df['elo_p1_overall'] = final_features_df['elo_p1_overall'].fillna(1500)
final_features_df['elo_p2_overall'] = final_features_df['elo_p2_overall'].fillna(1500)
median_rank = final_features_df[['rank_p1', 'rank_p2']].median().median()
final_features_df['rank_p1'] = final_features_df['rank_p1'].fillna(median_rank)
final_features_df['rank_p2'] = final_features_df['rank_p2'].fillna(median_rank)
final_features_df['Pl 1 hand'] = final_features_df['Pl 1 hand'].str.strip().str.lower()
final_features_df['Pl 2 hand'] = final_features_df['Pl 2 hand'].str.strip().str.lower()



In [102]:
final_features_df['rank_diff'] = final_features_df['rank_p1'] - final_features_df['rank_p2']
final_features_df['elo_diff'] = final_features_df['elo_p1_overall'] - final_features_df['elo_p2_overall']
final_features_df['elo_diff_surface'] = final_features_df['elo_p1_surface'] - final_features_df['elo_p2_surface']
final_features_df

Unnamed: 0,point_sequence,match_winner,Pl 1 hand,Pl 2 hand,Surface,Best of,rank_p1,rank_p2,elo_p1_overall,elo_p2_overall,elo_p1_surface,elo_p2_surface,rank_diff,elo_diff,elo_diff_surface
0,"[[0, 0, 1, 0, 0, 0, 0, 0], [0, 1, 1, 1, 0, 0, ...",0,l,l,Hard,3,1380.0,23.0,1383.39,1463.04,1366.45,1462.70,1357.0,-79.65,-96.25
1,"[[0, 0, 1, 0, 0, 0, 0, 0], [0, 1, 1, 1, 0, 0, ...",1,r,r,Grass,3,73.0,31.0,1481.11,1661.09,1691.67,1682.60,42.0,-179.98,9.07
2,"[[0, 2, 1, 0, 4, 2, 1, 0], [0, 11, 1, 1, 4, 2,...",0,r,r,Clay,5,4.0,2.0,2098.61,1914.14,1699.01,1875.91,2.0,184.47,-176.90
3,"[[1, 11, 1, 1, 4, 5, 0, 1], [1, 12, 1, 1, 4, 5...",1,r,r,Clay,5,1.0,4.0,1949.24,2098.61,1893.54,1699.01,-3.0,-149.37,194.53
4,"[[0, 0, 1, 1, 0, 0, 0, 0], [0, 9, 1, 1, 0, 0, ...",0,r,r,Clay,5,1.0,7.0,1949.24,1854.58,1893.54,1821.29,-6.0,94.66,72.25
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6645,"[[0, 0, 1, 1, 0, 0, 0, 0], [0, 6, 1, 0, 0, 0, ...",0,l,l,Grass,5,23.0,23.0,2155.48,1963.17,2003.81,1967.65,0.0,192.31,36.16
6646,"[[0, 0, 1, 0, 0, 0, 0, 0], [0, 1, 1, 1, 0, 0, ...",1,r,l,Grass,5,23.0,23.0,1908.13,2099.06,1909.47,1999.89,0.0,-190.93,-90.42
6647,"[[0, 0, 1, 0, 0, 0, 0, 0], [0, 1, 1, 0, 0, 0, ...",0,l,r,Grass,5,23.0,23.0,2099.06,1825.25,1999.89,1748.63,0.0,273.81,251.26
6648,"[[0, 0, 1, 0, 0, 0, 0, 0], [0, 1, 1, 0, 0, 0, ...",1,l,l,Grass,5,23.0,23.0,2011.15,2193.50,1950.80,2151.92,0.0,-182.35,-201.12


In [112]:
for col in ['Pl 1 hand', 'Pl 2 hand', 'Surface']:
    final_features_df[col] = final_features_df[col].fillna(final_features_df[col].mode()[0])
categorical_cols = ['Pl 1 hand', 'Pl 2 hand', 'Surface']
encoded_df = pd.get_dummies(final_features_df, columns=categorical_cols, dtype=int)
encoded_df = encoded_df[encoded_df["Pl 2 hand_davis cup world group"] != 1].drop(columns=["Pl 2 hand_davis cup world group"])
encoded_df = encoded_df[encoded_df["Surface_3"] != 1].drop(columns=["Surface_3"])
encoded_df = encoded_df[encoded_df["Surface_Eva Asderaki-Moore"] != 1].drop(columns=["Surface_Eva Asderaki-Moore"])
encoded_df = encoded_df[encoded_df["Pl 1 hand_20240915"] != 1].drop(columns=["Pl 1 hand_20240915"])

In [113]:
encoded_df.columns

Index(['point_sequence', 'match_winner', 'Best of', 'rank_p1', 'rank_p2',
       'elo_p1_overall', 'elo_p2_overall', 'elo_p1_surface', 'elo_p2_surface',
       'rank_diff', 'elo_diff', 'elo_diff_surface', 'Pl 1 hand_l',
       'Pl 1 hand_r', 'Pl 2 hand_l', 'Pl 2 hand_r', 'Surface_Clay',
       'Surface_Grass', 'Surface_Hard'],
      dtype='object')