# Turning multiple shots into a single point

This notebook processes data to be used for a regression task.

The regression task will attempt to answer: Will the server win the point?

The inputs will be point-level data.

The output will be binary: 1 if the serving player won the point, and 0 if they lost.


In [2]:
import pandas as pd

# Read the raw shots CSV; the path is relative to this notebook's folder.
raw_shots_path = "../../data/raw/tennis-m-shots-rg.csv"
df = pd.read_csv(raw_shots_path)

In [8]:
# Columns whose missing-value rate we want to inspect.
cols = [
    'Date', 'Tournament', 'Round', 'Player1', 'Player2', 'Point',
    'ServingPlayer', 'WinningPlayer',
    'Shot', 'ShotHand', 'ShotType', 'ServeDirection',
    'Serve', 'ShotDirection', 'ShotDepth',
    'OutcomeType', 'ErrorType',
]

# Fraction of NaN per column, most-missing first.
missing_fraction = df[cols].isna().mean().sort_values(ascending=False)

# Express as a percentage (2 decimals) and reshape into a two-column table:
# 'column' holds the column name, 'missing_percent' its NaN share.
missing_summary = (
    (missing_fraction * 100)
      .round(2)
      .rename_axis('column')
      .reset_index(name='missing_percent')
)
missing_summary


Unnamed: 0,column,missing_percent
0,OutcomeType,82.89
1,ErrorType,82.73
2,ShotDepth,77.36
3,ServeDirection,76.59
4,ShotDirection,24.38
5,ShotHand,23.54
6,ShotType,0.13
7,Date,0.0
8,Tournament,0.0
9,Shot,0.0


In [9]:
# Step 1 - parse Date from YYYYMMDD; values that do not parse become NaT
# because of errors='coerce'.
df['Date'] = pd.to_datetime(df['Date'], format='%Y%m%d', errors='coerce')

# Step 2 - replace NaN in the categorical columns with explicit placeholders
# so every column uses one consistent "missing" label.
fill_values = dict(
    ShotHand='unknown',
    ShotType='unknown',
    ServeDirection='none',
    OutcomeType='none',
    ErrorType='none',
)
df = df.fillna(fill_values)

# Step 3 - sanity check: NaN count per key modeling column (all should be 0).
# The filled columns are appended in the same order as fill_values.
check_cols = [
    'Date', 'Tournament', 'Player1', 'Player2',
    'ServingPlayer', 'WinningPlayer',
    *fill_values,
]

df.loc[:, check_cols].isna().sum()


Date              0
Tournament        0
Player1           0
Player2           0
ServingPlayer     0
WinningPlayer     0
ShotHand          0
ShotType          0
ServeDirection    0
OutcomeType       0
ErrorType         0
dtype: int64

In [10]:
# Collapse the shot rows into one row per point.
# rally_len is the highest Shot value seen in the point; the winner and the
# server are taken from the first row of each point.
rally_df = (
    df.groupby(['Date','Tournament','Player1','Player2','Point'])
      .agg(
          rally_len=('Shot', 'max'),
          WinningPlayer=('WinningPlayer', 'first'),
          ServingPlayer=('ServingPlayer', 'first'),
      )
      .reset_index()
)

# Target variable: True when the server is also the winner of the point.
rally_df['server_won'] = rally_df['ServingPlayer'].eq(rally_df['WinningPlayer'])


In [13]:
# Shot-hand counts and forehand ratio per point
point_keys = ['Date','Tournament','Player1','Player2','Point']

# One row per point, one column per ShotHand value holding that hand's shot count.
hand_counts = (
    df.groupby(point_keys + ['ShotHand'])
      .size()
      .unstack(fill_value=0)
      .reset_index()
)

# Remove anything a previous run of this cell already attached to rally_df.
# Without this, re-running the cell makes merge() suffix the overlapping count
# columns (forehand_x / forehand_y) and the 'forehand' lookup below raises
# KeyError. Dropping forehand_ratio as well keeps the column order identical
# between a first run and a re-run.
stale_cols = [c for c in hand_counts.columns if c not in point_keys] + ['forehand_ratio']
rally_df = (
    rally_df
      .drop(columns=stale_cols, errors='ignore')
      # validate guards against the merge silently multiplying rows if either
      # side ever stops being unique per point.
      .merge(hand_counts, on=point_keys, how='left', validate='one_to_one')
)

# Share of forehands among forehand + backhand shots.
# Points with neither a forehand nor a backhand (0 / 0) come out as NaN.
rally_df['forehand_ratio'] = rally_df['forehand'] / (rally_df[['forehand','backhand']].sum(axis=1))


In [14]:
# Row count: the point-level table should hold one row per unique point.
num_rows = len(rally_df)
print(f"Total rows in point-level dataset: {num_rows:,}")

# Count rows that repeat an earlier point key (expected to be zero after grouping).
point_key = ['Date','Tournament','Player1','Player2','Point']
duplicates = rally_df.duplicated(subset=point_key).sum()
print(f"Duplicate rallies (should be 0): {duplicates}")

# Distribution of the target and the overall share of points won by the server.
server_won = rally_df['server_won']
print("\n--- server_won summary ---")
print(server_won.value_counts())
print("\nProportion of points won by server:")
print(server_won.mean().round(3))

# Eyeball the first few points to confirm the target lines up with the names.
rally_df.loc[:, ['ServingPlayer','WinningPlayer','server_won','rally_len']].head(10)


Total rows in point-level dataset: 73,349
Duplicate rallies (should be 0): 0

--- server_won summary ---
server_won
True     44741
False    28608
Name: count, dtype: int64

Proportion of points won by server:
0.61


Unnamed: 0,ServingPlayer,WinningPlayer,server_won,rally_len
0,Luis_Ayala,Nicola_Pietrangeli,False,3
1,Luis_Ayala,Luis_Ayala,True,15
2,Luis_Ayala,Nicola_Pietrangeli,False,3
3,Luis_Ayala,Luis_Ayala,True,6
4,Luis_Ayala,Nicola_Pietrangeli,False,5
5,Luis_Ayala,Luis_Ayala,True,7
6,Luis_Ayala,Luis_Ayala,True,6
7,Nicola_Pietrangeli,Luis_Ayala,False,11
8,Nicola_Pietrangeli,Luis_Ayala,False,5
9,Nicola_Pietrangeli,Luis_Ayala,False,6


In [19]:
from pathlib import Path

# Go up two levels: notebooks/data_processing → tennis/, then into data/processed
processed_path = Path("../../data/processed/point_level_rg.csv")

# Make sure the output folder exists: to_csv does not create missing
# directories, so a fresh checkout without data/processed would fail here.
processed_path.parent.mkdir(parents=True, exist_ok=True)

# Save the processed CSV (index=False: the row index carries no information)
rally_df.to_csv(processed_path, index=False)
print(f"✅ Saved processed dataset to: {processed_path.resolve()}")

print(f"Rows: {len(rally_df):,}")


✅ Saved processed dataset to: C:\Users\jacks\OneDrive - University of Tennessee\CS_UT\CS 545\final_project\tennis\data\processed\point_level_rg.csv
Rows: 73,349


In [20]:
import gzip
import shutil

# Source CSV and the gzip archive written next to it
input_path = "../../data/processed/point_level_rg.csv"
output_path = "../../data/processed/point_level_rg.csv.gz"

# Stream the CSV bytes into the archive at maximum compression (level 9);
# copyfileobj copies in chunks, so the file is never loaded whole into memory.
with open(input_path, "rb") as f_in, gzip.open(output_path, "wb", compresslevel=9) as f_out:
    shutil.copyfileobj(f_in, f_out)

print(f"✅ Compressed file saved as: {output_path}")

✅ Compressed file saved as: ../../data/processed/point_level_rg.csv.gz
