# Shot Sequences

For LSTM, RNN, and GRU's, we need to keep the shot sequence data instead of collapsing the data into a single row. 

This allows the model to learn from patterns of shots to better predict outcomes. 


In [2]:
import pandas as pd

df = pd.read_csv("../../data/raw/tennis-m-shots-rg.csv")

cols = [
    'Date', 'Tournament', 'Round', 'Player1', 'Player2', 'Point',
    'ServingPlayer', 'WinningPlayer',
    'Shot', 'ShotHand', 'ShotType', 'ServeDirection',
    'Serve', 'ShotDirection', 'ShotDepth',
    'OutcomeType', 'ErrorType'
]

missing_summary = (
    df[cols]
      .isna()
      .mean()
      .sort_values(ascending=False)
      .to_frame('missing_fraction')
      .reset_index()
      .rename(columns={'index': 'column'})
)

missing_summary['missing_percent'] = (missing_summary['missing_fraction'] * 100).round(2)
missing_summary = missing_summary.drop(columns='missing_fraction')
missing_summary


Unnamed: 0,column,missing_percent
0,OutcomeType,82.89
1,ErrorType,82.73
2,ShotDepth,77.36
3,ServeDirection,76.59
4,ShotDirection,24.38
5,ShotHand,23.54
6,ShotType,0.13
7,Serve,0.0
8,Date,0.0
9,Tournament,0.0


In [3]:
# 1. Convert Date column
df['Date'] = pd.to_datetime(df['Date'], format='%Y%m%d', errors='coerce')

# 2. Fill missing categorical values with consistent placeholders
fill_values = {
    'ShotHand': 'unknown',
    'ShotType': 'unknown',
    'ServeDirection': 'none',
    'OutcomeType': 'none',
    'ErrorType': 'none'
}
df = df.fillna(value=fill_values)

# 3. Optional sanity check: confirm no NaN left in key modeling columns
check_cols = ['Date', 'Tournament', 'Player1', 'Player2',
              'ServingPlayer', 'WinningPlayer',
              'ShotHand', 'ShotType', 'ServeDirection',
              'OutcomeType', 'ErrorType']

df[check_cols].isna().sum()


print(f"Total shots: {len(df):,}")


Total shots: 431,177


In [6]:
# Encode categorical features (keep shot-level data) 
from sklearn.preprocessing import LabelEncoder
import numpy as np

encoders = {}
categories = ['ShotHand', 'ShotType', 'ServeDirection', 'ShotDirection', 
            'ShotDepth', 'OutcomeType', 'ErrorType']

for col in categories:
    labelEncoder = LabelEncoder()
    df[col + '_encoded'] = labelEncoder.fit_transform(df[col])
    encoders[col] = labelEncoder


# select which features to include in the model
# _encode suffix are the encoded categorical features and 'Shot' is the shot number associated with each row
feature_cols = [col + '_encoded' for col in categories] + ['Shot']  # Shot number

def create_sequences(group):
    return group[feature_cols].values

# group all shots belonging to the same point together
sequences = (
    df.groupby(['Date','Tournament','Player1','Player2','Point'])
      .apply(create_sequences)
      .tolist()
)

# LSTMs need consistent input shapes, so pad the sequences to the same length
# tensorflow was not loading correctly so used chatgpt for this part
max_len = max(len(seq) for seq in sequences)
X = np.array([
    np.vstack([seq, np.zeros((max_len - len(seq), seq.shape[1]))])
    if len(seq) < max_len else seq
    for seq in sequences
], dtype='float32')

# Create target (y) if the serving player won the point (1) or not (0)
y = (
    df.groupby(['Date','Tournament','Player1','Player2','Point'])
      .agg({
          'ServingPlayer': 'first',
          'WinningPlayer': 'first'
      })
      .apply(lambda row: int(row['ServingPlayer'] == row['WinningPlayer']), axis=1)
      .values
)

# x shape should be sequences
# y shape should be labels
print(f"X shape: {X.shape}") 
print(f"y shape: {y.shape}")  # (num_points,)

# save the processed data as numpy arrays to preserve the shapes
np.save('X_sequences.npy', X)
np.save('y_labels.npy', y)


  .apply(create_sequences)


X shape: (73349, 86, 8)
y shape: (73349,)
