# Predict whether server wins point

This notebook uses the data processed in `data_processing/shots_to_points.ipynb` to train a simple regression model to predict whether the serving player wins the point.

## Step 1 - Load data


In [4]:
# 02_point_outcome_model.ipynb

import pandas as pd
from pathlib import Path

# Load processed dataset
data_path = Path("../data/processed/point_level_rg.csv.gz")
df = pd.read_csv(data_path, compression="gzip")

print(f"✅ Loaded dataset with {len(df):,} rows")
df.head()


✅ Loaded dataset with 73,349 rows


Unnamed: 0,Date,Tournament,Player1,Player2,Point,rally_len,WinningPlayer,ServingPlayer,server_won,backhand,forehand,unknown,forehand_ratio,backhand_ratio
0,1960-05-29,Roland_Garros,Nicola_Pietrangeli,Luis_Ayala,8,3,Nicola_Pietrangeli,Luis_Ayala,False,1,1,1,0.5,0.5
1,1960-05-29,Roland_Garros,Nicola_Pietrangeli,Luis_Ayala,9,15,Luis_Ayala,Luis_Ayala,True,7,7,1,0.5,0.5
2,1960-05-29,Roland_Garros,Nicola_Pietrangeli,Luis_Ayala,10,3,Nicola_Pietrangeli,Luis_Ayala,False,1,1,2,0.5,0.5
3,1960-05-29,Roland_Garros,Nicola_Pietrangeli,Luis_Ayala,11,6,Luis_Ayala,Luis_Ayala,True,3,2,2,0.4,0.6
4,1960-05-29,Roland_Garros,Nicola_Pietrangeli,Luis_Ayala,12,5,Nicola_Pietrangeli,Luis_Ayala,False,2,2,1,0.5,0.5


## Step 2 - Start with simple features (inputs) and select target (output)


In [5]:
features = ['rally_len', 'forehand_ratio', 'backhand_ratio']
target = 'server_won'

X = df[features].fillna(0)
y = df[target].astype(int)


## Step 3 - Split data and scale features


In [6]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# Split data into train/test sets
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# Standardize numeric features
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

## Step 4 - Train baseline logistic regression model


In [7]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, roc_auc_score, classification_report

# Train model
model = LogisticRegression(max_iter=1000)
model.fit(X_train_scaled, y_train)

# Predict on test data
y_pred = model.predict(X_test_scaled)
y_pred_prob = model.predict_proba(X_test_scaled)[:, 1]

# Evaluate
acc = accuracy_score(y_test, y_pred)
roc = roc_auc_score(y_test, y_pred_prob)

print(f"Accuracy: {acc:.3f}")
print(f"ROC-AUC:  {roc:.3f}")
print("\nClassification Report:\n", classification_report(y_test, y_pred))


Accuracy: 0.610
ROC-AUC:  0.606

Classification Report:
               precision    recall  f1-score   support

           0       0.50      0.10      0.16      5722
           1       0.62      0.94      0.75      8948

    accuracy                           0.61     14670
   macro avg       0.56      0.52      0.45     14670
weighted avg       0.57      0.61      0.52     14670



## Step 5 - Interpret feature importance


In [8]:
coef_df = pd.DataFrame({
    'Feature': features,
    'Coefficient': model.coef_[0]
}).sort_values(by='Coefficient', ascending=False)

print(coef_df)


          Feature  Coefficient
2  backhand_ratio     0.034138
1  forehand_ratio    -0.014423
0       rally_len    -0.298726
