# Experiments with a logistic regression model for expected goals (xG) prediction.

**Note:**
- Use this file to improve the model: try different features, regularization techniques, etc.
- This file is for experimentation only; do not log to MLflow.
- Implement important changes in the main training script, `src/tasks/xg/train/train_xg.py`.

In [None]:

import sys
import os
from pathlib import Path
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

# Show where the notebook is running from; the root lookup below is relative to it.
print(os.getcwd())

# Put the project root (four directory levels above the working directory)
# on the import path so the `src.*` imports resolve.
root = Path.cwd().parents[3]
sys.path.insert(1, os.fspath(root))

from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split, StratifiedKFold, cross_val_score
from sklearn.metrics import classification_report
from sklearn.preprocessing import StandardScaler

from src.common import io
from src.tasks.xg.features.pipeline import build_feature_pipeline
from src.tasks.xg.train.train_xg import evaluate_model



In [None]:
# Load and prepare the data

df = io.read_table(Path(root) / "data/gold/xg_features.parquet")
included_features = [
    "end_x",
    "end_y",
    "shot_distance",
    "shot_angle",
    "body_part_right_foot",
    "body_part_left_foot",
    "body_part_head",
    "body_part_other",
    "is_open_play",
    "one_on_one",
]
target = "is_goal"

pipeline = build_feature_pipeline()
features = pipeline.transform(df)[included_features]

# Drop incomplete rows from the features and the target *together*.
# Calling dropna() on X and y independently can remove different rows from
# each, leaving them misaligned (or of different lengths) for the split.
# NOTE(review): joining on the index assumes `pipeline.transform` keeps
# `df`'s index - confirm.
dataset = features.join(df[target]).dropna()

# Copy so that the columns added in later cells do not write into a slice.
X = dataset[included_features].copy()
y = dataset[target]

In [None]:
# Feature engineering. This step happens inside `prepare_features` on the main workflow.

# The small offset keeps the logarithm finite when the shot angle is exactly zero.
X["log_angle"] = np.log(X["shot_angle"] + 1e-5)

one_on_one = X["one_on_one"].astype(int)
is_header = X["body_part_head"].astype(int)

# Interaction terms.
# Fix: this column was previously built from the raw `shot_angle`, which
# contradicted its name; it now uses `log_angle`.
# NOTE(review): if `prepare_features` builds this column from the raw angle,
# align the two implementations.
X["one_on_one_x_log_angle"] = one_on_one * X["log_angle"]
X["one_on_one_x_dist"] = one_on_one * X["shot_distance"]

X["head_x_dist"] = is_header * X["shot_distance"]
X["distance_x_angle"] = X["shot_distance"] * X["log_angle"]

print(f"Goal/nongoal ratio: {y.value_counts(normalize=True).to_dict()}")

In [None]:
# feature scaling
# Standardize the continuous columns and leave the remaining columns untouched.
# NOTE(review): the scaler is fitted on the full dataset, before the
# train/test split performed in `train_model`, so test rows influence the
# scaling statistics - consider fitting on the training split only.
continuous_cols = [
    "end_x",
    "end_y",
    "shot_distance",
    "shot_angle",
    "log_angle",
    "one_on_one_x_log_angle",
    "one_on_one_x_dist",
    "head_x_dist",
    "distance_x_angle",
]

scaler = StandardScaler()
X_continous = X[continuous_cols]
X_scaled = scaler.fit_transform(X_continous)
X_scaled_df = pd.DataFrame(X_scaled, columns=continuous_cols, index=X.index)

# Swap the raw continuous columns for their scaled versions; the scaled
# columns end up after the untouched ones.
X.drop(columns=continuous_cols, inplace=True)
X = pd.concat([X, X_scaled_df], axis=1)

In [None]:
# Cross-validated model training
def train_model(
    X,
    y,
    *,
    test_size=0.3,
    C=0.3,
    n_splits=5,
    random_state=42,
    max_iter=1000,
):
    """Fit an L2-regularized logistic regression and report cross-validated log loss.

    The data is split into train and test parts, the model is scored with
    stratified k-fold cross-validation on the training part (scores are
    printed), and the model is then fitted on the whole training part.

    Args:
        X: Feature matrix.
        y: Target labels, row-aligned with ``X``.
        test_size: Fraction of the data held out as the test set.
        C: Inverse regularization strength passed to ``LogisticRegression``.
        n_splits: Number of cross-validation folds.
        random_state: Seed used for both the train/test split and the CV shuffle.
        max_iter: Maximum number of solver iterations.

    Returns:
        A tuple ``(model, X_train, X_test, y_train, y_test)`` with the fitted
        model and the split data.
    """
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=test_size, random_state=random_state
    )

    model = LogisticRegression(max_iter=max_iter, solver="lbfgs", penalty="l2", C=C)

    # Cross-validation on the training split only; the test split stays unseen.
    cv = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=random_state)
    cv_scores = cross_val_score(model, X_train, y_train, cv=cv, scoring="neg_log_loss")
    # The scorer returns negated log loss (higher is better); flip the sign for display.
    cv_log_loss = -cv_scores
    print(f"Cross-validated Log Loss scores: {cv_log_loss}")
    print(f"Mean CV Log Loss: {cv_log_loss.mean()}")

    # Fit the model on the entire training set
    model.fit(X_train, y_train)

    return model, X_train, X_test, y_train, y_test


model, X_train, X_test, y_train, y_test = train_model(X, y)

In [None]:
# Evaluate model

# Predicted probabilities of the positive class and hard class predictions
# for both splits.
y_train_proba = model.predict_proba(X_train)[:, 1]
y_test_proba = model.predict_proba(X_test)[:, 1]
y_train_pred = model.predict(X_train)
y_test_pred = model.predict(X_test)
metrics = evaluate_model(model, X_train, X_test, y_train, y_test)

# Pair each split name with its true labels and predictions for the report.
split_labels = {
    "train": (y_train, y_train_pred),
    "test": (y_test, y_test_pred),
}
for dataset, (y_true, y_pred) in split_labels.items():
    split_metrics = metrics[dataset]
    print(f"{dataset.capitalize()} Performance:")
    print(f"  ROC-AUC: {split_metrics['roc_auc']:.3f}")
    print(f"  Brier Score: {split_metrics['brier_score']:.3f}")
    print(f"  Log Loss: {split_metrics['log_loss']:.3f}")
    print(classification_report(y_true, y_pred))
    print()

# Calibration curve
# NOTE(review): element 0 is plotted on the "Predicted Probability" axis and
# element 1 on the "Observed Probability" axis. If `evaluate_model` stores the
# raw output of sklearn's `calibration_curve` (prob_true, prob_pred), the axes
# are swapped - confirm.
calib_curve_train = metrics["train"]["calibration_curve"]
calib_curve_test = metrics["test"]["calibration_curve"]

plt.figure(figsize=(8, 6))
for curve_label, curve in (("Train", calib_curve_train), ("Test", calib_curve_test)):
    plt.plot(curve[0], curve[1], marker="o", label=curve_label)
# Diagonal reference line.
plt.plot([0, 1], [0, 1], linestyle="--", color="gray")
plt.xlabel("Predicted Probability")
plt.ylabel("Observed Probability")
plt.title("Calibration Curve")
plt.legend()
plt.grid()
plt.show()