In [1]:
import pandas as pd
import numpy as np
from pathlib import Path

from sklearn.metrics import accuracy_score, log_loss, roc_auc_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler

In [2]:
data_dir = Path("..") / "data" / "source"
csv_files = sorted(list(data_dir.glob("*.csv")))
if not csv_files:
    raise FileNotFoundError(
        f"No CSV files found in {data_dir}. Check path from the notebook."
    )

print("Found files:", [f.name for f in csv_files])
# Read a sample (first file) to inspect header quickly
sample_df = pd.read_csv(csv_files[0], nrows=5)
print("Columns in sample file:")
print(sample_df.columns.tolist())
# To load full dataset (may be large) concatenate all files
full_df = pd.concat(
    (pd.read_csv(f, low_memory=False) for f in csv_files), ignore_index=True
)
print("\nFull dataframe shape:", full_df.shape)
print("Columns in full dataframe:")
print(full_df.columns.tolist())
full_df.head()

Found files: ['2014_LoL_esports_match_data_from_OraclesElixir.csv', '2015_LoL_esports_match_data_from_OraclesElixir.csv', '2016_LoL_esports_match_data_from_OraclesElixir.csv', '2017_LoL_esports_match_data_from_OraclesElixir.csv', '2018_LoL_esports_match_data_from_OraclesElixir.csv', '2019_LoL_esports_match_data_from_OraclesElixir.csv', '2020_LoL_esports_match_data_from_OraclesElixir.csv', '2021_LoL_esports_match_data_from_OraclesElixir.csv', '2022_LoL_esports_match_data_from_OraclesElixir.csv', '2023_LoL_esports_match_data_from_OraclesElixir.csv', '2024_LoL_esports_match_data_from_OraclesElixir.csv', '2025_LoL_esports_match_data_from_OraclesElixir.csv']
Columns in sample file:
['gameid', 'datacompleteness', 'url', 'league', 'year', 'split', 'playoffs', 'date', 'game', 'patch', 'participantid', 'side', 'position', 'playername', 'playerid', 'teamname', 'teamid', 'champion', 'ban1', 'ban2', 'ban3', 'ban4', 'ban5', 'pick1', 'pick2', 'pick3', 'pick4', 'pick5', 'gamelength', 'result', 'kills

Unnamed: 0,gameid,datacompleteness,url,league,year,split,playoffs,date,game,patch,...,opp_csat25,golddiffat25,xpdiffat25,csdiffat25,killsat25,assistsat25,deathsat25,opp_killsat25,opp_assistsat25,opp_deathsat25
0,TRLH3/33,complete,http://matchhistory.na.leagueoflegends.com/en/...,EU LCS,2014,Spring,0,2014-01-14 17:52:02,1.0,3.15,...,206.0,76.0,-512.0,-18.0,3.0,4.0,0.0,1.0,2.0,2.0
1,TRLH3/33,complete,http://matchhistory.na.leagueoflegends.com/en/...,EU LCS,2014,Spring,0,2014-01-14 17:52:02,1.0,3.15,...,140.0,-888.0,351.0,-42.0,0.0,5.0,3.0,2.0,1.0,1.0
2,TRLH3/33,complete,http://matchhistory.na.leagueoflegends.com/en/...,EU LCS,2014,Spring,0,2014-01-14 17:52:02,1.0,3.15,...,225.0,621.0,733.0,8.0,1.0,5.0,1.0,1.0,2.0,0.0
3,TRLH3/33,complete,http://matchhistory.na.leagueoflegends.com/en/...,EU LCS,2014,Spring,0,2014-01-14 17:52:02,1.0,3.15,...,161.0,3265.0,1950.0,50.0,6.0,2.0,0.0,0.0,0.0,4.0
4,TRLH3/33,complete,http://matchhistory.na.leagueoflegends.com/en/...,EU LCS,2014,Spring,0,2014-01-14 17:52:02,1.0,3.15,...,28.0,1780.0,2397.0,-19.0,0.0,7.0,0.0,0.0,1.0,3.0


In [3]:
# Define the 24 features you described
features = [
    # Objective Control (binary)
    "firstblood",
    "firstdragon",
    "firstherald",
    "firstbaron",
    "firsttower",
    "firstmidtower",
    "firsttothreetowers",
    # Economic Advantages (numeric)
    "golddiffat10",
    "golddiffat15",
    "xpdiffat10",
    "xpdiffat15",
    "csdiffat10",
    "csdiffat15",
    # Combat Statistics (numeric)
    "killsat10",
    "killsat15",
    "assistsat10",
    "assistsat15",
    "opp_killsat10",
    "opp_killsat15",
    "opp_assistsat10",
    "opp_assistsat15",
]

# Check which of these features exist in the dataset
missing = [c for c in features if c not in full_df.columns]
if missing:
    print("Warning - missing columns from feature list:", missing)

found_target = "result"
print("Using target column:", found_target)

# Build X and y (only keep features that exist)
available_features = [c for c in features if c in full_df.columns]
X = full_df[available_features].copy()
# Fill missing values sensibly
X = X.fillna(0)

# Convert binary/objective features to numeric 0/1 where necessary
binary_cols = [
    c for c in available_features if c.startswith("first") or "tothreetowers" in c
]
for c in binary_cols:
    if X[c].dtype == "object":
        # attempt to map common representations
        X[c] = X[c].replace(
            {"True": 1, "False": 0, "true": 1, "false": 0, "yes": 1, "no": 0}
        )
    X[c] = pd.to_numeric(X[c], errors="coerce").fillna(0).astype(int)

Using target column: result


In [4]:
le = LabelEncoder()
y_raw = full_df[found_target]
y = le.fit_transform(y_raw.astype(str))

print("Target classes (label encoded):", list(le.classes_))

Target classes (label encoded): ['0', '1']


In [5]:
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X.values)

In [6]:
X_scaled = np.asarray(X_scaled, dtype=float)
y = np.asarray(y, dtype=int)
print("X shape:", X_scaled.shape, "y shape:", y.shape)

X shape: (1097988, 21) y shape: (1097988,)


In [7]:
class LogisticRegressionApproxGD:
    def __init__(
        self,
        lr: float = 0.1,
        epochs: int = 100,
        batch_size: int = 256,
        reg: float = 1e-4,
        verbose: bool = True,
        lr_decay: float = 0.0,
    ):
        self.lr = lr
        self.epochs = epochs
        self.batch_size = batch_size
        self.reg = reg
        self.verbose = verbose
        self.lr_decay = lr_decay
        self.w = None
        self.b = None

    def _sigmoid(self, z):
        # numerically-stable sigmoid
        return 1.0 / (1.0 + np.exp(-z))

    def predict_proba(self, X):
        return self._sigmoid(X.dot(self.w) + self.b)

    def predict(self, X):
        return (self.predict_proba(X) >= 0.5).astype(int)

    def fit(self, X, y, X_val=None, y_val=None):
        n_samples, n_features = X.shape
        self.w = np.zeros(n_features, dtype=float)
        self.b = 0.0

        for epoch in range(1, self.epochs + 1):
            # shuffle
            inds = np.random.permutation(n_samples)
            for start in range(0, n_samples, self.batch_size):
                batch_idx = inds[start : start + self.batch_size]
                Xb = X[batch_idx]
                yb = y[batch_idx]
                preds = self._sigmoid(Xb.dot(self.w) + self.b)
                error = preds - yb
                # gradient for weights (with L2 regularization)
                grad_w = Xb.T.dot(error) / len(yb) + self.reg * self.w
                grad_b = np.mean(error)
                # parameter update
                self.w -= self.lr * grad_w
                self.b -= self.lr * grad_b

            # optional learning rate decay
            if self.lr_decay:
                self.lr *= 1.0 / (1.0 + self.lr_decay * epoch)

            if self.verbose and (epoch % max(1, self.epochs // 10) == 0 or epoch == 1):
                train_probs = self.predict_proba(X)
                train_loss = log_loss(y, train_probs, labels=[0, 1])
                train_acc = accuracy_score(y, (train_probs >= 0.5).astype(int))
                msg = f"Epoch {epoch}/{self.epochs} - train_loss: {train_loss:.4f}, train_acc: {train_acc:.4f}"
                if X_val is not None and y_val is not None:
                    val_probs = self.predict_proba(X_val)
                    val_loss = log_loss(y_val, val_probs, labels=[0, 1])
                    try:
                        val_auc = roc_auc_score(y_val, val_probs)
                        msg += f", val_loss: {val_loss:.4f}, val_auc: {val_auc:.4f}"
                    except Exception:
                        msg += f", val_loss: {val_loss:.4f}"
                print(msg)

        return self

In [10]:
# If there are >1 classes and imbalanced, stratify
X_train, X_test, y_train, y_test = train_test_split(
    X_scaled, y, test_size=0.2, random_state=42, stratify=y
)

model = LogisticRegressionApproxGD(
    lr=0.5, epochs=200, batch_size=512, reg=1e-3, verbose=True, lr_decay=1e-3
)
model.fit(X_train, y_train, X_val=X_test, y_val=y_test)

probs = model.predict_proba(X_test)
preds = (probs >= 0.5).astype(int)

print("\nFinal evaluation on test set:")
print("Accuracy:", accuracy_score(y_test, preds))
print("Log Loss:", log_loss(y_test, probs))
try:
    print("ROC AUC:", roc_auc_score(y_test, probs))
except Exception:
    print("ROC AUC could not be computed for this target.")

Epoch 1/200 - train_loss: 0.5955, train_acc: 0.6645, val_loss: 0.5944, val_auc: 0.7428
Epoch 20/200 - train_loss: 0.5952, train_acc: 0.6677, val_loss: 0.5942, val_auc: 0.7433
Epoch 20/200 - train_loss: 0.5952, train_acc: 0.6677, val_loss: 0.5942, val_auc: 0.7433
Epoch 40/200 - train_loss: 0.5948, train_acc: 0.6653, val_loss: 0.5938, val_auc: 0.7438
Epoch 40/200 - train_loss: 0.5948, train_acc: 0.6653, val_loss: 0.5938, val_auc: 0.7438
Epoch 60/200 - train_loss: 0.5947, train_acc: 0.6657, val_loss: 0.5938, val_auc: 0.7437
Epoch 60/200 - train_loss: 0.5947, train_acc: 0.6657, val_loss: 0.5938, val_auc: 0.7437
Epoch 80/200 - train_loss: 0.5946, train_acc: 0.6658, val_loss: 0.5936, val_auc: 0.7441
Epoch 80/200 - train_loss: 0.5946, train_acc: 0.6658, val_loss: 0.5936, val_auc: 0.7441
Epoch 100/200 - train_loss: 0.5946, train_acc: 0.6658, val_loss: 0.5936, val_auc: 0.7441
Epoch 100/200 - train_loss: 0.5946, train_acc: 0.6658, val_loss: 0.5936, val_auc: 0.7441
Epoch 120/200 - train_loss: 0.5