In [2]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score
import os
# Read the four CSV inputs from ../data: training features, training targets,
# test features, and the sample submission file.
train_X = pd.read_csv("../data/train_features.csv")
train_y = pd.read_csv("../data/train_targets.csv")
test_X = pd.read_csv("../data/test_features.csv")
submission = pd.read_csv("../data/sample_submission.csv")

# Quick sanity check: print the (rows, columns) of each modelling frame.
for caption, frame in (
    ("Train features shape:", train_X),
    ("Train targets shape:", train_y),
    ("Test features shape:", test_X),
):
    print(caption, frame.shape)


Train features shape: (39675, 246)
Train targets shape: (39675, 6)
Test features shape: (10000, 246)


In [4]:
# Logistic-regression baseline:
# numeric features -> train/validation split -> standardise -> fit -> validation ROC AUC.

from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import numpy as np

# Feature matrix: only the numeric columns of train_X are used for now
# (non-numeric columns such as the match ID are left out).
X_numeric = train_X.select_dtypes(include=np.number)
numeric_cols = X_numeric.columns

# Target: the 'radiant_win' column of the targets frame.
# NOTE(review): assumes train_X and train_y rows are in the same order - confirm.
y = train_y['radiant_win']

# Hold out 20% of the rows for validation; the fixed seed keeps the split reproducible.
X_train, X_val, y_train, y_val = train_test_split(
    X_numeric,
    y,
    test_size=0.2,
    random_state=42,
)

print("Training set shape:", X_train.shape)
print("Validation set shape:", X_val.shape)

# Test features restricted to the same numeric columns; the match IDs are
# kept aside so they can be written into the submission file later.
test_match_ids = test_X['match_id_hash']
test_X_numeric = test_X[numeric_cols].copy()

# Standardise features. The scaler statistics come from the training split only
# and are then applied unchanged to the validation and test features.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_val_scaled = scaler.transform(X_val)
test_X_scaled = scaler.transform(test_X_numeric)

# Fit the classifier on the scaled training split (fit() returns the estimator itself).
model = LogisticRegression(max_iter=1000).fit(X_train_scaled, y_train)

# Column 1 of predict_proba is the probability of the second class in model.classes_.
y_val_pred = model.predict_proba(X_val_scaled)[:, 1]

# Score the held-out split with ROC AUC.
auc = roc_auc_score(y_val, y_val_pred)
print("Validation AUC after scaling:", auc)


Training set shape: (31740, 245)
Validation set shape: (7935, 245)
Validation AUC after scaling: 0.8128938537403103


In [5]:
# Score the scaled test features with the fitted `model` and write the submission CSV.
# Relies on `model`, `test_X_scaled` and `test_match_ids` created by the
# logistic-regression cell, so that cell must have been run most recently.

# Column 1 of predict_proba is used as the radiant-win probability.
y_test_pred = model.predict_proba(test_X_scaled)[:, 1]

# One row per test match: its ID hash and the predicted probability.
submission_df = pd.DataFrame(
    {'match_id_hash': test_match_ids, 'radiant_win': y_test_pred}
)

# Write without the pandas index so the file holds only the two columns above.
submission_df.to_csv("dota2_baseline_submission.csv", index=False)

print("Submission file created: dota2_baseline_submission.csv")

Submission file created: dota2_baseline_submission.csv


In [6]:
# Random Forest baseline for the Dota 2 dataset:
# numeric features -> train/validation split -> fit -> validation ROC AUC -> submission CSV.
# NOTE: this cell rebinds `model`, `X_train`, `X_val`, `y_val_pred`, `y_test_pred`
# and `submission_df`, replacing the values left by the logistic-regression cells.

from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import train_test_split
import numpy as np

# Step 1: feature matrix = numeric columns of train_X; target = 'radiant_win'.
X_numeric = train_X.select_dtypes(include=np.number)
numeric_cols = X_numeric.columns
y = train_y['radiant_win']

# Step 2: 80/20 train/validation split with a fixed seed.
X_train, X_val, y_train, y_val = train_test_split(
    X_numeric,
    y,
    test_size=0.2,
    random_state=42,
)

print("Training set shape:", X_train.shape)
print("Validation set shape:", X_val.shape)

# Step 3: test features limited to the same numeric columns; match IDs are
# kept separately for the submission file.
test_match_ids = test_X['match_id_hash']
test_X_numeric = test_X[numeric_cols].copy()

# Step 4: fit the forest on the unscaled training split (fit() returns the estimator).
model = RandomForestClassifier(
    n_estimators=200,   # number of trees in the forest
    max_depth=None,     # no depth limit on individual trees
    n_jobs=-1,          # use all available CPU cores
    random_state=42,    # fixed seed for reproducibility
).fit(X_train, y_train)

# Step 5: validation probabilities (column 1 of predict_proba).
y_val_pred = model.predict_proba(X_val)[:, 1]

# Step 6: ROC AUC on the held-out split.
auc = roc_auc_score(y_val, y_val_pred)
print("Validation AUC (Random Forest):", auc)

# Step 7: score the test features and write the submission CSV.
y_test_pred = model.predict_proba(test_X_numeric)[:, 1]

# One row per test match: its ID hash and the predicted probability.
submission_df = pd.DataFrame(
    {'match_id_hash': test_match_ids, 'radiant_win': y_test_pred}
)

# Write without the pandas index.
submission_df.to_csv("dota2_RandomForest_submission.csv", index=False)

print("Submission file created: dota2_RandomForest_submission.csv")


Training set shape: (31740, 245)
Validation set shape: (7935, 245)
Validation AUC (Random Forest): 0.7858020697587627
Submission file created: dota2_baseline_submission.csv


In [7]:
# Cell 2: XGBoost baseline for Dota 2 dataset
!pip install xgboost
import xgboost as xgb
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score
import numpy as np

# 1️⃣ Keep only numeric columns
numeric_cols = train_X.select_dtypes(include=np.number).columns
X_numeric = train_X[numeric_cols]
y = train_y['radiant_win']

# 2️⃣ Split into training and validation sets
X_train, X_val, y_train, y_val = train_test_split(
    X_numeric, y, test_size=0.2, random_state=42
)

print("Training set shape:", X_train.shape)
print("Validation set shape:", X_val.shape)

# 3️⃣ Prepare test set
test_match_ids = test_X['match_id_hash']
test_X_numeric = test_X[numeric_cols].copy()

# 4️⃣ Convert data to DMatrix (XGBoost’s data structure)
dtrain = xgb.DMatrix(X_train, label=y_train)
dval = xgb.DMatrix(X_val, label=y_val)
dtest = xgb.DMatrix(test_X_numeric)

# 5️⃣ Set XGBoost parameters
params = {
    'objective': 'binary:logistic',
    'eval_metric': 'auc',
    'learning_rate': 0.05,
    'max_depth': 6,
    'subsample': 0.8,
    'colsample_bytree': 0.8,
    'seed': 42
}

# 6️⃣ Train XGBoost model
evals = [(dtrain, 'train'), (dval, 'val')]
model = xgb.train(
    params,
    dtrain,
    num_boost_round=500,
    evals=evals,
    early_stopping_rounds=50,
    verbose_eval=50
)

# 7️⃣ Predict on validation set
y_val_pred = model.predict(dval)
auc = roc_auc_score(y_val, y_val_pred)
print("Validation AUC (XGBoost):", auc)
# Predict on test set
y_test_pred = model.predict(dtest)

submission_df = pd.DataFrame({
    'match_id_hash': test_match_ids,
    'radiant_win': y_test_pred
})

submission_df.to_csv("dota2_xgboost_submission.csv", index=False)
print("Submission file created: dota2_xgboost_submission.csv")

Collecting xgboost
  Downloading xgboost-3.0.4-py3-none-win_amd64.whl.metadata (2.1 kB)
Downloading xgboost-3.0.4-py3-none-win_amd64.whl (56.8 MB)
   ---------------------------------------- 0.0/56.8 MB ? eta -:--:--
   ---------------------------------------- 0.0/56.8 MB ? eta -:--:--
   ---------------------------------------- 0.0/56.8 MB ? eta -:--:--
   ---------------------------------------- 0.3/56.8 MB ? eta -:--:--
   ---------------------------------------- 0.5/56.8 MB 838.7 kB/s eta 0:01:08
   ---------------------------------------- 0.5/56.8 MB 838.7 kB/s eta 0:01:08
   ---------------------------------------- 0.5/56.8 MB 838.7 kB/s eta 0:01:08
    --------------------------------------- 0.8/56.8 MB 618.9 kB/s eta 0:01:31
    --------------------------------------- 1.0/56.8 MB 764.6 kB/s eta 0:01:13
    --------------------------------------- 1.3/56.8 MB 836.0 kB/s eta 0:01:07
   - -------------------------------------- 1.8/56.8 MB 997.1 kB/s eta 0:00:56
   - ---------------