In [None]:
# Notebook: ML Model - Match Outcome Prediction
# Purpose: Train Random Forest classifier to predict Dota 2 match outcomes

import mlflow
import os
from pyspark.sql.functions import col
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.classification import RandomForestClassifier
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.ml import Pipeline

# --- Azure Data Lake Storage Gen2 configuration ---
# Storage account and container used to build the abfss:// path read below.
storage_account = "dota2lakehousenew"
container = "data"

# The account key comes from the environment so the secret is never written
# into the notebook source.
access_key = os.getenv("AZURE_STORAGE_ACCOUNT_KEY")

# Fail early with a clear message when the key is missing or empty.
if not access_key:
    raise ValueError("AZURE_STORAGE_ACCOUNT_KEY environment variable not set")

# Register the account key in the Spark session configuration under the
# per-account "fs.azure.account.key.<account>.dfs.core.windows.net" setting.
try:
    spark.conf.set(
        f"fs.azure.account.key.{storage_account}.dfs.core.windows.net",
        access_key
    )
except Exception as e:
    # Report the problem, then re-raise: continuing without the credential
    # would only surface later as a harder-to-diagnose read failure.
    print(f"Connection error: {e}")
    raise
else:
    # Printed only when the configuration call itself succeeded.
    print("Azure ADLS Gen2 connection established")

# --- Load the feature table from the Gold layer ---
print("\nLoading ml_features table from Gold layer...")

# abfss URI of the Delta table stored under gold/ml_features.
gold_path = f"abfss://{container}@{storage_account}.dfs.core.windows.net/gold/ml_features"
df = (
    spark.read
    .format("delta")
    .load(gold_path)
)

# Columns kept for modelling; the final entry, "win", is the label column
# used by the classifier further down.
selected_cols = [
    "kills",
    "deaths",
    "assists",
    "gold_per_min",
    "xp_per_min",
    "hero_damage",
    "tower_damage",
    "last_hits",
    "level",
    "duration_minutes",
    "win",
]

# Keep only the modelling columns and discard every row that has a null
# in any of them.
data = df.select(*selected_cols).dropna()

# Row count of the cleaned dataset (this triggers a Spark job).
count = data.count()
print(f"Data loaded: {count} samples")

# --- Train / test split ---
print("\nSplitting data (80% train / 20% test)...")

# Random 80/20 split with a fixed seed.
train_data, test_data = data.randomSplit(weights=[0.8, 0.2], seed=42)

# Report the size of each split; each count() runs its own Spark job.
train_count = train_data.count()
print(f"Train set: {train_count} samples")
test_count = test_data.count()
print(f"Test set: {test_count} samples")

# --- Model definition ---
print("\nConfiguring Random Forest model...")

# Every selected column except the "win" label is an input feature.
feature_cols = [c for c in selected_cols if c != "win"]

# Pack the individual feature columns into the single vector column
# "features" that the classifier reads.
assembler = VectorAssembler(inputCols=feature_cols, outputCol="features")

# Random forest on the assembled vector, predicting "win".
# The seed is fixed (same value as the train/test split seed) so the
# forest's randomness is pinned across runs.
rf = RandomForestClassifier(
    labelCol="win",
    featuresCol="features",
    numTrees=50,
    maxDepth=10,
    seed=42,
)

# Two-stage pipeline: feature assembly followed by the classifier.
pipeline = Pipeline(stages=[assembler, rf])

# --- Training and evaluation ---
print("\nTraining model with MLflow tracking...")

# Enable MLflow automatic logging before the fit call.
mlflow.autolog()

# Fit the pipeline on the training split, then score the test split.
model = pipeline.fit(train_data)
predictions = model.transform(test_data)

# Accuracy of the "prediction" column against the "win" label.
evaluator = MulticlassClassificationEvaluator(
    metricName="accuracy",
    predictionCol="prediction",
    labelCol="win",
)
accuracy = evaluator.evaluate(predictions)

# --- Report results ---
print("\nModel Results:")
print("Algorithm: Random Forest Classifier")
print(f"Accuracy: {accuracy*100:.2f}%")

# Confusion matrix as counts per (actual label, predicted label) pair.
# The rows are sorted so the table is printed in the same order on every
# run; a bare groupBy().count() gives no ordering guarantee.
print("\nConfusion Matrix:")
(
    predictions
    .groupBy("win", "prediction")
    .count()
    .orderBy("win", "prediction")
    .show()
)

In [None]:
import pandas as pd

# --- Feature importance ---
# Feature names are read from the fitted pipeline's first stage (the
# VectorAssembler) rather than from a hand-copied list, so each name always
# corresponds to the vector position the forest was trained on.
noms_features = list(model.stages[0].getInputCols())

# Importance scores from the last pipeline stage (the fitted random forest),
# converted to a NumPy array.
importances = model.stages[-1].featureImportances.toArray()

# zip() would silently truncate on a length mismatch and mislabel the
# scores, so check explicitly before pairing names with values.
if len(noms_features) != len(importances):
    raise ValueError(
        f"Feature name count ({len(noms_features)}) does not match "
        f"importance vector length ({len(importances)})"
    )

df_importance = pd.DataFrame(
    {"Feature": noms_features, "Importance": importances}
)

# Print the features from most to least important.
print("\nFeature Importance Ranking:")
print(df_importance.sort_values("Importance", ascending=False).to_string(index=False))