# Data Exploration for League of Legends Bayesian Network

This notebook explores the match data, checks data quality, and validates discretization.


In [None]:
# Imports
import sys
# Prepend the parent of the current working directory to the import path so
# that `src` can be imported below. This presumably assumes the notebook is
# launched from its own folder, one level under the project root — confirm.
sys.path.insert(0, '..')

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Project-local modules used throughout the notebook.
from src import config, preprocessing, discretization, variables

# Configure plotting
# Named matplotlib style sheet plus a seaborn colour palette; both are global
# settings that affect every figure drawn after this cell.
plt.style.use('seaborn-v0_8-darkgrid')
sns.set_palette("husl")

# IPython magics (valid only inside an IPython/Jupyter kernel):
# render figures inline in the notebook output...
%matplotlib inline
# ...and enable the autoreload extension in mode 2 so edits to imported
# modules are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2


## 1. Load Raw Data

Let's start by loading a sample of the raw match data.


In [None]:
# Pull a bounded sample of raw matches; 5000 rows keeps exploration fast
sample_size = 5000
raw_data = preprocessing.load_raw_match_data(sample_size=sample_size)

# Report how much was loaded and peek at the leading column names
n_matches, n_columns = len(raw_data), len(raw_data.columns)
leading_columns = list(raw_data.columns)[:20]

print(f"Loaded {n_matches} matches")
print(f"Columns: {n_columns}")
print(f"\nFirst few columns: {leading_columns}")


## 2. Extract Team Features


In [None]:
# Derive the team-level feature table from the raw match sample
features = preprocessing.extract_team_features(raw_data)

# Report the table's size and column names, then preview the first rows
n_rows = len(features)
feature_names = list(features.columns)

print(f"Extracted {n_rows} matches with {len(feature_names)} features")
print(f"\nFeatures: {feature_names}")
print("\nFirst few rows:")
features.head()


## 3. Basic Statistics


In [None]:
# Descriptive statistics for the extracted feature table
feature_stats = features.describe()
feature_stats


In [None]:
# Count missing entries per column once, then reuse the counts for the total
missing_per_column = features.isna().sum()

print("Missing values:")
print(missing_per_column)
print(f"\nTotal missing: {missing_per_column.sum()}")


## 4. Discretization


In [None]:
# Columns removed before discretization; errors='ignore' tolerates their absence
non_model_columns = ['matchId', 'gameDuration']
features_clean = features.drop(columns=non_model_columns, errors='ignore')

# Discretize a copy so `features_clean` itself is left untouched
discretized = discretization.discretize_all_variables(features_clean.copy())

print("Discretized data:")
discretized.head(10)


In [None]:
# Build the discretization summary and print it as plain text without the index
summary = discretization.get_discretization_summary(discretized)
summary_table = summary.to_string(index=False)
print("\nDiscretization Summary:", summary_table, sep="\n")


## 5. Variable Distributions


In [None]:
# Plot distributions
# NOTE(review): this project import lives mid-notebook rather than in the
# top imports cell; the cell therefore must run before any other use of
# `plot_variable_distributions`.
from src.visualize import plot_variable_distributions

# "Sample Data" is the second positional argument — presumably a title or
# label for the figure; confirm against src.visualize.
fig = plot_variable_distributions(discretized, "Sample Data")
plt.show()


## 6. Win Rate Analysis


In [None]:
# Win rate by different factors
def win_rate_by(data, factor, outcome='Win', positive='1'):
    """Return the share of rows counted as a win within each level of `factor`.

    Parameters
    ----------
    data : pandas.DataFrame
        Table containing both the `factor` and `outcome` columns.
    factor : str
        Column whose levels define the groups.
    outcome : str, default 'Win'
        Column holding the match result.
    positive : str, default '1'
        String form of the value in `outcome` that counts as a win.

    Returns
    -------
    pandas.Series
        Win rate in [0, 1] indexed by the levels of `factor`.
    """
    # Compare on the string form so the rate is the same whether the outcome
    # column is stored as the string '1' or the integer 1; a plain `== '1'` on
    # an integer column would silently produce a win rate of 0 everywhere.
    won = data[outcome].astype(str) == positive
    return won.groupby(data[factor]).mean()


# One entry per subplot: (grouping column, panel title, bar colour).
# Order matches the row-major layout of the 2x2 grid.
WIN_RATE_PANELS = [
    ('FB', 'Win Rate by First Blood', 'skyblue'),
    ('FT', 'Win Rate by First Tower', 'lightcoral'),
    ('Gold20', 'Win Rate by Gold Difference @20min', 'lightgreen'),
    ('Baron', 'Win Rate by Baron Kills', 'gold'),
]

fig, axes = plt.subplots(2, 2, figsize=(14, 10))

# Keep the computed rates by factor so they can be inspected after plotting
win_rates = {}
for ax, (factor, title, color) in zip(axes.flat, WIN_RATE_PANELS):
    win_rates[factor] = win_rate_by(discretized, factor)
    win_rates[factor].plot(kind='bar', ax=ax, color=color)
    ax.set_title(title)
    ax.set_ylabel('Win Rate')
    # Fixed 0-1 range so the four panels are directly comparable
    ax.set_ylim([0, 1])

plt.tight_layout()
plt.show()


## 7. Correlation Analysis


In [None]:
# Integer-encode every discretized column so a correlation matrix can be computed
from sklearn.preprocessing import LabelEncoder

# NOTE(review): the integer codes follow the sorted order of each column's
# string labels, which may not match the bins' natural order — confirm before
# reading anything into the sign or size of a coefficient.
encoded_data = discretized.copy()
for col in encoded_data.columns:
    le = LabelEncoder()
    labels_as_text = encoded_data[col].astype(str)
    encoded_data[col] = le.fit_transform(labels_as_text)

# Pairwise correlation of the integer codes
corr = encoded_data.corr()

# Annotated heatmap, colour scale centred on zero
fig, ax = plt.subplots(figsize=(10, 8))
sns.heatmap(
    corr,
    annot=True,
    fmt='.2f',
    cmap='coolwarm',
    center=0,
    square=True,
    linewidths=1,
    ax=ax,
)
ax.set_title('Correlation Matrix of Discretized Variables', fontsize=14, fontweight='bold')
fig.tight_layout()
plt.show()


## 8. Rank Distribution


In [None]:
# Look up the stored match IDs and use them to tag each match with a rank bucket
match_ids = preprocessing.load_match_ids()
features_with_rank = preprocessing.assign_rank_bucket(features, match_ids)

# Tally matches per bucket once; the same counts feed the printout and the chart
rank_counts = features_with_rank['rank_bucket'].value_counts()

print("Rank distribution:")
print(rank_counts)

# Bar chart of matches per rank bucket
rank_counts.plot(kind='bar', color='steelblue')
plt.title('Match Distribution by Rank')
plt.xlabel('Rank')
plt.ylabel('Number of Matches')
plt.xticks(rotation=0)
plt.tight_layout()
plt.show()


## Conclusion

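The data quality looks good if the outputs above confirm each of the following:
- Minimal missing values (see the missing-value counts in Section 3)
- Balanced class distribution (`Win` should be ~50%)
- Clear relationships between objectives and outcomes (Sections 6 and 7)
- Sufficient samples across rank tiers (Section 8)

Once these checks pass, the data is ready for structure learning!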
