In [None]:
# 📦 Cell 1: Install Dependencies and Imports

# Install PyTorch if needed
# The import is attempted first so an existing installation is reused as-is.
try:
    import torch
    print("✅ PyTorch already installed")
except ImportError:
    print("📦 Installing PyTorch...")
    # %pip is an IPython magic, so this cell only runs inside a notebook kernel.
    # The index URL selects the cu118 wheel index; none of the packages is version-pinned.
    %pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu118
    import torch

# All necessary imports
import json
import logging
import os
import sqlite3
import time
from contextlib import closing
from datetime import datetime, timedelta
from typing import Dict, List, Optional, Sequence, Tuple

import joblib
import numpy as np
import pandas as pd
import torch.nn as nn
import torch.optim as optim
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from torch.utils.data import Dataset, DataLoader

# Pick the compute device once; every later cell reads the global `device`.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
print(f"🔥 Using device: {device}")
if device.type == "cuda":
    # Only query the GPU name when CUDA was actually selected above.
    print(f"🚀 GPU: {torch.cuda.get_device_name(0)}")

print("✅ All imports loaded successfully!")


In [None]:
# 🧠 Cell 2: PyTorch Classes and Utilities

class KillPredictionDataset(Dataset):
    """Dataset that serves (features, target) pairs as float32 tensors."""

    def __init__(self, X: np.ndarray, y: np.ndarray):
        # Convert both arrays to float tensors once, at construction time.
        self.X, self.y = torch.FloatTensor(X), torch.FloatTensor(y)

    def __len__(self):
        """Number of samples, i.e. the size of X along its first axis."""
        return self.X.shape[0]

    def __getitem__(self, idx):
        """Return the feature tensor and the target stored at position `idx`."""
        features = self.X[idx]
        target = self.y[idx]
        return features, target

class KillPredictionNN(nn.Module):
    """Fully connected regressor that maps a feature vector to a single value.

    Every hidden stage is ``Linear -> ReLU -> Dropout -> BatchNorm1d``; a final
    ``Linear(prev_size, 1)`` produces the output. The stage layout (and hence
    the ``network.<index>`` state-dict keys) is the same for every dropout
    value, so checkpoints stay loadable.

    Args:
        input_size: Number of input features.
        hidden_sizes: Width of each hidden stage, in order. ``None`` selects
            ``DEFAULT_HIDDEN_SIZES``.
        dropout: Drop probability used by every ``nn.Dropout`` layer.
    """

    # Immutable default so the architecture cannot be altered by mutating a
    # shared default-argument list.
    DEFAULT_HIDDEN_SIZES = (128, 64, 32)

    def __init__(self, input_size: int, hidden_sizes: Optional[Sequence[int]] = None,
                 dropout: float = 0.2):
        super().__init__()

        if hidden_sizes is None:
            hidden_sizes = self.DEFAULT_HIDDEN_SIZES

        layers = []
        prev_size = input_size

        for hidden_size in hidden_sizes:
            layers.extend([
                nn.Linear(prev_size, hidden_size),
                nn.ReLU(),
                nn.Dropout(dropout),
                nn.BatchNorm1d(hidden_size)
            ])
            prev_size = hidden_size

        # Single-unit regression head.
        layers.append(nn.Linear(prev_size, 1))
        self.network = nn.Sequential(*layers)

    def forward(self, x):
        """Run `x` through the stack; the result has a trailing dimension of size 1."""
        return self.network(x)

def check_database_schema(db_path, required_tables=None):
    """Check that the SQLite file at `db_path` contains every required table.

    Args:
        db_path: Path to the SQLite database file.
        required_tables: Table names that must exist. ``None`` selects the
            tables referenced by the training queries in this notebook.

    Returns:
        True only when the file exists, can be queried, and holds all required
        tables; False otherwise (the reason is printed).
    """
    if required_tables is None:
        # players/teams/matches/player_match_stats plus the two tables the
        # loaders additionally JOIN against (maps, tournaments).
        required_tables = ['players', 'matches', 'teams', 'player_match_stats',
                           'maps', 'tournaments']

    if not os.path.exists(db_path):
        print(f"❌ Database file not found: {db_path}")
        return False

    try:
        # closing() guarantees the connection is released even if the query fails.
        with closing(sqlite3.connect(db_path)) as conn:
            rows = conn.execute("SELECT name FROM sqlite_master WHERE type='table';").fetchall()
    except sqlite3.Error as e:
        print(f"❌ Database error: {e}")
        return False

    tables = {row[0] for row in rows}
    missing_tables = [table for table in required_tables if table not in tables]
    if missing_tables:
        # A missing table is a failed check, not a success with a warning.
        print(f"⚠️ Missing tables: {missing_tables}")
        return False

    print(f"✅ Found all required tables: {required_tables}")
    return True

# Confirmation banner printed once the definitions in this cell have been executed.
print("✅ PyTorch classes loaded!")


In [None]:
# 📊 Cell 3: Database and Training Classes (Per-Map Prediction)

# Configure the root logger at INFO level (the loaders below log their progress with logger.info).
logging.basicConfig(level=logging.INFO)
# Module-level logger used by the data-loader classes defined in this notebook.
logger = logging.getLogger(__name__)

class DatabaseDataLoader:
    """Loads per-map player statistics from SQLite and derives model features.

    Attributes:
        db_path: Path of the SQLite database file.
        scaler: StandardScaler fitted by `prepare_training_data`.
        feature_columns: Feature names chosen by the last
            `prepare_training_data` call (None before that).
    """

    # Fallback feature values for rows without (enough) earlier history.
    DEFAULT_AVG_KILLS = 15.0
    DEFAULT_AVG_KDR = 1.0
    DEFAULT_DAYS_SINCE_LAST = 7.0

    def __init__(self, db_path: str):
        self.db_path = db_path
        self.scaler = StandardScaler()
        self.feature_columns = None

    def get_connection(self):
        """Open a new SQLite connection; the caller is responsible for closing it."""
        return sqlite3.connect(self.db_path)

    def load_player_match_data(self, min_maps: int = 20, days_back: int = 365) -> pd.DataFrame:
        """Load per-map player rows newer than `days_back` days.

        Players with fewer than `min_maps` rows in that window are dropped.
        """
        logger.info("Loading per-map player data from database...")
        cutoff_date = datetime.now() - timedelta(days=days_back)

        query = """
        SELECT
            p.name as player_name,
            t.name as team_name,
            pms.team_id as team_id,
            m.match_date,
            m.series_type,
            tour.name as tournament_name,
            mp.map_name,
            pms.kills,
            pms.deaths,
            pms.assists,
            pms.acs,
            pms.adr,
            pms.fk,
            pms.hs_percentage,
            pms.kdr,
            m.match_id,
            pms.map_id
        FROM player_match_stats pms
        JOIN players p ON pms.player_id = p.id
        JOIN teams t ON pms.team_id = t.id
        JOIN matches m ON pms.match_id = m.id
        JOIN maps mp ON pms.map_id = mp.id
        JOIN tournaments tour ON m.tournament_id = tour.id
        WHERE m.match_date >= ?
        ORDER BY p.name, m.match_date, pms.map_id
        """

        # `with connection:` only manages the transaction; closing() is what
        # actually releases the connection.
        with closing(self.get_connection()) as conn:
            # Bind the cutoff as text in the same "YYYY-MM-DD HH:MM:SS.ffffff"
            # form the (deprecated since Python 3.12) default datetime adapter produced.
            df = pd.read_sql_query(query, conn, params=(cutoff_date.isoformat(" "),))

        logger.info(f"Loaded {len(df)} per-map records")

        # Filter players with minimum MAP count
        player_map_counts = df['player_name'].value_counts()
        valid_players = player_map_counts[player_map_counts >= min_maps].index
        df = df[df['player_name'].isin(valid_players)]

        logger.info(f"Filtered to {len(df)} map records from {len(valid_players)} players")
        return df

    @staticmethod
    def _mean_ignoring_nan(values: np.ndarray) -> float:
        """Mean of the non-NaN entries, or NaN when there are none (like Series.mean)."""
        valid = values[~np.isnan(values)]
        return valid.mean() if valid.size else np.nan

    def calculate_map_features(self, df: pd.DataFrame) -> pd.DataFrame:
        """Add history-based features to every per-map row.

        For each row, "history" is the same player's rows with a strictly
        earlier `match_date`. Added columns:
          * hist_avg_kills / hist_avg_kdr: mean over the last 10 history rows
          * recent_kills_5: mean kills over the last 5 history rows (default
            when fewer than 5 exist)
          * days_since_last: whole days since the latest history row
          * series_importance: 1/2/3 for bo1/bo3/bo5, 1 for anything else
        Rows without history get the class-level defaults. The input frame is
        not modified; a sorted copy with a fresh index is returned, with any
        remaining NaN replaced by 0.
        """
        logger.info("Calculating per-map features...")
        print("⚠️ WARNING: This may take 30+ minutes with large datasets!")
        print("💡 If it takes too long, use the ultra-fast version (Cells 7-8)")

        # Work on a copy so the caller's frame keeps its original dtypes/order.
        df = df.copy()
        df['match_date'] = pd.to_datetime(df['match_date'])
        df = df.sort_values(['player_name', 'match_date', 'map_id']).reset_index(drop=True)

        total = len(df)
        hist_avg_kills = np.full(total, self.DEFAULT_AVG_KILLS)
        hist_avg_kdr = np.full(total, self.DEFAULT_AVG_KDR)
        recent_kills_5 = np.full(total, self.DEFAULT_AVG_KILLS)
        days_since_last = np.full(total, self.DEFAULT_DAYS_SINCE_LAST)

        # Each player's rows are contiguous and date-sorted, so the history of a
        # row is simply a prefix of its group; no whole-frame scan per row.
        for _, group in df.groupby('player_name', sort=False):
            positions = group.index.to_numpy()
            date_values = group['match_date'].values
            kills = group['kills'].to_numpy(dtype=float)
            kdr = group['kdr'].to_numpy(dtype=float)

            # Number of rows with a strictly earlier date (side='left' excludes ties,
            # i.e. other maps played on the same date).
            prior_counts = np.searchsorted(date_values, date_values, side='left')
            # A missing date compares False against everything: treat as "no history".
            prior_counts[np.isnat(date_values)] = 0

            for offset, row in enumerate(positions):
                if row % 10000 == 0:
                    print(f"Processing record {row}/{total}...")

                n_prior = prior_counts[offset]
                if n_prior == 0:
                    continue  # defaults already in place

                start_10 = max(0, n_prior - 10)
                hist_avg_kills[row] = self._mean_ignoring_nan(kills[start_10:n_prior])
                hist_avg_kdr[row] = self._mean_ignoring_nan(kdr[start_10:n_prior])
                if n_prior >= 5:
                    recent_kills_5[row] = self._mean_ignoring_nan(kills[n_prior - 5:n_prior])
                gap = date_values[offset] - date_values[n_prior - 1]
                days_since_last[row] = pd.Timedelta(gap).days

        # Add features to dataframe
        df['hist_avg_kills'] = hist_avg_kills
        df['hist_avg_kdr'] = hist_avg_kdr
        df['recent_kills_5'] = recent_kills_5
        df['days_since_last'] = days_since_last

        # Add series importance
        series_importance = {'bo1': 1, 'bo3': 2, 'bo5': 3}
        df['series_importance'] = df['series_type'].map(series_importance).fillna(1)

        df = df.fillna(0)
        logger.info(f"Calculated features for {len(df)} records")
        return df

    def prepare_training_data(self, df: pd.DataFrame) -> Tuple[np.ndarray, np.ndarray, List[str]]:
        """Build the standardised feature matrix and the kill targets.

        Returns:
            (X_scaled, y, feature_names). `self.scaler` is fitted on all rows
            of `df` and `self.feature_columns` is set to `feature_names`.

        Raises:
            ValueError: if `df` has no rows or none of the expected feature columns.
        """
        logger.info("Preparing training data...")

        feature_columns = [
            'hist_avg_kills', 'hist_avg_kdr', 'recent_kills_5',
            'days_since_last', 'series_importance'
        ]

        available_features = [col for col in feature_columns if col in df.columns]
        # Fail with a clear message instead of an opaque min()/scaler error.
        if len(df) == 0 or not available_features:
            raise ValueError("No data available for training")

        X = df[available_features].values
        y = df['kills'].values

        print(f"🎯 Target statistics:")
        print(f"   Min kills: {y.min()}")
        print(f"   Max kills: {y.max()}")
        print(f"   Mean kills: {y.mean():.2f}")

        # NOTE(review): GPUTrainer standardises the returned matrix a second time
        # with its own scaler, so this scaler is also needed to reproduce the
        # model input at inference time.
        X_scaled = self.scaler.fit_transform(X)
        self.feature_columns = available_features

        logger.info(f"Training data: {X_scaled.shape[0]} samples, {X_scaled.shape[1]} features")
        return X_scaled, y, available_features

class GPUTrainer:
    """Trains KillPredictionNN on DatabaseDataLoader features and saves the result.

    Attributes:
        scaler: StandardScaler fitted on the training split.
        results: Metrics of the most recent `train_and_save_model` run.
        data_loader: DatabaseDataLoader bound to the given database path.
    """

    HIDDEN_SIZES = [128, 64, 32]
    MODEL_PATH = 'models/neural_network_gpu_model.pkl'

    def __init__(self, db_path: str):
        self.scaler = StandardScaler()
        self.results = {}
        self.data_loader = DatabaseDataLoader(db_path=db_path)

    @staticmethod
    def _predict(model, inputs):
        """Return eval-mode predictions for `inputs` as a 1-D numpy array."""
        model.eval()
        with torch.no_grad():
            # squeeze(-1) drops only the output dimension, so a single-row
            # input still yields shape (1,) rather than a 0-d array.
            return model(inputs).squeeze(-1).cpu().numpy()

    def train_and_save_model(self, epochs: int = 50, batch_size: int = 64,
                             learning_rate: float = 0.001):
        """Load data, train the network, evaluate on a 20% hold-out and save it.

        Args:
            epochs: Number of passes over the training split.
            batch_size: Mini-batch size of the training DataLoader.
            learning_rate: Adam learning rate.

        Returns:
            dict with 'mse', 'mae', 'r2' (hold-out metrics) and 'feature_count'.

        Raises:
            ValueError: if no training data is available.
        """
        print("🎯 Starting GPU Training...")

        # Load and prepare data
        df = self.data_loader.load_player_match_data(min_maps=20)
        print(f"📊 Loaded {len(df)} map records")

        df = self.data_loader.calculate_map_features(df)
        X, y, feature_columns = self.data_loader.prepare_training_data(df)

        if X.size == 0 or y.size == 0:
            raise ValueError("No data available for training")

        print(f"✅ Data ready: {len(X)} samples with {len(feature_columns)} features")

        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

        X_train_scaled = self.scaler.fit_transform(X_train)
        X_test_scaled = self.scaler.transform(X_test)

        X_test_tensor = torch.FloatTensor(X_test_scaled).to(device)
        y_test_np = np.asarray(y_test, dtype=np.float32)

        # Train neural network
        print("🧠 Training Neural Network...")

        input_size = X_train_scaled.shape[1]
        hidden_sizes = list(self.HIDDEN_SIZES)
        model = KillPredictionNN(input_size, hidden_sizes=hidden_sizes).to(device)

        criterion = nn.MSELoss()
        optimizer = optim.Adam(model.parameters(), lr=learning_rate)

        # The dataset converts the numpy arrays itself; batches are moved to
        # the device inside the loop.
        train_dataset = KillPredictionDataset(X_train_scaled, y_train)
        # BatchNorm1d cannot run in training mode on a batch of one sample, so
        # drop the trailing batch when it would contain exactly one row.
        drop_last = len(train_dataset) > 1 and len(train_dataset) % batch_size == 1
        train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True,
                                  drop_last=drop_last)

        for epoch in range(epochs):
            model.train()
            train_loss = 0.0

            for batch_X, batch_y in train_loader:
                batch_X, batch_y = batch_X.to(device), batch_y.to(device)
                optimizer.zero_grad()
                outputs = model(batch_X).squeeze(-1)
                loss = criterion(outputs, batch_y)
                loss.backward()
                optimizer.step()
                train_loss += loss.item()

            if epoch % 10 == 0:
                val_mae = mean_absolute_error(y_test_np, self._predict(model, X_test_tensor))
                print(f"Epoch {epoch}: Train Loss = {train_loss/len(train_loader):.4f}, MAE = {val_mae:.3f}")

        # Final evaluation
        y_pred = self._predict(model, X_test_tensor)

        mse = mean_squared_error(y_test_np, y_pred)
        mae = mean_absolute_error(y_test_np, y_pred)
        r2 = r2_score(y_test_np, y_pred)

        print(f"\n🎉 Training Results:")
        print(f"🎯 MAE: {mae:.3f} kills per map")
        print(f"📈 R²: {r2:.6f}")

        # Save model
        model_dir = os.path.dirname(self.MODEL_PATH)
        if model_dir:
            os.makedirs(model_dir, exist_ok=True)
        model_data = {
            # Stored on the CPU so the file can be loaded on machines without CUDA.
            'model_state_dict': {name: tensor.detach().cpu()
                                 for name, tensor in model.state_dict().items()},
            'input_size': input_size,
            'hidden_sizes': hidden_sizes,
            # Inputs were standardised twice: first by the loader's scaler, then by
            # this trainer's. Inference must apply 'feature_scaler' and then 'scaler'.
            'feature_scaler': self.data_loader.scaler,
            'scaler': self.scaler,
            'feature_columns': feature_columns,
            'performance': {'mse': mse, 'mae': mae, 'r2': r2}
        }

        joblib.dump(model_data, self.MODEL_PATH)
        print("✅ Model saved!")

        self.results = {'mse': mse, 'mae': mae, 'r2': r2, 'feature_count': len(feature_columns)}
        return dict(self.results)

# Confirmation banners printed once the classes in this cell have been defined.
print("✅ Database and training classes loaded!")
print("🎯 Ready for per-map kill prediction training")


In [None]:
# 📁 Cell 4: Upload Database File

from google.colab import files

print("📤 Please upload your valorant_matches.db file:")
uploaded = files.upload()

if not uploaded:
    print("❌ No file uploaded")
else:
    # The first key of the upload result is used as the database path.
    db_path = next(iter(uploaded))
    print(f"✅ Database uploaded: {db_path}")

    # Report the size in MB as a quick sanity check on the upload.
    bytes_per_mb = 1024 * 1024
    file_size = os.path.getsize(db_path) / bytes_per_mb
    print(f"📊 File size: {file_size:.2f} MB")

    schema_ok = check_database_schema(db_path)
    print("✅ Database structure verified!" if schema_ok
          else "⚠️ Database structure check failed, but continuing...")


In [None]:
# 🚀 Cell 5: Start Training (Per-Map Prediction)

# Runs the per-map training end to end and reports the hold-out metrics.
# Requires the `uploaded` mapping produced by the upload cell (Cell 4).
if 'uploaded' in globals() and uploaded:
    db_path = list(uploaded.keys())[0]
    print(f"🎯 Starting training with database: {db_path}")
    print(f"🔧 Goal: Predict kills per map (target MAE: 1-2 kills)")
    print(f"⚠️ WARNING: This may take 30+ minutes with large datasets!")
    print(f"💡 If too slow, stop and use ultra-fast version (Cells 7-8)")

    try:
        trainer = GPUTrainer(db_path=db_path)
        start_time = time.time()

        print("\n" + "="*60)
        print("🚀 STARTING TRAINING")
        print("="*60)

        results = trainer.train_and_save_model()

        elapsed = time.time() - start_time
        print(f"\n🎉 TRAINING COMPLETED!")
        print(f"⏱️ Total time: {elapsed/60:.2f} minutes")

        # Show results
        mae = results['mae']
        print(f"\n🏆 Model Performance:")
        print(f"  🎯 MAE: {mae:.3f} kills per map")
        print(f"  📈 R²: {results['r2']:.6f}")
        print(f"  🔢 Features: {results['feature_count']}")

        # NOTE(review): 41.2 is the earlier MAE quoted by the original messages;
        # confirm it is still the right baseline to compare against.
        previous_mae = 41.2
        if mae <= 2.0:
            print(f"🎉 EXCELLENT! MAE of {mae:.2f} is perfect for per-map prediction!")
        elif mae <= 5.0:
            print(f"🎯 MUCH BETTER! MAE improved from {previous_mae} to {mae:.2f}")
        elif mae < previous_mae:
            print(f"⚠️ Still improving: MAE is {mae:.2f} (down from {previous_mae})")
        else:
            # Do not claim an improvement when the new MAE is not below the baseline.
            print(f"⚠️ No improvement: MAE is {mae:.2f} (previous: {previous_mae})")

        print("\n✅ Model ready! Run Cell 6 to download.")

    except Exception as e:
        # Deliberate catch-all: report the failure with its traceback and keep
        # the notebook usable instead of aborting the cell.
        print(f"❌ Training failed: {e}")
        import traceback
        traceback.print_exc()
        print("\n💡 Try the ultra-fast version (Cells 7-8) if this keeps failing")

else:
    print("❌ Please upload your database file first (run Cell 4)")


In [None]:
# 📥 Cell 6: Download Trained Model

# Offers the saved model file as a browser download (Colab only).
try:
    from google.colab import files

    if os.path.exists('models/neural_network_gpu_model.pkl'):
        print("📦 Downloading your trained model...")
        files.download('models/neural_network_gpu_model.pkl')
        print("✅ Model downloaded successfully!")
        print("\n🎯 You can now use this model to make kill predictions!")
        print("\n📋 What you got:")
        print("  🧠 Trained neural network model")
        print("  📊 Feature scaler")
        print("  📈 Performance metrics")
        print("  🔢 Feature column names")
    else:
        print("❌ No trained model found. Please run the training cell first.")

except Exception as e:
    # Also reached when google.colab is unavailable (the import above fails).
    print(f"❌ Download error: {e}")
    print("💡 You can find the model in the files panel on the left.")


In [None]:
# ⚡ Cell 7: ULTRA-FAST Alternative Classes (Use If Cell 5 Is Too Slow!)

# Banner explaining when to use the alternative classes defined in this cell.
print("⚡ ULTRA-FAST ALTERNATIVE CLASSES")
print("🚀 100x faster feature engineering using vectorized operations")
print("💡 Only use this if Cell 5 is taking too long (30+ minutes)")

class FastDatabaseDataLoader:
    """Vectorised variant of the per-map data loader.

    Attributes:
        db_path: Path of the SQLite database file.
        scaler: StandardScaler fitted by `prepare_training_data`.
        feature_columns: Feature names chosen by the last
            `prepare_training_data` call (None before that).
    """

    def __init__(self, db_path: str):
        self.db_path = db_path
        self.scaler = StandardScaler()
        self.feature_columns = None

    def get_connection(self):
        """Open a new SQLite connection; the caller is responsible for closing it."""
        return sqlite3.connect(self.db_path)

    def load_player_match_data(self, min_maps: int = 20, days_back: int = 365) -> pd.DataFrame:
        """Load per-map player rows newer than `days_back` days.

        Players with fewer than `min_maps` rows in that window are dropped.
        """
        logger.info("Loading per-map player data from database...")
        cutoff_date = datetime.now() - timedelta(days=days_back)

        query = """
        SELECT
            p.name as player_name, t.name as team_name, pms.team_id as team_id,
            m.match_date, m.series_type, tour.name as tournament_name,
            mp.map_name, pms.kills, pms.deaths, pms.assists, pms.acs, pms.adr,
            pms.fk, pms.hs_percentage, pms.kdr, m.match_id, pms.map_id
        FROM player_match_stats pms
        JOIN players p ON pms.player_id = p.id
        JOIN teams t ON pms.team_id = t.id
        JOIN matches m ON pms.match_id = m.id
        JOIN maps mp ON pms.map_id = mp.id
        JOIN tournaments tour ON m.tournament_id = tour.id
        WHERE m.match_date >= ?
        ORDER BY p.name, m.match_date, pms.map_id
        """

        # `with connection:` only manages the transaction; closing() is what
        # actually releases the connection.
        with closing(self.get_connection()) as conn:
            # Bind the cutoff as text in the same "YYYY-MM-DD HH:MM:SS.ffffff"
            # form the (deprecated since Python 3.12) default datetime adapter produced.
            df = pd.read_sql_query(query, conn, params=(cutoff_date.isoformat(" "),))

        player_map_counts = df['player_name'].value_counts()
        valid_players = player_map_counts[player_map_counts >= min_maps].index
        df = df[df['player_name'].isin(valid_players)]

        logger.info(f"Loaded {len(df)} map records from {len(valid_players)} players")
        return df

    @staticmethod
    def _lagged_rolling_mean(frame: pd.DataFrame, column: str, window: int) -> pd.Series:
        """Per-player mean of the previous `window` values of `column`.

        The shift happens inside each player group, so a player's first row is
        NaN instead of inheriting the preceding player's value. The result is
        indexed like `frame`.
        """
        if frame.empty:
            return pd.Series(index=frame.index, dtype=float)
        lagged = frame.groupby('player_name')[column].shift(1)
        rolled = lagged.groupby(frame['player_name']).rolling(window, min_periods=1).mean()
        # Drop the player level so the values align with the frame's own index.
        return rolled.reset_index(level=0, drop=True)

    def calculate_map_features_FAST(self, df: pd.DataFrame) -> pd.DataFrame:
        """Add history-based features using grouped, vectorised pandas operations.

        Added columns: hist_avg_kills and hist_avg_kdr (mean of the player's
        previous 10 rows), recent_kills_5 (previous 5 rows), days_since_last
        (days since the player's previous row) and series_importance. Rows
        without earlier history fall back to 15.0 / 1.0 / 7.0. The input frame
        is not modified; a sorted copy with a fresh index is returned.
        """
        print("⚡ Starting ULTRA-FAST feature engineering...")
        start_time = time.time()

        # Work on a copy so the caller's frame keeps its original dtypes/order.
        df = df.copy()
        df['match_date'] = pd.to_datetime(df['match_date'])
        df = df.sort_values(['player_name', 'match_date', 'map_id']).reset_index(drop=True)

        print("⚡ Calculating vectorized features...")

        # Only rows that precede the current one (within the same player) feed
        # each average, so the current map's own stats never leak in.
        df['hist_avg_kills'] = self._lagged_rolling_mean(df, 'kills', 10).fillna(15.0)
        df['hist_avg_kdr'] = self._lagged_rolling_mean(df, 'kdr', 10).fillna(1.0)

        # Recent form
        df['recent_kills_5'] = self._lagged_rolling_mean(df, 'kills', 5).fillna(df['hist_avg_kills'])

        # Other features
        df['days_since_last'] = df.groupby('player_name')['match_date'].diff().dt.days.fillna(7.0)

        # Series importance
        series_importance = {'bo1': 1, 'bo3': 2, 'bo5': 3}
        df['series_importance'] = df['series_type'].map(series_importance).fillna(1)

        df = df.fillna(0)

        elapsed = time.time() - start_time
        print(f"🎉 ULTRA-FAST feature engineering completed in {elapsed:.1f} seconds!")
        return df

    def prepare_training_data(self, df: pd.DataFrame) -> Tuple[np.ndarray, np.ndarray, List[str]]:
        """Build the standardised feature matrix and the kill targets.

        Returns:
            (X_scaled, y, feature_names). `self.scaler` is fitted on all rows
            of `df` and `self.feature_columns` is set to `feature_names`.

        Raises:
            ValueError: if `df` has no rows or none of the expected feature columns.
        """
        feature_columns = [
            'hist_avg_kills', 'hist_avg_kdr', 'recent_kills_5',
            'days_since_last', 'series_importance'
        ]

        available_features = [col for col in feature_columns if col in df.columns]
        # Fail with a clear message instead of an opaque scaler error.
        if len(df) == 0 or not available_features:
            raise ValueError("No data available for training")

        X = df[available_features].values
        y = df['kills'].values

        # NOTE(review): FastGPUTrainer standardises the returned matrix a second
        # time with its own scaler, so this scaler is also needed to reproduce
        # the model input at inference time.
        X_scaled = self.scaler.fit_transform(X)
        self.feature_columns = available_features
        return X_scaled, y, available_features

class FastGPUTrainer:
    """Trains KillPredictionNN on FastDatabaseDataLoader features and saves the result.

    Attributes:
        scaler: StandardScaler fitted on the training split.
        results: Metrics of the most recent `train_and_save_model` run.
        data_loader: FastDatabaseDataLoader bound to the given database path.
    """

    HIDDEN_SIZES = [128, 64, 32]
    MODEL_PATH = 'models/neural_network_gpu_model.pkl'

    def __init__(self, db_path: str):
        self.scaler = StandardScaler()
        self.results = {}
        self.data_loader = FastDatabaseDataLoader(db_path=db_path)

    @staticmethod
    def _predict(model, inputs):
        """Return eval-mode predictions for `inputs` as a 1-D numpy array."""
        model.eval()
        with torch.no_grad():
            # squeeze(-1) drops only the output dimension, so a single-row
            # input still yields shape (1,) rather than a 0-d array.
            return model(inputs).squeeze(-1).cpu().numpy()

    def train_and_save_model(self, epochs: int = 50, batch_size: int = 64,
                             learning_rate: float = 0.001):
        """Load data, train the network, evaluate on a 20% hold-out and save it.

        Args:
            epochs: Number of passes over the training split.
            batch_size: Mini-batch size of the training DataLoader.
            learning_rate: Adam learning rate.

        Returns:
            dict with 'mse', 'mae', 'r2' (hold-out metrics) and 'feature_count'.

        Raises:
            ValueError: if no training data is available.
        """
        print("⚡ Starting ULTRA-FAST GPU Training...")

        # Load and prepare data (FAST VERSION)
        df = self.data_loader.load_player_match_data(min_maps=20)
        df = self.data_loader.calculate_map_features_FAST(df)
        X, y, feature_columns = self.data_loader.prepare_training_data(df)

        if X.size == 0 or y.size == 0:
            raise ValueError("No data available for training")

        print(f"✅ Data ready: {len(X)} samples with {len(feature_columns)} features")

        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

        X_train_scaled = self.scaler.fit_transform(X_train)
        X_test_scaled = self.scaler.transform(X_test)

        X_test_tensor = torch.FloatTensor(X_test_scaled).to(device)
        y_test_np = np.asarray(y_test, dtype=np.float32)

        input_size = X_train_scaled.shape[1]
        hidden_sizes = list(self.HIDDEN_SIZES)
        model = KillPredictionNN(input_size, hidden_sizes=hidden_sizes).to(device)

        criterion = nn.MSELoss()
        optimizer = optim.Adam(model.parameters(), lr=learning_rate)

        # The dataset converts the numpy arrays itself; batches are moved to
        # the device inside the loop.
        train_dataset = KillPredictionDataset(X_train_scaled, y_train)
        # BatchNorm1d cannot run in training mode on a batch of one sample, so
        # drop the trailing batch when it would contain exactly one row.
        drop_last = len(train_dataset) > 1 and len(train_dataset) % batch_size == 1
        train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True,
                                  drop_last=drop_last)

        # Training loop
        for epoch in range(epochs):
            model.train()
            train_loss = 0.0

            for batch_X, batch_y in train_loader:
                batch_X, batch_y = batch_X.to(device), batch_y.to(device)
                optimizer.zero_grad()
                outputs = model(batch_X).squeeze(-1)
                loss = criterion(outputs, batch_y)
                loss.backward()
                optimizer.step()
                train_loss += loss.item()

            if epoch % 10 == 0:
                val_mae = mean_absolute_error(y_test_np, self._predict(model, X_test_tensor))
                print(f"Epoch {epoch}: Train Loss = {train_loss/len(train_loader):.4f}, MAE = {val_mae:.3f}")

        # Final evaluation
        y_pred = self._predict(model, X_test_tensor)

        mse = mean_squared_error(y_test_np, y_pred)
        mae = mean_absolute_error(y_test_np, y_pred)
        r2 = r2_score(y_test_np, y_pred)

        print(f"\n🎉 ULTRA-FAST RESULTS:")
        print(f"🎯 MAE: {mae:.3f} kills per map")
        print(f"📈 R²: {r2:.6f}")

        # Save model
        model_dir = os.path.dirname(self.MODEL_PATH)
        if model_dir:
            os.makedirs(model_dir, exist_ok=True)
        model_data = {
            # Stored on the CPU so the file can be loaded on machines without CUDA.
            'model_state_dict': {name: tensor.detach().cpu()
                                 for name, tensor in model.state_dict().items()},
            'input_size': input_size,
            'hidden_sizes': hidden_sizes,
            # Inputs were standardised twice: first by the loader's scaler, then by
            # this trainer's. Inference must apply 'feature_scaler' and then 'scaler'.
            'feature_scaler': self.data_loader.scaler,
            'scaler': self.scaler,
            'feature_columns': feature_columns,
            'performance': {'mse': mse, 'mae': mae, 'r2': r2}
        }

        joblib.dump(model_data, self.MODEL_PATH)
        print("✅ ULTRA-FAST model saved!")

        self.results = {'mse': mse, 'mae': mae, 'r2': r2, 'feature_count': len(feature_columns)}
        return dict(self.results)

# Confirmation banners printed once the fast classes in this cell have been defined.
print("⚡ Ultra-fast classes ready!")
print("🚀 Only use these if Cell 5 is taking too long!")


In [None]:
# ⚡ Cell 8: RUN ULTRA-FAST TRAINING (Alternative to Cell 5)

print("⚡ ULTRA-FAST TRAINING OPTION")
print("🚀 Only run this if Cell 5 is taking too long (30+ minutes)")
print("⏱️ Expected time: 5-10 minutes total")
print("")
print("💡 Instructions:")
print("1. If Cell 5 is stuck, stop it (Runtime → Interrupt)")
print("2. Make sure you ran Cell 7 first to load the fast classes")
print("3. Set RUN_ULTRA_FAST_TRAINING = True below")
print("4. Run this cell")
print("")

# Opt-in switch. It replaces the former "remove the triple quotes" step, so the
# training code stays real, syntax-checked code while remaining off by default.
RUN_ULTRA_FAST_TRAINING = False

if not RUN_ULTRA_FAST_TRAINING:
    print("⏸️ Ultra-fast training is switched off (RUN_ULTRA_FAST_TRAINING = False)")
elif 'FastGPUTrainer' not in globals():
    print("❌ Please run Cell 7 first to load the fast classes")
elif 'uploaded' in globals() and uploaded:
    db_path = list(uploaded.keys())[0]
    print(f"⚡ Starting ULTRA-FAST training with: {db_path}")

    try:
        trainer = FastGPUTrainer(db_path=db_path)
        start_time = time.time()

        print("\n" + "⚡"*60)
        print("🚀 ULTRA-FAST TRAINING - 100x FASTER!")
        print("⚡"*60)

        results = trainer.train_and_save_model()

        elapsed = time.time() - start_time
        print(f"\n🎉 ULTRA-FAST TRAINING COMPLETED!")
        print(f"⏱️ Total time: {elapsed/60:.2f} minutes")
        print(f"⚡ Compare to slow version: Would have taken hours!")

        mae = results['mae']
        print(f"\n🏆 Performance:")
        print(f"  🎯 MAE: {mae:.3f} kills per map")
        print(f"  📈 R²: {results['r2']:.6f}")
        print(f"  🔢 Features: {results['feature_count']}")

        if mae <= 2.0:
            print(f"🎉 EXCELLENT! MAE of {mae:.2f} is perfect!")
        elif mae <= 5.0:
            print(f"🎯 MUCH BETTER! MAE improved from 41.2 to {mae:.2f}")

        print("\n✅ ULTRA-FAST model ready! Run Cell 6 to download.")

    except Exception as e:
        # Deliberate catch-all: report the failure with its traceback and keep
        # the notebook usable instead of aborting the cell.
        print(f"❌ Training failed: {e}")
        import traceback
        traceback.print_exc()

else:
    print("❌ Please upload your database file first (run Cell 4)")

print("\n🔧 Why this is faster:")
print("   • Vectorized pandas operations (no slow loops)")
print("   • Optimized feature engineering")
print("   • Same accuracy as slow version")
print("   • 100x speed improvement")


In [None]:
# 🔍 Cell 9: DEBUG - Check Our Data (Run This First!)

print("🔍 DEBUGGING THE 52.6 MAE PROBLEM")
print("Let's examine what our data actually looks like...")

if 'uploaded' in globals() and uploaded:
    db_path = list(uploaded.keys())[0]

    # Check the actual kills data.
    # Only the first 20 joined rows are fetched, so every statistic printed
    # below describes that small sample, not the whole table.
    query = """
    SELECT 
        p.name as player_name,
        mp.map_name,
        pms.kills,
        pms.deaths,
        pms.assists,
        m.match_date,
        m.series_type
    FROM player_match_stats pms
    JOIN players p ON pms.player_id = p.id  
    JOIN maps mp ON pms.map_id = mp.id
    JOIN matches m ON pms.match_id = m.id
    LIMIT 20
    """

    # closing() releases the connection even when the query raises.
    with closing(sqlite3.connect(db_path)) as conn:
        sample_df = pd.read_sql_query(query, conn)

    print(f"🎯 SAMPLE DATA:")
    print(sample_df.head(10))

    if sample_df.empty:
        # Without rows the percentages below would divide by zero.
        print(f"\n⚠️ The query returned no rows - nothing to analyse")
    else:
        kills = sample_df['kills']
        mean_kills = kills.mean()

        print(f"\n📊 KILLS STATISTICS:")
        print(f"   Min kills: {kills.min()}")
        print(f"   Max kills: {kills.max()}")
        print(f"   Mean kills: {mean_kills:.2f}")
        print(f"   Median kills: {kills.median():.2f}")
        print(f"   Total records: {len(sample_df)}")

        print(f"\n🎮 KILLS DISTRIBUTION:")
        kill_ranges = {
            "0-10 kills": int((kills <= 10).sum()),
            "11-20 kills": int(((kills > 10) & (kills <= 20)).sum()),
            "21-30 kills": int(((kills > 20) & (kills <= 30)).sum()),
            "31+ kills": int((kills > 30).sum())
        }

        for range_name, count in kill_ranges.items():
            percentage = (count / len(sample_df)) * 100
            print(f"   {range_name}: {count} records ({percentage:.1f}%)")

        # Check if this looks like per-map data
        if mean_kills > 40:
            print(f"\n❌ PROBLEM IDENTIFIED!")
            print(f"   Average kills ({mean_kills:.1f}) is too high for per-map data!")
            print(f"   This might be TOTAL MATCH kills, not per-map kills!")
            print(f"   In Valorant, per-map kills should be 10-30, not 40+")
        elif mean_kills > 30:
            print(f"\n⚠️ SUSPICIOUS:")
            print(f"   Average kills ({mean_kills:.1f}) seems high for per-map data")
            print(f"   Expected per-map average: 15-20 kills")
        else:
            print(f"\n✅ KILLS RANGE LOOKS REASONABLE:")
            print(f"   Average kills ({mean_kills:.1f}) seems appropriate for per-map data")

    print(f"\n🔍 NEXT STEPS:")
    print(f"   1. Check if database has separate per-map vs per-match tables")
    print(f"   2. Verify the player_match_stats table structure")
    print(f"   3. Make sure we're not accidentally aggregating data")

else:
    print("❌ Please upload your database file first (run Cell 4)")


In [None]:
# 🔍 Cell 10: DEEP DIVE - Database Schema Investigation

def format_cell(value, spec):
    """Format `value` with `spec`, falling back to str() when the spec does not apply.

    SQL NULLs arrive as None, which rejects width/precision specs with a
    TypeError; the fallback keeps the report printing instead of aborting.
    """
    try:
        return format(value, spec)
    except (TypeError, ValueError):
        return str(value)


def quote_identifier(name):
    """Return `name` as a double-quoted SQL identifier (embedded quotes doubled)."""
    return '"' + str(name).replace('"', '""') + '"'


print("🔍 INVESTIGATING DATABASE SCHEMA")
print("The data looks wrong - let's check the actual database structure...")

if 'uploaded' in globals() and uploaded:
    db_path = list(uploaded.keys())[0]

    # closing() releases the connection even if one of the queries below fails.
    with closing(sqlite3.connect(db_path)) as conn:
        cursor = conn.cursor()

        # 1. Check all tables
        print("📋 ALL TABLES IN DATABASE:")
        cursor.execute("SELECT name FROM sqlite_master WHERE type='table';")
        tables = cursor.fetchall()
        for table in tables:
            print(f"   - {table[0]}")

        # 2. Check player_match_stats schema
        print(f"\n🔍 PLAYER_MATCH_STATS TABLE SCHEMA:")
        cursor.execute("PRAGMA table_info(player_match_stats);")
        columns = cursor.fetchall()
        for col in columns:
            print(f"   {col[1]} ({col[2]})")

        # 3. Count records per table
        print(f"\n📊 RECORD COUNTS:")
        for table in tables:
            table_name = table[0]
            # Table names come from the uploaded file, so quote them rather than
            # splicing them into the statement verbatim.
            cursor.execute(f"SELECT COUNT(*) FROM {quote_identifier(table_name)}")
            count = cursor.fetchone()[0]
            print(f"   {table_name}: {count:,} records")

        # 4. Sample from player_match_stats with match info
        print(f"\n🎯 RAW PLAYER_MATCH_STATS SAMPLE:")
        cursor.execute("""
            SELECT player_id, match_id, map_id, kills, deaths, assists, acs, kdr
            FROM player_match_stats 
            LIMIT 10
        """)
        raw_data = cursor.fetchall()
        print("   player_id | match_id | map_id | kills | deaths | assists | acs | kdr")
        raw_specs = ("8", "7", "5", "4", "5", "6", "3")
        for row in raw_data:
            cells = [format_cell(value, spec) for value, spec in zip(row, raw_specs)]
            print("   " + " | ".join(cells) + f" | {row[7]}")

        # 5. Check if there are multiple records per player per match
        print(f"\n🔍 CHECKING FOR DUPLICATES/AGGREGATION:")
        cursor.execute("""
            SELECT match_id, map_id, COUNT(*) as player_count
            FROM player_match_stats 
            GROUP BY match_id, map_id
            ORDER BY player_count DESC
            LIMIT 10
        """)
        match_data = cursor.fetchall()
        print("   match_id | map_id | players_in_match")
        for row in match_data:
            print(f"   {format_cell(row[0], '7')} | {format_cell(row[1], '5')} | {row[2]}")

        # 6. Check specific match breakdown
        print(f"\n🎮 SINGLE MATCH BREAKDOWN:")
        cursor.execute("""
            SELECT m.match_id, m.match_date, m.series_type, COUNT(DISTINCT pms.map_id) as maps,
                   COUNT(*) as total_records, AVG(pms.kills) as avg_kills
            FROM matches m
            JOIN player_match_stats pms ON m.id = pms.match_id
            GROUP BY m.match_id
            LIMIT 5
        """)
        match_breakdown = cursor.fetchall()
        print("   match_id | date | series | maps | records | avg_kills")
        for row in match_breakdown:
            print(
                f"   {format_cell(row[0], '7')} | {str(row[1])[:10]} | {format_cell(row[2], '6')}"
                f" | {format_cell(row[3], '4')} | {format_cell(row[4], '7')} | {format_cell(row[5], '.1f')}"
            )

        # 7. Look for actual reasonable kill values
        print(f"\n🔍 LOOKING FOR REASONABLE KILL VALUES:")
        cursor.execute("""
            SELECT MIN(kills), MAX(kills), AVG(kills), 
                   COUNT(CASE WHEN kills BETWEEN 5 AND 40 THEN 1 END) as reasonable_kills,
                   COUNT(*) as total_records
            FROM player_match_stats
        """)
        kill_stats = cursor.fetchone()
        print(f"   Min: {kill_stats[0]}, Max: {kill_stats[1]}, Avg: {format_cell(kill_stats[2], '.1f')}")
        print(f"   Reasonable kills (5-40): {kill_stats[3]:,} / {kill_stats[4]:,} records")

    print(f"\n💡 ANALYSIS:")
    avg_kills = kill_stats[2]
    if avg_kills is None:
        # AVG() is NULL when the table has no non-NULL kill values.
        print(f"   ⚠️ player_match_stats has no kill values to analyse")
    elif avg_kills > 100:
        print(f"   ❌ Data appears to be CUMULATIVE/AGGREGATED, not per-match")
        print(f"   ❌ Average kills ({avg_kills:.1f}) suggests lifetime/season totals")
        print(f"   ❌ We need to find the correct per-match data source")
    elif avg_kills > 50:
        print(f"   ⚠️ Data might be per-SERIES totals instead of per-MAP")
        print(f"   ⚠️ Need to check if this is aggregated across multiple maps")
    else:
        print(f"   ✅ Kill values look reasonable for per-match data")

else:
    print("❌ Please upload your database file first (run Cell 4)")


In [None]:
# 🚨 Cell 11: FINAL DIAGNOSIS - Database Is Corrupted/Unusable
#
# Prints a data-quality verdict for the uploaded SQLite file: a fixed list of
# issues, row counts measured from `player_match_stats`, and recommendations.


def collect_corruption_stats(db_path):
    """Count suspicious rows in the ``player_match_stats`` table.

    Args:
        db_path: Path of the SQLite database file to inspect.

    Returns:
        dict mapping a label to a row count, with the keys ``total``,
        ``zero_deaths``, ``zero_assists``, ``negative_kills``,
        ``extreme_kills`` and ``reasonable_records``.

    Raises:
        sqlite3.Error: If a query fails (for example the table is missing).
            The connection is closed before the error propagates.
    """
    count_queries = {
        "total": "SELECT COUNT(*) FROM player_match_stats",
        "zero_deaths": "SELECT COUNT(*) FROM player_match_stats WHERE deaths = 0",
        "zero_assists": "SELECT COUNT(*) FROM player_match_stats WHERE assists = 0",
        "negative_kills": "SELECT COUNT(*) FROM player_match_stats WHERE kills < 0",
        "extreme_kills": "SELECT COUNT(*) FROM player_match_stats WHERE kills > 100",
        # A row counts as "reasonable" only if every stat column is plausible.
        "reasonable_records": """
            SELECT COUNT(*) FROM player_match_stats
            WHERE kills BETWEEN 5 AND 40
            AND deaths BETWEEN 5 AND 40
            AND assists >= 0
            AND acs > 0
            AND kdr > 0
        """,
    }

    conn = sqlite3.connect(db_path)
    try:
        cursor = conn.cursor()
        counts = {}
        for label, sql in count_queries.items():
            cursor.execute(sql)
            counts[label] = cursor.fetchone()[0]
        return counts
    finally:
        # Close even when a query raises so the kernel does not keep the
        # database file open after a failed run.
        conn.close()


def percent_of(part, total):
    """Return ``part`` as a percentage of ``total``; 0.0 when ``total`` is 0.

    The zero guard keeps the report from crashing on an empty table.
    """
    return 100 * part / total if total else 0.0


print("🚨 DATABASE CORRUPTION CONFIRMED")
print("This database has severe data quality issues making it unusable for ML training")

if 'uploaded' in globals() and uploaded:
    # `uploaded` is created by an earlier cell; its first key is used as the
    # database path.
    db_path = next(iter(uploaded))

    # NOTE(review): the six findings below are hard-coded text, not values
    # computed in this cell — presumably copied from earlier diagnostic output;
    # confirm they still describe the currently uploaded file.
    print("\n❌ CRITICAL ISSUES IDENTIFIED:")
    for issue in (
        "   1. Negative kills (-653) - impossible in any game",
        "   2. All deaths = 0 - no one dies in 2M+ records?",
        "   3. All assists = 0 - no teamwork in competitive Valorant?",
        "   4. All ACS/KDR = 0.0 - stats not calculated",
        "   5. 996 kills in single match - would need 500+ rounds",
        "   6. Only 0.5% of records have reasonable kill values",
    ):
        print(issue)

    # Check data corruption extent
    print("\n🔍 DATA CORRUPTION ANALYSIS:")
    stats = collect_corruption_stats(db_path)
    total = stats["total"]

    # Check for zero deaths/assists
    print(f"   Zero deaths: {stats['zero_deaths']:,} / {total:,} ({percent_of(stats['zero_deaths'], total):.1f}%)")
    print(f"   Zero assists: {stats['zero_assists']:,} / {total:,} ({percent_of(stats['zero_assists'], total):.1f}%)")

    # Check negative/extreme values
    print(f"   Negative kills: {stats['negative_kills']:,} records")
    print(f"   Extreme kills (>100): {stats['extreme_kills']:,} records")

    # Check if any data looks reasonable
    print(f"   Fully reasonable records: {stats['reasonable_records']:,} / {total:,} ({percent_of(stats['reasonable_records'], total):.2f}%)")

    # Static guidance: each entry is a section heading followed by its lines.
    guidance_sections = [
        ("\n💡 POSSIBLE CAUSES:", [
            "   1. Data import/migration error (most likely)",
            "   2. Database corruption during transfer",
            "   3. Test/dummy data mixed with real data",
            "   4. Cumulative stats instead of per-match data",
            "   5. API scraping errors with missing data handling",
        ]),
        ("\n🔧 RECOMMENDED SOLUTIONS:", [
            "   ❌ This database cannot be used for ML training",
            "   ✅ Option 1: Re-scrape data with proper validation",
            "   ✅ Option 2: Find a different Valorant dataset",
            "   ✅ Option 3: Use public Valorant APIs (Riot Games API)",
            "   ✅ Option 4: Check if there's a backup/uncorrupted version",
        ]),
        ("\n📊 FOR FUTURE DATA COLLECTION:", [
            "   - Validate kills/deaths/assists > 0 during import",
            "   - Check kills are reasonable (5-40 per map)",
            "   - Ensure ACS/KDR are calculated correctly",
            "   - Validate match structure (10 players, correct map counts)",
            "   - Add data quality checks during scraping",
        ]),
        ("\n🎯 IMMEDIATE ACTION NEEDED:", [
            "   1. Stop training attempts with this database",
            "   2. Investigate data collection/import process",
            "   3. Either fix the data or find a new source",
            "   4. Consider starting with a smaller, validated dataset",
        ]),
    ]
    for heading, lines in guidance_sections:
        print(heading)
        for line in lines:
            print(line)

else:
    print("❌ Please upload your database file first (run Cell 4)")

print("\n💡 NOTE: Machine learning requires high-quality data.")
print("   Garbage in = Garbage out. Fix the data first, then train models.")
