# 📊 Análise Completa do Modelo UNDER/OVER Total Kills

Este notebook contém análise completa do modelo de ML para prever se total_kills será OVER ou UNDER da média da liga.

## 1. Preparação dos Dados

In [1]:
# Run the data-preparation step in-process by importing the module
# (instead of launching it through subprocess).
import sys
import os
from pathlib import Path

# Locate the folder that contains data_preparation.py: the working directory
# is tried first, then a "machine_learning" folder below it or next to it.
current_dir = Path.cwd()
if not (current_dir / "data_preparation.py").exists():
    candidate_dirs = (
        current_dir / "machine_learning",
        current_dir.parent / "machine_learning",
    )
    found_dir = next(
        (d for d in candidate_dirs if (d / "data_preparation.py").exists()),
        None,
    )
    if found_dir is not None:
        # Switch into the folder and make the module importable from it.
        os.chdir(found_dir)
        sys.path.insert(0, str(found_dir))

print("Preparando dados...")
try:
    # Imported here so an import failure is reported by the handler below.
    from data_preparation import main as prep_main
    prep_main()
except Exception as exc:
    # Best effort: later cells fall back to files from a previous run.
    print(f"Erro ao preparar dados: {exc}")
    print("Continuando com dados existentes (se j√° foram preparados)...")
else:
    print("Dados preparados com sucesso!")

Preparando dados...


Exception in thread Thread-3 (_readerthread):
Traceback (most recent call last):
  File [35m"C:\Users\Matheus\AppData\Local\Python\pythoncore-3.14-64\Lib\threading.py"[0m, line [35m1082[0m, in [35m_bootstrap_inner[0m
    [31mself._context.run[0m[1;31m(self.run)[0m
    [31m~~~~~~~~~~~~~~~~~[0m[1;31m^^^^^^^^^^[0m
  File [35m"C:\Users\Matheus\AppData\Local\Python\pythoncore-3.14-64\Lib\threading.py"[0m, line [35m1024[0m, in [35mrun[0m
    [31mself._target[0m[1;31m(*self._args, **self._kwargs)[0m
    [31m~~~~~~~~~~~~[0m[1;31m^^^^^^^^^^^^^^^^^^^^^^^^^^^^^[0m
  File [35m"C:\Users\Matheus\AppData\Local\Python\pythoncore-3.14-64\Lib\subprocess.py"[0m, line [35m1613[0m, in [35m_readerthread[0m
    buffer.append([31mfh.read[0m[1;31m()[0m)
                  [31m~~~~~~~[0m[1;31m^^[0m
  File [35m"C:\Users\Matheus\AppData\Local\Python\pythoncore-3.14-64\Lib\encodings\cp1252.py"[0m, line [35m23[0m, in [35mdecode[0m
    return [31mcodecs.charmap_decode

None


## 2. Carregamento dos Dados

In [2]:
import pandas as pd
import numpy as np
import pickle
from pathlib import Path
import os
import sys

# Tenta importar matplotlib e seaborn, se n√£o estiver instalado, instala
try:
    import matplotlib.pyplot as plt
    import seaborn as sns
except ImportError:
    print("Instalando matplotlib e seaborn...")
    import subprocess
    subprocess.check_call([sys.executable, "-m", "pip", "install", "matplotlib", "seaborn", "-q", "--quiet"])
    import matplotlib.pyplot as plt
    import seaborn as sns

from sklearn.metrics import (
    roc_curve, auc, precision_recall_curve, average_precision_score,
    confusion_matrix, classification_report, f1_score, precision_score, recall_score
)

# Plot defaults used by the figures in this notebook
sns.set_style("whitegrid")
plt.rcParams.update({'figure.figsize': (12, 8)})

# Resolve the working directory: if data_preparation.py is not in the current
# directory, look for a "machine_learning" folder below it or next to it.
current_dir = Path.cwd()
if not (current_dir / "data_preparation.py").exists():
    for candidate in (current_dir / "machine_learning",
                      current_dir.parent / "machine_learning"):
        if (candidate / "data_preparation.py").exists():
            os.chdir(candidate)
            current_dir = candidate
            break

DATA_DIR = current_dir / "data"

# Load the feature matrix, the labels and the per-league statistics
features_df = pd.read_csv(DATA_DIR / "features.csv")
labels = np.load(DATA_DIR / "labels.npy")

# NOTE: pickle.load can execute arbitrary code; only open files that were
# produced by this project's own preparation step.
with (DATA_DIR / "league_stats.pkl").open("rb") as f:
    league_stats = pickle.load(f)

# Class balance of the 0/1 labels
n_under = np.sum(labels == 0)
n_over = np.sum(labels == 1)
n_total = len(labels)

print(f"Features shape: {features_df.shape}")
print(f"Labels shape: {labels.shape}")
print("\nDistribuicao de labels:")
print(f"  UNDER (0): {n_under} ({n_under/n_total*100:.1f}%)")
print(f"  OVER (1): {n_over} ({n_over/n_total*100:.1f}%)")

: 

## 3. Análise Exploratória

In [None]:
# Print the mean and standard deviation stored for each league, in name order
print("Estat√≠sticas por liga:")
for league, stats in sorted(league_stats.items(), key=lambda item: item[0]):
    print(f"  {league:8s}: m√©dia={stats['mean']:5.2f}, std={stats['std']:5.2f}")

In [None]:
# Bar chart of the per-league mean, with the standard deviation as error bars
leagues = sorted(league_stats)
means = [league_stats[name]['mean'] for name in leagues]
stds = [league_stats[name]['std'] for name in leagues]

fig, ax = plt.subplots(figsize=(14, 6))
ax.bar(leagues, means, yerr=stds, capsize=5, alpha=0.7, color='steelblue')
ax.set_xlabel('Liga', fontsize=12)
ax.set_ylabel('M√©dia de Total Kills', fontsize=12)
ax.set_title('M√©dia de Total Kills por Liga', fontsize=14, fontweight='bold')
plt.xticks(rotation=45, ha='right')
ax.grid(axis='y', alpha=0.3)
fig.tight_layout()
plt.show()

# Spread between the highest and the lowest league mean
spread = max(means) - min(means)
print(f"\nVaria√ß√£o entre ligas: {spread:.2f} kills")

## 4. Treinamento do Modelo

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

# Feature matrix and target vector
X = features_df.to_numpy()
y = labels

# Stratified 80/20 split. The row positions are split alongside the data so
# that test rows can be traced back to their source rows later on.
indices = np.arange(len(X))
split_parts = train_test_split(
    X, y, indices,
    test_size=0.2,
    random_state=42,
    stratify=y,
)
X_train, X_test, y_train, y_test, train_idx, test_idx = split_parts

print(f"Train: {X_train.shape[0]} amostras")
print(f"Test: {X_test.shape[0]} amostras")
print(f"\nDistribui√ß√£o train: UNDER={(y_train == 0).sum()}, OVER={(y_train == 1).sum()}")
print(f"Distribui√ß√£o test: UNDER={(y_test == 0).sum()}, OVER={(y_test == 1).sum()}")
print(f"\n√çndices salvos: train_idx (primeiros 5)={train_idx[:5]}, test_idx (primeiros 5)={test_idx[:5]}")

In [None]:
# Standardize features; the scaler is fitted on the training rows only
scaler = StandardScaler()
scaler.fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Logistic regression with class weights balanced against label frequencies
model = LogisticRegression(
    class_weight='balanced',
    max_iter=1000,
    random_state=42,
)
model.fit(X_train_scaled, y_train)

# Hard predictions and the predicted probability of class 1 (OVER) on the test set
y_pred = model.predict(X_test_scaled)
y_pred_proba = model.predict_proba(X_test_scaled)[:, 1]

print("Modelo treinado com sucesso!")

## 5. Métricas de Performance

In [None]:
from sklearn.metrics import accuracy_score, roc_auc_score

# Headline metrics on the held-out test set
accuracy = accuracy_score(y_test, y_pred)
roc_auc = roc_auc_score(y_test, y_pred_proba)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)

banner = "=" * 60
print(banner)
print("M√âTRICAS DO MODELO")
print(banner)
print(f"\nAccuracy:  {accuracy:.4f}")
for caption, value in (
    ("ROC-AUC:    ", roc_auc),
    ("Precision:  ", precision),
    ("Recall:     ", recall),
    ("F1-Score:   ", f1),
):
    print(f"{caption}{value:.4f}")

In [None]:
# Per-class precision / recall / F1 table for the test set
rule = "=" * 60
print(f"\n{rule}")
print("CLASSIFICATION REPORT")
print(rule)
report = classification_report(y_test, y_pred, target_names=['UNDER', 'OVER'])
print(report)

## 6. Curva ROC

In [None]:
# ROC curve points and the area under them
fpr, tpr, thresholds = roc_curve(y_test, y_pred_proba)
roc_auc = auc(fpr, tpr)

# Draw the curve against the diagonal of a random classifier
fig, ax = plt.subplots(figsize=(10, 8))
ax.plot(fpr, tpr, color='darkorange', lw=2, label=f'ROC curve (AUC = {roc_auc:.4f})')
ax.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--', label='Random Classifier')
ax.set_xlim(0.0, 1.0)
ax.set_ylim(0.0, 1.05)
ax.set_xlabel('False Positive Rate', fontsize=12)
ax.set_ylabel('True Positive Rate', fontsize=12)
ax.set_title('ROC Curve - Modelo UNDER/OVER Total Kills', fontsize=14, fontweight='bold')
ax.legend(loc="lower right", fontsize=11)
ax.grid(alpha=0.3)
fig.tight_layout()
plt.show()

print(f"\nAUC-ROC: {roc_auc:.4f}")

## 7. Precision-Recall Curve

In [None]:
# Precision-recall curve points and the average precision
precision_curve, recall_curve, pr_thresholds = precision_recall_curve(y_test, y_pred_proba)
avg_precision = average_precision_score(y_test, y_pred_proba)

# Draw precision as a function of recall
fig, ax = plt.subplots(figsize=(10, 8))
ax.plot(
    recall_curve,
    precision_curve,
    color='darkblue',
    lw=2,
    label=f'PR curve (AP = {avg_precision:.4f})',
)
ax.set_xlabel('Recall', fontsize=12)
ax.set_ylabel('Precision', fontsize=12)
ax.set_title('Precision-Recall Curve', fontsize=14, fontweight='bold')
ax.legend(loc="lower left", fontsize=11)
ax.grid(alpha=0.3)
fig.tight_layout()
plt.show()

print(f"\nAverage Precision: {avg_precision:.4f}")

## 8. F1-Score por Threshold

In [None]:
# F1 on the test set for decision thresholds 0.10, 0.15, ... (step 0.05).
# The scores are built in a comprehension so that no global named `f1` is
# rebound here: the model-level `f1` computed earlier must stay intact.
# zero_division=0 keeps the score at 0 (without a warning) for thresholds at
# which no sample is predicted as OVER.
thresholds_range = np.arange(0.1, 1.0, 0.05)
f1_scores = [
    f1_score(y_test, (y_pred_proba >= threshold).astype(int), zero_division=0)
    for threshold in thresholds_range
]

# F1 as a function of the threshold, with the default cut-off marked
fig, ax = plt.subplots(figsize=(10, 6))
ax.plot(thresholds_range, f1_scores, marker='o', color='green', lw=2)
ax.set_xlabel('Threshold', fontsize=12)
ax.set_ylabel('F1-Score', fontsize=12)
ax.set_title('F1-Score por Threshold', fontsize=14, fontweight='bold')
ax.grid(alpha=0.3)
ax.axvline(x=0.5, color='red', linestyle='--', label='Threshold padr√£o (0.5)')
ax.legend()
fig.tight_layout()
plt.show()

# Threshold with the highest F1 (the first one wins on ties)
best_idx = int(np.argmax(f1_scores))
best_threshold = thresholds_range[best_idx]
best_f1 = f1_scores[best_idx]
print(f"\nMelhor threshold: {best_threshold:.2f} (F1 = {best_f1:.4f})")

## 9. Confusion Matrix

In [None]:
# Confusion matrix on the test set. labels=[0, 1] pins the matrix to 2x2 even
# if one class is absent from both y_test and y_pred, so the four-way
# unpacking of cm.ravel() below cannot fail.
class_names = ['UNDER', 'OVER']
cm = confusion_matrix(y_test, y_pred, labels=[0, 1])

# Heatmap: rows are true labels, columns are predicted labels
plt.figure(figsize=(8, 6))
sns.heatmap(
    cm,
    annot=True,
    fmt='d',
    cmap='Blues',
    xticklabels=class_names,
    yticklabels=class_names,
)
plt.ylabel('True Label', fontsize=12)
plt.xlabel('Predicted Label', fontsize=12)
plt.title('Confusion Matrix', fontsize=14, fontweight='bold')
plt.tight_layout()
plt.show()

# Row-major order of a 2x2 matrix with labels [0, 1]: tn, fp, fn, tp
tn, fp, fn, tp = cm.ravel()
print(f"\nTrue Negatives (UNDER predito corretamente):  {tn}")
print(f"False Positives (OVER predito incorretamente): {fp}")
print(f"False Negatives (UNDER predito incorretamente): {fn}")
print(f"True Positives (OVER predito corretamente):     {tp}")

## 10. Distribuição de Probabilidades

In [None]:
# Predicted OVER probabilities of the test rows, split by their true class
proba_true_under = y_pred_proba[y_test == 0]
proba_true_over = y_pred_proba[y_test == 1]

fig, (hist_ax, box_ax) = plt.subplots(1, 2, figsize=(12, 5))

# Left panel: overlaid histograms of the two groups
hist_ax.hist(proba_true_under, bins=20, alpha=0.7, label='UNDER (True)', color='red')
hist_ax.hist(proba_true_over, bins=20, alpha=0.7, label='OVER (True)', color='green')
hist_ax.set_xlabel('Probabilidade Predita (OVER)', fontsize=11)
hist_ax.set_ylabel('Frequ√™ncia', fontsize=11)
hist_ax.set_title('Distribui√ß√£o de Probabilidades', fontsize=12, fontweight='bold')
hist_ax.legend()
hist_ax.grid(alpha=0.3)

# Right panel: one box per group. The tick labels are set explicitly rather
# than through boxplot(labels=...), which Matplotlib 3.9 deprecated in favour
# of tick_labels; setting the ticks works on old and new versions alike.
# Boxes are drawn at the default positions 1 and 2.
box_ax.boxplot([proba_true_under, proba_true_over])
box_ax.set_xticks([1, 2])
box_ax.set_xticklabels(['UNDER', 'OVER'])
box_ax.set_ylabel('Probabilidade Predita (OVER)', fontsize=11)
box_ax.set_title('Boxplot de Probabilidades', fontsize=12, fontweight='bold')
box_ax.grid(alpha=0.3)

fig.tight_layout()
plt.show()

## 11. Análise por Liga

In [None]:
# Load the original table to recover the league of every test row
df_original = pd.read_csv("../database_improved/data_transformed.csv")

# One row per test sample, indexed by the row position kept during the split
df_analysis = pd.DataFrame(
    {
        'y_test': y_test,
        'y_pred': y_pred,
        'y_pred_proba': y_pred_proba,
    },
    index=test_idx,
)

# NOTE(review): this lookup assumes row i of features.csv corresponds to row i
# of data_transformed.csv (same order, no rows dropped during preparation).
# That cannot be confirmed from this notebook -- verify in data_preparation.py.
df_analysis['league'] = df_original.loc[test_idx, 'league'].values

# Accuracy (and F1 when both classes are present) per league, test set only.
# The per-league F1 is kept in `league_f1` so the model-level `f1` computed
# earlier is not overwritten.
print("M√©tricas por Liga (Test Set):")
print("=" * 80)
for league in sorted(df_analysis['league'].unique()):
    league_data = df_analysis[df_analysis['league'] == league]
    league_y_test = league_data['y_test'].values
    league_y_pred = league_data['y_pred'].values
    n_samples = len(league_data)

    acc = accuracy_score(league_y_test, league_y_pred)
    if len(np.unique(league_y_test)) > 1:
        league_f1 = f1_score(league_y_test, league_y_pred)
        print(f"{league:8s}: Accuracy={acc:.3f}, F1={league_f1:.3f}, Amostras={n_samples}")
    else:
        # Only one true class for this league in the test set: F1 is skipped
        print(f"{league:8s}: Accuracy={acc:.3f}, Amostras={n_samples} (apenas uma classe)")

## 12. Teste de Predição

In [None]:
# The model and scaler fitted earlier in this session are reused below;
# only the extra lookup tables are loaded here.
# NOTE: pickle.load can execute arbitrary code; only open project-generated files.
with (DATA_DIR / "champion_impacts.pkl").open("rb") as f:
    champion_impacts_loaded = pickle.load(f)
feature_columns_loaded = features_df.columns.tolist()
league_stats_loaded = league_stats

# Helper that builds the feature row for one game. The original comment says
# it follows the same logic as predict.py (not visible from this notebook).
def create_features_from_game_local(game_data, league_stats, champion_impacts, feature_columns):
    """Build the single-row feature matrix for one game.

    Args:
        game_data: Mapping with a required 'league' key and optional champion
            keys '<role>_<team>' for roles top/jung/mid/adc/sup and teams
            t1/t2. Missing or falsy champions count as an empty name.
        league_stats: Mapping league -> {'mean': ..., 'std': ...}. Unknown
            leagues fall back to 0.0 for both values.
        champion_impacts: Mapping league -> {champion name: impact}. Unknown
            leagues or champions contribute an impact of 0.0.
        feature_columns: Ordered column names the output must follow.

    Returns:
        A NumPy array of shape (1, len(feature_columns)). Columns this
        function does not know how to fill are set to 0.0.

    Raises:
        KeyError: If game_data has no 'league' key.
    """
    league = game_data['league']
    league_impacts = champion_impacts.get(league, {})

    def champion_impact(slot):
        # Look up the champion in `slot` after stripping surrounding
        # whitespace from its name; falsy values become the empty name.
        champ = game_data.get(slot, '')
        name = str(champ).strip() if champ else ''
        return league_impacts.get(name, 0.0)

    roles = ('top', 'jung', 'mid', 'adc', 'sup')
    feature_dict = {}
    team_avg = {}
    for team in ('t1', 't2'):
        impacts = [champion_impact(f'{role}_{team}') for role in roles]
        for role, impact in zip(roles, impacts):
            feature_dict[f'{role}_{team}_impact'] = impact
        team_avg[team] = np.mean(impacts)

    stats = league_stats.get(league, {})
    feature_dict['league_mean'] = stats.get('mean', 0.0)
    feature_dict['league_std'] = stats.get('std', 0.0)
    feature_dict['team1_avg_impact'] = team_avg['t1']
    feature_dict['team2_avg_impact'] = team_avg['t2']
    feature_dict['impact_diff'] = team_avg['t1'] - team_avg['t2']

    # One-hot league columns: every 'league_<name>' column except the two
    # statistic columns. Only the leading prefix is removed, so a league whose
    # own name contains 'league_' still matches its column.
    prefix = 'league_'
    stat_columns = ('league_mean', 'league_std')
    for col in feature_columns:
        if col.startswith(prefix) and col not in stat_columns:
            feature_dict[col] = 1.0 if col[len(prefix):] == league else 0.0

    features = np.array([feature_dict.get(col, 0.0) for col in feature_columns])
    return features.reshape(1, -1)

# Sample draft used to exercise the model
game_example = dict(
    league='LCK',
    top_t1='Aatrox',
    jung_t1='Graves',
    mid_t1='Azir',
    adc_t1='Jinx',
    sup_t1='Thresh',
    top_t2='Gnar',
    jung_t2='Sejuani',
    mid_t2='Orianna',
    adc_t2='Aphelios',
    sup_t2='Braum',
)

# Probability of OVER relative to the league mean, from this session's model
X_game = create_features_from_game_local(
    game_example,
    league_stats_loaded,
    champion_impacts_loaded,
    feature_columns_loaded,
)
X_game_scaled = scaler.transform(X_game)
prob_over = model.predict_proba(X_game_scaled)[0, 1]

# "High" confidence when the probability is at least 0.70 or at most 0.30
is_confident = prob_over >= 0.70 or prob_over <= 0.30
pred_mean = {
    'league_mean': league_stats_loaded.get(game_example['league'], {}).get('mean', 0.0),
    'probability_over_mean': prob_over,
    'probability_under_mean': 1 - prob_over,
    'prediction': 'OVER' if prob_over >= 0.5 else 'UNDER',
    'confidence': 'High' if is_confident else 'Medium',
}

print("Exemplo de Predi√ß√£o:")
print(f"Liga: {game_example['league']}")
print(f"M√©dia da liga: {pred_mean['league_mean']:.2f} kills")
print(f"Probabilidade OVER m√©dia: {pred_mean['probability_over_mean']:.1%}")
print(f"Predi√ß√£o: {pred_mean['prediction']} (Confian√ßa: {pred_mean['confidence']})")

# Prediction against a specific bookmaker line
betting_line = 28.5
example_stats = league_stats_loaded.get(game_example['league'], {})
league_mean = example_stats.get('mean', 0.0)
league_std = example_stats.get('std', 1.0)
prob_over_mean = pred_mean['probability_over_mean']

# Heuristic shift of the "over the mean" probability towards the given line.
# NOTE(review): with line == mean the second formula applies and yields
# p + 0.15 * (1 - p) rather than p, and for lines below the mean the boost
# shrinks as the line drops (the sigmoid tends to 0), which looks inverted.
# Behaviour is left unchanged here; confirm against predict.py before use.
if not league_std > 0:
    prob_over_line = prob_over_mean
else:
    z_score = (betting_line - league_mean) / league_std
    adjustment = 1 / (1 + np.exp(-z_score * 0.5))
    prob_over_line = (
        prob_over_mean * (1 - adjustment * 0.3)
        if betting_line > league_mean
        else prob_over_mean + (1 - prob_over_mean) * adjustment * 0.3
    )
    prob_over_line = np.clip(prob_over_line, 0.0, 1.0)

# A bet is suggested only when one side reaches a probability of 0.55
pred_line = {
    'probability_over_line': prob_over_line,
    'probability_under_line': 1 - prob_over_line,
    'bet_over': prob_over_line >= 0.55,
    'bet_under': (1 - prob_over_line) >= 0.55,
}

print(f"\nPara linha da casa {betting_line}:")
print(f"Probabilidade OVER {betting_line}: {pred_line['probability_over_line']:.1%}")
if pred_line['bet_over']:
    print(f"Recomenda√ß√£o: APOSTAR OVER {betting_line}")
elif pred_line['bet_under']:
    print(f"Recomenda√ß√£o: APOSTAR UNDER {betting_line}")

## 13. Resumo Final

In [None]:
# Final recap: dataset sizes and the held-out metrics of the fitted model.
# F1 is recomputed from y_test / y_pred instead of reading the global `f1`,
# because other cells may rebind that name (threshold sweep, per-league
# metrics) and the recap would then report the wrong value.
model_f1 = f1_score(y_test, y_pred)

print("=" * 60)
print("RESUMO FINAL DO MODELO")
print("=" * 60)
print("\nDataset:")
print(f"  Total de amostras: {len(features_df)}")
print(f"  Train: {len(X_train)}")
print(f"  Test: {len(X_test)}")
print("\nPerformance:")
print(f"  Accuracy:  {accuracy:.4f}")
print(f"  ROC-AUC:    {roc_auc:.4f}")
print(f"  Precision:  {precision:.4f}")
print(f"  Recall:     {recall:.4f}")
print(f"  F1-Score:   {model_f1:.4f}")

# The threshold sweep is optional; fall back to the default cut-off when that
# cell was not executed and best_threshold / best_f1 do not exist.
try:
    print(f"\nMelhor Threshold: {best_threshold:.2f} (F1 = {best_f1:.4f})")
except NameError:
    print("\nMelhor Threshold: 0.50 (padr√£o)")

print("\nModelo treinado com sucesso e pronto para uso!")