### Añadimos SRC a la raíz del proyecto para poder importar el contenido

In [1]:
import sys
import os

# Absolute path of the project root: one level above the current working
# directory (the notebook is expected to be launched from notebooks/).
ROOT_DIR = os.path.abspath(os.path.join(os.getcwd(), os.pardir))

# Make the modules that live in <root>/src importable from this notebook.
src_dir = os.path.join(ROOT_DIR, "src")
sys.path.append(src_dir)

### Importamos los módulos necesarios

In [None]:
import requests
import pandas as pd
from bs4 import BeautifulSoup
import time
import json
import re
from collections import Counter
from datetime import datetime

from config import SMOGON_SCRAPPING_DATA_PATH

### Comenzamos la recolección de datos mediante Scraping

In [None]:
# Base URL of the Smogon usage statistics.
BASE_URL = "https://www.smogon.com/stats/"

# Years covered by the analysis.
YEARS = ["2022", "2023", "2024"]

# Monthly stats folders ("YYYY-MM") for every year, followed by the two
# extra folders used for the DLC months.
MONTHS = [f"{year}-{month:02d}" for year in YEARS for month in range(1, 13)]
MONTHS += ["2023-09-DLC1", "2023-12-DLC2"]

# VGC stats files to download for each regular month, keyed by year.
# Every file name ends in "-1760.txt".
VGC_FILES = {
    "2022": ["gen8vgc2022-1760.txt"],
    "2023": [
        f"gen9vgc2023{ruleset}-1760.txt"
        for ruleset in ("series2", "regulationc", "regulationd", "regulatione")
    ],
    "2024": [
        f"gen9vgc2024{ruleset}-1760.txt"
        for ruleset in ("regg", "regh", "regf")
    ],
}

# Overrides for the DLC folders: these replace the per-year file list
# for the matching entry in MONTHS.
DLC_FILES = {
    "2023-09-DLC1": ["gen9vgc2023regulationd-1760.txt"],
    "2023-12-DLC2": ["gen9vgc2023regulationf-1760.txt"],
}

# Seconds to wait for the server before giving up on a request. Without an
# explicit timeout `requests.get` can block indefinitely on a stalled
# connection and hang the whole cell.
REQUEST_TIMEOUT = 30


def _parse_usage_rows(lines):
    """Yield ``(pokemon_name, usage_percent)`` for each data row in ``lines``.

    Rows are split on "|"; field 2 is taken as the Pokemon name (stripped,
    lower-cased, spaces replaced by hyphens) and field 3 as the usage
    percentage with the "%" sign removed. Blank lines, lines containing
    "Usage" (the column header) and lines with fewer than four fields are
    skipped. Rows whose usage value cannot be parsed as a float are reported
    with a warning and skipped.
    """
    for line in lines:
        # Skip empty lines and the column-header row.
        if not line.strip() or "Usage" in line:
            continue

        parts = line.split("|")
        if len(parts) <= 3:
            continue

        pokemon_name = parts[2].strip().lower().replace(" ", "-")
        usage_percent = parts[3].strip().replace("%", "")
        if not (pokemon_name and usage_percent):
            continue

        try:
            usage_float = float(usage_percent)
        except ValueError:
            print(f"‚ö†Ô∏è Error al convertir datos en: {line}")
            continue

        yield pokemon_name, usage_float


# Usage samples collected per year: {year: {pokemon_name: [usage, ...]}}
vgc_usage = {year: {} for year in YEARS}

# Download and parse every stats file for every month.
for month in MONTHS:
    year = month.split("-")[0]

    # DLC months have their own file list; otherwise use the year's list.
    files_to_scrape = DLC_FILES.get(month, VGC_FILES.get(year, []))

    for file_name in files_to_scrape:
        url = f"{BASE_URL}{month}/{file_name}"

        print(f"üîé Accediendo a {url}")

        try:
            response = requests.get(url, timeout=REQUEST_TIMEOUT)
        except requests.RequestException as e:
            # Network-level failure (DNS, connection, timeout, ...): report
            # it and move on to the next file.
            print(f"‚ùå Error al acceder a {url}: {e}")
        else:
            if response.status_code == 200:
                lines = response.text.split("\n")

                # Show the first 10 lines so the file layout can be checked.
                print(f"üìÑ Primeras l√≠neas del archivo ({file_name} en {month}):")
                print("\n".join(lines[:10]))

                for pokemon_name, usage_float in _parse_usage_rows(lines):
                    vgc_usage[year].setdefault(pokemon_name, []).append(usage_float)
            else:
                print(f"‚ö†Ô∏è No se encontr√≥ data para {file_name} en {month}")
        finally:
            # Throttle after every request (not just once per month) so that
            # months with several files do not hit the server in a burst.
            time.sleep(1)

# For each year, reduce every Pokemon's list of usage samples to its mode
# (the most frequent value, as returned by Counter.most_common).
# A Pokemon with an empty sample list gets the placeholder "NoUsage".
final_usage = {}
for year, data in vgc_usage.items():
    yearly_modes = {}
    for name, samples in data.items():
        if samples:
            (mode_value, _count), = Counter(samples).most_common(1)
            yearly_modes[name] = mode_value
        else:
            yearly_modes[name] = "NoUsage"
    final_usage[year] = yearly_modes

# Build one row per Pokemon with one usage column per year. The Pokemon
# names come from the dict keys, which pandas places in the index, so the
# index is moved into a regular column and the columns get descriptive names.
df_vgc = (
    pd.DataFrame(final_usage)
    .reset_index()
    .rename(
        columns={
            "index": "Pokemon",
            "2022": "VGCusage2022",
            "2023": "VGCusage2023",
            "2024": "VGCusage2024",
        }
    )
)

# Persist the table as CSV (without the positional index).
df_vgc.to_csv(SMOGON_SCRAPPING_DATA_PATH, index=False)
# f-string so the message shows the actual destination path rather than the
# literal placeholder text.
print(f"‚úÖ Datos de VGC guardados en {SMOGON_SCRAPPING_DATA_PATH}")



üîé Accediendo a https://www.smogon.com/stats/2022-01/gen8vgc2022-1760.txt
üìÑ Primeras l√≠neas del archivo (gen8vgc2022-1760.txt en 2022-01):
 Total battles: 818865
 Avg. weight/team: 0.002
 + ---- + ------------------ + --------- + ------ + ------- + ------ + ------- + 
 | Rank | Pokemon            | Usage %   | Raw    | %       | Real   | %       | 
 + ---- + ------------------ + --------- + ------ + ------- + ------ + ------- + 
 | 1    | Zacian-Crowned     | 60.39172% | 762697 | 46.570% | 520631 | 69.453% | 
 | 2    | Incineroar         | 58.57608% | 828744 | 50.603% | 366436 | 48.883% | 
 | 3    | Kyogre             | 49.39676% | 592063 | 36.151% | 377266 | 50.328% | 
 | 4    | Regieleki          | 33.66805% | 447449 | 27.321% | 154396 | 20.597% | 
 | 5    | Grimmsnarl         | 26.28657% | 370212 | 22.605% | 133457 | 17.803% | 
üîé Accediendo a https://www.smogon.com/stats/2022-02/gen8vgc2022-1760.txt
üìÑ Primeras l√≠neas del archivo (gen8vgc2022-1760.txt en 2022-02):
 Total

In [5]:
# Print a summary of the scraped table: columns, non-null counts and dtypes.
df_vgc.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1067 entries, 0 to 1066
Data columns (total 4 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   Pokemon       1067 non-null   object 
 1   VGCusage2022  725 non-null    float64
 2   VGCusage2023  624 non-null    float64
 3   VGCusage2024  794 non-null    float64
dtypes: float64(3), object(1)
memory usage: 33.5+ KB
