### Añadimos SRC a la raíz del proyecto para poder importar el contenido

In [1]:
import sys
import os

# Obtener la ruta absoluta de la carpeta raíz (donde está src)
ROOT_DIR = os.path.abspath(os.path.join(os.getcwd(), ".."))  # Subir un nivel desde notebooks/

# Agregar la carpeta src al path
sys.path.append(os.path.join(ROOT_DIR, "src"))

### Importamos los módulos necesarios

In [None]:
import pandas as pd
import time
import requests
import shutil

from pokemon_api import get_pokemon_names, get_all_forms, get_pokemon_data, get_species_data, clean_pokemon_name
from data_processing import save_partial_data, filter_pokemon_variants
from config import POKEAPI_SAVE_PATH_DATA_COLLECTION, POKEAPI_SORTED_FILTER_DATA_PATH, POKEAPI_SORTED_FILTER_DATA_PATH_CLEANED, POKEAPI_FINAL_DATASET_PATH

### Cargamos el dataset recolectado en 01_data_collection.ipynb

In [None]:
# Cargar el dataset original
df_pokemon = pd.read_csv(POKEAPI_SAVE_PATH_DATA_COLLECTION)

In [5]:
df_pokemon.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1304 entries, 0 to 1303
Data columns (total 16 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   index           1304 non-null   int64  
 1   name            1304 non-null   object 
 2   type            1302 non-null   object 
 3   height          1304 non-null   float64
 4   weight          1304 non-null   float64
 5   abilities       1302 non-null   object 
 6   hidden_ability  1002 non-null   object 
 7   hp              1304 non-null   int64  
 8   attack          1304 non-null   int64  
 9   defense         1304 non-null   int64  
 10  sp_atk          1304 non-null   int64  
 11  sp_def          1304 non-null   int64  
 12  speed           1304 non-null   int64  
 13  legendary       1304 non-null   bool   
 14  mythical        1304 non-null   bool   
 15  generation      1304 non-null   object 
dtypes: bool(2), float64(2), int64(7), object(5)
memory usage: 145.3+ KB


In [6]:
print("\n🔎 Valores nulos en el dataset:")
print(df_pokemon.isnull().sum())


🔎 Valores nulos en el dataset:
index               0
name                0
type                2
height              0
weight              0
abilities           2
hidden_ability    302
hp                  0
attack              0
defense             0
sp_atk              0
sp_def              0
speed               0
legendary           0
mythical            0
generation          0
dtype: int64


In [7]:
print("\n📌 Pokémon únicos en el dataset:")
print(df_pokemon["name"].nunique())


📌 Pokémon únicos en el dataset:
1304


In [8]:
print("\n🔍 Duplicados en el dataset:")
print(df_pokemon.duplicated().sum())


🔍 Duplicados en el dataset:
0


### Ordenamos por Nº de Pokedex y Nombre base siempre primero

In [None]:
# Cargar el archivo CSV filtrado
df_final = pd.read_csv(POKEAPI_SAVE_PATH_DATA_COLLECTION)

# Crear una nueva columna para identificar la forma base (sin '-')
df_final["is_base"] = df_final["name"].apply(lambda x: "-" not in x)

# Ordenar por index y asegurarse de que la forma base quede en primer lugar
df_final = df_final.sort_values(by=["index", "is_base"], ascending=[True, False]).reset_index(drop=True)

# Eliminar la columna auxiliar "is_base" después de ordenar
df_final.drop(columns=["is_base"], inplace=True)

# Guardar nuevamente el CSV ordenado correctamente

df_final.to_csv(POKEAPI_SORTED_FILTER_DATA_PATH, index=False)

print(f"✅ Dataset ordenado y limpio guardado correctamente en '{POKEAPI_SORTED_FILTER_DATA_PATH}'.")

# Mostrar las primeras filas para verificar
df_final.head(20)

✅ Dataset ordenado y limpio guardado correctamente en '../data/data_filtered/pokemon_filtered_sorted_data.csv'.


Unnamed: 0,index,name,type,height,weight,abilities,hidden_ability,hp,attack,defense,sp_atk,sp_def,speed,legendary,mythical,generation
0,1,bulbasaur,grass / poison,0.7,6.9,overgrow,chlorophyll,45,49,49,65,65,45,False,False,generation-i
1,2,ivysaur,grass / poison,1.0,13.0,overgrow,chlorophyll,60,62,63,80,80,60,False,False,generation-i
2,3,venusaur,grass / poison,2.0,100.0,overgrow,chlorophyll,80,82,83,100,100,80,False,False,generation-i
3,3,venusaur-mega,grass / poison,2.4,155.5,thick-fat,,80,100,123,122,120,80,False,False,generation-i
4,3,venusaur-gmax,grass / poison,24.0,1000.0,overgrow,chlorophyll,80,82,83,100,100,80,False,False,generation-i
5,4,charmander,fire,0.6,8.5,blaze,solar-power,39,52,43,60,50,65,False,False,generation-i
6,5,charmeleon,fire,1.1,19.0,blaze,solar-power,58,64,58,80,65,80,False,False,generation-i
7,6,charizard,fire / flying,1.7,90.5,blaze,solar-power,78,84,78,109,85,100,False,False,generation-i
8,6,charizard-mega-x,fire / dragon,1.7,110.5,tough-claws,,78,130,111,130,85,100,False,False,generation-i
9,6,charizard-mega-y,fire / flying,1.7,100.5,drought,,78,104,78,159,115,100,False,False,generation-i


### Eliminar a frillish-female y jellicent-female, ya que dan errores

In [None]:
# Cargar el dataset en un DataFrame
df = pd.read_csv(POKEAPI_SORTED_FILTER_DATA_PATH)

# Eliminar las filas con los nombres específicos
df = df[~df["name"].isin(["frillish-female", "jellicent-female"])]

# Guardar el dataset actualizado
df.to_csv(POKEAPI_SORTED_FILTER_DATA_PATH, index=False)

print("✅ Filas eliminadas y dataset actualizado correctamente.")

✅ Filas eliminadas y dataset actualizado correctamente.


### Rellenar valores nulos de "hidden_ability" con valor "None" y crear columna "total_stats" con la suma de todas las estadísticas base

In [None]:
# Cargar el archivo CSV ordenado
df = pd.read_csv(POKEAPI_SORTED_FILTER_DATA_PATH)

# Rellenar valores nulos en la columna "hidden_ability" con "None"
df["hidden_ability"].fillna("No_ability", inplace=True)

# Crear la nueva columna "total_stats" sumando todas las estadísticas base
stats_columns = ["hp", "attack", "defense", "sp_atk", "sp_def", "speed"]
df["total_stats"] = df[stats_columns].sum(axis=1)

columns_to_convert = ["index", "height", "weight", "hp", "attack", "defense", "sp_atk", "sp_def", "speed", "total_stats"]
df[columns_to_convert] = df[columns_to_convert].astype(int)

# Guardar el dataset actualizado
df.to_csv(SORTED_FILTER_DATA_PATH_CLEANED, index=False)

print(f"✅ Dataset actualizado y guardado en '{POKEAPI_SORTED_FILTER_DATA_PATH_CLEANED}'.")

# Mostrar una muestra de los datos para verificar
display(df.head(20))

✅ Dataset actualizado y guardado en '../data/data_filtered/pokemon_filtered_sorted_data_cleaned.csv'.


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df["hidden_ability"].fillna("None ", inplace=True)


Unnamed: 0,index,name,type,height,weight,abilities,hidden_ability,hp,attack,defense,sp_atk,sp_def,speed,legendary,mythical,generation,total_stats
0,1,bulbasaur,grass / poison,0,6,overgrow,chlorophyll,45,49,49,65,65,45,False,False,generation-i,318
1,2,ivysaur,grass / poison,1,13,overgrow,chlorophyll,60,62,63,80,80,60,False,False,generation-i,405
2,3,venusaur,grass / poison,2,100,overgrow,chlorophyll,80,82,83,100,100,80,False,False,generation-i,525
3,3,venusaur-mega,grass / poison,2,155,thick-fat,,80,100,123,122,120,80,False,False,generation-i,625
4,3,venusaur-gmax,grass / poison,24,1000,overgrow,chlorophyll,80,82,83,100,100,80,False,False,generation-i,525
5,4,charmander,fire,0,8,blaze,solar-power,39,52,43,60,50,65,False,False,generation-i,309
6,5,charmeleon,fire,1,19,blaze,solar-power,58,64,58,80,65,80,False,False,generation-i,405
7,6,charizard,fire / flying,1,90,blaze,solar-power,78,84,78,109,85,100,False,False,generation-i,534
8,6,charizard-mega-x,fire / dragon,1,110,tough-claws,,78,130,111,130,85,100,False,False,generation-i,634
9,6,charizard-mega-y,fire / flying,1,100,drought,,78,104,78,159,115,100,False,False,generation-i,634


### Verificamos que no tenemos valores nulos

In [12]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1302 entries, 0 to 1301
Data columns (total 17 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   index           1302 non-null   int64 
 1   name            1302 non-null   object
 2   type            1302 non-null   object
 3   height          1302 non-null   int64 
 4   weight          1302 non-null   int64 
 5   abilities       1302 non-null   object
 6   hidden_ability  1302 non-null   object
 7   hp              1302 non-null   int64 
 8   attack          1302 non-null   int64 
 9   defense         1302 non-null   int64 
 10  sp_atk          1302 non-null   int64 
 11  sp_def          1302 non-null   int64 
 12  speed           1302 non-null   int64 
 13  legendary       1302 non-null   bool  
 14  mythical        1302 non-null   bool  
 15  generation      1302 non-null   object
 16  total_stats     1302 non-null   int64 
dtypes: bool(2), int64(10), object(5)
memory usage: 155.2

### Ya tenemos el dataset final, lo renombramos a pokemon_dataset_final.csv

In [None]:
# Ruta del archivo original y nuevo nombre
old_path = POKEAPI_SORTED_FILTER_DATA_PATH_CLEANED
new_path = POKEAPI_FINAL_DATASET_PATH

# Copiar el archivo
shutil.copy(old_path, new_path)

print("✅ El archivo ha sido copiado correctamente como 'pokemon_dataset_final.csv'.")

✅ El archivo ha sido copiado correctamente como 'pokemon_dataset_final.csv'.


In [61]:
df = pd.read_csv(FINAL_DATASET_PATH)
df.head(12)

Unnamed: 0,index,name,type,height,weight,abilities,hidden_ability,hp,attack,defense,sp_atk,sp_def,speed,legendary,mythical,generation,total_stats
0,1,bulbasaur,grass / poison,0,6,overgrow,chlorophyll,45,49,49,65,65,45,False,False,generation-i,318
1,2,ivysaur,grass / poison,1,13,overgrow,chlorophyll,60,62,63,80,80,60,False,False,generation-i,405
2,3,venusaur,grass / poison,2,100,overgrow,chlorophyll,80,82,83,100,100,80,False,False,generation-i,525
3,3,venusaur-mega,grass / poison,2,155,thick-fat,,80,100,123,122,120,80,False,False,generation-i,625
4,3,venusaur-gmax,grass / poison,24,1000,overgrow,chlorophyll,80,82,83,100,100,80,False,False,generation-i,525
5,4,charmander,fire,0,8,blaze,solar-power,39,52,43,60,50,65,False,False,generation-i,309
6,5,charmeleon,fire,1,19,blaze,solar-power,58,64,58,80,65,80,False,False,generation-i,405
7,6,charizard,fire / flying,1,90,blaze,solar-power,78,84,78,109,85,100,False,False,generation-i,534
8,6,charizard-mega-x,fire / dragon,1,110,tough-claws,,78,130,111,130,85,100,False,False,generation-i,634
9,6,charizard-mega-y,fire / flying,1,100,drought,,78,104,78,159,115,100,False,False,generation-i,634


### Cambiamos la posicion de la columna total_stats

In [None]:
# Cargar el archivo CSV ordenado
df = pd.read_csv(POKEAPI_FINAL_DATASET_PATH)

# Lista de columnas en el orden deseado
cols = list(df.columns)  # Obtener todas las columnas
cols.remove("total_stats")  # Quitar 'total_stats' de su posición original

# Buscar la posición de 'speed' y colocar 'total_stats' después
speed_index = cols.index("speed") + 1
cols.insert(speed_index, "total_stats")  # Insertar después de 'speed'

# Reordenar el DataFrame con las columnas ajustadas
df = df[cols]

# Mostrar el DataFrame actualizado
df.head() 

# Guardar el dataset actualizado
df.to_csv(POKEAPI_FINAL_DATASET_PATH, index=False)

print(f"✅ Dataset actualizado y guardado en '{POKEAPI_FINAL_DATASET_PATH}'.")

✅ Dataset actualizado y guardado en '../data/pokemon_dataset_final.csv'.


In [None]:
# Cargar el archivo CSV ordenado
df = pd.read_csv(POKEAPI_FINAL_DATASET_PATH)

# Separar la columna 'type' en 'type1' y 'type2'
df[["type1", "type2"]] = df["type"].str.split(" / ", expand=True)
df["type2"].fillna("None", inplace=True)  # Rellenar NaN con 'None'

# Separar la columna 'abilities' en 'ability1' y 'ability2'
df[["ability1", "ability2"]] = df["abilities"].str.split(" / ", expand=True)
df["ability2"].fillna("None", inplace=True)  # Rellenar NaN con 'None'

# Eliminar las columnas originales
df.drop(columns=["type", "abilities"], inplace=True)

# Definir el nuevo orden de columnas
column_order = ["index", "name", "type1", "type2", "ability1", "ability2", "hidden_ability"] + \
               [col for col in df.columns if col not in ["index", "name", "type1", "type2", "ability1", "ability2", "hidden_ability"]]

# Reordenar el DataFrame
df = df[column_order]

# Mostrar el DataFrame actualizado
df.head()

# Guardar el dataset actualizado
df.to_csv(POKEAPI_FINAL_DATASET_PATH, index=False)

print(f"✅ Dataset actualizado y guardado en '{POKEAPI_FINAL_DATASET_PATH}'.")

✅ Dataset actualizado y guardado en '../data/pokemon_dataset_final.csv'.


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df["type2"].fillna("None", inplace=True)  # Rellenar NaN con 'None'
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df["ability2"].fillna("None", inplace=True)  # Rellenar NaN con 'None'


In [16]:
df.head()

Unnamed: 0,index,name,type1,type2,ability1,ability2,hidden_ability,height,weight,hp,attack,defense,sp_atk,sp_def,speed,total_stats,legendary,mythical,generation
0,1,bulbasaur,grass,poison,overgrow,,chlorophyll,0,6,45,49,49,65,65,45,318,False,False,generation-i
1,2,ivysaur,grass,poison,overgrow,,chlorophyll,1,13,60,62,63,80,80,60,405,False,False,generation-i
2,3,venusaur,grass,poison,overgrow,,chlorophyll,2,100,80,82,83,100,100,80,525,False,False,generation-i
3,3,venusaur-mega,grass,poison,thick-fat,,,2,155,80,100,123,122,120,80,625,False,False,generation-i
4,3,venusaur-gmax,grass,poison,overgrow,,chlorophyll,24,1000,80,82,83,100,100,80,525,False,False,generation-i


In [17]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1302 entries, 0 to 1301
Data columns (total 19 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   index           1302 non-null   int64 
 1   name            1302 non-null   object
 2   type1           1302 non-null   object
 3   type2           1302 non-null   object
 4   ability1        1302 non-null   object
 5   ability2        1302 non-null   object
 6   hidden_ability  1302 non-null   object
 7   height          1302 non-null   int64 
 8   weight          1302 non-null   int64 
 9   hp              1302 non-null   int64 
 10  attack          1302 non-null   int64 
 11  defense         1302 non-null   int64 
 12  sp_atk          1302 non-null   int64 
 13  sp_def          1302 non-null   int64 
 14  speed           1302 non-null   int64 
 15  total_stats     1302 non-null   int64 
 16  legendary       1302 non-null   bool  
 17  mythical        1302 non-null   bool  
 18  generati