# Part 2: Data Loading and Preprocessing

In [1]:
import json
import re

import numpy as np
import pandas as pd

In [2]:
# Step 1: Load the JSON dataset
# The file contains non-ASCII text (e.g. "№", "é"), so decode it explicitly as
# UTF-8 rather than relying on the platform's default locale encoding.
with open('/content/sample_data/all_pokemon_data.json', 'r', encoding='utf-8') as file:
    pokemon_data = json.load(file)

# Convert the JSON data into a Pandas DataFrame.
# orient='index' turns each top-level key into a row label and each nested
# mapping into that row's columns.
df = pd.DataFrame.from_dict(pokemon_data, orient='index')

In [3]:
# Step 2: Check the DataFrame orientation
# from_dict(..., orient='index') already yields one row per Pokémon, so no transpose is needed.
# If rows and columns ever come out swapped, uncomment the following line:
# df = df.T

In [4]:
# Step 3: Persist the unprocessed frame so it can be reloaded later
df.to_pickle(path='raw_pokemon_data.pkl')

In [5]:
# Step 4: Data Type Conversion (Simple Numerical Fields)
# Entries that cannot be parsed as numbers become NaN instead of raising.
for column in ('National №', 'Base Exp.'):
    df[column] = pd.to_numeric(df[column], errors='coerce')

In [6]:
# Step 5: Height and Weight Conversion
def convert_to_numeric(value, unit):
    """Extract the leading number from a height or weight string.

    The text before the first occurrence of ``unit`` is stripped and parsed
    as a float, e.g. ``convert_to_numeric("6.9 kg (15.2 lbs)", "kg") == 6.9``.

    Args:
        value: The raw field value, normally a string.
        unit: The unit marker that follows the number (e.g. ``'m'``, ``'kg'``).

    Returns:
        The parsed float, or ``np.nan`` when the value is missing, is not a
        string, or has no parsable number in front of the unit.
    """
    if pd.isnull(value):
        return np.nan
    try:
        return float(value.split(unit)[0].strip())
    except (AttributeError, TypeError, ValueError):
        # AttributeError/TypeError: value is not a string (no usable .split);
        # ValueError: the text before the unit is not a number.
        # Anything else (e.g. KeyboardInterrupt) is allowed to propagate.
        return np.nan

# Derive numeric columns from the raw text fields; the unit marker is passed
# through to convert_to_numeric as its second positional argument.
df['Height_m'] = df['Height'].apply(convert_to_numeric, args=('m',))
df['Weight_kg'] = df['Weight'].apply(convert_to_numeric, args=('kg',))

In [7]:
# Step 6: Catch Rate and Egg Cycles Conversion
def extract_numeric(value):
    """Extract the first number from a field that carries extra text.

    Only the first number found is used, so trailing annotations do not leak
    into the result: ``"45 (5.9% with PokéBall, full HP)"`` yields ``45``
    rather than every digit in the string glued together.

    Args:
        value: The raw field value; a string, a number, or a missing value.

    Returns:
        The parsed number, the value itself when it is already numeric, or
        ``np.nan`` when the value is missing or contains no digits.
    """
    if pd.isnull(value):
        return np.nan
    # Already numeric (e.g. the cell is re-run on a converted column): keep it.
    if isinstance(value, (int, float, np.number)):
        return value
    # First run of digits, allowing thousands separators and a decimal part.
    match = re.search(r'\d+(?:,\d{3})*(?:\.\d+)?', str(value))
    if match is None:
        return np.nan
    return pd.to_numeric(match.group().replace(',', ''), errors='coerce')

# (target column, source column) pairs: the first two write to new title-cased
# columns, while 'Base Friendship' is converted in place.
for target_col, source_col in (
    ('Catch Rate', 'Catch rate'),
    ('Egg Cycles', 'Egg cycles'),
    ('Base Friendship', 'Base Friendship'),
):
    df[target_col] = df[source_col].apply(extract_numeric)

In [8]:
# Step 7: Column Splitting - Types
def split_types(types):
    """Split a whitespace-separated type string into (primary, secondary).

    A single type gets the string ``'None'`` as its secondary type. Anything
    that is not a string, or that does not contain exactly one or two tokens,
    yields ``('None', 'None')``.
    """
    missing = ('None', 'None')
    if not isinstance(types, str):
        return missing

    tokens = types.split()
    if len(tokens) == 1:
        return tokens[0], 'None'
    if len(tokens) == 2:
        primary, secondary = tokens
        return primary, secondary
    return missing

# Turn the per-row (primary, secondary) tuples into two parallel columns,
# then remove the combined source column.
type_pairs = df['Type'].apply(split_types)
primary_types, secondary_types = zip(*type_pairs)
df['Primary Type'] = primary_types
df['Secondary Type'] = secondary_types
del df['Type']

In [10]:
# Step 8: Gender Splitting
def split_gender(gender):
    """Extract male and female percentages from the gender field.

    Handles the two-part form ``"87.5% male, 12.5% female"`` (male first) as
    well as a single-gender form such as ``"100% female"``.

    Args:
        gender: The raw gender field value.

    Returns:
        A ``(male, female)`` tuple of floats, or ``(np.nan, np.nan)`` when the
        value is not a string or mentions neither gender (e.g. "Genderless").
    """
    if isinstance(gender, str):
        gender_split = gender.replace('%', '').split(', ')
        if len(gender_split) == 2:
            male, female = gender_split[0].split()[0], gender_split[1].split()[0]
            return float(male), float(female)
        # "female" contains the substring "male", so it has to be tested
        # first; otherwise a female-only entry is reported as male-only.
        elif 'female' in gender:
            return 0.0, float(gender.split('%')[0])
        elif 'male' in gender:
            return float(gender.split('%')[0]), 0.0
    return np.nan, np.nan

# Unpack the per-row (male, female) tuples into two columns, then remove the
# original text column.
gender_pairs = df['Gender'].apply(split_gender)
male_percentages, female_percentages = zip(*gender_pairs)
df['Male Percentage'] = male_percentages
df['Female Percentage'] = female_percentages
del df['Gender']

In [11]:
# Step 9: Add Generation Column
def assign_generation(national_no):
    """Map a National Pokédex number to the generation that introduced it.

    Args:
        national_no: The National Pokédex number (may be NaN).

    Returns:
        The generation number (1-9), or ``np.nan`` when the number is missing
        or lies beyond the last known generation.
    """
    if pd.isnull(national_no):
        return np.nan
    # Highest National Pokédex number of each generation, Gen 1 through Gen 9.
    generation_upper_bounds = (151, 251, 386, 493, 649, 721, 809, 905, 1025)
    for generation, upper_bound in enumerate(generation_upper_bounds, start=1):
        if national_no <= upper_bound:
            return generation
    # Past the last known generation: report "unknown" explicitly rather than
    # falling off the end of the function and returning None.
    return np.nan

# Element-wise lookup of each Pokémon's generation from its Pokédex number.
df['Generation'] = df['National №'].map(assign_generation)

In [12]:
# Step 10: Abilities Splitting
def split_abilities(abilities):
    """Return the first two abilities as (primary, secondary).

    Args:
        abilities: A list of ability names.

    Returns:
        A ``(primary, secondary)`` tuple. The secondary slot is the string
        ``'None'`` when only one ability is listed; both slots are ``'None'``
        when the value is not a list or the list is empty. Lists with more
        than two entries keep their first two abilities rather than losing
        every ability.
    """
    if isinstance(abilities, list) and abilities:
        if len(abilities) >= 2:
            return abilities[0], abilities[1]
        return abilities[0], 'None'
    return 'None', 'None'

# Unpack the per-row (primary, secondary) tuples into two columns, then remove
# the original list-valued column.
ability_pairs = df['Abilities'].apply(split_abilities)
primary_abilities, secondary_abilities = zip(*ability_pairs)
df['Primary Ability'] = primary_abilities
df['Secondary Ability'] = secondary_abilities
del df['Abilities']

In [13]:
# Step 11: Persist the cleaned frame and report where it was written
cleaned_file_path = 'cleaned_pokemon_data.pkl'
df.to_pickle(path=cleaned_file_path)
print(f"Cleaned Pokémon data has been saved to {cleaned_file_path}")

Cleaned Pokémon data has been saved to cleaned_pokemon_data.pkl


In [14]:
# Step 12: Evaluation
# df.info() writes its report to stdout itself and returns None, so wrapping
# it in print() would only append a stray "None" line to the output.
df.info()
print("\nUnique values in Primary Types:", df['Primary Type'].unique())
print("\nUnique values in Secondary Types:", df['Secondary Type'].unique())

<class 'pandas.core.frame.DataFrame'>
Index: 500 entries, Bulbasaur to Luxray
Data columns (total 45 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   name               500 non-null    object 
 1   National №         500 non-null    int64  
 2   Species            500 non-null    object 
 3   Height             500 non-null    object 
 4   Weight             500 non-null    object 
 5   EV yield           500 non-null    object 
 6   Catch rate         500 non-null    object 
 7   Base Friendship    500 non-null    int64  
 8   Base Exp.          500 non-null    int64  
 9   Growth Rate        500 non-null    object 
 10  HP_lv1             500 non-null    object 
 11  HP_min             500 non-null    object 
 12  HP_max             500 non-null    object 
 13  Attack_lv1         500 non-null    object 
 14  Attack_min         500 non-null    object 
 15  Attack_max         500 non-null    object 
 16  Defense_lv1        5