In [1]:
# Cell 1: Mount Drive
from google.colab import drive
drive.mount('/content/drive')

# Create project folder in Drive
!mkdir -p "/content/drive/MyDrive/PokemonDataScience/data/raw"
!mkdir -p "/content/drive/MyDrive/PokemonDataScience/data/processed"
!mkdir -p "/content/drive/MyDrive/PokemonDataScience/data/forML"

Mounted at /content/drive


In [2]:
# Cell 2: Import Libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from ast import literal_eval
import warnings
warnings.filterwarnings('ignore')

print("Libraries loaded successfully!")

Libraries loaded successfully!


In [3]:
# Cell 3: Load Raw Data
df = pd.read_csv('/content/drive/MyDrive/PokemonDataScience/data/raw/Pokemon.csv')

print(f"Dataset loaded: {df.shape[0]} rows, {df.shape[1]} columns")
print("\nColumn names:")
print(df.columns.tolist())
print("\nFirst 5 rows:")
df.head()


Dataset loaded: 1179 rows, 47 columns

Column names:
['DexNumber', 'Name', 'Type', 'Abilities', 'HiddenAbility', 'Generation', 'Hp', 'Attack', 'Defense', 'SpecialAttack', 'SpecialDefense', 'Speed', 'TotalStats', 'Weight', 'Height', 'GenderProbM', 'Category', 'CatchRate', 'EggCycles', 'EggGroup', 'LevelingRate', 'BaseFriendship', 'IsLegendary', 'IsMythical', 'IsUltraBeast', 'HasMega', 'EvoStage', 'TotalEvoStages', 'PreevoName', 'DamageFromNormal', 'DamageFromFighting', 'DamageFromFlying', 'DamageFromPoison', 'DamageFromGround', 'DamageFromRock', 'DamageFromBug', 'DamageFromGhost', 'DamageFromSteel', 'DamageFromFire', 'DamageFromWater', 'DamageFromGrass', 'DamageFromElectric', 'DamageFromPsychic', 'DamageFromIce', 'DamageFromDragon', 'DamageFromDark', 'DamageFromFairy']

First 5 rows:


Unnamed: 0,DexNumber,Name,Type,Abilities,HiddenAbility,Generation,Hp,Attack,Defense,SpecialAttack,...,DamageFromSteel,DamageFromFire,DamageFromWater,DamageFromGrass,DamageFromElectric,DamageFromPsychic,DamageFromIce,DamageFromDragon,DamageFromDark,DamageFromFairy
0,494,Victini,"['Psychic', 'Fire']",['Victory Star'],[],V,100,100,100,100,...,0.5,0.5,2.0,0.5,1.0,0.5,0.5,1.0,2.0,0.5
1,1,Bulbasaur,"['Grass', 'Poison']",['Overgrow'],['Chlorophyll'],I,45,49,49,65,...,1.0,2.0,0.5,0.25,0.5,2.0,2.0,1.0,1.0,0.5
2,2,Ivysaur,"['Grass', 'Poison']",['Overgrow'],['Chlorophyll'],I,60,62,63,80,...,1.0,2.0,0.5,0.25,0.5,2.0,2.0,1.0,1.0,0.5
3,3,Venusaur,"['Grass', 'Poison']",['Overgrow'],['Chlorophyll'],I,80,82,83,100,...,1.0,2.0,0.5,0.25,0.5,2.0,2.0,1.0,1.0,0.5
4,4,Charmander,['Fire'],['Blaze'],['Solar Power'],I,39,52,43,60,...,0.5,0.5,2.0,0.5,1.0,1.0,0.5,1.0,1.0,0.5


In [4]:
# Cell 4: Data Cleaning and Type Conversion

# Convert string representations of lists to actual lists
def safe_eval(x):
    """Safely evaluate string representation of list"""
    if pd.isna(x):
        return []
    if isinstance(x, list):
        return x
    try:
        result = literal_eval(x)
        return result if isinstance(result, list) else [result]
    except:
        return [x] if x else []

# Apply to list columns
list_columns = ['Type', 'Abilities', 'EggGroup']
for col in list_columns:
    if col in df.columns:
        df[col] = df[col].apply(safe_eval)

print("Data types converted successfully!")
print(f"\nSample Type column: {df['Type'].iloc[0]}")
print(f"Sample Abilities column: {df['Abilities'].iloc[0]}")


Data types converted successfully!

Sample Type column: ['Psychic', 'Fire']
Sample Abilities column: ['Victory Star']


In [5]:
# Cell 5: Split Compound Features into Atomic Features

# Split Type into Type1 and Type2
df['Type1'] = df['Type'].apply(lambda x: x[0] if len(x) > 0 else None)
df['Type2'] = df['Type'].apply(lambda x: x[1] if len(x) > 1 else None)

# Split Abilities into Ability1 and Ability2
df['Ability1'] = df['Abilities'].apply(lambda x: x[0] if len(x) > 0 else None)
df['Ability2'] = df['Abilities'].apply(lambda x: x[1] if len(x) > 1 else None)

# Split EggGroup into EggGroup1 and EggGroup2
df['EggGroup1'] = df['EggGroup'].apply(lambda x: x[0] if len(x) > 0 else None)
df['EggGroup2'] = df['EggGroup'].apply(lambda x: x[1] if len(x) > 1 else None)

# Create NoGender feature
df['NoGender'] = df['GenderProbM'].isna().astype(int)

print("Compound features split successfully!")
print(f"\nNew columns created:")
print("- Type1, Type2")
print("- Ability1, Ability2")
print("- EggGroup1, EggGroup2")
print("- NoGender")


Compound features split successfully!

New columns created:
- Type1, Type2
- Ability1, Ability2
- EggGroup1, EggGroup2
- NoGender


In [6]:
# Cell 6: Handle Missing Values

# Fill missing values appropriately
df['HiddenAbility'].fillna('None', inplace=True)
df['Type2'].fillna('None', inplace=True)
df['Ability2'].fillna('None', inplace=True)
df['EggGroup2'].fillna('None', inplace=True)
df['PreevoName'].fillna('None', inplace=True)

# Fill numerical missing values with median
numerical_cols = ['Hp', 'Attack', 'Defense', 'SpecialAttack', 'SpecialDefense',
                  'Speed', 'Weight', 'Height', 'CatchRate', 'EggCycles',
                  'BaseFriendship']

for col in numerical_cols:
    if col in df.columns:
        df[col].fillna(df[col].median(), inplace=True)

print("Missing values handled!")
print(f"\nMissing values per column:")
print(df.isnull().sum()[df.isnull().sum() > 0])


Missing values handled!

Missing values per column:
Ability1    2
dtype: int64


In [7]:
# Cell 7: Calculate Additional Statistics

# Calculate geometric means for sweepers and walls
df['PhysicalSweeper'] = np.sqrt(df['Attack'] * df['Speed'])
df['SpecialSweeper'] = np.sqrt(df['SpecialAttack'] * df['Speed'])
df['PhysicalWall'] = np.sqrt(df['Hp'] * df['Defense'])
df['SpecialWall'] = np.sqrt(df['Hp'] * df['SpecialDefense'])

print("Additional statistics calculated!")
print("\nTop 5 Physical Sweepers:")
print(df.nlargest(5, 'PhysicalSweeper')[['Name', 'Attack', 'Speed', 'PhysicalSweeper']])


Additional statistics calculated!

Top 5 Physical Sweepers:
                              Name  Attack  Speed  PhysicalSweeper
432            Deoxys Attack Forme     180    150       164.316767
431            Deoxys Normal Forme     150    150       150.000000
1020          Zacian Crowned Sword     150    148       148.996644
626   Galarian Darmanitan Zen Mode     160    135       146.969385
924             Necrozma Dusk Mane     167    129       146.775339


In [8]:
# Cell 8: Basic Single Feature Statistics

print("=" * 60)
print("SINGLE FEATURE STATISTICS")
print("=" * 60)

# Type Distribution
print("\n1. TYPE DISTRIBUTION")
print("-" * 40)
type1_counts = df['Type1'].value_counts()
print(type1_counts)

# Generation Distribution
print("\n2. GENERATION DISTRIBUTION")
print("-" * 40)
gen_counts = df['Generation'].value_counts().sort_index()
print(gen_counts)

# Legendary/Mythical/Ultra Beast counts
print("\n3. SPECIAL POKEMON COUNTS")
print("-" * 40)
print(f"Legendary Pokemon: {df['IsLegendary'].sum()}")
print(f"Mythical Pokemon: {df['IsMythical'].sum()}")
print(f"Ultra Beasts: {df['IsUltraBeast'].sum()}")
print(f"Pokemon with Mega Evolution: {df['HasMega'].sum()}")

# Stats Summary
print("\n4. BASE STATS SUMMARY")
print("-" * 40)
stats_cols = ['Hp', 'Attack', 'Defense', 'SpecialAttack', 'SpecialDefense', 'Speed']
print(df[stats_cols].describe())


SINGLE FEATURE STATISTICS

1. TYPE DISTRIBUTION
----------------------------------------
Type1
Water       145
Normal      135
Grass       111
Bug          91
Psychic      76
Fire         72
Electric     71
Rock         64
Dark         53
Poison       48
Ground       46
Fighting     45
Fairy        44
Dragon       43
Ice          42
Steel        41
Ghost        40
Flying       12
Name: count, dtype: int64

2. GENERATION DISTRIBUTION
----------------------------------------
Generation
I       151
II      100
III     141
IV      123
IX      136
V       176
VI       90
VII     116
VIII    146
Name: count, dtype: int64

3. SPECIAL POKEMON COUNTS
----------------------------------------
Legendary Pokemon: 97
Mythical Pokemon: 34
Ultra Beasts: 11
Pokemon with Mega Evolution: 47

4. BASE STATS SUMMARY
----------------------------------------
                Hp       Attack      Defense  SpecialAttack  SpecialDefense  \
count  1179.000000  1179.000000  1179.000000    1179.000000     1179.00000

In [9]:
# Cell 9: Save Processed Dataset for Visualizations

# Select columns for visualization dataset
viz_columns = [
    'DexNumber', 'Name', 'Type', 'Type1', 'Type2',
    'Abilities', 'Ability1', 'Ability2', 'HiddenAbility',
    'Generation', 'Hp', 'Attack', 'Defense', 'SpecialAttack',
    'SpecialDefense', 'Speed', 'TotalStats',
    'Weight', 'Height', 'GenderProbM', 'NoGender',
    'Category', 'CatchRate', 'EggCycles', 'EggGroup',
    'EggGroup1', 'EggGroup2', 'LevelingRate', 'BaseFriendship',
    'IsLegendary', 'IsMythical', 'IsUltraBeast', 'HasMega',
    'EvoStage', 'TotalEvoStages', 'PreevoName',
    'PhysicalSweeper', 'SpecialSweeper', 'PhysicalWall', 'SpecialWall'
]

# Add damage columns
damage_cols = [col for col in df.columns if col.startswith('DamageFrom')]
viz_columns.extend(damage_cols)

# Filter columns that exist
viz_columns = [col for col in viz_columns if col in df.columns]

df_viz = df[viz_columns].copy()

# Save
output_path = '/content/drive/MyDrive/PokemonDataScience/data/processed/pokemon_processed.csv'
df_viz.to_csv(output_path, index=False)

print(f"Visualization dataset saved!")
print(f"Shape: {df_viz.shape}")
print(f"Location: {output_path}")


Visualization dataset saved!
Shape: (1179, 58)
Location: /content/drive/MyDrive/PokemonDataScience/data/processed/pokemon_processed.csv


In [10]:
# Cell 10: Create Machine Learning Dataset (Only Numerical Features) - FIXED

# Select only numerical columns for ML
ml_columns = [
    'DexNumber', 'Generation',
    'Hp', 'Attack', 'Defense', 'SpecialAttack', 'SpecialDefense', 'Speed', 'TotalStats',
    'Weight', 'Height', 'GenderProbM', 'NoGender',
    'CatchRate', 'EggCycles', 'BaseFriendship',
    'IsLegendary', 'IsMythical', 'IsUltraBeast', 'HasMega',
    'EvoStage', 'TotalEvoStages',
    'PhysicalSweeper', 'SpecialSweeper', 'PhysicalWall', 'SpecialWall'
]

# Add all damage multiplier columns
ml_columns.extend(damage_cols)

# Filter existing columns
ml_columns = [col for col in ml_columns if col in df.columns]

df_ml = df[ml_columns].copy()

print(f"Initial shape: {df_ml.shape}")

# Convert all columns to numeric, coercing errors to NaN
for col in df_ml.columns:
    df_ml[col] = pd.to_numeric(df_ml[col], errors='coerce')

# Handle missing values with median
df_ml = df_ml.fillna(df_ml.median())

# For columns that are still all NaN (median couldn't help), fill with 0
df_ml = df_ml.fillna(0)

print(f"Final shape: {df_ml.shape}")

# Save
output_path_ml = '/content/drive/MyDrive/PokemonDataScience/data/forML/pokemon_ml.csv'
df_ml.to_csv(output_path_ml, index=False)

print(f"\nMachine Learning dataset saved!")
print(f"Shape: {df_ml.shape}")
print(f"Features: {len(ml_columns)}")
print(f"Location: {output_path_ml}")


Initial shape: (1179, 44)
Final shape: (1179, 44)

Machine Learning dataset saved!
Shape: (1179, 44)
Features: 44
Location: /content/drive/MyDrive/PokemonDataScience/data/forML/pokemon_ml.csv


In [11]:
# Cell 11: Display Summary

print("\n" + "=" * 60)
print("PREPROCESSING COMPLETE!")
print("=" * 60)
print(f"\nOriginal dataset: {df.shape[0]} Pokemon, {df.shape[1]} features")
print(f"Visualization dataset: {df_viz.shape[0]} Pokemon, {df_viz.shape[1]} features")
print(f"ML dataset: {df_ml.shape[0]} Pokemon, {df_ml.shape[1]} features")
print("\nFiles created:")
print("1. pokemon_processed.csv (for visualizations)")
print("2. pokemon_ml.csv (for machine learning)")
print("\n" + "=" * 60)



PREPROCESSING COMPLETE!

Original dataset: 1179 Pokemon, 58 features
Visualization dataset: 1179 Pokemon, 58 features
ML dataset: 1179 Pokemon, 44 features

Files created:
1. pokemon_processed.csv (for visualizations)
2. pokemon_ml.csv (for machine learning)

