# Preprocessing the raw Pokemon data

by GLoria

In [3]:
import numpy as np 
import pandas as pd 
import os

### images 

Get the Pokemon names from the image filenames so that we can later merge them with the complete stats dataset on the Pokemon name.

source: https://www.kaggle.com/datasets/arenagrenade/the-complete-pokemon-images-data-set/code

In [4]:
# Directory holding one image file per Pokemon.
_root = 'real_pokemon_data/pokemon_images'
# Alphabetically sorted directory listing, wrapped in a Series for easy inspection.
filenames = pd.Series(sorted(os.listdir(_root)))

In [5]:
# Preview the sorted listing of image files (pandas truncates long Series in the display).
filenames

0             abomasnow.png
1                  abra.png
2                 absol.png
3              accelgor.png
4      aegislash-shield.png
               ...         
893             zoroark.png
894               zorua.png
895               zubat.png
896            zweilous.png
897             zygarde.png
Length: 898, dtype: object

In [6]:
# Build the image table from the directory listing.
# Only `.png` entries are kept, and the extension is removed with os.path.splitext:
# blindly slicing off the last four characters would mangle any stray non-PNG entry
# (e.g. `.DS_Store` -> `.DS_S`), and the later base64 step re-appends `.png` to every name.
image_df = pd.DataFrame({
    'name': [
        os.path.splitext(filename)[0]
        for filename in filenames
        if filename.endswith('.png')
    ]
})
image_df

Unnamed: 0,name
0,abomasnow
1,abra
2,absol
3,accelgor
4,aegislash-shield
...,...
893,zoroark
894,zorua
895,zubat
896,zweilous


Encode each image as a base64 string and add it to the DataFrame as a new column.

In [7]:
import base64

def encode_image(image_path):
    """Read the file at ``image_path`` and return its contents as base64 text.

    Args:
        image_path: Path of the file to read; it is opened in binary mode.

    Returns:
        str: The base64 encoding of the file's bytes, decoded as UTF-8.
    """
    with open(image_path, 'rb') as handle:
        raw_bytes = handle.read()
    encoded = base64.b64encode(raw_bytes)
    return encoded.decode('utf-8')

In [11]:
# Encode every Pokemon's PNG as base64 and store it next to its name.
# - The path is built from `_root` instead of repeating the directory literal.
# - The f-string no longer reuses its own quote character inside the braces, which is a
#   SyntaxError on Python versions before 3.12.
# - Mapping over the single `name` column replaces the slower row-wise DataFrame.apply.
image_df['image_base64'] = image_df['name'].map(
    lambda name: encode_image(os.path.join(_root, f'{name}.png'))
)

In [12]:
# Inspect the table now that the base64 column has been added.
image_df

Unnamed: 0,name,image_base64
0,abomasnow,iVBORw0KGgoAAAANSUhEUgAAAdsAAAHbCAYAAACDejA0AA...
1,abra,iVBORw0KGgoAAAANSUhEUgAAAdsAAAHbCAYAAACDejA0AA...
2,absol,iVBORw0KGgoAAAANSUhEUgAAAdsAAAHbCAYAAACDejA0AA...
3,accelgor,iVBORw0KGgoAAAANSUhEUgAAAdsAAAHbCAYAAACDejA0AA...
4,aegislash-shield,iVBORw0KGgoAAAANSUhEUgAAAdsAAAHbCAYAAACDejA0AA...
...,...,...
893,zoroark,iVBORw0KGgoAAAANSUhEUgAAAdsAAAHbCAYAAACDejA0AA...
894,zorua,iVBORw0KGgoAAAANSUhEUgAAAdsAAAHbCAYAAACDejA0AA...
895,zubat,iVBORw0KGgoAAAANSUhEUgAAAdsAAAHbCAYAAACDejA0AA...
896,zweilous,iVBORw0KGgoAAAANSUhEUgAAAdsAAAHbCAYAAACDejA0AA...


### stats dataset

In [16]:
# Load the Pokemon stats table from the downloaded CSV and show it.
stats_csv_path = 'real_pokemon_data/pokemon_complete_dataset.csv'
stats_df = pd.read_csv(stats_csv_path)
stats_df

Unnamed: 0,abilities,against_bug,against_dark,against_dragon,against_electric,against_fairy,against_fight,against_fire,against_flying,against_ghost,...,percentage_male,pokedex_number,sp_attack,sp_defense,speed,type1,type2,weight_kg,generation,is_legendary
0,"['Overgrow', 'Chlorophyll']",1.00,1.0,1.0,0.5,0.5,0.5,2.0,2.0,1.0,...,88.1,1,65,65,45,grass,poison,6.9,1,0
1,"['Overgrow', 'Chlorophyll']",1.00,1.0,1.0,0.5,0.5,0.5,2.0,2.0,1.0,...,88.1,2,80,80,60,grass,poison,13.0,1,0
2,"['Overgrow', 'Chlorophyll']",1.00,1.0,1.0,0.5,0.5,0.5,2.0,2.0,1.0,...,88.1,3,122,120,80,grass,poison,100.0,1,0
3,"['Blaze', 'Solar Power']",0.50,1.0,1.0,1.0,0.5,1.0,0.5,1.0,1.0,...,88.1,4,60,50,65,fire,,8.5,1,0
4,"['Blaze', 'Solar Power']",0.50,1.0,1.0,1.0,0.5,1.0,0.5,1.0,1.0,...,88.1,5,80,65,80,fire,,19.0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
796,['Beast Boost'],0.25,1.0,0.5,2.0,0.5,1.0,2.0,0.5,1.0,...,,797,107,101,61,steel,flying,999.9,7,1
797,['Beast Boost'],1.00,1.0,0.5,0.5,0.5,2.0,4.0,1.0,1.0,...,,798,59,31,109,grass,steel,0.1,7,1
798,['Beast Boost'],2.00,0.5,2.0,0.5,4.0,2.0,0.5,1.0,0.5,...,,799,97,53,43,dark,dragon,888.0,7,1
799,['Prism Armor'],2.00,2.0,1.0,1.0,1.0,0.5,1.0,1.0,2.0,...,,800,127,89,79,psychic,,230.0,7,1


### metadata

source: https://www.kaggle.com/datasets/rounakbanik/pokemon

- `name`: The English name of the Pokemon
- `japanese_name`: The Original Japanese name of the Pokemon
- `pokedex_number`: The entry number of the Pokemon in the National Pokedex
- `percentage_male`: The percentage of the species that are male. Blank if the Pokemon is genderless.
- `type1`: The Primary Type of the Pokemon
- `type2`: The Secondary Type of the Pokemon
- `classification`: The Classification of the Pokemon as described by the Sun and Moon Pokedex
- `height_m`: Height of the Pokemon in metres
- `weight_kg`: The Weight of the Pokemon in kilograms
- `capture_rate`: Capture Rate of the Pokemon
- `base_egg_steps`: The number of steps required to hatch an egg of the Pokemon
- `abilities`: A stringified list of abilities that the Pokemon is capable of having
- `experience_growth`: The Experience Growth of the Pokemon
- `base_happiness`: Base Happiness of the Pokemon
- `against_?`: Eighteen features that denote the amount of damage taken against an attack of a particular type
- `hp`: The Base HP of the Pokemon
- `attack`: The Base Attack of the Pokemon
- `defense`: The Base Defense of the Pokemon
- `sp_attack`: The Base Special Attack of the Pokemon
- `sp_defense`: The Base Special Defense of the Pokemon
- `speed`: The Base Speed of the Pokemon
- `generation`: The numbered generation which the Pokemon was first introduced
- `is_legendary`: Denotes if the Pokemon is legendary.

Data cleaning: convert the Pokemon names to lowercase so that they match the image filenames.

In [17]:
# Lowercase the names so they can be matched against the lowercase image filenames.
# The vectorized `.str` accessor is used instead of `.apply(str.lower)`, which raises
# TypeError on any non-string entry (e.g. a missing name); `.str.lower()` leaves
# missing values as missing.
stats_df['name'] = stats_df['name'].str.lower()
stats_df['name']

0       bulbasaur
1         ivysaur
2        venusaur
3      charmander
4      charmeleon
          ...    
796    celesteela
797       kartana
798      guzzlord
799      necrozma
800      magearna
Name: name, Length: 801, dtype: object

Keep only the Pokemon that appear in the stats dataset and also have an image (inner join on `name`).

In [18]:
# Inner join on the shared `name` column: only Pokemon present in both the stats
# table and the image table survive.
complete_df = pd.merge(stats_df, image_df, on='name', how='inner')
complete_df

Unnamed: 0,abilities,against_bug,against_dark,against_dragon,against_electric,against_fairy,against_fight,against_fire,against_flying,against_ghost,...,pokedex_number,sp_attack,sp_defense,speed,type1,type2,weight_kg,generation,is_legendary,image_base64
0,"['Overgrow', 'Chlorophyll']",1.00,1.0,1.0,0.5,0.5,0.5,2.0,2.0,1.0,...,1,65,65,45,grass,poison,6.9,1,0,iVBORw0KGgoAAAANSUhEUgAAAdsAAAHbCAYAAACDejA0AA...
1,"['Overgrow', 'Chlorophyll']",1.00,1.0,1.0,0.5,0.5,0.5,2.0,2.0,1.0,...,2,80,80,60,grass,poison,13.0,1,0,iVBORw0KGgoAAAANSUhEUgAAAdsAAAHbCAYAAACDejA0AA...
2,"['Overgrow', 'Chlorophyll']",1.00,1.0,1.0,0.5,0.5,0.5,2.0,2.0,1.0,...,3,122,120,80,grass,poison,100.0,1,0,iVBORw0KGgoAAAANSUhEUgAAAdsAAAHbCAYAAACDejA0AA...
3,"['Blaze', 'Solar Power']",0.50,1.0,1.0,1.0,0.5,1.0,0.5,1.0,1.0,...,4,60,50,65,fire,,8.5,1,0,iVBORw0KGgoAAAANSUhEUgAAAdsAAAHbCAYAAACDejA0AA...
4,"['Blaze', 'Solar Power']",0.50,1.0,1.0,1.0,0.5,1.0,0.5,1.0,1.0,...,5,80,65,80,fire,,19.0,1,0,iVBORw0KGgoAAAANSUhEUgAAAdsAAAHbCAYAAACDejA0AA...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
748,['Beast Boost'],0.25,1.0,0.5,2.0,0.5,1.0,2.0,0.5,1.0,...,797,107,101,61,steel,flying,999.9,7,1,iVBORw0KGgoAAAANSUhEUgAAAdsAAAHbCAYAAACDejA0AA...
749,['Beast Boost'],1.00,1.0,0.5,0.5,0.5,2.0,4.0,1.0,1.0,...,798,59,31,109,grass,steel,0.1,7,1,iVBORw0KGgoAAAANSUhEUgAAAdsAAAHbCAYAAACDejA0AA...
750,['Beast Boost'],2.00,0.5,2.0,0.5,4.0,2.0,0.5,1.0,0.5,...,799,97,53,43,dark,dragon,888.0,7,1,iVBORw0KGgoAAAANSUhEUgAAAdsAAAHbCAYAAACDejA0AA...
751,['Prism Armor'],2.00,2.0,1.0,1.0,1.0,0.5,1.0,1.0,2.0,...,800,127,89,79,psychic,,230.0,7,1,iVBORw0KGgoAAAANSUhEUgAAAdsAAAHbCAYAAACDejA0AA...


We have 753 Pokemon with both complete stats/info and an image.

Export the complete DataFrame to a CSV file.

In [19]:
# complete_df.to_csv('real_pokemon_data/preprocessed_data.csv')

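# Export left disabled: the resulting file is too big because every row carries a full base64-encoded image, so storing the images as base64 is probably not a good idea.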