# Pokedex
This notebook collects the full Pokédex dataset, covering Gen I through Gen VIII.

In [None]:
# probably not necessary
import datetime as dt
import re
import numpy as np

In [7]:
# required imports
import pandas as pd
import requests
from bs4 import BeautifulSoup

We will use bulbapedia.bulbagarden.net to collect the names of all the Pokémon.

In [8]:
# Bulbapedia's National Pokédex listing; it is scraped below for Pokémon names.
url = (
    "https://bulbapedia.bulbagarden.net/wiki/"
    "List_of_Pokémon_by_National_Pokédex_number"
)

In [9]:
# Download the listing page and collect every table row for parsing.
# The timeout keeps the cell from hanging forever on a stalled connection,
# and raise_for_status() stops the notebook on an HTTP error instead of
# silently scraping an error page into an empty dataset.
r = requests.get(url, timeout=30)
r.raise_for_status()
soup = BeautifulSoup(r.text, "html.parser")
rows = soup.find_all("tr")

In [10]:
# Turn the scraped rows into one dict per data row and build the frame in a
# single call. DataFrame.append was removed in pandas 2.0 and re-copied the
# whole frame on every iteration.
records = []
headers = []  # column names from the most recent header row seen
for row in rows:
    header_cells = row.find_all("th")
    data_cells = row.find_all("td")
    if len(header_cells) > 1:
        # A row with several <th> cells starts a new table: refresh the names.
        headers = [cell.text.replace('\n', '').replace(' ', '') for cell in header_cells]
    elif len(data_cells) > 1:
        # zip() pairs cells with headers positionally and ignores surplus
        # cells instead of raising when a row is wider than its header.
        record = {
            header: cell.text.replace('\n', '').replace(' ', '')
            for header, cell in zip(headers, data_cells)
        }
        if record:
            records.append(record)

df = pd.DataFrame(records)

In [11]:
# Keep only the number and name columns and drop repeated rows.
# Chaining returns a fresh frame; calling drop_duplicates(inplace=True) on a
# column slice is what triggers pandas' SettingWithCopyWarning.
cols = ['Ndex', 'MS']
df = df[cols].drop_duplicates()

In [12]:
# Pull the 'MS' column (which holds the names) out as a plain Python list.
pokemon_names = df.loc[:, 'MS'].tolist()

In [15]:
# Lower-case every name; the names are later substituted into Serebii URLs.
pokemon_names = [name.lower() for name in pokemon_names]

In [16]:
# Sanity check: show the first five scraped names.
print(pokemon_names[:5])

['bulbasaur', 'ivysaur', 'venusaur', 'charmander', 'charmeleon']


Now we will scrape the primary data source, Serebii.net, using its Gen VIII Pokédex pages.

In [17]:
# URL template for a Serebii Gen VIII (pokedex-swsh) page; the `{}` placeholder
# is filled in with a lower-case Pokémon name.
url = (
    "https://www.serebii.net"
    "/pokedex-swsh/{}/#stats"
)

In [20]:
def get_number(soup):
    """Return the Pokédex number string scraped from a parsed Serebii page.

    Reads the third ``fooinfo`` cell in the second row of the page's second
    ``dextable`` table, takes the first line of its stripped text and returns
    whatever follows the last ``#`` on that line.
    """
    info_table = soup.find_all("table", class_="dextable")[1]
    info_cells = info_table.find_all("tr")[1].find_all("td", class_="fooinfo")
    first_line = info_cells[2].text.strip().split('\n')[0]
    return first_line.split("#")[-1]

In [19]:
def get_name(soup):
    """Return the name text from a parsed Serebii page.

    The name is the text of the first ``td`` in the second row of the page's
    second ``dextable`` table; it is returned without any stripping.
    """
    info_table = soup.find_all("table", class_="dextable")[1]
    second_row = info_table.find_all("tr")[1]
    name_cell = second_row.find_all("td")[0]
    return name_cell.text

In [22]:
def get_altnames(soup):
    """Return the alternative-name entries found on a parsed Serebii page.

    The entries live in a nested table inside the second ``td`` of the second
    row of the page's second ``dextable`` table. Each nested row contributes
    one item: the key is the first cell's stripped text with colons removed,
    and the value is the list of the second cell's child nodes that share the
    type of its first child (an empty cell yields an empty list).
    """
    info_table = soup.find_all("table", class_="dextable")[1]
    names_cell = info_table.find_all("tr")[1].find_all("td")[1]

    altnames = {}
    for entry in names_cell.find_all("tr"):
        cells = entry.find_all("td")
        label = cells[0].text.strip().replace(":", "")
        children = cells[1].contents
        # Keep only the nodes of the same kind as the first child.
        same_kind = []
        for node in children:
            if isinstance(node, type(children[0])):
                same_kind.append(node)
        altnames[label] = same_kind

    return altnames

In [24]:
def get_type(soup):
    """Return the type names shown in the page's second ``dextable`` table.

    Every ``img`` in that table contributes one entry: the file name of its
    ``src`` attribute, cut at the first dot (``.../grass.gif`` -> ``grass``).
    """
    info_table = soup.find_all("table", class_="dextable")[1]
    return [
        image["src"].split("/")[-1].split(".")[0]
        for image in info_table.find_all("img")
    ]

In [25]:
def get_gender(soup):
    """Return the gender ratios listed on a parsed Serebii page.

    The ratios are read from a nested table inside the fourth ``fooinfo``
    cell of the second row of the page's second ``dextable`` table. Each
    nested row maps the first word of its first cell to the text of its
    second cell.

    Returns:
        dict: label -> ratio text. Empty when the cell holds no nested table
        (the genderless case), and possibly partial if a row is malformed.
    """
    gender_ratios = {}
    table = soup.find_all("table", class_="dextable")[1]
    try:
        gender_cell = table.find_all("tr")[1].find_all("td", class_="fooinfo")[3]
        # For genderless pokemon the first child is plain text, which has no
        # find_all() -> AttributeError; a missing cell gives IndexError.
        rows = gender_cell.contents[0].find_all("tr")
        for row in rows:
            cells = row.find_all("td")
            key = cells[0].text.split(" ")[0]
            gender_ratios[key] = cells[1].text
    except (AttributeError, IndexError):
        # Only the expected "no gender table" failures are swallowed; a bare
        # except would also hide KeyboardInterrupt and genuine bugs.
        pass

    return gender_ratios

In [27]:
def get_classification(soup):
    """Return the classification text from a parsed Serebii page.

    This is the unmodified text of the fifth ``fooinfo`` cell inside the
    page's second ``dextable`` table.
    """
    info_cells = soup.find_all("table", class_="dextable")[1].find_all("td", class_="fooinfo")
    return info_cells[4].text

In [28]:
def get_height(soup):
    """Return the height text from a parsed Serebii page.

    Takes the sixth ``fooinfo`` cell of the page's second ``dextable`` table
    and returns the part of its text after the last tab character.
    """
    info_cells = soup.find_all("table", class_="dextable")[1].find_all("td", class_="fooinfo")
    tab_separated = info_cells[5].text.split("\t")
    return tab_separated[-1]

In [29]:
def get_weight(soup):
    """Return the weight text from a parsed Serebii page.

    Takes the seventh ``fooinfo`` cell of the page's second ``dextable``
    table and returns the part of its text after the last tab character.
    """
    info_cells = soup.find_all("table", class_="dextable")[1].find_all("td", class_="fooinfo")
    tab_separated = info_cells[6].text.split("\t")
    return tab_separated[-1]

In [30]:
def get_capture_rate(soup):
    """Return the capture-rate text from a parsed Serebii page.

    Takes the eighth ``fooinfo`` cell of the page's second ``dextable`` table
    and returns the part of its text after the last tab character.
    """
    info_cells = soup.find_all("table", class_="dextable")[1].find_all("td", class_="fooinfo")
    tab_separated = info_cells[7].text.split("\t")
    return tab_separated[-1]

In [31]:
def get_base_egg_steps(soup):
    """Return the base egg steps text from a parsed Serebii page.

    Takes the ninth ``fooinfo`` cell of the page's second ``dextable`` table,
    keeps the part of its text after the last tab character and removes the
    thousands separators (commas).
    """
    info_cells = soup.find_all("table", class_="dextable")[1].find_all("td", class_="fooinfo")
    last_field = info_cells[8].text.split("\t")[-1]
    return last_field.replace(",", "")

In [32]:
def get_base_stats(soup):
    """Return the six base stats scraped from a parsed Serebii page.

    Every ``tr`` whose text (newlines removed) is exactly ``Stats`` is
    treated as the header of a stats table. The row two positions below such
    a header is expected to hold a label cell followed by the six stat
    values. The first header whose value row is wide enough is used, so a
    truncated first table no longer prevents a later, complete one from
    being read.

    Returns:
        dict: ``HP``, ``Attack``, ``Defense``, ``Sp. Attack``,
        ``Sp. Defense`` and ``Speed`` mapped to the cell text (strings).

    Raises:
        IndexError: if no ``Stats`` header is followed by a complete row.
    """
    stat_names = ["HP", "Attack", "Defense", "Sp. Attack", "Sp. Defense", "Speed"]

    # Collect the rows once. enumerate() yields each row's true position,
    # whereas list.index(row) returns the first row that compares *equal*,
    # which is wrong when two header rows have identical content.
    rows = soup.find_all("tr")
    stats_indices = [
        position
        for position, row in enumerate(rows)
        if row.text.replace('\n', '') == 'Stats'
    ]

    for header_position in stats_indices:
        values_position = header_position + 2
        if values_position >= len(rows):
            continue
        columns = rows[values_position].find_all("td")
        # columns[0] is the label cell, so six stats need seven cells.
        if len(columns) > len(stat_names):
            return {
                stat: columns[offset + 1].text
                for offset, stat in enumerate(stat_names)
            }

    raise IndexError("no 'Stats' table with six base-stat values found")

In [33]:
# Status categories, paired positionally with the `trainer` tables found on
# Serebii's legendary page. The order also decides which category wins when a
# name appears in more than one list.
_LEGENDARY_CATEGORIES = ("sublegendary", "legendary", "mythical")

# Scraped {category: [names]} mapping, filled on first use so the page is
# downloaded once instead of once per pokemon.
_legendary_lists_cache = None


def _scrape_legendary_lists():
    """Download Serebii's legendary page and return ``{category: [names]}``."""
    url = "https://www.serebii.net/pokemon/legendary.shtml"
    r = requests.get(url)
    soup = BeautifulSoup(r.text, "html.parser")
    tables = soup.find_all("table", class_="trainer")

    # Start with every category present so later lookups cannot KeyError when
    # the page has fewer tables; zip() ignores any extra tables.
    status_lists = {category: [] for category in _LEGENDARY_CATEGORIES}
    for category, table in zip(_LEGENDARY_CATEGORIES, tables):
        # The first and last direct rows of each table are skipped.
        for row in table.find_all("tr", recursive=False)[1:-1]:
            for column in row.find_all("td", recursive=False):
                status_lists[category].append(column.find_all('td')[1].text)

    return status_lists


def get_legendary_status(name, status_lists=None):
    """Return 0/1 flags telling which legendary category ``name`` belongs to.

    Args:
        name: Pokémon name, compared exactly against the scraped names.
        status_lists: optional ``{category: iterable of names}`` mapping to
            look the name up in. When omitted, the lists are scraped from
            Serebii on the first call and reused afterwards.

    Returns:
        dict: ``sublegendary``, ``legendary`` and ``mythical`` mapped to 0 or
        1. At most one flag is set; categories are checked in that order.
    """
    global _legendary_lists_cache

    if status_lists is None:
        if _legendary_lists_cache is None:
            scraped = _scrape_legendary_lists()
            # Do not cache an all-empty scrape, so a later call can retry.
            if any(scraped.values()):
                _legendary_lists_cache = scraped
            status_lists = scraped
        else:
            status_lists = _legendary_lists_cache

    legendary_status = {category: 0 for category in _LEGENDARY_CATEGORIES}
    for category in _LEGENDARY_CATEGORIES:
        if name in status_lists.get(category, ()):
            legendary_status[category] = 1
            break

    return legendary_status

In [34]:
def get_experience_growth(soup):
    """Return the experience-growth figure from a parsed Serebii page.

    Navigates to the fourth direct row of the page's third ``dextable``
    table, then to that row's own fourth direct ``tr``. The first child of
    its first ``td`` is split on spaces; the first token is returned with
    commas removed.
    """
    third_table = soup.find_all("table", class_="dextable")[2]
    section = third_table.find_all("tr", recursive=False)[3]
    growth_row = section.find_all("tr", recursive=False)[3]
    leading_text = growth_row.td.contents[0]
    first_token = leading_text.split(" ")[0]
    return first_token.replace(",", "")

In [35]:
def get_base_happiness(soup):
    """Return the base happiness text from a parsed Serebii page.

    Navigates to the fourth direct row of the page's third ``dextable``
    table, then to that row's own fourth direct ``tr``, and returns the text
    of its second direct ``td``.
    """
    third_table = soup.find_all("table", class_="dextable")[2]
    section = third_table.find_all("tr", recursive=False)[3]
    values_row = section.find_all("tr", recursive=False)[3]
    happiness_cell = values_row.find_all("td", recursive=False)[1]
    return happiness_cell.text

In [36]:
def get_against(soup):
    """Return the damage multipliers listed in the page's fourth ``dextable``.

    The second direct row of that table supplies the keys: one per link,
    taken from the file name of its ``href`` without the extension. The third
    direct row supplies the values: for the i-th key, the text of the i-th
    ``td`` after its last ``*``.
    """
    matchup_rows = soup.find_all("table", class_="dextable")[3].find_all("tr", recursive=False)

    # Keys come from the link targets in the header row.
    type_names = [
        anchor['href'].split("/")[-1].split(".")[0]
        for anchor in matchup_rows[1].find_all("a")
    ]

    # Values are matched to the keys by position.
    value_cells = matchup_rows[2].find_all('td')
    against_dict = {}
    for position, type_name in enumerate(type_names):
        against_dict[type_name] = value_cells[position].text.split('*')[-1]

    return against_dict

In [51]:
def get_abilities(soup):
    """Return the list of abilities from a parsed Serebii page.

    Reads the first row of the page's third ``dextable`` table, takes the
    text between the first and second ``": "`` separators, strips it and
    splits it on ``" - "``.
    """
    third_table = soup.find_all("table", class_="dextable")[2]
    heading_text = third_table.find_all('tr')[0].text
    ability_text = heading_text.split(": ")[1].strip()
    return ability_text.split(" - ")

In [65]:
def get_gen(number):
    """Return the generation, as a Roman numeral string, for a Pokédex number.

    ``number`` may be anything ``int()`` accepts. Numbers up to 151 map to
    'I', up to 251 to 'II', up to 386 to 'III', up to 493 to 'IV', up to 649
    to 'V', up to 721 to 'VI', up to 809 to 'VII', and anything larger to
    'VIII'.
    """
    dex_number = int(number)

    # (last Pokédex number of the generation, numeral), in ascending order.
    generation_limits = (
        (151, 'I'),
        (251, 'II'),
        (386, 'III'),
        (493, 'IV'),
        (649, 'V'),
        (721, 'VI'),
        (809, 'VII'),
    )
    for last_number, numeral in generation_limits:
        if dex_number <= last_number:
            return numeral

    return 'VIII'

In [66]:
# Scrape one Serebii page per pokemon into a dict and build the frame once at
# the end. DataFrame.append was removed in pandas 2.0 and re-copied the frame
# on every iteration.
records = []
for position, pokemon in enumerate(pokemon_names, start=1):
    stats = {}

    try:
        url = "https://www.serebii.net/pokedex-swsh/{}/#stats"
        # The timeout prevents one stalled request from hanging the scrape.
        r = requests.get(url.format(pokemon), timeout=30)
        soup = BeautifulSoup(r.text, "html.parser")
        # Probe for the info table: a page without it raises IndexError.
        soup.find_all("table", class_="dextable")[1]
    except (IndexError, requests.RequestException):
        # No usable pokedex-swsh page: fall back to the pokedex-sm page, which
        # is addressed by the zero-padded position of the name in the list.
        url = "https://www.serebii.net/pokedex-sm/{}.shtml"
        number = "{0:0=3d}".format(position)
        r = requests.get(url.format(number), timeout=30)
        soup = BeautifulSoup(r.text, "html.parser")

    # Run each multi-value scraper once per page instead of once per field.
    types = get_type(soup)
    gender = get_gender(soup)
    has_gender = len(gender) > 1
    base_stats = get_base_stats(soup)

    stats['national_number'] = get_number(soup)
    stats['gen'] = get_gen(stats['national_number'])
    stats['english_name'] = get_name(soup)
    stats['japanese_name'] = get_altnames(soup)['Japan'][0]
    stats['primary_type'] = types[0]
    stats['secondary_type'] = types[1] if len(types) > 1 else None
    stats['percent_male'] = gender['Male'].replace('%', '') if has_gender else None
    stats['percent_female'] = gender['Female'].replace('%', '') if has_gender else None
    stats['classification'] = get_classification(soup)
    stats['height_m'] = get_height(soup).split('m')[0]
    stats['weight_kg'] = get_weight(soup).split('kg')[0]
    stats['capture_rate'] = get_capture_rate(soup)
    stats['base_egg_steps'] = get_base_egg_steps(soup)
    stats['hp'] = base_stats["HP"]
    stats['attack'] = base_stats["Attack"]
    stats['defense'] = base_stats["Defense"]
    stats['sp_attack'] = base_stats["Sp. Attack"]
    stats['sp_defense'] = base_stats["Sp. Defense"]
    stats['speed'] = base_stats["Speed"]
    stats['abilities'] = get_abilities(soup)

    # NOTE: `legend` and `against` are read again by the column-selection
    # cell further down, so these names must stay bound after the loop.
    legend = get_legendary_status(stats['english_name'])
    for status, flag in legend.items():
        stats['is_{}'.format(status)] = flag

    against = get_against(soup)
    for type_name, multiplier in against.items():
        stats['against_{}'.format(type_name)] = multiplier

    records.append(stats)

pokedex = pd.DataFrame(records)

In [67]:
# Output column order: the fixed descriptive columns first, then one column
# per type-matchup key, then one per legendary-status flag. `against` and
# `legend` are the dicts left over from the last iteration of the scrape loop.
cols = [
    'national_number', 'gen', 'english_name', 'japanese_name',
    'primary_type', 'secondary_type', 'classification',
    'percent_male', 'percent_female',
    'height_m', 'weight_kg',
    'capture_rate', 'base_egg_steps',
    'hp', 'attack', 'defense', 'sp_attack', 'sp_defense', 'speed',
    'abilities',
]

cols.extend('against_{}'.format(type_name) for type_name in against.keys())
cols.extend('is_{}'.format(status) for status in legend.keys())

In [68]:
# Write the selected columns to disk without the index, encoded as UTF-16.
pokedex[cols].to_csv(
    'pokemon.csv',
    index=False,
    encoding='utf-16',
)