# Pokedex
This notebook collects the full Pokédex dataset, covering Gen I through Gen VIII.

In [41]:
import pandas as pd
import requests
from bs4 import BeautifulSoup

We will use bulbapedia.bulbagarden.net to collect the names of all the Pokémon.

In [101]:
# Bulbapedia page listing every Pokémon by National Pokédex number; its table rows are parsed below.
url = "https://bulbapedia.bulbagarden.net/wiki/List_of_Pokémon_by_National_Pokédex_number"

In [102]:
# Download the list page and collect every table row (<tr>) on it.
r = requests.get(url)
# Fail here on an HTTP error instead of silently parsing an error page into an empty table.
r.raise_for_status()
soup = BeautifulSoup(r.text, "html.parser")
rows = soup.find_all("tr")

In [103]:
# Turn the scraped rows into a DataFrame.
# Rows with more than one <th> are header rows: they (re)define the column names.
# Rows with more than one <td> are data rows: each cell is stored under the header
# at the same position. Newlines and spaces are stripped from every header and cell.
# Records are gathered in a list and converted once, because DataFrame.append was
# removed in pandas 2.0 and copied the whole frame on every call.
records = []
headers = []
for row in rows:
    header_cells = row.find_all("th")
    if len(header_cells) > 1:
        headers = [cell.text.replace('\n', '').replace(' ', '') for cell in header_cells]
        continue

    data_cells = row.find_all("td")
    if len(data_cells) > 1:
        # headers[position] deliberately raises if a row has more cells than the
        # latest header row, rather than silently dropping the extra cells.
        records.append({
            headers[position]: cell.text.replace('\n', '').replace(' ', '')
            for position, cell in enumerate(data_cells)
        })

df = pd.DataFrame(records)

In [104]:
# Keep only the 'Ndex' and 'MS' columns and drop rows that repeat the same pair.
cols = ['Ndex', 'MS']
df = df[cols].drop_duplicates()

In [105]:
# Extract the 'MS' column (it holds the names, as the printed sample below shows) as a plain list.
pokemon_names = df['MS'].to_list()

In [106]:
# Lower-case every name; the names are later substituted into page URLs.
pokemon_names = [name.lower() for name in pokemon_names]

In [107]:
# Sanity check: show the first five scraped names.
print(pokemon_names[:5])

['bulbasaur', 'ivysaur', 'venusaur', 'charmander', 'charmeleon']


Next, we scrape the primary data source, Serebii.net, using its Gen VIII Pokédex.

In [108]:
# Serebii Gen VIII dex page template; the scraping loop fills '{}' with a lower-case Pokémon name.
url = "https://www.serebii.net/pokedex-swsh/{}/#stats"

In [357]:
def get_number(soup):
    """Return the dex number text from a parsed Serebii dex page.

    Reads the third 'fooinfo' cell in the second row of the page's second
    'dextable', takes the first line of its stripped text and returns whatever
    follows the last '#'.

    Raises IndexError when the expected tables, rows or cells are missing.
    """
    info_table = soup.find_all("table", class_="dextable")[1]
    second_row = info_table.find_all("tr")[1]
    number_cell = second_row.find_all("td", class_="fooinfo")[2]
    first_line = number_cell.text.strip().split('\n')[0]
    return first_line.rsplit("#", 1)[-1]

In [358]:
def get_name(soup):
    """Return the text of the first cell in the second row of the page's second 'dextable'.

    The scraping loop stores this value as the English name.
    Raises IndexError when the table, row or cell is missing.
    """
    info_table = soup.find_all("table", class_="dextable")[1]
    name_cell = info_table.find_all("tr")[1].find_all("td")[0]
    return name_cell.text

In [359]:
def get_altnames(soup):
    """Return the 'other names' table of a dex page as a dict.

    Each row of the table nested in the second cell of the info row becomes one
    entry: the key is the first cell's text (stripped, colons removed) and the
    value is the list of content nodes of the second cell that have the same
    type as that cell's first content node.
    """
    info_table = soup.find_all("table", class_="dextable")[1]
    names_cell = info_table.find_all("tr")[1].find_all("td")[1]

    altnames = {}
    for entry in names_cell.find_all("tr"):
        label = entry.find_all("td")[0].text.strip().replace(":", "")
        parts = entry.find_all("td")[1].contents
        # parts[0] is only evaluated while iterating, so an empty cell yields [].
        altnames[label] = [part for part in parts if isinstance(part, type(parts[0]))]

    return altnames

In [360]:
def get_type(soup):
    """Return the type names shown in the page's second 'dextable'.

    Each name is the file stem of an <img> 'src' in that table: the part after
    the last '/' and before the first '.'. Names come back in document order.
    """
    info_table = soup.find_all("table", class_="dextable")[1]
    return [
        image["src"].split("/")[-1].split(".")[0]
        for image in info_table.find_all("img")
    ]

In [361]:
def get_gender(soup):
    """Return the gender ratios listed on a parsed Serebii dex page.

    Parameters
    ----------
    soup : BeautifulSoup
        Parsed dex page.

    Returns
    -------
    dict
        Maps the first word of each label cell (the scraping loop reads 'Male'
        and 'Female') to the text of the cell next to it. The dict is empty, or
        partially filled, when the nested ratio table cannot be read; this is
        how genderless Pokémon are represented.

    Raises
    ------
    IndexError
        If the page has fewer than two 'dextable' tables. That lookup is
        intentionally outside the best-effort section.
    """
    gender_ratios = {}
    table = soup.find_all("table", class_="dextable")[1]
    try:
        gender_cell = table.find_all("tr")[1].find_all("td", class_="fooinfo")[3]
        # For gendered Pokémon the cell's first child is a nested table of ratios.
        for row in gender_cell.contents[0].find_all("tr"):
            cells = row.find_all("td")
            gender_ratios[cells[0].text.split(" ")[0]] = cells[1].text
    except Exception:
        # Best effort: genderless pages have no nested table. Catching Exception
        # rather than using a bare except lets KeyboardInterrupt/SystemExit through.
        pass

    return gender_ratios

In [362]:
def get_classification(soup):
    """Return the text of the fifth 'fooinfo' cell in the page's second 'dextable'.

    The scraping loop stores this value as the classification.
    """
    info_cells = soup.find_all("table", class_="dextable")[1].find_all("td", class_="fooinfo")
    return info_cells[4].text

In [363]:
def get_height(soup):
    """Return the height text from the page's second 'dextable'.

    Takes the sixth 'fooinfo' cell and keeps only what follows its last tab
    character. The caller later splits the result on 'm'.
    """
    info_cells = soup.find_all("table", class_="dextable")[1].find_all("td", class_="fooinfo")
    cell_text = info_cells[5].text
    return cell_text.rsplit("\t", 1)[-1]

In [364]:
def get_weight(soup):
    """Return the weight text from the page's second 'dextable'.

    Takes the seventh 'fooinfo' cell and keeps only what follows its last tab
    character. The caller later splits the result on 'kg'.
    """
    info_cells = soup.find_all("table", class_="dextable")[1].find_all("td", class_="fooinfo")
    cell_text = info_cells[6].text
    return cell_text.rsplit("\t", 1)[-1]

In [365]:
def get_capture_rate(soup):
    """Return the capture-rate text from the page's second 'dextable'.

    Takes the eighth 'fooinfo' cell and keeps only what follows its last tab
    character.
    """
    info_cells = soup.find_all("table", class_="dextable")[1].find_all("td", class_="fooinfo")
    cell_text = info_cells[7].text
    return cell_text.rsplit("\t", 1)[-1]

In [366]:
def get_base_egg_steps(soup):
    """Return the base egg steps text from the page's second 'dextable'.

    Takes the ninth 'fooinfo' cell, keeps only what follows its last tab
    character and removes thousands separators (commas).
    """
    info_cells = soup.find_all("table", class_="dextable")[1].find_all("td", class_="fooinfo")
    after_last_tab = info_cells[8].text.rsplit("\t", 1)[-1]
    return after_last_tab.replace(",", "")

In [367]:
def get_base_stats(soup):
    """Return the six base stats of a parsed Serebii dex page.

    The function finds the rows whose text (newlines removed) is exactly
    'Stats' and reads the cells of the row two positions after the first one.
    Cell 0 of that row is skipped; cells 1-6 are returned in the order HP,
    Attack, Defense, Sp. Attack, Sp. Defense, Speed.

    Parameters
    ----------
    soup : BeautifulSoup
        Parsed dex page.

    Returns
    -------
    dict
        Stat name -> cell text (strings, not numbers).

    Raises
    ------
    IndexError
        If no usable 'Stats' row exists or the value row has fewer than seven cells.
    """
    stat_names = ["HP", "Attack", "Defense", "Sp. Attack", "Sp. Defense", "Speed"]

    # Collect the page's rows once. The previous version re-ran find_all("tr")
    # and an equality-based list.index() for every matching row.
    all_rows = soup.find_all("tr")
    stats_indices = [
        position
        for position, row in enumerate(all_rows)
        if row.text.replace('\n', '') == 'Stats'
    ]

    try:
        columns = all_rows[stats_indices[0] + 2].find_all("td")
    except IndexError:
        # Fall back to the second 'Stats' heading when the first one is unusable.
        columns = all_rows[stats_indices[1] + 2].find_all("td")

    return {name: columns[offset + 1].text for offset, name in enumerate(stat_names)}

In [368]:
def get_legendary_status(name):
    """Return 0/1 flags telling which legendary category `name` belongs to.

    Looks `name` up in the module-level `status_dict` lists, checking
    'sublegendary', then 'legendary', then 'mythical'. Only the first category
    that contains the name is flagged; all other flags stay 0.
    """
    legendary_status = {"sublegendary": 0, "legendary": 0, "mythical": 0}
    for category in ("sublegendary", "legendary", "mythical"):
        if name in status_dict[category]:
            legendary_status[category] = 1
            break

    return legendary_status

In [369]:
def get_experience_growth(soup):
    """Return the experience-growth figure from the page's third 'dextable'.

    Navigates to the fourth direct <tr> of the table, then to the fourth direct
    <tr> inside it, and reads the first content node of that row's first <td>.
    The result is the first space-separated token of that node with commas
    removed.
    """
    third_table = soup.find_all("table", class_="dextable")[2]
    outer_row = third_table.find_all("tr", recursive=False)[3]
    inner_row = outer_row.find_all("tr", recursive=False)[3]
    leading_text = inner_row.td.contents[0]
    first_token = leading_text.split(" ")[0]
    return first_token.replace(",", "")

In [370]:
def get_base_happiness(soup):
    """Return the base happiness text from the page's third 'dextable'.

    Navigates to the fourth direct <tr> of the table, then to the fourth direct
    <tr> inside it, and returns the text of that row's second direct <td>.
    """
    third_table = soup.find_all("table", class_="dextable")[2]
    outer_row = third_table.find_all("tr", recursive=False)[3]
    inner_row = outer_row.find_all("tr", recursive=False)[3]
    direct_cells = inner_row.find_all("td", recursive=False)
    return direct_cells[1].text

In [371]:
def get_against(soup):
    """Return the damage-multiplier table of a dex page as a dict.

    Works on the direct rows of the page's fourth 'dextable'. Keys come from the
    links in the second row (file stem of each 'href'); values come from the
    cells of the third row at the same position, keeping only the text after
    the last '*'.
    """
    direct_rows = soup.find_all("table", class_="dextable")[3].find_all("tr", recursive=False)

    headers = [
        link['href'].split("/")[-1].split(".")[0]
        for link in direct_rows[1].find_all("a")
    ]
    columns = direct_rows[2].find_all('td')

    # Positional lookup (not zip) so a missing cell still raises IndexError.
    return {
        header: columns[position].text.split('*')[-1]
        for position, header in enumerate(headers)
    }

In [372]:
def get_abilities(soup):
    """Return the ability names listed in the first row of the page's third 'dextable'.

    The row text is split at ': ' and the second piece is stripped and split on
    ' - ', giving one list entry per ability.
    """
    ability_table = soup.find_all("table", class_="dextable")[2]
    first_row_text = ability_table.find_all('tr')[0].text
    listing = first_row_text.split(": ")[1]
    return listing.strip().split(" - ")

In [373]:
def get_gen(number):
    """Return the generation (Roman numeral string) for a national dex number.

    `number` may be an int or a numeric string; it is converted with int().
    Numbers above 809 map to 'VIII'.
    """
    # (highest dex number in the generation, generation label), ascending.
    generation_limits = (
        (151, 'I'),
        (251, 'II'),
        (386, 'III'),
        (493, 'IV'),
        (649, 'V'),
        (721, 'VI'),
        (809, 'VII'),
    )

    dex_number = int(number)
    for last_number, label in generation_limits:
        if dex_number <= last_number:
            return label

    return 'VIII'

In [387]:
def get_description(name):
    """Fetch a Pokémon's description text from its pokemon.com Pokédex page.

    `name` is first turned into a URL slug: apostrophes are removed, spaces,
    dots and colons become hyphens, and a few names get hand-written slugs
    (the two Nidoran, Mime Jr., and the 'tapu...' names, which get a hyphen
    after 'tapu'). The page is then downloaded and the first content node of
    the active paragraph inside the active 'version-descriptions' block is
    returned, stripped.

    Raises AttributeError when the page lacks the expected description block.
    """
    slug = name.replace("'", '').replace(' ', '-').replace('.', ' ').strip()
    slug = slug.replace(' ', '-').replace(':', '-')

    special_slugs = {
        'nidoran♀': 'nidoran-female',
        'nidoran♂': 'nidoran-male',
        'mimejr': 'mime-jr',
    }
    if slug in special_slugs:
        slug = special_slugs[slug]
    elif slug[:4] == 'tapu':
        slug = slug[:4] + '-' + slug[4:]

    page = requests.get('https://www.pokemon.com/us/pokedex/{}'.format(slug))
    parsed = BeautifulSoup(page.text, "html.parser")
    active_block = parsed.find('div', class_='version-descriptions active')
    active_paragraph = active_block.find('p', class_='active')

    return active_paragraph.contents[0].strip()

In [375]:
def get_evochain(soup):
    """Return the names found in the page's first 'evochain' table as a list.

    Three page layouts are handled:

    1. The table has a single row: the 'title' (or, failing that, 'alt') of
       every <img> in the table is collected.
    2. The table's first link points into 'pokedex-sm': the first seven cells
       of the first row are read; when an image has neither 'title' nor 'alt'
       the dex number is taken from the cell's link and looked up in the
       module-level `pokemon_names`.
    3. Otherwise: the first seven cells of the whole table are read; the
       fallback dex number is taken from the image's 'src' instead.

    Afterwards repeated neighbouring entries are removed at fixed positions
    (see the comments at the end of the function).

    Raises IndexError when the page has no 'evochain' table.
    """
    table = soup.find_all("table", class_="evochain")[0]
    chain = []
    # Layout 1: single-row table, take every image's title/alt.
    if len(table.find_all('tr')) == 1:
        for img in table.find_all('img'):
            try:
                a = img['title']
                chain.append(a)   
            except:
                try:
                    a = img['alt']
                    chain.append(a) 
                except:
                    # Images with neither attribute are skipped.
                    pass
    
    # Layout 2: the first link's path starts with 'pokedex-sm'.
    elif table.find('a')['href'].split('/')[1] == 'pokedex-sm':
        row = table.find('tr')
        for col in row.find_all('td')[:7]:
            try:
                a = col.find('img')['title'].strip()
            except:
                try:
                    a = col.find('img')['alt'].strip()
                except:
                    # No usable image text: derive the name from the dex number in the link.
                    number = col.find('a')['href'].split('/')[-1].split('.')[0]
                    a = pokemon_names[int(number)-1].capitalize()
            chain.append(a)
    
    # Layout 3: everything else.
    else:
        for col in table.find_all('td')[:7]:
            try:
                a = col.find('img')['title'].strip()
            except:
                try:
                    a = col.find('img')['alt'].strip()
                except:
                    try:
                        # Derive the name from the dex number in the image file name.
                        number = col.find('img')['src'].split('/')[-1].split('.')[0]
                        a = pokemon_names[int(number)-1].capitalize()
                    except:
                        # NOTE(review): when all three lookups fail, `a` keeps the value from
                        # the previous cell (or is unbound on the first cell, so the append
                        # below raises NameError) - confirm this is intended.
                        pass
                    #a = col.find('a')['href'].split('/')[-1].capitalize()
            chain.append(a)
    
    if len(chain)>1:
        # Drop the first entry when the first two entries are identical.
        if chain[0] == chain[1]:
            chain.pop(0)
    
        # When entries 2 and 3 are identical, remove positions 5 (if present), 4 and 3.
        if len(chain)>3 and chain[2] == chain[3]:
            try:
                chain.pop(5)
            except:
                pass
            chain.pop(4)
            chain.pop(3)

        # When entries 4 and 5 are identical, remove positions 6 and 5; any error
        # (for example a chain that is too short) is ignored.
        try:
            if chain[4] == chain[5]:
                chain.pop(6)
                chain.pop(5)
        except:
            pass 
    
    return chain

In [388]:
# Empty frame that will receive the scraped Pokédex.
pokedex = pd.DataFrame()

# Build the lookup used by get_legendary_status from Serebii's legendary page:
# one list of names per 'trainer' table, labelled by table position.
# NOTE(review): assumes the tables appear in the order sub-legendary, legendary,
# mythical - confirm against the live page.
url = "https://www.serebii.net/pokemon/legendary.shtml"
r = requests.get(url)
soup = BeautifulSoup(r.text, "html.parser")
tables = soup.find_all("table", class_="trainer")
status_list = ["sublegendary","legendary","mythical"]
status_dict = {}
for position, table in enumerate(tables):
    names = []
    status_dict[status_list[position]] = names
    # The first and last direct rows of each table are skipped.
    for row in table.find_all("tr", recursive=False)[1:-1]:
        for column in row.find_all("td", recursive=False):
            # The name is the text of the second nested cell of each entry.
            names.append(column.find_all('td')[1].text)
            
# Scrape one dex page per Pokémon and flatten it into a record (dict).
# Records are gathered in a list and converted to a DataFrame once at the end:
# DataFrame.append was removed in pandas 2.0 and copied the frame on every row.
# If the loop stops early, `records` still holds everything scraped so far and
# `pokemon` names the entry that failed.
records = []
for pokemon in pokemon_names:
    stats = {}

    try:
        url = "https://www.serebii.net/pokedex-swsh/{}/#stats"
        r = requests.get(url.format(pokemon))
        soup = BeautifulSoup(r.text, "html.parser")
        # Probe for the second 'dextable'; an IndexError means this is not a usable page.
        check = soup.find_all("table", class_="dextable")[1]
    except Exception:
        # Fall back to the pokedex-sm page, which is addressed by zero-padded dex number.
        url = "https://www.serebii.net/pokedex-sm/{}.shtml"
        number = "{0:0=3d}".format(pokemon_names.index(pokemon)+1)
        r = requests.get(url.format(number))
        soup = BeautifulSoup(r.text, "html.parser")

    stats['national_number'] = get_number(soup)
    stats['gen'] = get_gen(stats['national_number'])
    stats['english_name'] = get_name(soup)
    stats['japanese_name'] = get_altnames(soup)['Japan'][0]

    # Parse each section of the page once and reuse the result for every field.
    types = get_type(soup)
    stats['primary_type'] = types[0]
    stats['secondary_type'] = types[1] if len(types) > 1 else None

    genders = get_gender(soup)
    has_ratio = len(genders) > 1
    stats['percent_male'] = genders['Male'].replace('%','') if has_ratio else None
    stats['percent_female'] = genders['Female'].replace('%','') if has_ratio else None

    stats['classification'] = get_classification(soup)
    stats['height_m'] = get_height(soup).split('m')[0]
    stats['weight_kg'] = get_weight(soup).split('kg')[0]
    stats['capture_rate'] = get_capture_rate(soup)
    stats['base_egg_steps'] = get_base_egg_steps(soup)

    base_stats = get_base_stats(soup)
    stats['hp'] = base_stats["HP"]
    stats['attack'] = base_stats["Attack"]
    stats['defense'] = base_stats["Defense"]
    stats['sp_attack'] = base_stats["Sp. Attack"]
    stats['sp_defense'] = base_stats["Sp. Defense"]
    stats['speed'] = base_stats["Speed"]
    stats['description'] = get_description(pokemon)

    # Give every record all four ability keys so the columns always exist.
    for key in ('abilities_0', 'abilities_1', 'abilities_2', 'abilities_hidden'):
        stats[key] = None
    abilities = get_abilities(soup)
    for i, ability in enumerate(abilities[:3]):
        if 'Hidden' in ability:
            stats['abilities_hidden'] = ability.split('(')[0].strip()
        else:
            stats['abilities_{}'.format(i)] = ability

    # The evolution chain is best effort: a page without one yields None in every slot.
    try:
        evochain = get_evochain(soup)
    except Exception:
        evochain = []
    for i in range(7):
        stats['evochain_{}'.format(i)] = evochain[i] if i < len(evochain) else None

    # `legend` and `against` keep their names: the column-list cell further down reads their keys.
    legend = get_legendary_status(stats['english_name'])
    for i in legend.keys():
        stats['is_{}'.format(i)] = legend[i]

    against = get_against(soup)
    for i in against.keys():
        stats['against_{}'.format(i)] = against[i]

    records.append(stats)

pokedex = pd.DataFrame(records)

In [385]:
# Inspect the loop variable left by the scraping loop: the last name it processed.
pokemon

'lele'

In [389]:
# Column order for the exported CSV: fixed fields first, then the generated
# ability, type-matchup, legendary-flag and evolution-chain columns.
# `against` and `legend` are the dicts left over from the last scraping-loop pass.
cols = [
    'national_number',
    'gen',
    'english_name',
    'japanese_name',
    'primary_type',
    'secondary_type',
    'classification',
    'percent_male',
    'percent_female',
    'height_m',
    'weight_kg',
    'capture_rate',
    'base_egg_steps',
    'hp',
    'attack',
    'defense',
    'sp_attack',
    'sp_defense',
    'speed'
    ]

cols.extend('abilities_{}'.format(slot) for slot in range(3))
cols.append('abilities_hidden')
cols.extend('against_{}'.format(type_name) for type_name in against.keys())
cols.extend('is_{}'.format(status) for status in legend.keys())
cols.extend('evochain_{}'.format(slot) for slot in range(7))
cols.append('description')

In [390]:
# Write the selected columns, in the order given by `cols`, to a UTF-16 CSV without the index.
pokedex[cols].to_csv('pokemon.csv', index=False, encoding='utf-16')

Reshape the dataset into long format for the radar plot

In [4]:
# Reload the exported Pokédex (same UTF-16 encoding it was written with).
df = pd.read_csv('pokemon.csv', encoding='utf-16')

In [7]:
# Long format: one row per (Pokémon, stat) pair instead of one column per stat.
stat_columns = ['hp', 'attack', 'defense', 'sp_attack', 'sp_defense', 'speed']
df = df.melt(
    id_vars=['national_number', 'english_name'],
    value_vars=stat_columns,
    var_name='stat_name',
    value_name='stat_value',
)

In [8]:
# Save the long-format stats table as a UTF-16 CSV without the index.
df.to_csv('pokemon_stats.csv', index=False, encoding='utf-16')

Obtain small images of all 898 pokemon

In [None]:
import urllib.request

# Download the artwork for dex numbers 001-898 from pokemon.com into the
# working directory, one '<number>.png' file per Pokémon.
for dex_number in range(1, 899):
    output = "{0:0=3d}.png".format(dex_number)
    url = "https://assets.pokemon.com/assets/cms2/img/pokedex/detail/{0:0=3d}.png".format(dex_number)
    urllib.request.urlretrieve(url, output)

Obtain large images of all 898 pokemon

In [None]:
import os

cwd = os.getcwd()

# Large artwork goes into <cwd>/images. The previous version first tried the
# absolute path '/images/...' and only reached this folder through its error
# handler; the folder is now created up front and written to directly.
images_dir = os.path.join(cwd, "images")
os.makedirs(images_dir, exist_ok=True)


def _full_image_href(file_stem):
    """Return the link to the full-size image on Bulbapedia's 'File:<file_stem>.png' page.

    Raises AttributeError/TypeError when the page has no 'fullImageLink' block,
    which the caller uses as the signal to try the next file-name variant.
    """
    page = requests.get("https://bulbapedia.bulbagarden.net/wiki/File:{}.png".format(file_stem))
    parsed = BeautifulSoup(page.text, "html.parser")
    div = parsed.find("div", class_="fullImageLink")
    return div.a['href']


for i in range(len(pokemon_names)):
    dex_number = i + 1
    prefix = "{0:0=3d}".format(dex_number)
    # File names escape apostrophes and put an underscore after dots.
    escaped_name = pokemon_names[i].replace("'", "%27").replace(".", "._")
    output_number = dex_number

    # Try progressively more specific file-name variants.
    try:
        target = _full_image_href(prefix + escaped_name.capitalize())
    except Exception:
        try:
            target = _full_image_href(prefix + escaped_name.title())
        except Exception:
            if pokemon_names[i] == 'mimejr.':
                target = _full_image_href(prefix + 'Mime_Jr')
            else:
                # NOTE(review): every other failure is treated as Giratina and its
                # Origin artwork is stored under dex number + 485 - confirm intent.
                target = _full_image_href(prefix + "Giratina" + "-Origin")
                output_number = dex_number + 485

    # Image links may be protocol-relative ('//host/...'); make them absolute.
    if target[0:4] != 'http':
        target = "https:" + target

    output = os.path.join(images_dir, "{0:0=3d}.png".format(output_number))
    r = requests.get(target)
    with open(output, 'wb') as outfile:
        outfile.write(r.content)

Collect alternative images for pokemon with alt forms (e.g. regional variants, mega evolution, gigantamax)

In [20]:
# Page that lists the Mega Evolution artwork, and the site root used to resolve relative links.
url = 'https://bulbapedia.bulbagarden.net/wiki/Mega_Evolution'
base = 'https://bulbapedia.bulbagarden.net'

In [32]:
# Download and parse the page currently held in `url`.
r = requests.get(url)
# Stop on an HTTP error rather than searching an error page for image links.
r.raise_for_status()
soup = BeautifulSoup(r.text, "html.parser")

In [22]:
import os

cwd = os.getcwd()

# Save every '-Mega' image linked from the parsed page into the working directory.
for link in soup.find_all('a', class_='image'):
    href = link['href']
    if '-Mega' not in href:
        continue

    # The File: page carries the link to the full-size image.
    file_page = requests.get(base + href)
    soup2 = BeautifulSoup(file_page.text, "html.parser")
    div = soup2.find("div", class_="fullImageLink")
    target = div.a['href']
    if target[0:4] != 'http':
        target = "https:" + target

    # The file name is whatever follows the last ':' of the link ('File:<name>.png').
    output = os.path.join(cwd, href.split(':')[-1])
    try:
        urllib.request.urlretrieve(target, output)
    except Exception:
        # Fallback download. It now runs only when urlretrieve failed; previously the
        # write ran unconditionally and, after a successful download, stored the
        # File: page HTML under a name glued onto cwd without a path separator.
        r = requests.get(target)
        with open(output, 'wb') as outfile:
            outfile.write(r.content)

In [31]:
# Page that lists the Primal Reversion artwork.
url = 'https://bulbapedia.bulbagarden.net/wiki/Primal_Reversion'

In [34]:
import os

cwd = os.getcwd()

# Fetch the page held in `url` first. No cell between the `url` assignment above
# and this loop re-parses `soup`, so on a top-to-bottom run the loop would
# otherwise still be scanning the previously parsed page.
r = requests.get(url)
soup = BeautifulSoup(r.text, "html.parser")

# Save every '-Primal' image linked from the page into the working directory.
for link in soup.find_all('a', class_='image'):
    href = link['href']
    if '-Primal' not in href:
        continue

    # The File: page carries the link to the full-size image. `url` is left
    # untouched so this cell can be re-run safely.
    file_page = requests.get(base + href)
    soup2 = BeautifulSoup(file_page.text, "html.parser")
    div = soup2.find("div", class_="fullImageLink")
    target = div.a['href']
    if target[0:4] != 'http':
        target = "https:" + target

    # The file name is whatever follows the last ':' of the link ('File:<name>.png').
    output = os.path.join(cwd, href.split(':')[-1])
    try:
        urllib.request.urlretrieve(target, output)
    except Exception:
        # Fallback download. It now runs only when urlretrieve failed; previously the
        # write ran unconditionally and, after a successful download, stored the
        # File: page HTML under a name glued onto cwd without a path separator.
        r = requests.get(target)
        with open(output, 'wb') as outfile:
            outfile.write(r.content)

In [36]:
# Download and parse the list of Pokémon with form differences.
url = 'https://bulbapedia.bulbagarden.net/wiki/List_of_Pok%C3%A9mon_with_form_differences'
r = requests.get(url)
# Stop on an HTTP error rather than searching an error page for image links.
r.raise_for_status()
soup = BeautifulSoup(r.text, "html.parser")

In [40]:
import os

cwd = os.getcwd()

# Save every hyphenated '.png' image (alternate forms) linked from the parsed
# page into the working directory.
for link in soup.find_all('a', class_='image'):
    href = link['href']
    if not ('-' in href and '.png' in href):
        continue

    # The File: page carries the link to the full-size image.
    file_page = requests.get(base + href)
    soup2 = BeautifulSoup(file_page.text, "html.parser")
    div = soup2.find("div", class_="fullImageLink")
    target = div.a['href']
    if target[0:4] != 'http':
        target = "https:" + target

    # The file name is whatever follows the last ':' of the link ('File:<name>.png').
    output = os.path.join(cwd, href.split(':')[-1])
    try:
        urllib.request.urlretrieve(target, output)
    except Exception:
        # Fallback download. It now runs only when urlretrieve failed; previously the
        # write ran unconditionally and, after a successful download, stored the
        # File: page HTML under a name glued onto cwd without a path separator.
        r = requests.get(target)
        with open(output, 'wb') as outfile:
            outfile.write(r.content)

In [None]:
# Download and parse the regional forms page.
url = 'https://bulbapedia.bulbagarden.net/wiki/Regional_form'
r = requests.get(url)
# Stop on an HTTP error rather than keeping a parsed error page around.
r.raise_for_status()
soup = BeautifulSoup(r.text, "html.parser")

In [307]:
# NOTE(review): scratch check of the selector used in get_description. The recorded output
# is a Pokédex description, so `soup` presumably held a pokemon.com page when this ran,
# not the Bulbapedia page parsed in the cell above - confirm before re-running.
soup.find('div', class_='version-descriptions active').find('p', class_='active').contents[0].strip()

'There is a plant seed on its back right from the day this Pokémon is born. The seed slowly grows larger.'

In [None]:
import urllib.request

# Repeats the small-image download from earlier in the notebook: artwork for dex
# numbers 001-898 is saved to the working directory as '<number>.png'.
for dex_number in range(1, 899):
    output = "{0:0=3d}.png".format(dex_number)
    url = "https://assets.pokemon.com/assets/cms2/img/pokedex/detail/{0:0=3d}.png".format(dex_number)
    urllib.request.urlretrieve(url, output)