In [59]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

## First, grab the list of pokemon and their base stats from the web
Bulbapedia has this list

In [60]:
import urllib
import re

In [93]:
def get_matching_line_number(pattern, lines):
    has_match = False
    for i, line in enumerate(lines):
        match = re.search(pattern, line)
        if match:
            has_match = True
            break
    return i if has_match else -1

In [94]:
stats_url = (
    'https://bulbapedia.bulbagarden.net/wiki/'
    'List_of_Pok%C3%A9mon_by_base_stats_in_Generation_I'
)
page = urllib.request.urlopen(stats_url)
html_bytes = page.read()
html = html_bytes.decode('utf-8')
html_lines = html.split('\n')

In [95]:
print(get_matching_line_number(r'\)">[A-Z][^<]*<', html_lines))
print(get_matching_line_number(r'">\d+$', html_lines))
print(get_matching_line_number(r'0029', html_lines))
print(get_matching_line_number(r'0030', html_lines))

546
548
1130
1151


In [96]:
indices = range(1, 152)
bs_dict = {'index': indices, 'name': [], 'base_hp': [], 'base_att': [],
           'base_def': [], 'base_spe': [], 'base_spc': []
          }
name_search_string = r'\)">([A-Z][^<]*)<'
stat_search_string = r'">(\d+)$'

def get_name(line):
    name = ''
    match = re.search(name_search_string, line)
    if match:
        name = match.group(1)
    else:
        print('Error in get_name: '
              f'no match found for line={line}. '
              'Returning empty string')
    return name

def get_stat(line):
    stat = -1
    match = re.search(stat_search_string, line)
    if match:
        stat = int(match.group(1))
    else:
        print('Error in get_stat: '
              f'no match found for line={line}. '
              'Returning -1')
    return stat

def get_names_and_stats():
    i_line = 0
    while True:
        lines = html_lines[i_line:]
        species_line_num = get_matching_line_number(
            name_search_string, lines)
        if species_line_num < 0:
            break
        i_line += species_line_num
        name = get_name(lines[species_line_num])
        print(name)
        bs_dict['name'].append(name)
        bs_dict['base_hp'].append(get_stat(lines[species_line_num+2]))
        bs_dict['base_att'].append(get_stat(lines[species_line_num+4]))
        bs_dict['base_def'].append(get_stat(lines[species_line_num+6]))
        bs_dict['base_spe'].append(get_stat(lines[species_line_num+8]))
        bs_dict['base_spc'].append(get_stat(lines[species_line_num+10]))
        i_line += 11
        if name == 'Mew':
            break

In [97]:
get_names_and_stats()

Bulbasaur
Ivysaur
Venusaur
Charmander
Charmeleon
Charizard
Squirtle
Wartortle
Blastoise
Caterpie
Metapod
Butterfree
Weedle
Kakuna
Beedrill
Pidgey
Pidgeotto
Pidgeot
Rattata
Raticate
Spearow
Fearow
Ekans
Arbok
Pikachu
Raichu
Sandshrew
Sandslash
Nidoran♀
Nidorina
Nidoqueen
Nidoran♂
Nidorino
Nidoking
Clefairy
Clefable
Vulpix
Ninetales
Jigglypuff
Wigglytuff
Zubat
Golbat
Oddish
Gloom
Vileplume
Paras
Parasect
Venonat
Venomoth
Diglett
Dugtrio
Meowth
Persian
Psyduck
Golduck
Mankey
Primeape
Growlithe
Arcanine
Poliwag
Poliwhirl
Poliwrath
Abra
Kadabra
Alakazam
Machop
Machoke
Machamp
Bellsprout
Weepinbell
Victreebel
Tentacool
Tentacruel
Geodude
Graveler
Golem
Ponyta
Rapidash
Slowpoke
Slowbro
Magnemite
Magneton
Farfetch'd
Doduo
Dodrio
Seel
Dewgong
Grimer
Muk
Shellder
Cloyster
Gastly
Haunter
Gengar
Onix
Drowzee
Hypno
Krabby
Kingler
Voltorb
Electrode
Exeggcute
Exeggutor
Cubone
Marowak
Hitmonlee
Hitmonchan
Lickitung
Koffing
Weezing
Rhyhorn
Rhydon
Chansey
Tangela
Kangaskhan
Horsea
Seadra
Goldeen
Seaking

In [98]:
bs_df = pd.DataFrame(data=bs_dict, index=np.array(indices)-1)

In [99]:
# Fix Nidoran names -- html had symbols for F and M
bs_df.loc[28, 'name'] = 'Nidoran F'
bs_df.loc[31, 'name'] = 'Nidoran M'

In [101]:
bs_df.iloc[28]

index              29
name        Nidoran F
base_hp            55
base_att           47
base_def           52
base_spe           41
base_spc           40
Name: 28, dtype: object

In [102]:
# original method using manually copy-pasted csv file
bs_df_orig = pd.read_csv('base_stats.csv')
diffs = (bs_df_orig != bs_df).any(1)
diffs[diffs == True]
# exactly the same!!

ValueError: Can only compare identically-labeled DataFrame objects

## `bs_df` has species index, name, and base stats
Next we want to get type information. We'll pull this from the pokemondb.net stats table

In [103]:
types_url = (
    'https://pokemondb.net/pokedex/stats/gen1'
)

page = urllib.request.urlopen(types_url)
html_bytes = page.read()
html = html_bytes.decode('utf-8')

In [104]:
print(html)

<!DOCTYPE html>
<html lang="en">
<head>
	<meta charset="utf-8">
	<title>Generation 1 new Pokémon stats | Pokémon Database</title>

	<link rel="preconnect" href="https://img.pokemondb.net">
	<link rel="preconnect" href="https://s.pokemondb.net">
	<link rel="preload" href="/static/fonts/fira-sans-v17-latin-400.woff2" as="font" type="font/woff2" crossorigin>
	<link rel="preload" href="/static/fonts/fira-sans-v17-latin-400i.woff2" as="font" type="font/woff2" crossorigin>
	<link rel="preload" href="/static/fonts/fira-sans-v17-latin-600.woff2" as="font" type="font/woff2" crossorigin>
		<link rel="stylesheet" href="/static/css/pokemondb-9ad4b87c02.css">

	<meta name="viewport" content="width=device-width, initial-scale=1">

	<meta property="og:description" name="description" content="List of new Pokémon introduced in Gen 1 (Red/Blue/Yellow) along with their stats.">
	<link rel="canonical" href="https://pokemondb.net/pokedex/stats/gen1">
	<meta property="og:url" content="https://pokemondb.net/

In [105]:
html_lines = html.split('\n')

In [106]:
index_list = []
type1_list = []
type2_list = []
starting_line = 0
for i, species in enumerate(bs_df['name']):
    lines = html_lines[starting_line:]
    search_string = '">' + species + '<'
    match = None
    species_line = -1
    line_idx = 0
    while not match and (
        species_line < len(lines) - 1
    ):
        species_line += 1
        match = re.search(search_string, lines[species_line])
    if not match:
        print(f'Hit the end of the file. species={species}, starting_line={starting_line}, species_line={species_line}')
        continue
    the_line = lines[species_line]
    starting_line += species_line
    line_idx = match.span()[1]
    search_string = r'>([A-Z][a-z]+)<'
    match = re.search(search_string, the_line[line_idx:])
    type1 = match.group(1)
    type2 = 'None'
    line_idx += match.span()[1]
    match = re.search(search_string, the_line[line_idx:])
    if match:
        type2 = match.group(1)
    # print(f'Name: {species}\tType 1: {type1}\tType 2: {type2}')
    index_list.append(i+1)
    type1_list.append(type1)
    type2_list.append(type2)

Hit the end of the file. species=Nidoran F, starting_line=585, species_line=1624
Hit the end of the file. species=Nidoran M, starting_line=624, species_line=1585


Nidoran F and Nidoran M give us trouble because of the symbols in the name. Need to manually add these to the lists.

In [129]:
for i, t in enumerate(type1_list):
    print(f'i={i}, type1={t}')

i=0, type1=Grass
i=1, type1=Grass
i=2, type1=Grass
i=3, type1=Fire
i=4, type1=Fire
i=5, type1=Fire
i=6, type1=Water
i=7, type1=Water
i=8, type1=Water
i=9, type1=Bug
i=10, type1=Bug
i=11, type1=Bug
i=12, type1=Bug
i=13, type1=Bug
i=14, type1=Bug
i=15, type1=Normal
i=16, type1=Normal
i=17, type1=Normal
i=18, type1=Normal
i=19, type1=Normal
i=20, type1=Normal
i=21, type1=Normal
i=22, type1=Poison
i=23, type1=Poison
i=24, type1=Electric
i=25, type1=Electric
i=26, type1=Ground
i=27, type1=Ground
i=28, type1=Poison
i=29, type1=Poison
i=30, type1=Poison
i=31, type1=Poison
i=32, type1=Poison
i=33, type1=Poison
i=34, type1=Fairy
i=35, type1=Fairy
i=36, type1=Fire
i=37, type1=Fire
i=38, type1=Normal
i=39, type1=Normal
i=40, type1=Poison
i=41, type1=Poison
i=42, type1=Grass
i=43, type1=Grass
i=44, type1=Grass
i=45, type1=Bug
i=46, type1=Bug
i=47, type1=Bug
i=48, type1=Bug
i=49, type1=Ground
i=50, type1=Ground
i=51, type1=Normal
i=52, type1=Normal
i=53, type1=Water
i=54, type1=Water
i=55, type1=Fi

In [130]:
bs_df.iloc[34]

index             35
name        Clefairy
base_hp           70
base_att          45
base_def          48
base_spe          35
base_spc          60
Name: 34, dtype: object

In [128]:
index_list.insert(28, 29)
index_list.insert(31, 32)
type1_list.insert(28, 'Poison')
type1_list.insert(31, 'Poison')
type2_list.insert(28, 'None')
type2_list.insert(31, 'None')

In [131]:
series_type1 = pd.Series(data=type1_list)
series_type2 = pd.Series(data=type2_list)

In [132]:
bst_df = bs_df.copy()
bst_df['type1'] = series_type1
bst_df['type2'] = series_type2

In [133]:
bst_df

Unnamed: 0,index,name,base_hp,base_att,base_def,base_spe,base_spc,type1,type2
0,1,Bulbasaur,45,49,49,45,65,Grass,Poison
1,2,Ivysaur,60,62,63,60,80,Grass,Poison
2,3,Venusaur,80,82,83,80,100,Grass,Poison
3,4,Charmander,39,52,43,65,50,Fire,
4,5,Charmeleon,58,64,58,80,65,Fire,
...,...,...,...,...,...,...,...,...,...
146,147,Dratini,41,64,45,50,50,Dragon,
147,148,Dragonair,61,84,65,70,70,Dragon,
148,149,Dragonite,91,134,95,80,100,Dragon,Flying
149,150,Mewtwo,106,110,90,130,154,Psychic,


## `bst_df` now contains base stats and type info
Need to add randomizer levels and move lists. Randomizer levels are listed in `Gen1RandomizerLevels.csv`.

In [134]:
rl_df = pd.read_csv('Gen1RandomizerLevels.csv')
rl_df

Unnamed: 0,name,randomizer_level
0,Abra,84
1,Aerodactyl,75
2,Alakazam,68
3,Arbok,78
4,Arcanine,75
...,...,...
146,Weepinbell,80
147,Weezing,76
148,Wigglytuff,76
149,Zapdos,68


In [135]:
bstr_df = bst_df.merge(rl_df, on='name', how='inner')
bstr_df

Unnamed: 0,index,name,base_hp,base_att,base_def,base_spe,base_spc,type1,type2,randomizer_level
0,1,Bulbasaur,45,49,49,45,65,Grass,Poison,89
1,2,Ivysaur,60,62,63,60,80,Grass,Poison,80
2,3,Venusaur,80,82,83,80,100,Grass,Poison,74
3,4,Charmander,39,52,43,65,50,Fire,,90
4,5,Charmeleon,58,64,58,80,65,Fire,,81
...,...,...,...,...,...,...,...,...,...,...
146,147,Dratini,41,64,45,50,50,Dragon,,89
147,148,Dragonair,61,84,65,70,70,Dragon,,80
148,149,Dragonite,91,134,95,80,100,Dragon,Flying,74
149,150,Mewtwo,106,110,90,130,154,Psychic,,60


## `bstr_df` now contains stats, types, and randomizer level
Last we need to include movepools. Define functions to grab this from either Bulbapedia or PokemonDB.

In [136]:
def get_html_lines(species):
    name = species.lower()
    if name == 'mr. mime':
        name = 'mr-mime'
    if name == 'nidoran f':
        name = 'nidoran-f'
    if name == 'nidoran m':
        name = 'nidoran-m'
    if name == "farfetch'd":
        name = 'farfetchd'
    url = 'https://pokemondb.net/pokedex/' + name + '/moves/1'
    print(f'Opening url for {species}')
    page = urllib.request.urlopen(url)
    print(f'Got page for {species}')
    html_bytes = page.read()
    html = html_bytes.decode('utf-8')
    html_lines = html.split('\n')
    return html_lines

In [137]:
tauros_lines = get_html_lines('Tauros')
print(tauros_lines[211])

Opening url for Tauros
Got page for Tauros
			<div class="grid-row"> <div class="grid-col span-lg-6"><h3>Moves learnt by level up</h3> <p class="text-small"><em>Tauros</em> learns the following moves in Pokémon Red &amp; Blue at the levels specified.</p> <div class="resp-scroll"><table class="data-table"><thead><tr><th class="sorting" data-sort-type="int"><div class="sortwrap">Lv.</div></th> <th class="sorting" data-sort-type="string"><div class="sortwrap">Move</div></th> <th class="sorting" data-sort-type="string"><div class="sortwrap">Type</div></th> <th class="sorting" data-sort-type="string"><div class="sortwrap">Cat.</div></th> <th class="sorting" data-sort-type="int" data-sort-default="desc" data-blanks="1"><div class="sortwrap">Power</div></th> <th class="sorting" data-sort-type="int" data-sort-default="desc" data-blanks="1"><div class="sortwrap">Acc.</div></th> </tr></thead><tbody><tr><td class="cell-num">1</td><td class="cell-name"><a class="ent-name" href="/move/tackle" title

In [138]:
def get_matching_line_number(pattern, lines):
    has_match = False
    for i, line in enumerate(lines):
        match = re.search(pattern, line)
        if match:
            has_match = True
            break
    return i if has_match else -1

In [139]:
get_matching_line_number(r'Lv\.', tauros_lines)

211

In [140]:
def get_movepool(lines):
    moves = []
    starting_line = get_matching_line_number(r'Lv\.', lines)
    pat = r'"View details for ([A-Z][^"]*)"'
    for line in lines[starting_line:]:
        match = re.search(pat, line)
        if match:
            moves.append(match.group(1))
    return set(moves)

In [141]:
tauros_moves = get_movepool(tauros_lines)
print(tauros_moves)

{'Thunderbolt', 'Mimic', 'Take Down', 'Rage', 'Leer', 'Double Team', 'Ice Beam', 'Body Slam', 'Stomp', 'Fire Blast', 'Substitute', 'Hyper Beam', 'Tail Whip', 'Blizzard', 'Skull Bash', 'Bide', 'Strength', 'Toxic', 'Earthquake', 'Fissure', 'Thunder', 'Double-Edge', 'Tackle', 'Horn Drill', 'Rest'}


In [142]:
def convert_set(set):
    ret = ''
    sorted_set = sorted(set)
    for element in sorted_set:
        ret += element
        ret += ';'
    return ret[:-1]

In [143]:
def get_movepool_series():
    species_moves = []
    for species in bstr_df['name']:
        lines = get_html_lines(species)
        move_set = get_movepool(lines)
        print(f'{species}: {len(move_set)}')
        move_str = convert_set(move_set)
        species_moves.append(move_str)
    move_series = pd.Series(data=species_moves)
    return move_series

In [147]:
move_series = get_movepool_series()

Opening url for Bulbasaur
Got page for Bulbasaur
Bulbasaur: 23
Opening url for Ivysaur
Got page for Ivysaur
Ivysaur: 23
Opening url for Venusaur
Got page for Venusaur
Venusaur: 24
Opening url for Charmander
Got page for Charmander
Charmander: 31
Opening url for Charmeleon
Got page for Charmeleon
Charmeleon: 31
Opening url for Charizard
Got page for Charizard
Charizard: 35
Opening url for Squirtle
Got page for Squirtle
Squirtle: 30
Opening url for Wartortle
Got page for Wartortle
Wartortle: 30
Opening url for Blastoise
Got page for Blastoise
Blastoise: 33
Opening url for Caterpie
Got page for Caterpie
Caterpie: 2
Opening url for Metapod
Got page for Metapod
Metapod: 3
Opening url for Butterfree
Got page for Butterfree
Butterfree: 30
Opening url for Weedle
Got page for Weedle
Weedle: 2
Opening url for Kakuna
Got page for Kakuna
Kakuna: 3
Opening url for Beedrill
Got page for Beedrill
Beedrill: 24
Opening url for Pidgey
Got page for Pidgey
Pidgey: 21
Opening url for Pidgeotto
Got page for

Got page for Aerodactyl
Aerodactyl: 22
Opening url for Snorlax
Got page for Snorlax
Snorlax: 39
Opening url for Articuno
Got page for Articuno
Articuno: 23
Opening url for Zapdos
Got page for Zapdos
Zapdos: 24
Opening url for Moltres
Got page for Moltres
Moltres: 21
Opening url for Dratini
Got page for Dratini
Dratini: 28
Opening url for Dragonair
Got page for Dragonair
Dragonair: 29
Opening url for Dragonite
Got page for Dragonite
Dragonite: 31
Opening url for Mewtwo
Got page for Mewtwo
Mewtwo: 43
Opening url for Mew
Got page for Mew
Mew: 57


In [148]:
df_all = bstr_df.copy()
df_all['movepool'] = move_series

In [149]:
df_all

Unnamed: 0,index,name,base_hp,base_att,base_def,base_spe,base_spc,type1,type2,randomizer_level,movepool
0,1,Bulbasaur,45,49,49,45,65,Grass,Poison,89,Bide;Body Slam;Cut;Double Team;Double-Edge;Gro...
1,2,Ivysaur,60,62,63,60,80,Grass,Poison,80,Bide;Body Slam;Cut;Double Team;Double-Edge;Gro...
2,3,Venusaur,80,82,83,80,100,Grass,Poison,74,Bide;Body Slam;Cut;Double Team;Double-Edge;Gro...
3,4,Charmander,39,52,43,65,50,Fire,,90,Bide;Body Slam;Counter;Cut;Dig;Double Team;Dou...
4,5,Charmeleon,58,64,58,80,65,Fire,,81,Bide;Body Slam;Counter;Cut;Dig;Double Team;Dou...
...,...,...,...,...,...,...,...,...,...,...,...
146,147,Dratini,41,64,45,50,50,Dragon,,89,Agility;Bide;Blizzard;Body Slam;BubbleBeam;Dou...
147,148,Dragonair,61,84,65,70,70,Dragon,,80,Agility;Bide;Blizzard;Body Slam;BubbleBeam;Dou...
148,149,Dragonite,91,134,95,80,100,Dragon,Flying,74,Agility;Bide;Blizzard;Body Slam;BubbleBeam;Dou...
149,150,Mewtwo,106,110,90,130,154,Psychic,,60,Amnesia;Barrier;Bide;Blizzard;Body Slam;Bubble...


## One more problem to fix: fairy type
The PokemonDB info for types includes Fairy for a few pokemon. Need to remove this

In [150]:
# change primary type to Normal
df_all.loc[df_all['type1'] == 'Fairy', 'type1'] = 'Normal'

# change secondary type to None
df_all.loc[df_all['type2'] == 'Fairy', 'type2'] = 'None'

## `df_all` now contains everything!
Save it to a file

In [151]:
df_all.to_csv('species_list.csv', index=False)

In [152]:
df_all[df_all['name'] == 'Tauros'].to_dict(orient='records')[0]

{'index': 128,
 'name': 'Tauros',
 'base_hp': 75,
 'base_att': 100,
 'base_def': 95,
 'base_spe': 110,
 'base_spc': 70,
 'type1': 'Normal',
 'type2': 'None',
 'randomizer_level': 68,
 'movepool': 'Bide;Blizzard;Body Slam;Double Team;Double-Edge;Earthquake;Fire Blast;Fissure;Horn Drill;Hyper Beam;Ice Beam;Leer;Mimic;Rage;Rest;Skull Bash;Stomp;Strength;Substitute;Tackle;Tail Whip;Take Down;Thunder;Thunderbolt;Toxic'}

## Move list modifications
There are some moves we definitely don't want to implement, so let's remove them from all Pokemons' movepools.

# Move list