In [1]:
import requests
import bs4
from bs4 import BeautifulSoup
import re
import pandas as pd


In [2]:
"""
get generation 2 spawn tables
"""

def get_spawn_tables(loc, timeout=30):
    """
    scrape the generation 2 spawn tables from a single location page

    params:
    -------
        loc     - url of the location page to search through
        timeout - seconds to wait on the server before requests gives up
                  (without it a stalled connection blocks the notebook forever)
    rets:
    -----
        res - list of tuples (location name, catch method, data table tag);
              empty list when the page has no <h2 id="gen2"> header
    """

    loc_page = requests.get(loc, timeout=timeout)
    loc_soup = BeautifulSoup(loc_page.content, 'html.parser')

    start = loc_soup.find('h2', {'id': 'gen2'})
    if start is None:
        return []

    # the generation 2 section is every sibling between its <h2> and the next <h2>
    section = []
    for elem in start.next_siblings:
        if elem.name == 'h2':
            break
        section.append(str(elem))

    soup          = BeautifulSoup(''.join(section), 'html.parser')
    tables        = soup.find_all('table', {'class': 'data-table'})
    catch_methods = soup.find_all('h3')

    # last path segment of the url, e.g. '.../location/kanto-route-1' -> 'kanto-route-1'
    # (replaces a fixed-offset slice that only worked for one exact url prefix)
    loc_name = loc.rstrip('/').rsplit('/', 1)[-1]

    # tables and <h3> headers are paired by position; zip stops at the shorter
    # list instead of raising IndexError when a page has fewer headers than tables
    return [(loc_name, header.text, table) for header, table in zip(catch_methods, tables)]
                                                         


In [3]:
"""
parse spawn table row
"""
def parse_spawn_table_row(row):
    """
    pull the pokemon name and its time-of-day availability out of one table row

    params:
    -------
        row - table row tag to parse
    rets:
    -----
        res - tuple of the form (pokemon name, list of times available)
    """

    slots = ['morning', 'day', 'night']

    # the name is whatever follows the first 9 characters of the infocard link's href
    info_span = row.find('td').find('span', {'class': 'infocard-data'})
    link      = info_span.find('a')
    name      = link['href'][9:]

    # a child's position inside the 'cell-fixed' cell selects the time slot;
    # only <img> children mark that slot as available
    time_cell    = row.find('td', {'class': 'cell-fixed'})
    availability = [
        slots[position]
        for position, child in enumerate(time_cell.contents)
        if child.name == 'img'
    ]

    return (name, availability)
    


In [4]:
"""
parse a single spawn table
"""

def parse_spawn_table(table):
    """
    turn one scraped spawn table into a list of per-pokemon rows

    params:
    -------
        table - table to parse. tuple (route, catch method, data table)
    rets:
    -----
        res - 2d matrix where each row is of form
              [pokemon name | route | times available | catch method]
    """

    route, catch_method, data_table = table[0], table[1], table[2]

    # the first <tr> is skipped (treated as the header row)
    parsed_rows = [parse_spawn_table_row(tr) for tr in data_table.find_all('tr')[1:]]

    # every pokemon in this table shares the table's route and catch method
    return [
        [name, route, availability, catch_method]
        for name, availability in parsed_rows
    ]

In [5]:
"""
parse all spawn tables
"""

def parse_spawn_tables(spawn_tables=None):
    """
    parse every scraped spawn table into one flat list of rows

    params:
    -------
        spawn_tables - iterable of (route, catch method, data table) tuples;
                       when omitted, falls back to the global var 'tables'
    rets:
    ----
         res - 2d matrix where each row is of form [pokemon name | routes available | times available | catch method]
    """

    if spawn_tables is None:
        # notebook-global fallback keeps existing zero-argument calls working
        spawn_tables = tables

    res = []

    for table in spawn_tables:
        res.extend(parse_spawn_table(table))

    return res

In [6]:
"""
union duplicate pokemon data
"""

def union_pokemon_guide(guide=None):
    """
    merge all rows that belong to the same pokemon into a single row

    params:
    -------
        guide - iterable of rows [pokemon name | route | times available | catch method];
                when omitted, falls back to the global var 'pokemon_location_guide'
    rets:
    -----
        res - one row per pokemon, ordered by name, of the form
              [name | sorted unique routes | [(time, count), ...] | sorted unique catch methods]
              where the counts are for 'morning', 'day' and 'night' in that order
    """

    if guide is None:
        # notebook-global fallback keeps existing zero-argument calls working
        guide = pokemon_location_guide

    time_slots = ('morning', 'day', 'night')

    # group rows by name in one pass instead of rescanning the guide per name
    rows_by_name = {}
    for row in guide:
        rows_by_name.setdefault(row[0], []).append(row)

    res = []

    for name in sorted(rows_by_name):
        rows = rows_by_name[name]

        routes = sorted({row[1] for row in rows})
        catch  = sorted({row[3] for row in rows})

        # count how many rows list each time slot; unknown labels are ignored
        counts = dict.fromkeys(time_slots, 0)
        for row in rows:
            for slot in row[2]:
                if slot in counts:
                    counts[slot] += 1

        times = [(slot, counts[slot]) for slot in time_slots]

        res.append([name, routes, times, catch])

    return res
            

In [7]:
"""
get all available pokemon names
"""

def get_names(guide=None):
    """
    list every distinct pokemon name found in a location guide

    params:
    -------
        guide - iterable of rows whose first element is the pokemon name;
                when omitted, falls back to the global var 'pokemon_location_guide'
    rets:
    -----
        res - sorted list of the unique pokemon names
    """

    if guide is None:
        # notebook-global fallback keeps existing zero-argument calls working
        guide = pokemon_location_guide

    return sorted({poke_data[0] for poke_data in guide})
    

SCRAPING BEGINS HERE

In [8]:
"""
base url
"""
base_url = 'https://pokemondb.net/location'
# time out instead of hanging forever, and fail here if the index page did not
# load, rather than silently scraping zero locations further down
page     = requests.get(base_url, timeout=30)
page.raise_for_status()
soup     = BeautifulSoup(page.content, 'html.parser')
poke_df  = pd.DataFrame(columns=['routes available', 'time of day', 'capture method'], dtype='object')

In [44]:
"""
load route urls into locs
"""
locs = []
loc  = ''

for a in soup.find_all('a', href=re.compile(r'/location/kanto')):
    # dropping the first 9 characters of the href leaves the part that is
    # appended to base_url (assumes site-relative hrefs such as
    # '/location/kanto-route-1' -- TODO confirm against the page)
    to_append = a['href'][9:]
    loc       = base_url + to_append
    # skip urls already collected: scraping a page twice would double-count
    # its spawn rows when the guide is unioned later
    if loc not in locs:
        locs.append(loc)

print(locs)
# locations counted as urban by urbanOrRural
urban = [
    'kanto-route-1', 'kanto-route-2', 'kanto-route-3', 'kanto-route-4',
    'kanto-route-5', 'kanto-route-6', 'kanto-route-7', 'kanto-route-8',
    'kanto-route-9', 'kanto-route-10', 'kanto-route-11', 'kanto-route-15',
    'kanto-route-16', 'kanto-route-18', 'kanto-route-22', 'kanto-route-24',
    'kanto-bond-bridge', 'kanto-celadon-city', 'kanto-cerulean-city', 'kanto-cinnabar-island',
    'kanto-fuchsia-city', 'kanto-kindle-road', 'kanto-pallet-town', 'kanto-pewter-city',
    'kanto-pokemon-mansion', 'kanto-pokemon-tower', 'kanto-power-plant', 'kanto-resort-gorgeous',
    'kanto-safari-zone', 'kanto-saffron-city', 'kanto-silph-co', 'kanto-three-isle-port',
    'kanto-trainer-tower', 'kanto-treasure-beach', 'kanto-vermilion-city', 'kanto-victory-road',
    'kanto-viridian-city', 'kanto-viridian-forest',
]

# locations counted as rural by urbanOrRural
rural = [
    'kanto-route-12', 'kanto-route-13', 'kanto-route-14', 'kanto-route-17',
    'kanto-route-19', 'kanto-route-20', 'kanto-route-21', 'kanto-route-23',
    'kanto-route-25', 'kanto-route-26', 'kanto-route-27', 'kanto-route-28',
    'kanto-berry-forest', 'kanto-canyon-entrance', 'kanto-cerulean-cave', 'kanto-cape-brink',
    'kanto-digletts-cave', 'kanto-five-island', 'kanto-five-isle-meadow', 'kanto-four-island',
    'kanto-green-path', 'kanto-icefall-cave', 'kanto-lost-cave', 'kanto-memorial-pillar',
    'kanto-mt-ember', 'kanto-mt-moon', 'kanto-one-island', 'kanto-outcast-island',
    'kanto-pattern-bush', 'kanto-rock-tunnel', 'kanto-ruin-valley', 'kanto-seafoam-islands',
    'kanto-seavault-canyon', 'kanto-tanoby-ruins', 'kanto-tohjo-falls', 'kanto-water-labyrinth',
    'kanto-water-path',
]

# the non-route entries of 'rural' followed by the non-route entries of 'urban'
other = [
    'kanto-berry-forest', 'kanto-canyon-entrance', 'kanto-cerulean-cave', 'kanto-cape-brink',
    'kanto-digletts-cave', 'kanto-five-island', 'kanto-five-isle-meadow', 'kanto-four-island',
    'kanto-green-path', 'kanto-icefall-cave', 'kanto-lost-cave', 'kanto-memorial-pillar',
    'kanto-mt-ember', 'kanto-mt-moon', 'kanto-one-island', 'kanto-outcast-island',
    'kanto-pattern-bush', 'kanto-rock-tunnel', 'kanto-ruin-valley', 'kanto-seafoam-islands',
    'kanto-seavault-canyon', 'kanto-tanoby-ruins', 'kanto-tohjo-falls', 'kanto-water-labyrinth',
    'kanto-water-path',
    'kanto-bond-bridge', 'kanto-celadon-city', 'kanto-cerulean-city', 'kanto-cinnabar-island',
    'kanto-fuchsia-city', 'kanto-kindle-road', 'kanto-pallet-town', 'kanto-pewter-city',
    'kanto-pokemon-mansion', 'kanto-pokemon-tower', 'kanto-power-plant', 'kanto-resort-gorgeous',
    'kanto-safari-zone', 'kanto-saffron-city', 'kanto-silph-co', 'kanto-three-isle-port',
    'kanto-trainer-tower', 'kanto-treasure-beach', 'kanto-vermilion-city', 'kanto-victory-road',
    'kanto-viridian-city', 'kanto-viridian-forest',
]
def urbanOrRural(arr):
    """
    classify a collection of location names as mostly urban or mostly rural

    params:
    -------
        arr - iterable of location names (relies on global vars 'urban' and 'rural')
    rets:
    -----
        'urban' when at least as many names appear in 'urban' as in 'rural'
        (so ties, unknown names and empty input all give 'urban'),
        otherwise 'rural'
    """
    urb = 0
    rur = 0
    # the unused 'other' tally was dropped; it never influenced the result
    for place in arr:
        if place in urban:
            urb += 1
        if place in rural:
            rur += 1
    return 'urban' if urb >= rur else 'rural'

    
def countUR(df):
    """
    tally 'urban' labels against everything else

    params:
    -------
        df - iterable of labels
    rets:
    -----
        string of the form 'Urban: <n> rural: <m>' where <n> counts the labels
        equal to 'urban' and <m> counts every other label
    """
    tally = {True: 0, False: 0}
    for label in df:
        tally[bool(label == 'urban')] += 1
    return 'Urban: ' + str(tally[True]) + " rural: " + str(tally[False])


['https://pokemondb.net/location/kanto-route-1', 'https://pokemondb.net/location/kanto-route-2', 'https://pokemondb.net/location/kanto-route-3', 'https://pokemondb.net/location/kanto-route-4', 'https://pokemondb.net/location/kanto-route-5', 'https://pokemondb.net/location/kanto-route-6', 'https://pokemondb.net/location/kanto-route-7', 'https://pokemondb.net/location/kanto-route-8', 'https://pokemondb.net/location/kanto-route-9', 'https://pokemondb.net/location/kanto-route-10', 'https://pokemondb.net/location/kanto-route-11', 'https://pokemondb.net/location/kanto-route-12', 'https://pokemondb.net/location/kanto-route-13', 'https://pokemondb.net/location/kanto-route-14', 'https://pokemondb.net/location/kanto-route-15', 'https://pokemondb.net/location/kanto-route-16', 'https://pokemondb.net/location/kanto-route-17', 'https://pokemondb.net/location/kanto-route-18', 'https://pokemondb.net/location/kanto-route-19', 'https://pokemondb.net/location/kanto-route-20', 'https://pokemondb.net/locat

In [45]:
"""
load generation 2 tables from each url
"""
tables = []

# one page fetch per location url; each call contributes zero or more
# (location, catch method, data table) tuples
for loc in locs:
    tables.extend(get_spawn_tables(loc))

In [46]:
"""
parse loaded tables and put into 2d matrix (non-numpy)
"""
# 1) flatten every scraped table into rows, 2) order the rows by pokemon name,
# 3) merge rows of the same pokemon (union_pokemon_guide reads this global)
pokemon_location_guide = parse_spawn_tables()
pokemon_location_guide.sort(key=lambda entry: entry[0])
pokemon_location_guide = union_pokemon_guide()

In [47]:
"""
flip generated 2d matrix for df generation
"""
# reverse the row order, then transpose rows into columns:
# to_df[0] = names, to_df[1] = routes, to_df[2] = times, to_df[3] = catch methods
to_df = list(zip(*pokemon_location_guide[::-1]))

In [50]:
"""
generate pandas dataframe
"""
# one column per transposed tuple produced in the previous cell
pokemon_location_guide_d = {
    'Pokemon':          to_df[0],
    'Routes Available': to_df[1],
    'Times Available':  to_df[2],
    'Capture Methods':  to_df[3],
}

# index by pokemon name, then label each pokemon from its list of routes
pokemon_location_guide_df = pd.DataFrame(pokemon_location_guide_d).set_index('Pokemon')
pokemon_location_guide_df['Urban/Rural'] = pokemon_location_guide_df['Routes Available'].apply(urbanOrRural)


In [51]:
"""
save dataframe to "~/Desktop/pokemonLocations.csv"
"""
# destination is in the current user's home directory ('~' prefix)
csv_path = '~/Desktop/pokemonLocations.csv'
pokemon_location_guide_df.to_csv(csv_path)