IMPORTS

In [1]:
import requests
import bs4
from bs4 import BeautifulSoup
import re
import pandas as pd
import numpy as np

SCRAPE POKEMON LOCATION GUIDE DATA

In [2]:
"""
get generation 2 spawn tables
"""

def get_spawn_tables(loc):
    """
    params:
    -------
        loc - url to search through 
    rets:
    -----
        res - generation 2 spawn tables scraped from loc url
    """
    
    loc_page = requests.get(loc)
    loc_soup = BeautifulSoup(loc_page.content, 'html.parser')
    
    # extract html between header tags
    start = loc_soup.find('h2', {'id': 'gen2'})
    html  = u''
    res   = []
    
    if (start is None):
        return []
    
    for elem in start.next_siblings:
        if elem.name == 'h2':
            break
        else:
            html += str(elem)

    soup   = BeautifulSoup(html, 'html.parser')
    tables = soup.find_all('table', {'class': 'data-table'})
    catch_methods = soup.find_all('h3')
    
    for i in range(len(tables)):
        res.append(tuple([loc[31:], catch_methods[i].text, tables[i]]))
        
    return res

In [3]:
"""
parse spawn table row
"""
def parse_spawn_table_row(row):
    """
    params:
    -------
        row - row to parse
    rets:
    -----
        res - tuple of the form [pokemon name | times available]
    """
    
    name         = row.find('td').find('span', {'class': 'infocard-data'}).find('a')['href'][9:]
    availability = []
    iterator     = 0
    times        = ['morning', 'day', 'night']
    
    for elem in row.find('td', {'class': 'cell-fixed'}).contents:
        if elem.name == 'img':
            availability.append(times[iterator])
            
        iterator += 1
        
    return tuple([name, availability])
    

In [4]:
"""
parse a single spawn table
"""

def parse_spawn_table(table):
    """
    params:
    -------
        table - table to parse. tuple (route, catch method, data table)
    rets:
    -----
        res - 2d matrix where each row is of form [pokemon name | routes available | times available | catch method]
    """
    
    route          = table[0]
    catch_method   = table[1]
    names          = []
    availabilities = []
    res            = []
    
    for row in table[2].find_all('tr')[1:]:
        name, availability = parse_spawn_table_row(row)
        names.append(name)
        availabilities.append(availability)
        
    for i in range(len(names)):
        res.append([names[i], route, availabilities[i], catch_method])
        
    return res

In [5]:
"""
parse all spawn tables
"""

def parse_spawn_tables():
    """
    params:
    -------
        NONE (relies on global var 'tables')
    rets:
    ----
         res - 2d matrix where each row is of form [pokemon name | routes available | times available | catch method]
    """
    
    res = []
    
    for table in tables:
        res += parse_spawn_table(table)
        
    return res

In [6]:
"""
union duplicate pokemon data
"""

def union_pokemon_guide():
    """
    params:
    -------
        NONE (relies on global var 'pokemon_location_guide')
    rets:
    -----
        res - unioned pokemon location guide (i.e. dupe pokemon have data unioned(?))
    """
    
    res   = []
    names = get_names()
    
    f_name   = ''
    f_routes = []
    f_catch  = []
    f_times  = [["morning", 0], ["day", 0], ["night", 0]]
    
    for name in names:
        
        to_union = [x for x in pokemon_location_guide if x[0] == name]
        f_name   = name
        f_routes = sorted(list(set([x[1] for x in to_union])))
        f_catch  = sorted(list(set([x[3] for x in to_union])))
        f_times  = [["morning", 0], ["day", 0], ["night", 0]]

        for data in to_union:
            for time in data[2]:
                if time == 'morning':
                    f_times[0][1] += 1
                elif time == 'day':
                    f_times[1][1] += 1
                elif time == 'night':
                    f_times[2][1] += 1
        
        for i in range(len(f_times)):
            f_times[i] = tuple(f_times[i])
                    
        res.append([f_name, f_routes, f_times, f_catch])
        
    return res
            

In [7]:
"""
get all available pokemon names
"""

def get_names():
    """
    params:
    -------
        NONE (relies on global var 'pokemon_location_guide')
    rets:
    -----
        res - all pokemon names available
    """
    
    res = []
    
    for poke_data in pokemon_location_guide:
        res.append(poke_data[0])
        
    return sorted(list(set(res)))
    

In [8]:
"""
base url
"""
base_url = 'https://pokemondb.net/location'
page     = requests.get(base_url)
soup     = BeautifulSoup(page.content, 'html.parser')
poke_df  = pd.DataFrame(columns=['routes available', 'time of day', 'capture method'], dtype='object')

In [9]:
"""
load route urls into locs
"""
locs = []
loc  = ''

for a in soup.find_all('a', href=re.compile(r'/location/kanto')):
    to_append     = a['href'][9:]
    loc  = base_url + to_append
    locs.append(loc)

In [10]:
"""
load generation 2 tables from each url
"""
tables = []

for loc in locs:
    tables += (get_spawn_tables(loc))

In [11]:
"""
parse loaded tables and put into 2d matrix (non-numpy)
"""
pokemon_location_guide = parse_spawn_tables()
pokemon_location_guide = sorted(pokemon_location_guide, key=lambda x: x[0])
pokemon_location_guide = union_pokemon_guide()

In [12]:
"""
flip generated 2d matrix for df generation
"""
to_df = list(zip(*reversed(pokemon_location_guide)))

In [13]:
"""
generate pandas dataframe
"""
pokemon_location_guide_d  = {'Pokemon': to_df[0], 'Routes Available': to_df[1], 'Times Available': to_df[2], 'Capture Methods': to_df[3]}
pokemon_location_guide_df = pd.DataFrame(pokemon_location_guide_d)
pokemon_location_guide_df.set_index('Pokemon', inplace=True)
pokemon_location_guide_df

Unnamed: 0_level_0,Capture Methods,Routes Available,Times Available
Pokemon,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
zubat,[Walking],"[kanto-route-10, kanto-route-3, kanto-route-4,...","[(morning, 1), (day, 1), (night, 7)]"
xatu,[Trade],[kanto-pewter-city],"[(morning, 1), (day, 1), (night, 1)]"
weepinbell,[Walking],"[kanto-route-24, kanto-route-25]","[(morning, 2), (day, 2), (night, 2)]"
weedle,"[Headbutt, Headbutt (Special)]","[kanto-route-26, kanto-route-27]","[(morning, 4), (day, 4), (night, 4)]"
vulpix,[Walking],"[kanto-route-7, kanto-route-8]","[(morning, 2), (day, 2), (night, 2)]"
voltorb,[Walking],[kanto-route-10],"[(morning, 1), (day, 1), (night, 1)]"
venonat,[Walking],"[kanto-route-10, kanto-route-13, kanto-route-1...","[(morning, 2), (day, 0), (night, 8)]"
venomoth,[Walking],"[kanto-route-10, kanto-route-13, kanto-route-1...","[(morning, 0), (day, 0), (night, 7)]"
ursaring,[Walking],[kanto-route-28],"[(morning, 1), (day, 1), (night, 1)]"
tentacruel,"[Super Rod, Surfing]","[kanto-cinnabar-island, kanto-pallet-town, kan...","[(morning, 12), (day, 12), (night, 12)]"


START KAGGLE DATA

In [15]:
#read in file only with certain columns that we need
fields = ['pokemonId', 'appearedHour', 'closeToWater', 'city', 'weather', 'weatherIcon', 'urban', 'suburban', 'midurban', 'rural']
df = pd.read_csv('./data/300k.csv', skipinitialspace=True, usecols=fields)

In [16]:
#convert pokemonId to pokemon name - note: farfetch'd = farfetchd, mr. mime = mrmime
pokeNames = ['', 'bulbasaur', 'ivysaur', 'venusaur', 'charmander', 'charmeleon', 'charizard', 'squirtle', 'wartortle', 'blastoise', 'caterpie', 'metapod', 'butterfree', 'weedle', 'kakuna', 'beedrill', 'pidgey', 'pidgeotto', 'pidgeot', 'rattata', 'raticate', 'spearow', 'fearow', 'ekans', 'arbok', 'pikachu', 'raichu', 'sandshrew', 'sandslash', 'nidoran', 'nidorina', 'nidoqueen', 'nidoran', 'nidorino', 'nidoking', 'clefairy', 'vulpix', 'ninetales', 'jigglypuff', 'wigglytuff', 'zubat', 'golbat', 'oddish', 'gloom', 'vileplume', 'paras', 'parasect', 'venonat', 'venemoth', 'diglett', 'dugtrio', 'meowth', 'persian', 'psyduck', 'golduck', 'mankey', 'primeape', 'growlithe', 'arcanine', 'poliwag', 'poliwhirl', 'poliwrath', 'abra', 'kadabra', 'alakazam', 'machop', 'machoke', 'machamp', 'bellsprout', 'weepinbell', 'victreebel', 'tentacool', 'tentacruel', 'geodude', 'graveler', 'golem', 'ponyta', 'rapidash', 'slowpoke', 'slowbro', 'magnemite', 'magnetron', 'farfetchd', 'doduo', 'dodrio', 'seel', 'dewgong', 'grimer', 'muk', 'shellder', 'cloyster', 'gastly', 'haunter', 'gengar', 'onix', 'drowzee', 'hypno', 'krabby', 'kingler', 'voltorb', 'electrode', 'exeggcute', 'exeggutor', 'cubone', 'marowak', 'hitmonlee', 'hitmonchan', 'lickitung', 'koffing', 'weezing', 'rhyhorn', 'rhydon', 'chansey', 'tangela', 'kangaskhan', 'horsea', 'seadra', 'goldeen', 'seaking', 'staryu', 'starmie', 'mrmime', 'scyther', 'jynx', 'electabuzz', 'magmar', 'pinsir', 'tauros', 'magikarp', 'gyarados', 'lapras', 'ditto', 'eevee', 'vaporeon', 'jolteon', 'flareon', 'porygon', 'omanyte', 'omastar', 'kabuto', 'kabutops', 'aerodactyl', 'snorlax', 'articuno', 'zapdos', 'moltres', 'dratini', 'dragonair', 'dragonite', 'mewtwo', 'mew']

def idToName(pokeId): 
    index = int(pokeId)
    return pokeNames[index]

df['pokemonName'] = df['pokemonId'].apply(idToName)

In [17]:
#convert appearedHour to time of day to match scraped data
def timeConvert(hour):
    if(hour >= 4 and hour < 10):
        return 'morning'
    elif(hour >= 10 and hour < 18):
        return 'day'
    else:
        return 'night'

df['appearedTimeOfDay'] = df['appearedHour'].apply(timeConvert)

In [18]:
#dictionary of name: (# of morning appearances, day, night) 
pokeTimes = {}

for i, row in df.iterrows():
    name = row['pokemonName']
    time = row['appearedTimeOfDay']
    if name in pokeTimes:
        m, d, n = pokeTimes[name]
        if(time == 'morning'):
            pokeTimes[name] = (m+1, d, n)
        if(time == 'day'):
            pokeTimes[name] = (m, d+1, n)
        if(time == 'night'):
            pokeTimes[name] = (m, d, n+1)
    else:
        if(time == 'morning'):
            pokeTimes[name] = (1, 0, 0)
        if(time == 'day'):
            pokeTimes[name] = (0, 1, 0)
        if(time == 'night'):
            pokeTimes[name] = (0, 0, 1)   


In [19]:
#dictionary of name: time of day with most appearances
pokeTimeOfDay = {}
for pokemon, (m, d, n) in pokeTimes.items():
    if(m > d and m > n):
        pokeTimeOfDay[pokemon] = 'morning'
    elif(d > m and d > n):
        pokeTimeOfDay[pokemon] = 'day'
    elif(n > m and n > d):
        pokeTimeOfDay[pokemon] = 'night'
    elif(m == d and m == n):
        if(m == n):
            pokeTimeOfDay[pokemon] = 'morning, day, night'
        else: 
            pokeTimeOfDay[pokemon] = 'morning, day'
    elif(m == n):
        pokeTimeOfDay[pokemon] = 'morning, night'
    elif(d == n):
        pokeTimeOfDay[pokemon] = 'day, night'
    
print(pokeTimeOfDay)

{'pidgey': 'night', 'vaporeon': 'night', 'weedle': 'night', 'spearow': 'night', 'machoke': 'night', 'sandshrew': 'night', 'clefairy': 'night', 'rattata': 'night', 'seadra': 'night', 'graveler': 'night', 'nidoran': 'night', 'gyarados': 'night', 'ekans': 'night', 'golem': 'night', 'arcanine': 'night', 'venemoth': 'night', 'parasect': 'night', 'golbat': 'night', 'poliwhirl': 'night', 'starmie': 'night', 'dragonair': 'night', 'kingler': 'night', 'pidgeotto': 'night', 'kangaskhan': 'night', 'beedrill': 'night', 'kadabra': 'night', 'marowak': 'night', 'exeggutor': 'night', 'persian': 'night', 'primeape': 'night', 'rhydon': 'night', 'caterpie': 'night', 'kakuna': 'night', 'pikachu': 'night', 'dugtrio': 'night', 'slowbro': 'night', 'hypno': 'night', 'jynx': 'morning', 'weepinbell': 'night', 'magikarp': 'night', 'tauros': 'night', 'metapod': 'night', 'golduck': 'night', 'rapidash': 'night', 'diglett': 'night', 'oddish': 'night', 'bulbasaur': 'night', 'tentacruel': 'night', 'ninetales': 'night',

In [20]:
#number of appearances in urban vs rural
urbanAppearances = 0
ruralAppearances = 0

for i, row in df.iterrows():
    if(row['urban']):
        urbanAppearances+= 1
    if(row['rural']):
        ruralAppearances+= 1

print(str(urbanAppearances) + str(ruralAppearances))

12713185775


In [21]:
#number of appearances in urban vs rural per pokemon
#dictionary of name: (urban appearances, rural)
pokePlace = {}
for i, row in df.iterrows():
    name = row['pokemonName']
    if name in pokePlace:
        u, r = pokePlace[name]
        if(row['urban']):
            pokePlace[name] = (u+1, r)
        if(row['rural']):
            pokePlace[name] = (u, r+1)
    else:
        if(row['urban']):
            pokePlace[name] = (1, 0)
        if(row['rural']):
            pokePlace[name] = (0, 1)
            
print(pokePlace)

{'pidgey': (20801, 15626), 'vaporeon': (4289, 3612), 'weedle': (9870, 9507), 'spearow': (5592, 3286), 'machoke': (310, 147), 'sandshrew': (906, 445), 'clefairy': (1693, 966), 'rattata': (15851, 12593), 'seadra': (1146, 581), 'graveler': (1103, 636), 'nidoran': (3500, 2082), 'gyarados': (3978, 2299), 'ekans': (1820, 848), 'golem': (80, 33), 'arcanine': (801, 343), 'venemoth': (3076, 2713), 'parasect': (2909, 2003), 'poliwhirl': (2049, 968), 'starmie': (1881, 849), 'dragonair': (293, 163), 'golbat': (4956, 2472), 'kingler': (2294, 1168), 'pidgeotto': (1340, 944), 'kangaskhan': (176, 62), 'beedrill': (67, 72), 'kadabra': (605, 371), 'exeggutor': (800, 496), 'persian': (762, 463), 'rhydon': (531, 307), 'pikachu': (225, 136), 'slowbro': (936, 459), 'jynx': (190, 63), 'weepinbell': (1410, 1149), 'primeape': (852, 447), 'metapod': (241, 187), 'caterpie': (3824, 3115), 'golduck': (1956, 1121), 'oddish': (512, 234), 'bulbasaur': (683, 353), 'tentacruel': (506, 407), 'ninetales': (249, 194), 'dr

In [22]:
#pokemon that appear more often in urban/rural
urbanPokemon = []
ruralPokemon = []
equalPokemon = []

for pokemon, (u, r) in pokePlace.items():
    if(u > r):
        urbanPokemon.append(pokemon)
    elif(r > u):
        ruralPokemon.append(pokemon)
    else:
        equalPokemon.append(pokemon)

print(str(urbanPokemon) + '\n\n' + str(ruralPokemon) + '\n\n' + str(equalPokemon))

['pidgey', 'vaporeon', 'weedle', 'spearow', 'machoke', 'sandshrew', 'clefairy', 'rattata', 'seadra', 'graveler', 'nidoran', 'gyarados', 'ekans', 'golem', 'arcanine', 'venemoth', 'parasect', 'poliwhirl', 'starmie', 'dragonair', 'golbat', 'kingler', 'pidgeotto', 'kangaskhan', 'kadabra', 'exeggutor', 'persian', 'rhydon', 'pikachu', 'slowbro', 'jynx', 'weepinbell', 'primeape', 'metapod', 'caterpie', 'golduck', 'oddish', 'bulbasaur', 'tentacruel', 'ninetales', 'drowzee', 'magikarp', 'tauros', 'hypno', 'vileplume', 'kakuna', 'lapras', 'gloom', 'kabutops', 'cloyster', 'seel', 'marowak', 'seaking', 'haunter', 'poliwag', 'pidgeot', 'dugtrio', 'rapidash', 'hitmonchan', 'squirtle', 'fearow', 'cubone', 'staryu', 'weezing', 'diglett', 'wigglytuff', 'geodude', 'farfetchd', 'articuno', 'magnemite', 'raticate', 'venonat', 'dodrio', 'dewgong', 'victreebel', 'omastar', 'charmander', 'electabuzz', 'poliwrath', 'omanyte', 'krabby', 'nidorino', 'psyduck', 'wartortle', 'growlithe', 'nidorina', 'muk', 'manke

In [23]:
#total appearances of each pokemon
pokeAppearances = {}
for i, row in df.iterrows():
    name = row['pokemonName']
    if name in pokeAppearances:
        pokeAppearances[name] = pokeAppearances[name] + 1
    else:
        pokeAppearances[name] = 1

In [24]:
#sorted by least to most appearances
pokeAppearancesSorted = sorted(pokeAppearances.items(), key=lambda x: x[1])
print(pokeAppearancesSorted)

[('doduo', 1), ('shellder', 5), ('aerodactyl', 5), ('kabuto', 7), ('machop', 7), ('charizard', 8), ('jigglypuff', 8), ('lapras', 10), ('flareon', 10), ('raichu', 10), ('abra', 12), ('venusaur', 12), ('onix', 12), ('omanyte', 14), ('nidoqueen', 16), ('rhyhorn', 16), ('paras', 16), ('nidoking', 17), ('blastoise', 18), ('porygon', 18), ('tentacool', 19), ('ditto', 20), ('ponyta', 20), ('tangela', 22), ('charmeleon', 23), ('mewtwo', 24), ('meowth', 25), ('zubat', 25), ('jolteon', 26), ('grimer', 28), ('cubone', 29), ('psyduck', 29), ('hitmonlee', 29), ('gastly', 29), ('poliwag', 32), ('exeggcute', 34), ('snorlax', 34), ('horsea', 34), ('machamp', 38), ('farfetchd', 40), ('lickitung', 40), ('slowpoke', 41), ('chansey', 43), ('koffing', 46), ('dragonite', 47), ('hitmonchan', 56), ('sandslash', 56), ('mrmime', 58), ('magnemite', 59), ('growlithe', 60), ('vulpix', 64), ('articuno', 73), ('goldeen', 84), ('butterfree', 95), ('wartortle', 99), ('ivysaur', 100), ('alakazam', 107), ('magmar', 130)