IMPORTS

In [39]:
import requests
import bs4
from bs4 import BeautifulSoup
import re
import pandas as pd
import numpy as np

SCRAPE POKEMON LOCATION GUIDE DATA

In [40]:
"""
get generation 2 spawn tables
"""

def get_spawn_tables(loc):
    """
    params:
    -------
        loc - url to search through 
    rets:
    -----
        res - generation 2 spawn tables scraped from loc url
    """
    
    loc_page = requests.get(loc)
    loc_soup = BeautifulSoup(loc_page.content, 'html.parser')
    
    # extract html between header tags
    start = loc_soup.find('h2', {'id': 'gen2'})
    html  = u''
    res   = []
    
    if (start is None):
        return []
    
    for elem in start.next_siblings:
        if elem.name == 'h2':
            break
        else:
            html += str(elem)

    soup   = BeautifulSoup(html, 'html.parser')
    tables = soup.find_all('table', {'class': 'data-table'})
    catch_methods = soup.find_all('h3')
    
    for i in range(len(tables)):
        res.append(tuple([loc[31:], catch_methods[i].text, tables[i]]))
        
    return res

In [41]:
"""
parse spawn table row
"""
def parse_spawn_table_row(row):
    """
    params:
    -------
        row - row to parse
    rets:
    -----
        res - tuple of the form [pokemon name | times available]
    """
    
    name         = row.find('td').find('span', {'class': 'infocard-data'}).find('a')['href'][9:]
    availability = []
    iterator     = 0
    times        = ['morning', 'day', 'night']
    
    for elem in row.find('td', {'class': 'cell-fixed'}).contents:
        if elem.name == 'img':
            availability.append(times[iterator])
            
        iterator += 1
        
    return tuple([name, availability])
    

In [42]:
"""
parse a single spawn table
"""

def parse_spawn_table(table):
    """
    params:
    -------
        table - table to parse. tuple (route, catch method, data table)
    rets:
    -----
        res - 2d matrix where each row is of form [pokemon name | routes available | times available | catch method]
    """
    
    route          = table[0]
    catch_method   = table[1]
    names          = []
    availabilities = []
    res            = []
    
    for row in table[2].find_all('tr')[1:]:
        name, availability = parse_spawn_table_row(row)
        names.append(name)
        availabilities.append(availability)
        
    for i in range(len(names)):
        res.append([names[i], route, availabilities[i], catch_method])
        
    return res

In [43]:
"""
parse all spawn tables
"""

def parse_spawn_tables():
    """
    params:
    -------
        NONE (relies on global var 'tables')
    rets:
    ----
         res - 2d matrix where each row is of form [pokemon name | routes available | times available | catch method]
    """
    
    res = []
    
    for table in tables:
        res += parse_spawn_table(table)
        
    return res

In [44]:
"""
union duplicate pokemon data
"""

def union_pokemon_guide():
    """
    params:
    -------
        NONE (relies on global var 'pokemon_location_guide')
    rets:
    -----
        res - unioned pokemon location guide (i.e. dupe pokemon have data unioned(?))
    """
    
    res   = []
    names = get_names()
    
    f_name   = ''
    f_routes = []
    f_catch  = []
    f_times  = [["morning", 0], ["day", 0], ["night", 0]]
    
    for name in names:
        
        to_union = [x for x in pokemon_location_guide if x[0] == name]
        f_name   = name
        f_routes = sorted(list(set([x[1] for x in to_union])))
        f_catch  = sorted(list(set([x[3] for x in to_union])))
        f_times  = [["morning", 0], ["day", 0], ["night", 0]]

        for data in to_union:
            for time in data[2]:
                if time == 'morning':
                    f_times[0][1] += 1
                elif time == 'day':
                    f_times[1][1] += 1
                elif time == 'night':
                    f_times[2][1] += 1
        
        for i in range(len(f_times)):
            f_times[i] = tuple(f_times[i])
                    
        res.append([f_name, f_routes, f_times, f_catch])
        
    return res
            

In [45]:
"""
get all available pokemon names
"""

def get_names():
    """
    params:
    -------
        NONE (relies on global var 'pokemon_location_guide')
    rets:
    -----
        res - all pokemon names available
    """
    
    res = []
    
    for poke_data in pokemon_location_guide:
        res.append(poke_data[0])
        
    return sorted(list(set(res)))
    

In [46]:
"""
base url
"""
base_url = 'https://pokemondb.net/location'
page     = requests.get(base_url)
soup     = BeautifulSoup(page.content, 'html.parser')
poke_df  = pd.DataFrame(columns=['routes available', 'time of day', 'capture method'], dtype='object')

In [47]:
"""
load route urls into locs
"""
locs = []
loc  = ''

for a in soup.find_all('a', href=re.compile(r'/location/kanto')):
    to_append     = a['href'][9:]
    loc  = base_url + to_append
    locs.append(loc)

In [48]:
"""
load generation 2 tables from each url
"""
tables = []

for loc in locs:
    tables += (get_spawn_tables(loc))

In [49]:
"""
parse loaded tables and put into 2d matrix (non-numpy)
"""
pokemon_location_guide = parse_spawn_tables()
pokemon_location_guide = sorted(pokemon_location_guide, key=lambda x: x[0])
pokemon_location_guide = union_pokemon_guide()

In [50]:
"""
flip generated 2d matrix for df generation
"""
to_df = list(zip(*reversed(pokemon_location_guide)))

In [51]:
"""
generate pandas dataframe
"""
pokemon_location_guide_d  = {'Pokemon': to_df[0], 'Routes Available': to_df[1], 'Times Available': to_df[2], 'Capture Methods': to_df[3]}
pokemon_location_guide_df = pd.DataFrame(pokemon_location_guide_d)
pokemon_location_guide_df.set_index('Pokemon', inplace=True)
pokemon_location_guide_df

Unnamed: 0_level_0,Capture Methods,Routes Available,Times Available
Pokemon,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
zubat,[Walking],"[kanto-route-10, kanto-route-3, kanto-route-4,...","[(morning, 1), (day, 1), (night, 7)]"
xatu,[Trade],[kanto-pewter-city],"[(morning, 1), (day, 1), (night, 1)]"
weepinbell,[Walking],"[kanto-route-24, kanto-route-25]","[(morning, 2), (day, 2), (night, 2)]"
weedle,"[Headbutt, Headbutt (Special)]","[kanto-route-26, kanto-route-27]","[(morning, 4), (day, 4), (night, 4)]"
vulpix,[Walking],"[kanto-route-7, kanto-route-8]","[(morning, 2), (day, 2), (night, 2)]"
voltorb,[Walking],[kanto-route-10],"[(morning, 1), (day, 1), (night, 1)]"
venonat,[Walking],"[kanto-route-10, kanto-route-13, kanto-route-1...","[(morning, 2), (day, 0), (night, 8)]"
venomoth,[Walking],"[kanto-route-10, kanto-route-13, kanto-route-1...","[(morning, 0), (day, 0), (night, 7)]"
ursaring,[Walking],[kanto-route-28],"[(morning, 1), (day, 1), (night, 1)]"
tentacruel,"[Super Rod, Surfing]","[kanto-cinnabar-island, kanto-pallet-town, kan...","[(morning, 12), (day, 12), (night, 12)]"


START KAGGLE DATA

In [52]:
#read in file only with certain columns that we need
fields = ['pokemonId', 'appearedHour', 'closeToWater', 'city', 'weather', 'weatherIcon', 'urban', 'suburban', 'midurban', 'rural']
df = pd.read_csv('./data/300k.csv', skipinitialspace=True, usecols=fields)

In [53]:
#convert pokemonId to pokemon name - note: farfetch'd = farfetchd, mr. mime = mrmime
pokeNames = ['', 'bulbasaur', 'ivysaur', 'venusaur', 'charmander', 'charmeleon', 'charizard', 'squirtle', 'wartortle', 'blastoise', 'caterpie', 'metapod', 'butterfree', 'weedle', 'kakuna', 'beedrill', 'pidgey', 'pidgeotto', 'pidgeot', 'rattata', 'raticate', 'spearow', 'fearow', 'ekans', 'arbok', 'pikachu', 'raichu', 'sandshrew', 'sandslash', 'nidoran', 'nidorina', 'nidoqueen', 'nidoran', 'nidorino', 'nidoking', 'clefairy', 'vulpix', 'ninetales', 'jigglypuff', 'wigglytuff', 'zubat', 'golbat', 'oddish', 'gloom', 'vileplume', 'paras', 'parasect', 'venonat', 'venemoth', 'diglett', 'dugtrio', 'meowth', 'persian', 'psyduck', 'golduck', 'mankey', 'primeape', 'growlithe', 'arcanine', 'poliwag', 'poliwhirl', 'poliwrath', 'abra', 'kadabra', 'alakazam', 'machop', 'machoke', 'machamp', 'bellsprout', 'weepinbell', 'victreebel', 'tentacool', 'tentacruel', 'geodude', 'graveler', 'golem', 'ponyta', 'rapidash', 'slowpoke', 'slowbro', 'magnemite', 'magnetron', 'farfetchd', 'doduo', 'dodrio', 'seel', 'dewgong', 'grimer', 'muk', 'shellder', 'cloyster', 'gastly', 'haunter', 'gengar', 'onix', 'drowzee', 'hypno', 'krabby', 'kingler', 'voltorb', 'electrode', 'exeggcute', 'exeggutor', 'cubone', 'marowak', 'hitmonlee', 'hitmonchan', 'lickitung', 'koffing', 'weezing', 'rhyhorn', 'rhydon', 'chansey', 'tangela', 'kangaskhan', 'horsea', 'seadra', 'goldeen', 'seaking', 'staryu', 'starmie', 'mrmime', 'scyther', 'jynx', 'electabuzz', 'magmar', 'pinsir', 'tauros', 'magikarp', 'gyarados', 'lapras', 'ditto', 'eevee', 'vaporeon', 'jolteon', 'flareon', 'porygon', 'omanyte', 'omastar', 'kabuto', 'kabutops', 'aerodactyl', 'snorlax', 'articuno', 'zapdos', 'moltres', 'dratini', 'dragonair', 'dragonite', 'mewtwo', 'mew']

def idToName(pokeId): 
    index = int(pokeId)
    return pokeNames[index]

df['pokemonName'] = df['pokemonId'].apply(idToName)

In [54]:
#remove rows with pokemon not in scrapped data
pokeList = pokemon_location_guide_df.index.values
df = df.loc[df['pokemonName'].isin(pokeList)]

In [55]:
#convert appearedHour to time of day to match scraped data
def timeConvert(hour):
    if(hour >= 4 and hour < 10):
        return 'morning'
    elif(hour >= 10 and hour < 18):
        return 'day'
    else:
        return 'night'

df['appearedTimeOfDay'] = df['appearedHour'].apply(timeConvert)

In [56]:
#dictionary of name: (# of morning appearances, day, night) 
pokeTimes = {}

for i, row in df.iterrows():
    name = row['pokemonName']
    time = row['appearedTimeOfDay']
    if name in pokeTimes:
        m, d, n = pokeTimes[name]
        if(time == 'morning'):
            pokeTimes[name] = (m+1, d, n)
        if(time == 'day'):
            pokeTimes[name] = (m, d+1, n)
        if(time == 'night'):
            pokeTimes[name] = (m, d, n+1)
    else:
        if(time == 'morning'):
            pokeTimes[name] = (1, 0, 0)
        if(time == 'day'):
            pokeTimes[name] = (0, 1, 0)
        if(time == 'night'):
            pokeTimes[name] = (0, 0, 1)   


In [87]:
#dictionary of name: time of day with most appearances
pokeTimeOfDay = {}
for pokemon, (m, d, n) in pokeTimes.items():
    if(m > d and m > n):
        pokeTimeOfDay[pokemon] = 'morning'
    elif(d > m and d > n):
        pokeTimeOfDay[pokemon] = 'day'
    elif(n > m and n > d):
        pokeTimeOfDay[pokemon] = 'night'
    elif(m == d):
        if(m == n):
            pokeTimeOfDay[pokemon] = 'morning, day, night'
        else: 
            pokeTimeOfDay[pokemon] = 'morning, day'
    elif(m == n):
        pokeTimeOfDay[pokemon] = 'morning, night'
    elif(d == n):
        pokeTimeOfDay[pokemon] = 'day, night'
    
print(pokeTimeOfDay)

{'pidgey': 'night', 'weedle': 'night', 'spearow': 'night', 'sandshrew': 'night', 'clefairy': 'night', 'rattata': 'night', 'gyarados': 'night', 'ekans': 'night', 'golbat': 'night', 'poliwhirl': 'night', 'kingler': 'night', 'pidgeotto': 'night', 'beedrill': 'night', 'kadabra': 'night', 'marowak': 'night', 'persian': 'night', 'primeape': 'night', 'caterpie': 'night', 'kakuna': 'night', 'hypno': 'night', 'weepinbell': 'night', 'magikarp': 'night', 'metapod': 'night', 'golduck': 'night', 'rapidash': 'night', 'oddish': 'night', 'tentacruel': 'night', 'gloom': 'night', 'drowzee': 'night', 'seaking': 'night', 'haunter': 'night', 'tangela': 'morning', 'poliwag': 'night', 'muk': 'night', 'exeggcute': 'night', 'abra': 'morning, day, night', 'krabby': 'day, night', 'fearow': 'night', 'dodrio': 'night', 'staryu': 'day', 'electabuzz': 'night', 'magnemite': 'day', 'raticate': 'night', 'venonat': 'night', 'butterfree': 'night', 'nidorina': 'night', 'arbok': 'night', 'nidorino': 'night', 'psyduck': 'mo

In [58]:
#number of appearances in urban vs rural
urbanAppearances = 0
ruralAppearances = 0

for i, row in df.iterrows():
    if(row['urban']):
        urbanAppearances+= 1
    if(row['rural']):
        ruralAppearances+= 1

print(str(urbanAppearances) + ", " + str(ruralAppearances))

9726366259


In [59]:
#number of appearances in urban vs rural per pokemon
#dictionary of name: (urban appearances, rural)
pokePlace = {}
for i, row in df.iterrows():
    name = row['pokemonName']
    if name in pokePlace:
        u, r = pokePlace[name]
        if(row['urban']):
            pokePlace[name] = (u+1, r)
        if(row['rural']):
            pokePlace[name] = (u, r+1)
    else:
        if(row['urban']):
            pokePlace[name] = (1, 0)
        if(row['rural']):
            pokePlace[name] = (0, 1)
            
print(pokePlace)

{'pidgey': (20801, 15626), 'weedle': (9870, 9507), 'spearow': (5592, 3286), 'sandshrew': (906, 445), 'clefairy': (1693, 966), 'rattata': (15851, 12593), 'gyarados': (3978, 2299), 'ekans': (1820, 848), 'poliwhirl': (2049, 968), 'golbat': (4956, 2472), 'kingler': (2294, 1168), 'pidgeotto': (1340, 944), 'beedrill': (67, 72), 'kadabra': (605, 371), 'persian': (762, 463), 'weepinbell': (1410, 1149), 'primeape': (852, 447), 'metapod': (241, 187), 'caterpie': (3824, 3115), 'golduck': (1956, 1121), 'oddish': (512, 234), 'tentacruel': (506, 407), 'drowzee': (107, 42), 'magikarp': (342, 145), 'hypno': (6022, 2040), 'kakuna': (661, 594), 'gloom': (1358, 1237), 'tangela': (5, 6), 'marowak': (413, 230), 'seaking': (2053, 956), 'haunter': (1273, 554), 'poliwag': (17, 8), 'abra': (1, 8), 'rapidash': (516, 380), 'fearow': (178, 107), 'staryu': (70, 42), 'magnemite': (29, 15), 'raticate': (502, 373), 'venonat': (90, 70), 'dodrio': (285, 128), 'electabuzz': (510, 109), 'krabby': (194, 57), 'nidorino': (

In [63]:
#pokemon that appear more often in urban/rural
urbanPokemon = []
ruralPokemon = []

for pokemon, (u, r) in pokePlace.items():
    if(u > r):
        urbanPokemon.append(pokemon)
    elif(r > u):
        ruralPokemon.append(pokemon)

print(str(urbanPokemon) + '\n\n' + str(ruralPokemon))

['pidgey', 'weedle', 'spearow', 'sandshrew', 'clefairy', 'rattata', 'gyarados', 'ekans', 'poliwhirl', 'golbat', 'kingler', 'pidgeotto', 'kadabra', 'persian', 'weepinbell', 'primeape', 'metapod', 'caterpie', 'golduck', 'oddish', 'tentacruel', 'drowzee', 'magikarp', 'hypno', 'kakuna', 'gloom', 'marowak', 'seaking', 'haunter', 'poliwag', 'rapidash', 'fearow', 'staryu', 'magnemite', 'raticate', 'venonat', 'dodrio', 'electabuzz', 'krabby', 'nidorino', 'psyduck', 'growlithe', 'nidorina', 'muk', 'mankey', 'sandslash', 'arbok', 'vulpix', 'meowth', 'voltorb', 'shellder', 'slowpoke', 'goldeen', 'butterfree', 'grimer', 'chansey', 'zubat', 'aerodactyl']

['beedrill', 'tangela', 'abra', 'exeggcute', 'tentacool', 'ponyta', 'jigglypuff']


In [64]:
#total appearances of each pokemon
pokeAppearances = {}
for i, row in df.iterrows():
    name = row['pokemonName']
    if name in pokeAppearances:
        pokeAppearances[name] = pokeAppearances[name] + 1
    else:
        pokeAppearances[name] = 1

In [65]:
#sorted by least to most appearances
pokeAppearancesSorted = sorted(pokeAppearances.items(), key=lambda x: x[1])
print(pokeAppearancesSorted)

[('doduo', 1), ('shellder', 5), ('aerodactyl', 5), ('jigglypuff', 8), ('abra', 12), ('tentacool', 19), ('ponyta', 20), ('tangela', 22), ('meowth', 25), ('zubat', 25), ('grimer', 28), ('psyduck', 29), ('poliwag', 32), ('exeggcute', 34), ('slowpoke', 41), ('chansey', 43), ('sandslash', 56), ('magnemite', 59), ('growlithe', 60), ('vulpix', 64), ('goldeen', 84), ('butterfree', 95), ('mankey', 131), ('muk', 138), ('arbok', 146), ('staryu', 152), ('voltorb', 153), ('beedrill', 199), ('drowzee', 205), ('nidorina', 205), ('venonat', 227), ('nidorino', 272), ('krabby', 318), ('fearow', 393), ('dodrio', 551), ('metapod', 596), ('magikarp', 670), ('electabuzz', 755), ('marowak', 930), ('oddish', 1035), ('tentacruel', 1153), ('rapidash', 1210), ('raticate', 1233), ('kadabra', 1360), ('persian', 1757), ('kakuna', 1807), ('primeape', 1912), ('sandshrew', 2025), ('haunter', 2419), ('pidgeotto', 3290), ('weepinbell', 3468), ('clefairy', 3565), ('gloom', 3603), ('seaking', 3842), ('poliwhirl', 3897), (

In [88]:
#go through scraped data and see what time of day each pokemon appears in most often
scrapedPokeTime = {}
for i, row in pokemon_location_guide_df.iterrows():
    temp = row['Times Available']
    m = temp[0]
    d = temp[1]
    n = temp[2]
    if(m[1] > d[1] and m[1] > n[1]):
        scrapedPokeTime[i] = 'morning'
    elif(d[1] > m[1] and d[1] > n[1]):
        scrapedPokeTime[i] = 'day'
    elif(n[1] > m[1] and n[1] > d[1]):
        scrapedPokeTime[i] = 'night'
    elif(m[1] == d[1]):
        if(m[1] == n[1]):
            scrapedPokeTime[i] = 'morning, day, night'
        else: 
            scrapedPokeTime[i] = 'morning, day'
    elif(m[1] == n[1]):
        scrapedPokeTime[i] = 'morning, night'
    elif(d[1] == n[1]):
        scrapedPokeTime[i] = 'day, night'
        
print(scrapedPokeTime)

{'zubat': 'night', 'xatu': 'morning, day, night', 'weepinbell': 'morning, day, night', 'weedle': 'morning, day, night', 'vulpix': 'morning, day, night', 'voltorb': 'morning, day, night', 'venonat': 'night', 'venomoth': 'night', 'ursaring': 'morning, day, night', 'tentacruel': 'morning, day, night', 'tentacool': 'morning, day, night', 'tangela': 'morning, day, night', 'sunkern': 'day', 'staryu': 'night', 'spearow': 'morning, day', 'snubbull': 'morning, day', 'sneasel': 'night', 'slugma': 'day', 'slowpoke': 'morning, day, night', 'skiploom': 'morning', 'shellder': 'morning, day, night', 'sentret': 'morning, day', 'seaking': 'morning, day, night', 'sandslash': 'morning, day, night', 'sandshrew': 'morning, day', 'rattata': 'morning', 'raticate': 'morning, day, night', 'rapidash': 'morning, day', 'qwilfish': 'morning, day, night', 'quagsire': 'night', 'psyduck': 'night', 'primeape': 'morning, day, night', 'ponyta': 'morning, day, night', 'poliwhirl': 'night', 'poliwag': 'night', 'pineco': '

In [89]:
#compare pogo and scraped time of days
sameTimes = 0
for pokemon, time in pokeTimeOfDay.items():
    scrapedTime = scrapedPokeTime[pokemon]
    if(time == scrapedTime):
        sameTimes+= 1

print(sameTimes)

14


In [91]:
urban=['kanto-route-1','kanto-route-2', 'kanto-route-3','kanto-route-4','kanto-route-5','kanto-route-6','kanto-route-7','kanto-route-8','kanto-route-9','kanto-route-10','kanto-route-11','kanto-route-15','kanto-route-16','kanto-route-18','kanto-route-22','kanto-route-24', 'kanto-bond-bridge', 'kanto-celadon-city', 'kanto-cerulean-city', 'kanto-cinnabar-island', 'kanto-fuchsia-city', 'kanto-kindle-road', 'kanto-pallet-town', 'kanto-pewter-city', 'kanto-pokemon-mansion', 'kanto-pokemon-tower', 'kanto-power-plant', 'kanto-resort-gorgeous', 'kanto-safari-zone', 'kanto-saffron-city', 'kanto-silph-co', 'kanto-three-isle-port', 'kanto-trainer-tower', 'kanto-treasure-beach', 'kanto-vermilion-city', 'kanto-victory-road', 'kanto-viridian-city', 'kanto-viridian-forest']
rural=['kanto-route-12','kanto-route-13','kanto-route-14','kanto-route-17','kanto-route-19','kanto-route-20','kanto-route-21','kanto-route-23','kanto-route-25','kanto-route-26','kanto-route-27','kanto-route-28', 'kanto-berry-forest', 'kanto-canyon-entrance', 'kanto-cerulean-cave', 'kanto-cape-brink', 'kanto-digletts-cave', 'kanto-five-island', 'kanto-five-isle-meadow', 'kanto-four-island', 'kanto-green-path', 'kanto-icefall-cave', 'kanto-lost-cave', 'kanto-memorial-pillar', 'kanto-mt-ember', 'kanto-mt-moon', 'kanto-one-island', 'kanto-outcast-island', 'kanto-pattern-bush', 'kanto-rock-tunnel', 'kanto-ruin-valley', 'kanto-seafoam-islands', 'kanto-seavault-canyon', 'kanto-tanoby-ruins', 'kanto-tohjo-falls', 'kanto-water-labyrinth', 'kanto-water-path']
def urbanOrRural(arr):
    urb=0
    rur=0
    for i in arr:
        if i in urban:
            urb+=1
        if i in rural:
            rur+=1
    if urb>=rur:
        return 'urban' 
    else:
        return 'rural'
    
pokemon_location_guide_df['Urban/Rural']=pokemon_location_guide_df['Routes Available'].apply(urbanOrRural)

In [96]:
#compare pogo and scraped urban/rural pokemon
urbanMatch = 0
ruralMatch = 0
for i, row in pokemon_location_guide_df.iterrows():
    ur = row['Urban/Rural']
    if(ur == 'urban' and i in urbanPokemon):
        urbanMatch+=1
    elif(ur == 'rural' and i in ruralPokemon):
        ruralMatch+=1
        
print(str(urbanMatch) + ', ' + str(ruralMatch))

39, 5
