# Control panel

In [None]:
# Base name of the project; also used as the prefix of every output file.
project_name = 'pokemon'
file_name = project_name

# Scrape scope
# full_dataset=True scrapes everything; when False, each loop is limited to
# the first `part_dataset` items (useful for quick test runs).
full_dataset = True
part_dataset = 5

# Output settings
# Each flag independently enables writing the datasets in that format.
save_dataset_to_csv = True
save_dataset_to_json = False

# Full Script 

In [None]:
import os
import time
import requests
import pandas as pd
from bs4 import BeautifulSoup

In [None]:
# Record when the run started so the final cell can report the total elapsed time.
script_start_time = time.time()
start_timestamp = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
print("="*12,f"Project {project_name} | Script started {start_timestamp}","="*12)

# Resolve the output directory and create it if it doesn't exist.
# When the notebook is run from a "notebooks" folder the data directory is its
# sibling ("../data"); otherwise it is "./data" under the working directory.
# os.path is used instead of manual "\\" splitting so the same code works on
# every operating system, not only Windows.
os_path = os.getcwd()

if os.path.basename(os_path) == 'notebooks':
    file_save_path = os.path.join(os.path.dirname(os_path), 'data')
else:
    file_save_path = os.path.join(os_path, 'data')

os.makedirs(file_save_path, exist_ok=True)

In [None]:
# Simple function to display the size of a DataFrame.
def df_size(dataframe_name):
    """Return a "<rows> rows, <columns> columns." summary of a dataframe's shape.

    The first two entries of ``dataframe_name.shape`` are read as the row and
    column counts respectively.
    """
    shape = dataframe_name.shape
    n_rows, n_columns = shape[0], shape[1]
    return f'{n_rows} rows, {n_columns} columns.'

## Data scraping

### Pokemon details dataset scrape
First, fetch the list of all Pokémon and remove the duplicates; then loop through each individual Pokémon page and extract the data for the details and stats datasets.

In [None]:
pokemon_list_url = "https://pokemondb.net/pokedex/all"
details_start_time = time.time()

# Download the full Pokédex listing page.
# A timeout prevents the cell from hanging forever on a stalled connection, and
# raise_for_status() stops the run on an HTTP error instead of parsing an error page.
response = requests.get(pokemon_list_url, timeout=30)
response.raise_for_status()
pokemon_soup_list = BeautifulSoup(response.text, "html.parser") # Use response.text for decoding

# Collect every <a class="ent-name"> link; dict.fromkeys() drops repeated
# entries while keeping the first-seen order.
pokemon_list = list(dict.fromkeys(pokemon_soup_list.find_all('a', class_="ent-name")))

In [None]:
# Number of Pokémon to scrape: the whole list, or only the first
# `part_dataset` entries when running a partial (test) scrape.
pokemon_scope = len(pokemon_list) if full_dataset else part_dataset

In [None]:
pokemon_details = []
pokemon_stats = []


def _cell_text(soup, label):
    """Return the stripped text of the first <td> after the <th> whose text is `label`."""
    return soup.find("th", string=label).find_next("td").text.strip()


def _to_hundredths(measure_text):
    """Scale the leading number of text such as "2.3 m" by 100 and return it as an int.

    round() is used instead of int() because binary floats make e.g.
    2.3 * 100 == 229.99999999999997, which int() would truncate to 229.
    """
    return round(float(measure_text.split()[0]) * 100)


def _stat_triplet(soup, label):
    """Return the stripped texts of the "cell-num" <td> siblings in the stat row headed by `label`."""
    cells = soup.find("th", string=label).find_next_siblings("td", class_="cell-num")
    return [cell.text.strip() for cell in cells]


# Visit each Pokémon page and build one details row and one stats row per Pokémon.
for pokemon in pokemon_list[:pokemon_scope]:
    pokemon_url = "https://pokemondb.net" + pokemon["href"]

    # Download the individual Pokémon page; fail fast on timeouts / HTTP errors.
    response = requests.get(pokemon_url, timeout=30)
    response.raise_for_status()
    pokemon_soup = BeautifulSoup(response.text, "html.parser")  # Use response.text for decoding

    ###### Data prep
    ### Pokemon Info
    pokemon_id = int(_cell_text(pokemon_soup, "National №"))
    pokemon_name = pokemon_soup.find("h1").text.strip()
    # find_all_previous() yields the paragraphs in reverse document order,
    # so they are reversed back before being joined into one description.
    intro_paragraphs = pokemon_soup.find('div', class_='tabset-basics').find_all_previous("p")
    pokemon_desc = ' '.join(paragraph.text.strip() for paragraph in reversed(intro_paragraphs))
    japanese_name = _cell_text(pokemon_soup, "Japanese")
    pokemon_image = pokemon_soup.find("div", class_="grid-col").find("img")['src']
    species_data = _cell_text(pokemon_soup, "Species").replace(" Pokémon", "")
    # Height and weight are stored as the page's leading number multiplied by 100
    # (presumably metres -> centimetres and kilograms -> 10 g units; verify against the site).
    height = _to_hundredths(_cell_text(pokemon_soup, "Height"))
    weight = _to_hundredths(_cell_text(pokemon_soup, "Weight"))

    # Types: read the names straight from the links so the second type carries
    # no leading space (the old join/split round-trip produced " Poison").
    type_elements = pokemon_soup.find("th", string="Type").find_next("td").find_all("a")
    type_names = [type_element.text.strip() for type_element in type_elements]
    type_1 = type_names[0] if type_names else ''
    type_2 = type_names[1] if len(type_names) > 1 else None

    ability_elements = pokemon_soup.find("th", string="Abilities").find_next("td").find_all("a")
    abilities = ', '.join(ability_element.text.strip() for ability_element in ability_elements)
    ev_yield = _cell_text(pokemon_soup, "EV yield")
    catch_rate = _cell_text(pokemon_soup, "Catch rate").split()[0]
    # The friendship value is read from the <td> immediately before the "Base Exp." header.
    base_friendship = pokemon_soup.find("th", string="Base Exp.").find_previous("td").text.strip().split()[0]
    base_exp = _cell_text(pokemon_soup, "Base Exp.").split()[0]
    growth_rate = _cell_text(pokemon_soup, "Growth Rate")

    # Gender: "<x>% male, <y>% female" is split into the two percentages;
    # any other text (a single entry) is recorded as '0' for both.
    gender = _cell_text(pokemon_soup, "Gender").split(', ')
    if len(gender) > 1:
        gender_male = gender[0].split('%')[0]
        gender_female = gender[1].split('%')[0]
    else:
        gender_male = gender_female = '0'

    # Generation: first character of the "In other generations" list,
    # falling back to '9' when the page has no such list.
    generation_title_element = pokemon_soup.find(class_="list-nav-title", string='In other generations')
    if generation_title_element:
        generation_all = generation_title_element.find_next_siblings('li')
        in_generation = ', '.join(generation_select.text.strip() for generation_select in generation_all)
    else:
        in_generation = '9'
    generation = int(in_generation[0])

    # Name etymology: guard on the etymology list itself and reset the value
    # for every Pokémon, so a page without one yields None instead of a
    # NameError or the previous Pokémon's etymology.
    etymology_list = pokemon_soup.find("dl", class_="etymology")
    if etymology_list:
        name_etymology_piece = etymology_list.find_all('dt')
        name_etymology_desc = etymology_list.find_all('dd')
        name_etymology = " | ".join(
            f"{dt.text.strip()}: {dd.text.strip()}"
            for dt, dd in zip(name_etymology_piece, name_etymology_desc)
        )
    else:
        name_etymology = None

    ### Pokemon Stats
    # Each stat row unpacks into (base, min, max); unpacking raises ValueError
    # if the row does not contain exactly three numeric cells.
    base_hp, min_hp, max_hp = _stat_triplet(pokemon_soup, "HP")
    base_atk, min_atk, max_atk = _stat_triplet(pokemon_soup, "Attack")
    base_def, min_def, max_def = _stat_triplet(pokemon_soup, "Defense")
    # "Sp. Atk" / "Sp. Def" rows (written out under the "Spd Attack" / "Spd Defense" column names).
    base_satk, min_satk, max_satk = _stat_triplet(pokemon_soup, "Sp. Atk")
    base_sdef, min_sdef, max_sdef = _stat_triplet(pokemon_soup, "Sp. Def")
    base_spd, min_spd, max_spd = _stat_triplet(pokemon_soup, "Speed")

    # Add the row to the details list
    pokemon_details.append({
                            "Pokemon ID": pokemon_id,
                            "Name": pokemon_name,
                            "Description": pokemon_desc,
                            "Japanese name": japanese_name,
                            "Name etymology":name_etymology,
                            "Image URL": pokemon_image,
                            "Species": species_data,
                            "Height": height,
                            "Weight": weight,
                            "Type 1": type_1,
                            "Type 2": type_2,
                            "Abilities": abilities,
                            "EV Yield":ev_yield,
                            "Catch rate":catch_rate,
                            "Base Exp":base_exp,
                            "Growth rate":growth_rate,
                            "Gender male %":gender_male,
                            "Gender female %":gender_female,
                            "Friendship":base_friendship,
                            "Generation":generation
                            })

    # Add the row to the stats list
    pokemon_stats.append({
                        "Pokemon ID": pokemon_id,
                        "Base HP":base_hp,
                        "Min. HP":min_hp,
                        "Max. HP":max_hp,
                        "Base Attack":base_atk,
                        "Min. Attack":min_atk,
                        "Max. Attack":max_atk,
                        "Base Defense":base_def,
                        "Min. Defense":min_def,
                        "Max. Defense":max_def,
                        "Base Spd Attack":base_satk,
                        "Min. Spd Attack":min_satk,
                        "Max. Spd Attack":max_satk,
                        "Base Spd Defense":base_sdef,
                        "Min. Spd Defense":min_sdef,
                        "Max. Spd Defense":max_sdef,
                        "Base Speed":base_spd,
                        "Min. Speed":min_spd,
                        "Max. Speed":max_spd
                        })

details_df = pd.DataFrame(pokemon_details)
stats_df = pd.DataFrame(pokemon_stats)

# Timing for the details + stats scrape (both datasets are built in the same loop).
details_end_time = time.time()
details_end_timestamp = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
details_elapsed_time = time.strftime("%H:%M:%S", time.gmtime(details_end_time - details_start_time))

In [None]:
# Report size and timing for the details and stats datasets; each summary line
# is followed by a 160-character rule. Both share the same timing values
# because they were scraped in the same loop.
summary_rule = '=' * 160
print(
    f'Pokemon details extraction finished    |    Data set size: {df_size(details_df)}    |    Elapsed time: {details_elapsed_time}    |    Script ended on {details_end_timestamp}',
    summary_rule,
    f'Pokemon stats extraction finished    |    Data set size: {df_size(stats_df)}    |    Elapsed time: {details_elapsed_time}    |    Script ended on {details_end_timestamp}',
    summary_rule,
    sep='\n',
)

### Pokemon league scrape
Expanding data collection to compile a comprehensive dataset containing all Pokémon and their respective leagues, plus the sprites for the different leagues.

In [None]:
pokemon_league_start_time = time.time()
# Pokedex scrape
pokedex_url = "https://pokemondb.net/pokedex"

# Download the Pokédex index page; fail fast on timeouts / HTTP errors
# instead of hanging or parsing an error page.
response = requests.get(pokedex_url, timeout=30)
response.raise_for_status()
pokedex_soup = BeautifulSoup(response.text, 'html.parser')

# Data find and prep
# The second <ul> of the navigation panel holds the list of dexes.
pokedex_panel = pokedex_soup.find('nav', class_='panel panel-nav').find_all('ul')[1]
league_name_raw = pokedex_panel.find_all('a')
region_name_raw = pokedex_panel.find_all('small', class_='text-muted')
# Build "name|region|href" strings; [1:-1] strips the first and last character
# of the region text (presumably surrounding parentheses), and the trailing
# [1:] skips the first entry of the list.
league_region_link = [
    f"{league.text.strip()}|{region.text.strip()[1:-1]}|{league['href']}"
    for league, region in zip(league_name_raw, region_name_raw)
][1:]

In [None]:
league_rows = []
# Cache of Pokémon page link -> national ID, so a Pokémon that appears in
# several dexes is downloaded only once instead of once per dex.
national_id_by_link = {}

# Limit the number of dexes when running a partial (test) scrape.
league_scope = len(league_region_link) if full_dataset else part_dataset

for value in league_region_link[:league_scope]:
    league_name, league_region, link = value.split("|")
    league_link = f'https://pokemondb.net{link}'

    # Download the dex page; fail fast on timeouts / HTTP errors.
    response = requests.get(league_link, timeout=30)
    response.raise_for_status()
    league_soup = BeautifulSoup(response.text, "html.parser")

    # Every Pokémon of the dex is rendered as an "infocard" <div>.
    pokemon_entries = league_soup.find_all("div", class_="infocard")
    pokemon_entries_scope = len(pokemon_entries) if full_dataset else part_dataset

    ## Loop through the Pokémon of this dex
    for pokemon in pokemon_entries[:pokemon_entries_scope]:
        # The dex number is shown with a one-character prefix, dropped by [1:].
        pokemon_league_number = int(pokemon.find(class_="infocard-lg-data text-muted").find("small").text[1:])
        pokemon_link = pokemon.find(class_="ent-name")["href"]
        pokemon_sprite = pokemon.find(class_="img-fixed")['src']

        # The national ID is only available on the Pokémon's own page.
        if pokemon_link not in national_id_by_link:
            page_response = requests.get("https://pokemondb.net" + pokemon_link, timeout=30)
            page_response.raise_for_status()
            pokemon_detail_soup = BeautifulSoup(page_response.text, "html.parser")
            national_id_by_link[pokemon_link] = int(
                pokemon_detail_soup.find("th", string="National №").find_next("td").text
            )
        pokemon_id = national_id_by_link[pokemon_link]

        # Add the row to the league list
        league_rows.append({
                            "League ID": pokemon_league_number,
                            "Pokemon ID": pokemon_id,
                            "Sprite URL": pokemon_sprite,
                            "League name": league_name,
                            "League Region": league_region
                            })

pokemon_league = pd.DataFrame(league_rows)

# Timing for the league scrape (started in the Pokédex index cell).
pokemon_league_end_time = time.time()
pokemon_league_end_timestamp = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
pokemon_league_elapsed_time = time.strftime("%H:%M:%S", time.gmtime(pokemon_league_end_time - pokemon_league_start_time))

In [None]:
# Report size and timing for the league dataset, followed by a 160-character rule.
print(
    f'Pokemon league extraction finished    |    Data set size: {df_size(pokemon_league)}    |    Elapsed time: {pokemon_league_elapsed_time}    |    Script ended on {pokemon_league_end_timestamp}',
    '=' * 160,
    sep='\n',
)

### Pokemon region details
Reshaping the structure of the region details file to match the data model.

In [None]:
region_details_start_time = time.time()

# Load the raw region file from "<data dir>/dirty"; os.path.join keeps the
# path valid on every operating system.
region_details = pd.read_csv(os.path.join(file_save_path, 'dirty', 'pokemon_region_details.csv'))

# Transpose the table around the 'Type' column: the former column headers
# become a 'Region' column and each distinct 'Type' value becomes a column.
region_details = region_details.melt(id_vars=['Type'], var_name='Region', value_name='Values') #columns to rows
region_details = region_details.pivot(index='Region', columns='Type', values='Values') # rows to columns
region_details = region_details.reset_index() # Reset the index to make 'Region' a column again

# Timing for this step, measured with this step's own start/end times
# (previously the league timers were used here by mistake).
region_details_end_time = time.time()
region_details_end_timestamp = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
region_details_elapsed_time = time.strftime("%H:%M:%S", time.gmtime(region_details_end_time - region_details_start_time))

In [None]:
# Report size and timing for the region details dataset, followed by a 160-character rule.
print(
    f'Region details extraction finished.    |    Data set size: {df_size(region_details)}    |    Elapsed time: {region_details_elapsed_time}    |    Script ended on {region_details_end_timestamp}',
    '=' * 160,
    sep='\n',
)

## Output
To save files to CSV and/or JSON based on the user preferences specified in the control panel

In [None]:
########################################################    Output the data to files    ##########################################################
# Datasets to export, keyed by the suffix used in the output file name
# ("<file_name>_<suffix>.<ext>").
output_frames = {
    'details': details_df,
    'stats': stats_df,
    'league': pokemon_league,
    'region_details': region_details,
}

if save_dataset_to_csv:
    # Write every dataset to a CSV file; os.path.join replaces the
    # hard-coded backslash so the paths are valid on every operating system.
    for output_suffix, output_frame in output_frames.items():
        output_frame.to_csv(
            os.path.join(file_save_path, f'{file_name}_{output_suffix}.csv'),
            header=True, index=False, encoding='utf-8-sig',
        )
    print(f"CSV file saved to {file_save_path}")

if save_dataset_to_json:
    # Write every dataset to a JSON Lines file (one record per line).
    for output_suffix, output_frame in output_frames.items():
        output_frame.to_json(
            os.path.join(file_save_path, f'{file_name}_{output_suffix}.json'),
            orient='records', lines=True,
        )
    print(f"JSON file saved to {file_save_path}")

### To log the execution times, to track the performance of the script

In [None]:
# Capture the finish time, then derive the wall-clock timestamp and the
# HH:MM:SS duration since `script_start_time`.
script_end_time = time.time()
script_end_timestamp = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
script_duration = time.gmtime(script_end_time - script_start_time)
script_elapsed_time = time.strftime("%H:%M:%S", script_duration)

In [None]:
# Final summary line, framed above and below by a 120-character rule.
closing_rule = '=' * 120
print(
    closing_rule,
    f'Script finished    |    Elapsed time: {script_elapsed_time}    |    Finish time: {script_end_timestamp}',
    closing_rule,
    sep='\n',
)