In [None]:
# import dependencies
import re
import unicodedata
from urllib.parse import quote_plus

import pandas as pd
from bs4 import BeautifulSoup as bs
from splinter import Browser
from sqlalchemy import create_engine, inspect
from sqlalchemy.dialects import postgresql
from sqlalchemy.ext.automap import automap_base
from sqlalchemy.ext.declarative import declarative_base
from sqlalchemy.orm import Session
from webdriver_manager.chrome import ChromeDriverManager

## Extract

In [None]:
# removes accented characters from a string
def strip_accents(s):
    """Return `s` with its combining accent marks removed.

    The string is first decomposed with Unicode NFD normalization, which
    separates base characters from their combining marks; every character
    whose Unicode category is 'Mn' is then dropped and the rest is rejoined.
    """
    decomposed = unicodedata.normalize('NFD', s)
    kept = [ch for ch in decomposed if unicodedata.category(ch) != 'Mn']
    return ''.join(kept)

# splits one list entry of the form "Name (Origin) – Description"
def _split_creature_entry(text):
    """Return (name, origin, description) for a list entry, or None.

    The origin is the content of the first parenthesised group; nested
    parentheses inside that group are kept together by tracking the depth.
    Everything after the matching ")" is the description, with the leading
    dash separator removed. Entries without "(" yield None. If the group is
    never closed, the rest of the text is treated as the origin.
    """
    open_at = text.find("(")
    if open_at == -1:
        return None

    # locate the ")" that closes the first "("
    depth = 0
    close_at = len(text)
    for position in range(open_at, len(text)):
        if text[position] == "(":
            depth += 1
        elif text[position] == ")":
            depth -= 1
            if depth == 0:
                close_at = position
                break

    name = text[:open_at].strip()
    origin = text[open_at + 1:close_at].strip()
    # drop only a leading dash separator, never a real first letter
    description = text[close_at + 1:].strip().lstrip("-–—").strip()
    return name, origin, description

# set up the splinter service
executable_path = {'executable_path': ChromeDriverManager().install()}
browser = Browser('chrome', **executable_path, headless = True)

# URL pattern of the per-letter list pages; web_url is the first page (A)
page_url = "https://en.wikipedia.org/wiki/List_of_legendary_creatures_({letter})"
web_url = page_url.format(letter = "A")

# initialize the destination lists
names = []
origins = []
descriptions = []

# define the alphabetical list of pages to visit
alphabet = ["A", "B", "C", "D", "E", "F", "G", "H", "I", "J", "K", "L", "M", "N", "O", "P", "Q", "R", "S", "T", "U", "V", "W", "X", "Y", "Z"]

# iterate through the web pages and scrape the content into the destination lists;
# the finally clause closes the browser even when a page fails to load or parse
try:
    for letter in alphabet:

        # send the browser instance directly to this letter's page
        browser.visit(page_url.format(letter = letter))

        # define the BeautifulSoup instance
        soup = bs(browser.html, "html.parser")

        # retrieve the creature list (second <ul> inside the page content div)
        creatures = soup.body.find("div", class_ = "mw-body-content mw-content-ltr").find_all("ul")[1].find_all("li")

        # split the list items into name, origin and description then store into lists
        for creature in creatures:

            parsed = _split_creature_entry(creature.text)

            # skip items that have no parenthesised origin
            if parsed is None:
                continue

            name, origin, description = (strip_accents(part) for part in parsed)

            # store the information into lists
            names.append(name)
            origins.append(origin)
            descriptions.append(description)
finally:
    browser.quit()

# store the lists in a dataframe
folklore_creatures_df = pd.DataFrame({
    "names": names,
    "origins": origins,
    "descriptions": descriptions})

In [None]:
# import dnd monster csv into a Pandas dataframe
# (the path is relative, so it is resolved against the notebook's working
# directory) and preview the first rows
csv_file = "Resources/dnd_monsters.csv"
dndmonster_df= pd.read_csv(csv_file)
dndmonster_df.head()

## Transform

In [None]:
# trim out unnecessary columns
# .copy() makes clean_monster_df an independent frame, so the column
# assignments in later cells do not write into a selection of dndmonster_df
# (which would raise SettingWithCopyWarning)
clean_monster_df = dndmonster_df[["name","cr","type","size","ac","hp","align"]].copy()
clean_monster_df.head()

In [None]:
# remove invalid data
# dropna() returns a new frame rather than modifying in place, so the result
# must be assigned back for the rows with missing values to actually be removed
clean_monster_df = clean_monster_df.dropna()
clean_monster_df.head()

In [None]:
# confirm data types: display the dtype of each column
# (the "cr" column is converted to float in the cells that follow)
clean_monster_df.dtypes

In [None]:
# convert text representations of fractions into decimals
# each pair is (fraction text, decimal text); they are applied in this order
fraction_decimals = (("1/4", "0.25"), ("1/2", "0.50"), ("1/8", "0.125"))

def _fractions_to_decimal_text(value):
    """Return str(value) with every listed fraction replaced by its decimal text."""
    text = str(value)
    for fraction, decimal in fraction_decimals:
        text = text.replace(fraction, decimal)
    return text

clean_monster_df["cr"] = clean_monster_df["cr"].apply(_fractions_to_decimal_text)

In [None]:
# convert 'challenge rating' column into float from string
# (astype(float) raises ValueError if any value cannot be parsed as a number)
clean_monster_df["cr"] = clean_monster_df["cr"].astype(float)

In [None]:
# rename the abbreviated columns to descriptive names
descriptive_names = {
    "cr": "challenge_rating",
    "ac": "armor_class",
    "hp": "hit_points",
    "align": "alignment",
}
clean_monster_df = clean_monster_df.rename(columns = descriptive_names)

In [None]:
# confirm data types again, now that the challenge rating has been cast to
# float and the columns have been renamed
clean_monster_df.dtypes

## Load

In [None]:
# import the PostgreSQL confidential values
from config import postgresql_key, postgresql_port, postgresql_host, postgresql_db, postgresql_user

In [None]:
# create the engine
# the user name and password are URL-encoded so that special characters
# (such as "@", ":" or "/") cannot corrupt the connection URL
encoded_user = quote_plus(str(postgresql_user))
encoded_key = quote_plus(str(postgresql_key))

# include the configured port; fall back to the driver default when it is empty
host_and_port = f"{postgresql_host}:{postgresql_port}" if postgresql_port else f"{postgresql_host}"

engine = create_engine(f"postgresql+psycopg2://{encoded_user}:{encoded_key}@{host_and_port}/{postgresql_db}")

In [None]:
# create the automap base, reflect the existing database tables through the
# engine, then display the names of the classes that were mapped
# NOTE(review): the `reflect = True` form is reported as deprecated in
# SQLAlchemy 1.4+ in favour of `autoload_with` — confirm against the
# installed version before changing it
Base = automap_base()
Base.prepare(engine, reflect = True)
Base.classes.keys()

In [None]:
# define the tables: bind the reflected classes for the two destination
# tables to local names
# (presumably both tables already exist in the database with a schema that
# matches the columns loaded below — verify before running)
dnd_monsters_tbl = Base.classes.dnd_monsters
folklore_creatures_tbl = Base.classes.folklore_creatures

In [None]:
# create the session bound to the engine; objects added to it are written to
# the database when session.commit() is called
session = Session(engine)

In [None]:
# queue one dnd_monsters row in the session for every row of clean_monster_df
monster_columns = ["name", "challenge_rating", "type", "size", "armor_class", "hit_points", "alignment"]
for _, monster in clean_monster_df.iterrows():
    # the table attributes share their names with the dataframe columns
    monster_fields = {column: monster[column] for column in monster_columns}
    session.add(dnd_monsters_tbl(**monster_fields))

In [None]:
# queue one folklore_creatures row in the session for every scraped creature
for _, creature in folklore_creatures_df.iterrows():
    # the dataframe columns are plural, the table attributes singular
    creature_record = folklore_creatures_tbl(
        name = creature["names"],
        origin = creature["origins"],
        description = creature["descriptions"])
    session.add(creature_record)

In [None]:
# flush the pending rows to the database and commit the transaction;
# if that fails, roll the transaction back so the session is usable again,
# then re-raise so the failure stays visible
try:
    session.commit()
except Exception:
    session.rollback()
    raise

In [None]:
# close the session, releasing the resources it holds
session.close()