In [1]:
import json
import os
import re
import string
import sys

import requests
from bs4 import BeautifulSoup
from bs4.element import NavigableString, Tag

In [2]:
# Reference tables used to resolve attack names into item / spell stats.
# Both files are read as UTF-8 inside `with` blocks so the handles are closed
# even when json.load() raises on malformed input.
with open("roll_20_all_items.json", "r", encoding="utf-8") as f:
    all_items = json.load(f)

with open("roll_20_all_spells.json", "r", encoding="utf-8") as fs:
    all_spells = json.load(fs)
    
# Multi-word trait names (lower-case) that are accepted even though they are
# longer than the three-word limit applied to colon-less feature lines.
legal_traits = [
    "jack of all trades",
    "pact of the blade",
    "step of the wind",
    "mask of the wild",
    "pact of the tome",
    "fury of the small",
    "book of ancient secrets",
]

# Lower-case "Name:" prefixes that must never be recorded as a trait.
banned_traits = [
    "you have",
]

class DualSpecError(Exception):
    """Raised by scrape_page when the sheet's class field contains a "/"."""

class NoTraitsError(Exception):
    """Raised by scrape_page when no feature or trait name could be extracted."""

class BadItemError(Exception):
    """Raised by scrape_page when an attack name matches neither an item nor a spell."""

# Ability modifier indexed by ability score (0..30): floor((score - 10) / 2).
modifiers = [(score - 10) // 2 for score in range(31)]

# Three-letter ability abbreviation -> key used in a character's "attributes".
attr_keys = {
    "STR": "strength",
    "DEX": "dexterity",
    "CON": "constitution",
    "INT": "intelligence",
    "WIS": "wisdom",
    "CHA": "charisma",
}

# Lower-case class name -> abbreviation of its spellcasting ability.
# Classes absent from this table get no "spellcasting" section in the output.
# "druid" was missing from the original table although it is a Wisdom caster.
spellcasting_mod = {
    "bard": "CHA",
    "paladin": "CHA",
    "sorcerer": "CHA",
    "warlock": "CHA",
    "cleric": "WIS",
    "druid": "WIS",
    "ranger": "WIS",
    "wizard": "INT",
    "eldritch knight": "INT",
}

In [3]:
# Hand-picked character sheets used to smoke-test scrape_page.
url_list = [
    "http://www.kassoon.com/dnd/5e/character-sheet/846/elliza-maren-drow-elf-warlock-1/",
    "http://www.kassoon.com/dnd/5e/character-sheet/2384/vinheim-high-elf-wizard-1/",
    "http://www.kassoon.com/dnd/5e/character-sheet/2382/mijira-imbixtellrhyst-dragonborn-rogue-1/",
    "http://www.kassoon.com/dnd/5e/character-sheet/208/adam-fidge-variant-human-bard-5/",
    "https://www.kassoon.com//dnd/5e/character-sheet/48/paren-galastacia-wood-elf-rogue-1/",
    "http://www.kassoon.com/dnd/5e/character-sheet/55/dodge-lightfoot-halfling-sorcerer2/",
    "https://www.kassoon.com/dnd/5e/character-sheet/28/eston-human-bard-1-cleric-1/",
    "https://www.kassoon.com/dnd/5e/character-sheet/2100/",
]

In [8]:
# Scrape a single page

# The typographic apostrophe (U+2019) and the three-character sequence it
# becomes when UTF-8 text is mis-decoded as cp1252. Written as escapes so the
# match does not depend on how this file itself is encoded.
# NOTE(review): the original literal shows up as that three-character sequence
# in this file; both forms are normalised so either reading keeps working.
_APOSTROPHE_VARIANTS = ("\u2019", "\u00e2\u20ac\u2122")


def _parse_basic_info(soup):
    """Read name, class, level, background, race, alignment and gender.

    Raises DualSpecError when the class field contains "/", and
    ValueError / IndexError when the class field cannot be split into a
    name and a numeric level or a header box is missing.
    """
    char = {}
    # str() detaches the value from the parse tree; a NavigableString would
    # keep the whole BeautifulSoup document alive for as long as the result is.
    char["name"] = str(soup.find("div", {"class": "character_name"}).contents[0])

    char_box = soup.find_all("div", {"class": "underline autocomplete updstats"})
    cls = char_box[0].contents[0]
    if "/" in cls:
        raise DualSpecError

    match = re.match(r"([A-Za-z]+)\s?([0-9]+)", cls, re.I)
    if match:
        class_name, level = match.groups()
    else:
        # Fallback: take the first two space-separated tokens as class and level.
        parts = cls.rsplit(" ")
        class_name, level = parts[0], parts[1]
    char["class"] = class_name.lower()
    char["level"] = int(level)

    for offset, key in enumerate(("background", "race", "alignment", "gender"), start=1):
        char[key] = char_box[offset].contents[0].lower()
    return char


def _parse_attributes(soup, level):
    """Build the "attributes" mapping and the proficiency correction.

    Returns (attributes, proficiency_diff) where proficiency_diff is the
    level-derived proficiency bonus minus the bonus printed on the sheet.
    """
    boxes = soup.find_all("div", {"class": "character_attribute_box"})
    attributes = {}
    # The first six boxes are the ability scores.
    for box in boxes[:6]:
        name = box.find("div", {"class": "character_attr_name"}).contents[0]
        true_val = int(box.find("div", {"class": "character_attr_value"}).contents[0])
        points = int(box.find("div", {"class": "character_attr_value editshow updstats"}).contents[0])
        attributes[name.lower()] = {
            "true_val": true_val,
            "points": points,
            "racial_bonus": true_val - points,
            "modifier": modifiers[true_val],
        }

    dex_mod = attributes["dexterity"]["modifier"]
    attributes["armor_class"] = 10 + dex_mod
    attributes["initiative"] = dex_mod

    # Box 8 carries the proficiency bonus as shown on the sheet.
    sheet_proficiency = int(boxes[8].find("div", {"class": "character_attr_bonus"}).contents[0])
    attributes["proficiency"] = int((level - 1) / 4 + 2)
    proficiency_diff = attributes["proficiency"] - sheet_proficiency

    attributes["perception"] = 10 + attributes["wisdom"]["modifier"]
    # Box 11 carries the hit points (element id "val15").
    attributes["hit_points"] = int(boxes[11].find("div", {"id": "val15"}).contents[0])
    return attributes, proficiency_diff


def _parse_skills(soup, proficiency_diff):
    """Return (skill name -> bonus, list of proficient skill names)."""
    skill_map = {}
    proficient_skills = []
    rows = soup.find_all("div", {"class": "clear flex"})
    # The first and last "clear flex" rows are not skills.
    for row in rows[1:-1]:
        label = row.find("div", {"class": "character_skills_info"}).contents[0]
        skill_name = label.split("(")[0].replace("* ", "").replace("'", "").strip().lower()
        display_name = string.capwords(skill_name)
        try:
            skill_val = int(row.find("span", {"class": "editshow character_skills_bonus underline"}).contents[0])
        except AttributeError:
            # No bonus element on this row: report it and fall back to 0.
            print("ERROR: " + skill_name)
            print(row)
            skill_val = 0
        if row.find("input", {"checked": "checked"}):
            proficient_skills.append(display_name)
            # Re-base the sheet's value onto the level-derived proficiency bonus.
            skill_val = skill_val + proficiency_diff
        skill_map[display_name] = skill_val
    return skill_map, proficient_skills


def _parse_saving_throws(soup, attributes):
    """Return (save name -> bonus, list of proficient save names)."""
    saving_throws = {}
    proficient_saves = []
    rows = soup.find("div", {"class": "character_throws"}).find_all("div", {"class": "clear"})
    # The last "clear" row is not a saving throw.
    for row in rows[:-1]:
        save_name = row.find("div", {"class": "character_throw_info"}).contents[0].lower()
        save_val = attributes[save_name]["modifier"]
        if row.find("input", {"checked": "checked"}):
            proficient_saves.append(save_name)
            save_val = save_val + attributes["proficiency"]
        saving_throws[save_name] = save_val
    return saving_throws, proficient_saves


def _lookup_item_or_spell(item_map, item_name):
    """Fill item_map from all_items or all_spells; return True when found."""
    if item_name in all_items:
        entry = all_items[item_name]
        if "Damage" in entry:
            item_map["damage"] = entry["Damage"]
        item_map["damage_type"] = entry.get("Damage Type", "normal")
        item_map["range"] = entry.get("Range", "Reach")
        if "Properties" in entry:
            item_map["properties"] = entry["Properties"]
        return True
    if item_name in all_spells:
        entry = all_spells[item_name]
        if "Damage" in entry:
            item_map["damage"] = entry["Damage"]
        if "Damage Type" in entry:
            item_map["damage_type"] = entry["Damage Type"]
        item_map["range"] = entry.get("Range", "Reach")
        return True
    return False


def _parse_attacks(misc):
    """Return the attack / spell rows of the "list0" table as dicts.

    Raises BadItemError when a row's name is found in neither lookup table,
    even after retrying with the spaces removed.
    """
    attacks = []
    for row in misc.find("tbody", {"id": "list0"}).find_all("tr"):
        first = row.find_all("td")[0].contents[0]
        # Rows whose first cell is a "Roll" element are skipped.
        if isinstance(first, Tag) and str(first.contents[0]) == "Roll":
            continue
        item_map = {"name": string.capwords(first)}
        item_name = item_map["name"].title()
        if not _lookup_item_or_spell(item_map, item_name):
            # Second attempt with the spaces removed (e.g. "Hand Axe" -> "Handaxe").
            new_name = string.capwords(item_name.replace(" ", "").title())
            if not _lookup_item_or_spell(item_map, new_name):
                raise BadItemError("Bad Item or Spell: " + str(item_name))
            item_map["name"] = new_name.title()
        attacks.append(item_map)
    return attacks


def _parse_spellcasting(misc, char_class, attributes):
    """Return the spellcasting summary for a class listed in spellcasting_mod."""
    ability = attr_keys[spellcasting_mod[char_class]]
    casting_bonus = attributes[ability]["modifier"] + attributes["proficiency"]
    spellcasting = {
        "spellcast_modifier": ability,
        "spellsave_dc": 8 + casting_bonus,
        "spell_attack_bonus": int(casting_bonus),
    }
    # Slot counts for spell levels 1-9 live in elements val56 .. val64.
    for slot_level in range(1, 10):
        slot_val = misc.find("div", {"id": ("val" + str(55 + slot_level))}).contents
        slots = 0
        if slot_val:
            raw = slot_val[0]
            # Values containing "-" keep only the part before the dash. The
            # parsed number is kept locally instead of being written back
            # into the soup's contents list.
            if raw.find("-") != -1:
                raw = raw.split("-")[0]
            slots = int(raw)
        spellcasting["level_" + str(slot_level) + "_slots"] = slots
    return spellcasting


def _add_unique(names, name):
    """Append name to names unless it is already present."""
    if name not in names:
        names.append(name)


def _parse_features(soup, url):
    """Extract feature / trait names from the fourth "edithide" block.

    Lines rejected by the heuristics are printed together with url.
    Raises NoTraitsError when nothing was extracted.
    """
    blocks = soup.find_all("div", {"class": "edithide"})
    traits = []
    for node in blocks[3].contents:
        text = str(node).strip()
        for apostrophe in _APOSTROPHE_VARIANTS:
            text = text.replace(apostrophe, "'")

        if ":" in text:
            # "Name: description", "<b>Name:</b> ..." or "<p>Name: ..." forms.
            name = ""
            found = re.search(r"^([A-Za-z' ]+):", text)
            if found:
                name = found.group(1)
            else:
                found = re.search(r"<b>([A-Za-z':, ]+)</b>", text)
                if found:
                    # Use the captured name only, so the closing tag can never
                    # leak into the trait when the colon sits outside <b>.
                    name = found.group(1).strip().rstrip(":")
                else:
                    found = re.search(r"^<p>([A-Za-z', ]+):", text)
                    if found:
                        name = found.group(1)
            pat_str = string.capwords(name)
            if (pat_str != "") and (pat_str.lower() not in banned_traits):
                _add_unique(traits, pat_str)
            continue

        # Colon-less line: accept only short, purely alphabetic phrases.
        words = text.split()
        text = " ".join(words)
        if not re.search(r"^([A-Za-z', ]+)$", text):
            continue
        lowered = text.lower()
        if lowered in legal_traits:
            accepted = True
        elif "," in lowered:
            accepted = ("draconic ancestry" in lowered) or ("favored enemy" in lowered)
        else:
            accepted = len(words) <= 3
        if accepted:
            _add_unique(traits, string.capwords(text))
        else:
            print("Rejected: " + text + " link: " + url)

    if not traits:
        raise NoTraitsError
    return traits


def scrape_page(url, timeout=30):
    """Scrape one kassoon.com character sheet into a plain dict.

    Parameters:
        url: address of the character sheet.
        timeout: seconds passed to requests.get so a stalled connection
            cannot hang the scrape forever.

    Returns:
        A dict with, in insertion order, the keys "name", "class", "level",
        "background", "race", "alignment", "gender", "attributes", "skills",
        "proficient_skills", "saving_throws", "proficient_saves", "attacks",
        optionally "spellcasting" (only for classes in spellcasting_mod),
        and "features".

    Raises:
        DualSpecError: the class field contains "/".
        BadItemError: an attack name is neither a known item nor a spell.
        NoTraitsError: no feature or trait could be extracted.
        ValueError, IndexError: a numeric field is not a number or an
            expected element is missing from a result list.
    """
    page = requests.get(url, timeout=timeout)
    soup = BeautifulSoup(page.content, "html.parser")

    char = _parse_basic_info(soup)

    attributes, proficiency_diff = _parse_attributes(soup, char["level"])
    char["attributes"] = attributes

    char["skills"], char["proficient_skills"] = _parse_skills(soup, proficiency_diff)
    char["saving_throws"], char["proficient_saves"] = _parse_saving_throws(soup, attributes)

    # The attacks table and the spell-slot fields both live under this element.
    misc = soup.find("div", {"class": "character_misc"})
    char["attacks"] = _parse_attacks(misc)

    if char["class"] in spellcasting_mod:
        char["spellcasting"] = _parse_spellcasting(misc, char["class"], attributes)

    char["features"] = _parse_features(soup, url)
    return char

{
    "name": "Greethen Norixius",
    "class": "rogue",
    "level": 1,
    "background": "inheritor",
    "race": "human",
    "alignment": "neutral",
    "gender": "male",
    "attributes": {
        "strength": {
            "true_val": 13,
            "points": 12,
            "racial_bonus": 1,
            "modifier": 1
        },
        "dexterity": {
            "true_val": 16,
            "points": 15,
            "racial_bonus": 1,
            "modifier": 3
        },
        "constitution": {
            "true_val": 14,
            "points": 13,
            "racial_bonus": 1,
            "modifier": 2
        },
        "intelligence": {
            "true_val": 11,
            "points": 10,
            "racial_bonus": 1,
            "modifier": 0
        },
        "wisdom": {
            "true_val": 15,
            "points": 14,
            "racial_bonus": 1,
            "modifier": 2
        },
        "charisma": {
            "true_val": 9,
            "points": 8,
    

In [9]:
# Smoke test: scrape the hand-picked sheets in url_list, report every sheet
# that could not be parsed, then show and save the successful results.
test_characters = []
for idx, url in enumerate(url_list):
    try:
        test_characters.append(scrape_page(url))
    except ValueError:
        print("ValError at idx " + str(idx) + " link: " + url)
    except IndexError:
        print("IndexError at idx " + str(idx) + " link: " + url)
    except DualSpecError:
        print("DualSpecError at idx " + str(idx) + " link: " + url)
    except NoTraitsError:
        print("NoTraitsError at idx " + str(idx) + " link: " + url)
    except BadItemError as e:
        # Show which item failed, as the other scrape loops in this notebook do.
        print(e)
        print("BadItemError at idx " + str(idx) + " link: " + url)
print(json.dumps(test_characters, indent=4))

# Make sure the target directory exists before writing into it.
test_out_dir = "scraper_output/kassoon_scrapes"
os.makedirs(test_out_dir, exist_ok=True)
with open(test_out_dir + "/test_kassoon_out.json", "w", encoding='utf-8') as outfile:
    json.dump(test_characters, outfile, ensure_ascii=False, indent=4)

Rejected: on saving throws against being link: http://www.kassoon.com/dnd/5e/character-sheet/846/elliza-maren-drow-elf-warlock-1/
Rejected: You can't use this feature again until you finish a short or link: http://www.kassoon.com/dnd/5e/character-sheet/846/elliza-maren-drow-elf-warlock-1/
Rejected: Whenever you finish a link: http://www.kassoon.com/dnd/5e/character-sheet/846/elliza-maren-drow-elf-warlock-1/
Rejected: , you can touch one weapon that you are proficient with and that lacks link: http://www.kassoon.com/dnd/5e/character-sheet/846/elliza-maren-drow-elf-warlock-1/
Rejected: , instead of link: http://www.kassoon.com/dnd/5e/character-sheet/846/elliza-maren-drow-elf-warlock-1/
Rejected: This benefit lasts until you finish a link: http://www.kassoon.com/dnd/5e/character-sheet/846/elliza-maren-drow-elf-warlock-1/
Rejected: Once per day after a link: http://www.kassoon.com/dnd/5e/character-sheet/2384/vinheim-high-elf-wizard-1/
Rejected: , you can choose expended link: http://www.ka

In [12]:
# Scrape by sheet number
# Walks sheet ids start..end-1, writes the parsed characters out in pages of
# page_size, and stops early once `cap` sheets have been scraped successfully.
num_base = "https://www.kassoon.com/dnd/5e/character-sheet/"
# Every page, including the final partial one, is written under this prefix.
# (Previously full pages went to "kassoon_out_N.json" in the working directory
# while only the last page used this location.)
out_prefix = "scraper_output/kassoon_scrapes/kassoon_pass2_out_"
page_size = 100
all_chars = []
out_count = 0
total_valid = 0
total_attempts = 0
start = 0
end = 500
cap = 300

os.makedirs(os.path.dirname(out_prefix), exist_ok=True)


def _write_kassoon_page(chars, page_no):
    """Dump one page of scraped characters to its numbered JSON file."""
    outf = out_prefix + str(page_no) + ".json"
    with open(outf, "w", encoding='utf-8') as outfile:
        json.dump(chars, outfile, ensure_ascii=False, indent=4)


for i in range(start, end):
    total_attempts = total_attempts + 1
    sheet_url = num_base + str(i) + "/"
    try:
        all_chars.append(scrape_page(sheet_url))
        total_valid = total_valid + 1
    except ValueError:
        print("ValError at idx " + str(i) + " link: " + sheet_url)
    except IndexError:
        print("IndexError at idx " + str(i) + " link: " + sheet_url)
    except DualSpecError:
        print("DualSpecError at idx " + str(i) + " link: " + sheet_url)
    except NoTraitsError:
        print("NoTraitsError at idx " + str(i) + " link: " + sheet_url)
    except BadItemError as e:
        print(e)
        print("BadItemError at idx " + str(i) + " link: " + sheet_url)

    # Flush a full page to disk and start collecting the next one.
    if len(all_chars) >= page_size:
        _write_kassoon_page(all_chars, out_count)
        print("")
        print("Printing page " + str(out_count) + " to file")
        print("Total Attempts: " + str(total_attempts))
        print("Total Valid: " + str(total_valid))
        print("")
        out_count = out_count + 1
        all_chars = []
    if total_valid >= cap:
        break

# Whatever is left over forms the final, partial page.
if len(all_chars) > 0:
    print("")
    print("Printing page " + str(out_count) + " to file")
    _write_kassoon_page(all_chars, out_count)

print("Final Count!")
print("Total Attempts: " + str(total_attempts))
print("Total Valid: " + str(total_valid))

Rejected: Once per day after a link: https://www.kassoon.com/dnd/5e/character-sheet/6/
Rejected: , you can choose expended link: https://www.kassoon.com/dnd/5e/character-sheet/6/
Rejected: Once per day after a link: https://www.kassoon.com/dnd/5e/character-sheet/9/
Rejected: , you can choose expended link: https://www.kassoon.com/dnd/5e/character-sheet/9/
Rejected: Once per day after a link: https://www.kassoon.com/dnd/5e/character-sheet/21/
Rejected: , you can choose expended link: https://www.kassoon.com/dnd/5e/character-sheet/21/
DualSpecError at idx 28 link: https://www.kassoon.com/dnd/5e/character-sheet/28/
Rejected: Once per day after a link: https://www.kassoon.com/dnd/5e/character-sheet/31/
Rejected: , you can choose expended link: https://www.kassoon.com/dnd/5e/character-sheet/31/
Rejected: Once per day after a link: https://www.kassoon.com/dnd/5e/character-sheet/36/
Rejected: , you can choose expended link: https://www.kassoon.com/dnd/5e/character-sheet/36/
Bad Item or Spell:

NoTraitsError at idx 181 link: https://www.kassoon.com/dnd/5e/character-sheet/181/
NoTraitsError at idx 182 link: https://www.kassoon.com/dnd/5e/character-sheet/182/
Bad Item or Spell: Longwsord
BadItemError at idx 188 link: https://www.kassoon.com/dnd/5e/character-sheet/188/
ValError at idx 189 link: https://www.kassoon.com/dnd/5e/character-sheet/189/
IndexError at idx 191 link: https://www.kassoon.com/dnd/5e/character-sheet/191/
ValError at idx 194 link: https://www.kassoon.com/dnd/5e/character-sheet/194/
Bad Item or Spell: Sutra'S Word
BadItemError at idx 195 link: https://www.kassoon.com/dnd/5e/character-sheet/195/
NoTraitsError at idx 196 link: https://www.kassoon.com/dnd/5e/character-sheet/196/
Bad Item or Spell: Daggers
BadItemError at idx 197 link: https://www.kassoon.com/dnd/5e/character-sheet/197/
Bad Item or Spell: Fire Breath
BadItemError at idx 198 link: https://www.kassoon.com/dnd/5e/character-sheet/198/
IndexError at idx 199 link: https://www.kassoon.com/dnd/5e/character

Rejected: on saving throws against being link: https://www.kassoon.com/dnd/5e/character-sheet/287/
Bad Item or Spell: +1 Hand Axe
BadItemError at idx 288 link: https://www.kassoon.com/dnd/5e/character-sheet/288/
Rejected: on saving throws against being link: https://www.kassoon.com/dnd/5e/character-sheet/289/
Rejected: spell once with this trait and regain the ability to do so when you finish a link: https://www.kassoon.com/dnd/5e/character-sheet/293/
Rejected: when not wearing heavy armor, link: https://www.kassoon.com/dnd/5e/character-sheet/294/
Rejected: Once you have raged the number of times shown for your barbarian level in the Rages column of the Barbarian table, you must finish a link: https://www.kassoon.com/dnd/5e/character-sheet/295/
Rejected: on melee weapon attack rolls using Strength during this turn, but attack rolls against you have link: https://www.kassoon.com/dnd/5e/character-sheet/295/
Rejected: , link: https://www.kassoon.com/dnd/5e/character-sheet/295/
Rejected: ,

IndexError at idx 386 link: https://www.kassoon.com/dnd/5e/character-sheet/386/
Rejected: on saving throws against being link: https://www.kassoon.com/dnd/5e/character-sheet/387/
Bad Item or Spell: Spell Attack
BadItemError at idx 388 link: https://www.kassoon.com/dnd/5e/character-sheet/388/
IndexError at idx 389 link: https://www.kassoon.com/dnd/5e/character-sheet/389/
DualSpecError at idx 390 link: https://www.kassoon.com/dnd/5e/character-sheet/390/
IndexError at idx 392 link: https://www.kassoon.com/dnd/5e/character-sheet/392/
Bad Item or Spell: Battleaxe (Versatile)
BadItemError at idx 396 link: https://www.kassoon.com/dnd/5e/character-sheet/396/
Bad Item or Spell: Spell Attack
BadItemError at idx 406 link: https://www.kassoon.com/dnd/5e/character-sheet/406/
Bad Item or Spell: Unarmed
BadItemError at idx 410 link: https://www.kassoon.com/dnd/5e/character-sheet/410/
Bad Item or Spell: Unarmed Strike
BadItemError at idx 411 link: https://www.kassoon.com/dnd/5e/character-sheet/411/
In

In [None]:
# Scrape by page
# Reads the premade-characters index and scrapes every sheet it links to,
# stopping once cap_page characters have been collected.
master_page = requests.get("https://www.kassoon.com/dnd/5e/premade-characters/", timeout=30)
link_soup = BeautifulSoup(master_page.content, "html.parser")
chr_list = link_soup.find("tbody", {"id": "tblChrList"}).find_all("tr")
base = "http://www.kassoon.com"

cap_page = 1000
all_chars_page = []
for idx, c in enumerate(chr_list):
    link = c.find("a", href=True)
    if link is None:
        # A row without a link used to abort the whole loop with a TypeError.
        print("NoLink at idx " + str(idx))
        continue
    c_url = link["href"]
    # Site-relative hrefs are prefixed with the host; absolute ones are used as-is.
    if c_url.startswith(("http://", "https://")):
        sheet_url = c_url
    else:
        sheet_url = base + c_url
    try:
        all_chars_page.append(scrape_page(sheet_url))
    except ValueError:
        print("ValError at idx " + str(idx) + " link: " + sheet_url)
    except IndexError:
        print("IndexError at idx " + str(idx) + " link: " + sheet_url)
    except DualSpecError:
        print("DualSpecError at idx " + str(idx) + " link: " + sheet_url)
    except NoTraitsError:
        print("NoTraitsError at idx " + str(idx) + " link: " + sheet_url)
    except BadItemError as e:
        print(e)
        print("BadItemError at idx " + str(idx) + " link: " + sheet_url)
    if len(all_chars_page) >= cap_page:
        break


In [100]:
# premade page test
# Saves the parsed premade-characters index to temp_out.txt for inspection.
# The page is fetched before the file is opened, and print() writes straight
# to the file, so sys.stdout is never reassigned and cannot be left pointing
# at a closed file when the request or the write fails.
premade_soup = BeautifulSoup(
    requests.get("https://www.kassoon.com/dnd/5e/premade-characters/", timeout=30).content,
    "html.parser",
)
with open("temp_out.txt", "w", encoding='utf-8') as outfile:
    print(premade_soup, file=outfile)

In [None]:
# pattern testing
# Scratch check of the leading-name regex. `i` is not defined in this cell;
# it has to exist in the kernel already (and be a string) for this to run.
leading_name_re = re.compile("^([A-Za-z' ]+)")
pattern = leading_name_re.search(i)

In [139]:
# pattern testing 2
# Prints the "Name:" prefix the trait regex finds in each sample line, or a
# placeholder when the line does not start with a trait name.
test = [
    "Darkvision: Thanks to your elf blood, you have superior vision in dark and dim conditions.You can",
    "ends, you gain the following benefits:",
    "hexblade's Curse: Starting at 1st level, you gain the ability to place a baleful curse on someone. As a bonus action, choose",
    "Hex Warrior:At 1st level , you acquire the traini",
    "Thieves’ Cant: You know thieves’ cant, a",
]
trait_name_re = re.compile("^([A-Za-z'’ ]+):")
for t in test:
    pattern = trait_name_re.search(t)
    print(pattern.group(0) if pattern else "Pattern Not Found")

Darkvision:
Pattern Not Found
hexblade's Curse:
Hex Warrior:
Thieves’ Cant:


In [None]:
# Scratch check: displays the type of the imported BeautifulSoup object.
type(BeautifulSoup)

In [5]:
import webbrowser

# Opens the four reference sheets in Chrome for manual comparison.
# webbrowser.get() tokenises a "%s" command line with shlex, which splits on
# spaces and treats backslashes as escapes. The executable path is therefore
# quoted and written with forward slashes so it survives as a single token.
# NOTE(review): machine-specific install location; adjust for other setups.
chrome_path = '"C:/Program Files (x86)/Google/Chrome/Application/chrome.exe" %s'

sheets_to_open = [
    "http://www.kassoon.com/dnd/5e/character-sheet/846/elliza-maren-drow-elf-warlock-1/",
    "http://www.kassoon.com/dnd/5e/character-sheet/2384/vinheim-high-elf-wizard-1/",
    "http://www.kassoon.com/dnd/5e/character-sheet/2382/mijira-imbixtellrhyst-dragonborn-rogue-1/",
    "http://www.kassoon.com/dnd/5e/character-sheet/208/adam-fidge-variant-human-bard-5/",
]

chrome = webbrowser.get(chrome_path)
for sheet_url in sheets_to_open:
    # open() returns False when the browser could not be launched.
    if not chrome.open(sheet_url):
        print("Could not open " + sheet_url)

False