In [None]:
import requests
from bs4 import BeautifulSoup as bs
import re
import json
from IPython.display import clear_output
import html
import string

## Schools

In [None]:
# One school-listing page per clan; every page slug is "sc" followed by the
# clan name, so the list is generated from the clan names.
urls = [
    "http://lasthaiku.wikidot.com/sc" + clan_slug
    for clan_slug in (
        "crab",
        "crane",
        "dragon",
        "lion",
        "mantis",
        "phoenix",
        "scorpion",
        "spider",
        "unicorn",
    )
]

In [None]:
# Ignore the following schools: the school-building loop skips any school
# whose parsed title appears in this list.
school_ignore_list = [
    "Chuda Shugenja", # Affinity /Deficiency involves Maho
    "Dark Moto Cavalry", # School requires Taint
    "Free Ogre Bushi", # Requires Ogre
    "Free Ogre Mage", # Requires Ogre
]

# Schools for which the school-building loop calls get_skills with
# choices=False, so their skill entries are returned verbatim and the
# "any ..." choice extraction is not applied to them.
skill_error_list = [
    "Daidoji Scout",
    "Kakita Artisan",
    "Moshi Shugenja"
]

In [None]:
# Download every clan page once and keep (clan name, raw HTML) pairs.
pages = []
for url in urls:
    # A timeout keeps a stalled connection from hanging the notebook, and
    # raise_for_status stops an HTTP error page from being parsed as schools.
    r = requests.get(url, timeout=30)
    r.raise_for_status()
    # The last path segment is "sc<clan>" (e.g. "sccrab"); drop the two-letter
    # "sc" prefix instead of slicing the URL at a hard-coded offset.
    clan = url.rsplit("/", 1)[-1][2:].capitalize()
    pages.append((clan, r.text))

In [None]:
def get_searchable_html(html):
    """Return the prettified markup of the school list inside a clan page.

    Starting at the first <h2> inside the element with id "page-content",
    every sibling element is prettified and concatenated until an <h1> is
    reached or the siblings run out.

    Args:
        html: Raw HTML of a whole clan page.

    Returns:
        The concatenated, prettified markup as a string. An empty string is
        returned when the page has no "page-content" element or no <h2>.
    """
    content = bs(html).find(id="page-content")
    if content is None:
        return ""

    node = content.find("h2")
    fragments = []
    # find_next_sibling() returns None after the last sibling, so the walk has
    # to stop there as well as at the next <h1>.
    while node is not None and node.name != "h1":
        fragments.append(node.prettify())
        node = node.find_next_sibling()
    return "".join(fragments)

In [None]:
def extract_classes(html):
    """Split school-list markup into one HTML string per school.

    The walk starts at the first <h2> and visits each following sibling; every
    <h2> opens a new school, and all elements up to the next <h2> belong to it.

    Args:
        html: Markup whose schools are each introduced by an <h2> heading.

    Returns:
        A list of prettified HTML strings, one per <h2> section. The list is
        empty when the markup contains no <h2> (previously a single empty
        string was returned, which the title parser cannot handle).
    """
    node = bs(html).find("h2")
    classes = []
    current = []

    while node is not None:
        # A new heading closes the section collected so far.
        if node.name == "h2" and current:
            classes.append("".join(current))
            current = []
        current.append(node.prettify())
        node = node.find_next_sibling()

    if current:
        classes.append("".join(current))
    return classes

In [None]:
def get_title_and_class(html):
    """Read the school name and class from a school section's <h2> heading.

    The heading is expected to look like "Name [Class]" with an optional
    second bracketed tag. A bracketed class is kept when it is one of the
    known classes or contains a "/"; otherwise the class is rebuilt from the
    known class words that occur in the school name, joined with "/".

    Returns:
        A (school_name, class_name) tuple; class_name is an empty string when
        nothing could be derived.
    """
    known_classes = ["Bushi", "Artisan", "Shugenja", "Monk", "Courtier", "Ninja"]
    heading = bs(html).find("h2").text.strip()
    parsed = re.match(r"^([a-zA-Z -]+)(?: \[([a-zA-Z\\/]+)\])?(?: \[\w+\])?$", heading)

    if parsed is None:
        school_name, class_name = heading, None
    else:
        school_name, class_name = parsed.groups()
        unrecognised = class_name is not None and class_name not in known_classes
        if unrecognised and "/" not in class_name:
            class_name = None

    if class_name is None:
        # Class should be described in the title of the school
        class_name = "/".join(word for word in known_classes if word in school_name)

    return (school_name, class_name)
    
    

In [None]:
def get_attribute(html):
    """Return the word following "+1" on the school's "Benefit:" line.

    Returns None when no such line is found.
    """
    found = re.search(r"Benefit ?:.+? \+1 (\w+)", html, re.S)
    if found is None:
        return None
    return found.group(1).strip()


def get_skills(html, choices=True):
    """Parse the comma-separated "Skills:" line of a school section.

    Args:
        html: Prettified HTML of one school.
        choices: When True, the parsed entries are passed to
            extract_skill_choices to separate fixed skills from "any ..."
            entries; when False, every entry is returned as a fixed skill.

    Returns:
        A (skills, skill_choices) tuple. skills is None and skill_choices is
        empty when the section has no "Skills:" line.
    """
    matches = re.findall(r"Skills ?:(?:\s*?<\/strong>)?(.+?)<", html, re.S)
    if not matches:
        # Previously None was handed on to extract_skill_choices, which then
        # failed with a TypeError while iterating it.
        return None, []

    # Entries are trimmed and any full stops removed.
    skills = [s.strip().replace(".", "") for s in matches[0].split(",")]

    if choices:
        return extract_skill_choices(skills)
    return skills, []

def extract_skill_choices(skills):
    """Separate fixed skills from free-choice ("any ...") entries.

    An entry containing "any" (case-insensitive) becomes one choice slot, or
    two when it mentions "2" or "two". Each slot is labelled with the skill
    categories named in the entry joined by " / ", with "Low" / "!Low" added
    for low skills, or "Any" when no category is named.

    Returns:
        A (set_skills, skill_choices) tuple of lists.
    """
    categories = ("artisan", "bugei", "craft", "high", "lore", "merchant", "perform", "weapon")
    fixed = []
    slots = []

    for entry in skills:
        lowered = entry.lower()
        if "any" not in lowered:
            fixed.append(entry)
            continue

        picks = 2 if ("2" in lowered or "two" in lowered) else 1

        labels = [name.capitalize() for name in categories if name in lowered]
        if "low" in lowered:
            excluded = "not" in lowered or "non-low" in lowered
            labels.append("!Low" if excluded else "Low")
        if not labels:
            labels.append("Any")

        slots.extend([" / ".join(labels)] * picks)

    return (fixed, slots)
                
                
    
                
                
    
def get_honor(html):
    """Return the first run of digits/dots after "Honor:" as a string.

    Returns None when the section has no honor line.
    """
    found = re.search(r"Honor ?:.+?([\d.]+)", html, re.S)
    if found is None:
        return None
    return found.group(1).strip()
    
    
def get_gear(html):
    """Parse the comma-separated "Outfit:" line of a school section.

    The first entry containing "koku" is removed from the gear list and its
    leading token is reported separately as the starting money.

    Args:
        html: Prettified HTML of one school.

    Returns:
        A (gear, koku) tuple. gear is a list of item strings, or None when
        there is no outfit line. koku is the leading token of the koku entry
        as a string, or the integer 0 when no koku entry exists.
    """
    matches = re.findall(r"Outfit ?:\s+</strong>(.+?)<", html, re.S)
    if not matches:
        # Previously this fell through to len(None) and raised TypeError.
        return None, 0

    gear = [g.strip() for g in matches[0].split(",")]

    koku = 0
    for i, item in enumerate(gear):
        if "koku" in item:
            koku = gear.pop(i).split(" ")[0]
            break

    return gear, koku


def get_aff_def(html):
    """Return the (affinity, deficiency) pair from "Affinity / Deficiency:".

    Returns (None, None) when the section has no such line.
    """
    found = re.search(r"Affinity ?\/ ?Deficiency:.+?(\w+) ?\/ ?(\w+)", html, re.S)
    if found is None:
        return (None, None)
    return found.groups()
    
    
def get_spells(html):
    """Return the comma-separated entries of the school's "Spells:" line.

    Returns None when the section has no spells line.
    """
    found = re.search(r"Spells ?:\s+<\/strong>(.+?)<", html, re.S)
    if found is None:
        return None
    return [entry.strip() for entry in found.group(1).split(",")]
    
    
def get_special(html):
    """Return the text of a school's "Special:" line, or None if absent.

    NOTE: a later cell of this notebook defines another get_special that
    parses a single spell detail line; once that cell has run, it replaces
    this function under the same name.
    """
    found = re.search(r"Special ?:\s+<\/strong>(.+?)<", html, re.S)
    if found is None:
        return None
    return found.group(1).strip()
    
    
def get_techniques(html):
    """Collect a school's techniques keyed by rank (as a string).

    A section with a single "Technique:" line yields one entry under "1".
    Otherwise every "Rank N: Name" paragraph becomes an entry under "N".

    Returns:
        A dict mapping rank to {"name": ..., "effect": ...}; empty when no
        technique is found.
    """
    single = re.search(r"Technique ?:.+?<em>([^<]+)<\/em>\W*(.+?)<", html, re.S)
    if single is not None:
        name, effect = single.groups()
        return {"1": {"name": name.strip(), "effect": effect.strip()}}

    ranked = re.findall(r"Rank (\d) ?: ([^<]+)</strong>\s+(?:<br/>\s+)?(.+?)</p>", html, re.S)
    return {
        rank.strip(): {"name": name.strip(), "effect": effect.strip()}
        for rank, name, effect in ranked
    }

In [None]:
# Build one record per school, keyed by school title.
schools = {}
for clan, page in pages:
    # Deliberately not named "html": rebinding that name at notebook level
    # replaces the imported html module, which get_spell_html uses later.
    clan_html = get_searchable_html(page)
    for school_html in extract_classes(clan_html):
        school = {}
        title, school_class = get_title_and_class(school_html)

        if title in school_ignore_list:
            continue

        school["name"] = title
        school["clan"] = clan
        school["class"] = school_class
        school["attribute"] = get_attribute(school_html)

        # Schools in skill_error_list keep their skill entries verbatim.
        skill_choices_correct = title not in skill_error_list
        school["skills"], school["skill_choices"] = get_skills(school_html, skill_choices_correct)

        school["honor"] = get_honor(school_html)
        school["gear"], school["koku"] = get_gear(school_html)
        school["affinity"], school["deficiency"] = get_aff_def(school_html)
        school["spells"] = get_spells(school_html)
        school["special"] = get_special(school_html)
        school["techniques"] = get_techniques(school_html)

        schools[title] = school

print(json.dumps(schools))

## Spells

In [None]:
# One spell-list page per ring.
spell_urls = [
    "http://lasthaiku.wikidot.com/{}-spells".format(ring)
    for ring in ("air", "earth", "fire", "water", "void")
]

In [None]:
# Download each spell page and keep the inner markup of its content element.
spell_pages = []
for url in spell_urls:
    # Fail fast on a stalled connection or an HTTP error page.
    r = requests.get(url, timeout=30)
    r.raise_for_status()
    soup = bs(r.text)
    subsoup = soup.find(id="page-content")
    if subsoup is None:
        raise ValueError("No page-content element found at {}".format(url))
    # Deliberately not named "html": rebinding that name at notebook level
    # replaces the imported html module, which get_spell_html relies on.
    page_html = "".join(str(child) for child in subsoup)
    spell_pages.append(page_html)

In [None]:
# Sanity check: show only the start of the first page so the notebook output
# is not flooded with a whole page of markup.
print(spell_pages[0][:1000])

In [None]:
def get_spell_html(spell_pages):
    """Cut each spell page into chunks that start at a <strong> tag.

    A chunk runs from one "<strong>" up to and including the next "<strong>"
    (or to the end of the page for the last chunk). Chunks of 20 characters
    or fewer are dropped; <h1>...</h1> elements are removed from the kept
    chunks and HTML entities are unescaped.

    Args:
        spell_pages: Iterable of HTML strings, one per spell page.

    Returns:
        A list of HTML chunk strings in page order.
    """
    # Imported locally so the function does not depend on the notebook-level
    # name "html", which other cells rebind to plain strings.
    from html import unescape

    spells = []

    pattern = re.compile("(<strong>.+?(?:<strong>|$))", re.DOTALL)

    for page in spell_pages:
        while True:
            match = pattern.search(page)

            if match is None:
                break

            if len(match.group(0)) > 20:
                match_text = re.sub(r"<h1.+?\/h1>", "", match.group(0))
                spells.append(unescape(match_text))

            # Collapse the consumed chunk to a bare "<strong>" so the next
            # search starts at the following chunk.
            page = pattern.sub("<strong>", page, count=1)

    return spells

In [None]:
# Split every downloaded page into <strong>-delimited HTML chunks
# (presumably one chunk per spell - TODO confirm against the page layout).
spells = get_spell_html(spell_pages)

In [None]:
def get_spell_info(spell, outlier=False):
    """Parse one spell chunk into a (title, info dict) pair.

    Each <li> detail line is dispatched on its leading label (Ring, Range,
    Area, Duration, Raises, Special, Instantaneous); unrecognised lines are
    reported with a printed message. The get_special called here is the
    single-line parser defined further down in this cell.

    Args:
        spell: HTML chunk of a single spell.
        outlier: Forwarded to get_mastery, which then returns fixed values
            instead of parsing the Ring line.

    Returns:
        (title, info) where info holds title, element, mastery_level,
        keywords (list), range, aoe, duration, raises, special and
        description. Fields with no matching detail line are None.
    """
    title = get_title(spell)
    details = get_details(spell)

    # Defaults for every field, so a spell without a Ring line no longer
    # fails with UnboundLocalError.
    element = None
    level = None
    keywords = ""
    spell_range = None
    aoe = None
    duration = None
    raises = None
    special = None

    for line in details:
        if re.match("^Ring", line, re.I):
            element, level, keywords = get_mastery(line, outlier)
        elif re.match("^Range:", line):
            spell_range = get_range(line)
        elif re.match("^Area?(?: of Effect)?:?", line, re.I):
            aoe = get_aoe(line)
        elif re.match("^Duration", line, re.I):
            duration = get_duration(line)
        elif re.match("^Raises", line, re.I):
            raises = get_raises(line)
        elif re.match("^Special", line, re.I):
            special = get_special(line)
        elif re.match("^Instantaneous", line):
            duration = "Instantaneous"
        else:
            print("Cannot find match: {}".format(line))

    if element is None:
        print("Cannot find Ring/Mastery line: {}".format(title))

    # Split on commas and whitespace so "Craft, Wards" yields clean keywords
    # rather than "Craft," with the comma still attached.
    keyword_list = [word for word in re.split(r"[,\s]+", keywords) if word]

    return (title, {
        "title": title,
        "element": element,
        "mastery_level": level,
        "keywords": keyword_list,
        "range": spell_range,
        "aoe": aoe,
        "duration": duration,
        "raises": raises,
        "special": special,
        "description": get_description(spell)
    })

def get_title(html):
    """Return the spell title from the chunk's <strong> heading.

    The title is the heading text up to the first "[" (or the whole heading
    when there is none), with each word capitalised via string.capwords.

    Args:
        html: HTML chunk of a single spell.

    Returns:
        The normalised title string.

    Raises:
        IndexError: If the chunk has no <strong>...</strong> on one line.
    """
    # Raw strings: "\/" and "\[" are invalid escapes in a normal literal.
    heading = re.findall(r"<strong>(.+)<\/strong>", html)[0]
    text = re.findall(r"^(.+?)(?:\[|$)", heading)[0]

    def f_cap(match):
        return string.capwords(match.group(0))

    return re.sub(r"[a-zA-Z0-9-']+", f_cap, text).strip()
    
    
def get_details(spell):
    """Return the inner HTML of every <li> in the spell's detail list.

    The list is everything between the first "<ul>" and the last "</ul>" of
    the chunk. Items are collected in one left-to-right pass; for ordinary
    list markup this gives the same items as the previous search-and-remove
    loop.

    Args:
        spell: HTML chunk of a single spell.

    Returns:
        A list of strings, one per <li>, in document order.

    Raises:
        AttributeError: If the chunk contains no <ul>...</ul>.
    """
    # Raw strings: "\/" is an invalid escape in a normal string literal.
    ul = re.search(r"<ul>(.+)<\/ul>", spell, re.S).group(1)
    return re.findall(r"<li>(.+?)<\/li>", ul, re.S)
    
    
def get_mastery(text, outlier=False):
    """Parse a "Ring/Mastery: <element> <level> (<keywords>)" detail line.

    Args:
        text: The detail line.
        outlier: When True the line is not parsed and the fixed values
            ["Fire", "4", "Craft"] are returned.

    Returns:
        A three-item sequence (element, level, keywords). keywords is the raw
        text inside the parentheses, or "" when the line has none.

    Raises:
        IndexError: If the line does not match the expected format.
    """
    if outlier:
        return ["Fire", "4", "Craft"]
    # Raw string: "\/", "\w", "\d" and "\(" are invalid escapes otherwise.
    return re.findall(r"ring\/mastery?:? (\w+) (\d)(?: \(([a-z, ]+)\))?", text, re.I)[0]

def get_range(text):
    """Return the trimmed text after "Range:" on a detail line.

    Raises IndexError when the line contains no "Range:" value.
    """
    captured = re.findall("Range:(.+?)$", text)
    return captured[0].strip()

def get_duration(text):
    """Return the trimmed text after "Duration" (colon optional, any case).

    Raises IndexError when the line contains no duration value.
    """
    captured = re.findall("Duration:?(.+?)$", text, re.I)
    return captured[0].strip()

def get_aoe(text):
    """Return the trimmed value of an "Area" / "Area of Effect" line.

    Returns an empty string when the line does not match.
    """
    captured = re.findall("Area?[ ofectE]*:?(.+?)$", text)
    if not captured:
        return ""
    return captured[0].strip()

def get_raises(text):
    """Return the comma-separated entries of a "Raises:" detail line.

    Args:
        text: The detail line.

    Returns:
        A list of trimmed, non-empty entries. The list is empty when the line
        has no "Raises:" value; the original empty-list fallback could never
        be reached and such a line raised IndexError instead.
    """
    captured = re.findall("Raises:(.+)", text)
    if not captured:
        return []
    # Drop blank fragments left by a trailing or doubled comma.
    return [part.strip() for part in captured[0].split(",") if part.strip()]
    
    
def get_special(text):
    """Return the trimmed text after "Special:" on a spell detail line.

    NOTE: this shares its name with the school-section parser defined in an
    earlier cell and replaces it once this cell has run.

    Raises IndexError when the line does not start with "special:" (any case).
    """
    captured = re.findall("^special:(.+)$", text, re.I)
    return captured[0].strip()

def get_description(spell):
    """Return the spell's description text with simple tags removed.

    The description is everything after the first "</ul>" in the chunk. Tags
    consisting only of letters and "/" (for example <p>, </em>, <br/>) are
    stripped; tags with attributes are left in place.

    Args:
        spell: HTML chunk of a single spell.

    Returns:
        The trimmed description string.

    Raises:
        IndexError: If nothing follows a "</ul>" in the chunk.
    """
    text = re.findall("</ul>(.+)$", spell, re.DOTALL)[0]
    # Raw string: "\/" is an invalid escape in a normal string literal.
    description = re.sub(r"<[a-zA-Z\/]+>", "", text)
    return description.strip()

In [None]:
spell_info = {}

# The chunk at index 167 is parsed with outlier=True, which makes get_mastery
# return fixed Fire / 4 / Craft values instead of parsing its Ring line.
for index, spell in enumerate(spells):
    try:
        title, info = get_spell_info(spell, index == 167)
        spell_info[title] = info
    except Exception as e:
        # Show which chunk failed before re-raising.
        print(index)
        print(spell)
        raise(e)

In [None]:
# Write the parsed spells to disk; the with-block closes (and flushes) the
# file, which the bare open() call never did.
with open("spells_full.json", "w") as spell_file:
    json.dump(spell_info, spell_file, indent=4)

## Kata

In [None]:
kata_url = "http://lasthaiku.wikidot.com/katas"
# Fail fast on a stalled connection or an HTTP error page.
r = requests.get(kata_url, timeout=30)
r.raise_for_status()
soup = bs(r.text)
subsoup = soup.find(id="toc0")

# Each kata is a <p> holding its name, followed by a <ul> of detail lines.
kata = subsoup.find_next_sibling("p")

kata_info = {}

while kata:
    kata_ul = kata.find_next_sibling("ul")
    if kata_ul is None:
        # A paragraph with no list after it is not a kata entry, and no later
        # paragraph can have one either.
        break
    kata_list = kata_ul.find_all("li")

    # Raw string: "\/" is an invalid escape in a normal string literal.
    mastery = re.findall(r"Ring(?:\/| )Mastery:(.+)", kata_list[0].text)[0].strip()

    school_text = re.findall("Schools:(.+)", kata_list[1].text)[0].strip()
    # Stored under its own name so the schools dict built in the Schools
    # section is not overwritten. A single school stays a plain string;
    # several schools become a list.
    school_names = [s.strip() for s in school_text.split(",")]
    if len(school_names) > 1:
        kata_schools = school_names
    else:
        kata_schools = school_text

    if len(kata_list) == 3:
        effect = re.findall("(?:Effect:)?(.+)", kata_list[2].text)[0].strip()
        special = None
    else:
        special = re.findall("Special:(.+)", kata_list[2].text)[0].strip()
        effect = re.findall("Effect:(.+)", kata_list[3].text)[0].strip()

    kata_info[kata.text] = {
        "mastery": mastery,
        "schools": kata_schools,
        "special": special,
        "effect": effect
    }

    kata = kata.find_next_sibling("p")

print(json.dumps(kata_info))