In [1]:
import requests
from bs4 import BeautifulSoup as bs
import re
import json
from IPython.display import clear_output
import html
import string

## Schools

### Download Data

In [2]:
# One school page per clan; every page slug is "sc" followed by the clan name.
urls = [
    "http://lasthaiku.wikidot.com/sc" + clan_slug
    for clan_slug in (
        "crab",
        "crane",
        "dragon",
        "lion",
        "mantis",
        "phoenix",
        "scorpion",
        "spider",
        "unicorn",
    )
]

In [3]:
# Download every clan page and keep (clan name, raw HTML) pairs.
pages = []
for url in urls:
    # A timeout keeps a stalled connection from hanging the notebook, and
    # raise_for_status() stops an HTTP error page from being parsed as data.
    r = requests.get(url, timeout=30)
    r.raise_for_status()
    # The last path segment is "sc<clan>"; drop the "sc" prefix instead of
    # slicing the URL at a hard-coded character offset.
    slug = url.rsplit("/", 1)[-1]
    clan = slug[len("sc"):].capitalize()
    pages.append((clan, r.text))

In [4]:
def get_searchable_html(html):
    """Return the prettified markup of the school section of a clan page.

    Starting at the first <h2> inside the element with id "page-content",
    the prettified form of that tag and of each following sibling is
    concatenated. Collection stops just before the first <h1> sibling, or
    when there are no more siblings.

    Raises IndexError if the page content holds no <h2> at all.
    """
    soup = bs(html)
    content = soup.find(id="page-content")
    headings = content.find_all("h2")

    pieces = []
    node = headings[0]
    # find_next_sibling() returns None after the last sibling; without the
    # None check a page with no trailing <h1> crashed on `None.name`.
    while node is not None and node.name != "h1":
        pieces.append(node.prettify())
        node = node.find_next_sibling()
    return "".join(pieces)

In [5]:
def extract_classes(html):
    """Split school markup into one HTML string per <h2>-introduced section.

    Walks the siblings starting at the first <h2>; each <h2> after the first
    closes the section collected so far. The final (possibly empty) section
    is always appended, so the result has at least one element.
    """
    node = bs(html).find("h2")
    sections = []
    current = []
    seen_heading = False

    while node is not None:
        if node.name == "h2":
            # A new heading ends the previous section, except for the very
            # first heading, which has nothing before it.
            if seen_heading:
                sections.append("".join(current))
                current = []
            seen_heading = True
        current.append(node.prettify())
        node = node.find_next_sibling()

    sections.append("".join(current))
    return sections

In [6]:
def get_title_and_class(html):
    """Return (school_name, class_name) parsed from the section's <h2> text.

    The heading is expected to look like "Name [Class]" with an optional
    second bracketed word. A bracketed class is kept when it is one of the
    known classes or contains "/". Otherwise the class is rebuilt by joining,
    with "/", every known class name that occurs in the school name.
    """
    valid_classes = ["Bushi", "Artisan", "Shugenja", "Monk", "Courtier", "Ninja"]
    heading = bs(html).find("h2").text.strip()
    parsed = re.match(r"^([a-zA-Z -]+)(?: \[([a-zA-Z\\/]+)\])?(?: \[\w+\])?$", heading)

    if parsed is None:
        # Heading does not follow the expected shape: use it verbatim.
        school_name, class_name = heading, None
    else:
        school_name, class_name = parsed.groups()
        unrecognised = class_name is not None and class_name not in valid_classes
        if unrecognised and "/" not in class_name:
            class_name = None

    if class_name is None:
        # Class should be described in the title of the school
        class_name = "/".join(known for known in valid_classes if known in school_name)

    return (school_name, class_name)
    

In [10]:
def get_attribute(html):
    """Return the word following the first "+1" after "Benefit:", or None."""
    found = re.search(r"Benefit ?:.+? \+1 (\w+)", html, re.S)
    if found is None:
        return None
    return found.group(1).strip()
    
def get_skills(html):
    """Return the comma-separated entries after "Skills:" as a list, or None.

    Each entry is stripped of surrounding whitespace and has every "."
    removed. Only the text up to the next "<" is considered.
    """
    found = re.search(r"Skills ?:(?:\s*?<\/strong>)?(.+?)<", html, re.S)
    if found is None:
        return None
    return [entry.strip().replace(".", "") for entry in found.group(1).split(",")]
    
def get_honor(html):
    """Return the first run of digits/dots found after "Honor:", or None.

    The value is returned as a string, exactly as it appears in the markup.
    """
    found = re.search(r"Honor ?:.+?([\d.]+)", html, re.S)
    if found is None:
        return None
    return found.group(1).strip()
    
def get_gear(html):
    """Return the comma-separated items after "Outfit:" as a list, or None.

    The pattern requires whitespace between the colon and the closing
    </strong> tag; the item text runs up to the next "<".
    """
    found = re.search(r"Outfit ?:\s+</strong>(.+?)<", html, re.S)
    if found is None:
        return None
    return [item.strip() for item in found.group(1).split(",")]
    
def get_aff_def(html):
    """Return the (affinity, deficiency) pair, or (None, None) when absent.

    The pair is the first "word / word" found after "Affinity/Deficiency:".
    """
    found = re.search(r"Affinity ?\/ ?Deficiency:.+?(\w+) ?\/ ?(\w+)", html, re.S)
    if found is None:
        return (None, None)
    return found.groups()
    
def get_spells(html):
    """Return the comma-separated entries after "Spells:" as a list, or None.

    The pattern requires whitespace between the colon and the closing
    </strong> tag; the entry text runs up to the next "<".
    """
    found = re.search(r"Spells ?:\s+<\/strong>(.+?)<", html, re.S)
    if found is None:
        return None
    return [entry.strip() for entry in found.group(1).split(",")]
    
def get_special(html):
    """Return the stripped text after "Special:" in school markup, or None.

    The pattern requires whitespace between the colon and the closing
    </strong> tag; the text runs up to the next "<".
    """
    found = re.search(r"Special ?:\s+<\/strong>(.+?)<", html, re.S)
    if found is None:
        return None
    return found.group(1).strip()
    
def get_techniques(html):
    """Return a dict of techniques keyed by rank (as a string).

    If a single "Technique: <em>name</em> effect" entry is present, only it
    is returned, under rank "1". Otherwise every "Rank N: name" paragraph is
    collected. Each value is {"name": ..., "effect": ...}; the dict is empty
    when neither form is found.
    """
    single = re.search(r"Technique ?:.+?<em>([^<]+)<\/em>\W*(.+?)<", html, re.S)
    if single:
        name, effect = single.groups()
        return {"1": {"name": name.strip(), "effect": effect.strip()}}

    ranked = re.findall(r"Rank (\d) ?: ([^<]+)</strong>\s+(?:<br/>\s+)?(.+?)</p>", html, re.S)
    return {
        rank.strip(): {"name": name.strip(), "effect": effect.strip()}
        for rank, name, effect in ranked
    }

In [11]:
# Build {clan: {school title: school record}} from the downloaded clan pages.
clan_info = {}

for clan, page in pages:
    # Deliberately not named `html`: that would rebind the global `html`
    # module imported at the top of the notebook and break html.unescape().
    searchable_html = get_searchable_html(page)
    schools = extract_classes(searchable_html)

    clan_schools = clan_info.setdefault(clan, {})

    for school_html in schools:
        title, school_class = get_title_and_class(school_html)
        affinity, deficiency = get_aff_def(school_html)
        clan_schools[title] = {
            "name": title,
            "class": school_class,
            "attribute": get_attribute(school_html),
            "skills": get_skills(school_html),
            # Placeholder: nothing in this cell fills it in.
            "skill_choices": [],
            "honor": get_honor(school_html),
            "gear": get_gear(school_html),
            "affinity": affinity,
            "deficiency": deficiency,
            "spells": get_spells(school_html),
            "special": get_special(school_html),
            "techniques": get_techniques(school_html),
        }

print(json.dumps(clan_info))



### Modify Files

In [36]:
# Reload the school data from disk; the context manager closes the file
# handle instead of leaving it to the garbage collector.
with open("docs/json/schools.json") as schools_file:
    clan_info = json.load(schools_file)

In [38]:
# Dump the loaded school data as a single JSON line for inspection.
print(json.dumps(clan_info))



In [None]:
# Move the "<amount> koku" entry out of each school's gear list into its own
# "koku" field (only the first such entry per school is moved).
for clan in clan_info:
    for school in clan_info[clan]:
        school_record = clan_info[clan][school]
        gear = school_record["gear"]
        if not gear:
            # "gear" is None when no Outfit line was scraped; len(None)
            # would raise TypeError.
            continue
        for i, item in enumerate(gear):
            if "koku" in item:
                # First space-separated token of the entry, e.g. "5" in "5 koku".
                school_record["koku"] = gear.pop(i).split(" ")[0]
                # Stop right after mutating the list being iterated.
                break

In [51]:
# Preview a clean-up of each school's spell list: entries joined with the
# word "and" are split apart and empty fragments dropped. Nothing is written
# back to clan_info; the before/after lists are only printed.
for clan in clan_info:
    for school in clan_info[clan]:
        school_spells = clan_info[clan][school]["spells"]
        if school_spells is None:
            continue
        print(clan, school)
        print(school_spells)
        new_spells = []
        for s in school_spells:
            # Split on the whole word "and" only; str.split("and") would also
            # cut through any word that merely contains those letters.
            for k in re.split(r"\band\b", s):
                q = k.strip()
                if len(q) > 0:
                    new_spells.append(q)
        print(new_spells)
        print()

Crab Kuni Shugenja
['Sense', 'Commune', 'Summon', '3 Earth', '2 Fire', 'and 1 Water']
['Sense', 'Commune', 'Summon', '3 Earth', '2 Fire', '1 Water']

Crane Asahina Shugenja
['Sense', 'Commune', 'Summon', '3 Air', '2 Water', 'and 1 Earth']
['Sense', 'Commune', 'Summon', '3 Air', '2 Water', '1 Earth']

Dragon Tamori Shugenja
['Sense', 'Commune', 'Summon', '3 Earth', '2 Fire', 'and 1 Water']
['Sense', 'Commune', 'Summon', '3 Earth', '2 Fire', '1 Water']

Lion Kitsu Shugenja
['Sense', 'Commune', 'Summon', '3 Water', '2 Air', 'and 1 Earth']
['Sense', 'Commune', 'Summon', '3 Water', '2 Air', '1 Earth']

Mantis Kitsune Shugenja
['Sense', 'Commune', 'Summon', '3 Earth', '2 Water', 'and 1 Fire']
['Sense', 'Commune', 'Summon', '3 Earth', '2 Water', '1 Fire']

Mantis Moshi Shugenja
['Sense', 'Commune', 'Summon', '3 Air and 3 Fire spells']
['Sense', 'Commune', 'Summon', '3 Air', '3 Fire spells']

Mantis Yoritomo Shugenja
['Sense', 'Commune', 'Summon', '3 Water spells', '2 Fire spells', 'and 1 Air 

In [None]:
for clan

## Spells

In [4]:
# One spell page per ring, named "<ring>-spells".
spell_urls = [
    "http://lasthaiku.wikidot.com/{}-spells".format(ring)
    for ring in ("air", "earth", "fire", "water", "void")
]

In [163]:
# Download each spell page and keep the inner markup of its "page-content"
# element as one string per page.
spell_pages = []
for url in spell_urls:
    # Fail fast on a stalled connection or an HTTP error page.
    r = requests.get(url, timeout=30)
    r.raise_for_status()
    soup = bs(r.text)
    subsoup = soup.find(id="page-content")
    # Joined into an anonymous string rather than a variable called `html`,
    # which would rebind the global `html` module that get_spell_html needs.
    spell_pages.append("".join(str(child) for child in subsoup))

In [334]:
# Show the raw markup of the first spell page (first entry of spell_urls).
print(spell_pages[0])


<table style="margin:0; padding:0">
<tr>
<td style="margin:0; padding:0">
<div id="toc">
<div id="toc-action-bar"><a href="javascript:;" onclick="WIKIDOT.page.listeners.foldToc(event)">Fold</a><a href="javascript:;" onclick="WIKIDOT.page.listeners.unfoldToc(event)" style="display: none">Unfold</a></div>
<div class="title">Table of Contents</div>
<div id="toc-list">
<div style="margin-left: 1em;"><a href="#toc0">Mastery Level 1</a></div>
<div style="margin-left: 1em;"><a href="#toc1">Mastery Level 2</a></div>
<div style="margin-left: 1em;"><a href="#toc2">Mastery Level 3</a></div>
<div style="margin-left: 1em;"><a href="#toc3">Mastery Level 4</a></div>
<div style="margin-left: 1em;"><a href="#toc4">Mastery Level 5</a></div>
<div style="margin-left: 1em;"><a href="#toc5">Mastery Level 6</a></div>
</div>
</div>
</td>
</tr>
</table>
<h1 id="toc0"><span>Mastery Level 1</span></h1>
<p><strong>Arrow's Flight</strong></p>
<ul>
<li>Ring/Mastery: Air 1 (Battle)</li>
<li>Range: Touch</li>
<li>Ar

In [335]:
def get_spell_html(spell_pages):
    """Cut every page into per-spell HTML fragments.

    A fragment starts at a "<strong>" tag and runs up to and including the
    next "<strong>" tag, or to the end of the page for the last one.
    Fragments of 20 characters or fewer are discarded. Single-line
    <h1>...</h1> headings are removed from each kept fragment and HTML
    entities are unescaped.

    Parameters:
        spell_pages: iterable of HTML strings, one per page.

    Returns:
        A list of fragment strings in page order.
    """
    # Imported locally so the function does not depend on the global name
    # `html`, which notebook cells may rebind to a plain string.
    from html import unescape

    marker = "<strong>"
    pattern = re.compile(r"(<strong>.+?(?:<strong>|$))", re.DOTALL)
    heading = re.compile(r"<h1.+?\/h1>")

    spells = []
    for page in spell_pages:
        pos = 0
        while True:
            # Scan forward from `pos` instead of rebuilding the page string
            # with re.sub() after every fragment.
            match = pattern.search(page, pos)
            if match is None:
                break

            fragment = match.group(0)
            if len(fragment) > 20:
                spells.append(unescape(heading.sub("", fragment)))

            if not fragment.endswith(marker):
                # Fragment ran to the end of the page: nothing left to scan.
                break
            # The closing "<strong>" of this fragment opens the next one.
            pos = match.end() - len(marker)

    return spells

In [336]:
# Split all downloaded spell pages into one HTML fragment per spell.
spells = get_spell_html(spell_pages)

In [337]:
# Spot-check a single extracted fragment by position.
spells[172]

'<strong>Consumed by Five Fires</strong></p>\n<ul>\n<li>Ring/Mastery: Fire 5</li>\n<li>Range: 100’</li>\n<li>Area of Effect: One target creature (and caster)</li>\n<li>Duration: Instantaneous</li>\n<li>Raises: Range (+50’ per Raise)</li>\n</ul>\n<p>One of the most potent and deadly of Fire spells, but also one used only by those facing the most desperate need. The spell invokes an array of powerful Fire kami to strike down the target with incredibly powerful heat and flame - but in order to persuade so many kami to strike with such immense power, the shugenja must bestow them with a gift of his own life-force.<br/>\nThe target of this spell is instantly killed, reduced to Dead and then burned down to ashes in a matter of seconds. However, the caster immediately suffers the same number of Wounds as were inflicted on the target - an effect which is often, though not always, lethal. This damage cannot be reduced or mitigated in any way, and if it is lethal the caster cannot avert death. T

In [467]:
def get_spell_info(spell, outlier=False):
    """Parse one spell fragment into a (title, record) pair.

    Parameters:
        spell: HTML fragment for a single spell.
        outlier: forwarded to get_mastery(), which then returns a fixed
            hard-coded value instead of parsing the Ring line.

    Returns:
        (title, record) where record holds the keys "title", "element",
        "mastery_level", "keywords", "range", "aoe", "duration", "raises",
        "special" and "description". Fields whose line is absent are None.

    Raises:
        ValueError: if none of the detail lines starts with "Ring".
    """
    title = get_title(spell)
    details = get_details(spell)

    mastery = None
    spell_range = None
    aoe = None
    duration = None
    raises = None
    special = None

    # The order of these checks matters: the first matching prefix wins.
    for line in details:
        if re.match("^Ring", line, re.I):
            mastery = get_mastery(line, outlier)
        elif re.match("^Range:", line):
            spell_range = get_range(line)
        elif re.match("^Area?(?: of Effect)?:?", line, re.I):
            aoe = get_aoe(line)
        elif re.match("^Duration", line, re.I):
            duration = get_duration(line)
        elif re.match("^Raises", line, re.I):
            raises = get_raises(line)
        elif re.match("^Special", line, re.I):
            special = get_special(line)
        elif re.match("^Instantaneous", line):
            # A bare "Instantaneous" line is a duration without its label.
            duration = "Instantaneous"
        else:
            print("Cannot find match: {}".format(line))

    if mastery is None:
        # Previously this surfaced as an UnboundLocalError on `keywords`.
        raise ValueError("No Ring/Mastery line found for spell {!r}".format(title))
    element, level, keywords = mastery

    # The mastery pattern allows comma-separated keywords, so split on commas
    # as well as whitespace; splitting on whitespace alone left "Battle,".
    keywords = [word for word in re.split(r"[,\s]+", keywords) if word]

    return (title, {
        "title": title,
        "element": element,
        "mastery_level": level,
        "keywords": keywords,
        "range": spell_range,
        "aoe": aoe,
        "duration": duration,
        "raises": raises,
        "special": special,
        "description": get_description(spell)
    })

def get_title(html):
    """Return the spell title taken from the first <strong>...</strong> pair.

    Anything from the first "[" onwards is dropped, every run of letters,
    digits, "-" and "'" is capitalised (first character upper-case, the rest
    lower-case), and surrounding whitespace is stripped.

    Raises IndexError if no single-line <strong>...</strong> pair is found.
    """
    # Raw strings: "\/" and "\[" are invalid escape sequences in ordinary
    # string literals and trigger a SyntaxWarning on current Python versions.
    s = re.findall(r"<strong>(.+)<\/strong>", html)[0]
    text = re.findall(r"^(.+?)(?:\[|$)", s)[0]

    def f_cap(match):
        return string.capwords(match.group(0))

    return re.sub(r"[a-zA-Z0-9-']+", f_cap, text).strip()
    
def get_details(spell):
    """Return the inner text of every <li> between the first <ul> and the last </ul>.

    Returns an empty list when the fragment contains no <ul>...</ul> block
    (previously this raised AttributeError on a None match).
    """
    block = re.search(r"<ul>(.+)<\/ul>", spell, re.S)
    if block is None:
        return []
    # One findall pass replaces the search-then-delete loop, which rebuilt
    # the remaining string after every item.
    return re.findall(r"<li>(.+?)<\/li>", block.group(1), re.S)
    
def get_mastery(text, outlier=False):
    """Return (element, level, keywords) parsed from a "Ring/Mastery" line.

    `level` is a single digit as a string. `keywords` is the text inside the
    optional trailing parentheses, or "" when there are none. When `outlier`
    is true the line is not parsed and the fixed value
    ("Fire", "4", "Craft") is returned.

    Raises IndexError if the line does not match the expected format.
    """
    if outlier:
        # A tuple, so both branches return the same type.
        return ("Fire", "4", "Craft")
    # Raw string: the pattern contains "\/", "\w", "\d" and "\(", none of
    # which are valid escapes in an ordinary string literal.
    return re.findall(r"ring\/mastery?:? (\w+) (\d)(?: \(([a-z, ]+)\))?", text, re.I)[0]

def get_range(text):
    """Return the stripped text following "Range:".

    Raises IndexError if "Range:" is missing or has nothing after it.
    """
    captured = re.findall("Range:(.+?)$", text)
    value = captured[0]
    return value.strip()

def get_duration(text):
    """Return the stripped text following "Duration" (colon optional, any case).

    Raises IndexError if the label is missing or has nothing after it.
    """
    captured = re.findall("Duration:?(.+?)$", text, re.I)
    value = captured[0]
    return value.strip()

def get_aoe(text):
    """Return the stripped text following the "Area [of Effect]" label.

    The label match is case-sensitive. Returns "" when it is not found.
    """
    captured = re.findall("Area?[ ofectE]*:?(.+?)$", text)
    if not captured:
        return ""
    return captured[0].strip()

def get_raises(text):
    """Return the individual raise options listed after "Raises:".

    Options are separated by commas. A comma inside a parenthesised note
    belongs to that option and does not split it (nested parentheses are not
    handled). Each option is stripped of surrounding whitespace.

    Raises IndexError if "Raises:" is missing or has nothing after it.
    """
    spell_raises = re.findall("Raises:(.+)", text)[0]
    # Split only on commas that are not followed by a ")" before any other
    # parenthesis, i.e. commas outside a "(...)" group. The old len() check
    # was dropped: str.split never returns an empty list.
    return [part.strip() for part in re.split(r",(?![^()]*\))", spell_raises)]
    
def get_special(text):
    """Return the stripped text following "Special:" at the start of a spell detail line.

    Matching ignores case. Raises IndexError if the line does not have that
    form.

    NOTE(review): this reuses the name of the school-scraping get_special
    defined earlier in the notebook and replaces it once this cell has run;
    re-running the school cells afterwards will call this version.
    """
    captured = re.findall("^special:(.+)$", text, re.I)
    value = captured[0]
    return value.strip()

def get_description(spell):
    """Return the text after the first </ul>, with simple tags removed.

    Only tags made of letters and "/" (such as <p>, </p>, <br/>, <strong>)
    are stripped; tags carrying attributes are left in place. The result is
    stripped of surrounding whitespace.

    Raises IndexError if there is no </ul> or nothing follows it.
    """
    text = re.findall("</ul>(.+)$", spell, re.DOTALL)[0]
    # Raw string: "\/" is not a valid escape in an ordinary string literal.
    description = re.sub(r"<[a-zA-Z\/]+>", "", text)
    return description.strip()

In [468]:
# Parse every spell fragment into spell_info, keyed by spell title.
spell_info = {}

for i, fragment in enumerate(spells):
    try:
        # Fragment 167 is handled as an outlier: its mastery line is not
        # parsed and get_mastery() supplies a fixed value instead.
        g = get_spell_info(fragment, i == 167)
        spell_info[g[0]] = g[1]
    except Exception as e:
        # Show which fragment failed before re-raising.
        print(i)
        print(spells[i])
        raise(e)

In [469]:
# Write the parsed spells to disk; the context manager guarantees the file is
# flushed and closed, which a bare open() passed to json.dump() does not.
with open("spells_full.json", "w") as spells_file:
    json.dump(spell_info, spells_file, indent=4)

In [442]:
# Display the parsed spell dictionary for inspection.
spell_info

{"Arrow's Flight": {'title': "Arrow's Flight",
  'element': 'Air',
  'mastery_level': '1',
  'keywords': ['Battle'],
  'range': ' ',
  'aoe': '',
  'duration': ' ',
  'raises': ['Duration (+1 Round per Raise)',
   'Special (one additional arrow per Raise)'],
  'special': None,
  'description': 'A prayer used by shugenja to aid bushi when an arrow absolutely must hit its target. The shugenja entreats the Air kami to play a game by guiding the arrow, and if the arrow is fired within the duration of the spell, it will unerringly strike its target. (It must still be fired by someone with a minimal understanding of archery - at least one Rank of Kyujutsu.) However, because it is the kami who ultimately ensure the arrow will hit, the shot cannot benefit from the effects of Raises or Techniques.'},
 'Blessed Wind ': {'title': 'Blessed Wind ',
  'element': 'Air',
  'mastery_level': '1',
  'keywords': ['Defense'],
  'range': ' ',
  'aoe': '',
  'duration': ' ',
  'raises': ['Special (you may ta

## Kata

In [70]:
# Scrape the kata page: each kata is a <p> (its name) followed by a <ul>
# whose items hold the mastery, the schools, an optional "Special" and the
# effect.
kata_url = "http://lasthaiku.wikidot.com/katas"
# Fail fast on a stalled connection or an HTTP error page.
r = requests.get(kata_url, timeout=30)
r.raise_for_status()
soup = bs(r.text)
subsoup = soup.find(id="toc0")

kata = subsoup.find_next_sibling("p")

kata_info = {}

while kata:
    kata_ul = kata.find_next_sibling("ul")
    if kata_ul is None:
        # No <ul> follows this paragraph, so no later paragraph has one
        # either; stop instead of crashing on None.
        break
    kata_list = kata_ul.find_all("li")

    # Raw string: "\/" is not a valid escape in an ordinary string literal.
    mastery = re.findall(r"Ring(?:\/| )Mastery:(.+)", kata_list[0].text)[0].strip()

    school_text = re.findall("Schools:(.+)", kata_list[1].text)[0].strip()
    school_names = [s.strip() for s in school_text.split(",")]
    # Existing output format: a list for several schools, a plain string for one.
    if len(school_names) > 1:
        schools = school_names
    else:
        schools = school_text

    if len(kata_list) == 3:
        # Three items: mastery, schools, effect (the "Effect:" label is optional).
        effect = re.findall("(?:Effect:)?(.+)", kata_list[2].text)[0].strip()
        special = None
    else:
        # Otherwise the third item is "Special:" and the fourth is "Effect:".
        special = re.findall("Special:(.+)", kata_list[2].text)[0].strip()
        effect = re.findall("Effect:(.+)", kata_list[3].text)[0].strip()

    kata_info[kata.text] = {
        "mastery": mastery,
        "schools": schools,
        "special": special,
        "effect": effect
    }

    kata = kata.find_next_sibling("p")

print(json.dumps(kata_info))

{"Striking as Air": {"mastery": "Air 3", "schools": "Any", "special": null, "effect": "When in the Defense Stance your Armor TN is increased by your Air Ring."}, "Breath of Wind Style": {"mastery": "Air 3", "schools": ["Kakita Bushi", "Bayushi Bushi"], "special": null, "effect": "Your Initiative Score increases by 2 during the Reactions Stage of each Combat Round. This effect stacks but disappears immediately if the kata is no longer in effect."}, "Dance of the Winds": {"mastery": "Air 3", "schools": ["Daidoji Bushi", "Shiba Bushi"], "special": null, "effect": "When wielding a polearm or a spear, your Initiative Score is increased by 3."}, "Strength of the Mantis": {"mastery": "Air 3", "schools": "Any Mantis Bushi", "special": null, "effect": "The attack penalty for ranged attacks fired against opponents currently in melee range is reduced by 3."}, "Strength of the Crane": {"mastery": "Air 3", "schools": "Any Crane Bushi", "special": null, "effect": "When fighting with a sword or spear