# In [117]:
import bs4 as bs
import pandas as pd
from itertools import zip_longest as zzip
import io


##########################################
# scrape and build dataset
# scraped from https://mhst.kiranico.com/monstie
##########################################

def text_filter(txt):
    # Build and return a predicate over tags.
    # The predicate reports whether a tag's stripped text begins with txt,
    # which lets section headers be located by their label.
    def starts_with_label(tag):
        return tag.get_text(strip=True).startswith(txt)
    return starts_with_label

def get_innerhtml(tag):
    # Return a BeautifulSoup tag's inner html as ASCII-only text.
    # encode_contents() produces UTF-8 bytes by default, so decoding them
    # as ASCII raised UnicodeDecodeError on any non-ASCII character.
    # Decode as UTF-8 instead, then swap each non-ASCII character for its
    # numeric character reference (U+00E9 becomes '&#233;'), which is
    # equivalent html and keeps the result pure ASCII.
    inner = tag.encode_contents(None).decode('utf-8')
    return inner.encode('ascii', 'xmlcharrefreplace').decode('ascii')

def atoi(s, missing_value=None):
    # Convert s to an int; return missing_value when it cannot be converted.
    # Strings: commas and spaces are ignored and everything after the first
    # '.' is dropped, so decimals truncate toward zero: 1.8 is 1, -1.8 is -1.
    # Non-strings (None, int, float, ...) are passed straight to int(), so a
    # missing value yields missing_value instead of an AttributeError.
    if isinstance(s, str):
        candidate = s.replace(',', '').replace(' ', '').split('.')[0]
    else:
        candidate = s
    try:
        return int(candidate)
    except (TypeError, ValueError, OverflowError):
        # TypeError: None and other non-numbers; OverflowError: infinity
        return missing_value

def test_atoi():
    # unit test for atoi: signs, thousands separators, decimal truncation
    # and the fallback returned for unparseable text
    assert atoi('-5') == -5
    assert atoi('+1,202') == 1202
    assert atoi('-1.3') == -1
    assert atoi('9 876.8') == 9876
    # None is a singleton, so compare by identity rather than equality
    assert atoi('abc123') is None
    # a caller-supplied missing_value replaces the default None
    assert atoi('abc123', 0) == 0
    
def read_html_soup(filename, encoding='utf-8'):
    # Read an html file and parse it into a BeautifulSoup object using the
    # 'lxml' parser.
    # encoding is passed to io.open: 'utf-8' (typical html) or None.
    with io.open(filename, mode='r', encoding=encoding) as html_file:
        markup = html_file.read()
    return bs.BeautifulSoup(markup, 'lxml')

def parse_table(soup):
    # Turn table soup into a list of rows, each row a list of cell strings.
    # Within a row every <th> cell is listed before the <td> cells; newlines
    # are removed from each cell's text and its ends are stripped.
    def cell_text(cell):
        return cell.get_text().replace('\n', '').strip()

    return [
        [cell_text(cell) for cell in tr.find_all('th') + tr.find_all('td')]
        for tr in soup.find_all('tr')
    ]

def test_parse_table():
    # unit test for parse_table(soup): a 2x2 table of header cells must come
    # back as two rows of cell text
    markup = """
        <table>
        <tr><th>1</th><th>2</th></tr>
        <tr><th>3</th><th>4</th></tr>
        </table>
    """
    expected_rows = [['1', '2'], ['3', '4']]
    table_soup = bs.BeautifulSoup(markup, "html.parser")
    assert parse_table(table_soup) == expected_rows

    
def lengthen(l, target_len, filler=None):
    # Pad list l in place with filler until it holds target_len items, then
    # return that same list. l must not already be longer than target_len.
    shortfall = target_len - len(l)
    assert shortfall >= 0
    l.extend([filler] * shortfall)
    return l

def test_lengthen():
    # unit test for lengthen: a 2-item list padded out to 4 items with 7
    padded = lengthen([1, 2], 4, 7)
    assert padded == [1, 2, 7, 7]
        
def _tag_texts(tags):
    # stripped text of every tag, in order, as a new list
    return [tag.get_text(strip=True) for tag in tags]


def parse_card(soup):
    # Convert one monstie card's html soup into a flat, csv-ready dict.
    #
    # Values are strings scraped from the card, except the stat_*, eatk_*,
    # edef_* and encount_radius entries, which go through atoi and are ints
    # (or None when the cell is not numeric). List-like fields (tools, egg
    # images, genes, learned skills) are padded with None up to a fixed
    # number of slots; lengthen() asserts when a card has more entries than
    # slots.
    #
    # The layout is read positionally: the 2nd to 5th <table> elements in
    # the card are taken to be skills, level stats, element stats and misc
    # info. A card that lacks an expected element or row fails with
    # AttributeError or IndexError rather than producing a partial dict.
    # The card's description paragraph is not part of the output, so it is
    # not scraped.

    # name
    name = soup.find('h4', {'class': 'card-title'}).get_text(strip=True)

    # overworld tools (at most 2)
    tools = lengthen(_tag_texts(soup.find_all('button')), 2)

    # tendency
    tendency = soup.find('span', {'class': 'badge'}).get_text(strip=True)

    # egg pattern and colors: the first table after the header whose text
    # starts with 'Egg'. find_next is the current name of the legacy
    # findNext alias.
    egg_table = soup.find(text_filter('Egg')).find_next('table')
    egg_imgs = egg_table.find_all('img')  # scanned once, used twice
    # filename only, eg pattern13_1.png
    # full url is https://mhst.kiranico.com/images/game/egg/pattern13_1.png
    img_urls = lengthen(
        [img.get('src').split('/')[-1] for img in egg_imgs], 4)
    # the value after the first ':' of the style attribute, eg rgb(a,b,c)
    img_colors = lengthen(
        [img.get('style').split(':')[1].strip() for img in egg_imgs], 4)

    # genes: signature, fixed, common, rare (told apart by css class)
    gene_s = soup.find('div', {'class': 'text-secondary'}).get_text(strip=True)
    genes_f = lengthen(
        _tag_texts(soup.find_all('div', {'class': 'text-success'})), 2)
    genes_c = lengthen(
        _tag_texts(soup.find_all('div', {'class': 'text-info'})), 5)
    genes_r = lengthen(
        _tag_texts(soup.find_all('div', {'class': 'text-danger'})), 5)

    # every table in the card, looked up once instead of once per section
    tables = soup.find_all('table')

    # skills: skill table is 2nd
    skills = parse_table(tables[1])
    # 1 kinship skill in top row, then max 5 learned skills
    if skills[0][0] != 'Kinship Skill':
        print('%s is not "Kinship Skill" for Monstie %s'
              % (skills[0][0], name))
    skill_k = skills[0][1]
    # row[0] is level learned at, row[1] is the skill itself
    skills_l = lengthen([row[1] for row in skills[1:]], 5)

    # level stats: rows 11 and 21 are expected to be levels 50 and 99
    levels = parse_table(tables[2])
    if levels[11][0] != '50' or levels[21][0] != '99':
        print('levels improperly formatted for Monstie %s' % name)
    stats_50 = [atoi(cell) for cell in levels[11][1:5]]  # hp, atk, def, spd
    stats_99 = [atoi(cell) for cell in levels[21][1:5]]  # hp, atk, def, spd

    # element atk and def: fire, water, thunder, ice, dragon
    elems = parse_table(tables[3])
    eatk = [atoi(cell) for cell in elems[1][1:]]
    edef = [atoi(cell) for cell in elems[2][1:]]

    # misc info = 5th table; the value is the 2nd cell of each row
    mdata = parse_table(tables[4])
    habitat = mdata[0][1]
    growth = mdata[1][1]
    default_attr = mdata[2][1]
    encount_radius = atoi(mdata[3][1])

    # build csv-ready flat dict of monstie data; key order is the column
    # order of the resulting table
    return {
        'name': name,
        'tool1': tools[0],
        'tool2': tools[1],
        'tendency': tendency,
        'egg_pattern1': img_urls[0],
        'egg_pattern2': img_urls[1],
        'egg_pattern3': img_urls[2],
        'egg_pattern4': img_urls[3],
        'egg_color1': img_colors[0],
        'egg_color2': img_colors[1],
        'egg_color3': img_colors[2],
        'egg_color4': img_colors[3],
        'gene_s': gene_s,
        'gene_f1': genes_f[0],
        'gene_f2': genes_f[1],
        'gene_c1': genes_c[0],
        'gene_c2': genes_c[1],
        'gene_c3': genes_c[2],
        'gene_c4': genes_c[3],
        'gene_c5': genes_c[4],
        'gene_r1': genes_r[0],
        'gene_r2': genes_r[1],
        'gene_r3': genes_r[2],
        'gene_r4': genes_r[3],
        'gene_r5': genes_r[4],
        'skill_k': skill_k,
        'skill_l1': skills_l[0],
        'skill_l2': skills_l[1],
        'skill_l3': skills_l[2],
        'skill_l4': skills_l[3],
        'skill_l5': skills_l[4],
        'stat_50_hp': stats_50[0],
        'stat_50_atk': stats_50[1],
        'stat_50_def': stats_50[2],
        'stat_50_spd': stats_50[3],
        'stat_99_hp': stats_99[0],
        'stat_99_atk': stats_99[1],
        'stat_99_def': stats_99[2],
        'stat_99_spd': stats_99[3],
        'eatk_f': eatk[0],
        'eatk_w': eatk[1],
        'eatk_t': eatk[2],
        'eatk_i': eatk[3],
        'eatk_d': eatk[4],
        'edef_f': edef[0],
        'edef_w': edef[1],
        'edef_t': edef[2],
        'edef_i': edef[3],
        'edef_d': edef[4],
        'habitat': habitat,
        'growth': growth,
        'default_attr': default_attr,
        'encount_radius': encount_radius,
    }


def run_unit_tests():
    # run every unit test in this cell, in order; a failing assertion
    # propagates to the caller
    for unit_test in (test_atoi, test_lengthen, test_parse_table):
        unit_test()

    
def build_save_data(input_filename="data/monsties.html",
                    output_filename='data/monstie_output.tsv'):
    # Run the unit tests, parse every monstie card found in the html file
    # into one DataFrame row, save the table as tab-separated text and
    # return the DataFrame.
    # input_filename: html file holding the cards (div.card-body elements)
    # output_filename: where the tsv is written; to_csv is called without
    #   an index argument, so the DataFrame index is written too
    # Both parameters default to the paths that were previously hard-coded,
    # so calling with no arguments behaves as before.
    run_unit_tests()
    soup = read_html_soup(input_filename)
    card_soups = soup.find_all('div', {'class': 'card-body'})
    df = pd.DataFrame([parse_card(card) for card in card_soups])
    df.to_csv(output_filename, sep='\t')
    return df



# build the dataset with the builder defined in this cell (build_save_data;
# no build_data is defined here), then preview the first rows
df = build_save_data()
df.head()
