# Parse competition description

Extract the JSON description embedded in the competition's HTML home page and save it to a JSON file.

In [1]:
# parameters

# Raw HTML home page that is read and parsed below (`~` is expanded when
# the path is turned into a pathlib.Path).
source = '~/dat/3136/raw/home.html'
# File that receives the extracted description, serialized as JSON.
target = '~/dat/3136/interim/description.json'

In [2]:
from html.parser import HTMLParser
import json
import pathlib
import re

## Paths

In [3]:
# Resolve both notebook parameters to user-expanded pathlib.Path objects,
# keyed by the parameter name.
paths = {
    'source': pathlib.Path(source).expanduser(),
    'target': pathlib.Path(target).expanduser(),
}

## Load

In [4]:
# Read the whole source page into memory as text (default text encoding).
html_code = paths['source'].read_text()

## Transform

In [6]:
class KagglePushedStateHTMLParser(HTMLParser):
    """Collect the JSON objects passed to ``Kaggle.State.push(...)`` in scripts.

    Feed HTML through ``feed()``. The text of every ``<script>`` element is
    recorded, and ``pushed_states()`` decodes the JSON object that follows
    each ``Kaggle.State.push(`` call found in that text.
    """

    # Start of a push call whose argument begins with a JSON object.  The
    # dots are escaped so only the literal name matches, and the lookahead
    # makes the match end exactly at the opening brace.
    _PUSH_CALL_REC = re.compile(r'Kaggle\.State\.push\(\s*(?=\{)')
    _JSON_DECODER = json.JSONDecoder()

    def __init__(self, *args, **kwargs):
        """Forward all arguments to ``HTMLParser`` and reset collected state."""
        super().__init__(*args, **kwargs)
        self._in_script = False
        # True between a <script> start tag and the first text chunk seen
        # for it; later chunks of the same element extend that entry.
        self._awaiting_first_chunk = False
        self._scripts = []

    def handle_starttag(self, tag, attrs):
        """Track whether the element that was just opened is a ``<script>``."""
        self._in_script = (tag == 'script')
        self._awaiting_first_chunk = self._in_script

    def handle_endtag(self, tag):
        """Leave script mode on any end tag."""
        self._in_script = False
        self._awaiting_first_chunk = False

    def handle_data(self, data):
        """Record text that belongs to the current ``<script>`` element.

        The text of one element can arrive in more than one call, so the
        pieces are concatenated into a single entry; otherwise a push call
        that straddles two pieces would never be found.
        """
        if not self._in_script:
            return
        if self._awaiting_first_chunk:
            self._scripts.append(data)
            self._awaiting_first_chunk = False
        else:
            self._scripts[-1] += data

    def pushed_states(self):
        """Yield the decoded JSON object of every push call, in document order.

        The object is decoded with ``JSONDecoder.raw_decode`` starting at its
        opening brace, so it ends at the matching closing brace regardless of
        line breaks inside it or of whatever code follows the call.

        Raises:
            json.JSONDecodeError: if the text after a push call is not valid
                JSON.
        """
        decode = self._JSON_DECODER.raw_decode
        for script in self._scripts:
            position = 0
            while True:
                call = self._PUSH_CALL_REC.search(script, position)
                if call is None:
                    break
                # Resume the search after the decoded object so its content
                # is never scanned for further calls.
                state, position = decode(script, call.end())
                yield state

    def longest_pushed_state(self):
        """Return the pushed state with the most top-level keys, or ``None``.

        When several states share the maximum length the last one wins.
        ``None`` is returned when no push call was found.
        """
        longest = None
        for state in self.pushed_states():
            # ">=" keeps the later state on ties.
            if longest is None or len(state) >= len(longest):
                longest = state
        return longest
            
    
# Parse the whole page.  close() is called after feed() so that any input
# the parser is still buffering is processed before the states are read.
parser = KagglePushedStateHTMLParser()
parser.feed(html_code)
parser.close()
# `result` is None when the page contains no pushed state.
result = parser.longest_pushed_state()

## Save

In [None]:
# Write the extracted state only when one was found; nothing is written
# (and no directory is created) otherwise.
if result:
    destination = paths['target']
    destination.parent.mkdir(parents=True, exist_ok=True)
    destination.write_text(json.dumps(result))