In [18]:
import csv
from os import path
from glob import glob

from lxml import etree
from lxml.cssselect import CSSSelector as CSS  # also needs cssselect installed
from six import print_

## Notes

Sadly, the pages use CSS or something to make the checkboxes. So, when you look at the bare file, the checked boxes (☑) look like this: â˜‘ (unchecked boxes, ☐, look like â˜). This is likely because Chrome saves files as UTF-8 by default, but then renders them as ISO Latin-1 or something dumb like that. Thanks Chrome!

If you look at the source in a text editor, though, you can see the checkboxes fine.

Also, the HTML structure borders on malicious. There is no useful structure (the HTML was likely driven largely by layout concerns).

## An example

In [6]:
# List the saved page(s) in one district's directory to pick an example file.
%ls scraped-pages/Adelanto\ Elementary

Melva Davis Academy of Excellence.html


In [7]:
# Path to one saved page, used below to try out the parser on a single file.
fname = './scraped-pages/Adelanto Elementary/Melva Davis Academy of Excellence.html'

In [27]:
class LEA_parser:
    '''Parse one saved HTML page into a flat list of response strings.

    Constructing an instance parses ``filename`` as UTF-8 HTML and immediately
    extracts the responses. Every ``li`` under an ``ol.zeroindent`` list
    contributes exactly one entry to ``self.res``:

    * the labels of all checked boxes, joined with ``'; '``;
    * ``'No response'`` when nothing is checked and that text is present;
    * otherwise a diagnostic string containing the item's full text.
    '''
    # I'm trying to use CSS selectors where possible, as more folks will know these
    lists_loc = CSS('ol.zeroindent li')
    # Shared parser; forces UTF-8 decoding of the saved pages.
    parse_utf8 = etree.HTMLParser(encoding='utf-8')

    def __init__(self, filename):
        '''Parse ``filename`` and populate ``self.res``.'''
        self.parsed = etree.parse(filename, self.parse_utf8)
        self.extract_responses()

    def extract_responses(self):
        '''Rebuild ``self.res`` with one string per matched list item.'''
        self.res = []

        for item in self.lists_loc(self.parsed):
            # CSS won't let us grab stuff based on content, only structure
            # So, we're using Xpath (and we're using unicode because of how Chrome saved these)
            checkboxes = item.xpath('.//span[text()="☑"]')
            if checkboxes:
                # Some sections are multiple choice!
                self.res.append('; '.join(self.get_box_label(box) for box in checkboxes))
            # string() exports the text of children also... not sure why,
            # but text() doesn't work here
            elif item.xpath('.//*[contains(string(), "No response")]'):
                self.res.append('No response')
            else:
                # Keep the raw text in the output so the odd item can be inspected later.
                self.res.append('Something is amiss with the parser:' + item.xpath('string()'))

    def get_box_label(self, box_el):
        '''Get the label to the right of the checkbox

        The label is the full text of the element that follows ``box_el``.
        If the checkbox has no following sibling element, the text directly
        after it (its tail), stripped of surrounding whitespace, is used
        instead; this is an empty string when there is none.

        Currently, this doesn't handle "(please specify)" information'''
        label_el = box_el.getnext()
        if label_el is None:
            # getnext() returns None for the last child; without this guard the
            # whole parse died with AttributeError on such a checkbox.
            return (box_el.tail or '').strip()
        return label_el.xpath('string()')

    def as_list(self):
        '''Return the list of extracted response strings.'''
        return self.res

In [28]:
# Seems to be working!
# Parse the example page and display the extracted responses: one list entry
# per `ol.zeroindent li` item found in the page.
LEA_parser(fname).as_list()

['Awareness',
 'The LEA CCSS plan is currently in development.',
 'No response',
 '0-10 hours',
 '0-10 hours',
 'Using formative practices in instruction',
 'Activities provided by other vendors; Activities using online professional learning modules from the CDE',
 'Conference attendance',
 'The LEA does not offer pre-kindergarten or transitional kindergarten programs',
 'The LEA does not offer extended learning/after school programs',
 'No response',
 'Materials are being used in every classroom',
 'State Board of Education-adopted materials for mathematics; Teacher-developed materials; Free supplemental materials provided by the publishers of your currently adopted programs; Free supplemental materials provided by a publisher not directly affiliated with your currently adopted programs',
 'No response',
 '0-10 hours',
 '0-25 percent',
 'Yes',
 'No response',
 'We have not yet presented information regarding CCSS implementation to the governing board',
 'We have not yet shared informa

## Do it!

In [37]:
# Write one CSV row per saved page: district, school, then the parsed responses.
# newline='' is what the csv module expects of the file object (otherwise
# Windows output gets blank rows); the explicit encoding keeps the output
# independent of the platform's locale.
with open('parsed_LEA.csv', 'w', newline='', encoding='utf-8') as outfile:
    writer = csv.writer(outfile)
    # range is up to, but not including the upper limit
    writer.writerow(['District', 'School'] + list(range(1, 26)))

    # glob returns matches in filesystem order; sort so re-runs give the same file.
    for resp_file in sorted(glob('scraped-pages/*/*.html')):
        if '00-NotSubmitted' in resp_file:
            # These don't have any data in them. Just keeping the files
            # because why delete anything?
            continue

        parsed = LEA_parser(resp_file)

        # Layout is scraped-pages/<district>/<school>.html. Use os.path rather
        # than splitting on '/' so this also works where glob returns '\\'.
        district = path.basename(path.dirname(resp_file))
        school, _ = path.splitext(path.basename(resp_file))
        # Presumably '/' in school names was saved as '_' in the file name,
        # since '/' is not allowed there -- TODO confirm against the scraper.
        school = school.replace('_', '/')

        writer.writerow([district, school] + parsed.as_list())


    