In [36]:
import pandas as pd
import numpy as np

import os, re, sys, json
from bs4 import BeautifulSoup

In [98]:
# get list of xml files
xmls = np.array(os.listdir('OBO_XML_7-2/sessionsPapers')[1:])

# split files into 25 or 50 year spans
span_labels = ['1674-1699', '1700-1749', '1750-1799', '1800-1824', '1825-1849', 
        '1850-1874', '1875-1899', '1900-1913']
spans = [xmls[np.all([xmls > s[:4], xmls < s[5:] + '9999.xml'], axis=0)] for s in span_labels]

# check that split files add up to total
sum([len(s) for s in spans]) == len(xmls)

True

In [230]:
def session_to_df(xml, data):
    ''' Return a dataframe with one row per trial in the session'''

    # read in file
    with open(xml) as f:
        try:
            session = f.read()
        except UnicodeDecodeError:
            print(xml + " couldn't be read")
            return data

    soup = BeautifulSoup(session, 'xml')

    # separate session into trials
    trials = soup.find_all('div1', type='trialAccount')
    # get the session id
    session_id = soup.find('div0').get('id')


    # iterate through trials
    for trial in trials:

        trial_soup = BeautifulSoup(str(trial), 'xml')

        # get the text
        trial_txt = trial_soup.get_text()
        # remove leading/trailing new lines, extra new lines, extra spaces
        trial_txt = re.sub(r'^\n+|\n+$', '', trial_txt)
        trial_txt = re.sub(r'\n', '', trial_txt)
        trial_txt = re.sub(r'\s\s+', ' ', trial_txt)
        # add to data dictionary
        data['transcript'].append(trial_txt)
        
        # Get offense category and subcategory;
        # Note: we simplify here - only a small percentage of trials have
        # ... several offense categories, and when they do, it is nearly always
        # ... various subcategories of theft.
        # So we only save the first offense-suboffense we encounter for each trial.
        # We also guard ourselves against somebody having forgotten to mark the category.
        try:
            mainc = trial_soup.find('interp',type='offenceCategory').get('value')
            data['offense'].append(mainc.strip())
        except AttributeError:
            data['offense'].append('uncategorized')
        try:
            subc = trial_soup.find('interp',type='offenceSubcategory').get('value')
            data['offense_subcategory'].append(subc.strip())
        except AttributeError:
            data['offense_subcategory'].append('none') 
            
                # Get verdict category/subcategory (first verdict only)
        try:
            mainc = trial_soup.find('interp',type='verdictCategory').get('value')
            data['verdict'].append(mainc.strip())
        except AttributeError:
            data['verdict'].append('uncategorized')
        try:
            subc = trial_soup.find('interp',type='verdictSubcategory').get('value')
            data['verdict_subcategory'].append(subc.strip())
        except AttributeError:
            data['verdict_subcategory'].append('none') 
        
        # get the punishment and sub-punishment (first only)
        try:
            mainc = trial_soup.find('interp',type='punishmentCategory').get('value')
            data['punishment'].append(mainc.strip())
        except AttributeError:
            data['punishment'].append('uncategorized')
        try:
            subc = trial_soup.find('interp',type='punishmentSubcategory').get('value')
            data['punishment_subcategory'].append(subc.strip())
        except AttributeError:
            data['punishment_subcategory'].append('none') 
                 
        # get the trial id
        data['trial_id'].append(trial_soup.find('div1').get('id'))
        data['session'].append(session_id)
    return data

In [231]:
# range of sessions from about 25 yrs before and after the Bloody Code repeal
bloody_span = xmls[np.all([xmls > '1800', xmls < '1850'], axis=0)]
data = defaultdict(list)
count = 0

for session in bloody_span:
    if count % 25 == 0:
        print('{}% processed'.format(count * 100 / len(bloody_span)))
    data = session_to_df('OBO_XML_7-2/sessionsPapers/' + session, data)
    count+=1

0.0% processed
5.399568034557236% processed
10.799136069114471% processed
16.198704103671705% processed
21.598272138228943% processed
26.997840172786177% processed
32.39740820734341% processed
37.79697624190065% processed
43.196544276457885% processed
48.59611231101512% processed
53.99568034557235% processed
59.39524838012959% processed
64.79481641468682% processed
OBO_XML_7-2/sessionsPapers/18370130.xml couldn't be read
OBO_XML_7-2/sessionsPapers/18371211.xml couldn't be read
70.19438444924405% processed
75.5939524838013% processed
80.99352051835854% processed
86.39308855291577% processed
91.792656587473% processed
OBO_XML_7-2/sessionsPapers/18461123.xml couldn't be read
97.19222462203024% processed
OBO_XML_7-2/sessionsPapers/18490820.xml couldn't be read


In [246]:
bloody_data = pd.DataFrame(data)
bloody_data.set_index('trial_id', inplace=True)
bloody_data.shape


(85756, 8)

In [249]:
bloody_data.to_csv('obc_1800_1850.csv')
