In [1]:
from datetime import date
import json
import os
import re

import numpy as np
import pandas as pd
from bs4 import BeautifulSoup
import parsedatetime

In [None]:
%%time
# Read the raw SCOTUS dump, keeping only the two columns this notebook uses.
df = pd.read_csv(
    filepath_or_buffer='../data/scotus_file',
    usecols=['author', 'html'],
)

In [None]:
# Print the row count, discard every row that has at least one missing value,
# then print the row count again so the number of dropped rows is visible.
print(df.shape[0])
df.dropna(axis="index", how="any", inplace=True)
print(df.shape[0])

In [None]:
%%time
def parse_html(raw_html):
    """Return the text content of an HTML fragment.

    The fragment is parsed with BeautifulSoup using the "html.parser" backend
    and the parsed document's ``.text`` is returned.
    """
    soup = BeautifulSoup(raw_html, "html.parser")
    return soup.text
# Convert each opinion's HTML to plain text. Only the 'html' column is needed,
# so map over that column directly: DataFrame.apply(axis=1) would build a
# Series object for every row just to read one field from it.
df['text'] = df['html'].map(parse_html)

In [2]:
%%time
# Rebind `df` to the contents of parsed.csv (presumably the saved result of
# the HTML-parsing cells above -- TODO confirm).
df = pd.read_csv(filepath_or_buffer='../data/parsed.csv')

CPU times: user 9.38 s, sys: 1.18 s, total: 10.6 s
Wall time: 10.5 s


In [3]:
%%time
# names[i] = (first, middle, last) for the person whose id is i; entries stay
# None for ids that have no file in PEOPLE_ROOT.
names = 10000 * [None]
PEOPLE_ROOT = "../data/people"
for file_name in os.listdir(PEOPLE_ROOT):
    # The id is the part of the file name before the first '.'. Entries whose
    # stem is not an integer (hidden files, checkpoint folders, ...) are
    # skipped instead of aborting the whole load with a ValueError.
    try:
        author_idx = int(file_name.split('.')[0])
    except ValueError:
        continue
    # Grow the table when an id does not fit the initial capacity, so a large
    # id no longer raises IndexError.
    if author_idx >= len(names):
        names.extend([None] * (author_idx + 1 - len(names)))
    # JSON is UTF-8 by specification; do not rely on the platform default
    # encoding, which could garble or reject non-ASCII names.
    with open(os.path.join(PEOPLE_ROOT, file_name), encoding="utf-8") as f:
        data = json.load(f)
    names[author_idx] = (data['name_first'], data['name_middle'], data['name_last'])
names[0:5]

CPU times: user 486 ms, sys: 106 ms, total: 592 ms
Wall time: 592 ms


In [4]:
def get_idx(author_url): # returns int
    """Return the integer id embedded in an author URL.

    The id is the second-to-last '/'-separated piece of the URL, i.e. the
    path segment immediately before the final slash.
    """
    url_parts = author_url.split('/')
    id_segment = url_parts[-2]
    return int(id_segment)
# Sanity check: extract the id from the author URL stored under the first
# index label of the frame.
get_idx(df.loc[df.index[0], 'author'])

26

In [5]:
%%time
# Look up the (first, middle, last) tuple for every row's author. Only the
# 'author' column is needed, so map over it directly rather than using
# DataFrame.apply(axis=1), which builds a Series object for every row.
fml = df['author'].map(lambda author_url: names[get_idx(author_url)])
# Transpose the Series of 3-tuples into three parallel lists.
first, middle, last = map(list, zip(*fml))
last[0:5]

CPU times: user 346 ms, sys: 3.63 ms, total: 350 ms
Wall time: 348 ms


In [6]:
# Attach the three name parts to the frame, one column per part.
for column_label, column_values in zip(('first', 'middle', 'last'),
                                       (first, middle, last)):
    df[column_label] = column_values

In [7]:
%%time
# Calendar object used to parse free-form date text; parsedatetime is
# documented at https://bear.im/code/parsedatetime/docs/index.html
cal = parsedatetime.Calendar()
# Matches "Decided" and everything after it up to and including the newline.
pattern = re.compile("Decided.*\n")
def _as_date(fragment):
    """Parse `fragment` with `cal`; return a date only for a pure-date parse."""
    time_struct, parse_status = cal.parse(fragment)
    if parse_status == 1: # parsed as a C{date}
        return date(*time_struct[:3])
    return None


def get_date(text):
    """Extract the decision date from the opening of an opinion's text.

    Only the first 1000 characters are examined. If `pattern` (a "Decided ..."
    line) matches there, the rest of that line is parsed and its result is
    returned as is: when it does not parse as a date the function returns
    None and does not fall back to scanning other lines. When there is no
    "Decided" line, each line of the first 1000 characters is tried in order
    and the first one that parses as a date wins.

    Parameters
    ----------
    text : str
        Plain text of the opinion. Any non-string value (for example the NaN
        pandas produces for an empty CSV cell) yields None instead of raising
        a TypeError inside the regex search.

    Returns
    -------
    datetime.date or None
        The parsed date, or None when no date could be extracted.
    """
    if not isinstance(text, str):
        return None

    search_res = pattern.search(text, 0, 1000)
    if search_res:
        # Strip the leading "Decided" (7 characters) and the trailing newline.
        return _as_date(search_res.group()[7 : -1])

    for line in text[0 : 1000].split('\n'):
        parsed = _as_date(line)
        if parsed is not None:
            return parsed
    return None

# Derive the decision date from each opinion's text. Only the 'text' column is
# needed, so map over it directly rather than using DataFrame.apply(axis=1),
# which builds a Series object for every row.
df['date'] = df['text'].map(get_date)

CPU times: user 12.3 s, sys: 26.9 ms, total: 12.3 s
Wall time: 12.4 s


In [8]:
# Print the row count before and after removing incomplete rows.
# NOTE: no `subset` is given, so a missing value in any column (not only
# 'date') causes the row to be dropped.
print(df.shape[0])
df.dropna(axis="index", how="any", inplace=True)
print(df.shape[0])

21337
20493


In [10]:
%%time
# Save without the row index. This notebook reads its CSVs back with
# pd.read_csv and no index_col, so a written index returns as an extra
# 'Unnamed: 0' column and piles up on every save/load round trip.
df.to_csv('../data/parsedv3.csv', index=False)

CPU times: user 28.5 s, sys: 1.03 s, total: 29.5 s
Wall time: 30 s


In [9]:
# Display the frame as the cell's output for a visual check of its columns.
df

Unnamed: 0.1,Unnamed: 0,author,html,text,first,middle,last,date
0,0,http://www.courtlistener.com/api/rest/v3/peopl...,"<p class=""case_cite"">264 U.S. 22</p>\n <p c...",264 U.S. 22\n44 S.Ct. 261\n68 L.Ed. 541\nPUGET...,William,Howard,Taft,1924-02-18
1,5,http://www.courtlistener.com/api/rest/v3/peopl...,"<p class=""case_cite"">8 S.Ct. 260</p>\n <p c...",8 S.Ct. 260\n123 U.S. 679\n31 L.Ed. 278\nSHERM...,Morrison,Remick,Waite,1887-12-12
2,7,http://www.courtlistener.com/api/rest/v3/peopl...,"<p class=""case_cite"">24 U.S. 446</p>\n <p c...",24 U.S. 446\n6 L.Ed. 516\n11 Wheat. 446\nCARNO...,John,,Marshall,1826-03-14
3,10,http://www.courtlistener.com/api/rest/v3/peopl...,"<p class=""case_cite"">487 U.S. 879</p>\n <p ...",487 U.S. 879\n108 S.Ct. 2722\n101 L.Ed.2d 749\...,John,Paul,Stevens,1988-06-29
4,11,http://www.courtlistener.com/api/rest/v3/peopl...,"<p class=""case_cite"">354 U.S. 390</p>\n <p ...",354 U.S. 390\n77 S.Ct. 1096\n1 L.Ed.2d 1420\nW...,John,Marshall,Harlan,1957-06-17
5,16,http://www.courtlistener.com/api/rest/v3/peopl...,"<p class=""case_cite"">147 U.S. 230</p>\n <p ...",147 U.S. 230\n13 S.Ct. 318\n37 L.Ed. 145\nSUTL...,Horace,,Gray,1893-01-09
6,20,http://www.courtlistener.com/api/rest/v3/peopl...,"<p class=""case_cite"">122 U.S. 469</p>\n <p ...",122 U.S. 469\n7 S.Ct. 1268\n30 L.Ed. 1214\nCLI...,Samuel,Freeman,Miller,1887-05-27
7,21,http://www.courtlistener.com/api/rest/v3/peopl...,"<p class=""case_cite"">178 U.S. 270</p>\n <p ...",178 U.S. 270\n20 S.Ct. 931\n44 L.Ed. 1065\nPIT...,Joseph,,McKenna,1900-05-21
8,25,http://www.courtlistener.com/api/rest/v3/peopl...,"<p class=""case_cite"">138 U.S. 431</p>\n <p ...",138 U.S. 431\n11 S.Ct. 360\n34 L.Ed. 1019\nCAS...,David,Josiah,Brewer,1891-03-02
9,26,http://www.courtlistener.com/api/rest/v3/peopl...,"<p class=""case_cite"">241 U.S. 329</p>\n <p ...",241 U.S. 329\n36 S.Ct. 563\n60 L.Ed. 1027\nJOH...,James,Clark,McReynolds,1916-05-22
