fix(dcd): update to OpinionSiteLinear #1020

Merged
merged 5 commits on Jun 11, 2024
Changes from 1 commit
192 changes: 76 additions & 116 deletions juriscraper/opinions/united_states/federal_district/dcd.py
@@ -4,142 +4,102 @@
 Author: V. David Zvenyach
 Date created: 2014-02-27
 Substantially Revised: Brian W. Carver, 2014-03-28
+2024-05-03, grossir: Change base class OpinionSiteLinear
 """

 import re
-import time
-from datetime import date
+from datetime import date, datetime
+from typing import Tuple

 from lxml import html

 from juriscraper.lib.string_utils import titlecase
-from juriscraper.OpinionSite import OpinionSite
+from juriscraper.OpinionSiteLinear import OpinionSiteLinear


-class Site(OpinionSite):
+class Site(OpinionSiteLinear):
+    docket_document_number_regex = re.compile(r"(\?)(\d+)([a-z]+)(\d+)(-)(.*)")
+    nature_of_suit_regex = re.compile(r"(\?)(\d+)([a-z]+)(\d+)(-)(.*)")

     def __init__(self, *args, **kwargs):
         super().__init__(*args, **kwargs)
         self.court_id = self.__module__
-        self.url = "https://ecf.dcd.uscourts.gov/cgi-bin/Opinions.pl?" + str(
-            date.today().year
-        )
+        self.url = f"https://ecf.dcd.uscourts.gov/cgi-bin/Opinions.pl?{date.today().year}"
+        self.status = "Published"

-    def _get_download_urls(self):
-        # There are often multiple documents and hence urls for each case.
-        # This requires us to pad every other metadata field to match the
-        # number of urls we find here.
-        path = "//table[2]//tr[position()>0]/td[3]/a/@href"
-        return [url for url in self.html.xpath(path)]
+    def _process_html(self):
+        """
+        Some rows have multiple documents and hence urls for each case.
+        We will "pad" every other metadata field to match the urls.
+        """
+        for row in self.html.xpath("//table[2]//tr[not(th)]"):
+            case_name = titlecase(
+                row.xpath("td[2]//text()[preceding-sibling::br]")[0].lower()
+            )
+            date_string = row.xpath("td[1]/text()")[0]
+            date_filed = datetime.strptime(date_string, "%m/%d/%Y")
+            docket = row.xpath("td[2]//text()[following-sibling::br]")[0]

-    def _get_case_names(self):
-        casenames = []
-        rowpath = "//table[2]//tr[position()>0]"
-        cnpath = "./td[2]//text()[preceding-sibling::br]"
-        urlpath = "./td[3]/a/@href"
-        for row in self.html.xpath(rowpath):
-            case_list = row.xpath(cnpath)
-            for rough_case_name in case_list:
-                case_name = titlecase(rough_case_name.lower())
-                # Determine the number of urls in each row and pad the case
-                # name list sufficiently
-                count = len(row.xpath(urlpath))
-                casenames.extend([case_name] * count)
-        return casenames
+            judge_element = row.xpath("td[3]")[0]
+            judge_string = html.tostring(
+                judge_element, method="text", encoding="unicode"
+            )
+            judge = re.search(r"(by\s)(.*)", judge_string, re.MULTILINE).group(2)

-    def _get_case_dates(self):
-        dates = []
-        rowpath = "//table[2]//tr[position()>0]"
-        datepath = "./td[1]/text()"
-        urlpath = "./td[3]/a/@href"
-        for row in self.html.xpath(rowpath):
-            date_string = row.xpath(datepath)
-            for d in date_string:
-                date_object = date.fromtimestamp(
-                    time.mktime(time.strptime(d, "%m/%d/%Y"))
-                )
-                # Determine the number of urls in each row and pad the date
-                # list sufficiently
-                count = len(row.xpath(urlpath))
-                dates.extend([date_object] * count)
-        return dates
+            for url in row.xpath("td[3]/a/@href"):
+                doc_number, nature_of_suit = self.get_values_from_url(url)
+                self.cases.append(
+                    {
+                        "name": case_name,
+                        "date": str(date_filed),
+                        "url": url,
+                        "docket": docket,
+                        "docket_document_numbers": doc_number,
+                        "nature_of_suit": nature_of_suit,
+                        "judge": judge,
+                    }
+                )

-    def _get_precedential_statuses(self):
-        return ["Published"] * len(self.case_names)
-
-    def _get_docket_numbers(self):
-        docket_numbers = []
-        rowpath = "//table[2]//tr[position()>0]"
-        dktpath = "./td[2]//text()[following-sibling::br]"
-        urlpath = "./td[3]/a/@href"
-        for row in self.html.xpath(rowpath):
-            docket_number = row.xpath(dktpath)
-            # Determine the number of urls in each row and pad the docket
-            # numbers list sufficiently
-            count = len(row.xpath(urlpath))
-            docket_numbers.extend(docket_number * count)
-        return docket_numbers
+    def get_values_from_url(self, url: str) -> Tuple[str, str]:
+        """Get docket document number and nature_of_suit values from URL
+
+        :param url:
+        :return: docket document number and nature_of_suit
+        """
+        # In 2012 (and perhaps elsewhere) they have a few weird urls.
+        match = self.docket_document_number_regex.search(url)
+        if match:
+            doc_number = match.group(6)
+        else:
+            doc_number = url

-    def _get_docket_document_numbers(self):
-        document_numbers = []
-        regex = re.compile(r"(\?)(\d+)([a-z]+)(\d+)(-)(.*)")
-        for url in self.html.xpath(
-            "//table[2]//tr[position()>0]/td[3]/a/@href"
-        ):
-            # Because we are acting directly on the entire url list, no padding
-            # of the docket number field is required.
-            doc_no = regex.search(url)
-            # In 2012 (and perhaps elsewhere) they have a few weird urls.
-            if re.search(regex, url) is not None:
-                document_numbers.append(doc_no.group(6))
-            else:
-                document_numbers.append(url)
-        return document_numbers
+        nature_of_suit_match = self.nature_of_suit_regex.search(url)
+        # In 2012 (and perhaps elsewhere) they have a few weird urls.
+        if not nature_of_suit_match:
+            nature_of_suit = "Unknown"
+        else:
+            nature_code = nature_of_suit_match.group(3)
+            if nature_code == "cv":
+                nature_of_suit = "Civil"
+            elif nature_code == "cr":
+                nature_of_suit = "Criminal"
+            # This is a tough call. Magistrate Cases are typically also
+            # Criminal or Civil cases, and their docket_number field will
+            # reflect this, but they do classify these separately under
+            # these 'mj' and 'mc' codes and the first page of these
+            # documents will often refer to them as 'Magistrate Case
+            # ####-####', so we will too.
+            elif nature_code in ("mj", "mc"):
+                nature_of_suit = "Magistrate Case"
+            else:
+                nature_of_suit = "Unknown"

-    def _get_judges(self):
-        judges = []
-        rowpath = "//table[2]//tr[position()>0]"
-        urlpath = "./td[3]/a/@href"
-        judgepath = "./td[3]"
-        for row in self.html.xpath(rowpath):
-            for judge_element in row.xpath(judgepath):
-                judge_string = html.tostring(
-                    judge_element, method="text", encoding="unicode"
-                )
-                judge = re.search(
-                    r"(by\s)(.*)", judge_string, re.MULTILINE
-                ).group(2)
-                # Determine the number of urls in each row and pad the judges
-                # list sufficiently
-                count = len(row.xpath(urlpath))
-                judges.extend([judge] * count)
-        return judges
+        return doc_number, nature_of_suit
+
+    def _get_docket_document_numbers(self):
+        return [case["docket_document_numbers"] for case in self.cases]

     def _get_nature_of_suit(self):
-        nos = []
-        for url in self.html.xpath(
-            "//table[2]//tr[position()>0]/td[3]/a/@href"
-        ):
-            # Because we are acting directly on the entire url list, no padding
-            # of the nature of suit field is required.
-            regex = r"(\?)(\d+)([a-z]+)(\d+)(\-)(.*)"
-            url_str = re.search(r"(\?)(\d+)([a-z]+)(\d+)(-)(.*)", url)
-            # In 2012 (and perhaps elsewhere) they have a few weird urls.
-            if re.search(regex, url) is not None:
-                nature_code = url_str.group(3)
-                if nature_code == "cv":
-                    nos.append("Civil")
-                elif nature_code == "cr":
-                    nos.append("Criminal")
-                # This is a tough call. Magistrate Cases are typically also
-                # Criminal or Civil cases, and their docket_number field will
-                # reflect this, but they do classify these separately under
-                # these 'mj' and 'mc' codes and the first page of these
-                # documents will often refer to them as 'Magistrate Case
-                # ####-####' so, we will too.
-                elif nature_code == "mj" or "mc":
-                    nos.append("Magistrate Case")
-                else:
-                    nos.append("Unknown")
-            else:
-                nos.append("Unknown")
-        return nos
+        return [case["nature_of_suit"] for case in self.cases]
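For reference, the two class-level patterns parse the query string of each opinion link. A minimal sketch of what the groups capture, using a hypothetical URL shaped like the document hrefs this scraper reads (the real host and path may differ):

import re

pattern = re.compile(r"(\?)(\d+)([a-z]+)(\d+)(-)(.*)")

# Hypothetical link, shaped like the hrefs found in the Opinions.pl table.
url = "https://ecf.dcd.uscourts.gov/cgi-bin/show_public_doc?2014cv0123-45"

match = pattern.search(url)
if match:
    print(match.group(2))  # "2014" - year
    print(match.group(3))  # "cv"   - case-type code used for nature_of_suit
    print(match.group(4))  # "0123" - case number
    print(match.group(6))  # "45"   - docket document number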
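A note on the magistrate-case branch in get_values_from_url: the membership test nature_code in ("mj", "mc") matters, because the tempting shorthand nature_code == "mj" or "mc" (which the deleted _get_nature_of_suit used) parses as (nature_code == "mj") or "mc" and is always truthy, so its final "Unknown" branch could never run. A quick check:

nature_code = "zz"

print(nature_code == "mj" or "mc")  # "mc" (truthy): the branch would always be taken
print(nature_code in ("mj", "mc"))  # False: falls through to "Unknown" as intended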
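The "padding" described in the _process_html docstring can be seen with a toy row. This is a hypothetical two-document row, shaped like the court's opinion table, showing why each href becomes its own case entry sharing the row's name, date, docket, and judge:

from lxml import html

# Hypothetical row: docket before <br/>, case name after, two document links.
row_html = """
<table><tr>
  <td>05/03/2024</td>
  <td>1:24-cv-0001<br/>DOE v. AGENCY</td>
  <td><a href="?2024cv0001-10">Memo</a>
      <a href="?2024cv0001-12">Order</a> by Judge Example</td>
</tr></table>"""

row = html.fromstring(row_html).xpath("//tr")[0]
urls = row.xpath("td[3]/a/@href")
print(urls)       # ['?2024cv0001-10', '?2024cv0001-12']
print(len(urls))  # 2 -> two case dicts sharing the row's metadata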
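Finally, a sketch of how the reworked Site is typically exercised, assuming juriscraper's usual AbstractSite entry point (parse() downloads the page and, for OpinionSiteLinear subclasses, drives _process_html()); the attribute names follow the getters above:

from juriscraper.opinions.united_states.federal_district import dcd

site = dcd.Site()
site.parse()  # fetch Opinions.pl for the current year and build the cases

for name, docket, url in zip(
    site.case_names, site.docket_numbers, site.download_urls
):
    print(docket, name, url)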