Commit
Showing 19 changed files with 5,608 additions and 5,704 deletions.
juriscraper/opinions/united_states/federal_appellate/ca1.py (183 changes: 78 additions & 105 deletions)
@@ -1,124 +1,97 @@
-import re
-from datetime import date, datetime, timedelta
+from datetime import date, datetime
+from typing import Tuple
 from urllib.parse import urlencode
 
-from dateutil.rrule import DAILY, rrule
-
-from juriscraper.OpinionSite import OpinionSite
+from juriscraper.AbstractSite import logger
+from juriscraper.lib.date_utils import make_date_range_tuples
+from juriscraper.OpinionSiteLinear import OpinionSiteLinear
 
 
-class Site(OpinionSite):
+class Site(OpinionSiteLinear):
+    # This URL will show most recent opinions
+    base_url = "https://www.ca1.uscourts.gov/opn/aci"
+    days_interval = 5
+    first_opinion_date = datetime(2003, 3, 23)
+
     def __init__(self, *args, **kwargs):
         super().__init__(*args, **kwargs)
-        self.base_url = "http://media.ca1.uscourts.gov/cgi-bin/opinions.pl"
         self.court_id = self.__module__
-        today = date.today()
-        params = urlencode(
-            {
-                "FROMDATE": (today - timedelta(7)).strftime("%m/%d/%Y"),
-                "TODATE": today.strftime("%m/%d/%Y"),
-                "puid": "",
-            }
-        )
-        self.url = f"{self.base_url}/?{params}"
-        # self.url = "http://media.ca1.uscourts.gov/cgi-bin/opinions.pl/?TODATE=06%2F24%2F1993&puid=&FROMDATE=05%2F25%2F1993"
-        self.interval = 30
-        self.back_scrape_iterable = [
-            i.date()
-            for i in rrule(
-                DAILY,
-                interval=self.interval,
-                dtstart=date(1992, 1, 1),
-                until=date(2016, 1, 1),
-            )
-        ]
+        self.url = self.base_url
+        self.make_backscrape_iterable(kwargs)
 
-    def _get_case_names(self):
-        return [
-            e.strip()
-            for e in self.html.xpath(
-                "//tr[position() > 1]/td[4]/text()[contains(., 'v.')]"
-            )
-        ]
-
-    def _get_download_urls(self):
-        return [
-            e for e in self.html.xpath("//tr[position() > 1]/td[2]//@href")
-        ]
-
-    def _get_case_dates(self):
-        dates = []
-        for s in self.html.xpath("//tr[position() > 1]/td[1]//text()"):
-            s = s.replace(r"\t", "").replace(r"\n", "").strip()
-            if s == "1996/05/32":
-                s = "1996/05/30"  # My life is thus lain to waste.
-            dates.append(datetime.strptime(s.strip(), "%Y/%m/%d").date())
-        return dates
-
-    def _get_docket_numbers(self):
-        regex = re.compile(r"(\d{2}-.*?\W)(.*)$")
-        docket_numbers = []
-        for s in self.html.xpath("//tr[position() > 1]/td[2]/a/text()"):
-            s = s.replace("O1-", "01-")  # I grow older, the input grows worse.
-            docket_numbers.append(
-                regex.search(s).group(1).strip().replace(".", "")
-            )
-        return docket_numbers
-
-    def _get_precedential_statuses(self):
-        statuses = []
-        for text in self.html.xpath("//tr[position() > 1]/td[2]//@href"):
-            if "U" in text:
-                statuses.append("Unpublished")
-            elif "P" in text:
-                statuses.append("Published")
-            elif "E" in text:
-                statuses.append("Errata")
-            else:
-                statuses.append("Unknown")
-        return statuses
-
-    def _get_lower_courts(self):
-        lower_courts = []
-        for e in self.html.xpath("//tr[position() > 1]/td[4]/font"):
-            try:
-                lower_courts.append(e.xpath("./text()")[0].strip())
-            except IndexError:
-                lower_courts.append("")
-        return lower_courts
+    def _process_html(self):
+        for row in self.html.xpath("//tr[not(th)]"):
+            title = row.xpath("td[2]/a/text()")[0]
+            url = row.xpath("td[2]/a/@href")[0]
+            status = self.get_status_from_opinion_title(title)
+            docket = row.xpath("td[3]/a/text()")[0]
+            date_filed = row.xpath("td[1]/span/text()")[0]
+            name = row.xpath("td[4]/text()")[0]
+            lower_court = row.xpath("td[4]/span/text()")[0]
+            self.cases.append(
+                {
+                    "name": name.strip(),
+                    "url": url,
+                    "date": date_filed,
+                    "status": status,
+                    "docket": docket,
+                    "lower_court": lower_court,
+                }
+            )
+
+    def get_status_from_opinion_title(self, title: str) -> str:
+        """Status is encoded in opinion's link title
+
+        :param title: opinion title. Ex: 23-1667P.01A, 23-1639U.01A
+        :return: status string
+        """
+        if "U" in title:
+            status = "Unpublished"
+        elif "P" in title:
+            status = "Published"
+        elif "E" in title:
+            status = "Errata"
+        else:
+            status = "Unknown"
+        return status
 
-    def _download_backwards(self, d):
-        params = urlencode(
-            {
-                "FROMDATE": d.strftime("%m/%d/%Y"),
-                "TODATE": (d + timedelta(self.interval)).strftime("%m/%d/%Y"),
-                "puid": "",
-            }
-        )
-        self.url = f"{self.base_url}/?{params}"
+    def _download_backwards(self, dates: Tuple[date]) -> None:
+        """Change URL to backscraping date range
+
+        :param dates: tuple with date range to scrape
+        :return None
+        """
+        start, end = dates
+        params = {
+            "field_opn_csno_value_op": "starts",
+            "field_opn_issdate_value[min][date]": start.strftime("%m/%d/%Y"),
+            "field_opn_issdate_value[max][date]": end.strftime("%m/%d/%Y"),
+        }
+        self.url = f"{self.base_url}?{urlencode(params)}"
         self.html = self._download()
         if self.html is not None:
             # Setting status is important because it prevents the download
             # function from being run a second time by the parse method.
             self.status = 200
+            self._process_html()
 
-    def _post_parse(self):
-        """This will remove the cases without a case name"""
-        to_be_removed = [
-            index
-            for index, case_name in enumerate(self.case_names)
-            if not case_name.replace("v.", "").strip()
-        ]
-
-        for attr in self._all_attrs:
-            item = getattr(self, attr)
-            if item is not None:
-                new_item = self.remove_elements(item, to_be_removed)
-                self.__setattr__(attr, new_item)
-
-    @staticmethod
-    def remove_elements(list_, indexes_to_be_removed):
-        return [
-            i for j, i in enumerate(list_) if j not in indexes_to_be_removed
-        ]
+    def make_backscrape_iterable(self, kwargs: dict) -> None:
+        """Checks if backscrape start and end arguments have been passed
+        by caller, and parses them accordingly
+
+        :param kwargs: passed when initializing the scraper, may or
+            may not contain backscrape controlling arguments
+        :return None
+        """
+        start = kwargs.get("backscrape_start")
+        end = kwargs.get("backscrape_end")
+
+        if start:
+            start = datetime.strptime(start, "%m/%d/%Y")
+        else:
+            start = self.first_opinion_date
+        if end:
+            end = datetime.strptime(end, "%m/%d/%Y")
+        else:
+            end = datetime.now()
+
+        self.back_scrape_iterable = make_date_range_tuples(
+            start, end, self.days_interval
+        )
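The new `get_status_from_opinion_title` replaces the old per-row `_get_precedential_statuses` loop: CA1 encodes precedential status as a letter inside the opinion's file title. A minimal standalone sketch of that mapping, runnable outside the scraper (the errata title `22-1025E.01A` is an invented sample; the other two come from the method's docstring):

```python
def status_from_title(title: str) -> str:
    # Branch order mirrors the method above:
    # "U" is checked before "P", which is checked before "E"
    if "U" in title:
        return "Unpublished"
    if "P" in title:
        return "Published"
    if "E" in title:
        return "Errata"
    return "Unknown"


for sample in ("23-1667P.01A", "23-1639U.01A", "22-1025E.01A"):
    print(sample, "->", status_from_title(sample))
# 23-1667P.01A -> Published
# 23-1639U.01A -> Unpublished
# 22-1025E.01A -> Errata
```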
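For backscraping, `make_backscrape_iterable` converts the requested period into `(start, end)` tuples via `make_date_range_tuples`, and `_download_backwards` then turns each tuple into one date-filtered request, with `days_interval = 5` controlling the window size. The real helper lives in `juriscraper.lib.date_utils`; the sketch below (hypothetical `date_range_tuples` name) is only a guess at its chunking behavior, meant to show the shape of `back_scrape_iterable`:

```python
from datetime import date, timedelta


def date_range_tuples(start: date, end: date, gap: int):
    """Split [start, end] into consecutive windows of at most `gap` days."""
    windows = []
    cursor = start
    while cursor <= end:
        window_end = min(cursor + timedelta(days=gap - 1), end)
        windows.append((cursor, window_end))
        cursor = window_end + timedelta(days=1)
    return windows


# Each tuple is what Site._download_backwards receives as `dates`
for window in date_range_tuples(date(2024, 1, 1), date(2024, 1, 12), gap=5):
    print(f"{window[0]} .. {window[1]}")
# 2024-01-01 .. 2024-01-05
# 2024-01-06 .. 2024-01-10
# 2024-01-11 .. 2024-01-12
```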
juriscraper/opinions/united_states/state/nd.py
@@ -1,93 +1,103 @@
 # Author: Phil Ardery
 # Contact: https://www.ndcourts.gov/contact-us
 # Date created: 2019-02-28
-from juriscraper.lib.exceptions import InsanityException
-from juriscraper.lib.string_utils import convert_date_string
-from juriscraper.OpinionSite import OpinionSite
+# Updated: 2024-05-08, grossir: to OpinionSiteLinear and new URL
+import re
+from typing import Tuple
+from urllib.parse import urljoin
+
+from juriscraper.OpinionSiteLinear import OpinionSiteLinear
 
 
-class Site(OpinionSite):
+class Site(OpinionSiteLinear):
+    base_url = "https://www.ndcourts.gov/"
+    ordered_fields = [
+        "name",
+        "docket",
+        "date",
+        "nature_of_suit",
+        "judge",
+    ]
+
     def __init__(self, *args, **kwargs):
         super().__init__(*args, **kwargs)
         self.court_id = self.__module__
-        self.url = "https://www.ndcourts.gov/supreme-court/recent-opinions?pageSize=100"
-        self.cases = []
+        self.url = "https://www.ndcourts.gov/supreme-court/opinions?topic=&author=&searchQuery=&trialJudge=&pageSize=100&sortOrder=1"
+        self.status = "Published"
 
-    def _process_html(self):
-        for row in self.html.xpath('//table//div[@class="row"]'):
-            case = self.case_fields_extract(row)
-            self.case_fields_validate(case)
-            case = self.case_fields_sanitize(case)
-            self.cases.append(case)
-
-    def case_fields_extract(self, row):
-        text_lines = row.xpath("./div/p[1]/text()")
-        text_lines = [
-            l.strip() for l in text_lines if l.strip()
-        ]  # Remove empty lines
-        line_count = len(text_lines)
-        return {
-            "citation": text_lines[0],
-            "docket": text_lines[1],
-            "date": text_lines[2],
-            "name": row.xpath(".//a")[0].text_content().strip(),
-            "nature": text_lines[3],
-            "judge": text_lines[4],
-            "summary": " ".join(text_lines[5:line_count])
-            if line_count > 5
-            else "",
-            "url": row.xpath(".//button/@onclick")[0].split("'")[1],
-        }
-
-    def case_fields_validate(self, case):
-        if "ND" not in case["citation"]:
-            raise InsanityException(f"Invalid citation: {case['citation']}")
-        if not case["docket"].startswith("Docket No.:"):
-            raise InsanityException(
-                f"Invalid docket raw string: {case['docket']}"
-            )
-        if not case["date"].startswith("Filing Date:"):
-            raise InsanityException(
-                f"Invalid date string raw string: {case['date']}"
-            )
-        if not case["nature"].startswith("Case Type:"):
-            raise InsanityException(
-                f"Invalid type raw string: {case['nature']}"
-            )
-        if not case["judge"].startswith("Author:"):
-            raise InsanityException(
-                f"Invalid author raw string: {case['judge']}"
-            )
-
-    def case_fields_sanitize(self, case):
-        for field in ["date", "docket", "judge", "nature"]:
-            case[field] = case[field].split(":", 1)[1].strip()
-        return case
-
-    def _get_download_urls(self):
-        return [case["url"] for case in self.cases]
-
-    def _get_case_names(self):
-        return [case["name"] for case in self.cases]
-
-    def _get_case_dates(self):
-        return [convert_date_string(case["date"]) for case in self.cases]
-
-    def _get_docket_numbers(self):
-        return [case["docket"] for case in self.cases]
+    def _process_html(self) -> None:
+        """Most values are inside a <p>: whitespace and
+        field names need to be cleaned
+
+        Citation used to be available, now must be got from inside
+        the document's text
+        """
+        for row in self.html.xpath('//table//div[@class="row"]'):
+            raw_values = list(map(str.strip, row.xpath("./div/p[1]/text()")))
+            values = []
+
+            for idx, txt in enumerate(raw_values[:5]):
+                if idx == 0:
+                    txt, extra_docket = self.clean_name(txt)
+                else:
+                    txt = txt.split(":", 1)[1].strip()
+                values.append(txt)
+
+            summary = (
+                " ".join(raw_values[5:]).strip() if len(raw_values) > 5 else ""
+            )
+            url = urljoin(
+                self.base_url,
+                row.xpath(".//button[@onclick]/@onclick")[0].split("'")[1],
+            )
+            case = dict(zip(self.ordered_fields, values[:5]))
+            case["summary"] = summary
+            case["url"] = url
+
+            # There is a per_curiam field on the CL Opinion model,
+            # but we don't process it if sent by the scraper
+            if "Per Curiam" in case["judge"]:
+                case["judge"] = ""
+
+            self.cases.append(case)
 
     def _get_nature_of_suit(self):
-        return [case["nature"] for case in self.cases]
-
-    def _get_citations(self):
-        return [case["citation"] for case in self.cases]
-
-    def _get_judges(self):
-        return [case["judge"] for case in self.cases]
-
-    def _get_precedential_statuses(self):
-        return ["Published"] * len(self.cases)
-
-    def _get_summaries(self):
-        return [case["summary"] for case in self.cases]
+        return [case["nature_of_suit"] for case in self.cases]
+
+    def clean_name(self, name: str) -> Tuple[str, str]:
+        """Cleans case name
+
+        Some case names list the consolidated docket or a
+        (CONFIDENTIAL) parentheses
+
+        :param name: raw case name
+        :return: cleaned name and extra_docket numbers
+        """
+        other_dockets = ""
+        if "(consolidated w/" in name:
+            other_dockets = ",".join(re.findall(r"\d{8}", name))
+            name = name.split("(consolidated w/")[0]
+        if "(CONFIDENTIAL" in name:
+            name = name.split("(CONFIDENTIAL")[0]
+
+        return name.strip(), other_dockets
+
+    def extract_from_text(self, scraped_text: str) -> dict:
+        """Extract Citation from text
+
+        :param scraped_text: Text of scraped content
+        :return: date filed
+        """
+        regex = r"(?P<vol>20\d{2})\sND\s(?P<page>\d+)"
+        match = re.search(regex, scraped_text[:1000])
+
+        if match:
+            return {
+                "Citation": {
+                    "volume": match.group("vol"),
+                    "reporter": "ND",
+                    "page": match.group("page"),
+                    "type": 8,  # NEUTRAL in courtlistener Citation model
+                },
+            }
+        return {}
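In the rewritten North Dakota scraper, `clean_name` strips trailing `(consolidated w/ ...)` and `(CONFIDENTIAL ...)` parentheticals from case names and collects any eight-digit consolidated docket numbers. The same logic inlined as a standalone check, run on invented sample names:

```python
import re
from typing import Tuple


def clean_name(name: str) -> Tuple[str, str]:
    # Same logic as Site.clean_name in the diff above
    other_dockets = ""
    if "(consolidated w/" in name:
        other_dockets = ",".join(re.findall(r"\d{8}", name))
        name = name.split("(consolidated w/")[0]
    if "(CONFIDENTIAL" in name:
        name = name.split("(CONFIDENTIAL")[0]
    return name.strip(), other_dockets


print(clean_name("Smith v. Jones (consolidated w/ 20230145 & 20230146)"))
# ('Smith v. Jones', '20230145,20230146')
print(clean_name("Interest of J.D. (CONFIDENTIAL)"))
# ('Interest of J.D.', '')
```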
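Since the listing page no longer shows the neutral citation, `extract_from_text` recovers it from the first 1,000 characters of the downloaded opinion with the pattern `(?P<vol>20\d{2})\sND\s(?P<page>\d+)`. A quick demonstration on a fabricated opinion header:

```python
import re

regex = r"(?P<vol>20\d{2})\sND\s(?P<page>\d+)"
# Invented opinion header; only the "2024 ND 89" token is load-bearing
sample = "IN THE SUPREME COURT\nSTATE OF NORTH DAKOTA\n2024 ND 89\nState v. Doe"

match = re.search(regex, sample[:1000])
if match:
    print(match.group("vol"), match.group("page"))  # prints: 2024 89
```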