Merge branch 'main' into fix_dcd
quevon24 committed Jun 11, 2024
2 parents 0834825 + 9574b0c commit cfe404a
Showing 19 changed files with 5,608 additions and 5,704 deletions.
2 changes: 1 addition & 1 deletion .github/workflows/tests.yml
@@ -18,7 +18,7 @@ jobs:
python-version: ${{ matrix.python-version }}
- name: Install dependencies
run: |
- python -m pip install --upgrade pip
+ python -m pip install --upgrade pip setuptools
pip install wheel
pip install -r requirements.txt
pip install -r requirements-dev.txt
183 changes: 78 additions & 105 deletions juriscraper/opinions/united_states/federal_appellate/ca1.py
@@ -1,124 +1,97 @@
-import re
-from datetime import date, datetime, timedelta
+from datetime import date, datetime
+from typing import Tuple
 from urllib.parse import urlencode
 
-from dateutil.rrule import DAILY, rrule
-
-from juriscraper.OpinionSite import OpinionSite
+from juriscraper.AbstractSite import logger
+from juriscraper.lib.date_utils import make_date_range_tuples
+from juriscraper.OpinionSiteLinear import OpinionSiteLinear
 
 
-class Site(OpinionSite):
+class Site(OpinionSiteLinear):
+    # This URL will show most recent opinions
+    base_url = "https://www.ca1.uscourts.gov/opn/aci"
+    days_interval = 5
+    first_opinion_date = datetime(2003, 3, 23)
+
     def __init__(self, *args, **kwargs):
         super().__init__(*args, **kwargs)
-        self.base_url = "http://media.ca1.uscourts.gov/cgi-bin/opinions.pl"
         self.court_id = self.__module__
-        today = date.today()
-        params = urlencode(
-            {
-                "FROMDATE": (today - timedelta(7)).strftime("%m/%d/%Y"),
-                "TODATE": today.strftime("%m/%d/%Y"),
-                "puid": "",
-            }
-        )
-        self.url = f"{self.base_url}/?{params}"
-        # self.url = "http://media.ca1.uscourts.gov/cgi-bin/opinions.pl/?TODATE=06%2F24%2F1993&puid=&FROMDATE=05%2F25%2F1993"
-        self.interval = 30
-        self.back_scrape_iterable = [
-            i.date()
-            for i in rrule(
-                DAILY,
-                interval=self.interval,
-                dtstart=date(1992, 1, 1),
-                until=date(2016, 1, 1),
-            )
-        ]
+        self.url = self.base_url
+        self.make_backscrape_iterable(kwargs)
 
-    def _get_case_names(self):
-        return [
-            e.strip()
-            for e in self.html.xpath(
-                "//tr[position() > 1]/td[4]/text()[contains(., 'v.')]"
-            )
-        ]
-
-    def _get_download_urls(self):
-        return [
-            e for e in self.html.xpath("//tr[position() > 1]/td[2]//@href")
-        ]
-
-    def _get_case_dates(self):
-        dates = []
-        for s in self.html.xpath("//tr[position() > 1]/td[1]//text()"):
-            s = s.replace(r"\t", "").replace(r"\n", "").strip()
-            if s == "1996/05/32":
-                s = "1996/05/30"  # My life is thus lain to waste.
-            dates.append(datetime.strptime(s.strip(), "%Y/%m/%d").date())
-        return dates
-
-    def _get_docket_numbers(self):
-        regex = re.compile(r"(\d{2}-.*?\W)(.*)$")
-        docket_numbers = []
-        for s in self.html.xpath("//tr[position() > 1]/td[2]/a/text()"):
-            s = s.replace("O1-", "01-")  # I grow older, the input grows worse.
-            docket_numbers.append(
-                regex.search(s).group(1).strip().replace(".", "")
-            )
-        return docket_numbers
-
-    def _get_precedential_statuses(self):
-        statuses = []
-        for text in self.html.xpath("//tr[position() > 1]/td[2]//@href"):
-            if "U" in text:
-                statuses.append("Unpublished")
-            elif "P" in text:
-                statuses.append("Published")
-            elif "E" in text:
-                statuses.append("Errata")
-            else:
-                statuses.append("Unknown")
-        return statuses
-
-    def _get_lower_courts(self):
-        lower_courts = []
-        for e in self.html.xpath("//tr[position() > 1]/td[4]/font"):
-            try:
-                lower_courts.append(e.xpath("./text()")[0].strip())
-            except IndexError:
-                lower_courts.append("")
-        return lower_courts
+    def _process_html(self):
+        for row in self.html.xpath("//tr[not(th)]"):
+            title = row.xpath("td[2]/a/text()")[0]
+            url = row.xpath("td[2]/a/@href")[0]
+            status = self.get_status_from_opinion_title(title)
+            docket = row.xpath("td[3]/a/text()")[0]
+            date_filed = row.xpath("td[1]/span/text()")[0]
+            name = row.xpath("td[4]/text()")[0]
+            lower_court = row.xpath("td[4]/span/text()")[0]
+            self.cases.append(
+                {
+                    "name": name.strip(),
+                    "url": url,
+                    "date": date_filed,
+                    "status": status,
+                    "docket": docket,
+                    "lower_court": lower_court,
+                }
+            )
+
+    def get_status_from_opinion_title(self, title: str) -> str:
+        """Status is encoded in opinion's link title
+
+        :param title: opinion title. Ex: 23-1667P.01A, 23-1639U.01A
+        :return: status string
+        """
+        if "U" in title:
+            status = "Unpublished"
+        elif "P" in title:
+            status = "Published"
+        elif "E" in title:
+            status = "Errata"
+        else:
+            status = "Unknown"
+        return status
 
-    def _download_backwards(self, d):
-        params = urlencode(
-            {
-                "FROMDATE": d.strftime("%m/%d/%Y"),
-                "TODATE": (d + timedelta(self.interval)).strftime("%m/%d/%Y"),
-                "puid": "",
-            }
-        )
-        self.url = f"{self.base_url}/?{params}"
+    def _download_backwards(self, dates: Tuple[date]) -> None:
+        """Change URL to backscraping date range
+
+        :param dates: tuple with date range to scrape
+        :return None
+        """
+        start, end = dates
+        params = {
+            "field_opn_csno_value_op": "starts",
+            "field_opn_issdate_value[min][date]": start.strftime("%m/%d/%Y"),
+            "field_opn_issdate_value[max][date]": end.strftime("%m/%d/%Y"),
+        }
+        self.url = f"{self.base_url}?{urlencode(params)}"
         self.html = self._download()
-        if self.html is not None:
-            # Setting status is important because it prevents the download
-            # function from being run a second time by the parse method.
-            self.status = 200
+        self._process_html()
 
-    def _post_parse(self):
-        """This will remove the cases without a case name"""
-        to_be_removed = [
-            index
-            for index, case_name in enumerate(self.case_names)
-            if not case_name.replace("v.", "").strip()
-        ]
-
-        for attr in self._all_attrs:
-            item = getattr(self, attr)
-            if item is not None:
-                new_item = self.remove_elements(item, to_be_removed)
-                self.__setattr__(attr, new_item)
-
-    @staticmethod
-    def remove_elements(list_, indexes_to_be_removed):
-        return [
-            i for j, i in enumerate(list_) if j not in indexes_to_be_removed
-        ]
+    def make_backscrape_iterable(self, kwargs: dict) -> None:
+        """Checks if backscrape start and end arguments have been passed
+        by caller, and parses them accordingly
+
+        :param kwargs: passed when initializing the scraper, may or
+        may not contain backscrape controlling arguments
+        :return None
+        """
+        start = kwargs.get("backscrape_start")
+        end = kwargs.get("backscrape_end")
+
+        if start:
+            start = datetime.strptime(start, "%m/%d/%Y")
+        else:
+            start = self.first_opinion_date
+        if end:
+            end = datetime.strptime(end, "%m/%d/%Y")
+        else:
+            end = datetime.now()
+
+        self.back_scrape_iterable = make_date_range_tuples(
+            start, end, self.days_interval
+        )
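
For reference, a minimal standalone sketch (not part of the commit) of the title-to-status mapping that the new get_status_from_opinion_title helper applies; the sample link titles are the ones cited in its docstring, and status_from_title is a hypothetical stand-in name used only for this illustration:

# Illustrative sketch only: mirrors the if/elif chain added in ca1.py above.
def status_from_title(title: str) -> str:
    # The raw link title, e.g. "23-1667P.01A", carries the status letter
    if "U" in title:
        return "Unpublished"
    elif "P" in title:
        return "Published"
    elif "E" in title:
        return "Errata"
    return "Unknown"


print(status_from_title("23-1667P.01A"))  # Published
print(status_from_title("23-1639U.01A"))  # Unpublished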
162 changes: 86 additions & 76 deletions juriscraper/opinions/united_states/state/nd.py
@@ -1,93 +1,103 @@
 # Author: Phil Ardery
 # Contact: https://www.ndcourts.gov/contact-us
 # Date created: 2019-02-28
-from juriscraper.lib.exceptions import InsanityException
-from juriscraper.lib.string_utils import convert_date_string
-from juriscraper.OpinionSite import OpinionSite
+# Updated: 2024-05-08, grossir: to OpinionSiteLinear and new URL
+import re
+from typing import Tuple
+from urllib.parse import urljoin
+
+from juriscraper.OpinionSiteLinear import OpinionSiteLinear
 
 
-class Site(OpinionSite):
+class Site(OpinionSiteLinear):
+    base_url = "https://www.ndcourts.gov/"
+    ordered_fields = [
+        "name",
+        "docket",
+        "date",
+        "nature_of_suit",
+        "judge",
+    ]
+
     def __init__(self, *args, **kwargs):
         super().__init__(*args, **kwargs)
         self.court_id = self.__module__
-        self.url = "https://www.ndcourts.gov/supreme-court/recent-opinions?pageSize=100"
-        self.cases = []
+        self.url = "https://www.ndcourts.gov/supreme-court/opinions?topic=&author=&searchQuery=&trialJudge=&pageSize=100&sortOrder=1"
+        self.status = "Published"
 
-    def _process_html(self):
-        for row in self.html.xpath('//table//div[@class="row"]'):
-            case = self.case_fields_extract(row)
-            self.case_fields_validate(case)
-            case = self.case_fields_sanitize(case)
-            self.cases.append(case)
-
-    def case_fields_extract(self, row):
-        text_lines = row.xpath("./div/p[1]/text()")
-        text_lines = [
-            l.strip() for l in text_lines if l.strip()
-        ]  # Remove empty lines
-        line_count = len(text_lines)
-        return {
-            "citation": text_lines[0],
-            "docket": text_lines[1],
-            "date": text_lines[2],
-            "name": row.xpath(".//a")[0].text_content().strip(),
-            "nature": text_lines[3],
-            "judge": text_lines[4],
-            "summary": " ".join(text_lines[5:line_count])
-            if line_count > 5
-            else "",
-            "url": row.xpath(".//button/@onclick")[0].split("'")[1],
-        }
-
-    def case_fields_validate(self, case):
-        if "ND" not in case["citation"]:
-            raise InsanityException(f"Invalid citation: {case['citation']}")
-        if not case["docket"].startswith("Docket No.:"):
-            raise InsanityException(
-                f"Invalid docket raw string: {case['docket']}"
-            )
-        if not case["date"].startswith("Filing Date:"):
-            raise InsanityException(
-                f"Invalid date string raw string: {case['date']}"
-            )
-        if not case["nature"].startswith("Case Type:"):
-            raise InsanityException(
-                f"Invalid type raw string: {case['nature']}"
-            )
-        if not case["judge"].startswith("Author:"):
-            raise InsanityException(
-                f"Invalid author raw string: {case['judge']}"
-            )
-
-    def case_fields_sanitize(self, case):
-        for field in ["date", "docket", "judge", "nature"]:
-            case[field] = case[field].split(":", 1)[1].strip()
-        return case
-
-    def _get_download_urls(self):
-        return [case["url"] for case in self.cases]
-
-    def _get_case_names(self):
-        return [case["name"] for case in self.cases]
-
-    def _get_case_dates(self):
-        return [convert_date_string(case["date"]) for case in self.cases]
-
-    def _get_docket_numbers(self):
-        return [case["docket"] for case in self.cases]
+    def _process_html(self) -> None:
+        """Most values are inside a <p>: whitespace and
+        field names need to be cleaned
+
+        Citation used to be available, now must be got from inside
+        the document's text
+        """
+        for row in self.html.xpath('//table//div[@class="row"]'):
+            raw_values = list(map(str.strip, row.xpath("./div/p[1]/text()")))
+            values = []
+
+            for idx, txt in enumerate(raw_values[:5]):
+                if idx == 0:
+                    txt, extra_docket = self.clean_name(txt)
+                else:
+                    txt = txt.split(":", 1)[1].strip()
+                values.append(txt)
+
+            summary = (
+                " ".join(raw_values[5:]).strip() if len(raw_values) > 5 else ""
+            )
+            url = urljoin(
+                self.base_url,
+                row.xpath(".//button[@onclick]/@onclick")[0].split("'")[1],
+            )
+            case = dict(zip(self.ordered_fields, values[:5]))
+            case["summary"] = summary
+            case["url"] = url
+
+            # There is a per_curiam field on the CL Opinion model,
+            # but we don't process it if sent by the scraper
+            if "Per Curiam" in case["judge"]:
+                case["judge"] = ""
+
+            self.cases.append(case)
 
     def _get_nature_of_suit(self):
-        return [case["nature"] for case in self.cases]
-
-    def _get_citations(self):
-        return [case["citation"] for case in self.cases]
-
-    def _get_judges(self):
-        return [case["judge"] for case in self.cases]
-
-    def _get_precedential_statuses(self):
-        return ["Published"] * len(self.cases)
-
-    def _get_summaries(self):
-        return [case["summary"] for case in self.cases]
+        return [case["nature_of_suit"] for case in self.cases]
+
+    def clean_name(self, name: str) -> Tuple[str, str]:
+        """Cleans case name
+
+        Some case names list the consolidated docket or a
+        (CONFIDENTIAL) parentheses
+
+        :param name: raw case name
+        :return: cleaned name and extra_docket numbers
+        """
+        other_dockets = ""
+        if "(consolidated w/" in name:
+            other_dockets = ",".join(re.findall(r"\d{8}", name))
+            name = name.split("(consolidated w/")[0]
+        if "(CONFIDENTIAL" in name:
+            name = name.split("(CONFIDENTIAL")[0]
+
+        return name.strip(), other_dockets
+
+    def extract_from_text(self, scraped_text: str) -> dict:
+        """Extract Citation from text
+
+        :param scraped_text: Text of scraped content
+        :return: date filed
+        """
+        regex = r"(?P<vol>20\d{2})\sND\s(?P<page>\d+)"
+        match = re.search(regex, scraped_text[:1000])
+
+        if match:
+            return {
+                "Citation": {
+                    "volume": match.group("vol"),
+                    "reporter": "ND",
+                    "page": match.group("page"),
+                    "type": 8,  # NEUTRAL in courtlistener Citation model
+                },
+            }
+        return {}
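
For reference, a minimal standalone sketch (not part of the commit) showing how the neutral-citation regex added in extract_from_text behaves; the sample opinion text below is hypothetical:

# Illustrative sketch only: the regex is the one added in nd.py above.
import re

sample_text = "Filed 06/11/2024\n2024 ND 112\nState of North Dakota v. Doe"
match = re.search(r"(?P<vol>20\d{2})\sND\s(?P<page>\d+)", sample_text[:1000])
if match:
    print(match.group("vol"), match.group("page"))  # 2024 112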
