Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

feat(coloctapp): dynamic backscraper #1011

Open
wants to merge 3 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
219 changes: 131 additions & 88 deletions juriscraper/opinions/united_states/state/colo.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,112 +9,155 @@
- 2023-01-05: Updated by WEP
- 2023-11-19: Drop Selenium by WEP
- 2023-12-20: Updated with citations, judges and summaries, Palin
- 2024-07-04: Update to new site, grossir
"""

import datetime
import re
from datetime import date, timedelta
from typing import Any, Dict
from datetime import date, datetime
from typing import Any, Dict, Tuple
from urllib.parse import urlencode

from dateutil import parser
from lxml import html

from juriscraper.AbstractSite import logger
from juriscraper.lib.date_utils import make_date_range_tuples
from juriscraper.OpinionSiteLinear import OpinionSiteLinear


class Site(OpinionSiteLinear):
base_url = "https://research.coloradojudicial.gov/search.json"
detail_url = "https://research.coloradojudicial.gov/vid/{}.json?include=abstract%2Cparent%2Cmeta%2Cformats%2Cchildren%2Cproperties_with_ids%2Clibrary%2Csource&fat=1&locale=en&hide_ct6=true&t={}"
days_interval = 30
first_opinion_date = datetime(2010, 1, 1)
api_court_code = "14024_01"

def __init__(self, *args, **kwargs):
super().__init__(*args, **kwargs)
self.court_id = self.__module__
self.status = "Published"
self.url = "https://www.courts.state.co.us/Courts/Supreme_Court/Proceedings/Index.cfm"

def _process_html(self):
""""""
for row in self.html.xpath("//div[@id='Dispositions']/a"):
case_id = row.attrib["onclick"].split("'")[1]
div = row.xpath(f"//div[@id='Case_{case_id}']")[0]
if not div.xpath(".//a/text()"):
# This is set to avoid data decades back
continue
document_type = div.xpath(".//a/text()")[0]
if "Opinion" not in document_type:
# Only collect opinions and not orders
continue
summary = (
row.xpath(f"//div[@id='Case_{case_id}']")[0]
.text_content()
.strip()
)
url = div.xpath(".//a/@href")[0]
title = row.xpath("following-sibling::text()")[0].strip()
docket, name = title.split(" ", 1)
name, _ = name.split("  ")
if "(Honorable" in name:
name, judge = name.split("(")
name = name.strip()
judges = judge.strip(")").strip().replace("Honorable", "")
self.params = {
"product_id": "WW",
"jurisdiction": "US",
"content_type": "2",
"court": self.api_court_code,
"bypass_rabl": "true",
"include": "parent,abstract,snippet,properties_with_ids",
"per_page": "30", # Server breaks down when per_page=500, returns 503
"page": "1",
"sort": "date",
"include_local_exclusive": "true",
"cbm": "6.0|361.0|5.0|9.0|4.0|2.0=0.01|400.0|1.0|0.001|1.5|0.2",
"locale": "en",
"hide_ct6": "true",
"t": str(datetime.now().timestamp())[:10],
"type": "document",
}
self.url = f"{self.base_url}?{urlencode(self.params)}"

# Request won't work without some of these X- headers
self.request["headers"].update(
{
"X-Requested-With": "XMLHttpRequest",
"X-Root-Account-Email": "colorado@vlex.com",
"X-User-Email": "colorado@vlex.com",
"X-Webapp-Seed": "9887408",
}
)
self.make_backscrape_iterable(kwargs)

def _process_html(self) -> None:
search_json = self.html
logger.info(
"Number of results %s; %s in page",
search_json["count"],
len(search_json["results"]),
)

for result in search_json["results"]:
timestamp = str(datetime.now().timestamp())[:10]
url = self.detail_url.format(result["id"], timestamp)

if self.test_mode_enabled():
# we have manually nested the detail JSONs inside each
# search result to be able to have a test file
detail_json = result["detail_json"]
else:
judges = ""
date_filed = self.find_date(summary)
if parser.parse(date_filed).date() < self.set_min_date():
# Only collect back 6 months
break
if "https://www.courts.state.co.us/" not in url:
url = f"https://www.courts.state.co.us/{url}"
self.cases.append(
{
"summary": summary,
"date": date_filed,
"name": name,
"docket": docket.strip(","),
"url": url,
"judge": judges,
}
)

def set_min_date(self):
"""Set minimum date to add opinions

:return: Date 6 months back
"""
if self.test_mode_enabled():
today = datetime.date(2023, 11, 19)
return today - timedelta(180)
else:
return date.today() - timedelta(180)
# Full case name and docket number are only available
# on the detail page
self._request_url_get(url)
detail_json = self.request["response"].json()

def find_date(self, summary) -> str:
"""Find date filed
# Example of parallel citation:
# https://research.coloradojudicial.gov/vid/907372624
citation, parallel_citation = "", ""
for p in detail_json["properties"]:
label = p["property"]["label"]
if label == "Docket Number":
docket_number = p["values"][0]
if label == "Parties":
case_name_full = p["values"][0]
if label == "Decision Date":
# Note that json['published_at'] is not the date_filed
date_filed = p["values"][0]
if label == "Citation":
citation = p["values"][0]
if len(p["values"]) > 1:
parallel_citation = p["values"][1]

Normally it follows a typical pattern but not always
case = {
"date": date_filed,
"docket": docket_number,
"name": case_name_full,
"url": f"{detail_json['public_url']}/content",
"status": "Published" if citation else "Unknown",
"citation": citation,
"parallel_citation": parallel_citation,
}

self.cases.append(case)

def _download_backwards(self, dates: Tuple[date]) -> None:
"""Make custom date range request

:param summary: Use summary text to find date filed
:return: date as string
:param dates: (start_date, end_date) tuple
:return None
"""
if "Opinion issued" in summary:
return summary.split("Opinion issued")[1]
date_pattern = re.compile(
r"((January|February|March|April|May|June|July|August|September|October|November|December)\s+(\d{1,2})\s?,?\s+(\d{4}))"
start = dates[0].strftime("%Y-%m-%d")
end = dates[1].strftime("%Y-%m-%d")
timestamp = str(datetime.now().timestamp())[:10]
params = {**self.params}
params.update(
{
"date": f"{start}..{end}",
# These are duplicated by the frontend too
"locale": ["en", "en"],
"hide_ct6": ["true", "true"],
"t": [timestamp, timestamp],
}
)
match = re.findall(date_pattern, summary)
date_filed = match[-1][0] if match else ""
return date_filed
self.url = f"{self.base_url}?{urlencode(params)}"
self.html = self._download()
self._process_html()

def extract_from_text(self, scraped_text: str) -> Dict[str, Any]:
"""Extract Citation from text
def make_backscrape_iterable(self, kwargs: dict) -> None:
"""Checks if backscrape start and end arguments have been passed
by caller, and parses them accordingly

:param scraped_text: Text of scraped content
:return: date filed
:param kwargs: passed when initializing the scraper, may or
may not contain backscrape controlling arguments
:return None
"""
m = re.findall(r"(20\d{2})\s(CO)\s(\d+A?)", scraped_text)
if m:
vol, reporter, page = m[0]
return {
"Citation": {
"volume": vol,
"reporter": reporter,
"page": page,
"type": 8, # NEUTRAL in courtlistener Citation model
},
}
return {}
start = kwargs.get("backscrape_start")
end = kwargs.get("backscrape_end")

if start:
start = datetime.strptime(start, "%m/%d/%Y")
else:
start = self.first_opinion_date
if end:
end = datetime.strptime(end, "%m/%d/%Y")
else:
end = datetime.now()

self.back_scrape_iterable = make_date_range_tuples(
start, end, self.days_interval
)
56 changes: 4 additions & 52 deletions juriscraper/opinions/united_states/state/coloctapp.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,57 +9,9 @@
- 2023-11-19: Updated by William E. Palin
"""

import datetime
import re
from juriscraper.opinions.united_states.state import colo

from juriscraper.OpinionSiteLinear import OpinionSiteLinear


class Site(OpinionSiteLinear):
def __init__(self, *args, **kwargs):
super().__init__(*args, **kwargs)
self.court_id = self.__module__
self.year = datetime.date.today().year
self.url = "https://www.courts.state.co.us/Courts/Court_of_Appeals/Case_Announcements/Index.cfm"
self.status = None

def _process_html(self) -> None:
"""Parses html into case dictionaries

:return None
"""
if self.test_mode_enabled():
self.year = "2023"

date_xpath = (
"//span[text()='Future Case Announcements']/following-sibling::p"
)
date = self.html.xpath(date_xpath)[0].text_content()

for row in self.html.xpath("//p"):
modified_string = re.sub(r"\s", "", row.text_content())
if "PUBLISHED" == modified_string[:9]:
self.status = "Published"
continue
if "UNPUBLISHED" == modified_string[:11]:
self.status = None
continue
if not self.status:
continue

pattern = re.compile(r"\b[0-9A-Z& ]{5,}\b")
matches = re.findall(pattern, row.text_content())
if not matches:
continue

docket = matches[0].strip()
name = row.text_content().replace(docket, "").strip()
self.cases.append(
{
"name": name,
"docket": docket,
"date": date,
"status": self.status,
"url": f"https://www.courts.state.co.us/Courts/Court_of_Appeals/Opinion/{self.year}/{docket}-PD.pdf",
}
)
class Site(colo.Site):
    """Colorado Court of Appeals opinions scraper.

    All of the scraping logic is inherited from ``colo.Site``; this
    subclass only overrides the class-level settings below.
    """

    # Width, in days, of each date window handed to
    # `make_date_range_tuples` when building the backscrape iterable.
    # The parent class uses 30.
    # NOTE(review): presumably narrower because this court returns more
    # results per window - confirm.
    days_interval = 15

    # Value sent as the `court` search parameter; the parent class
    # uses "14024_01".
    api_court_code = "14024_02"
74 changes: 13 additions & 61 deletions tests/examples/opinions/united_states/colo_example.compare.json
Original file line number Diff line number Diff line change
@@ -1,74 +1,26 @@
[
{
"case_dates": "2023-11-14",
"case_names": "in Re People in the Interest of Minor Children J.P.",
"download_urls": "https://www.courts.state.co.us//userfiles/file/Court_Probation/Supreme_Court/Opinions/2023/23SA126.pdf",
"case_dates": "2024-07-01",
"case_names": "Ricardo Castro v. The People of the State of Colorado",
"download_urls": "https://colorado.vlex.io/vid/castro-v-people-1042407550/content",
"precedential_statuses": "Published",
"blocked_statuses": false,
"date_filed_is_approximate": false,
"docket_numbers": "23SA126",
"judges": "Justin Haenlein",
"summaries": "The petitioner seeks relief from the trial court's order of April 7, 2023. On May 18, 2023, the Supreme Court issued a rule to show cause why the trial court did not err in ordering the petitioner to turn over potentially privileged documents for in camera review. The respondents are directed to file a written answer on or before June 8, 2023. The petitioner has 14 days from receipt of the answer to reply. Opinion issued November 14, 2023",
"docket_numbers": "22SC712",
"citations": "2024 CO 56",
"parallel_citations": "",
"case_name_shorts": ""
},
{
"case_dates": "2023-10-23",
"case_names": "In Re: People v. Walthour, Ashleigh",
"download_urls": "https://www.courts.state.co.us//userfiles/file/Court_Probation/Supreme_Court/Opinions/2023/23SA125.pdf",
"precedential_statuses": "Published",
"blocked_statuses": false,
"date_filed_is_approximate": false,
"docket_numbers": "23SA125",
"judges": "Joshua Williford",
"summaries": "The petitioner seeks relief from the trial court's order of March 7, 2023. On May 12, 2023, the Supreme Court issued a rule to show cause why the trial court did not err in suppressing the results of an impending blood test because it hadn't been completed by the court's prior deadline. The respondents are directed to file a written answer on or before June 9, 2023. The petitioner has 21 days from receipt of the answer to reply. Opinion issued October 23, 2023",
"case_name_shorts": ""
},
{
"case_dates": "2023-10-16",
"case_names": "In Re People v. Seymour, Gavin",
"download_urls": "https://www.courts.state.co.us//userfiles/file/Court_Probation/Supreme_Court/Opinions/2023/23SA12.pdf",
"precedential_statuses": "Published",
"blocked_statuses": false,
"date_filed_is_approximate": false,
"docket_numbers": "23SA12",
"judges": "Martin Egelhoff",
"summaries": "The petitioner seeks relief from the trial court's Order of November 16, 2022. On January 17, 2023, the Supreme Court issued a rule to show cause why the trial court did not err in denying the defendant's motion to suppress. The respondents are directed to file a written Answer on or before February 14, 2023. The petitioner has 21 days from receipt of the Answer within which to Reply. Opinion issued October 16, 2023",
"case_name_shorts": ""
},
{
"case_dates": "2023-09-25",
"case_names": "In Re Edwards, Tana v. New Century Hospice",
"download_urls": "https://www.courts.state.co.us//userfiles/file/Court_Probation/Supreme_Court/Opinions/2023/23SA91.pdf",
"precedential_statuses": "Published",
"blocked_statuses": false,
"date_filed_is_approximate": false,
"docket_numbers": "23SA91",
"judges": "Mark Bailey",
"summaries": "The petitioners seek relief from the trial court's order of December 19, 2022. On April 3, 2023, the Supreme Court issued a rule to show cause why the trial court did not err in denying the defendants' motion for summary judgment on the plaintiff's claim for negligent supervision. The respondents are directed to file a written answer on or before May 1, 2023. The petitioners have 21 days from receipt of the answer within which to reply. Opinion issued September 25, 2023",
"case_name_shorts": ""
},
{
"case_dates": "2023-06-20",
"case_names": "In Re Smith, Jerrelle v. People",
"download_urls": "https://www.courts.state.co.us//userfiles/file/Court_Probation/Supreme_Court/Opinions/2023/23SA2.pdf",
"precedential_statuses": "Published",
"blocked_statuses": false,
"date_filed_is_approximate": false,
"docket_numbers": "23SA2",
"judges": "Robert Kiesnowski",
"summaries": "On January 6, 2023, the Supreme Court issued a rule to show cause why the trial court did not err in denying bond to the defendant. The respondents are directed to file a written answer on or before February 3, 2023. The petitioner has 21 days from receipt of the answer within which to reply. Opinion issued 6/20/23",
"case_name_shorts": ""
},
{
"case_dates": "2023-06-05",
"case_names": "In Re People v. Kelley, Noelle",
"download_urls": "https://www.courts.state.co.us//userfiles/file/Court_Probation/Supreme_Court/Opinions/2022/22SA874.pdf",
"precedential_statuses": "Published",
"case_dates": "2024-07-01",
"case_names": "In re the Marriage of Elyssa M. Fox, and Alexander L. Speaker",
"download_urls": "https://colorado.vlex.io/vid/in-re-marriage-of-1042461692/content",
"precedential_statuses": "Unknown",
"blocked_statuses": false,
"date_filed_is_approximate": false,
"docket_numbers": "22SA874",
"judges": "Eric White",
"summaries": "The petitioner seeks relief from the trial court's order of November 23, 2022. On December 5, 2022, the Supreme Court issued a rule to show cause why the trial court did not err in (1) permitting the prosecution to present evidence that the defendant refused to waive her privilege, and (2) ruling that the defendant waived her privilege. The respondents are directed to file a written answer on or before January 3, 2023. The petitioner has 21 days from receipt of the answer within which to reply.t Opinion issued on June 5, 2023",
"docket_numbers": "24SC76",
"citations": "",
"parallel_citations": "",
"case_name_shorts": ""
}
]
Loading
Loading