Skip to content

Commit

Permalink
feat(mass, massappct): backscraper for masscases.com
Browse files Browse the repository at this point in the history
Helps solve freelawproject#984
  • Loading branch information
grossir committed Apr 16, 2024
1 parent 40b1da5 commit 2b4ef07
Show file tree
Hide file tree
Showing 3 changed files with 129 additions and 4 deletions.
118 changes: 114 additions & 4 deletions juriscraper/opinions/united_states/state/mass.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,17 +17,42 @@

import datetime
import re
from datetime import datetime
from datetime import date, datetime
from typing import Any, Dict, Tuple

from dateutil import parser

from juriscraper.OpinionSiteLinear import OpinionSiteLinear


class Site(OpinionSiteLinear):
# This mapper is incomplete
backscrape_date_range_mapper = [
{
"start": datetime(2016, 7, 25),
"end": None,
"url": "http://masscases.com/475-499.html",
},
{
"start": datetime(2007, 10, 25),
"end": datetime(2016, 5, 26),
"url": "http://masscases.com/450-474.html",
},
{
"start": datetime(1931, 2, 25),
"end": datetime(1938, 3, 8),
"url": "http://masscases.com/275-299.html",
},
]
first_opinion_date = datetime(1931, 2, 26)
court_identifier = "SJC"

def __init__(self, *args, **kwargs):
    """Initialize the scraper pointed at the current-opinions page.

    Note: ``court_identifier`` is already declared as a class attribute
    ("SJC"), so the redundant instance-level re-assignment was removed;
    subclasses (e.g. massappct) still override it after calling super().
    """
    super().__init__(*args, **kwargs)
    self.url = "https://www.mass.gov/service-details/new-opinions"
    self.court_id = self.__module__
    # Toggled on by `_download_backwards`; gates `extract_from_text`
    self.is_backscrape = False
    self.make_backscrape_iterable(kwargs)

def _process_html(self):
for file in self.html.xpath(".//a/@href[contains(.,'pdf')]/.."):
Expand All @@ -41,13 +66,98 @@ def _process_html(self):
url = file.get("href")
parts = url.split("/")[-4:-1]
parts = [int(d) for d in parts]
date = datetime(year=parts[0], month=parts[1], day=parts[2]).date()
date = datetime(parts[0], parts[1], parts[2])
self.cases.append(
{
"name": name,
"status": "Published",
"date": str(date),
"date": date.strftime("%m/%d/%Y"),
"docket": docket,
"url": url,
}
)

def _process_masscases(self) -> None:
    """Parse a masscases.com reports page into case dictionaries.

    Only rows whose filing date falls inside the backscrape window
    (``self.start_date`` .. ``self.end_date``) are kept.
    :return None
    """
    for row in self.html.xpath("//tr[td/a]"):
        _, raw_date, case_name = row.xpath("td/text()")
        filed = parser.parse(raw_date)
        # Skip rows outside the requested date window
        if not self.start_date or not (
            self.start_date <= filed <= self.end_date
        ):
            continue
        citation = row.xpath(".//a/text()")[0]
        href = row.xpath(".//a/@href")[0]
        self.cases.append(
            {
                "citation": citation,
                "date": raw_date,
                "name": case_name,
                "url": href,
                "docket": "",
                "status": "Published",
            }
        )

def extract_from_text(self, scraped_text: str) -> Dict[str, Any]:
"""Extracts docket number from downloaded opinion HTML
Possible on SJC. since `Records And Briefs` section cites
docket entries and each is labeled with the docket number
Even when that sections does not exist, the docket number is available.
For example: http://masscases.com/cases/sjc/493/493mass1019.html
The format on App Ct opinions is different
"""
if not self.is_backscrape:
return {}
match = re.search(rf"{self.court_identifier}-\d+", scraped_text[:2000])
docket = match.group(0) if match else ""
return {"Docket": {"docket_number": docket}}

def _download_backwards(
    self, dates_and_url: Tuple[date, date, str]
) -> None:
    """Point the scraper at a masscases.com archive page and parse it.

    :param dates_and_url: (start date, end date, archive URL); the dates
        bound which opinions `_process_masscases` keeps
    :return None
    """
    start, end, url = dates_and_url
    self.is_backscrape = True
    self.start_date = start
    self.end_date = end
    self.url = url
    self.html = self._download()
    self._process_masscases()

def make_backscrape_iterable(self, kwargs: dict) -> None:
"""Checks if backscrape start and end arguments have been passed
by caller, and parses them accordingly
:param kwargs: passed when initializing the scraper, may or
may not contain backscrape controlling arguments
:return None
"""
start = kwargs.get("backscrape_start")
end = kwargs.get("backscrape_end")
now = datetime.now()

if start:
start = datetime.strptime(start, "%m/%d/%Y")
else:
start = self.first_opinion_date
if end:
end = datetime.strptime(end, "%m/%d/%Y")
else:
end = now

assert start < end, "Incorrect backscraper start / end inputs"

# If there is overlap between backscraping range and reports'
# page range append the reports' page URL to the iterable
self.back_scrape_iterable = []
for item in self.backscrape_date_range_mapper:
if max(item["start"], start) < min(item["end"] or now, end):
self.back_scrape_iterable.append((start, end, item["url"]))
13 changes: 13 additions & 0 deletions juriscraper/opinions/united_states/state/massappct.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,10 +11,23 @@
"""

from juriscraper.opinions.united_states.state import mass
from datetime import datetime


class Site(mass.Site):
    """Scraper/backscraper for the Massachusetts Appeals Court.

    Inherits all parsing logic from `mass.Site`; only the archive page
    mapping, the court identifier, and docket extraction differ.
    """

    # Maps the masscases.com Appeals Court reporter page to the date
    # range it covers; "end": None means the page is open-ended
    backscrape_date_range_mapper = [
        {
            "start": datetime(2021, 7, 9),
            "end": None,
            "url": "http://masscases.com/app100-124.html",
        }
    ]

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.court_id = self.__module__
        # Overrides the "SJC" identifier set by the parent class
        self.court_identifier = "AC"

    def extract_from_text(self, scraped_text: str) -> dict:
        """See `mass.Site` docstring

        Returns no metadata: per the parent docstring, the docket-number
        format on Appeals Court opinions differs from the SJC's.
        """
        return {}
2 changes: 2 additions & 0 deletions tests/local/test_ScraperExtractFromTextTest.py
Original file line number Diff line number Diff line change
Expand Up @@ -483,6 +483,8 @@ class ScraperExtractFromText(unittest.TestCase):
},
),
],
"juriscraper.opinions.united_states.state.mass": [("""""", {})],
"juriscraper.opinions.united_states.state.massappct": [("""""", {})]
}

def test_extract_from_text(self):
Expand Down

0 comments on commit 2b4ef07

Please sign in to comment.