Skip to content

Commit

Permalink
feat(mass, massappct): backscraper for masscases.com
Browse files Browse the repository at this point in the history
Helps solve freelawproject#984
  • Loading branch information
grossir committed Apr 17, 2024
1 parent 40b1da5 commit dbc78d1
Show file tree
Hide file tree
Showing 4 changed files with 168 additions and 3 deletions.
11 changes: 8 additions & 3 deletions juriscraper/opinions/united_states/state/mass.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,11 +23,16 @@


class Site(OpinionSiteLinear):
"""
Backscraper is implemented on `united_states_backscrapers.state.mass.py`
"""

court_identifier = "SJC"

def __init__(self, *args, **kwargs):
    """Point the scraper at the Mass.gov new-opinions page.

    :param args: forwarded to OpinionSiteLinear
    :param kwargs: forwarded to OpinionSiteLinear
    """
    super().__init__(*args, **kwargs)
    self.url = "https://www.mass.gov/service-details/new-opinions"
    self.court_id = self.__module__
    # NOTE: `court_identifier` is already declared as a class attribute
    # ("SJC"); the redundant instance-level re-assignment was removed

def _process_html(self):
for file in self.html.xpath(".//a/@href[contains(.,'pdf')]/.."):
Expand All @@ -41,12 +46,12 @@ def _process_html(self):
url = file.get("href")
parts = url.split("/")[-4:-1]
parts = [int(d) for d in parts]
date = datetime(year=parts[0], month=parts[1], day=parts[2]).date()
date = datetime(parts[0], parts[1], parts[2])
self.cases.append(
{
"name": name,
"status": "Published",
"date": str(date),
"date": date.strftime("%m/%d/%Y"),
"docket": docket,
"url": url,
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,8 @@
"idahoctapp_civil",
"idahoctapp_criminal",
"idahoctapp_u",
"mass",
"massappct",
"me_2013",
"nd",
"sd",
Expand Down
132 changes: 132 additions & 0 deletions juriscraper/opinions/united_states_backscrapers/state/mass.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,132 @@
import datetime
import re
from datetime import date, datetime
from typing import Any, Dict, Tuple

from dateutil import parser

from juriscraper.OpinionSiteLinear import OpinionSiteLinear


class Site(OpinionSiteLinear):
    """Backscraper for SJC opinions hosted on masscases.com

    The current (forward) scraper lives at `united_states.state.mass`;
    this class handles historical opinions, which masscases.com groups
    into pages by report volume.
    """

    first_opinion_date = datetime(1931, 2, 26)
    court_identifier = "SJC"
    # This mapper is incomplete: each entry records the filing-date range
    # covered by one masscases.com report-volume page. "end": None marks
    # an open-ended range that extends to the present
    backscrape_date_range_mapper = [
        {
            "start": datetime(2016, 7, 25),
            "end": None,
            "url": "http://masscases.com/475-499.html",
        },
        {
            "start": datetime(2007, 10, 25),
            "end": datetime(2016, 5, 26),
            "url": "http://masscases.com/450-474.html",
        },
        {
            "start": datetime(1997, 5, 15),
            "end": datetime(2007, 9, 28),
            "url": "http://masscases.com/425-449.html",
        },
        {
            "start": datetime(1987, 5, 14),
            "end": datetime(1997, 5, 12),
            "url": "http://masscases.com/400-424.html",
        },
        {
            "start": datetime(1931, 2, 25),
            "end": datetime(1938, 3, 8),
            "url": "http://masscases.com/275-299.html",
        },
    ]

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.court_id = self.__module__
        # Flipped to True by `_download_backwards`; gates docket
        # extraction in `extract_from_text`
        self.is_backscrape = False
        self.make_backscrape_iterable(kwargs)

    def _process_html(self) -> None:
        """Parse HTML into case dictionaries

        Keeps only rows whose filing date falls inside the backscrape
        window [self.start_date, self.end_date], which is set by
        `_download_backwards`

        :return None
        """
        for row in self.html.xpath("//tr[td/a]"):
            _, date_filed_str, name = row.xpath("td/text()")
            date_filed = parser.parse(date_filed_str)
            # chained comparison replaces the original split `and` clauses
            if self.start_date and (
                self.start_date <= date_filed <= self.end_date
            ):
                cite = row.xpath(".//a/text()")[0]
                url = row.xpath(".//a/@href")[0]
                self.cases.append(
                    {
                        "citation": cite,
                        "date": date_filed_str,
                        "name": name,
                        "url": url,
                        "docket": "",
                        "status": "Published",
                    }
                )

    def extract_from_text(self, scraped_text: str) -> Dict[str, Any]:
        """Extracts docket number from downloaded opinion HTML

        Possible on SJC, since the `Records And Briefs` section cites
        docket entries and each is labeled with the docket number.
        Even when that section does not exist, the docket number is
        available. For example:
        http://masscases.com/cases/sjc/493/493mass1019.html
        The format on App Ct opinions is different

        :param scraped_text: downloaded opinion HTML
        :return: dict holding the docket number; empty dict when
            not backscraping or when no docket is found
        """
        if not self.is_backscrape:
            return {}
        # docket numbers look like "SJC-12345"; they appear near the top
        # of the document, so only the first 2000 characters are scanned
        match = re.search(rf"{self.court_identifier}-\d+", scraped_text[:2000])
        docket = match.group(0) if match else ""
        return {"Docket": {"docket_number": docket}}

    def _download_backwards(
        self, dates_and_url: Tuple[date, date, str]
    ) -> None:
        """Set proper `masscases.com` url as self.url, and parse content

        :param dates_and_url: contains target url, and start and end date used
        for filtering opinions of interest by date
        :return None
        """
        self.is_backscrape = True
        self.start_date, self.end_date, self.url = dates_and_url
        self.html = self._download()
        self._process_html()

    def make_backscrape_iterable(self, kwargs: dict) -> None:
        """Checks if backscrape start and end arguments have been passed
        by caller, and parses them accordingly

        :param kwargs: passed when initializing the scraper, may or
        may not contain backscrape controlling arguments
        :raises ValueError: when the start date is not before the end date
        :return None
        """
        start = kwargs.get("backscrape_start")
        end = kwargs.get("backscrape_end")
        now = datetime.now()

        if start:
            start = datetime.strptime(start, "%m/%d/%Y")
        else:
            start = self.first_opinion_date
        if end:
            end = datetime.strptime(end, "%m/%d/%Y")
        else:
            end = now

        # `assert` is stripped when Python runs with -O, silently skipping
        # validation; raise an explicit exception instead
        if start >= end:
            raise ValueError("Incorrect backscraper start / end inputs")

        # If there is overlap between backscraping range and reports'
        # page range append the reports' page URL to the iterable
        self.back_scrape_iterable = []
        for item in self.backscrape_date_range_mapper:
            # open-ended ranges ("end": None) extend to `now`
            if max(item["start"], start) < min(item["end"] or now, end):
                self.back_scrape_iterable.append((start, end, item["url"]))
26 changes: 26 additions & 0 deletions juriscraper/opinions/united_states_backscrapers/state/massappct.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,26 @@
from datetime import datetime

from juriscraper.opinions.united_states_backscrapers.state import mass


class Site(mass.Site):
    """Backscraper for Massachusetts Appeals Court opinions on masscases.com

    All download and parsing logic is inherited from the SJC backscraper;
    this subclass only swaps in the App. Ct. report-volume pages.
    """

    # Incomplete mapping of App. Ct. report-volume pages to the
    # filing-date ranges they cover; "end": None is open-ended
    backscrape_date_range_mapper = [
        {
            "start": datetime(2021, 7, 9),
            "end": None,
            "url": "http://masscases.com/app100-124.html",
        },
        {
            "start": datetime(2009, 8, 20),
            "end": datetime(2021, 5, 12),
            "url": "http://masscases.com/app75-99.html",
        },
    ]

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.court_id = self.__module__

    def extract_from_text(self, scraped_text: str) -> dict:
        """Docket extraction is disabled here: App Ct opinion pages are
        formatted differently — see the parent class' `extract_from_text`
        """
        return {}

0 comments on commit dbc78d1

Please sign in to comment.