Merge branch 'main' into fix_dcd
quevon24 committed Jun 11, 2024
2 parents 0834825 + 9574b0c commit cfe404a
Showing 19 changed files with 5,608 additions and 5,704 deletions.
2 changes: 1 addition & 1 deletion .github/workflows/tests.yml
@@ -18,7 +18,7 @@ jobs:
python-version: ${{ matrix.python-version }}
- name: Install dependencies
run: |
- python -m pip install --upgrade pip
+ python -m pip install --upgrade pip setuptools
pip install wheel
pip install -r requirements.txt
pip install -r requirements-dev.txt
183 changes: 78 additions & 105 deletions juriscraper/opinions/united_states/federal_appellate/ca1.py
@@ -1,124 +1,97 @@
-import re
-from datetime import date, datetime, timedelta
+from datetime import date, datetime
+from typing import Tuple
 from urllib.parse import urlencode
 
-from dateutil.rrule import DAILY, rrule
-
-from juriscraper.OpinionSite import OpinionSite
+from juriscraper.AbstractSite import logger
+from juriscraper.lib.date_utils import make_date_range_tuples
+from juriscraper.OpinionSiteLinear import OpinionSiteLinear
 
 
-class Site(OpinionSite):
+class Site(OpinionSiteLinear):
+    # This URL will show most recent opinions
+    base_url = "https://www.ca1.uscourts.gov/opn/aci"
+    days_interval = 5
+    first_opinion_date = datetime(2003, 3, 23)
+
     def __init__(self, *args, **kwargs):
         super().__init__(*args, **kwargs)
-        self.base_url = "http://media.ca1.uscourts.gov/cgi-bin/opinions.pl"
         self.court_id = self.__module__
-        today = date.today()
-        params = urlencode(
-            {
-                "FROMDATE": (today - timedelta(7)).strftime("%m/%d/%Y"),
-                "TODATE": today.strftime("%m/%d/%Y"),
-                "puid": "",
-            }
-        )
-        self.url = f"{self.base_url}/?{params}"
-        # self.url = "http://media.ca1.uscourts.gov/cgi-bin/opinions.pl/?TODATE=06%2F24%2F1993&puid=&FROMDATE=05%2F25%2F1993"
-        self.interval = 30
-        self.back_scrape_iterable = [
-            i.date()
-            for i in rrule(
-                DAILY,
-                interval=self.interval,
-                dtstart=date(1992, 1, 1),
-                until=date(2016, 1, 1),
-            )
-        ]
+        self.url = self.base_url
+        self.make_backscrape_iterable(kwargs)
 
-    def _get_case_names(self):
-        return [
-            e.strip()
-            for e in self.html.xpath(
-                "//tr[position() > 1]/td[4]/text()[contains(., 'v.')]"
-            )
-        ]
-
-    def _get_download_urls(self):
-        return [
-            e for e in self.html.xpath("//tr[position() > 1]/td[2]//@href")
-        ]
-
-    def _get_case_dates(self):
-        dates = []
-        for s in self.html.xpath("//tr[position() > 1]/td[1]//text()"):
-            s = s.replace(r"\t", "").replace(r"\n", "").strip()
-            if s == "1996/05/32":
-                s = "1996/05/30"  # My life is thus lain to waste.
-            dates.append(datetime.strptime(s.strip(), "%Y/%m/%d").date())
-        return dates
-
-    def _get_docket_numbers(self):
-        regex = re.compile(r"(\d{2}-.*?\W)(.*)$")
-        docket_numbers = []
-        for s in self.html.xpath("//tr[position() > 1]/td[2]/a/text()"):
-            s = s.replace("O1-", "01-")  # I grow older, the input grows worse.
-            docket_numbers.append(
-                regex.search(s).group(1).strip().replace(".", "")
-            )
-        return docket_numbers
-
-    def _get_precedential_statuses(self):
-        statuses = []
-        for text in self.html.xpath("//tr[position() > 1]/td[2]//@href"):
-            if "U" in text:
-                statuses.append("Unpublished")
-            elif "P" in text:
-                statuses.append("Published")
-            elif "E" in text:
-                statuses.append("Errata")
-            else:
-                statuses.append("Unknown")
-        return statuses
-
-    def _get_lower_courts(self):
-        lower_courts = []
-        for e in self.html.xpath("//tr[position() > 1]/td[4]/font"):
-            try:
-                lower_courts.append(e.xpath("./text()")[0].strip())
-            except IndexError:
-                lower_courts.append("")
-        return lower_courts
+    def _process_html(self):
+        for row in self.html.xpath("//tr[not(th)]"):
+            title = row.xpath("td[2]/a/text()")[0]
+            url = row.xpath("td[2]/a/@href")[0]
+            status = self.get_status_from_opinion_title(title)
+            docket = row.xpath("td[3]/a/text()")[0]
+            date_filed = row.xpath("td[1]/span/text()")[0]
+            name = row.xpath("td[4]/text()")[0]
+            lower_court = row.xpath("td[4]/span/text()")[0]
+            self.cases.append(
+                {
+                    "name": name.strip(),
+                    "url": url,
+                    "date": date_filed,
+                    "status": status,
+                    "docket": docket,
+                    "lower_court": lower_court,
+                }
+            )
+
+    def get_status_from_opinion_title(self, title: str) -> str:
+        """Status is encoded in opinion's link title
+
+        :param title: opinion title. Ex: 23-1667P.01A, 23-1639U.01A
+        :return: status string
+        """
+        if "U" in title:
+            status = "Unpublished"
+        elif "P" in title:
+            status = "Published"
+        elif "E" in title:
+            status = "Errata"
+        else:
+            status = "Unknown"
+        return status
 
-    def _download_backwards(self, d):
-        params = urlencode(
-            {
-                "FROMDATE": d.strftime("%m/%d/%Y"),
-                "TODATE": (d + timedelta(self.interval)).strftime("%m/%d/%Y"),
-                "puid": "",
-            }
-        )
-        self.url = f"{self.base_url}/?{params}"
+    def _download_backwards(self, dates: Tuple[date]) -> None:
+        """Change URL to backscraping date range
+
+        :param dates: tuple with date range to scrape
+        :return None
+        """
+        start, end = dates
+        params = {
+            "field_opn_csno_value_op": "starts",
+            "field_opn_issdate_value[min][date]": start.strftime("%m/%d/%Y"),
+            "field_opn_issdate_value[max][date]": end.strftime("%m/%d/%Y"),
+        }
+        self.url = f"{self.base_url}?{urlencode(params)}"
         self.html = self._download()
-        if self.html is not None:
-            # Setting status is important because it prevents the download
-            # function from being run a second time by the parse method.
-            self.status = 200
+        self._process_html()
 
-    def _post_parse(self):
-        """This will remove the cases without a case name"""
-        to_be_removed = [
-            index
-            for index, case_name in enumerate(self.case_names)
-            if not case_name.replace("v.", "").strip()
-        ]
-
-        for attr in self._all_attrs:
-            item = getattr(self, attr)
-            if item is not None:
-                new_item = self.remove_elements(item, to_be_removed)
-                self.__setattr__(attr, new_item)
-
-    @staticmethod
-    def remove_elements(list_, indexes_to_be_removed):
-        return [
-            i for j, i in enumerate(list_) if j not in indexes_to_be_removed
-        ]
+    def make_backscrape_iterable(self, kwargs: dict) -> None:
+        """Checks if backscrape start and end arguments have been passed
+        by caller, and parses them accordingly
+
+        :param kwargs: passed when initializing the scraper, may or
+        may not contain backscrape controlling arguments
+        :return None
+        """
+        start = kwargs.get("backscrape_start")
+        end = kwargs.get("backscrape_end")
+
+        if start:
+            start = datetime.strptime(start, "%m/%d/%Y")
+        else:
+            start = self.first_opinion_date
+        if end:
+            end = datetime.strptime(end, "%m/%d/%Y")
+        else:
+            end = datetime.now()
+
+        self.back_scrape_iterable = make_date_range_tuples(
+            start, end, self.days_interval
+        )
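
For reference, a minimal standalone sketch (not part of the commit) of the title-to-status mapping that the new get_status_from_opinion_title helper applies; the sample link titles are the ones cited in its docstring, and status_from_title is a hypothetical stand-in name used only for this illustration:

# Illustrative sketch only: mirrors the if/elif chain added in ca1.py above.
def status_from_title(title: str) -> str:
    # The raw link title, e.g. "23-1667P.01A", carries the status letter
    if "U" in title:
        return "Unpublished"
    elif "P" in title:
        return "Published"
    elif "E" in title:
        return "Errata"
    return "Unknown"


print(status_from_title("23-1667P.01A"))  # Published
print(status_from_title("23-1639U.01A"))  # Unpublished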
162 changes: 86 additions & 76 deletions juriscraper/opinions/united_states/state/nd.py
@@ -1,93 +1,103 @@
 # Author: Phil Ardery
 # Contact: https://www.ndcourts.gov/contact-us
 # Date created: 2019-02-28
-from juriscraper.lib.exceptions import InsanityException
-from juriscraper.lib.string_utils import convert_date_string
-from juriscraper.OpinionSite import OpinionSite
+# Updated: 2024-05-08, grossir: to OpinionSiteLinear and new URL
+import re
+from typing import Tuple
+from urllib.parse import urljoin
+
+from juriscraper.OpinionSiteLinear import OpinionSiteLinear
 
 
-class Site(OpinionSite):
+class Site(OpinionSiteLinear):
+    base_url = "https://www.ndcourts.gov/"
+    ordered_fields = [
+        "name",
+        "docket",
+        "date",
+        "nature_of_suit",
+        "judge",
+    ]
+
     def __init__(self, *args, **kwargs):
         super().__init__(*args, **kwargs)
         self.court_id = self.__module__
-        self.url = "https://www.ndcourts.gov/supreme-court/recent-opinions?pageSize=100"
-        self.cases = []
+        self.url = "https://www.ndcourts.gov/supreme-court/opinions?topic=&author=&searchQuery=&trialJudge=&pageSize=100&sortOrder=1"
+        self.status = "Published"
 
-    def _process_html(self):
-        for row in self.html.xpath('//table//div[@class="row"]'):
-            case = self.case_fields_extract(row)
-            self.case_fields_validate(case)
-            case = self.case_fields_sanitize(case)
-            self.cases.append(case)
-
-    def case_fields_extract(self, row):
-        text_lines = row.xpath("./div/p[1]/text()")
-        text_lines = [
-            l.strip() for l in text_lines if l.strip()
-        ]  # Remove empty lines
-        line_count = len(text_lines)
-        return {
-            "citation": text_lines[0],
-            "docket": text_lines[1],
-            "date": text_lines[2],
-            "name": row.xpath(".//a")[0].text_content().strip(),
-            "nature": text_lines[3],
-            "judge": text_lines[4],
-            "summary": " ".join(text_lines[5:line_count])
-            if line_count > 5
-            else "",
-            "url": row.xpath(".//button/@onclick")[0].split("'")[1],
-        }
-
-    def case_fields_validate(self, case):
-        if "ND" not in case["citation"]:
-            raise InsanityException(f"Invalid citation: {case['citation']}")
-        if not case["docket"].startswith("Docket No.:"):
-            raise InsanityException(
-                f"Invalid docket raw string: {case['docket']}"
-            )
-        if not case["date"].startswith("Filing Date:"):
-            raise InsanityException(
-                f"Invalid date string raw string: {case['date']}"
-            )
-        if not case["nature"].startswith("Case Type:"):
-            raise InsanityException(
-                f"Invalid type raw string: {case['nature']}"
-            )
-        if not case["judge"].startswith("Author:"):
-            raise InsanityException(
-                f"Invalid author raw string: {case['judge']}"
-            )
-
-    def case_fields_sanitize(self, case):
-        for field in ["date", "docket", "judge", "nature"]:
-            case[field] = case[field].split(":", 1)[1].strip()
-        return case
-
-    def _get_download_urls(self):
-        return [case["url"] for case in self.cases]
-
-    def _get_case_names(self):
-        return [case["name"] for case in self.cases]
-
-    def _get_case_dates(self):
-        return [convert_date_string(case["date"]) for case in self.cases]
-
-    def _get_docket_numbers(self):
-        return [case["docket"] for case in self.cases]
+    def _process_html(self) -> None:
+        """Most values are inside a <p>: whitespace and
+        field names need to be cleaned
+
+        Citation used to be available, now must be got from inside
+        the document's text
+        """
+        for row in self.html.xpath('//table//div[@class="row"]'):
+            raw_values = list(map(str.strip, row.xpath("./div/p[1]/text()")))
+            values = []
+
+            for idx, txt in enumerate(raw_values[:5]):
+                if idx == 0:
+                    txt, extra_docket = self.clean_name(txt)
+                else:
+                    txt = txt.split(":", 1)[1].strip()
+                values.append(txt)
+
+            summary = (
+                " ".join(raw_values[5:]).strip() if len(raw_values) > 5 else ""
+            )
+            url = urljoin(
+                self.base_url,
+                row.xpath(".//button[@onclick]/@onclick")[0].split("'")[1],
+            )
+            case = dict(zip(self.ordered_fields, values[:5]))
+            case["summary"] = summary
+            case["url"] = url
+
+            # There is a per_curiam field on the CL Opinion model,
+            # but we don't process it if sent by the scraper
+            if "Per Curiam" in case["judge"]:
+                case["judge"] = ""
+
+            self.cases.append(case)
 
     def _get_nature_of_suit(self):
-        return [case["nature"] for case in self.cases]
-
-    def _get_citations(self):
-        return [case["citation"] for case in self.cases]
-
-    def _get_judges(self):
-        return [case["judge"] for case in self.cases]
-
-    def _get_precedential_statuses(self):
-        return ["Published"] * len(self.cases)
-
-    def _get_summaries(self):
-        return [case["summary"] for case in self.cases]
+        return [case["nature_of_suit"] for case in self.cases]
+
+    def clean_name(self, name: str) -> Tuple[str, str]:
+        """Cleans case name
+
+        Some case names list the consolidated docket or a
+        (CONFIDENTIAL) parentheses
+
+        :param name: raw case name
+        :return: cleaned name and extra_docket numbers
+        """
+        other_dockets = ""
+        if "(consolidated w/" in name:
+            other_dockets = ",".join(re.findall(r"\d{8}", name))
+            name = name.split("(consolidated w/")[0]
+        if "(CONFIDENTIAL" in name:
+            name = name.split("(CONFIDENTIAL")[0]
+
+        return name.strip(), other_dockets
+
+    def extract_from_text(self, scraped_text: str) -> dict:
+        """Extract Citation from text
+
+        :param scraped_text: Text of scraped content
+        :return: date filed
+        """
+        regex = r"(?P<vol>20\d{2})\sND\s(?P<page>\d+)"
+        match = re.search(regex, scraped_text[:1000])
+
+        if match:
+            return {
+                "Citation": {
+                    "volume": match.group("vol"),
+                    "reporter": "ND",
+                    "page": match.group("page"),
+                    "type": 8,  # NEUTRAL in courtlistener Citation model
+                },
+            }
+        return {}
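
For reference, a minimal standalone sketch (not part of the commit) showing how the neutral-citation regex added in extract_from_text behaves; the sample opinion text below is hypothetical:

# Illustrative sketch only: the regex is the one added in nd.py above.
import re

sample_text = "Filed 06/11/2024\n2024 ND 112\nState of North Dakota v. Doe"
match = re.search(r"(?P<vol>20\d{2})\sND\s(?P<page>\d+)", sample_text[:1000])
if match:
    print(match.group("vol"), match.group("page"))  # 2024 112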
