fix(dcd): update to OpinionSiteLinear #1020

Merged
merged 5 commits on Jun 11, 2024
Changes from 1 commit
192 changes: 76 additions & 116 deletions juriscraper/opinions/united_states/federal_district/dcd.py
@@ -4,142 +4,102 @@
 Author: V. David Zvenyach
 Date created: 2014-02-27
 Substantially Revised: Brian W. Carver, 2014-03-28
+2024-05-03, grossir: Change base class OpinionSiteLinear
 """

 import re
-import time
-from datetime import date
+from datetime import date, datetime
+from typing import Tuple

 from lxml import html

 from juriscraper.lib.string_utils import titlecase
-from juriscraper.OpinionSite import OpinionSite
+from juriscraper.OpinionSiteLinear import OpinionSiteLinear


-class Site(OpinionSite):
+class Site(OpinionSiteLinear):
+    docket_document_number_regex = re.compile(r"(\?)(\d+)([a-z]+)(\d+)(-)(.*)")
+    nature_of_suit_regex = re.compile(r"(\?)(\d+)([a-z]+)(\d+)(-)(.*)")

     def __init__(self, *args, **kwargs):
         super().__init__(*args, **kwargs)
         self.court_id = self.__module__
-        self.url = "https://ecf.dcd.uscourts.gov/cgi-bin/Opinions.pl?" + str(
-            date.today().year
-        )
+        self.url = f"https://ecf.dcd.uscourts.gov/cgi-bin/Opinions.pl?{date.today().year}"
+        self.status = "Published"

-    def _get_download_urls(self):
-        # There are often multiple documents and hence urls for each case.
-        # This requires us to pad every other metadata field to match the
-        # number of urls we find here.
-        path = "//table[2]//tr[position()>0]/td[3]/a/@href"
-        return [url for url in self.html.xpath(path)]
+    def _process_html(self):
+        """
+        Some rows have multiple documents and hence urls for each case.
+        We will "pad" every other metadata field to match the urls.
+        """
+        for row in self.html.xpath("//table[2]//tr[not(th)]"):
+            case_name = titlecase(
+                row.xpath("td[2]//text()[preceding-sibling::br]")[0].lower()
+            )
+            date_string = row.xpath("td[1]/text()")[0]
+            date_filed = datetime.strptime(date_string, "%m/%d/%Y")
+            docket = row.xpath("td[2]//text()[following-sibling::br]")[0]

-    def _get_case_names(self):
-        casenames = []
-        rowpath = "//table[2]//tr[position()>0]"
-        cnpath = "./td[2]//text()[preceding-sibling::br]"
-        urlpath = "./td[3]/a/@href"
-        for row in self.html.xpath(rowpath):
-            case_list = row.xpath(cnpath)
-            for rough_case_name in case_list:
-                case_name = titlecase(rough_case_name.lower())
-                # Determine the number of urls in each row and pad the case
-                # name list sufficiently
-                count = len(row.xpath(urlpath))
-                casenames.extend([case_name] * count)
-        return casenames
+            judge_element = row.xpath("td[3]")[0]
+            judge_string = html.tostring(
+                judge_element, method="text", encoding="unicode"
+            )
+            judge = re.search(r"(by\s)(.*)", judge_string, re.MULTILINE).group(2)

-    def _get_case_dates(self):
-        dates = []
-        rowpath = "//table[2]//tr[position()>0]"
-        datepath = "./td[1]/text()"
-        urlpath = "./td[3]/a/@href"
-        for row in self.html.xpath(rowpath):
-            date_string = row.xpath(datepath)
-            for d in date_string:
-                date_object = date.fromtimestamp(
-                    time.mktime(time.strptime(d, "%m/%d/%Y"))
-                )
-                # Determine the number of urls in each row and pad the date
-                # list sufficiently
-                count = len(row.xpath(urlpath))
-                dates.extend([date_object] * count)
-        return dates
+            for url in row.xpath("td[3]/a/@href"):
+                doc_number, nature_of_suit = self.get_values_from_url(url)
+                self.cases.append(
+                    {
+                        "name": case_name,
+                        "date": str(date_filed),
+                        "url": url,
+                        "docket": docket,
+                        "docket_document_numbers": doc_number,
+                        "nature_of_suit": nature_of_suit,
+                        "judge": judge,
+                    }
+                )

-    def _get_precedential_statuses(self):
-        return ["Published"] * len(self.case_names)
-
-    def _get_docket_numbers(self):
-        docket_numbers = []
-        rowpath = "//table[2]//tr[position()>0]"
-        dktpath = "./td[2]//text()[following-sibling::br]"
-        urlpath = "./td[3]/a/@href"
-        for row in self.html.xpath(rowpath):
-            docket_number = row.xpath(dktpath)
-            # Determine the number of urls in each row and pad the docket
-            # numbers list sufficiently
-            count = len(row.xpath(urlpath))
-            docket_numbers.extend(docket_number * count)
-        return docket_numbers
+    def get_values_from_url(self, url: str) -> Tuple[str, str]:
+        """Get docket document number and nature_of_suit values from URL
+
+        :param url:
+        :return: docket document number and nature_of_suit
+        """
+        # In 2012 (and perhaps elsewhere) they have a few weird urls.
+        match = self.docket_document_number_regex.search(url)
+        if match:
+            doc_number = match.group(6)
+        else:
+            doc_number = url

-    def _get_docket_document_numbers(self):
-        document_numbers = []
-        regex = re.compile(r"(\?)(\d+)([a-z]+)(\d+)(-)(.*)")
-        for url in self.html.xpath(
-            "//table[2]//tr[position()>0]/td[3]/a/@href"
-        ):
-            # Because we are acting directly on the entire url list, no padding
-            # of the docket number field is required.
-            doc_no = regex.search(url)
-            # In 2012 (and perhaps elsewhere) they have a few weird urls.
-            if re.search(regex, url) is not None:
-                document_numbers.append(doc_no.group(6))
-            else:
-                document_numbers.append(url)
-        return document_numbers
+        nature_of_suit_match = self.nature_of_suit_regex.search(url)
+        # In 2012 (and perhaps elsewhere) they have a few weird urls.
+        if not nature_of_suit_match:
+            nature_of_suit = "Unknown"
+        else:
+            nature_code = nature_of_suit_match.group(3)
+            if nature_code == "cv":
+                nature_of_suit = "Civil"
+            elif nature_code == "cr":
+                nature_of_suit = "Criminal"
+            # This is a tough call. Magistrate Cases are typically also
+            # Criminal or Civil cases, and their docket_number field will
+            # reflect this, but they do classify these separately under
+            # these 'mj' and 'mc' codes and the first page of these
+            # documents will often refer to them as 'Magistrate Case
+            # ####-####', so we will too.
+            elif nature_code in ("mj", "mc"):
+                nature_of_suit = "Magistrate Case"
+            else:
+                nature_of_suit = "Unknown"

-    def _get_judges(self):
-        judges = []
-        rowpath = "//table[2]//tr[position()>0]"
-        urlpath = "./td[3]/a/@href"
-        judgepath = "./td[3]"
-        for row in self.html.xpath(rowpath):
-            for judge_element in row.xpath(judgepath):
-                judge_string = html.tostring(
-                    judge_element, method="text", encoding="unicode"
-                )
-                judge = re.search(
-                    r"(by\s)(.*)", judge_string, re.MULTILINE
-                ).group(2)
-                # Determine the number of urls in each row and pad the judges
-                # list sufficiently
-                count = len(row.xpath(urlpath))
-                judges.extend([judge] * count)
-        return judges
+        return doc_number, nature_of_suit
+
+    def _get_docket_document_numbers(self):
+        return [case["docket_document_numbers"] for case in self.cases]

     def _get_nature_of_suit(self):
-        nos = []
-        for url in self.html.xpath(
-            "//table[2]//tr[position()>0]/td[3]/a/@href"
-        ):
-            # Because we are acting directly on the entire url list, no padding
-            # of the nature of suit field is required.
-            regex = r"(\?)(\d+)([a-z]+)(\d+)(\-)(.*)"
-            url_str = re.search(r"(\?)(\d+)([a-z]+)(\d+)(-)(.*)", url)
-            # In 2012 (and perhaps elsewhere) they have a few weird urls.
-            if re.search(regex, url) is not None:
-                nature_code = url_str.group(3)
-                if nature_code == "cv":
-                    nos.append("Civil")
-                elif nature_code == "cr":
-                    nos.append("Criminal")
-                # This is a tough call. Magistrate Cases are typically also
-                # Criminal or Civil cases, and their docket_number field will
-                # reflect this, but they do classify these separately under
-                # these 'mj' and 'mc' codes and the first page of these
-                # documents will often refer to them as 'Magistrate Case
-                # ####-####' so, we will too.
-                elif nature_code == "mj" or "mc":
-                    nos.append("Magistrate Case")
-                else:
-                    nos.append("Unknown")
-            else:
-                nos.append("Unknown")
-        return nos
+        return [case["nature_of_suit"] for case in self.cases]
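For reference, the two class-level patterns parse the query string of each opinion link. A minimal sketch of what the groups capture, using a hypothetical URL shaped like the document hrefs this scraper reads (the real host and path may differ):

import re

pattern = re.compile(r"(\?)(\d+)([a-z]+)(\d+)(-)(.*)")

# Hypothetical link, shaped like the hrefs found in the Opinions.pl table.
url = "https://ecf.dcd.uscourts.gov/cgi-bin/show_public_doc?2014cv0123-45"

match = pattern.search(url)
if match:
    print(match.group(2))  # "2014" - year
    print(match.group(3))  # "cv"   - case-type code used for nature_of_suit
    print(match.group(4))  # "0123" - case number
    print(match.group(6))  # "45"   - docket document number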
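A note on the magistrate-case branch in get_values_from_url: the membership test nature_code in ("mj", "mc") matters, because the tempting shorthand nature_code == "mj" or "mc" (which the deleted _get_nature_of_suit used) parses as (nature_code == "mj") or "mc" and is always truthy, so its final "Unknown" branch could never run. A quick check:

nature_code = "zz"

print(nature_code == "mj" or "mc")  # "mc" (truthy): the branch would always be taken
print(nature_code in ("mj", "mc"))  # False: falls through to "Unknown" as intended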
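The "padding" described in the _process_html docstring can be seen with a toy row. This is a hypothetical two-document row, shaped like the court's opinion table, showing why each href becomes its own case entry sharing the row's name, date, docket, and judge:

from lxml import html

# Hypothetical row: docket before <br/>, case name after, two document links.
row_html = """
<table><tr>
  <td>05/03/2024</td>
  <td>1:24-cv-0001<br/>DOE v. AGENCY</td>
  <td><a href="?2024cv0001-10">Memo</a>
      <a href="?2024cv0001-12">Order</a> by Judge Example</td>
</tr></table>"""

row = html.fromstring(row_html).xpath("//tr")[0]
urls = row.xpath("td[3]/a/@href")
print(urls)       # ['?2024cv0001-10', '?2024cv0001-12']
print(len(urls))  # 2 -> two case dicts sharing the row's metadata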
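Finally, a sketch of how the reworked Site is typically exercised, assuming juriscraper's usual AbstractSite entry point (parse() downloads the page and, for OpinionSiteLinear subclasses, drives _process_html()); the attribute names follow the getters above:

from juriscraper.opinions.united_states.federal_district import dcd

site = dcd.Site()
site.parse()  # fetch Opinions.pl for the current year and build the cases

for name, docket, url in zip(
    site.case_names, site.docket_numbers, site.download_urls
):
    print(docket, name, url)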