-
-
Notifications
You must be signed in to change notification settings - Fork 98
/
oral_argument_template.py
115 lines (92 loc) · 3.83 KB
/
oral_argument_template.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
"""Scraper for [Full name of court]
CourtID: [unique abbreviation to be used by software/filesystem]
Court Short Name: [standard abbreviation used in citations]
Author:
Reviewer:
History:
YYYY-MM-DD: Created by XXX
"""
from lxml import html
from juriscraper.lib.string_utils import convert_date_string, titlecase
from juriscraper.OralArgumentSite import OralArgumentSite
class Site(OralArgumentSite):
    """Template oral argument scraper: copy it, then adapt each method.

    The URLs and XPath expressions below are placeholders and must be
    replaced with values that match the court website being scraped.
    """

    def __init__(self, *args, **kwargs):
        """Initialise the parent site, then set this scraper's attributes."""
        super().__init__(*args, **kwargs)
        self.court_id = self.__module__
        self.url = "http://court-url.gov/some-path.html"
        # NOTE(review): presumably the HTTP verb used when fetching self.url;
        # confirm against the parent class and adjust for the target site.
        self.method = "POST"
        self.uses_selenium = False
        # Complete this variable if you create a backscraper.
        self.back_scrape_iterable = None

    # ------------------------------------------------------------------
    # Required fields - InsanityExceptions will be thrown if omitted.
    #
    # Remove this comment before submission.
    # ------------------------------------------------------------------

    def _get_download_urls(self):
        """This is an example of a basic piece of meta data accessible with a
        simple XPath query.

        On a good site, most of your methods will follow this pattern.

        Note that relative URLs are made absolute by the AbstractSite
        object's cleanup routines, so doing so here is not needed.

        Returns a list with one entry per node matched by the XPath.
        """
        path = "//path/to/text/text()"
        return list(self.html.xpath(path))

    def _get_case_names(self):
        """This example demonstrates how to extract text from an element that
        may contain other elements.

        For example, this will work well on something like:
            <strong>Nadim v. <em>Jenny</em></strong>

        Resulting in text like:
            Nadim v. Jenny

        Note that titlecase() should be used here in the case that the case
        names are provided in uppercase. Use the titlecase function on
        cases where the name is provided in uppercase only.

        Returns a list with one case name per matched element.
        """
        return [
            titlecase(
                html.tostring(
                    element,
                    method="text",
                    encoding="unicode",
                    # Without this, any text that follows the element's
                    # closing tag (its "tail") would be appended to the
                    # case name.
                    with_tail=False,
                )
            )
            for element in self.html.xpath("//path/to/an/element/p")
        ]

    def _get_case_dates(self):
        """This is an example of a date field.

        Every text node matched by the XPath is handed to
        convert_date_string(); no explicit format string is passed here.
        If the site's dates are not parsed correctly, adjust the XPath or
        pre-process the strings before converting them.

        Returns a list with one converted date per matched text node.
        """
        path = "//path/to/text/text()"
        return [
            convert_date_string(date_string)
            for date_string in self.html.xpath(path)
        ]

    # ------------------------------------------------------------------
    # High priority fields
    #
    # Remove this comment and any unused methods before submission
    # ------------------------------------------------------------------

    def _get_docket_numbers(self):
        """
        This is typically of the form ##-####

        Placeholder: returns None until implemented for the target site.
        """
        return None

    def _get_judges(self):
        """
        For the names of the judges associated with each oral argument.

        NOTE(review): the previous wording ("the judge authoring the lead
        opinion") appears to come from the opinion template; confirm the
        expected content for oral arguments.

        Placeholder: returns None until implemented for the target site.
        """
        return None

    # ------------------------------------------------------------------
    # Optional method used for downloading multiple pages of a court site.
    # ------------------------------------------------------------------

    def _download_backwards(self, date_str):
        """This is a simple method that can be used to generate Site objects
        that can be used to paginate through a court's entire website.

        This method is usually called by a backscraper caller (see the
        one in CourtListener/alert/scrapers for details), and typically
        modifies aspects of the Site object's attributes such as Site.url.

        A simple example has been provided below. The idea is that the
        caller runs this method with a different variable on each iteration.
        That variable is often a date that is getting iterated or is simply
        an index (i), that we iterate upon.

        This can also be used to hold notes useful to future backscraper
        development.

        :param date_str: value interpolated into the URL for this iteration
        """
        self.url = f"http://example.com/new/url/{date_str}"
        # Re-download so self.html reflects the newly assigned URL.
        self.html = self._download()