generated from freelawproject/new-project-template
-
-
Notifications
You must be signed in to change notification settings - Fork 11
/
Copy pathcourtlistener.py
263 lines (217 loc) · 7.98 KB
/
courtlistener.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
import logging
import re
from typing import TypedDict
import courts_db
import requests
from django.conf import settings
from django.core.exceptions import ValidationError
from django.core.validators import URLValidator
from .exceptions import MultiDefendantCaseError
logger = logging.getLogger(__name__)
# Matches CourtListener docket and docket-entry URLs and captures the numeric
# docket id in the `docket_id` group, e.g.:
#   https://www.courtlistener.com/docket/65745614/united-states-v-ward/
#   https://www.courtlistener.com/docket/65364032/6/1/antonyuk-v-hochul/
# The pattern is unanchored (it is used with a search, not a full match) and
# requires a "/" after the id, so a URL that ends right after the digits does
# not match.
DOCKET_URL_PATTERN = re.compile(
    r"(?:www\.courtlistener\.com\/docket\/)(?P<docket_id>\d+)(?:\/.*)"
)
# Matches PDF URLs served from CourtListener's RECAP storage, e.g.:
#   https://storage.courtlistener.com/recap/gov.uscourts.dcd.178502/gov.uscourts.dcd.178502.1.0_48.pdf
#   https://storage.courtlistener.com/recap/gov.uscourts.cand.373179/gov.uscourts.cand.373179.1.0.pdf
# Named groups:
#   url_for_redirect - the URL up to and including the case directory
#   court            - the lowercase court id (e.g. "dcd")
#   pacer_case_id    - the numeric PACER case id (e.g. "178502")
# The dots in "gov.uscourts.<court>.<id>" are escaped so they only match a
# literal "." rather than any character.
PDF_URL_PATTERN = re.compile(
    r"(?P<url_for_redirect>(https:\/{2}storage\.courtlistener\.com\/recap\/gov\.uscourts\.(?P<court>[a-z]+)\.(?P<pacer_case_id>\d+)))(?:\/.*)"
)
# Root shared by every CourtListener REST (v3) endpoint listed below.
_CL_REST_ROOT = "https://www.courtlistener.com/api/rest/v3"

# Base URLs used by this module, keyed by a short endpoint name. Every value
# ends with "/" so callers can append an id or a file path directly.
CL_API = {
    "docket": f"{_CL_REST_ROOT}/dockets/",
    "docket-alerts": f"{_CL_REST_ROOT}/docket-alerts/",
    "recap-documents": f"{_CL_REST_ROOT}/recap-documents/",
    "recap-fetch": f"{_CL_REST_ROOT}/recap-fetch/",
    "media-storage": "https://storage.courtlistener.com/",
}
# PACER court ids whose CourtListener id is spelled differently.
pacer_to_cl_ids = {
    "azb": "arb",  # Arizona Bankruptcy Court
    "cofc": "uscfc",  # Court of Federal Claims
    "neb": "nebraskab",  # Nebraska Bankruptcy
    "nysb-mega": "nysb",  # Remove the mega thing
}

# Inverse table (CL id -> PACER id). "nysb" is deliberately left out, so it
# is never translated back to "nysb-mega".
cl_to_pacer_ids = {
    cl_court: pacer_court
    for pacer_court, cl_court in pacer_to_cl_ids.items()
    if cl_court != "nysb"
}
def map_pacer_to_cl_id(pacer_id):
    """Translate a PACER court id to its CourtListener id.

    Ids that have no entry in ``pacer_to_cl_ids`` are returned unchanged.
    """
    if pacer_id in pacer_to_cl_ids:
        return pacer_to_cl_ids[pacer_id]
    return pacer_id
def map_cl_to_pacer_id(cl_id):
    """Translate a CourtListener court id to its PACER id.

    Ids that have no entry in ``cl_to_pacer_ids`` are returned unchanged.
    """
    if cl_id in cl_to_pacer_ids:
        return cl_to_pacer_ids[cl_id]
    return cl_id
def get_docket_id_from_query(query: str) -> int:
    """Returns the docket id extracted from the search query

    The query may be a bare docket id, a CourtListener docket/document URL,
    or a CourtListener storage PDF URL.

    Args:
        query (str): the query string provided by the curators using the
            search bar

    Raises:
        ValidationError: if the provided string is not a number or a valid
            URL.
        requests.exceptions.RequestException: if the query is a PDF link and
            the request used to resolve it fails (error status, timeout,
            connection problem).

    Returns:
        int: the docket id
    """
    cleaned_str = query.strip()

    # isdecimal() rather than isnumeric(): isnumeric() is also true for
    # characters such as "½" or "²", which int() rejects with a ValueError
    # instead of the ValidationError this function documents.
    if cleaned_str.isdecimal():
        return int(cleaned_str)

    # Anything that is not a plain number must at least be a well-formed URL;
    # the validator raises ValidationError otherwise.
    URLValidator()(cleaned_str)

    # A PDF link does not carry the docket id, so request the case-level URL
    # and continue with the URL the response ends up at (presumably the
    # docket page after redirects - confirm against the storage service).
    pdf_match = PDF_URL_PATTERN.search(cleaned_str)
    if pdf_match:
        response = requests.get(pdf_match.group("url_for_redirect"), timeout=5)
        response.raise_for_status()
        cleaned_str = response.url

    # Either the original query or the resolved PDF URL should now be a
    # CourtListener docket link.
    docket_match = DOCKET_URL_PATTERN.search(cleaned_str)
    if docket_match:
        return int(docket_match.group("docket_id"))

    raise ValidationError("The query string provided is invalid")
def lookup_court(court: str):
    """
    Resolve a court name or citation string with courts-db.

    Returns the court ID (e.g., "cand" for "N.D. Cal.") when courts-db
    reports exactly one match. When there are no matches, or more than one,
    a message is printed and None is returned.
    """
    matches = courts_db.find_court(court)
    match_count = len(matches)

    if match_count == 1:
        return matches[0]

    if match_count == 0:
        print(f"No results for court '{court}'")
    else:
        # Several candidates: ambiguous, so give up rather than guess.
        print(f"Could not resolve court '{court}'")
    return None
def auth_header() -> dict:
    """Build the Authorization header for CourtListener API requests.

    The token is read from ``settings.COURTLISTENER_API_KEY`` on every call.
    """
    return {"Authorization": f"Token {settings.COURTLISTENER_API_KEY}"}
def lookup_docket_by_cl_id(cl_id: int):
    """
    Fetch a single Docket from /api/rest/v3/dockets/ by its CourtListener ID.

    Returns the decoded JSON body of the response. Raises
    requests.exceptions.HTTPError when the API answers with an error status.
    """
    docket_url = f"{CL_API['docket']}{cl_id}/"
    docket_response = requests.get(
        docket_url,
        headers=auth_header(),
        timeout=5,
    )
    docket_response.raise_for_status()
    return docket_response.json()
class DocumentDict(TypedDict):
    """Subset of a RECAP document record requested from the API.

    These are the two fields asked for by ``lookup_document_by_doc_id``.
    """

    # Number of pages in the document.
    page_count: int
    # Path of the stored file; download_pdf_from_cl appends it to the
    # media-storage base URL.
    filepath_local: str
def lookup_document_by_doc_id(doc_id: int | None) -> DocumentDict:
    """
    Fetch a RECAP document from /api/rest/v3/recap-documents/ by document id.

    Only the ``filepath_local`` and ``page_count`` fields are requested.
    Raises requests.exceptions.HTTPError when the API answers with an error
    status.
    """
    document_url = f"{CL_API['recap-documents']}{doc_id}/"
    # Ask the API for just the fields DocumentDict declares.
    query_params = {"fields": "filepath_local,page_count"}

    api_response = requests.get(
        document_url,
        params=query_params,
        headers=auth_header(),
        timeout=5,
    )
    api_response.raise_for_status()

    document: DocumentDict = api_response.json()
    return document
def download_pdf_from_cl(filepath: str) -> bytes:
    """Download a file from CourtListener's media storage.

    ``filepath`` is appended to the "media-storage" base URL. Returns the raw
    response body. Raises requests.exceptions.HTTPError when the storage
    server answers with an error status.
    """
    pdf_response = requests.get(
        f"{CL_API['media-storage']}{filepath}",
        timeout=3,
    )
    pdf_response.raise_for_status()
    return pdf_response.content
def purchase_pdf_by_doc_id(doc_id: int | None) -> int:
    """
    Submit a POST request to /api/rest/v3/recap-fetch/ for the given
    CourtListener document id, authenticating to PACER with the credentials
    stored in the Django settings.

    Returns the ``id`` field of the JSON response. Raises
    requests.exceptions.HTTPError when the API answers with an error status.
    """
    fetch_payload = {
        # NOTE(review): presumably the "PDF" request type of the RECAP Fetch
        # API - confirm against the API documentation.
        "request_type": 2,
        "pacer_username": settings.PACER_USERNAME,
        "pacer_password": settings.PACER_PASSWORD,
        "recap_document": doc_id,
    }
    fetch_response = requests.post(
        f"{CL_API['recap-fetch']}",
        json=fetch_payload,
        headers=auth_header(),
        timeout=5,
    )
    fetch_response.raise_for_status()
    return fetch_response.json()["id"]
def lookup_docket_by_case_number(court: str, docket_number: str):
    """
    Performs a GET query on /api/rest/v3/dockets/
    using the court_id and docket_number to get a
    Docket.

    Args:
        court (str): value sent as the ``court_id`` filter.
        docket_number (str): value sent as the ``docket_number`` filter.

    Returns:
        The single matching docket (first element of ``results``), or None
        when the API reports no match.

    Raises:
        requests.exceptions.HTTPError: if the API answers with an error
            status.
        MultiDefendantCaseError: if more than one docket matches.
    """
    response = requests.get(
        CL_API["docket"],
        params={"court_id": court, "docket_number": docket_number},
        headers=auth_header(),
        timeout=5,
    )
    # Same contract as the other API helpers in this module: surface HTTP
    # errors as HTTPError instead of failing later while reading fields that
    # an error body may not contain.
    response.raise_for_status()

    data = response.json()
    num_results = data["count"]
    if num_results == 1:
        return data["results"][0]
    if num_results == 0:
        return None

    # RESULT: We have multiple CL dockets corresponding to
    # multiple PACER IDs :(
    #
    # See, e.g., nyed 1:09-cr-00466, which gives this mapping
    # of CL IDs to PACER IDs
    # {
    #     4319866: '294052',
    #     6146972: '294050',
    #     6360330: '294049',
    #     6452146: '294051',
    #     14197745: '294048',
    #     14569244: '294054',
    #     14665429: '294053'
    # }
    # The PACER IDs are all consecutive, from 294048 to 294054.
    # This is a criminal case with 6 defendants. Is that it?
    #
    # Aha. Yep. Brad had the case number "1:09-cr-00466-4". I'd trimmed
    # the "-4".
    #
    # Notes in Issue: https://github.com/freelawproject/bigcases2/issues/18
    # TODO: Figure out how to choose the "best" of multiple dockets
    raise MultiDefendantCaseError(
        f"Expected 0 or 1 results, but got {num_results}"
    )
def subscribe_to_docket_alert(cl_id: int) -> bool:
    """
    Subscribe to docket alerts for a CourtListener docket ID by POSTing to
    /api/rest/v3/docket-alerts/.

    Returns True when the API answers with a success status. On an HTTP
    error status a message is printed and False is returned.
    """
    alert_payload = {"docket": cl_id}
    response = requests.post(
        CL_API["docket-alerts"],
        headers=auth_header(),
        data=alert_payload,
        timeout=5,
    )

    try:
        response.raise_for_status()
    except requests.exceptions.HTTPError as err:
        print(
            f"Error subscribing to case {cl_id}: got HTTP response {err.response.status_code}"
        )
        return False
    return True
def handle_multi_defendant_cases(queue):
    """Process queued (court, case_number) pairs for multi-defendant cases.

    Not implemented yet: the first queued item raises NotImplementedError.
    An empty queue only produces the "started" and "done" debug messages.

    NOTE(review): the raise is taken to be inside the loop body; confirm
    against the original file's indentation.
    """
    logger.debug("handle_multi_defendant_cases(): started")

    for court, case_number in queue:
        logger.debug(
            f"handle_multi_defendant_cases(): trying {court} {case_number}"
        )
        # TODO
        raise NotImplementedError

    logger.debug("handle_multi_defendant_cases(): done")