# Get Plain English Docket Progression

This notebook outlines the steps undertook to clean the RECAP docket and document descriptions, generate plain English docket progression for embedding generation, and look into availability of "complaints" for clustering.

# Import Libaries

In [1]:
import numpy as np
import pandas as pd

# Load the data

In [2]:
df = pd.read_csv("dockets.csv")
df = df[~df["docket_entry_id"].isna()]
len(df)

8209

In [3]:
df.columns

Index(['docket_id', 'docket_date_created', 'docket_case_name_short',
       'docket_case_name', 'docket_case_name_full', 'docket_slug',
       'docket_number', 'blocked', 'docket_court_id', 'assigned_to_id',
       'assigned_to_str', 'cause', 'date_filed', 'date_last_filing',
       'date_terminated', 'jurisdiction_type', 'jury_demand', 'nature_of_suit',
       'docket_pacer_case_id', 'appeal_from_id', 'appeal_from_str',
       'appellate_case_type_information', 'originating_court_information_id',
       'docket_number_core', 'idb_data_id', 'parent_docket_id',
       'docket_entry_id', 'docket_entry_date_created', 'entry_number',
       'description', 'docket_entry_docket_id', 'pacer_sequence_number',
       'recap_sequence_number', 'recap_document_id', 'recap_date_created',
       'recap_document_type', 'recap_document_number',
       'recap_attachment_number', 'recap_pacer_doc_id', 'is_available',
       'recap_docket_entry_id', 'recap_description', 'ocr_status',
       'plain_text',

# Clean up the data

In [4]:
df = df[["docket_id", "docket_number", "docket_case_name", "court_id", "court_short_name", "nature_of_suit",
         "docket_entry_id", "docket_entry_date_created", "entry_number", "description", 
        'recap_document_id', "recap_date_created", 'recap_document_number', "recap_description", 
         "plain_text"]]

df = df.rename(columns={"docket_entry_id": "entry_id",
                        "docket_entry_date_created": "entry_time",
                        "description": "entry_description",
                        "recap_date_created": "recap_time",
                        "plain_text": "recap_document"})

df.head()

Unnamed: 0,docket_id,docket_number,docket_case_name,court_id,court_short_name,nature_of_suit,entry_id,entry_time,entry_number,entry_description,recap_document_id,recap_time,recap_document_number,recap_description,recap_document
8,66759071,1:23-cv-00078,"Belfor USA Group, Inc. v. Promesa Apartments, ...",txwd,W.D. Texas,,400962733.0,2024-08-30 17:06:39.002445+00,26.0,,410207044.0,2024-08-30 17:06:39.025711+00,26.0,Order AND ~Util - Terminate Civil Case,
9,66759071,1:23-cv-00078,"Belfor USA Group, Inc. v. Promesa Apartments, ...",txwd,W.D. Texas,,400873530.0,2024-08-29 21:05:43.581988+00,25.0,,410116604.0,2024-08-29 21:05:43.598356+00,25.0,Stipulation of Dismissal,
10,66759071,1:23-cv-00078,"Belfor USA Group, Inc. v. Promesa Apartments, ...",txwd,W.D. Texas,,400255673.0,2024-08-23 14:19:00.030502+00,24.0,,409484496.0,2024-08-23 14:19:00.043908+00,24.0,Order AND ~Util - Set Hearings,
11,66759071,1:23-cv-00078,"Belfor USA Group, Inc. v. Promesa Apartments, ...",txwd,W.D. Texas,,390150565.0,2024-05-15 14:03:22.792887+00,23.0,,399087857.0,2024-05-15 14:03:22.868789+00,23.0,Order AND ~Util - Terminate Hearings,
12,66759071,1:23-cv-00078,"Belfor USA Group, Inc. v. Promesa Apartments, ...",txwd,W.D. Texas,,382121924.0,2024-03-26 14:58:35.656236+00,22.0,,389933428.0,2024-03-26 14:58:35.726643+00,22.0,Notice,


In [5]:
for each in ["docket_id", "entry_id", "entry_number", "recap_document_id", "recap_document_number"]:
    df[each] = df[each].apply(lambda x: np.nan if (pd.isna(x) or np.isinf(x)) else int(x))

df.head()

Unnamed: 0,docket_id,docket_number,docket_case_name,court_id,court_short_name,nature_of_suit,entry_id,entry_time,entry_number,entry_description,recap_document_id,recap_time,recap_document_number,recap_description,recap_document
8,66759071,1:23-cv-00078,"Belfor USA Group, Inc. v. Promesa Apartments, ...",txwd,W.D. Texas,,400962733,2024-08-30 17:06:39.002445+00,26.0,,410207044,2024-08-30 17:06:39.025711+00,26.0,Order AND ~Util - Terminate Civil Case,
9,66759071,1:23-cv-00078,"Belfor USA Group, Inc. v. Promesa Apartments, ...",txwd,W.D. Texas,,400873530,2024-08-29 21:05:43.581988+00,25.0,,410116604,2024-08-29 21:05:43.598356+00,25.0,Stipulation of Dismissal,
10,66759071,1:23-cv-00078,"Belfor USA Group, Inc. v. Promesa Apartments, ...",txwd,W.D. Texas,,400255673,2024-08-23 14:19:00.030502+00,24.0,,409484496,2024-08-23 14:19:00.043908+00,24.0,Order AND ~Util - Set Hearings,
11,66759071,1:23-cv-00078,"Belfor USA Group, Inc. v. Promesa Apartments, ...",txwd,W.D. Texas,,390150565,2024-05-15 14:03:22.792887+00,23.0,,399087857,2024-05-15 14:03:22.868789+00,23.0,Order AND ~Util - Terminate Hearings,
12,66759071,1:23-cv-00078,"Belfor USA Group, Inc. v. Promesa Apartments, ...",txwd,W.D. Texas,,382121924,2024-03-26 14:58:35.656236+00,22.0,,389933428,2024-03-26 14:58:35.726643+00,22.0,Notice,


In [6]:
for each in ["docket_id", "entry_id", "recap_document_id"]:
    df[f'str_{each}'] = df[each].apply(lambda x: '9999999999' if pd.isna(x) else str(x))
    
df['unique_id'] = df['str_docket_id'] + '-' + df['str_entry_id'] + '-' + df['str_recap_document_id']
assert df['unique_id'].nunique() == len(df)

df = df.drop(columns=['str_docket_id', 'str_entry_id', 'str_recap_document_id'])
df.head()

Unnamed: 0,docket_id,docket_number,docket_case_name,court_id,court_short_name,nature_of_suit,entry_id,entry_time,entry_number,entry_description,recap_document_id,recap_time,recap_document_number,recap_description,recap_document,unique_id
8,66759071,1:23-cv-00078,"Belfor USA Group, Inc. v. Promesa Apartments, ...",txwd,W.D. Texas,,400962733,2024-08-30 17:06:39.002445+00,26.0,,410207044,2024-08-30 17:06:39.025711+00,26.0,Order AND ~Util - Terminate Civil Case,,66759071-400962733-410207044
9,66759071,1:23-cv-00078,"Belfor USA Group, Inc. v. Promesa Apartments, ...",txwd,W.D. Texas,,400873530,2024-08-29 21:05:43.581988+00,25.0,,410116604,2024-08-29 21:05:43.598356+00,25.0,Stipulation of Dismissal,,66759071-400873530-410116604
10,66759071,1:23-cv-00078,"Belfor USA Group, Inc. v. Promesa Apartments, ...",txwd,W.D. Texas,,400255673,2024-08-23 14:19:00.030502+00,24.0,,409484496,2024-08-23 14:19:00.043908+00,24.0,Order AND ~Util - Set Hearings,,66759071-400255673-409484496
11,66759071,1:23-cv-00078,"Belfor USA Group, Inc. v. Promesa Apartments, ...",txwd,W.D. Texas,,390150565,2024-05-15 14:03:22.792887+00,23.0,,399087857,2024-05-15 14:03:22.868789+00,23.0,Order AND ~Util - Terminate Hearings,,66759071-390150565-399087857
12,66759071,1:23-cv-00078,"Belfor USA Group, Inc. v. Promesa Apartments, ...",txwd,W.D. Texas,,382121924,2024-03-26 14:58:35.656236+00,22.0,,389933428,2024-03-26 14:58:35.726643+00,22.0,Notice,,66759071-382121924-389933428


In [7]:
entry = df[["unique_id", "entry_description"]].dropna(subset=['entry_description'])
entry = entry.rename(columns={"entry_description": "text"})
len(entry)

2228

In [8]:
recap = df[["unique_id", "recap_description"]].dropna(subset=['recap_description'])
recap = recap.rename(columns={"recap_description":"text"})
len(recap)

6583

In [9]:
attachment = df[["unique_id", "recap_document"]].dropna(subset=['recap_document'])
attachment = attachment.rename(columns={"recap_document":"text"})
len(attachment)

205

In [10]:
df.to_csv("data/clean_dockets.csv", index=False)
entry.to_csv("data/entry.csv", index=False)
recap.to_csv("data/recap.csv", index=False)
attachment.to_csv("data/attachment.csv", index=False)

OSError: Cannot save file into a non-existent directory: 'data'

In [11]:
df["docket_id"].value_counts().describe()

count    339.000000
mean      24.215339
std       28.521521
min        1.000000
25%        6.500000
50%       17.000000
75%       30.000000
max      181.000000
Name: count, dtype: float64

In [12]:
df["docket_id"].value_counts()

docket_id
4497394     181
7126218     178
6190659     177
30917334    167
6974057     143
           ... 
54366202      1
4662800       1
17271373      1
7325780       1
47033749      1
Name: count, Length: 339, dtype: int64

In [22]:
len(df)

8209

# Identify "Complaint" entries

In [25]:
filtered_df = df[
    df["entry_description"].str.contains(r"^Complaint", case=False, na=False) |
    df["recap_description"].str.contains(r"^Complaint", case=False, na=False)
]
len(filtered_df)

98

In [26]:
len(filtered_df[~filtered_df["recap_document"].isna()])

18

# Get plain English description of docket trajectory in chronological order

In [13]:
df["entry_time"] = pd.to_datetime(df["entry_time"], format='mixed')
df["recap_time"] = pd.to_datetime(df["recap_time"], format='mixed')

# Group by unique_id and sort within each group
grouped = df.sort_values(["docket_id", "entry_time", "recap_time"]).groupby("docket_id")
len(grouped)

339

In [14]:
summaries = {}

for docket_id, group in grouped:
    docket_number, case_name, court, nature = group.iloc[0][["docket_number", "docket_case_name", "court_short_name", "nature_of_suit"]]
    intro = f"For docket number {docket_number} of case {case_name} from {court} court, "
    if pd.notna(nature):  # Check if nature is not blank
        intro += f"the nature of suit is {nature}, and"
    intro += "the docket progress is as follows:\n"
    
    case_progress = [intro]

    last_entry_part = None  # Track previous entry_part

    for _, row in group.iterrows():
        entry_part = f"On {row['entry_time'].strftime('%Y-%m-%d')}, {row['entry_description']}." if pd.notna(row['entry_description']) else ""
        doc_part = f"The entry contains documents from {row['recap_time'].strftime('%Y-%m-%d')} with description: {row['recap_description']}." if pd.notna(row['recap_description']) else ""
        content_part = f"The document content is: {row['recap_document']}." if pd.notna(row['recap_document']) else ""

        # Skip repeating entry_part, but still add doc_part & content_part if present
        if entry_part != last_entry_part:
            progress_entry = " ".join(filter(None, [entry_part, doc_part, content_part]))
            last_entry_part = entry_part  # Update last entry_part
        else:
            progress_entry = " ".join(filter(None, [doc_part, content_part]))

        if progress_entry:
            progress_entry = progress_entry.replace(" .", "").replace("=", "").replace("  ", "")
            case_progress.append(f"{progress_entry} \n")

    summaries[docket_id] = " ".join(case_progress)


In [15]:
df[df["docket_id"] == 119293]

Unnamed: 0,docket_id,docket_number,docket_case_name,court_id,court_short_name,nature_of_suit,entry_id,entry_time,entry_number,entry_description,recap_document_id,recap_time,recap_document_number,recap_description,recap_document,unique_id
8431,119293,12-3705,United States v. Ann Piper,ca7,Seventh Circuit,,374496118,2024-01-02 06:06:59.871325+00:00,702069438.0,FOR COURT USE ONLY: Certified copy of 08/16/20...,382080165,2024-01-02 06:06:59.886407+00:00,702069438.0,,,119293-374496118-382080165
8432,119293,12-3705,United States v. Ann Piper,ca7,Seventh Circuit,,374496117,2024-01-02 06:06:59.846613+00:00,18.0,Mandate issued. No record to be returned. [18]...,382080164,2024-01-02 06:06:59.858867+00:00,18.0,,,119293-374496117-382080164
8433,119293,12-3705,United States v. Ann Piper,ca7,Seventh Circuit,,374496116,2024-01-02 06:06:59.817340+00:00,17.0,ORDER: Final judgment filed per nonprecedentia...,382080163,2024-01-02 06:06:59.828927+00:00,17.0,,,119293-374496116-382080163
8434,119293,12-3705,United States v. Ann Piper,ca7,Seventh Circuit,,374496115,2024-01-02 06:06:59.792950+00:00,16.0,Filed Nonprecedential Disposition PER CURIAM. ...,382080162,2024-01-02 06:06:59.805225+00:00,16.0,,,119293-374496115-382080162
8435,119293,12-3705,United States v. Ann Piper,ca7,Seventh Circuit,,374496114,2024-01-02 06:06:59.768124+00:00,15.0,Filed electronic transcript of proceedings hel...,382080161,2024-01-02 06:06:59.781449+00:00,15.0,,,119293-374496114-382080161
8436,119293,12-3705,United States v. Ann Piper,ca7,Seventh Circuit,,374496113,2024-01-02 06:06:59.740783+00:00,14.0,Original record on appeal filed electronically...,382080218,2024-01-02 06:07:28.526896+00:00,14.0,Transmittal Letter,,119293-374496113-382080218
8437,119293,12-3705,United States v. Ann Piper,ca7,Seventh Circuit,,374496113,2024-01-02 06:06:59.740783+00:00,14.0,Original record on appeal filed electronically...,382080160,2024-01-02 06:06:59.754169+00:00,14.0,Pleadings,,119293-374496113-382080160
8438,119293,12-3705,United States v. Ann Piper,ca7,Seventh Circuit,,374496112,2024-01-02 06:06:59.708050+00:00,13.0,Notice to the District Court to transmit the r...,382080159,2024-01-02 06:06:59.726715+00:00,13.0,,,119293-374496112-382080159
8439,119293,12-3705,United States v. Ann Piper,ca7,Seventh Circuit,,374496111,2024-01-02 06:06:59.657121+00:00,12.0,ORDER: Notice to Defendant re: Counsel's Motio...,382080158,2024-01-02 06:06:59.676615+00:00,12.0,,,119293-374496111-382080158
8440,119293,12-3705,United States v. Ann Piper,ca7,Seventh Circuit,,374496110,2024-01-02 06:06:59.450945+00:00,11.0,Anders brief filed by Mr. Robert A. Alvarado f...,382080157,2024-01-02 06:06:59.478661+00:00,11.0,,Case: 12-3705 Document: 11 ...,119293-374496110-382080157


In [16]:
summaries[119293]

'For docket number 12-3705 of case United States v. Ann Piper from Seventh Circuit court, the docket progress is as follows:\n On 2024-01-02, Criminal case docketed. IFP. Docketing statement filed. Appellant\'s brief due on or before 01/09/2013 for Ann Marie Piper. Transcript information sheet due by 12/14/2012. [1] [6446592] [12-3705] (CMD) [Entered: 11/30/2012 11:07 AM]. The entry contains documents from 2024-01-02 with description: Criminal Case Docketed. \n The entry contains documents from 2024-01-02 with description: Attorney / Party Notice of Docketing. \n The entry contains documents from 2024-01-02 with description: Notice to District Court. \n On 2024-01-02, Disclosure Statement filed by Attorney Robert A. Alvarado for Appellant Ann Marie Piper. [2] [6449947] [12-3705] (Alvarado, Robert) [Entered: 12/13/2012 03:29 PM]. \n On 2024-01-02, Motion filed by Appellant Ann Marie Piper to extend time to file appellant brief. [3] [6453303] [12-3705] (Alvarado, Robert) [Entered: 01/02/

In [17]:
len(summaries.keys())

339

In [19]:
# Get all the summary texts
summary_texts = list(summaries.values())

# Calculate lengths of each summary
summary_lengths = [len(text) for text in summary_texts]

# Compute min, max, and average
max_len = max(summary_lengths)
min_len = min(summary_lengths)
avg_len = sum(summary_lengths) / len(summary_lengths) if summary_lengths else 0

print(f"Max length: {max_len}")
print(f"Min length: {min_len}")
print(f"Average length: {avg_len:.2f}")

Max length: 371535
Min length: 205
Average length: 16907.05


In [20]:
df_summaries = pd.DataFrame(list(summaries.items()), columns=["docket_id", "summary"])
df_summaries

Unnamed: 0,docket_id,summary
0,119293,For docket number 12-3705 of case United State...
1,1088363,For docket number 17-60417 of case Cerrell Dar...
2,3050501,"For docket number 2015-1767, 2015-1768 of case..."
3,4239842,For docket number 16-10572 of case Connie Lee ...
4,4246017,For docket number 1:04-mj-00312 of case United...
...,...,...
334,69391743,For docket number 8:24-bk-06835 of case Eufald...
335,69418857,For docket number 24-14840 of case McPherson W...
336,69459960,For docket number 1:24-cv-08414 of case Xie v....
337,69625844,For docket number 1:25-sm-00026 of case Social...


In [21]:
df_summaries.to_csv("docket_progression.csv", index=False)