# Clean the RECAP docket and document descriptions

This notebook outlines the steps undertook to clean the RECAP docket and document descriptions for creating an eval dataset

# Import Libaries

In [1]:
import numpy as np
import pandas as pd

# Load the data

In [2]:
df = pd.read_csv("data/dockets.csv")
df = df[~df["docket_entry_id"].isna()]
len(df)

8209

In [3]:
df.columns

Index(['docket_id', 'docket_date_created', 'docket_case_name_short',
       'docket_case_name', 'docket_case_name_full', 'docket_slug',
       'docket_number', 'blocked', 'docket_court_id', 'assigned_to_id',
       'assigned_to_str', 'cause', 'date_filed', 'date_last_filing',
       'date_terminated', 'jurisdiction_type', 'jury_demand', 'nature_of_suit',
       'docket_pacer_case_id', 'appeal_from_id', 'appeal_from_str',
       'appellate_case_type_information', 'originating_court_information_id',
       'docket_number_core', 'idb_data_id', 'parent_docket_id',
       'docket_entry_id', 'docket_entry_date_created', 'entry_number',
       'description', 'docket_entry_docket_id', 'pacer_sequence_number',
       'recap_sequence_number', 'recap_document_id', 'recap_date_created',
       'recap_document_type', 'recap_document_number',
       'recap_attachment_number', 'recap_pacer_doc_id', 'is_available',
       'recap_docket_entry_id', 'recap_description', 'ocr_status',
       'plain_text',

# Clean up the data

In [4]:
df = df[["docket_id", "docket_number", "court_id", "court_short_name", 
         "docket_entry_id", "entry_number", "description", 
        'recap_document_id', 'recap_document_number', "recap_description", 
         "plain_text"]]

df = df.rename(columns={"docket_entry_id": "entry_id",
                        "description": "entry_description",
                        "plain_text": "recap_document"})

df.head()

Unnamed: 0,docket_id,docket_number,court_id,court_short_name,entry_id,entry_number,entry_description,recap_document_id,recap_document_number,recap_description,recap_document
8,66759071,1:23-cv-00078,txwd,W.D. Texas,400962733.0,26.0,,410207044.0,26.0,Order AND ~Util - Terminate Civil Case,
9,66759071,1:23-cv-00078,txwd,W.D. Texas,400873530.0,25.0,,410116604.0,25.0,Stipulation of Dismissal,
10,66759071,1:23-cv-00078,txwd,W.D. Texas,400255673.0,24.0,,409484496.0,24.0,Order AND ~Util - Set Hearings,
11,66759071,1:23-cv-00078,txwd,W.D. Texas,390150565.0,23.0,,399087857.0,23.0,Order AND ~Util - Terminate Hearings,
12,66759071,1:23-cv-00078,txwd,W.D. Texas,382121924.0,22.0,,389933428.0,22.0,Notice,


In [5]:
for each in ["docket_id", "entry_id", "entry_number", "recap_document_id", "recap_document_number"]:
    df[each] = df[each].apply(lambda x: np.nan if (pd.isna(x) or np.isinf(x)) else int(x))

df.head()

Unnamed: 0,docket_id,docket_number,court_id,court_short_name,entry_id,entry_number,entry_description,recap_document_id,recap_document_number,recap_description,recap_document
8,66759071,1:23-cv-00078,txwd,W.D. Texas,400962733,26.0,,410207044,26.0,Order AND ~Util - Terminate Civil Case,
9,66759071,1:23-cv-00078,txwd,W.D. Texas,400873530,25.0,,410116604,25.0,Stipulation of Dismissal,
10,66759071,1:23-cv-00078,txwd,W.D. Texas,400255673,24.0,,409484496,24.0,Order AND ~Util - Set Hearings,
11,66759071,1:23-cv-00078,txwd,W.D. Texas,390150565,23.0,,399087857,23.0,Order AND ~Util - Terminate Hearings,
12,66759071,1:23-cv-00078,txwd,W.D. Texas,382121924,22.0,,389933428,22.0,Notice,


In [6]:
for each in ["docket_id", "entry_id", "recap_document_id"]:
    df[f'str_{each}'] = df[each].apply(lambda x: '9999999999' if pd.isna(x) else str(x))
    
df['unique_id'] = df['str_docket_id'] + '-' + df['str_entry_id'] + '-' + df['str_recap_document_id']
assert df['unique_id'].nunique() == len(df)

df = df.drop(columns=['str_docket_id', 'str_entry_id', 'str_recap_document_id'])
df.head()

Unnamed: 0,docket_id,docket_number,court_id,court_short_name,entry_id,entry_number,entry_description,recap_document_id,recap_document_number,recap_description,recap_document,unique_id
8,66759071,1:23-cv-00078,txwd,W.D. Texas,400962733,26.0,,410207044,26.0,Order AND ~Util - Terminate Civil Case,,66759071-400962733-410207044
9,66759071,1:23-cv-00078,txwd,W.D. Texas,400873530,25.0,,410116604,25.0,Stipulation of Dismissal,,66759071-400873530-410116604
10,66759071,1:23-cv-00078,txwd,W.D. Texas,400255673,24.0,,409484496,24.0,Order AND ~Util - Set Hearings,,66759071-400255673-409484496
11,66759071,1:23-cv-00078,txwd,W.D. Texas,390150565,23.0,,399087857,23.0,Order AND ~Util - Terminate Hearings,,66759071-390150565-399087857
12,66759071,1:23-cv-00078,txwd,W.D. Texas,382121924,22.0,,389933428,22.0,Notice,,66759071-382121924-389933428


# Keep only the docket entries, the recap description, and the attachment description

In [7]:
entry = df[["unique_id", "entry_description"]].dropna(subset=['entry_description'])
entry = entry.rename(columns={"entry_description": "text"})
len(entry)

2228

In [8]:
recap = df[["unique_id", "recap_description"]].dropna(subset=['recap_description'])
recap = recap.rename(columns={"recap_description":"text"})
len(recap)

6583

In [9]:
attachment = df[["unique_id", "recap_document"]].dropna(subset=['recap_document'])
attachment = attachment.rename(columns={"recap_document":"text"})
len(attachment)

205

# Save for future use

In [10]:
df.to_csv("data/clean_dockets.csv", index=False)
entry.to_csv("data/entry.csv", index=False)
recap.to_csv("data/recap.csv", index=False)
attachment.to_csv("data/attachment.csv", index=False)