# Clean Document Dataset

The document dataset `042925_documents_query.csv` was provided by the clearinghouse, exported through their internal SQL query. The file is too large to work with efficiently, so this notebook documents the steps I took to make it more workable: the OCR text of each document is written to its own text file, and the remaining metadata is cleaned and saved separately.

# Import Libraries

In [1]:
from pathlib import Path

import pandas as pd

# Take a peek at the data

In [2]:
# Read only the first 10 rows: enough to inspect the columns without loading
# the whole export.
raw_csv_path = 'data/042925_documents_query.csv'
df = pd.read_csv(raw_csv_path, nrows=10)
df.head(5)

Unnamed: 0,doc_id,doc_title,doc_date,doc_ocr_text,case_id,doc_file,doc_type,cite_1_page,cite_1_reporter_id,cite_1_vol,cite_2_page,cite_2_reporter_id,cite_2_vol,cite_3_page,cite_3_reporter_id,cite_3_vol,doc_source,doc_url,docket_id,docket_recap_data
0,145883,USCA Notice of Docketing ROA,2022-03-18,CCaassee: 52:22-21-4c9v9-0 0 3 D97o-cJuFmLe...,44667,doc/145883.pdf,Coding Complete,,,,,,,,,,RECAP,https://www.courtlistener.com/docket/62639631/...,58431.0,"{""id"": 62639631, ""slug"": ""migliori-v-lehigh-co..."
1,141480,New Document,,Case 5:06-cv-00118-H Document 4-4 Filed 12...,43837,doc/141480.pdf,Deleted,,,,,,,,,,,,,
2,3434,Memorandum Opinion and Order,1999-11-17,NotReportedinF.Supp.2d FOREDUCATIONALUSEONLY P...,689,doc/3434.pdf,Deleted,,,,1068669.0,5996.0,1999.0,,,,,,,
3,142764,Exhibit 2 - Order to Joint Motion,2010-05-25,Case 1:08-cv-01464-JEC Document 60-6 Filed...,43829,doc/142764.pdf,Coding Complete,,,,,,,,,,PACER [Public Access to Court Electronic Records],https://www.courtlistener.com/docket/12437212/...,56761.0,"{""id"": 12437212, ""slug"": ""city-of-college-park..."
4,142770,Exhibit C,2010-08-02,Case 1:08-cv-01464-JEC Document 64-3 Filed...,43829,doc/142770.pdf,Coding Complete,,,,,,,,,,PACER [Public Access to Court Electronic Records],https://www.courtlistener.com/docket/12437212/...,56761.0,"{""id"": 12437212, ""slug"": ""city-of-college-park..."


# Save the OCR text to plain-text files, then drop the OCR text and the columns we don't need

In [3]:
# Columns to read from the raw export. 'doc_ocr_text' must stay last: the export
# loop uses cols[:-1] to select the metadata columns without the OCR text.
cols = [
    'case_id',
    'doc_id',
    'doc_title',
    'doc_date',
    'doc_type',
    'doc_source',
    'doc_url',
    'doc_ocr_text',
]

In [4]:
# Stream the raw export in chunks of 1000 rows so the full file never has to be
# loaded at once. For every chunk:
#   1. write each document's OCR text to docs/<case_id>_<doc_id>.txt
#   2. write the remaining metadata (plus the text length) to data/documents.csv

# open() below fails if the output folder is missing, so create it up front.
Path('docs').mkdir(parents=True, exist_ok=True)

with pd.read_csv(
    'data/042925_documents_query.csv',
    chunksize=1000,
    usecols=cols,
    # Force text dtype: a chunk whose OCR column is entirely empty would otherwise
    # be inferred as float and break the .str accessor used for doc_len.
    dtype={'doc_ocr_text': str},
) as reader:
    for i, chunk in enumerate(reader):
        # Save each doc_ocr_text to a separate file. Documents without OCR text are
        # skipped so that no file containing the literal string "nan" is written.
        has_text = chunk['doc_ocr_text'].notna()
        text_rows = chunk.loc[has_text, ['case_id', 'doc_id', 'doc_ocr_text']]
        for case_id, doc_id, ocr_text in text_rows.itertuples(index=False, name=None):
            filename = f"docs/{case_id}_{doc_id}.txt"
            with open(filename, 'w', encoding='utf-8') as f:
                f.write(ocr_text)

        # Save the metadata to documents.csv. doc_len stays missing for documents
        # without OCR text.
        metadata = chunk[cols[:-1]].assign(doc_len=chunk['doc_ocr_text'].str.len())
        # Overwrite on the first chunk so re-running this cell starts from a clean
        # file instead of appending a second copy of every row; append afterwards.
        metadata.to_csv(
            'data/documents.csv',
            mode='w' if i == 0 else 'a',
            index=False,
            header=(i == 0),
        )

# Load the document csv

In [11]:
# Load the metadata file (one row per document, OCR text replaced by doc_len).
docs = pd.read_csv(filepath_or_buffer="data/documents.csv")
docs.head(5)

Unnamed: 0,case_id,doc_id,doc_title,doc_date,doc_type,doc_source,doc_url,doc_len
0,44667,145883,USCA Notice of Docketing ROA,2022-03-18,Coding Complete,RECAP,https://www.courtlistener.com/docket/62639631/...,4422.0
1,43837,141480,New Document,,Deleted,,,134.0
2,689,3434,Memorandum Opinion and Order,1999-11-17,Deleted,,,29396.0
3,43829,142764,Exhibit 2 - Order to Joint Motion,2010-05-25,Coding Complete,PACER [Public Access to Court Electronic Records],https://www.courtlistener.com/docket/12437212/...,140.0
4,43829,142770,Exhibit C,2010-08-02,Coding Complete,PACER [Public Access to Court Electronic Records],https://www.courtlistener.com/docket/12437212/...,211.0


# Clean the csv

## Remove all documents that don't have any content as they will not be useful for our task

In [12]:
# Number of rows before any cleaning.
docs.shape[0]

134671

In [13]:
# Keep only the rows that have a doc_len value, then show the remaining row count.
has_length = docs["doc_len"].notna()
docs = docs[has_length]
docs.shape[0]

111632

## Drop any duplicates

In [14]:
# Remove rows that are identical across every column, keeping the first occurrence,
# then show the remaining row count.
docs = docs.drop_duplicates(keep="first")
docs.shape[0]

103482

## Add flags for identifying useful (and not useful) documents

In [15]:
# Flag documents by keyword in the title: case-insensitive, whole-word match,
# with missing titles counted as "no match".
titles = docs['doc_title']
docs = docs.assign(
    complaint_flag=titles.str.contains(r'\bcomplaint\b', case=False, na=False),
    opinion_flag=titles.str.contains(r'\bopinion\b', case=False, na=False),
    exhibit_flag=titles.str.contains(r'\bexhibit\b', case=False, na=False),
)

## Drop exhibits as they do not contain useful information (exhibits whose titles also mention a complaint or opinion are kept)

In [16]:
# A "plain" exhibit is flagged as an exhibit but as neither a complaint nor an opinion.
is_plain_exhibit = (
    (docs["exhibit_flag"] == 1)
    & (docs["complaint_flag"] == 0)
    & (docs["opinion_flag"] == 0)
)
docs = docs[~is_plain_exhibit]

# Show the exhibit-flagged rows that survived the filter.
docs[docs["exhibit_flag"] == 1]

Unnamed: 0,case_id,doc_id,doc_title,doc_date,doc_type,doc_source,doc_url,doc_len,complaint_flag,opinion_flag,exhibit_flag
4854,44476,143811,Amended Complaint and [Exhibit A] Affidavit o...,2020-10-23,Coding Complete,RECAP,https://www.courtlistener.com/docket/18569316/...,28296.0,True,False,True
15271,6509,20150,Exhibit A [Attachment to Intervenor Complaint],2003-05-07,Coding Complete,,,2262.0,True,False,True
28096,14801,77889,Class Action Complaint (With Exhibit),2015-06-16,Coding Complete,Bloomberg Law,,24137.0,True,False,True
31210,17436,105077,Exhibits to Complaint (except Exhibit 19),2020-03-20,Coding Complete,,,289817.0,True,False,True
31211,17436,105078,Flores Settlement (Exhibit to Complaint),2020-03-20,Coding Complete,,,82636.0,True,False,True
32424,46259,157392,Exhibit Redline Amended Complaint,2025-03-19,Deleted,RECAP,https://www.courtlistener.com/docket/69733020/...,91643.0,True,False,True
39469,3293,6699,Plaintiff Exhibit: Appellate Court Opinion,2006-06-01,Coding Complete,,,15630.0,False,True,True
41197,6509,20153,Exhibit B [Attachment to Intervenor Complaint],2003-05-07,Coding Complete,,,8156.0,True,False,True
47530,44824,155318,Exhibit 1 - Motion To Dismiss The Amended Comp...,2018-08-27,Coding Complete,RECAP,https://www.courtlistener.com/docket/7162477/1...,48048.0,True,False,True
56851,17400,105212,Exhibit 8 of the Complaint,2017-10-27,Coding Complete,PACER [Public Access to Court Electronic Records],,35872.0,True,False,True


## Save the cleaned metadata as JSON for future use

In [17]:
# Persist the cleaned, flagged metadata table as JSON.
docs.to_json(path_or_buf="data/case_documents.json")