# Produce a dataset that joins case issues with case documents, then keep only the documents that are complaints or opinions

# Import Libraries

In [1]:
import pandas as pd

# Load the datasets

In [2]:
# Read the two JSON inputs: the case-level table (including issue labels)
# and the per-document table.
cases_path = "data/clean.json"
documents_path = "data/case_documents.json"

df = pd.read_json(cases_path)
docs = pd.read_json(documents_path)

In [3]:
# Preview the first five case rows.
df.head(n=5)

Unnamed: 0,case_id,case_name,case_status,case_state,court_name,case_type,case_ongoing,case_special_collections,case_causes_of_action,issue_category,issues
0,8429,"EEOC v. EADS AEROFRAME SERVICES, LLC",Coding Complete,Louisiana,Western District of Louisiana,Equal Employment,No,"EEOC Study — in sample, Multi-LexSum (in sample)","Title VII (including PDA), 42 U.S.C. § 2000e","[Affected National Origin/Ethnicity(s), Affect...","[Black, Conditions of Employment (including as..."
1,9719,Regan v. Salt Lake County,Coding Complete,Utah,District of Utah,Jail Conditions,No,Strip Search Cases,42 U.S.C. § 1983,"[Affected Sex/Gender(s), Discrimination Basis,...","[Female, Search policies, Sex discrimination, ..."
2,18526,Kerrigan v. Philadelphia Board of Elections,Coding Complete,Pennsylvania,Eastern District of Pennsylvania,Disability Rights,No,,"Americans with Disabilities Act (ADA), 42 U.S....","[Disability and Disability Rights, Discriminat...","[Disability (inc. reasonable accommodations), ..."
3,6263,EEOC v. TRANSIT MIX CONCRETE,Coding Complete,Texas,Northern District of Texas,Equal Employment,No,"EEOC Study — in sample, IWPR/Wage Project Cons...","Title VII (including PDA), 42 U.S.C. § 2000e","[Affected Race(s), Discrimination Area, Discri...","[Black, Direct Suit on Merits, Disparate Treat..."
4,10082,Shorter v. DC,Coding Complete,District of Columbia,District of District of Columbia,Policing,No,,42 U.S.C. § 1983,"[Disability and Disability Rights, Discriminat...","[Disability (inc. reasonable accommodations), ..."


In [4]:
# Preview the first five document rows.
docs.head(n=5)

Unnamed: 0,case_id,doc_id,doc_title,doc_date,doc_type,doc_source,doc_url,doc_len,complaint_flag,opinion_flag,exhibit_flag
0,44667,145883,USCA Notice of Docketing ROA,2022-03-18,Coding Complete,RECAP,https://www.courtlistener.com/docket/62639631/...,4422,False,False,False
1,43837,141480,New Document,,Deleted,,,134,False,False,False
2,689,3434,Memorandum Opinion and Order,1999-11-17,Deleted,,,29396,False,True,False
6,43829,142757,Notice of Appeal,2009-04-30,Coding Complete,PACER [Public Access to Court Electronic Records],https://www.courtlistener.com/docket/12437212/...,2383,False,False,False
7,43829,142758,Joint Motion for Relief from Order and Opinion...,2010-05-25,Coding Complete,PACER [Public Access to Court Electronic Records],https://www.courtlistener.com/docket/12437212/...,550,False,True,False


# Keep only cases that have both documents and assigned issues

In [5]:
# Case ids that appear in both the case table and the document table.
case_ids_in_df = set(df["case_id"].unique())
case_ids_in_docs = set(docs["case_id"].unique())
cases = list(case_ids_in_df.intersection(case_ids_in_docs))
len(cases)

10464

In [6]:
# Number of case rows before filtering.
df.shape[0]

10549

In [7]:
# Keep only the case rows whose case_id is in the shared `cases` list.
df_in_shared_cases = df["case_id"].isin(cases)
df = df.loc[df_in_shared_cases]
df.shape[0]

10464

In [8]:
# Number of document rows before filtering.
docs.shape[0]

102811

In [9]:
# Keep only the document rows whose case_id is in the shared `cases` list.
docs_in_shared_cases = docs["case_id"].isin(cases)
docs = docs.loc[docs_in_shared_cases]
docs.shape[0]

84380

## Create a joined issue & document dataset for all future use

In [10]:
# Attach the case-level columns of `df` to every document row of `docs`.
# The left join keeps exactly the rows of `docs`. validate="many_to_one"
# makes pandas raise a MergeError if `case_id` is duplicated in `df`, which
# would otherwise silently multiply document rows in the joined table.
result = docs.merge(df, how="left", on="case_id", validate="many_to_one")
len(result)

84380

In [11]:
# The left join must neither drop nor duplicate document rows.
assert result.shape[0] == docs.shape[0]

In [12]:
# Preview the first five rows of the joined table.
result.head(n=5)

Unnamed: 0,case_id,doc_id,doc_title,doc_date,doc_type,doc_source,doc_url,doc_len,complaint_flag,opinion_flag,...,case_name,case_status,case_state,court_name,case_type,case_ongoing,case_special_collections,case_causes_of_action,issue_category,issues
0,43837,141480,New Document,,Deleted,,,134,False,False,...,United States v. State of North Carolina,Approved,North Carolina,Eastern District of North Carolina,Election/Voting Rights,No,Law Firm Antiracism Alliance (LFAA) project,Uniformed and Overseas Citizens Absentia Votin...,[Voting],[Voting: General & Misc.]
1,689,3434,Memorandum Opinion and Order,1999-11-17,Deleted,,,29396,False,True,...,Williams v. Illinois Department of Corrections,Coding Complete,Illinois,Northern District of Illinois,Prison Conditions,No,,"42 U.S.C. § 1983, Americans with Disabilities ...","[Disability and Disability Rights, Discriminat...","[Disability (inc. reasonable accommodations), ..."
2,43829,142757,Notice of Appeal,2009-04-30,Coding Complete,PACER [Public Access to Court Electronic Records],https://www.courtlistener.com/docket/12437212/...,2383,False,False,...,City of College Park v. City of Atlanta,Approved,Georgia,Northern District of Georgia,Election/Voting Rights,Yes,Law Firm Antiracism Alliance (LFAA) project,"Voting Rights Act, section 5, 52 U.S.C. § 1030...","[Affected Race(s), Discrimination Basis, Voting]","[Black, Race discrimination, Voting: General &..."
3,43829,142758,Joint Motion for Relief from Order and Opinion...,2010-05-25,Coding Complete,PACER [Public Access to Court Electronic Records],https://www.courtlistener.com/docket/12437212/...,550,False,True,...,City of College Park v. City of Atlanta,Approved,Georgia,Northern District of Georgia,Election/Voting Rights,Yes,Law Firm Antiracism Alliance (LFAA) project,"Voting Rights Act, section 5, 52 U.S.C. § 1030...","[Affected Race(s), Discrimination Basis, Voting]","[Black, Race discrimination, Voting: General &..."
4,43829,142765,Order,2010-06-09,Coding Complete,PACER [Public Access to Court Electronic Records],https://www.courtlistener.com/docket/12437212/...,67,False,False,...,City of College Park v. City of Atlanta,Approved,Georgia,Northern District of Georgia,Election/Voting Rights,Yes,Law Firm Antiracism Alliance (LFAA) project,"Voting Rights Act, section 5, 52 U.S.C. § 1030...","[Affected Race(s), Discrimination Basis, Voting]","[Black, Race discrimination, Voting: General &..."


## Save the result df for future use

In [13]:
# Write the joined document + case table to disk as JSON.
joined_output_path = "data/case_issues_documents.json"
result.to_json(joined_output_path)

## Do some EDA

In [14]:
# Number of distinct cases in the joined table.
joined_case_ids = result["case_id"]
joined_case_ids.nunique()

10464

Of ~10K cases, ~6.8K have at least one complaint document

In [15]:
# Distinct cases that have at least one document flagged as a complaint.
has_complaint = result["complaint_flag"].eq(1)
result.loc[has_complaint, "case_id"].nunique()

6838

Of ~10K cases, ~8.3K have at least one complaint document or at least one opinion document

In [16]:
# Distinct cases that have at least one complaint or at least one opinion.
complaint_rows = result["complaint_flag"].eq(1)
opinion_rows = result["opinion_flag"].eq(1)
result.loc[complaint_rows | opinion_rows, "case_id"].nunique()

8265

## Since complaints & opinions carry the most useful information, keep only complaint and opinion documents

In [17]:
# Keep only the rows flagged as a complaint or as an opinion.
# NOTE(review): rows whose doc_type is "Deleted" are retained by this filter
# (see the preview below) - confirm that this is intended.
is_complaint = result["complaint_flag"].eq(1)
is_opinion = result["opinion_flag"].eq(1)
meaningful = result.loc[is_complaint | is_opinion]
meaningful.shape[0]

22966

In [18]:
# Preview the first five complaint/opinion rows.
meaningful.head(n=5)

Unnamed: 0,case_id,doc_id,doc_title,doc_date,doc_type,doc_source,doc_url,doc_len,complaint_flag,opinion_flag,...,case_name,case_status,case_state,court_name,case_type,case_ongoing,case_special_collections,case_causes_of_action,issue_category,issues
1,689,3434,Memorandum Opinion and Order,1999-11-17,Deleted,,,29396,False,True,...,Williams v. Illinois Department of Corrections,Coding Complete,Illinois,Northern District of Illinois,Prison Conditions,No,,"42 U.S.C. § 1983, Americans with Disabilities ...","[Disability and Disability Rights, Discriminat...","[Disability (inc. reasonable accommodations), ..."
3,43829,142758,Joint Motion for Relief from Order and Opinion...,2010-05-25,Coding Complete,PACER [Public Access to Court Electronic Records],https://www.courtlistener.com/docket/12437212/...,550,False,True,...,City of College Park v. City of Atlanta,Approved,Georgia,Northern District of Georgia,Election/Voting Rights,Yes,Law Firm Antiracism Alliance (LFAA) project,"Voting Rights Act, section 5, 52 U.S.C. § 1030...","[Affected Race(s), Discrimination Basis, Voting]","[Black, Race discrimination, Voting: General &..."
9,17891,110352,Opinion,2020-10-23,Coding Complete,,,19824,False,True,...,Common Cause Indiana v. Lawson,Coding Complete,Indiana,Southern District of Indiana,Election/Voting Rights,No,"COVID-19 (novel coronavirus), Healthy Election...",42 U.S.C. § 1983,[Voting],"[Election administration, Voting: General & Mi..."
28,199,885,Opinion and Order (Granting in part and denyin...,2000-06-06,Deleted,Westlaw,,99451,False,True,...,Benjamin v. Horn,Approved,New York,Southern District of New York,Jail Conditions,Yes,,42 U.S.C. § 1983,"[Affected Sex/Gender(s), General/Misc., Jails,...","[Access to lawyers or judicial system, Bathing..."
37,12206,42378,Complaint,2010-03-08,Coding Complete,State Court Website,,41688,True,False,...,Jones v. Hobbs,Approved,Arkansas,Arkansas state trial court,Criminal Justice (Other),Yes,,State law,[Death Penalty],"[Lethal Injection - Chemicals Used, Lethal Inj..."


Most cases (75%) have at most 3 documents, since a case can have more than one complaint or opinion. A few cases have many documents (up to 86); these are cases with numerous dockets that span several years.

In [19]:
# Summary statistics of the number of complaint/opinion documents per case.
docs_per_case = meaningful["case_id"].value_counts()
docs_per_case.describe()

count    8265.000000
mean        2.778705
std         3.468365
min         1.000000
25%         1.000000
50%         2.000000
75%         3.000000
max        86.000000
Name: count, dtype: float64

## Remove any documents that have fewer than \~250 tokens (\~1000 chars), as they are unlikely to contain meaningful information for the prediction

In [20]:
# Threshold from the heading above: ~250 tokens at roughly 4 characters each.
# The comparison is strict, so a document with doc_len exactly equal to the
# threshold is dropped as well.
min_doc_len = 250 * 4
meaningful = meaningful.loc[meaningful["doc_len"].gt(min_doc_len)]
meaningful.shape[0]

22391

In [21]:
# Distinct cases that still have at least one document after the length filter.
remaining_case_ids = meaningful["case_id"]
remaining_case_ids.nunique()

8170

## Save complaint & opinion df for future use

In [22]:
# Write the filtered complaint/opinion table to disk as JSON.
complaint_opinion_output_path = "data/case_with_complaint_opinion.json"
meaningful.to_json(complaint_opinion_output_path)