# Case Issues data cleaning

The labels we are most interested in are case issues and causes of action. This notebook outlines the steps taken to generate and clean the case-issues data, which is then used for model training and evaluation.

# Import libraries

In [1]:
import numpy as np
import pandas as pd

import re
from datetime import datetime

from sklearn.preprocessing import MultiLabelBinarizer

from clearinghouse_util import fetch_all_case_data

# Load the data

In [2]:
# Load the case-level records from a JSON file given by a path relative to the working directory.
df = pd.read_json("0.clearinghouse_all_case_data.json")
# Preview the first few rows to check that the columns parsed as expected.
df.head()

Unnamed: 0,case_id,case_name,case_status,case_state,court_name,case_type,case_ongoing,case_special_collections,case_causes_of_action,case_issues,response,response_head
0,8429,"EEOC v. EADS AEROFRAME SERVICES, LLC",Coding Complete,Louisiana,Western District of Louisiana,Equal Employment,No,"EEOC Study — in sample, Multi-LexSum (in sample)","Title VII (including PDA), 42 U.S.C. § 2000e","Race discrimination, Black, Hispanic, White, R...","{'attorney_orgs': ['EEOC'], 'available_documen...",attorney_orgs
1,9719,Regan v. Salt Lake County,Coding Complete,Utah,District of Utah,Jail Conditions,No,Strip Search Cases,42 U.S.C. § 1983,"Search policies, Strip search policy (faciliti...","{'attorney_orgs': [], 'available_documents': [...",attorney_orgs
2,18526,Kerrigan v. Philadelphia Board of Elections,Coding Complete,Pennsylvania,Eastern District of Pennsylvania,Disability Rights,No,,"Americans with Disabilities Act (ADA), 42 U.S....","Disability (inc. reasonable accommodations), M...",{'attorney_orgs': ['NDRN/Protection & Advocacy...,attorney_orgs
3,6263,EEOC v. TRANSIT MIX CONCRETE,Coding Complete,Texas,Northern District of Texas,Equal Employment,No,"EEOC Study — in sample, IWPR/Wage Project Cons...","Title VII (including PDA), 42 U.S.C. § 2000e","Race discrimination, Black, Disparate Treatmen...","{'attorney_orgs': ['EEOC'], 'available_documen...",attorney_orgs
4,10082,Shorter v. DC,Coding Complete,District of Columbia,District of District of Columbia,Policing,No,,42 U.S.C. § 1983,"Hearing impairment, Disability (inc. reasonabl...",{'attorney_orgs': ['Washington Lawyers' Commit...,attorney_orgs


In [3]:
# Sanity check: each row must correspond to a distinct case_id.
# The message reports both counts so a failure is immediately diagnosable.
assert len(df) == df["case_id"].nunique(), (
    f"case_id is not unique: {len(df)} rows but "
    f"{df['case_id'].nunique()} distinct case_id values"
)

# Initial EDA on the overall dataset

In [4]:
# Total number of rows in the loaded dataset.
len(df)

10796

In [5]:
# List the available columns.
df.columns

Index(['case_id', 'case_name', 'case_status', 'case_state', 'court_name',
       'case_type', 'case_ongoing', 'case_special_collections',
       'case_causes_of_action', 'case_issues', 'response', 'response_head'],
      dtype='object')

In [6]:
# Columns whose number of distinct values and value distribution are inspected.
categorical_columns = [
    'case_status', 'case_state', 'court_name',
    'case_type', 'case_ongoing', 'case_special_collections',
    'case_causes_of_action', 'case_issues',
]

for column in categorical_columns:
    column_values = df[column]
    print("---")
    # Number of distinct values, followed by the frequency of each value.
    print(column_values.nunique())
    print(column_values.value_counts())

---
2
case_status
Coding Complete    5590
Approved           5206
Name: count, dtype: int64
---
57
case_state
California                      1114
New York                         848
Texas                            689
District of Columbia             645
Illinois                         556
Pennsylvania                     410
Michigan                         403
Florida                          397
Alabama                          297
Georgia                          287
Louisiana                        279
Washington                       260
Tennessee                        251
Arizona                          249
Maryland                         249
Ohio                             237
North Carolina                   226
Missouri                         222
Massachusetts                    215
Virginia                         202
Indiana                          201
Colorado                         199
New Jersey                       191
Mississippi                      171
Mi

# Split case_issues into individual labels

In [7]:
def issue_split(text):
    """Split a raw ``case_issues`` string into a list of individual issue labels.

    Labels are separated by commas. A comma that sits inside parentheses
    (e.g. ``"A (x, y), B"``) is kept as part of its label.

    Args:
        text: The raw comma-separated issue string. ``None`` and float NaN are
            treated as "no issues recorded".

    Returns:
        list[str]: Stripped, non-empty labels in their original order; an
        empty list when the value is missing or blank.
    """
    # Without this guard, str() would turn a missing value into the bogus
    # label "None" / "nan", which would then become its own class downstream.
    if text is None or (isinstance(text, float) and text != text):
        return []

    # Split on a comma unless a ")" follows with no "(" or ")" in between,
    # i.e. unless the comma is inside a parenthesised group.
    # NOTE(review): labels that contain a comma outside parentheses are still
    # split; the binarized columns include fragments such as "general",
    # "unspecified" and "administration of" -- confirm against the taxonomy.
    parts = re.split(r',\s*(?![^()]*\))', str(text))

    # Drop empty fragments produced by blank input or stray/trailing commas.
    return [part.strip() for part in parts if part.strip()]

In [8]:
# Parse every raw issue string into a list of labels.
df['issues'] = df['case_issues'].map(issue_split)
# Record how many labels each case carries.
df['issues_len'] = df['issues'].map(len)
# Distribution of the number of labels per case.
df['issues_len'].value_counts()

issues_len
2     1406
3     1183
5     1166
4     1125
1     1106
6     1024
7      855
8      660
9      455
10     347
11     260
13     186
12     184
14     121
15     115
16      90
17      80
18      63
19      53
22      38
21      33
23      32
20      31
24      27
25      23
28      20
29      16
30      15
26      15
31       9
27       8
32       7
36       7
34       6
33       5
38       5
35       4
44       3
37       3
42       2
39       2
40       2
47       1
46       1
48       1
43       1
Name: count, dtype: int64

In [9]:
# Write the current frame to JSON as an intermediate checkpoint.
file_path = '1.clean_issues.json'
df.to_json(file_path)

In [10]:
# Encode the per-case label lists as a 0/1 indicator matrix.
mlb = MultiLabelBinarizer()
binary_matrix = mlb.fit_transform(list(df['issues']))
# (number of cases, number of distinct labels)
binary_matrix.shape

(10796, 405)

In [11]:
# Wrap the indicator matrix in a DataFrame with one column per label.
# Reusing df's index keeps each indicator row aligned with the case it was
# computed from, so later joins or boolean masks against df line up even
# when df's index is not a plain 0..n-1 range.
label_df = pd.DataFrame(binary_matrix, columns=mlb.classes_, index=df.index)
label_df.head()

Unnamed: 0,42 U.S.C.A. §§ 11301 et seq.,ATM Machines,Abortion,Access (physical),Access to information systems,Access to lawyers or judicial system,Access to public accommodations - governmental,Access to public accommodations - privately owned,Accommodation / Leave,Administrative segregation,...,White,Work authorization - criteria,Work authorization - procedures,Work release or work assignments,Wound care,Youth / Adult separation,Zoning,administration of,general,unspecified
0,0,0,0,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [12]:
# Number of cases carrying each label, most frequent first.
label_df.sum().sort_values(ascending=False)

Disparate Treatment                                          3372
Direct Suit on Merits                                        2469
Female                                                       1960
Sex discrimination                                           1656
Race discrimination                                          1563
                                                             ... 
Sikhism                                                         1
Currency                                                        1
Buddhism                                                        1
Underground Storage Tank (UST) leakage                          1
Local / state enforcement of immigration laws (duplicate)       1
Length: 405, dtype: int64

# Keep only the most frequently occurring issues (at or above the 90th percentile of label counts)

In [13]:
# Number of cases per label.
label_counts = label_df.sum()

# Cut-off is the 90th percentile of the per-label counts; the comparison is
# inclusive, so labels exactly at the threshold are kept.
threshold = label_counts.quantile(0.90)
labels = label_counts.loc[label_counts.ge(threshold)]
labels.sort_values(ascending=False)

Disparate Treatment                                                                           3372
Direct Suit on Merits                                                                         2469
Female                                                                                        1960
Sex discrimination                                                                            1656
Race discrimination                                                                           1563
Discharge / Constructive Discharge / Layoff                                                   1256
Harassment / Hostile Work Environment                                                         1141
Disability (inc. reasonable accommodations)                                                   1090
Black                                                                                         1084
general                                                                                       1034
Retaliatio

In [14]:
# Set of retained label names, for O(1) membership tests.
label_set = set(labels.index)
# For each case keep only the retained labels, preserving their original order.
df['filtered_issues'] = df['issues'].map(
    lambda case_issues: [issue for issue in case_issues if issue in label_set]
)

In [15]:
# Keep only the cases that still have at least one label after filtering,
# and renumber the rows from zero.
df_filtered = df.loc[df['filtered_issues'].map(len).gt(0)].reset_index(drop=True)
len(df_filtered)

9125

In [16]:
# Write the filtered frame (cases with at least one retained label) to JSON.
file_path = '2.filtered_issues.json'
df_filtered.to_json(file_path)