# Split the joined data to train/val/test sets for experiments

# Import Libaries

In [1]:
import random
random.seed(1234)

import pandas as pd

from sklearn.preprocessing import MultiLabelBinarizer

# Load the data

In [3]:
df = pd.read_json("data/clean_joined.json")
df.head()

Unnamed: 0,case_id,doc_id,doc_title,doc_date,doc_type,doc_source,doc_url,doc_len,complaint_flag,opinion_flag,...,case_name,case_status,case_state,court_name,case_type,case_ongoing,case_special_collections,case_causes_of_action,issue_category,issues
0,689,3434,Memorandum Opinion and Order,1999-11-17,Deleted,,,29396,False,True,...,Williams v. Illinois Department of Corrections,Coding Complete,Illinois,Northern District of Illinois,Prison Conditions,No,,"42 U.S.C. § 1983, Americans with Disabilities ...","[Disability and Disability Rights, Discriminat...","[Disability (inc. reasonable accommodations), ..."
1,17891,110352,Opinion,2020-10-23,Coding Complete,,,19824,False,True,...,Common Cause Indiana v. Lawson,Coding Complete,Indiana,Southern District of Indiana,Election/Voting Rights,No,"COVID-19 (novel coronavirus), Healthy Election...",42 U.S.C. § 1983,[Voting],"[Election administration, Voting: General & Mi..."
2,199,885,Opinion and Order (Granting in part and denyin...,2000-06-06,Deleted,Westlaw,,99451,False,True,...,Benjamin v. Horn,Approved,New York,Southern District of New York,Jail Conditions,Yes,,42 U.S.C. § 1983,"[Affected Sex/Gender(s), General/Misc., Jails,...","[Access to lawyers or judicial system, Bathing..."
3,45341,154633,Opinion,2001-04-13,Coding Complete,RECAP,https://www.courtlistener.com/docket/6164539/1...,1042,False,True,...,Gregorio T v. Pete Wilson,Approved,California,Central District of California,Immigration and/or the Border,"Perhaps, but long-dormant",,"42 U.S.C. § 1983, Ex parte Young (federal or s...","[General/Misc., Immigration/Border]","[Education, Government services, Undocumented ..."
4,43823,150624,Opinion,2018-03-12,Coding Complete,RECAP,https://plus.lexis.com/document?pdmfid=1530671...,3320,False,True,...,Common Cause v. Kemp,Approved,Georgia,Northern District of Georgia,Election/Voting Rights,No,Law Firm Antiracism Alliance (LFAA) project,"National Voter Registration Act (""Motor Voter ...",[Voting],"[Voter registration rules, Voting: General & M..."


## Given how imbalanced the categories are (some categories appear in many cases while other categories only appear in a handful of cases), the best way to produce a balanced dataset for training is to generate binary-classification dataset & models

In [3]:
cases = df[["case_id", "issue_category"]]
cases["issue_category"] = cases["issue_category"].apply(tuple)
cases = cases.drop_duplicates()
assert len(cases) == cases["case_id"].nunique()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  cases["issue_category"] = cases["issue_category"].apply(tuple)


In [4]:
mlb = MultiLabelBinarizer()
binary_matrix = mlb.fit_transform(cases['issue_category'].to_list())
binary_matrix.shape

(8078, 17)

In [5]:
category_df = pd.DataFrame(binary_matrix, columns=mlb.classes_)
category_df.sum().sort_values(ascending=False)

General/Misc.                                                5285
Discrimination Basis                                         4192
Discrimination Area                                          2822
Affected Sex/Gender(s)                                       1935
Jails, Prisons, Detention Centers, and Other Institutions    1916
EEOC-centric                                                 1419
Affected Race(s)                                             1131
Disability and Disability Rights                             1090
Medical/Mental Health Care                                   1030
Immigration/Border                                            876
Voting                                                        713
Affected National Origin/Ethnicity(s)                         520
Policing                                                      471
Reproductive rights                                           419
LGBTQ+                                                        371
Benefits (

In [6]:
category_df['case_id'] = cases['case_id'].values

## To ensure no case level data leakage, we will start by sampling from the least common categories, while keeping track of the case ids in train set

In [7]:
cats = category_df.sum().sort_values(ascending=False).index.drop("case_id")

In [8]:
for cat in cats[::-1]:
    print(cat)

COVID-19
Benefits (Source)
LGBTQ+
Reproductive rights
Policing
Affected National Origin/Ethnicity(s)
Voting
Immigration/Border
Medical/Mental Health Care
Disability and Disability Rights
Affected Race(s)
EEOC-centric
Jails, Prisons, Detention Centers, and Other Institutions
Affected Sex/Gender(s)
Discrimination Area
Discrimination Basis
General/Misc.


In [9]:
train = {}
num_samples = 120

for cat in cats[::-1]:

    # Flatten all case_ids already in train
    train_ids = set(train_id for ids in train.values() for train_id in ids)
    
    # Get list of all case_ids for this category
    cat_ids = category_df[category_df[cat] == 1]['case_id'].to_list()

    # Filter cat_ids already in train_ids
    existing_ids = [cat_id for cat_id in cat_ids if cat_id in train_ids]

    # If we already have enough ids in train, do not add more, simply sample from the existing ids
    if len(existing_ids) >= num_samples:
        sampled_ids = random.sample(existing_ids, num_samples)
    # Otherwise, get more ids to be added to train_id
    else:
        remaining_ids = [cat_id for cat_id in cat_ids if cat_id not in train_ids]
        addl_ids = random.sample(remaining_ids, (num_samples - len(existing_ids)))
        sampled_ids = list(set(existing_ids + addl_ids))

    # Also get equal number of negative samples
    neg_ids = category_df[category_df[cat] == 0]['case_id'].to_list()
    neg_samples = random.sample(neg_ids, num_samples)
    
    train[cat] = list(set(sampled_ids + neg_samples))

In [10]:
for key, val in train.items():
    print(key)
    print(len(val))

COVID-19
240
Benefits (Source)
240
LGBTQ+
240
Reproductive rights
240
Policing
240
Affected National Origin/Ethnicity(s)
240
Voting
240
Immigration/Border
240
Medical/Mental Health Care
240
Disability and Disability Rights
240
Affected Race(s)
240
EEOC-centric
240
Jails, Prisons, Detention Centers, and Other Institutions
240
Affected Sex/Gender(s)
240
Discrimination Area
240
Discrimination Basis
240
General/Misc.
240


In [11]:
train_ids = list(set(train_id for ids in train.values() for train_id in ids))
len(train_ids)

2299

## We have sampled ~2K cases for training, now we will randomly sample ~800 cases from the rest of the cases to be put into val/test, while adhereing to the underlying distribution of categories & ensuring the cases in val are not in test

In [12]:
remaining_ids = category_df[~category_df['case_id'].isin(train_ids)]["case_id"].to_list()
len(remaining_ids)

5779

In [13]:
val_ids = random.sample(remaining_ids, 800)
len(val_ids)

800

In [14]:
val_df = category_df[category_df['case_id'].isin(val_ids)]
val_df.sum().sort_values(ascending=False)

case_id                                                      13660250
General/Misc.                                                     546
Discrimination Basis                                              422
Discrimination Area                                               302
Affected Sex/Gender(s)                                            222
Jails, Prisons, Detention Centers, and Other Institutions         187
EEOC-centric                                                      148
Affected Race(s)                                                  106
Disability and Disability Rights                                  104
Medical/Mental Health Care                                        101
Immigration/Border                                                 85
Voting                                                             53
Reproductive rights                                                41
Affected National Origin/Ethnicity(s)                              40
Policing            

In [15]:
remaining_ids = category_df[~((category_df['case_id'].isin(train_ids)) | (category_df['case_id'].isin(val_ids)))]["case_id"].to_list()
len(remaining_ids)

4979

In [16]:
test_ids = random.sample(remaining_ids, 800)
len(test_ids)

800

In [17]:
test_df = category_df[category_df['case_id'].isin(test_ids)]
test_df.sum().sort_values(ascending=False)

case_id                                                      13622053
General/Misc.                                                     548
Discrimination Basis                                              440
Discrimination Area                                               308
Affected Sex/Gender(s)                                            196
Jails, Prisons, Detention Centers, and Other Institutions         190
EEOC-centric                                                      154
Affected Race(s)                                                  138
Medical/Mental Health Care                                        103
Disability and Disability Rights                                  101
Immigration/Border                                                 90
Voting                                                             59
Affected National Origin/Ethnicity(s)                              53
Policing                                                           45
Reproductive rights 

## Produce dataframes for train, val & test

In [19]:
train.keys()

dict_keys(['COVID-19', 'Benefits (Source)', 'LGBTQ+', 'Reproductive rights', 'Policing', 'Affected National Origin/Ethnicity(s)', 'Voting', 'Immigration/Border', 'Medical/Mental Health Care', 'Disability and Disability Rights', 'Affected Race(s)', 'EEOC-centric', 'Jails, Prisons, Detention Centers, and Other Institutions', 'Affected Sex/Gender(s)', 'Discrimination Area', 'Discrimination Basis', 'General/Misc.'])

In [20]:
abbreviations = {
    'COVID-19': 'COVID',
    'Benefits (Source)': 'BENEFITS',
    'LGBTQ+': 'LGBTQ',
    'Reproductive rights': 'REPRO',
    'Policing': 'POLICING',
    'Affected National Origin/Ethnicity(s)': 'NATION_ORIG',
    'Voting': 'VOTE',
    'Immigration/Border': 'IMMIGRATION',
    'Medical/Mental Health Care': 'MED',
    'Disability and Disability Rights': 'DISABILITY',
    'Affected Race(s)': 'RACE',
    'EEOC-centric': 'EEOC',
    'Jails, Prisons, Detention Centers, and Other Institutions': 'PRISON',
    'Affected Sex/Gender(s)': 'GENDER',
    'Discrimination Area': 'DISC_AREA',
    'Discrimination Basis': 'DISC_BASE',
    'General/Misc.': 'GENERAL'
}

In [25]:
filenames = []

for key, value in train.items():
    sub_df = df[df["case_id"].isin(value)]
    filename = f"data/train/{abbreviations[key]}.json"
    sub_df.to_json(filename)
    filenames.append(filename)

In [34]:
for i, filename in enumerate(filenames):
    reload = pd.read_json(filename)
    print("------")
    print(filename)
    print("number of records: ", len(reload))
    
    cases = reload[["case_id", "issue_category"]]
    cases["issue_category"] = cases["issue_category"].apply(tuple)
    cases = cases.drop_duplicates()
    
    mlb = MultiLabelBinarizer()
    binary_matrix = mlb.fit_transform(cases['issue_category'].to_list())
    category_df = pd.DataFrame(binary_matrix, columns=mlb.classes_)

    print("number of pos cases: ", len(category_df[category_df[list(train.keys())[i]] == 1]))
    print("number of neg cases: ", len(category_df[category_df[list(train.keys())[i]] == 0]))

------
data/train/COVID.json
number of records:  794
number of pos cases:  120
number of neg cases:  120
------
data/train/BENEFITS.json
number of records:  663
number of pos cases:  120
number of neg cases:  120
------
data/train/LGBTQ.json
number of records:  568
number of pos cases:  120
number of neg cases:  120
------
data/train/REPRO.json
number of records:  621
number of pos cases:  120
number of neg cases:  120
------
data/train/POLICING.json
number of records:  666
number of pos cases:  120
number of neg cases:  120
------
data/train/NATION_ORIG.json
number of records:  707
number of pos cases:  120
number of neg cases:  120
------
data/train/VOTE.json
number of records:  699
number of pos cases:  120
number of neg cases:  120
------
data/train/IMMIGRATION.json
number of records:  740
number of pos cases:  120
number of neg cases:  120
------
data/train/MED.json
number of records:  658
number of pos cases:  120
number of neg cases:  120
------
data/train/DISABILITY.json
number

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  cases["issue_category"] = cases["issue_category"].apply(tuple)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  cases["issue_category"] = cases["issue_category"].apply(tuple)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  cases["issue_category"] = cases["issue_category"].apply(tuple)
A value is tryin

In [35]:
sub_df = df[df["case_id"].isin(val_ids)]
sub_df.to_json("data/val/val.json")
print(len(sub_df))

2184


In [37]:
sub_df = df[df["case_id"].isin(test_ids)]
sub_df.to_json("data/test/test.json")
print(len(sub_df))

2310
