# Clean the joined data & look at the category distribution

# Import Libaries

In [1]:
import pandas as pd

from sklearn.preprocessing import MultiLabelBinarizer

# Load the dataset

In [2]:
df = pd.read_json("data/case_with_complaint_opinion.json")

## Do some EDA on the cases with complaint or opinion

In [3]:
subset = df[["case_id", "case_state", "court_name", "case_type", "case_ongoing", "issue_category", "issues"]]
subset["issue_category"] = subset["issue_category"].apply(tuple)
subset["issues"] = subset["issues"].apply(tuple)
subset = subset.drop_duplicates()
assert len(subset) == subset["case_id"].nunique()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  subset["issue_category"] = subset["issue_category"].apply(tuple)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  subset["issues"] = subset["issues"].apply(tuple)


In [4]:
cols = ["case_state", "court_name", "case_type", "case_ongoing"]

In [5]:
for col in cols:
    print("---", col, "---")
    display(subset[col].nunique())
    display(subset[col].value_counts())

--- case_state ---


55

case_state
California                  795
New York                    654
District of Columbia        539
Texas                       503
Illinois                    468
Florida                     338
Michigan                    329
Pennsylvania                310
Alabama                     246
Georgia                     212
Washington                  205
Louisiana                   192
Maryland                    191
North Carolina              183
Ohio                        179
Massachusetts               172
Arizona                     160
Missouri                    160
Indiana                     151
Tennessee                   143
New Jersey                  142
Colorado                    142
Virginia                    127
Mississippi                 116
Arkansas                    103
Minnesota                    88
Oregon                       87
Connecticut                  86
Wisconsin                    85
New Mexico                   84
Kentucky                     

--- court_name ---


185

court_name
District of District of Columbia               481
Northern District of Illinois                  392
Northern District of California                372
Southern District of New York                  367
Eastern District of Michigan                   232
                                              ... 
U.S. Court of Appeals for the Fifth Circuit      1
Kansas state appellate court                     1
Massachusetts state appellate court              1
Washington state appellate court                 1
South Dakota state trial court                   1
Name: count, Length: 185, dtype: int64

--- case_type ---


27

case_type
Equal Employment                             1914
Prison Conditions                             908
Immigration and/or the Border                 834
Election/Voting Rights                        684
Jail Conditions                               581
Public Benefits/Government Services           420
Disability Rights                             359
Policing                                      335
Criminal Justice (Other)                      291
Speech and Religious Freedom                  280
Healthcare Access and Reproductive Issues     226
Education                                     221
Fair Housing/Lending/Insurance                201
National Security                             198
Juvenile Institution                          153
Presidential/Gubernatorial Authority          125
Intellectual Disability (Facility)             84
Child Welfare                                  75
School Desegregation                           67
Mental Health (Facility)                

--- case_ongoing ---


5

case_ongoing
No                           5646
Yes                          2008
No reason to think so         400
Perhaps, but long-dormant      73
Unknown                        43
Name: count, dtype: int64

## Look at the issue category distribution

In [6]:
mlb = MultiLabelBinarizer()
binary_matrix = mlb.fit_transform(subset['issue_category'].to_list())
binary_matrix.shape

(8170, 22)

In [7]:
category_df = pd.DataFrame(binary_matrix, columns=mlb.classes_)
category_df.sum().sort_values(ascending=False)

General/Misc.                                                5285
Discrimination Basis                                         4192
Discrimination Area                                          2822
Affected Sex/Gender(s)                                       1935
Jails, Prisons, Detention Centers, and Other Institutions    1916
EEOC-centric                                                 1419
Affected Race(s)                                             1131
Disability and Disability Rights                             1090
Medical/Mental Health Care                                   1030
Immigration/Border                                            876
Voting                                                        713
Affected National Origin/Ethnicity(s)                         520
Policing                                                      471
Reproductive rights                                           419
LGBTQ+                                                        371
Benefits (

## Given some categories have very few samples, select only the categories that have at least 200 cases so we have enough data for training

In [8]:
categories = category_df.sum().sort_values(ascending=False).index[:-5]
categories

Index(['General/Misc.', 'Discrimination Basis', 'Discrimination Area',
       'Affected Sex/Gender(s)',
       'Jails, Prisons, Detention Centers, and Other Institutions',
       'EEOC-centric', 'Affected Race(s)', 'Disability and Disability Rights',
       'Medical/Mental Health Care', 'Immigration/Border', 'Voting',
       'Affected National Origin/Ethnicity(s)', 'Policing',
       'Reproductive rights', 'LGBTQ+', 'Benefits (Source)', 'COVID-19'],
      dtype='object')

In [9]:
len(categories)

17

In [10]:
len(df)

22391

In [11]:
df['issue_category'] = df['issue_category'].apply(lambda labels: [label for label in labels if label in categories])
df = df[df['issue_category'].str.len() > 0].reset_index(drop=True)
len(df)

22170

In [12]:
df["case_id"].nunique()

8078

## Inspect the token distribution by case_id

Majority of the cases have more than 40K tokens, with very few cases have excessively long (and likely many) documents. Given the model context window 8,192 tokens, we need to apply some kind of aggregation on the embeddings by case such that we end up with one embedding per case.

In [13]:
token_df = df[["case_id", "doc_len"]].groupby("case_id").sum() / 4
token_df = token_df.rename(columns={"doc_len": "num_tokens"})

pd.set_option('display.float_format', '{:,.0f}'.format)  # No decimals, add commas
token_df.describe()

Unnamed: 0,num_tokens
count,8078
mean,36105
std,60463
min,253
25%,5911
50%,16612
75%,41350
max,1020302


## Save the cleaned joined data for future use

In [14]:
df.to_json("data/clean_joined.json")