# Prepare model data

The contents of the documents were saved as separate `.txt` files to save memory when working with DataFrames. This notebook outlines the steps to combine, for each case, the contents of all of its documents with the case metadata and the issue category label, producing the data used for modeling.

# Import Libraries

In [1]:
from pathlib import Path

import pandas as pd

# Helper Functions

In [2]:
def load_content(filename, docs_dir="docs"):
    """Read the text of one document from ``{docs_dir}/{filename}.txt``.

    Args:
        filename: File stem of the document (no directory, no ``.txt``
            extension).
        docs_dir: Directory that holds the document text files. Defaults to
            ``"docs"``, resolved relative to the current working directory.

    Returns:
        The file contents decoded as UTF-8, or ``""`` when the file cannot
        be read.
    """
    try:
        with open(f"{docs_dir}/{filename}.txt", 'r', encoding='utf-8') as f:
            return f.read()
    except Exception as e:
        # Best-effort on purpose: an unreadable document is reported and
        # contributes empty text instead of aborting the whole run.
        print(f"[Error reading {filename}: {e}]")
        return ""

def save_content(filepath, content):
    """Write ``content`` to ``filepath`` as UTF-8 text.

    Missing parent directories are created first, so saving into a fresh
    output folder does not fail on every file.

    Args:
        filepath: Destination path of the text file.
        content: String to write; an existing file is overwritten.

    Returns:
        None. Failures are printed rather than raised.
    """
    try:
        # Create the destination folder on demand (no-op if it exists).
        Path(filepath).parent.mkdir(parents=True, exist_ok=True)
        with open(filepath, 'w', encoding='utf-8') as f:
            f.write(content)
    except Exception as e:
        # Best-effort on purpose: report the failure and keep going.
        print(f"[Error saving {filepath}: {e}]")
    

def process_cases(df, filepath):
    """Combine every case's documents into one text, save it, and summarize.

    Rows are grouped by ``case_id``. For each case a metadata sentence is
    built from the group's first row (``case_name``, ``case_state``,
    ``court_name``), followed by the title and loaded content of each of the
    case's documents. The combined text is written to
    ``f"{filepath}{case_id}.txt"``.

    Args:
        df: One row per document, with the columns ``case_id``,
            ``case_doc_id``, ``doc_title``, ``case_name``, ``case_state``,
            ``court_name`` and ``issue_category``.
        filepath: String prefix for the per-case text files. It is
            concatenated directly with the case id, so it should end with a
            path separator.

    Returns:
        A DataFrame with one row per case and the columns ``case_id``,
        ``content``, ``num_tokens`` and ``issue_category``. The columns are
        present even when ``df`` has no rows.
    """
    columns = ["case_id", "content", "num_tokens", "issue_category"]
    records = []

    for case_id, group in df.groupby("case_id"):
        # Case-level fields are read from the first row of the group.
        first_row = group.iloc[0]

        # Compose the metadata sentence
        metadata_sentence = (
            f"The case \"{first_row['case_name']}\" from {first_row['case_state']} "
            f"was filed in {first_row['court_name']}."
        )

        # Compose the document text; only two columns are needed, so iterate
        # over them directly instead of materializing a Series per row.
        document_texts = [
            f"\n\nDocument Title: {doc_title}\nDocument Content: \n{load_content(case_doc_id)}"
            for case_doc_id, doc_title in zip(group["case_doc_id"], group["doc_title"])
        ]

        # Combine metadata and document texts
        full_content = metadata_sentence + ''.join(document_texts)

        # Save the content to file
        save_content(f"{filepath}{case_id}.txt", full_content)

        # Add summary info for metadata
        records.append({
            "case_id": case_id,
            "content": full_content,
            "num_tokens": len(full_content) // 4,  # Rough token estimate (~4 chars per token)
            "issue_category": first_row["issue_category"]
        })

    # Pin the columns so an empty input still yields the expected schema.
    return pd.DataFrame(records, columns=columns)

def process_df(filename, og_path, new_path):
    """Build the combined per-case dataset for one JSON file.

    Reads ``f"{og_path}{filename}.json"``, derives ``case_doc_id`` as
    ``"<case_id>_<doc_id>"``, runs ``process_cases`` (which writes one text
    file per case under ``f"{new_path}docs/"``) and writes the resulting
    summary frame to ``f"{new_path}{filename}.json"``.

    Args:
        filename: Stem of the JSON file (no directory, no extension).
        og_path: String prefix of the input file. It is concatenated
            directly with ``filename``, so it should end with a path
            separator.
        new_path: String prefix of the outputs, with the same
            trailing-separator convention. The output folders are created
            if they do not exist.

    Returns:
        None. The results are written to disk.
    """
    print(f"Processing: {filename}")

    df = pd.read_json(f"{og_path}{filename}.json")

    df["case_doc_id"] = df["case_id"].astype(str) + "_" + df["doc_id"].astype(str)
    df = df[["case_id", "doc_id", "case_doc_id", "doc_title", "case_name", "case_state", "court_name", "issue_category"]]

    docs_prefix = f"{new_path}docs/"
    summary_path = f"{new_path}{filename}.json"

    # Make sure both output locations exist; otherwise every per-case save
    # fails and to_json raises on a fresh output directory.
    Path(docs_prefix).mkdir(parents=True, exist_ok=True)
    Path(summary_path).parent.mkdir(parents=True, exist_ok=True)

    processed_df = process_cases(df, filepath=docs_prefix)
    processed_df.to_json(summary_path)

# Prepare train data

In [3]:
# Issue category label -> short name used as the stem of that category's
# JSON file in the training data.
abbreviations = {
    "COVID-19": "COVID",
    "Benefits (Source)": "BENEFITS",
    "LGBTQ+": "LGBTQ",
    "Reproductive rights": "REPRO",
    "Policing": "POLICING",
    "Affected National Origin/Ethnicity(s)": "NATION_ORIG",
    "Voting": "VOTE",
    "Immigration/Border": "IMMIGRATION",
    "Medical/Mental Health Care": "MED",
    "Disability and Disability Rights": "DISABILITY",
    "Affected Race(s)": "RACE",
    "EEOC-centric": "EEOC",
    "Jails, Prisons, Detention Centers, and Other Institutions": "PRISON",
    "Affected Sex/Gender(s)": "GENDER",
    "Discrimination Area": "DISC_AREA",
    "Discrimination Basis": "DISC_BASE",
    "General/Misc.": "GENERAL",
}

# File stems to process, in the mapping's insertion order.
filenames = [*abbreviations.values()]
filenames

['COVID',
 'BENEFITS',
 'LGBTQ',
 'REPRO',
 'POLICING',
 'NATION_ORIG',
 'VOTE',
 'IMMIGRATION',
 'MED',
 'DISABILITY',
 'RACE',
 'EEOC',
 'PRISON',
 'GENDER',
 'DISC_AREA',
 'DISC_BASE',
 'GENERAL']

In [4]:
# Process each training file data/train/<name>.json in turn; the combined
# outputs are written under data/train/clean/.
for filename in filenames:
    process_df(filename, og_path="data/train/", new_path="data/train/clean/")

Processing: COVID
Processing: BENEFITS
Processing: LGBTQ
Processing: REPRO
Processing: POLICING
Processing: NATION_ORIG
Processing: VOTE
Processing: IMMIGRATION
Processing: MED
Processing: DISABILITY
Processing: RACE
Processing: EEOC
Processing: PRISON
Processing: GENDER
Processing: DISC_AREA
Processing: DISC_BASE
Processing: GENERAL


# Prepare val & test data

In [5]:
# Validation split: reads data/val/val.json and writes under data/val/clean/.
process_df("val", og_path="data/val/", new_path="data/val/clean/")

Processing: val


In [6]:
# Test split: reads data/test/test.json and writes under data/test/clean/.
process_df("test", og_path="data/test/", new_path="data/test/clean/")

Processing: test
