# Prepare input data

This notebook outlines the steps undertook to prepare the input data.

The csv file and the excerpts were generated and provided by Pau. Below are the steps I undertook to prepare the data for my experiments.

# Import libraries

In [1]:
import os
import numpy as np
import pandas as pd

# Clean up the csv file

From some prior experiments, I identified a few examples with excerpts that are either empty or insufficent to identify whether a Target Case was overruled or not. I excluded these cases from the dataset.

In [2]:
df = pd.read_csv("../prior_experiments/evals_share/courtlistener_dataset_with_overrulings.csv", sep="\t")
len(df)

1099

In [6]:
file_mapping = {}

for index, row in df.iterrows():
    citing_id = row["citing_opinion_id"]
    cited_id = row["cited_opinion_id"]

    filename = f"{index + 1:04d}.{citing_id}_cites_{cited_id}.txt"
    file_mapping[(citing_id, cited_id)] = filename
    print(f"Processed: {filename}")

# Add filenames to the DataFrame
df["filename"] = df.apply(
    lambda row: file_mapping.get((row["citing_opinion_id"], row["cited_opinion_id"])), axis=1
)

len(df)

Processed: 0001.91306_cites_88061.txt
Processed: 0002.91306_cites_88994.txt
Processed: 0003.91306_cites_87633.txt
Processed: 0004.91306_cites_88240.txt
Processed: 0005.91306_cites_88693.txt
Processed: 0006.91306_cites_88924.txt
Processed: 0007.91306_cites_89668.txt
Processed: 0008.91306_cites_90400.txt
Processed: 0009.92059_cites_91368.txt
Processed: 0010.92059_cites_91371.txt
Processed: 0011.92059_cites_88673.txt
Processed: 0012.92059_cites_87371.txt
Processed: 0013.92059_cites_91583.txt
Processed: 0014.92059_cites_90667.txt
Processed: 0015.92059_cites_85451.txt
Processed: 0016.92291_cites_88699.txt
Processed: 0017.92291_cites_89664.txt
Processed: 0018.92291_cites_90600.txt
Processed: 0019.92291_cites_91869.txt
Processed: 0020.92291_cites_91985.txt
Processed: 0021.93311_cites_92988.txt
Processed: 0022.93311_cites_87743.txt
Processed: 0023.93311_cites_91125.txt
Processed: 0024.93311_cites_87776.txt
Processed: 0025.93311_cites_88640.txt
Processed: 0026.93904_cites_93354.txt
Processed: 0

1099

## Clean up the column names

In [7]:
df = df[["filename", "citing_opinion_id", "citing_index", "court", "docket_id", "cluster_id", "citing_decision_name",
         "cited_opinion_id", "cited_index", "depth", "cited_decision_name", "overruled", "notes"]]
df.head()

Unnamed: 0,filename,citing_opinion_id,citing_index,court,docket_id,cluster_id,citing_decision_name,cited_opinion_id,cited_index,depth,cited_decision_name,overruled,notes
0,0001.91306_cites_88061.txt,91306,0,scotus,2297559,91306,"Morgan v. United States,113 U.S. 476 (1885)",88061,0,3,Texas v. White (1869),yes,
1,0002.91306_cites_88994.txt,91306,0,scotus,2297559,91306,"Morgan v. United States,113 U.S. 476 (1885)",88994,1,2,Vermilye & Co. v. Adams Express Co. (1875),no,
2,0003.91306_cites_87633.txt,91306,0,scotus,2297559,91306,"Morgan v. United States,113 U.S. 476 (1885)",87633,2,1,Murray v. Lardner (1865),no,
3,0004.91306_cites_88240.txt,91306,0,scotus,2297559,91306,"Morgan v. United States,113 U.S. 476 (1885)",88240,3,1,Texas v. Hardenberg (1869),no,
4,0005.91306_cites_88693.txt,91306,0,scotus,2297559,91306,"Morgan v. United States,113 U.S. 476 (1885)",88693,4,1,Huntington v. Texas (1873),no,


# Save the cleaned up csv

In [8]:
df.to_csv("data/dataset.csv", index=False)