<a href="https://colab.research.google.com/github/davidisinta/AI/blob/main/law_llms_p2_2_2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [28]:
import re

# Regex to match case names (start of a case)
case_name_pattern = re.compile(r'^[A-Z][a-zA-Z.\- ]+ v\. [A-Z][a-zA-Z.\- ]+')

# Read file
file_path = "combined-text.txt"


cases = []
cases_headers = []
current_case = []
blank_line_count = 0

line_num = 0
page = int(0)

with open(file_path, "r", encoding="utf-8") as file:
    for line in file:
        line_num += 1
        page = int(line_num / 10)
        if line.strip() == "":  # Blank line detected
            blank_line_count += 1
            continue

        if blank_line_count >= 2 and case_name_pattern.match(line) or line_num == 1:  # New case starts
            #get rid of prev case
            cases.append("\n".join(current_case))
            current_case = []
            current_case.append(line.strip())  # Start capturing new case
            cases_headers.append(f"{line.strip()}")
        else:  # Continue capturing case content
            current_case.append(line.strip())

        blank_line_count = 0  # Reset blank line counter


# Add last case if file doesn't end with a blank line
if current_case:
    cases.append("\n".join(current_case))

# Print extracted cases and total count
# for case in cases:
#     print(case)
#     print("=" * 80)  # Separator for readability

for case in cases_headers:
    print(case)
    print("=" * 80)  # Separator for readability


print(f"\nTotal cases extracted: {len(cases)}")


Burns v. McGraw-Hill Broadcasting Co., Inc., 9 Media L. Rep. (BNA) 1257 (Colo. 1983)
Garden v. Parfumerie Rigaud, Inc., 151 Misc. 692 (N.Y. Sup. Ct. 1933)
Gautier v. Pro-Football, Inc., 107 N.E.2d 485 (NY 1952)
Belluomo v. Kake TV & Radio, Inc., 596 P.2d 832 (Kan. Ct. App. 1979)
Nobles v. Cartwright, 659 N.E.2d 1064 (Ind. Ct. App. 1995)
Passalacqua v. Naviant, Inc., 844 So. 2d 792 (Fla. Dist. Ct. App. 2003)
McGough v. Nalco Co., 496 F. Supp. 2d 729 (N.D.W. Va. 2007)
National Abortion Federation v. Center for Medical Progress, 926 F.3d 534 (9th Cir. 2019)
Anonsen v. Donahue, 857 S.W.2d 700 (Tex. App. 1993)
Kenol v. Nelson, 181 A.D.2d 863 (N.Y. App. Div. 1992)
Keyzer v. Amerlink, Ltd., 654 S.E.2d 833 (N.C. Ct. App. 2008)
Miller v. Brooks, 47 S.E. 646 (Ga. 1904)
Miami Herald Pub. Co. v. Ane, 10 Media L. Rep. (BNA) 2383 (Fla. 1984)
MILE MARKER INC. v. Petersen Publishing, LLC, 811 So. 2d 841 (Fla. Dist. Ct. App. 2002)
Della-Donna v. Gore Newspapers Co., 489 So. 2d 72 (Fla. Dist. Ct. App. 1

Lets Extract All the Cases Cited by the LLM

In [27]:
import re

# File path
file_path = "Model-output.txt"

# Simple regex to detect court case citations (e.g., "X v. Y")
case_pattern = re.compile(r'\b[A-Z][a-zA-Z]+ v\. [A-Z][a-zA-Z]+(?:, [\w\s.()]+)?')

# Store matched court case names only
court_cases = []

# Read file line by line
with open(file_path, "r", encoding="utf-8") as file:
    for line in file:  # Read each line separately
        matches = case_pattern.findall(line)  # Find all matches in the line
        court_cases.extend(matches)  # Add matches to the list

# Print only the extracted court case names
for case in court_cases:
    print(case)


Hurwitz v. United
Messenger v. Gruner
Andrea v. Fakename, 972 F.Supp. 154
Gautier v. Pro
Weil v. Johnson, Index No. 119431
Piskac v. Shapiro, 230 Conn. 345 (2025). If the exhibition was advertised
Lemerond v. Twentieth
Delan v. CBS, Inc.
Finger v. Omni
Arrington v. New
Candelaria v. Spurlock, the plaintiff appeared briefly in the documentary Super Size Me. The plaintiff did not allege that his appearance was for advertising or trade. The plaintiff did
Spurlock v. Candelaria, 08 Civ. 1830 (BMC) (RER)


In [29]:
import spacy

# Load SpaCy English model
nlp = spacy.load("en_core_web_sm")

# Read file
file_path = "combined-text.txt"

cases = []
cases_headers = []
current_case = []
blank_line_count = 0
line_num = 0

# Function to detect case headers using SpaCy
def is_case_header(line):
    doc = nlp(line.strip())  # Process line with SpaCy
    if "v." in line:  # Basic filter to ensure it looks like a case name
        for ent in doc.ents:
            if ent.label_ == "LAW":  # SpaCy's legal entity recognition (if available)
                return True
        # If no "LAW" entity, fall back on simple pattern match
        return doc[0].is_title and "v." in line
    return False

with open(file_path, "r", encoding="utf-8") as file:
    for line in file:
        line_num += 1

        if line.strip() == "":  # Blank line detected
            blank_line_count += 1
            continue

        # Detect new case start
        if blank_line_count >= 2 and is_case_header(line) or line_num == 1:
            # Store previous case
            if current_case:
                cases.append("\n".join(current_case))

            # Start new case
            current_case = [line.strip()]
            cases_headers.append(line.strip())

        else:
            # Continue collecting lines for the current case
            current_case.append(line.strip())

        blank_line_count = 0  # Reset blank line counter

# Add the last case if the file does not end with a blank line
if current_case:
    cases.append("\n".join(current_case))

# Print extracted case headers
for case in cases_headers:
    print(case)
    print("=" * 80)  # Separator for readability

print(f"\nTotal cases extracted: {len(cases)}")


Burns v. McGraw-Hill Broadcasting Co., Inc., 9 Media L. Rep. (BNA) 1257 (Colo. 1983)
Ken-Do Contracting, L.P. v. F.A. Brown's Construction, LLC D/B/A Brown Construction and Brown's Concrete Construction, 05-19-00228-CV (Tex. App. 2020)
Garden v. Parfumerie Rigaud, Inc., 151 Misc. 692 (N.Y. Sup. Ct. 1933)
Nutrivida, Inc. v. Inmuno Vital, Inc., 46 F. Supp. 2d 1310 (S.D. Fla. 1998)
Gautier v. Pro-Football, Inc., 107 N.E.2d 485 (NY 1952)
Belluomo v. Kake TV & Radio, Inc., 596 P.2d 832 (Kan. Ct. App. 1979)
Michael S. Virgil, AKA Mike Virgil v. Time, Inc., a New York Corporation, 527 F.2d 1122 (9th Cir. 1975)
Nobles v. Cartwright, 659 N.E.2d 1064 (Ind. Ct. App. 1995)
Passalacqua v. Naviant, Inc., 844 So. 2d 792 (Fla. Dist. Ct. App. 2003)
Service Centers of Chicago, Inc. v. Minogue, 535 N.E.2d 1132 (Ill. App. Ct. 1989)
Legal Servicing, LLC v. Lewis, 2021 NY Slip Op 06088
National Abortion Federation v. Center for Medical Progress, 926 F.3d 534 (9th Cir. 2019)
Anonsen v. Donahue, 857 S.W.2d 70

In [30]:
import spacy

# Load SpaCy English model
nlp = spacy.load("en_core_web_sm")

# File path
file_path = "Model-output.txt"

# Store extracted case citations
court_cases = set()  # Using a set to avoid duplicates

# Read and process file
with open(file_path, "r", encoding="utf-8") as file:
    text = file.read()  # Read the entire file at once for better context analysis

# Process text with SpaCy
doc = nlp(text)

# Extract case citations
for sentence in doc.sents:  # Iterate through detected sentences
    if "v." in sentence.text:  # Basic filter to reduce false positives
        court_cases.add(sentence.text.strip())

# Print extracted case citations
for case in court_cases:
    print(case)
    print("=" * 80)  # Separator for readability

print(f"\nTotal court cases extracted: {len(court_cases)}")


See D’Andrea v. Fakename, 972 F.Supp.
Piskac v. Shapiro, 230 Conn. 345 (2025).
L. Rev. 383, 402-03).
In Candelaria v. Spurlock, the plaintiff appeared briefly in the documentary Super Size Me.
See Gautier v. Pro-Football, Inc., 304 N.Y. 354, 359, 107 N.E.2d 485 (1952) (holding that a football player’s image was not used for trade or advertising when it appeared in a newsreel of a football game).
See Spurlock v. Candelaria, 08 Civ. 1830 (BMC) (RER), E.D.N.Y. Jul. 3, 2008).
Arrington v. New York Times Co., 55 N.Y.22d 433, 440, 449 N.Y.S.22d 941, 944, 434 N.E.22d 1319, 1322 (1982).
Lemerond v. Twentieth Century Fox Film Corp.,  564 F.Supp.2d 315, 323 (S.D.N.Y.), aff'd sub nom.
Delan by Delan v. CBS, Inc., 91 A.D.3d 255, 458 N.Y.S.23d 608 (2d Dep’t 2013).
Finger v. Omni Publs.
See Weil v. Johnson, Index No. 119431/02, 2002 WL 31972157, *4-5 (
See Hurwitz v. United States, 884 F.2d 684, 687 (2d Cir. 1989);Messenger v. Gruner Key Symbol Jahr Printing and Publ’g, 94 N.Y.2d 436, 441, 706 N.Y.S

COMBINATION STRUCTURE

In [32]:
# Extract cases from both files
case_database = cases_headers
cited_cases = court_cases

print(f"Total real cases: {len(case_database)}, Total cited cases: {len(cited_cases)}")


Total real cases: 222, Total cited cases: 12


In [33]:
from sentence_transformers import SentenceTransformer

# Load embedding model
model = SentenceTransformer("all-MiniLM-L6-v2")

# Generate embeddings
case_embeddings = {case: model.encode(case) for case in case_database}
cited_embeddings = {case: model.encode(case) for case in cited_cases}


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.7k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

1_Pooling%2Fconfig.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

In [36]:
!pip install faiss-cpu

Collecting faiss-cpu
  Downloading faiss_cpu-1.10.0-cp311-cp311-manylinux_2_28_x86_64.whl.metadata (4.4 kB)
Downloading faiss_cpu-1.10.0-cp311-cp311-manylinux_2_28_x86_64.whl (30.7 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m30.7/30.7 MB[0m [31m32.6 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: faiss-cpu
Successfully installed faiss-cpu-1.10.0


In [37]:
import faiss
import numpy as np

# Get embedding dimension
dim = len(next(iter(case_embeddings.values())))

# Create FAISS index
index = faiss.IndexFlatL2(dim)

# Convert case embeddings to numpy arrays
case_names = list(case_embeddings.keys())
case_vectors = np.array(list(case_embeddings.values()), dtype=np.float32)

# Add embeddings to FAISS index
index.add(case_vectors)

# Function to find closest match
def find_nearest_case(query_case):
    """Find the closest matching case in the FAISS index."""
    query_vec = np.array([cited_embeddings[query_case]], dtype=np.float32)
    distances, indices = index.search(query_vec, 1)  # Search for nearest neighbor
    return case_names[indices[0][0]], distances[0][0]  # Return closest case and distance


In [38]:
from sklearn.metrics.pairwise import cosine_similarity

# Detect nonexistent cases
nonexistent_cases = [case for case in cited_cases if case not in case_database]

# Detect misinterpreted cases
misinterpreted_cases = []
for case in cited_cases:
    if case in case_database:  # Only check for misinterpretation in valid cases
        closest_case, distance = find_nearest_case(case)
        similarity = cosine_similarity([cited_embeddings[case]], [case_embeddings[closest_case]])[0][0]
        if similarity < 0.7:  # Threshold for misinterpretation
            misinterpreted_cases.append((case, similarity))

# Print results
print(f"Nonexistent cases: {nonexistent_cases}")
print(f"Misinterpreted cases: {misinterpreted_cases}")


Nonexistent cases: ['See D’Andrea v. Fakename, 972 F.Supp.', 'Piskac v. Shapiro, 230 Conn. 345 (2025).', 'L. Rev. 383, 402-03).', 'In Candelaria v. Spurlock, the plaintiff appeared briefly in the documentary Super Size Me.', 'See Gautier v. Pro-Football, Inc., 304 N.Y. 354, 359, 107 N.E.2d 485 (1952) (holding that a football player’s image was not used for trade or advertising when it appeared in a newsreel of a football game).', 'See Spurlock v. Candelaria, 08 Civ. 1830 (BMC) (RER), E.D.N.Y. Jul. 3, 2008).', 'Arrington v. New York Times Co., 55 N.Y.22d 433, 440, 449 N.Y.S.22d 941, 944, 434 N.E.22d 1319, 1322 (1982).', "Lemerond v. Twentieth Century Fox Film Corp.,  564 F.Supp.2d 315, 323 (S.D.N.Y.), aff'd sub nom.", 'Delan by Delan v. CBS, Inc., 91 A.D.3d 255, 458 N.Y.S.23d 608 (2d Dep’t 2013).', 'Finger v. Omni Publs.', 'See Weil v. Johnson, Index No. 119431/02, 2002 WL 31972157, *4-5 (', 'See Hurwitz v. United States, 884 F.2d 684, 687 (2d Cir. 1989);Messenger v. Gruner Key Symbol J