<a href="https://colab.research.google.com/github/davidisinta/AI/blob/main/law_llms_p2_2_3.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [148]:
!pip install faiss-cpu



In [149]:
!pip install faiss-cpu sentence-transformers




Lets Extract All the Cases in The Combined Text

In [150]:
import spacy
import re

# Load SpaCy English model
nlp = spacy.load("en_core_web_sm")

# Read file
file_path = "combined-text.txt"

cases_dict = {}
current_case = []
blank_line_count = 0
line_num = 0
current_header = None

# Function to detect case headers using SpaCy
def is_case_header(line):
    doc = nlp(line.strip())
    if "v." in line:
        for ent in doc.ents:
            if ent.label_ == "LAW":
                return True
        return doc[0].is_title and "v." in line
    return False

with open(file_path, "r", encoding="utf-8") as file:
    for line in file:
        line_num += 1

        # Count blank lines (used for case separation)
        if line.strip() == "":
            blank_line_count += 1
            continue

        # New case start
        if blank_line_count >= 2 and is_case_header(line) or line_num == 1:
            # Store previous case in dictionary
            if current_header and current_case:
                cases_dict[current_header] = "\n".join(current_case)

            # Extract new case header in "Who v. Who" format
            match = re.search(r'([A-Za-z]+ v\. [A-Za-z]+)', line)
            current_header = match.group(1) if match else line.strip()

            # Start new case collection
            current_case = [line.strip()]

        else:
            current_case.append(line.strip())

        blank_line_count = 0

if current_header and current_case:
    cases_dict[current_header] = "\n".join(current_case)


for header in cases_dict.keys():
    print(header)
    print("-" * 100)

count = 0
for header, description in cases_dict.items():
    if count < 3:
      print(f"{header}:\n{description}")
      print("-" * 100)
    count += 1


print(f"\nTotal cases extracted: {len(cases_dict)}")


Burns v. McGraw
----------------------------------------------------------------------------------------------------
Ken-Do Contracting, L.P. v. F.A. Brown's Construction, LLC D/B/A Brown Construction and Brown's Concrete Construction, 05-19-00228-CV (Tex. App. 2020)
----------------------------------------------------------------------------------------------------
Garden v. Parfumerie
----------------------------------------------------------------------------------------------------
Nutrivida, Inc. v. Inmuno Vital, Inc., 46 F. Supp. 2d 1310 (S.D. Fla. 1998)
----------------------------------------------------------------------------------------------------
Gautier v. Pro
----------------------------------------------------------------------------------------------------
Belluomo v. Kake
----------------------------------------------------------------------------------------------------
Virgil v. Time
-----------------------------------------------------------------------------------

In [151]:
import spacy
import re

nlp = spacy.load("en_core_web_sm")
file_path = "Model-output.txt"
cited_cases_dict = {}

with open(file_path, "r", encoding="utf-8") as file:
    text = file.read()

# Process text
doc = nlp(text)

current_case = None
current_explanation = []

case_pattern = re.compile(r'[A-Z][a-zA-Z]+ v\. [A-Z][a-zA-Z]+(?:, [\w\s.()]+)?')

for sentence in doc.sents:
    sentence_text = sentence.text.strip()

    # Detect multiple cases in the same sentence using regex
    found_cases = case_pattern.findall(sentence_text)

    if found_cases:
        if current_case:
            cited_cases_dict[current_case] = " ".join(current_explanation)

        # If multiple cases are found in one sentence, process each separately
        for i, case in enumerate(found_cases):
            if i == 0:
                current_case = case
                current_explanation = [sentence_text]
            else:
                cited_cases_dict[case] = sentence_text
    elif current_case:
        current_explanation.append(sentence_text)

if current_case:
    cited_cases_dict[current_case] = " ".join(current_explanation)

# Extract case headers
cited_cases_headers = list(cited_cases_dict.keys())

print("-" * 500)
print(f" CASE HEADERS: ")
print("-" * 500)

# Print extracted case headers
for case in cited_cases_headers:
    print(case)
    print("-" * 100)

print("-" * 500)
print(f" CASES WITH EXPLANATIONS: ")
print("-" * 500)

# Print extracted cases and their interpretations
for case, explanation in cited_cases_dict.items():
    print(f"Case: {case}")
    print(f"Explanation: {explanation}")
    print("-" * 100)

print(f"\nTotal cases extracted: {len(cited_cases_dict)}")


--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
 CASE HEADERS: 
---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------

Now we take this headers and create embeddings and store them in vector db

In [152]:
# Extract case headers from both files
case_headers_db = cases_headers
cited_case_headers_db = cited_cases_headers

print(f"Total real cases: {len(case_headers_db)}, Total cited cases: {len(cited_cases_headers)}")


Total real cases: 222, Total cited cases: 12


In [153]:
from sentence_transformers import SentenceTransformer

# Load embedding model
model = SentenceTransformer("all-MiniLM-L6-v2")

print(f"Total real cases: {len(case_headers_db)}")
print(f"Total cited cases: {len(cited_case_headers_db)}")

print("-" * 100)

#print first 2 cases
print(f"Sample Real cases: ")
print("-" * 100)
for case in case_headers_db[:2]:
    print(case)
    print("-" * 100)

print(f"Sample Cited cases: ")
print("-" * 100)
#print first 2 cited cases
for case in cited_case_headers_db[:2]:
    print(case)
    print("-" * 80)

# Generate embeddings
case_headers_embeddings = {case: model.encode(case) for case in case_headers_db}
cited_headers_embeddings = {case: model.encode(case) for case in cited_case_headers_db}


Total real cases: 222
Total cited cases: 12
----------------------------------------------------------------------------------------------------
Sample Real cases: 
----------------------------------------------------------------------------------------------------
Burns v. McGraw
----------------------------------------------------------------------------------------------------
Ken-Do Contracting, L.P. v. F.A. Brown's Construction, LLC D/B/A Brown Construction and Brown's Concrete Construction, 05-19-00228-CV (Tex. App. 2020)
----------------------------------------------------------------------------------------------------
Sample Cited cases: 
----------------------------------------------------------------------------------------------------
Messenger v. Gruner
--------------------------------------------------------------------------------
Hurwitz v. United
--------------------------------------------------------------------------------


In [154]:
import faiss
import numpy as np

# Get embedding dimension
dim = len(next(iter(case_headers_embeddings.values())))

# Create FAISS index
index = faiss.IndexFlatL2(dim)

case_names = list(case_headers_embeddings.keys())
case_vectors = np.array(list(case_headers_embeddings.values()), dtype=np.float32)

index.add(case_vectors)

def find_nearest_case(query_case):
    query_vec = np.array([cited_headers_embeddings[query_case]], dtype=np.float32)
    distances, indices = index.search(query_vec, 1)
    return case_names[indices[0][0]], distances[0][0]


In [155]:
from scipy.spatial.distance import cosine

MATCH_THRESHOLD = 0.69
POSSIBLE_MATCH_THRESHOLD = 0.55
FAKE_CASE_THRESHOLD = 0.5

real_cited_cases = []
fake_cases = []
possible_matches = []

# Compare cited cases against real cases
for cited_case, cited_vector in cited_headers_embeddings.items():
    max_similarity = 0
    best_match = None
    found_strong_match = False

    for real_case, real_vector in case_headers_embeddings.items():
        similarity = 1 - cosine(cited_vector, real_vector)

        if similarity > max_similarity:
            max_similarity = similarity
            best_match = real_case

        if similarity > MATCH_THRESHOLD:
            found_strong_match = True
            real_cited_cases.append(cited_case)

        elif similarity > POSSIBLE_MATCH_THRESHOLD:
            possible_matches.append((cited_case, real_case, similarity))

    if max_similarity < FAKE_CASE_THRESHOLD:
        fake_cases.append(cited_case)

print("\n Real Cited Cases (Strong Matches): ")
print("*" * 80)
for case in real_cited_cases:
    print(case)

print("\nPossible Matches")
print("*" * 80)
for cited, real, sim in possible_matches:
    print(f"Cited: {cited} | Real: {real} | Similarity: {sim:.4f}")

print("\n Non Existent Case Citation: ")
print("*" * 80)
for fake in fake_cases:
    print(fake)



 Real Cited Cases (Strong Matches): 
********************************************************************************
Messenger v. Gruner
Hurwitz v. United
Gautier v. Pro
Delan v. CBS, Inc.
Finger v. Omni
Arrington v. New

Possible Matches
********************************************************************************
Cited: Hurwitz v. United | Real: Powers v. United | Similarity: 0.5806
Cited: Hurwitz v. United | Real: Smith v. United | Similarity: 0.6261
Cited: Delan v. CBS, Inc. | Real: J.H. Desnick, M.D., Eye Services, Ltd. v. American Broadcasting Companies, Inc., Jon Entine, and Sam Donaldson, 233 F.3d 514 (7th Cir. 2000) | Similarity: 0.5520

 Non Existent Case Citation: 
********************************************************************************
Andrea v. Fakename, 972 F.Supp.
Piskac v. Shapiro, 230 Conn. 345 (2025).
Candelaria v. Spurlock, the plaintiff appeared briefly in the documentary Super Size Me.
Spurlock v. Candelaria, 08 Civ. 1830 (BMC) (RER)


Embedding the Case Descriptions

In [156]:
import faiss
import numpy as np
import logging
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity

model = SentenceTransformer("all-MiniLM-L6-v2")

NAME_MATCH_THRESHOLD = 0.7
DESCRIPTION_SIMILARITY_THRESHOLD = 0.5

# FAISS index for case names
case_keys = list(cases_dict.keys())
case_name_embeddings = np.array([model.encode(name) for name in case_keys], dtype=np.float32)
case_name_index = faiss.IndexFlatL2(case_name_embeddings.shape[1])
case_name_index.add(case_name_embeddings)
case_to_index = {case_keys[i]: i for i in range(len(case_keys))}

# FAISS index for case descriptions
case_embeddings = np.array([model.encode(desc) for desc in cases_dict.values()], dtype=np.float32)
case_desc_index = faiss.IndexFlatL2(case_embeddings.shape[1])
case_desc_index.add(case_embeddings)

# Function to find best matching case name from a given citation name
def find_best_matching_case(cited_case_name):
    cited_vector = np.array([model.encode(cited_case_name)], dtype=np.float32)
    _, indices = case_name_index.search(cited_vector, 1)

    best_match_index = indices[0][0]
    best_matching_case = case_keys[best_match_index]

    best_match_vector = case_name_embeddings[best_match_index].reshape(1, -1)
    cited_vector = cited_vector.reshape(1, -1)
    similarity = cosine_similarity(cited_vector, best_match_vector)[0][0]

    if similarity >= NAME_MATCH_THRESHOLD:
        return best_matching_case


print(f"\n Misinterpreted Cases: ")

# Compare cited descriptions with actual descriptions
for cited_case_name, cited_description in cited_cases_dict.items():
    best_matching_case = find_best_matching_case(cited_case_name)

    if best_matching_case is None:
        continue

    case_index = case_to_index[best_matching_case]
    actual_vector = case_embeddings[case_index]

    cited_vector = model.encode(cited_description, convert_to_numpy=True)

    similarity = cosine_similarity(actual_vector.reshape(1, -1), cited_vector.reshape(1, -1))[0][0]

    if similarity < DESCRIPTION_SIMILARITY_THRESHOLD:
      print(f"\n Case: {best_matching_case} (matched for {cited_case_name})")
      print(f"   Cosine Similarity: {similarity:.4f}")



 Misinterpreted Cases: 

 Case: Messenger v. Gruner (matched for Messenger v. Gruner)
   Cosine Similarity: 0.2055

 Case: Hurwitz v. United (matched for Hurwitz v. United)
   Cosine Similarity: 0.3044
