## Legal Effect Extraction

Giacomo Grazia

Academic Year 2024-2025

---

### 1. Data for API calls

In [4]:
import json

import pandas as pd
from openai import OpenAI
#df = pd.read_csv('')

### 2. LLM-based classification (1)

API function with prompt 1 (categorization based on the decision headline and description only).

In [None]:
# Function to preprocess each row and call the OpenAI API for categorization
def categorize_with_openai(row):
    """
    Categorize one administrative decision from its headline and description.

    Builds a prompt from the row's 'headline' and 'description', sends it to the
    OpenAI chat completions API (gpt-4o-mini, temperature 0) and parses the reply
    as JSON.

    The reply is normalised before parsing so that harmless formatting differences
    do not end up as "Error: ..." labels:
      * a Markdown code fence around the answer is removed;
      * an empty reply (the prompt allows the model to "return nothing") is
        treated as "no category";
      * a bare JSON value that is not a list is wrapped in a list.

    :param row: A row from the DataFrame containing 'headline' and 'description'.
    :return: list with the category assigned by the API, an empty list if no
        category was assigned, or a one-element list ["Error: <message>"] if the
        API call or the JSON parsing raised an exception.
    """
    decision_headline = row.get("headline", "")
    decision_description = row.get("description", "")

    # NOTE(review): "verguning verlening" looks like a typo for "vergunning verlening".
    # It is left unchanged because the label is echoed back by the model and may be
    # matched on elsewhere — confirm before renaming.
    prompt = f"""
    Given the following headline and description of an administrative decision, 
    categorize it into one of the following 6 categories:
    
    - "verguning verlening": Decisions to grant a license
    - "wijziging of aanpassen": Decisions to amend/modify a license
    - "overdracht": Decisions to transfer a license to another party (including name changes)
    - "intrekking of beëindiging": Decisions to revoke/withdraw a license
    - "schorsing": Decisions to suspend a license
    - "vernieuwing of verlenging": Decisions to renew or extend a license

    Guidelines for Categorization:
    1. You may assign ONLY ONE category, if applicable. 
    2. If no category is relevant, return nothing.

    Input:
    - Headline: {decision_headline}
    - Description: {decision_description}

    Output Format:
    - If a category is identified, return a list in this format:
        ["assigned_category"]
    - If no category is identified, return an empty list like this:
        []
    """

    # Call the OpenAI API
    try:
        response = OpenAI().chat.completions.create(
            model="gpt-4o-mini",
            messages=[
                {"role": "system", "content": "You are an assistant for categorizing legal decisions into 6 categories."},
                {"role": "user", "content": prompt},
            ],
            temperature=0,
            # max_tokens=150,
            top_p=1,
            frequency_penalty=0,
            presence_penalty=0,
        )

        # Guard against a reply that carries no text content at all.
        raw_reply = (response.choices[0].message.content or "").strip()

        # Drop a Markdown code fence (```json ... ```) if the model wrapped its answer.
        if raw_reply.startswith("```"):
            raw_reply = raw_reply.strip("`").strip()
            if raw_reply.lower().startswith("json"):
                raw_reply = raw_reply[len("json"):].strip()

        # "Return nothing" is a valid answer according to the prompt guidelines.
        if not raw_reply:
            return []

        parsed = json.loads(raw_reply)
        # Keep the documented return type even if the model answers with a bare string.
        return parsed if isinstance(parsed, list) else [parsed]

    except Exception as e:
        # Best effort: one failing row must not abort the whole DataFrame.apply run.
        return [f"Error: {str(e)}"]

# Classify every row (one call to categorize_with_openai per row) and store the result.
# NOTE(review): df_merged is not created in any cell above this one, and progress_apply
# is only available once tqdm.pandas() has been called (first seen in a later cell) —
# confirm both are set up before this cell runs on a fresh kernel.
df_merged["category_single_gpt_4o-mini"] = df_merged.progress_apply(categorize_with_openai, axis=1)

### 3. LLM-based classification (2)

API function with prompt 2 (categorization based on the headline, description and full text).

In [None]:
# Function to preprocess each row and call the OpenAI API for categorization
def categorize_with_openai(row):
    """
    Categorize one administrative decision from its headline, description and full text.

    Builds a prompt from the row's 'headline', 'description' and 'text_pypdf2',
    sends it to the OpenAI chat completions API (gpt-4o-mini, temperature 0) and
    parses the reply as JSON.

    The reply is normalised before parsing so that harmless formatting differences
    do not end up as "Error: ..." labels:
      * a Markdown code fence around the answer is removed;
      * an empty reply (the prompt allows the model to "return nothing") is
        treated as "no category";
      * a bare JSON value that is not a list is wrapped in a list.

    :param row: A row from the DataFrame containing 'headline', 'description'
        and 'text_pypdf2'.
    :return: list with the category assigned by the API, an empty list if no
        category was assigned, or a one-element list ["Error: <message>"] if the
        API call or the JSON parsing raised an exception.
    """
    decision_headline = row.get("headline", "")
    decision_description = row.get("description", "")
    decision_text = row.get("text_pypdf2", "")

    # NOTE(review): "verguning verlening" looks like a typo for "vergunning verlening".
    # It is left unchanged because the label is echoed back by the model and may be
    # matched on elsewhere — confirm before renaming.
    prompt = f"""
    Given the following headline, description and text of an administrative decision, 
    categorize the decision into one of the following 6 categories:
    
    - "verguning verlening": Decisions to grant a license
    - "wijziging of aanpassen": Decisions to amend/modify a license
    - "overdracht": Decisions to transfer a license to another party (including name changes)
    - "intrekking of beëindiging": Decisions to revoke/withdraw a license
    - "schorsing": Decisions to suspend a license
    - "vernieuwing of verlenging": Decisions to renew or extend a license

    Guidelines for Categorization:
    1. You may assign ONLY ONE category, if applicable. 
    2. If no category is relevant, return nothing.

    Input:
    - Headline: {decision_headline}
    - Description: {decision_description}
    - Text: {decision_text}

    Output Format:
    - If a category is identified, return a list containing the category as a string, like this:
        ["assigned_category"]
    - If no category is identified, return an empty list like this:
        []
    """

    # Call the OpenAI API
    try:
        response = OpenAI().chat.completions.create(
            model="gpt-4o-mini",
            messages=[
                {"role": "system", "content": "You are an assistant for categorizing legal decisions into 6 categories."},
                {"role": "user", "content": prompt},
            ],
            temperature=0,
            # max_tokens=150,
            top_p=1,
            frequency_penalty=0,
            presence_penalty=0,
        )

        # Guard against a reply that carries no text content at all.
        raw_reply = (response.choices[0].message.content or "").strip()

        # Drop a Markdown code fence (```json ... ```) if the model wrapped its answer.
        if raw_reply.startswith("```"):
            raw_reply = raw_reply.strip("`").strip()
            if raw_reply.lower().startswith("json"):
                raw_reply = raw_reply[len("json"):].strip()

        # "Return nothing" is a valid answer according to the prompt guidelines.
        if not raw_reply:
            return []

        parsed = json.loads(raw_reply)
        # Keep the documented return type even if the model answers with a bare string.
        return parsed if isinstance(parsed, list) else [parsed]

    except Exception as e:
        # Best effort: one failing row must not abort the whole DataFrame.apply run.
        return [f"Error: {str(e)}"]

# Classify every row with the full-text prompt (one call to categorize_with_openai per row).
# NOTE(review): this relies on the categorize_with_openai definition from this cell having
# replaced the earlier one of the same name, and on tqdm.pandas() having registered
# progress_apply — confirm on a fresh kernel.
df_merged["category_single_from_TEXT_gpt_4o-mini"] = df_merged.progress_apply(categorize_with_openai, axis=1)

### 4. Comparison of approaches

#### 4.1 Creating word embeddings

##### 4.1.1 Create spaCy Word Embeddings

In [None]:
# embeddings spacy on combined text
import spacy

# Load the Dutch spaCy model
nlp = spacy.load("nl_core_news_lg")

# Combine headline and description into one text per decision.
# Missing values are replaced by "" first: str + NaN yields NaN, which would make the
# whole combined text missing and break the embedding steps that expect a string.
df_merged["combined_text"] = (
    df_merged["headline"].fillna("") + " " + df_merged["description"].fillna("")
)

# Create embeddings using spaCy
def get_embeddings(text):
    """
    Return the spaCy document vector for a text.

    A missing value (None or NaN) is embedded as the empty string instead of being
    handed to the pipeline, so that a single missing cell does not abort a whole
    Series.apply run. Any other input is passed to `nlp` unchanged.

    :param text: The text to embed.
    :return: The `vector` attribute of the document produced by `nlp`.
    """
    # NaN is the only float that differs from itself.
    if text is None or (isinstance(text, float) and text != text):
        text = ""
    doc = nlp(text)
    return doc.vector  # Document vector (dense representation)

# spaCy document vector of the combined headline + description text, one per row.
df_merged["embedding_spacy_nl_lg_combined_text"] = df_merged["combined_text"].apply(get_embeddings)

# spaCy document vector of the decision body ('text_pypdf2'), one per row.
# NOTE(review): this uses progress_apply while the line above uses plain apply;
# progress_apply requires tqdm.pandas() to have been called beforehand — confirm.
df_merged["embedding_spacy_nl_lg_body"] = df_merged["text_pypdf2"].progress_apply(get_embeddings)

##### 4.1.2 Create OpenAI Word Embeddings

In [1]:
import os

from openai import OpenAI

# Clients created with OpenAI() take the key from an api_key= argument or from the
# OPENAI_API_KEY environment variable; assigning to the class attribute OpenAI.api_key
# does not configure them. Export the key so every later OpenAI() call picks it up.
# OPENAI_API_KEY must already be defined when this cell runs (do not hard-code it here).
if OPENAI_API_KEY:
    os.environ["OPENAI_API_KEY"] = OPENAI_API_KEY

a) Getting embeddings for the combined text (headline - description).

In [None]:
# Module-level client that get_embedding (defined below in this cell) calls.
client = OpenAI()

# Function to get embeddings using OpenAI API
def get_embedding(text, model="text-embedding-3-small"):
    """
    Request an embedding vector for one text from the OpenAI embeddings endpoint.

    The request is sent through the module-level `client`.

    :param text: The text to embed.
    :param model: Name of the embedding model (default: "text-embedding-3-small").
    :return: The `embedding` of the first item in the response's `data`.
    """
    first_item = client.embeddings.create(input=text, model=model).data[0]
    return first_item.embedding

# Embed the combined headline + description text of every row (one get_embedding call per row).
# NOTE(review): tqdm is not imported in any visible cell — confirm it is in scope.
tqdm.pandas()  # Registers progress_apply on pandas objects
# NOTE(review): the column list printed later in this notebook shows
# 'embedding_headline-description' rather than this column name — confirm which is current.
df_merged['embedding_combined_text-embedding-3-small'] = df_merged['combined_text'].progress_apply(
    lambda x: get_embedding(x, model="text-embedding-3-small")
)

# Preview the first rows, including the new embedding column
df_merged.head()

b) Getting embeddings for the decisions' full text.

In [None]:
# Embeddings have a token limit of 8192 tokens. 
# We will truncate the text to fit within this limit.

def truncate_text(text, row_index, max_tokens=8192, encoding_name="cl100k_base"):
    """
    Truncates text to fit within the token limit and logs rows exceeding the limit.

    The text is tokenised with the requested tiktoken encoding. If it has more than
    `max_tokens` tokens, a message is printed and only the first `max_tokens` tokens
    are kept; the (possibly shortened) token list is decoded back to text.

    A missing text (None or NaN) is returned as "" without tokenising, so that one
    missing cell does not abort a whole DataFrame.apply run.

    :param text: str, the input text to truncate.
    :param row_index: int, the index of the row (for logging purposes).
    :param max_tokens: int, the maximum number of tokens allowed.
    :param encoding_name: str, the encoding model name.
    :return: str, truncated text that fits within the token limit.
    """
    # NaN is the only float that differs from itself.
    if text is None or (isinstance(text, float) and text != text):
        return ""

    encoding = tiktoken.get_encoding(encoding_name)
    tokens = encoding.encode(text)

    if len(tokens) > max_tokens:
        print(f"Row {row_index} exceeded the {max_tokens} token limit. Original tokens: {len(tokens)}")
        tokens = tokens[:max_tokens]

    return encoding.decode(tokens)


# Truncate the decision body of every row to at most 8191 tokens before embedding;
# row.name (the row's index label) is passed only so truncate_text can log which row was cut.
# NOTE(review): this cell works on `df`, while the surrounding cells use `df_merged`,
# and `df` is not assigned in any visible cell — confirm they refer to the same table.
df['truncated_text'] = df.apply(
    lambda row: truncate_text(row['text_pypdf2'], row.name, max_tokens=8191, encoding_name="cl100k_base"),
    axis=1
)

In [None]:
# Embed the decision body with the OpenAI embeddings API.

# Module-level client that get_embedding (defined below in this cell) calls.
client = OpenAI()

# Function to get embeddings using OpenAI API
def get_embedding(text, model="text-embedding-3-small"):
    """
    Request an embedding vector for one text from the OpenAI embeddings endpoint.

    The request is sent through the module-level `client`.

    :param text: The text to embed.
    :param model: Name of the embedding model (default: "text-embedding-3-small").
    :return: The `embedding` of the first item in the response's `data`.
    """
    first_item = client.embeddings.create(input=text, model=model).data[0]
    return first_item.embedding

# Embed the truncated decision body of every row (one get_embedding call per row).
# NOTE(review): tqdm is not imported in any visible cell — confirm it is in scope.
tqdm.pandas()  # Registers progress_apply on pandas objects
df['embedding_decision-body_text-embedding-3-small'] =  df['truncated_text'].progress_apply(
    lambda x: get_embedding(x, model="text-embedding-3-small")
)

# Save the DataFrame (without its index) to the current working directory.
# NOTE(review): a later cell reads '../df_merged_embeddings.csv' (parent directory) —
# confirm the two paths are meant to point at the same file.
df.to_csv('df_merged_embeddings.csv', index=False)

#### 4.2 Visualizing embeddings

In [None]:
# Inspect the merged table (displayed as the cell's last expression).
df_merged

##### 4.2.1 Visualizing spaCy embeddings

In [6]:
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
import matplotlib.pyplot as plt

# Reload the table with the saved embeddings, replacing the in-memory df_merged.
# NOTE(review): the embedding cell writes 'df_merged_embeddings.csv' to the working
# directory, whereas this reads from the parent directory — confirm the path.
# NOTE(review): embedding columns written via to_csv presumably come back as strings and
# would need parsing before numeric use — verify.
df_merged = pd.read_csv('../df_merged_embeddings.csv')

In [8]:
# List the columns available in the reloaded table.
df_merged.columns

Index(['id', 'file_number', 'text_pypdf2', 'headline', 'description',
       'publication_date', 'decision_date', 'case', 'parties', 'file_link',
       'parties_list', 'extracted_parties_gpt', 'extracted_parties_gpt_v2',
       'extracted_parties_gpt_v3', 'categories_rule_based',
       'categories_gpt_4o-mini', 'jaccard_similarity', 'combined_text',
       'embedding_headline-description', 'truncated_text',
       'embedding_decision-body_text-embedding-3-small',
       'category_single_gpt_4o-mini', 'category_single_from_TEXT_gpt_4o-mini'],
      dtype='object')

In [7]:
# Stack the per-row spaCy embeddings into 2-D matrices (one row per decision) for clustering.
import numpy as np
# NOTE(review): the column listing of the reloaded CSV contains no
# 'embedding_spacy_nl_lg_*' columns, so these lookups raise KeyError unless the spaCy
# embedding cell has been run on this df_merged in the current session.
embedding_matrix_combined_text = np.vstack(df_merged["embedding_spacy_nl_lg_combined_text"].values)
embedding_matrix_body = np.vstack(df_merged["embedding_spacy_nl_lg_body"].values)

KeyError: 'embedding_spacy_nl_lg_combined_text'

In [None]:
# K-Means clustering on the spaCy embeddings of the combined headline + description text.
# NOTE(review): the classification prompts list 6 categories, while 5 clusters are used
# here — presumably only 5 categories actually occur in the labels; confirm.
n_clusters = 5 # since 5 categories have been assigned
# random_state is fixed so the cluster assignment is reproducible across runs.
kmeans = KMeans(n_clusters=n_clusters, random_state=42)
# Fit on the combined-text matrix and store each row's cluster index.
df_merged["cluster_kmeans_combined_text"] = kmeans.fit_predict(embedding_matrix_combined_text)

a) k-means on spaCy embeddings