In [8]:
import os
import logging
import pandas as pd
from supabase import create_client, Client
import ast # For safely evaluating string representations of lists
from typing import List, Dict, Any, Optional
import openai

# Read the OpenAI API key from the OPENAI_API_KEY environment variable and
# store it on the openai module, where later cells read it back.
openai.api_key = os.getenv("OPENAI_API_KEY")
if not openai.api_key:
    # Only a warning is printed here; nothing is raised, so execution continues without a key.
    print("Warning: OpenAI API key not found. Set the OPENAI_API_KEY environment variable or set openai.api_key directly.")

In [5]:
# --- Configuration & Constants ---
# Relative path of the CSV that run_data_ingestion() reads; it must provide the
# 'case_id', 'case_title', 'case_text' and 'embeddings' columns.
CSV_PATH: str = "../data/all_cases_embeddings.csv"
# Destination table for the insert. NOTE(review): the SQL that creates it is not
# visible in this part of the notebook.
SUPABASE_TABLE_NAME: str = "documents" # The table created with the SQL above
# Rows whose parsed embedding has a different length are dropped during ingestion.
EMBEDDING_DIM: int = 512 # Must match the 'embedding vector(XYZ)' dimension

# --- Supabase Configuration ---
# Both values come from the environment and are None when the variable is unset;
# initialize_supabase_client() refuses to create a client if either is missing.
SUPABASE_URL: Optional[str] = os.environ.get("SUPABASE_URL")
SUPABASE_KEY: Optional[str] = os.environ.get("SUPABASE_SERVICE_ROLE_KEY")

# --- Global Variables ---
# Shared client handle: None until initialize_supabase_client() assigns it.
supabase: Optional[Client] = None

# --- Supabase Initialization ---
def initialize_supabase_client() -> bool:
    """Create the shared Supabase client from the service-role credentials.

    On success the client is stored in the module-level ``supabase`` variable.

    Returns:
        True if the client was created; False if ``SUPABASE_URL`` or
        ``SUPABASE_KEY`` is missing/empty or if ``create_client`` raised.
        Every outcome is reported with ``print``; nothing is raised.
    """
    global supabase
    print("Attempting to initialize Supabase client...")

    # Required settings, checked in order, each paired with the message printed when it is absent.
    required_settings = (
        (SUPABASE_URL, "ERROR: SUPABASE_URL environment variable not set."),
        (SUPABASE_KEY, "ERROR: SUPABASE_SERVICE_ROLE_KEY environment variable not set. Required for data insertion."),
    )
    for setting_value, missing_message in required_settings:
        if not setting_value:
            print(missing_message)
            return False

    try:
        supabase = create_client(SUPABASE_URL, SUPABASE_KEY)
        print("INFO: Supabase client initialized successfully (using Service Role).")
    except Exception as init_error:
        print(f"ERROR: Failed to initialize Supabase client: {init_error}")
        return False
    return True

In [4]:
def parse_embedding(embedding_str: str, expected_dim: int) -> Optional[List[float]]:
    """Safely parse a string such as ``"[0.1, 0.2]"`` into a list of floats.

    Args:
        embedding_str: Text expected to be a Python-literal list of numbers.
            Non-string values (``None``, ``NaN``, ...) are rejected, not raised on.
        expected_dim: Required number of elements in the parsed list.

    Returns:
        The elements converted to ``float`` when the text parses to a list of
        ``int``/``float`` values with exactly ``expected_dim`` elements;
        otherwise ``None``. Each rejection is logged via ``logging``.
    """
    # Build the log preview with str() so that non-string inputs can be reported;
    # slicing a float or None directly would raise TypeError inside the guard below.
    preview = str(embedding_str)[:100]

    if not isinstance(embedding_str, str) or not embedding_str.startswith('[') or not embedding_str.endswith(']'):
        logging.warning(f"Invalid embedding string format: {preview}...")
        return None
    try:
        # literal_eval only accepts Python literals, so arbitrary code in the text is never executed.
        embedding_list = ast.literal_eval(embedding_str)
        if isinstance(embedding_list, list) and all(isinstance(x, (int, float)) for x in embedding_list):
            if len(embedding_list) == expected_dim:
                return [float(x) for x in embedding_list]
            else:
                logging.warning(f"Parsed embedding dimension ({len(embedding_list)}) does not match expected dimension ({expected_dim}). Skipping. Value: {preview}...")
                return None
        else:
            logging.warning(f"Parsed data is not a list of numbers: {preview}...")
            return None
    except (ValueError, SyntaxError, TypeError, MemoryError, RecursionError, OverflowError) as e:
        # RecursionError: literal_eval on deeply nested input.
        # OverflowError: float() on an integer too large to represent.
        logging.error(f"Error parsing embedding string: {preview}... Error: {e}")
        return None

In [6]:
def run_data_ingestion():
    """Load the case CSV, parse its embeddings, and insert the rows into Supabase.

    The run is split into printed stages: client setup, CSV read, preprocessing,
    embedding parsing, record preparation and insertion. Every failure is reported
    with ``print`` and ends the run early with a bare ``return``; the function
    always returns ``None``.
    """
    global supabase  # the client itself is assigned by initialize_supabase_client()

    # 0. Client setup
    print("--- Starting Data Ingestion ---")
    client_ready = initialize_supabase_client()
    if not client_ready:
        print("CRITICAL: Exiting due to Supabase client initialization failure.")
        return  # plain return rather than exit(1), since this runs in a notebook

    # 1. Load the CSV
    print(f"\n--- Section 1: Reading CSV ---")
    print(f"INFO: Reading CSV file from: {CSV_PATH}")
    try:
        cases = pd.read_csv(CSV_PATH)
        print(f"INFO: Successfully read {len(cases)} rows from CSV.")
    except FileNotFoundError:
        print(f"ERROR: CSV file not found at {CSV_PATH}.")
        return
    except Exception as read_error:
        print(f"ERROR: Error reading CSV file: {read_error}")
        return

    # 2. Validate the schema and fill in missing values
    print(f"\n--- Section 2: Preprocessing Data ---")
    print("INFO: Preprocessing data...")
    required_columns = ['case_id', 'case_title', 'case_text', 'embeddings']
    absent_columns = [name for name in required_columns if name not in cases.columns]
    if absent_columns:
        print(f"ERROR: CSV missing one or more required columns: {required_columns}. Found: {list(cases.columns)}")
        return

    # Text columns get a placeholder for missing values and are forced to str.
    text_placeholders = {
        'case_id': 'UNKNOWN_ID',
        'case_title': 'UNKNOWN_TITLE',
        'case_text': '',
    }
    for column_name, placeholder in text_placeholders.items():
        cases[column_name] = cases[column_name].fillna(placeholder).astype(str)
    # A missing embedding becomes '[]', which the parser rejects unless EMBEDDING_DIM is 0.
    cases['embeddings'] = cases['embeddings'].fillna('[]')
    print("INFO: Data preprocessing complete (fillna and type casting).")

    # 3. Parse the embedding strings and drop the rows that fail
    print(f"\n--- Section 3: Parsing Embeddings ---")
    print(f"INFO: Parsing 'embeddings' column (expecting dimension {EMBEDDING_DIM})...")
    cases['embedding_parsed'] = cases['embeddings'].apply(
        lambda raw_value: parse_embedding(raw_value, EMBEDDING_DIM)
    )

    rows_before = len(cases)
    valid_cases = cases.dropna(subset=['embedding_parsed'])
    rows_after = len(valid_cases)
    rows_dropped = rows_before - rows_after
    if rows_dropped > 0:
        print(f"WARNING: Filtered out {rows_dropped} rows due to embedding parsing errors or dimension mismatch.")
    if rows_after == 0:
        print("ERROR: No valid embeddings found after parsing. Cannot proceed.")
        return
    print(f"INFO: Successfully parsed embeddings for {rows_after} rows.")

    # 4. Shape the rows into records
    print(f"\n--- Section 4: Preparing Data for Supabase ---")
    # Keep only the columns sent to the table; the parsed vector is sent under the name 'embedding'.
    records: List[Dict[str, Any]] = (
        valid_cases[['case_id', 'case_title', 'case_text', 'embedding_parsed']]
        .rename(columns={'embedding_parsed': 'embedding'})
        .to_dict('records')
    )
    print(f"INFO: Prepared {len(records)} records for insertion.")

    # 5. Insert everything in a single request
    print(f"\n--- Section 5: Inserting Data into Supabase ---")
    if not records:
        print("WARNING: No valid data prepared for insertion.")
    else:
        if not supabase:
            print("ERROR: Supabase client is not initialized. Cannot insert data.")
            return

        print(f"INFO: Attempting to insert {len(records)} records into Supabase table '{SUPABASE_TABLE_NAME}'...")
        try:
            # The value returned by execute() is not inspected; only a raised exception counts as failure.
            supabase.table(SUPABASE_TABLE_NAME).insert(records).execute()
            print(f"INFO: Successfully executed insert operation for {len(records)} prepared records.")
        except Exception as insert_error:
            print(f"ERROR: Error inserting data into Supabase: {insert_error}")
            print("ERROR: Insertion failed. Check Supabase logs and table schema.")
            return

    print("\n--- Data Ingestion Finished ---")

In [7]:
# Run the whole ingestion pipeline; progress and errors are printed stage by stage.
# NOTE(review): every run issues a fresh insert; whether re-running duplicates rows
# depends on the table's constraints, which are not visible here — confirm.
run_data_ingestion()

--- Starting Data Ingestion ---
Attempting to initialize Supabase client...
INFO: Supabase client initialized successfully (using Service Role).

--- Section 1: Reading CSV ---
INFO: Reading CSV file from: ../data/all_cases_embeddings.csv
INFO: Successfully read 997 rows from CSV.

--- Section 2: Preprocessing Data ---
INFO: Preprocessing data...
INFO: Data preprocessing complete (fillna and type casting).

--- Section 3: Parsing Embeddings ---
INFO: Parsing 'embeddings' column (expecting dimension 512)...
INFO: Successfully parsed embeddings for 997 rows.

--- Section 4: Preparing Data for Supabase ---
INFO: Prepared 997 records for insertion.

--- Section 5: Inserting Data into Supabase ---
INFO: Attempting to insert 997 records into Supabase table 'documents'...


2025-04-12 21:15:46,786 - INFO - HTTP Request: POST https://bpayzlivyykwfcizukhu.supabase.co/rest/v1/documents?columns=%22embedding%22%2C%22case_title%22%2C%22case_text%22%2C%22case_id%22 "HTTP/2 201 Created"


INFO: Successfully executed insert operation for 997 prepared records.

--- Data Ingestion Finished ---


In [9]:
client = openai.OpenAI(api_key=openai.api_key)

# NOTE(review): "bankrupcy" looks like a misspelling of "bankruptcy", which may hurt
# the keyword side of the search. Left unchanged so the recorded output still matches.
query_text = "what are some bankrupcy cases?"

print(f"\n--- Performing Hybrid Search for: '{query_text}' ---")

# The search needs a live Supabase client. run_data_ingestion() normally creates it;
# create it here when that cell has not been run so this cell works on its own.
if supabase is None and not initialize_supabase_client():
    raise RuntimeError("Supabase client is not initialized; cannot run hybrid search.")

# 1. Generate query embedding
try:
    query_embedding = client.embeddings.create(
        model="text-embedding-3-small",
        input=query_text,
        # Request the same dimension the stored embeddings were validated against.
        dimensions=EMBEDDING_DIM,
    ).data[0].embedding
except Exception as e:
    print(f"ERROR: Failed to generate embedding for query: {e}")
    # Stop here: continuing would either hit a NameError or silently search with a
    # stale query_embedding left in the kernel by an earlier run.
    raise

# 2. Define parameters for the RPC call (matching SQL function args)
# NOTE(review): the hybrid_search SQL function is not visible here — confirm the argument names.
rpc_params = {
    'query_text': query_text,
    'query_embedding': query_embedding,
    'match_count': 5
    # Optional: Add weights or rrf_k if you want to override defaults
    # 'full_text_weight': 1.5,
    # 'semantic_weight': 1.0,
    # 'rrf_k': 60
}

# 3. Call the function; the result is kept in `response` for the display cell below.
response = supabase.rpc('hybrid_search', rpc_params).execute()



--- Performing Hybrid Search for: 'what are some bankrupcy cases?' ---


2025-04-12 21:26:15,164 - INFO - HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"
2025-04-12 21:26:15,924 - INFO - HTTP Request: POST https://bpayzlivyykwfcizukhu.supabase.co/rest/v1/rpc/hybrid_search "HTTP/2 200 OK"


In [12]:
import pandas as pd
from IPython.display import display # Optional: For explicit display control in notebooks

def display_results_table(results: Optional[List[Dict[str, Any]]], snippet_length: int = 150):
    """
    Displays the search results in a formatted pandas DataFrame table.

    Args:
        results: A list of dictionaries, where each dict represents a document.
                 Expected keys include 'id', 'case_id', 'case_title', 'case_text'.
                 Can be None or empty, in which case only a notice is printed.
        snippet_length: Maximum number of 'case_text' characters shown per row.
                 Longer texts are cut to this length and suffixed with '...'.

    Returns:
        None. The table is rendered with IPython's ``display``.
    """
    if not results:
        print("No results to display.")
        return

    # Create a DataFrame from the list of dictionaries
    df = pd.DataFrame(results)

    # Show only the identifier columns that are actually present, so a result set
    # lacking one of them produces a warning instead of a KeyError.
    expected_columns = ['id', 'case_id', 'case_title']
    display_columns = [col for col in expected_columns if col in df.columns]
    missing_columns = [col for col in expected_columns if col not in df.columns]
    if missing_columns:
        print(f"WARNING: Expected column(s) not found in results: {missing_columns}")

    # Add a snippet of the case text if it exists
    if 'case_text' in df.columns:
        full_text = df['case_text']
        snippet = full_text.str.slice(0, snippet_length)
        # Mark only the rows that were really shortened; missing text counts as not shortened.
        was_cut = (full_text.str.len() > snippet_length).fillna(False).astype(bool)
        df['case_text_snippet'] = snippet.mask(was_cut, snippet + '...')
        display_columns.append('case_text_snippet')
    else:
        print("WARNING: 'case_text' column not found in results.")

    # Select only the columns we want to show
    df_display = df[display_columns]

    # --- Display the table ---
    print(f"--- Search Results ({len(df_display)} documents) ---")

    # Apply the display options only while rendering, so the notebook's global
    # pandas settings are left as they were.
    with pd.option_context(
        'display.max_rows', 10,        # Show max 10 rows initially
        'display.max_colwidth', 100,   # Max width for columns like snippet
        'display.width', 1000,         # Wider display area
    ):
        display(df_display)  # This renders the nice HTML table in Jupyter/Colab/etc.


In [13]:
# Render the rows returned by the hybrid_search RPC call made in the previous cell.
display_results_table(response.data)

--- Search Results (5 documents) ---


Unnamed: 0,id,case_id,case_title,case_text_snippet
0,511,Case519,Re Pannowitz; Ex parte Wilson (1975) 38 FLR 184,"It is not considered that the "" final judgment or final\norder "" could be constituted by the ord..."
1,510,Case518,Re Ling; Ex parte Ling v Commonwealth [1995] FCA 1410 ;\n(1995) 58 FCR 129,This provision exists for the benefit of the debtor: Re\nWillats; Ex parte Nissan Finance Corp L...
2,505,Case513,"Re Gibbs; Ex parte Triscott (1995) 65 FCR 80 , 133 ALR 718","The "" final judgment or final order "" in the present\nproceeding, it was ultimately submitted by..."
3,506,Case514,"Re Glew; Glew v Harrowell [2003] FCA 373 , 198 ALR 331","To "" satisfy "" the Court it is not necessary for the debtor\nto prove, as on a final hearing, th..."
4,509,Case517,Re Jocumsen (1929) 1 ABC 82,"The "" counter-claim, set-off or cross demand "" referred to\nin s 40(1)(g) and 41(7) "" must be so..."
