In [None]:
import lancedb
import pandas as pd
from lancedb.pydantic import LanceModel, Vector
from lancedb.rerankers import RRFReranker
import ast
import pyarrow as pa
import openai
import os
from lancedb.embeddings import EmbeddingFunction, EmbeddingFunctionConfig, get_registry, registry, EmbeddingFunctionRegistry

openai.api_key = os.getenv("OPENAI_API_KEY")
if not openai.api_key:
    print("Warning: OpenAI API key not found. Set the OPENAI_API_KEY environment variable or set openai.api_key directly.")

uri = "../data/sample-lancedb"
db = lancedb.connect(uri)


In [65]:
TABLE_NAME = "cases_hybrid_search"
CSV_PATH = "../data/all_cases_embeddings.csv"
TEXT_COLUMN = "case_text" # Column for FTS and potentially main text content
VECTOR_COLUMN = "vector" # Name for the vector field in the schema
EMBEDDING_DIM = 512

# --- LanceDB Schema Definition ---
# Define a schema that matches your CSV structure and includes the vector
class CaseDocuments(LanceModel):
    case_id: str
    case_title: str
    case_text: str
    # Define the vector field with the correct dimensions.
    # We don't need SourceField here as embeddings are pre-computed.
    vector: Vector(512)

In [25]:
def parse_embedding(embedding_str: str) -> list[float]:
    """Parses a string representation of a list into a list of floats."""
    try:
        embedding_list = ast.literal_eval(embedding_str)
        if isinstance(embedding_list, list) and all(isinstance(x, (int, float)) for x in embedding_list):
            return [float(x) for x in embedding_list]
        else:
            print(f"Warning: Could not parse embedding string correctly: {embedding_str[:100]}...")
            return [0.0] * EMBEDDING_DIM
    except (ValueError, SyntaxError, TypeError) as e:
        print(f"Error parsing embedding string: {embedding_str[:100]}... Error: {e}")
        return [0.0] * EMBEDDING_DIM

In [26]:
df = pd.read_csv(CSV_PATH)

required_columns = ['case_id', 'case_title', 'case_text', 'embeddings']
if not all(col in df.columns for col in required_columns):
    raise ValueError(f"CSV missing one or more required columns: {required_columns}")

    # Handle potential NaN/missing values before parsing
df['embeddings'] = df['embeddings'].fillna('[]')
df['case_id'] = df['case_id'].fillna('UNKNOWN_ID').astype(str)
df['case_title'] = df['case_title'].fillna('UNKNOWN_TITLE').astype(str)
df['case_text'] = df['case_text'].fillna('').astype(str)


# 3. Parse embedding strings into lists of floats
print("Parsing embedding strings...")
# IMPORTANT: This assumes your 'embeddings' column contains strings like '[0.1, 0.2, ...]', which mine do since its a csv file
df[VECTOR_COLUMN] = df['embeddings'].apply(parse_embedding)
print("Finished parsing embeddings.")

# Verify embedding dimensions (optional but recommended)
first_valid_vector = df[VECTOR_COLUMN].iloc[0] # Check the first one
if len(first_valid_vector) != EMBEDDING_DIM:
        print(f"Warning: Parsed vector dimension ({len(first_valid_vector)}) does not match expected dimension ({EMBEDDING_DIM}). Check CSV format and parsing logic.")

# Select columns for LanceDB, renaming 'embeddings' if needed
# We need columns matching the CaseDocuments schema
lancedb_data = df[['case_id', 'case_title', 'case_text', VECTOR_COLUMN]].to_dict('records')



Parsing embedding strings...
Finished parsing embeddings.


In [27]:
# 4. Create LanceDB Table
print(f"Creating/Overwriting LanceDB table: {TABLE_NAME}")
try:
    # Use mode="overwrite" to start fresh each time, or "create" to fail if exists
    table = db.create_table(TABLE_NAME, schema=CaseDocuments, mode="overwrite")
    print("Table created successfully.")
except Exception as e:
    print(f"Error creating LanceDB table: {e}")

# 5. Add data to the table
print(f"Adding {len(lancedb_data)} records to the table...")
try:
    # LanceDB can infer schema from list of dicts, but explicit schema is safer
    table.add(lancedb_data)
    print("Data added successfully.")
except pa.ArrowInvalid as e:
        print(f"Error adding data to LanceDB table (potential schema mismatch or data type issue): {e}")
except Exception as e:
    print(f"Error adding data to LanceDB table: {e}")


Creating/Overwriting LanceDB table: cases_hybrid_search
Table created successfully.
Adding 997 records to the table...
Data added successfully.


In [28]:
# 6. Create FTS index
print(f"Creating FTS index on column: {TEXT_COLUMN}")
try:
    table.create_fts_index(TEXT_COLUMN)
    print("FTS index creation initiated. It might take some time to build.")
except Exception as e:
    print(f"Error creating FTS index: {e}")

Creating FTS index on column: case_text
Error creating FTS index: Index already exists. Use replace=True to overwrite.


In [None]:
# --- Example Hybrid Search ---
print("\n--- Performing Hybrid Search Example ---")
# Create a reranker (Optional but recommended for hybrid search)
reranker = RRFReranker()

try:
    search_query = "bankruptcy case" # Example query
    print(f"Searching for: '{search_query}'")

    print("Embedding the search query using OpenAI...")
    client = openai.OpenAI(api_key=openai.api_key)
    
    query_vector = client.embeddings.create(
                        model="text-embedding-3-small",
                        input=search_query,
                        dimensions=512
                    ).data[0].embedding

    # 3. Perform the search using the vector and the text query
    results = (
        table.search(query_type='hybrid')
        .vector(query_vector)
        .text(search_query)
        .rerank(reranker=reranker)
        .limit(5)
        .to_pandas()
    )

    print("\nHybrid search results:")
    print(results[['case_id', 'case_title', 'case_text']]) # Show relevant columns

except Exception as e:
    print(f"\nAn error occurred during search: {e}")
    # This could happen if the FTS index isn't ready yet or other issues.


--- Performing Hybrid Search Example ---
Searching for: 'bankruptcy case'
Embedding the search query using OpenAI...

Hybrid search results:
   case_id                                         case_title  \
0  Case514  Re Glew; Glew v Harrowell [2003] FCA 373 , 198...   
1  Case516   Re Griffin, Ex parte Soutar (1890) 1 BC (NSW) 29   
2  Case515       Re Gould; Ex parte Skinner (1983) 72 FLR 393   
3  Case517                        Re Jocumsen (1929) 1 ABC 82   
4  Case513  Re Gibbs; Ex parte Triscott (1995) 65 FCR 80 ,...   

                                           case_text  
0  To " satisfy " the Court it is not necessary f...  
1  A claim for unliquidated damages for breach of...  
2  Some decisions involving the application of s ...  
3  The " counter-claim, set-off or cross demand "...  
4  The " final judgment or final order " in the p...  
