In [46]:
import os
from pinecone import Pinecone, ServerlessSpec
from tqdm import tqdm
import time
from dotenv import load_dotenv
from tabulate import tabulate
# Load variables from a local .env file into the process environment so that
# os.getenv() can see them. As the cell's last expression, the call's return
# value is echoed as the cell output.
load_dotenv()

True

In [15]:
# Read the API key from the environment (populated from .env by load_dotenv()
# in the import cell) instead of hardcoding it in the notebook.
PINECONE_API_KEY = os.getenv("PINECONE_API_KEY")

if not PINECONE_API_KEY:
    # No fallback key is substituted anywhere, so say what is actually wrong
    # rather than claiming a placeholder is in use. The unset value is still
    # passed to the client below, as before.
    print("Warning: PINECONE_API_KEY is not set. Add it to your environment or .env file.")

# NOTE: the client instance is bound to the name `pinecone`, the same name as
# the package; a later `import pinecone` in this kernel would replace it.
pinecone = Pinecone(api_key=PINECONE_API_KEY)

In [None]:
index_name = "hybrid-search"

# Start from a clean index on every run: drop any existing index with this
# name, then create it again.
existing_indexes = pinecone.list_indexes()

# list_indexes() returns a collection of index descriptions, not plain strings,
# so a bare `index_name in existing_indexes` never matches. Compare against the
# index names instead; otherwise an existing index is never deleted and the
# create_index() call below fails with a conflict.
if index_name in existing_indexes.names():
    print(f"Index '{index_name}' already exists. Deleting index.")
    pinecone.delete_index(index_name)
    # Short pause before re-creating an index with the same name.
    time.sleep(3)

print(f"Creating new index '{index_name}'")
# Sparse-dense (hybrid) queries require the dotproduct metric; the dimension
# must match the length of the dense embeddings upserted later.
pinecone.create_index(name=index_name,
                      dimension=512,
                      metric='dotproduct',
                      spec=ServerlessSpec(cloud='aws', region='us-east-1'))

# Handle used by the later cells for upserts.
index = pinecone.Index(index_name)

print(index.describe_index_stats())



Creating new index 'hybrid-search'
{'dimension': 512,
 'index_fullness': 0.0,
 'metric': 'dotproduct',
 'namespaces': {},
 'total_vector_count': 0,
 'vector_type': 'dense'}


In [20]:
from pinecone_text.sparse import BM25Encoder
import pandas as pd


In [22]:
# Sparse encoder used later to produce the BM25 sparse vectors for each case.
encoder = BM25Encoder()

# Corpus used to fit the encoder.
# NOTE(review): the frame is named train_df but is read from
# legal_test_500.csv — confirm this is the intended fitting corpus.
train_df = pd.read_csv('../data/legal_test_500.csv')

# Fit on the case_text column, coerced to str so non-string cells are still
# accepted. As the cell's last expression, fit()'s return value (the encoder
# itself) is echoed as the cell output.
encoder.fit(train_df['case_text'].astype(str).tolist())

100%|██████████| 500/500 [00:01<00:00, 319.77it/s]


<pinecone_text.sparse.bm25_encoder.BM25Encoder at 0x147bd0e00>

In [50]:
def group_embeddings_and_sparse_vectors(cases, sparse_encoder):
    """
    Build Pinecone-ready hybrid (dense + sparse) records from case dicts.

    The dense embedding is taken from each case as-is (no embedding API is
    called here); the sparse vector is produced by encoding ``case_text``
    with ``sparse_encoder`` in a single ``encode_documents`` call.

    Args:
        cases: Iterable of dicts read via ``.get`` for the keys ``case_id``,
            ``case_title``, ``case_text`` and ``embeddings``. ``embeddings``
            may be a list of floats or its string form, e.g. ``"[0.1, 0.2]"``.
        sparse_encoder: Object exposing ``encode_documents(list_of_str)`` that
            returns one sparse vector per input text, in input order.

    Returns:
        A list of dicts with keys ``id`` (str), ``values``, ``sparse_values``
        and ``metadata`` (``case_title`` and ``case_text``). Cases with a
        falsy required field, or without a string ``case_text``, are skipped
        with a printed message. Returns ``[]`` when no case has string text.
    """
    all_embeddings_and_sparse_vectors = []
    print("Grouping embeddings and sparse vectors...")

    # Remember each encodable case by its POSITION in `cases`. Keying on
    # case_id (as before) misaligned sparse vectors when IDs repeated and
    # raised KeyError when a case had text but no case_id.
    encodable = [
        (position, case['case_text'])
        for position, case in enumerate(cases)
        if isinstance(case.get('case_text'), str)
    ]

    if not encodable:
        print("No texts to encode")
        return []

    # Batch encode sparse vectors, then map them back to their case position.
    encoded_texts = sparse_encoder.encode_documents([text for _, text in encodable])
    sparse_map = {
        position: encoded
        for (position, _), encoded in zip(encodable, encoded_texts)
    }

    for i, case in enumerate(cases):
        case_id = case.get('case_id')
        case_title = case.get('case_title')
        case_text = case.get('case_text')
        embeddings = case.get('embeddings')

        # Embeddings loaded from CSV arrive as a string such as "[0.1, 0.2]".
        # Empty pieces are dropped so "[]" parses to an empty list (treated as
        # missing below) instead of raising on float('').
        if isinstance(embeddings, str):
            embeddings = [
                float(piece)
                for piece in embeddings.strip('[]').split(',')
                if piece.strip()
            ]

        # Skip if essential data is missing
        if not all([case_id, case_title, case_text, embeddings]):
            print(f"Skipping case index {i} due to missing data.")
            continue

        # Cases whose case_text was not a string were never encoded, so they
        # have no entry in sparse_map.
        sparse_vector = sparse_map.get(i)
        if sparse_vector is None:
            print(f"Warning: Sparse vector not found for case index {i}, case_id {case_id}. Skipping.")
            continue

        all_embeddings_and_sparse_vectors.append({
            'id': str(case_id),  # Pinecone ID must be string
            'sparse_values': sparse_vector,
            'values': embeddings,
            'metadata': {
                'case_title': case_title,
                'case_text': case_text
            }
        })

    return all_embeddings_and_sparse_vectors
       

In [51]:
# Load the cases together with their stored embeddings.
all_cases_with_embeddings_csv = pd.read_csv('../data/all_cases_embeddings.csv')

# Convert the DataFrame into a list of per-row dicts (one dict per case),
# the input shape group_embeddings_and_sparse_vectors() iterates over.
all_cases_with_embeddings = all_cases_with_embeddings_csv.to_dict(orient='records')

# Sanity check: show one full record (the second row).
print(all_cases_with_embeddings[1])

{'case_id': 'Case2', 'case_title': 'Black v Lipovac [1998] FCA 699 ; (1998) 217 ALR 386', 'case_text': 'The general principles governing the exercise of the\ndiscretion to award indemnity costs after rejection by an\nunsuccessful party of a so called Calderbank letter were set\nout in the judgment of the Full Court in Black v Lipovac\n[1998] FCA 699 ; (1998) 217 ALR 386. In summary those\nprinciples are: 1. Mere refusal of a "Calderbank offer" does\nnot itself warrant an order for indemnity costs. In this\nconnection it may be noted that Jessup J in Dais Studio Pty\nLtd v Bullet Creative Pty Ltd [2008] FCA 42 said that (at\n[6]): if the rejection of such an offer is to ground a claim\nfor indemnity costs, it must be by reason of some\ncircumstance other than that the offer happened to comply\nwith the Calderbank principle. 2. To obtain an order for\nindemnity costs the offeror must show that the refusal to\naccept it was unreasonable. 3. The reasonableness of the\nconduct of the offere

In [52]:
# Combine each case's dense embedding with a BM25 sparse vector from the
# fitted encoder; cases with missing data are skipped inside the function.
all_cases_embeddings_and_sparse_vectors = group_embeddings_and_sparse_vectors(all_cases_with_embeddings, encoder)
# Report how many records survived.
print(f"Prepared {len(all_cases_embeddings_and_sparse_vectors)} vectors for upsert.")

Grouping embeddings and sparse vectors...
Prepared 997 vectors for upsert.


In [53]:
# Column titles for the preview table, in the same order as each row below.
headers = ["ID", "Case Title", "Case Text", "Sparse Values", "Embeddings"]

# Preview the first ten prepared records. Title and text are clipped to 30
# characters (with a trailing ellipsis only when clipped); the sparse and dense
# vectors are shown as the first 30 characters of their string form.
table_data = []
for case in all_cases_embeddings_and_sparse_vectors[:10]:
    case_meta = case['metadata']

    title_cell = case_meta['case_title']
    if len(title_cell) > 30:
        title_cell = title_cell[:30] + '...'

    text_cell = case_meta['case_text']
    if len(text_cell) > 30:
        text_cell = text_cell[:30] + '...'

    sparse_cell = str(case['sparse_values'])[:30] + '...'
    dense_cell = str(case['values'])[:30] + '...'

    table_data.append([case['id'], title_cell, text_cell, sparse_cell, dense_cell])

# Render the preview as a grid table.
print("\nSample data prepared for Pinecone:")
print(tabulate(table_data, headers=headers, tablefmt='grid'))


Sample data prepared for Pinecone:
+--------+-----------------------------------+-----------------------------------+-----------------------------------+-----------------------------------+
| ID     | Case Title                        | Case Text                         | Sparse Values                     | Embeddings                        |
| Case1  | Alpine Hardwood (Aust) Pty Ltd... | Ordinarily that discretion wil... | {'indices': [2486146960, 18412... | [-0.008599691092967987, 0.0388... |
+--------+-----------------------------------+-----------------------------------+-----------------------------------+-----------------------------------+
| Case2  | Black v Lipovac [1998] FCA 699... | The general principles governi... | {'indices': [1026658409, 31135... | [-0.008599691092967987, 0.0388... |
+--------+-----------------------------------+-----------------------------------+-----------------------------------+-----------------------------------+
| Case3  | Colgate Palmolive Co v 

In [54]:
def batch_upsert(index, vectors, batch_size=100):
    """
    Upsert ``vectors`` into ``index`` in consecutive batches.

    A failing batch is reported and skipped so the remaining batches are still
    attempted (best-effort). The final message reflects whether every batch
    succeeded, so a partially failed upload is not reported as complete.

    Args:
        index: Object exposing ``upsert(vectors=...)``.
        vectors: Sequence of records to upsert; sliced into batches.
        batch_size: Maximum number of records per ``upsert`` call (>= 1).

    Raises:
        ValueError: If ``batch_size`` is smaller than 1.
    """
    if batch_size < 1:
        raise ValueError(f"batch_size must be >= 1, got {batch_size}")

    print(f"Upserting {len(vectors)} vectors in batches of {batch_size}...")
    total_batches = 0
    failed_batches = 0
    failed_vectors = 0
    for i in range(0, len(vectors), batch_size):
        batch = vectors[i:i + batch_size]  # slicing clamps at the end
        total_batches += 1
        try:
            index.upsert(vectors=batch)
        except Exception as e:
            # Keep going, but remember the failure for the final summary.
            failed_batches += 1
            failed_vectors += len(batch)
            print(f"Error upserting batch {i // batch_size}: {e}")

    if failed_batches:
        print(
            f"Upsert finished with errors: {failed_batches} of {total_batches} "
            f"batches failed ({failed_vectors} vectors not upserted)."
        )
    else:
        print("Upsert complete.")

In [55]:
# Upload all prepared hybrid records to the index using the default batch size.
batch_upsert(index, all_cases_embeddings_and_sparse_vectors)

Upserting 997 vectors in batches of 100...
Upsert complete.
