In [1]:
import pandas as pd
import json
import re
import math
import numpy as np
from qdrant_client import QdrantClient
from qdrant_client.models import Distance, VectorParams, PointStruct
from qdrant_client.http import models
from langchain.llms import OpenAI
from langchain.prompts import PromptTemplate
from langchain.chains import LLMChain
from sentence_transformers import SentenceTransformer
import os
from typing import List
import time
from qdrant_client import QdrantClient

# Connection settings are read from the environment so that no credential is stored
# in the notebook. Set QDRANT_URL and QDRANT_API_KEY before starting the kernel;
# a missing variable fails fast with a KeyError naming it.
# SECURITY: an API key used to be written here in plain text. Treat that key as
# compromised and rotate it.
qdrant_client = QdrantClient(
    url=os.environ["QDRANT_URL"],
    api_key=os.environ["QDRANT_API_KEY"],
    timeout=180,
    prefer_grpc=True
)

# Quick connectivity check: list the collections visible to this client.
print(qdrant_client.get_collections())

collections=[CollectionDescription(name='lending_club_loans')]


In [None]:
def create_loan_text(row) -> str:
    """Render one loan record as a single ' | '-delimited string for embedding.

    ``row`` is any mapping-like object supporting ``.get(key)`` and ``row[key]``
    (e.g. a pandas Series or a dict). Fields are emitted in a fixed order; a field
    that is absent or holds a missing value (per ``pd.notna``) is left out.
    Returns an empty string when no field is present.
    """
    # (column name in the data, prefix used in the rendered text)
    labelled_columns = (
        ("Loan Title", "Loan Title: "),
        ("purpose", "Purpose: "),
        ("emp_title", "Employment: "),
        ("Employment Length", "Employment Length: "),
        ("home_ownership", "Home Ownership: "),
        ("Amount Requested", "Amount Requested: $"),
        ("annual_inc", "Annual Income: $"),
        ("Debt-To-Income Ratio", "DTI Ratio: "),
        ("int_rate", "Interest Rate: "),
        ("term", "Term: "),
        ("loan_status", "Status: "),
    )

    return " | ".join(
        f"{prefix}{row[column]}"
        for column, prefix in labelled_columns
        if pd.notna(row.get(column))
    )

def precompute_embeddings(csv_file: str, output_dir: str, batch_size: int = 5000, start_batch: int = 1):
    """
    Encode the loans in ``csv_file`` and persist the results to disk batch by batch.

    For every batch number ``n`` from ``start_batch`` through the last batch, two
    files are written into ``output_dir`` (created if missing):
      - embeddings_batch_<n>.npy      the array returned by ``model.encode``
      - payload_batch_<n>.parquet     the batch rows, including the ``loan_text`` column

    Batch numbers are 1-based: batch ``n`` starts at row ``(n - 1) * batch_size``.
    Passing ``start_batch`` > 1 resumes an interrupted run without rewriting
    earlier batch files (the text column is still generated for the whole frame).
    """
    os.makedirs(output_dir, exist_ok=True)

    print("🚀 Loading data...")
    df = pd.read_csv(csv_file)
    # Force the numeric columns to numbers; unparseable entries become NaN.
    for col in ("Amount Requested", "annual_inc", "Debt-To-Income Ratio", "int_rate"):
        if col not in df.columns:
            continue
        df[col] = pd.to_numeric(df[col], errors="coerce")
    print(f"📊 Loaded {len(df)} rows")

    print("🤖 Loading embedding model...")
    model = SentenceTransformer("all-MiniLM-L6-v2")

    print("🤖 Generating text...")
    df["loan_text"] = df.apply(create_loan_text, axis=1)

    # Ceiling division written so that an empty frame yields zero batches.
    total_batches = (len(df) - 1) // batch_size + 1
    for batch_num in range(start_batch, total_batches + 1):
        first_row = (batch_num - 1) * batch_size
        batch_df = df.iloc[first_row:first_row + batch_size].copy()

        print(f"🔄 Encoding batch {batch_num}/{total_batches} ({len(batch_df)} rows)...")
        texts = batch_df["loan_text"].tolist()
        embeddings = model.encode(texts, batch_size=512, show_progress_bar=False)

        # Vectors and their metadata share the batch number so they can be paired on upload.
        embeddings_path = os.path.join(output_dir, f"embeddings_batch_{batch_num}.npy")
        payload_path = os.path.join(output_dir, f"payload_batch_{batch_num}.parquet")
        np.save(embeddings_path, embeddings)
        batch_df.to_parquet(payload_path)

        print(f"✅ Saved batch {batch_num}")

    print("🎉 All embeddings saved!")


def _json_safe_value(v):
    # Convert numpy scalars to native Python and sanitize NaN/Inf/NaT
    if isinstance(v, (np.generic,)):
        v = v.item()
    if v is pd.NaT:
        return None
    if isinstance(v, float) and (math.isnan(v) or math.isinf(v)):
        return None
    return v

def _df_to_json_safe_payloads(df: pd.DataFrame) -> List[dict]:
    """Turn a DataFrame into a list of per-row dicts that serialize to valid JSON.

    Infinities are first replaced by NaN and nulls by ``None`` at frame level; each
    value is then passed through ``_json_safe_value``. The first record is run
    through ``json.dumps`` as a smoke test, so a value that is still not
    serializable raises here rather than during upload.
    """
    without_inf = df.replace([np.inf, -np.inf], np.nan)
    nulls_as_none = without_inf.where(pd.notnull(without_inf), None)

    safe_records = [
        {key: _json_safe_value(value) for key, value in record.items()}
        for record in nulls_as_none.to_dict(orient="records")
    ]

    # Smoke test on one record only, to keep this cheap.
    if safe_records:
        json.dumps(safe_records[0])
    return safe_records

def upload_precomputed_batches(client, output_dir: str, collection_name: str = "lending_club_loans", recreate_collection: bool = True, vector_size: int = 384, start_batch: int = 1, end_batch: int = None, id_start: int = 0, upload_batch_size: int = 1000, parallel_workers: int = 2, batch_id_block: int = 1_000_000):
    """
    Upload batches stored in ``output_dir`` as
      - embeddings_batch_<n>.npy
      - payload_batch_<n>.parquet
    to a Qdrant collection, one ``client.upload_collection`` call per batch.
    Payloads are cleaned first to avoid 'cannot convert to json number' errors.

    Args:
        client: Qdrant client used for all collection operations.
        output_dir: Directory containing the batch files.
        collection_name: Target collection.
        recreate_collection: If True, call ``client.recreate_collection`` before
            uploading. If False, reuse the collection, creating it only when
            ``client.get_collection`` fails.
        vector_size: Vector dimension used when the collection is (re)created.
        start_batch: Lowest batch number to upload (inclusive).
        end_batch: Highest batch number to upload (inclusive); None means no upper bound.
        id_start: Offset added to every generated point ID.
        upload_batch_size: Forwarded to ``upload_collection`` as ``batch_size``.
        parallel_workers: Forwarded to ``upload_collection`` as ``parallel``.
        batch_id_block: Size of the ID range reserved for each batch. Point ``i`` of
            batch ``b`` gets ID ``id_start + b * batch_id_block + i``, so re-uploading
            a batch reuses the same IDs.

    Raises:
        ValueError: If a batch's embedding count differs from its payload count, or
            a batch holds more rows than ``batch_id_block`` (its IDs would spill into
            the next batch's range).
    """
    start_time = time.perf_counter()

    # Discover the batch numbers on disk and keep those within [start_batch, end_batch].
    npy_pattern = re.compile(r"embeddings_batch_(\d+)\.npy$")
    matches = (npy_pattern.match(fname) for fname in os.listdir(output_dir))
    batch_nums = sorted(
        n
        for n in (int(m.group(1)) for m in matches if m)
        if n >= start_batch and (end_batch is None or n <= end_batch)
    )

    if not batch_nums:
        print("❗ No batches found to upload.")
        return

    if recreate_collection:
        client.recreate_collection(
            collection_name=collection_name,
            vectors_config=VectorParams(size=vector_size, distance=Distance.COSINE),
        )
        print(f"✨ (Re)created collection '{collection_name}'")
    else:
        # Ensure the collection exists; any failure to fetch it is treated as "missing".
        try:
            info = client.get_collection(collection_name)
            print(f"ℹ️  Collection '{collection_name}' exists with {info.points_count} points.")
        except Exception:
            client.create_collection(
                collection_name=collection_name,
                vectors_config=VectorParams(size=vector_size, distance=Distance.COSINE),
            )
            print(f"✨ Created collection '{collection_name}'")

    # Progress counter only. The starting value is an estimate that assumes every
    # skipped batch held 5000 rows (the default batch_size of precompute_embeddings).
    total_uploaded = ((start_batch-1)*5000)

    for b in batch_nums:
        emb_path = os.path.join(output_dir, f"embeddings_batch_{b}.npy")
        payload_path = os.path.join(output_dir, f"payload_batch_{b}.parquet")

        if not os.path.exists(payload_path):
            print(f"⚠️  Missing payload file for batch {b}, skipping.")
            continue

        print(f"📦 Loading batch {b}...")
        embeddings = np.load(emb_path)
        payload_df = pd.read_parquet(payload_path)

        # Ensure payloads are JSON-safe (None instead of NaN/Inf; native types).
        payloads = _df_to_json_safe_payloads(payload_df)

        # Vectors and payloads are paired by position, so the counts must agree.
        if len(payloads) != embeddings.shape[0]:
            raise ValueError(f"Batch {b}: embeddings count {embeddings.shape[0]} != payloads count {len(payloads)}")

        # An empty batch has nothing to send.
        if not payloads:
            print(f"⚠️  Batch {b} is empty, skipping.")
            continue

        # IDs beyond the reserved block would collide with the next batch's IDs.
        if len(payloads) > batch_id_block:
            raise ValueError(
                f"Batch {b}: {len(payloads)} rows exceed batch_id_block={batch_id_block}; "
                "point IDs would overlap with the next batch"
            )

        first_id = id_start + b * batch_id_block
        ids = list(range(first_id, first_id + len(payloads)))

        elapsed = time.perf_counter() - start_time
        minutes, seconds = divmod(int(elapsed), 60)
        formatted_time = f"{minutes:02d}:{seconds:02d}"
        print(f"⬆️  Uploading batch {b} ({len(ids)} vectors) ... started at {formatted_time}")

        client.upload_collection(
            collection_name=collection_name,
            vectors=embeddings,      # shape: (N, vector_size)
            payload=payloads,        # list[dict]
            ids=ids,                 # list[int]
            batch_size=upload_batch_size,
            parallel=parallel_workers,
        )

        total_uploaded += len(ids)
        print(f"✅ Uploaded batch {b}. Total uploaded so far: {total_uploaded}.")

    # Final count is best-effort: the upload itself has already completed.
    try:
        info = client.get_collection(collection_name)
        print(f"🎉 Done. Collection '{collection_name}' now has {info.points_count} points.")
    except Exception:
        print("🎉 Done. (Skipped final count fetch.)")

In [3]:
#precompute_embeddings('./data/cleaned_combined_lendingclub.csv', "./data/embeddings", batch_size=5000, start_batch=446)

In [None]:
# Resume uploading the precomputed batches into the existing collection.
# note to self: previous upload ended at 234 (incl)
# NOTE(review): start_batch below is 580, not 235 — confirm the note above is still current.

upload_precomputed_batches(
    qdrant_client,
    output_dir="./data/embeddings",   # directory holding the embeddings_batch_<n>.npy / payload_batch_<n>.parquet files
    collection_name="lending_club_loans",
    recreate_collection=False,           # False reuses the existing collection (append); True (re)creates it first
    start_batch=580,                      # first batch number to upload; lower-numbered batches are skipped
    upload_batch_size=500,             # forwarded as batch_size to upload_collection; tune if needed
    parallel_workers=4,                 # forwarded as parallel to upload_collection; tune for your network/CPU
)

In [7]:
# Fetch the collection metadata and report how many points it currently holds.
info = qdrant_client.get_collection('lending_club_loans')
print(f"Collection has {info.points_count} points.")

Collection has 1450701 points.
