<a href="https://colab.research.google.com/github/frank-morales2020/MLxDL/blob/main/ARAG_DEMO.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Install the PostgreSQL server and its development libraries on the local VM.
# `&>log` redirects apt's stdout and stderr into a file named `log` so the cell stays quiet.
!apt install postgresql postgresql-contrib &>log
# Restart the service so the newly installed server is running.
!service postgresql restart


# Server development package for PostgreSQL 14 (-q = quiet);
# presumably required to compile pgvector in the next cell — TODO confirm.
!apt-get install postgresql-server-dev-14 -q

In [None]:
# Fetch the pgvector sources.
!git clone https://github.com/pgvector/pgvector.git
# Change into the checkout; the absolute path assumes the clone landed under /content.
%cd /content/pgvector/
print()
print('START: PG VECTOR COMPILATION')
# Build the extension, then install it.
!make
!make install
#print('END: PG VECTOR COMPILATION')

In [None]:
# Python-side packages imported below: pgvector (psycopg2 adapter), openai, colab_env.
# NOTE(review): versions are not pinned, so a fresh run may pick up newer releases.
!pip install pgvector -q
!pip install openai -q
!pip install colab-env -q

In [None]:
import os
from openai import OpenAI
import psycopg2
from psycopg2.extras import RealDictCursor
import numpy as np
from pgvector.psycopg2 import register_vector # Import pgvector's psycopg2 integration
import colab_env

In [2]:
# PostGRES SQL Settings
!sudo -u postgres psql -c "ALTER USER postgres PASSWORD 'postgres'"

connection_string = 'postgresl://postgres:postgres@localhost:5432/postgres'

#CREATE EXTENSION IF NOT EXISTS btree_gist
!sudo -u postgres psql -c "CREATE EXTENSION IF NOT EXISTS vector"

import psycopg2 as ps

DB_NAME = "postgres"
DB_USER = "postgres"
DB_PASS = "postgres"
DB_HOST = "localhost"
DB_PORT = "5432"

conn = ps.connect(database=DB_NAME,
							user=DB_USER,
							password=DB_PASS,
							host=DB_HOST,
							port=DB_PORT)

cur = conn.cursor() # creating a cursor

# Connect to PostgreSQL database in Timescale using connection string
#conn = psycopg2.connect(connection_string)

cur = conn.cursor()

#install pgvector
cur.execute("CREATE EXTENSION IF NOT EXISTS vector");
conn.commit()

from pgvector.psycopg2 import register_vector

# Register the vector type with psycopg2
register_vector(conn)

!sudo -u postgres psql -c "DROP TABLE embeddings"

# Create table to store embeddings and metadata
table_create_command = """
CREATE TABLE IF NOT EXISTS embeddings (
            id bigserial primary key,
            title text,
            url text,
            content text,
            tokens integer,
            embedding vector(1536)
            );
            """

cur.execute(table_create_command)
cur.close()
conn.commit()


import os
from openai import OpenAI
import psycopg2
from psycopg2.extras import RealDictCursor
import numpy as np
from pgvector.psycopg2 import register_vector # Import pgvector's psycopg2 integration
import colab_env

# --- Configuration & Initialization ---
OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")

# Credentials of the local PostgreSQL instance.
DB_NAME = "postgres"
DB_USER = "postgres"
DB_PASS = "postgres"
DB_HOST = "localhost"
DB_PORT = "5432"

# Construct the DATABASE_URL from individual components for consistency
DATABASE_URL = f"postgresql://{DB_USER}:{DB_PASS}@{DB_HOST}:{DB_PORT}/{DB_NAME}"

# Validate environment variables
if not OPENAI_API_KEY:
    raise ValueError("OPENAI_API_KEY not found in environment variables. Please set it.")

# --- Database Connection and Setup ---
conn = None
try:
    conn = psycopg2.connect(
        database=DB_NAME,
        user=DB_USER,
        password=DB_PASS,
        host=DB_HOST,
        port=DB_PORT
    )
    conn.autocommit = True # For CREATE EXTENSION

    cur = conn.cursor()

    # Idempotent, so it is safe even though the extension is also created via the shell.
    print("Ensuring pgvector extension is created...")
    cur.execute("CREATE EXTENSION IF NOT EXISTS vector;")
    print("pgvector extension checked/created.")

    # To reset the table on every run, execute "DROP TABLE IF EXISTS embeddings;" here.
    # Be cautious with that in production.

    # NOTE: because of IF NOT EXISTS this statement is skipped when an 'embeddings'
    # table is already present, so the NOT NULL constraints below only apply to a
    # table created by this statement.
    print("Creating 'embeddings' table if it doesn't exist...")
    table_create_command = """
    CREATE TABLE IF NOT EXISTS embeddings (
                id BIGSERIAL PRIMARY KEY,
                title TEXT,
                url TEXT,
                content TEXT NOT NULL,
                tokens INTEGER,
                embedding VECTOR(1536) NOT NULL
                );
    """
    cur.execute(table_create_command)
    print("Table 'embeddings' checked/created.")

    cur.close()

except Exception as e:
    print(f"Error during database initial setup: {e}")
    # Re-raise instead of calling exit(): the failure must stop the cell at this
    # point and keep its traceback, not let later statements run on a broken database.
    raise
finally:
    # Runs on success and on failure, so the connection is never leaked.
    if conn:
        conn.close()

print("Database setup complete.")

# --- OpenAI Client Initialization ---
openai_client = OpenAI(api_key=OPENAI_API_KEY)

# --- OpenAI Embedding Function ---
def get_embedding(text: str, model: str = "text-embedding-3-small"):
    """
    Embed `text` with the OpenAI embeddings endpoint.

    Newlines are replaced by spaces before the request. Returns the embedding
    as a numpy array (the form pgvector's psycopg2 adapter accepts), or None
    when anything goes wrong; the error is printed, not raised.
    """
    single_line = " ".join(text.split("\n"))
    vector = None
    try:
        api_reply = openai_client.embeddings.create(model=model, input=[single_line])
        vector = np.array(api_reply.data[0].embedding)
    except Exception as err:
        print(f"Error getting embedding for text: '{single_line[:50]}...'. Error: {err}")
    return vector

# --- PostgreSQL Interaction Functions (Adapted to your 'embeddings' schema) ---
def store_document_embedding(title: str, url: str, content: str, tokens: int):
    """
    Embed `content` and insert one row into the 'embeddings' table.

    Nothing is stored when the embedding could not be computed. Database
    errors are printed rather than raised; the connection is always closed.
    """
    vector = get_embedding(content)
    if vector is None:
        print(f"Skipping storage for content (title: {title}) due to embedding error.")
        return

    row = (title, url, content, tokens, vector)
    db = None
    try:
        db = psycopg2.connect(
            database=DB_NAME, user=DB_USER, password=DB_PASS, host=DB_HOST, port=DB_PORT
        )
        # The vector adapter has to be registered on every new connection.
        register_vector(db)
        cursor = db.cursor()
        cursor.execute(
            """
            INSERT INTO embeddings (title, url, content, tokens, embedding)
            VALUES (%s, %s, %s, %s, %s)
            """,
            row
        )
        db.commit()
        print(f"Stored document: '{title}' (content: '{content[:50]}...')")
    except Exception as err:
        print(f"Error storing document '{title}': {err}")
    finally:
        if db is not None:
            db.close()

def search_similar_documents(query_text: str, top_k: int = 3):
    """
    Return the `top_k` rows of 'embeddings' closest to `query_text`.

    Each row is a dict with 'title', 'url' and 'content'. An empty list is
    returned when the query cannot be embedded or the database call fails
    (the error is printed).
    """
    query_vector = get_embedding(query_text)
    if query_vector is None:
        print(f"Skipping search for query: '{query_text}' due to embedding error.")
        return []

    matches = []
    db = None
    try:
        db = psycopg2.connect(
            database=DB_NAME, user=DB_USER, password=DB_PASS, host=DB_HOST, port=DB_PORT
        )
        # The vector adapter has to be registered on every new connection.
        register_vector(db)
        # RealDictCursor yields rows as dictionaries instead of tuples.
        dict_cursor = db.cursor(cursor_factory=RealDictCursor)

        # <-> is the L2 distance operator, which works well for normalized embeddings
        dict_cursor.execute(
            """
            SELECT title, url, content
            FROM embeddings
            ORDER BY embedding <-> %s
            LIMIT %s
            """,
            (query_vector, top_k)
        )
        matches = dict_cursor.fetchall()
        print(f"Found {len(matches)} relevant documents for query: '{query_text[:50]}...'")
    except Exception as err:
        print(f"Error searching documents for query '{query_text}': {err}")
    finally:
        if db is not None:
            db.close()
    return matches


ALTER ROLE
NOTICE:  extension "vector" already exists, skipping
CREATE EXTENSION
ERROR:  table "embeddings" does not exist
Ensuring pgvector extension is created...
pgvector extension checked/created.
Creating 'embeddings' table if it doesn't exist...
Table 'embeddings' checked/created.
Database setup complete.


## ARAG

In [14]:
import numpy as np
from typing import List, Dict, Any, Tuple
import os
import psycopg2
from psycopg2.extras import RealDictCursor

# Attempt to import pgvector's psycopg2 integration.
# When the Python package cannot be imported, a stand-in with the same name is
# defined so the rest of the cell can still be executed.
try:
    from pgvector.psycopg2 import register_vector
except ImportError:
    print("WARNING: pgvector.psycopg2 not found. Ensure pgvector is installed and compiled.")
    # Define a mock if pgvector is not available for pure conceptual run
    def register_vector(conn):
        """Stand-in for pgvector's register_vector: prints a notice and leaves `conn` untouched."""
        print("Mock: pgvector.psycopg2.register_vector called.")

# --- 1. Data Representation (Conceptual Classes/Dictionaries) ---
class UserContext:
    """Pairs a user's long-term history with the data of the current session."""

    def __init__(self, long_term_data: Any, session_data: Any):
        # long_term_data: e.g., list of past interactions, text summaries
        # session_data:   e.g., list of recent interactions, current query
        self.long_term_data, self.session_data = long_term_data, session_data

class Item:
    """A candidate item: an identifier plus a free-form metadata dictionary."""

    def __init__(self, item_id: str, metadata: Dict[str, Any]):
        # metadata example: {'title': 'Dasein Hobo Handbag', 'description': 'vegan leather, checkered'}
        self.item_id, self.metadata = item_id, metadata

# --- 2. Configuration for Agent ---
class AgentConfig:
    """Static settings shared by the agents."""
    # Model name used as the default `model_name` of actual_llm_call.
    LLM_MODEL_NAME: str = "gemini-2.5-flash"

# --- 3. Google Colab / Gemini API Imports and Configuration ---
# Mock the `google.generativeai` module if not truly installed, for conceptual run
class MockGenAIModel:
    """
    Offline stand-in for a generative model, exposing `generate_content`.

    The canned reply is chosen from key phrases found in the prompt. Matching
    is case-insensitive and covers both the original trigger phrases and the
    wording of the prompts built by the agents in this notebook, so a mocked
    run exercises each agent's parsing path instead of always receiving the
    generic default reply.
    """

    def __init__(self, model_name):
        self.model_name = model_name

    def generate_content(self, prompt, generation_config):
        """
        Return a MockResponse whose `.text` is the canned reply for `prompt`.

        `generation_config` is accepted for signature compatibility and ignored.
        """
        lowered = prompt.lower()

        # User-understanding prompt.
        if "generic interests" in lowered:
            return MockResponse("The user shows interest in women's fashion, especially vegan leather accessories with checkered design, with a focus on stylish, functional handbags, based on their Browse history.")

        # NLI prompt: the score depends on which known item the prompt mentions
        # (item names stay case-sensitive, as before).
        if "evaluate the semantic alignment" in lowered:
            if "Dasein Hobo Handbag" in prompt: return MockResponse("Score: 0.85 (Good Match)")
            if "BUTIED Checkered Tote Shoulder Handbag" in prompt: return MockResponse("Score: 0.98 (Excellent Match)")
            if "Leather Belt" in prompt: return MockResponse("Score: 0.1 (No Match)")
            return MockResponse("Score: 0.7 (Aligned)")

        # Context-summary prompt.
        if ("summarize the following textual metadata" in lowered
                or "metadata of the following items" in lowered):
            return MockResponse("Documents highlight PU vegan leather bags with checkered patterns—mainly totes, crossbody, and shoulder styles from BUTIED, GOWELL, and RICHPORTS offer stylish, versatile designs for various occasions.")

        # Ranking prompt. The ids are illustrative; ids that match no candidate
        # are ignored by the ranker agent.
        if "rank the items" in lowered or "rank the candidate items" in lowered:
            return MockResponse("['item_2', 'item_1', 'item_4']") # BUTIED, Dasein, Women's Large Tote (conceptual ranking)

        return MockResponse("Conceptual LLM response from MockGenAIModel.")

class MockResponse:
    """Minimal response object: carries the reply in `.text`."""

    def __init__(self, text):
        self.text = text

GOOGLE_API_KEY = None
# Bind `genai` up front: later code tests `if genai ...`, which must evaluate to
# False (mock fallback) rather than raise NameError when the SDK is never imported.
genai = None
try:
    # Only works in Google Colab, where `userdata` gives access to notebook secrets.
    from google.colab import userdata
    GOOGLE_API_KEY = userdata.get('GEMINI')
    import google.generativeai as genai
    print("Google Generative AI configured successfully using Colab Secrets.")
except Exception:
    # Broad on purpose: besides ImportError (not in Colab, or SDK missing), the
    # secret lookup reports a missing or inaccessible secret with its own
    # exception types, which are not KeyError.
    print("Not running in Google Colab or 'GEMINI' secret not found. Attempting to get 'GEMINI' environment variable.")
    GOOGLE_API_KEY = os.getenv('GEMINI')
    # Outside Colab the SDK is only needed when a key is actually available.
    if GOOGLE_API_KEY:
        try:
            import google.generativeai as genai
        except ImportError:
            print("WARNING: 'google-generativeai' library not found. Gemini API calls will be mocked.")
            genai = None # Set to None if not found

if not GOOGLE_API_KEY:
    print("WARNING: GEMINI API Key not found. Gemini API calls will be mocked or fail if a real call is attempted.")
elif genai: # Only configure if the module was successfully imported
    genai.configure(api_key=GOOGLE_API_KEY)
    print("Gemini API configured for real calls.")
else:
    print("Gemini API key found, but 'google-generativeai' library not available. Gemini API calls will be mocked.")

# --- OpenAI Client Initialization ---
# `colab_env` is imported only for its side effects; presumably it exposes
# OPENAI_API_KEY to os.environ — TODO confirm.
import colab_env
from openai import OpenAI

OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")
try:
    if OPENAI_API_KEY:
        openai_client = OpenAI(api_key=OPENAI_API_KEY)
        print("OpenAI client initialized.")
    else:
        raise ValueError("OPENAI_API_KEY not found in environment variables. Please set it.")
except ValueError as e:
    print(f"ERROR: {e}. OpenAI client not initialized. Embedding calls will use a conceptual placeholder.")
    # A falsy client makes get_embedding switch to its placeholder vectors.
    openai_client = None

# --- Database Connection and Setup ---
# Connection settings for the local PostgreSQL instance; every helper below
# passes these to psycopg2.connect.
# NOTE(review): the password is hard-coded; fine for a throwaway local demo only.
DB_NAME = "postgres"
DB_USER = "postgres"
DB_PASS = "postgres"
DB_HOST = "localhost"
DB_PORT = "5432"

def setup_database():
    """
    Ensure the pgvector extension and the 'embeddings' table exist.

    Opens a short-lived autocommit connection, runs the two idempotent DDL
    statements and closes the connection again.

    Raises:
        Exception: whatever psycopg2 raised, after printing a short message.
            The error is re-raised instead of calling exit(), so the failure
            stops the caller at this point with its traceback intact.
    """
    conn = None
    try:
        conn = psycopg2.connect(
            database=DB_NAME,
            user=DB_USER,
            password=DB_PASS,
            host=DB_HOST,
            port=DB_PORT
        )
        conn.autocommit = True
        cur = conn.cursor()

        print("Ensuring pgvector extension is created...")
        cur.execute("CREATE EXTENSION IF NOT EXISTS vector;")
        print("pgvector extension checked/created.")

        # IF NOT EXISTS: an already existing table is left untouched.
        print("Creating 'embeddings' table if it doesn't exist...")
        table_create_command = """
        CREATE TABLE IF NOT EXISTS embeddings (
                    id BIGSERIAL PRIMARY KEY,
                    title TEXT,
                    url TEXT,
                    content TEXT NOT NULL,
                    tokens INTEGER,
                    embedding VECTOR(1536) NOT NULL
                    );
        """
        cur.execute(table_create_command)
        print("Table 'embeddings' checked/created.")

        cur.close()

    except Exception as e:
        print(f"Error during database initial setup: {e}")
        raise
    finally:
        # Runs on success and on failure, so the connection is never leaked.
        if conn:
            conn.close()

    print("Database setup complete.")

# Run the database setup once at the start of the script's execution
setup_database()

# --- OpenAI Embedding Function ---
def get_embedding(text: str, model: str = "text-embedding-3-small"):
    """
    Embed `text` and return the vector as a numpy array.

    Newlines are replaced by spaces first. Without an OpenAI client a random
    1536-dimensional placeholder is returned instead. When the API call
    fails, the error is printed and None is returned.
    """
    cleaned = text.replace("\n", " ")

    # No client configured: fall back to a placeholder vector.
    if not openai_client:
        print(f"OpenAI client not initialized. Returning conceptual embedding for: '{cleaned[:50]}...'")
        return np.random.rand(1536) # Conceptual embedding if OpenAI is not configured

    try:
        api_reply = openai_client.embeddings.create(input=[cleaned], model=model)
        return np.array(api_reply.data[0].embedding)
    except Exception as err:
        print(f"Error getting embedding for text: '{cleaned[:50]}...'. Error: {err}")
        return None

# --- PostgreSQL Interaction Functions ---
def store_document_embedding(title: str, url: str, content: str, tokens: int):
    """
    Embed `content` and insert it with its metadata into 'embeddings'.

    Skips the insert when no embedding is available. Database errors are
    printed, not raised, and the connection is closed in every case.
    """
    doc_vector = get_embedding(content)
    if doc_vector is None:
        print(f"Skipping storage for content (title: {title}) due to embedding error.")
        return

    insert_sql = """
            INSERT INTO embeddings (title, url, content, tokens, embedding)
            VALUES (%s, %s, %s, %s, %s)
            """
    connection = None
    try:
        connection = psycopg2.connect(
            database=DB_NAME, user=DB_USER, password=DB_PASS, host=DB_HOST, port=DB_PORT
        )
        # The vector adapter has to be registered on every new connection.
        register_vector(connection)
        cursor = connection.cursor()
        cursor.execute(insert_sql, (title, url, content, tokens, doc_vector))
        connection.commit()
        print(f"Stored document: '{title}' (content: '{content[:50]}...')")
    except Exception as err:
        print(f"Error storing document '{title}': {err}")
    finally:
        if connection is not None:
            connection.close()

def search_similar_documents(query_text: str, top_k: int = 3):
    """
    Return the `top_k` rows of 'embeddings' nearest to `query_text`.

    Rows are dicts with 'title', 'url', 'content' and 'distance' (the value of
    `embedding <-> query`), ordered by increasing distance. An empty list is
    returned when the query cannot be embedded or the database call fails.
    """
    query_vector = get_embedding(query_text)
    if query_vector is None:
        print(f"Skipping search for query: '{query_text}' due to embedding error.")
        return []

    nearest_sql = """
            SELECT title, url, content, embedding <-> %s AS distance
            FROM embeddings
            ORDER BY distance
            LIMIT %s
            """
    hits = []
    connection = None
    try:
        connection = psycopg2.connect(
            database=DB_NAME, user=DB_USER, password=DB_PASS, host=DB_HOST, port=DB_PORT
        )
        register_vector(connection)
        # RealDictCursor yields rows as dictionaries instead of tuples.
        dict_cursor = connection.cursor(cursor_factory=RealDictCursor)
        dict_cursor.execute(nearest_sql, (query_vector, top_k))
        hits = dict_cursor.fetchall()
        print(f"Found {len(hits)} relevant documents for query: '{query_text[:50]}...'")
    except Exception as err:
        print(f"Error searching documents for query '{query_text}': {err}")
    finally:
        if connection is not None:
            connection.close()
    return hits

# --- Core LLM Call Function (using Gemini configuration) ---
def actual_llm_call(prompt: str, model_name: str = AgentConfig.LLM_MODEL_NAME, temperature: float = 0.0) -> str:
    """
    Send `prompt` to the configured Gemini model, or to the mock when Gemini is unavailable.

    Args:
        prompt: Full prompt text.
        model_name: Model to instantiate (also passed to the mock).
        temperature: Sampling temperature forwarded via `generation_config`.

    Returns:
        The response text, or "Error: LLM call failed." when the call raised
        (the error is printed, never propagated).
    """
    print(f"Calling LLM ({model_name}) with prompt snippet: '{prompt[:70]}...'")

    try:
        # Look the SDK up defensively: the configuration code can leave `genai`
        # unbound (no Colab and no API key). A NameError here would be caught by
        # the handler below and reported as a failed call instead of using the mock.
        gemini_sdk = globals().get("genai")
        if gemini_sdk and GOOGLE_API_KEY:
            model = gemini_sdk.GenerativeModel(model_name)
        else:
            model = MockGenAIModel(model_name)

        # Real and mock models share the same generate_content interface.
        response = model.generate_content(prompt, generation_config={"temperature": temperature})
        return response.text

    except Exception as e:
        print(f"Error during LLM call: {e}")
        return "Error: LLM call failed."

# --- Helper function for cosine similarity (now defined before ARAG_Framework) ---
def conceptual_cosine_similarity(embedding1: np.ndarray, embedding2: np.ndarray) -> float:
    """
    Cosine similarity of two embedding vectors.

    Returns 0.0 when either vector has zero length, otherwise the dot product
    divided by the product of the two Euclidean norms.
    """
    magnitudes = [np.linalg.norm(vector) for vector in (embedding1, embedding2)]
    if any(magnitude == 0 for magnitude in magnitudes):
        return 0.0
    return np.dot(embedding1, embedding2) / (magnitudes[0] * magnitudes[1])

# --- ARAG Agents (using `actual_llm_call` for Gemini) ---

class UserUnderstandingAgent:
    """Asks the LLM for a summary of the user's interests, preferences and goals."""

    def __init__(self, llm_caller=actual_llm_call):
        self.llm_caller = llm_caller

    def generate_summary(self, user_context: UserContext) -> str:
        """Build the user-understanding prompt from `user_context` and return the LLM's reply."""
        prompt_parts = [
            "Analyze the user's long-term behavioral data and current session interactions.\n",
            f"Long-term data: {user_context.long_term_data}\n",
            f"Current session data: {user_context.session_data}\n",
            "Identify and summarize the user's core generic interests, specific product preferences (e.g., categories, materials, styles), ",
            "and their likely immediate goals or intent based on the current session. ",
            "Format the summary clearly, separating generic interests, specific preferences, and immediate goals.",
        ]
        return self.llm_caller("".join(prompt_parts))

class NLI_Agent:
    """Scores how well a single item matches the user's context, using the LLM."""

    def __init__(self, llm_caller=actual_llm_call):
        self.llm_caller = llm_caller

    def evaluate_alignment(self, item: Item, user_context: UserContext) -> float:
        """
        Ask the LLM for an alignment score for `item` and parse it.

        The reply is expected to contain 'Score:' followed by a number.
        Returns that number, or 0.5 (neutral) when no score can be extracted.
        """
        import re  # local import: only this parser needs it

        prompt = (
            "Given the user's context and an item's metadata, evaluate the semantic alignment.\n"
            f"User context (summary of interests and goals): {user_context.long_term_data} and {user_context.session_data}\n"
            f"Item metadata: {item.metadata}\n"
            "Based on the item's attributes and description, determine how well it aligns with the user's preferences and current intent. "
            "Consider factors like category, material, style, function, and any explicit preferences mentioned in the user context. "
            "Provide a semantic alignment score between 0.0 (no alignment) and 1.0 (perfect alignment). "
            "Your response must start with 'Score:' followed by the numerical score, e.g., 'Score: 0.92'."
        )
        response = self.llm_caller(prompt)
        try:
            after_marker = response.split("Score:")[1]
            first_token = after_marker.strip().split(" ")[0]
            try:
                # Fast path: the token right after the marker is a clean number.
                return float(first_token)
            except ValueError:
                # Tolerate common deviations such as "Score: 0.92\nReason...",
                # "Score: **0.9**" or "Score: 0.9." by reading only the number
                # that directly follows the marker (after whitespace/markdown).
                number = re.match(
                    r"[\s*_`]*([-+]?(?:\d+\.?\d*|\.\d+)(?:[eE][-+]?\d+)?)",
                    after_marker,
                )
                if number is None:
                    raise
                return float(number.group(1))
        except (IndexError, ValueError):
            print(f"Warning: Could not extract score from NLI response: {response}")
            return 0.5 # Default to a neutral score on parsing failure

class ContextSummaryAgent:
    """Condenses the metadata of the accepted items into one summary via the LLM."""

    def __init__(self, llm_caller=actual_llm_call):
        self.llm_caller = llm_caller

    def summarize_context(self, accepted_items: List[Item]) -> str:
        """
        Return the LLM's summary of `accepted_items`.

        With no accepted items a fixed message is returned and the LLM is not called.
        """
        if not accepted_items:
            return "No relevant context items were accepted."

        # One bullet line per item, holding its raw metadata dict.
        bullet_lines = "\n".join(f"- {accepted.metadata}" for accepted in accepted_items)
        prompt_parts = [
            "Analyze the metadata of the following items that have been identified as semantically aligned with the user's intent:\n",
            f"{bullet_lines}\n",
            "Synthesize the key themes, product attributes, categories, and styles present across these items. ",
            "Provide a concise summary that captures the common characteristics and overall profile of these relevant items. ",
            "This summary should help in the final ranking by highlighting what the user is currently interested in based on aligned items.",
        ]
        return self.llm_caller("".join(prompt_parts))

class ItemRankerAgent:
    """Asks the LLM to order candidate items by purchase likelihood."""

    def __init__(self, llm_caller=actual_llm_call):
        self.llm_caller = llm_caller

    def rank_items(self, s_user: str, s_ctx: str, candidate_items: List[Item]) -> List[Item]:
        """
        Rank `candidate_items` using the user summary and the context summary.

        The LLM must answer with a Python list of item ids. Ids that match no
        candidate are ignored, repeated ids are used once, and candidates the
        LLM left out are appended in their original order. If the answer cannot
        be parsed, the candidates are returned unchanged.
        """
        import ast  # local imports: only this parser needs them
        import re

        if not candidate_items:
            return []

        item_descriptions = "\n".join([f"- Item ID: {item.item_id}, Metadata: {item.metadata}" for item in candidate_items])
        prompt = (
            "Given the user's summarized preferences and goals, the synthesized context from items already deemed relevant, and a list of candidate items,\n"
            f"User Preferences Summary: {s_user}\n"
            f"Context Summary of Aligned Items: {s_ctx}\n"
            "Candidate Items to Rank:\n"
            f"{item_descriptions}\n\n"
            "Carefully consider the user's detailed interests (from S_user) and the common features of the aligned items (from S_ctx). "
            "Evaluate each candidate item's metadata against these summaries to determine its potential relevance and appeal to the user *right now*. "
            "Rank the Candidate Items from most likely to be purchased to least likely. "
            "Your response MUST be a Python list of only the Item IDs in the ranked order. Example: ['item_id_1', 'item_id_5', 'item_id_2']"
        )
        ranked_list_str = self.llm_caller(prompt)
        try:
            # Models often wrap the list in a code fence or a sentence, so pull out
            # the bracketed part first.
            bracketed = re.search(r"\[.*\]", ranked_list_str, re.DOTALL)
            literal_text = bracketed.group(0) if bracketed else ranked_list_str.strip()

            # literal_eval accepts Python literals only, so model output is parsed
            # but never executed (unlike eval).
            ranked_item_ids = ast.literal_eval(literal_text)
            if not isinstance(ranked_item_ids, list):
                 raise ValueError("LLM response is not a list.")

            # Create a mapping from item_id to Item object for efficient lookup
            item_map = {item.item_id: item for item in candidate_items}

            # Ranked items first: skip unknown ids and use each id only once.
            final_ranked_items = []
            placed_ids = set()
            for item_id in ranked_item_ids:
                if item_id in item_map and item_id not in placed_ids:
                    final_ranked_items.append(item_map[item_id])
                    placed_ids.add(item_id)

            # Candidates the LLM did not mention follow in their original order,
            # so every candidate is part of the result.
            final_ranked_items.extend(
                item for item in candidate_items if item.item_id not in placed_ids
            )

            return final_ranked_items

        except Exception as e:
            print(f"Warning: Could not parse or process ranked list from LLM response ({e}). Response was: {ranked_list_str}. Returning original candidates.")
            return candidate_items # Return original list as a fallback




# --- ARAG Framework (Orchestration) ---

class ARAG_Framework:
    """
    Coordinates retrieval and the four agents to produce a ranked recommendation list.
    """

    def __init__(self, embedding_func=get_embedding,
                 similarity_func=conceptual_cosine_similarity,
                 llm_caller=actual_llm_call,
                 nli_threshold: float = 0.7):
        self.embedding_func = embedding_func
        self.similarity_func = similarity_func
        self.nli_threshold = nli_threshold

        # All agents share the same LLM caller.
        self.user_understanding_agent = UserUnderstandingAgent(llm_caller)
        self.nli_agent = NLI_Agent(llm_caller)
        self.context_summary_agent = ContextSummaryAgent(llm_caller)
        self.item_ranker_agent = ItemRankerAgent(llm_caller)

    def recommend(self, user_context: UserContext, all_candidate_items: List[Item], top_k_initial_retrieval: int = 100) -> List[Item]:
        """
        Run the full pipeline and return the ranked items.

        Steps: (1) embed the user context and every candidate, keep the
        `top_k_initial_retrieval` most similar candidates; (2) summarize the
        user and keep the recalled items whose NLI score reaches the
        threshold; (3) summarize the accepted items; (4) let the ranker order
        the recalled items using both summaries.
        """
        print("\n--- ARAG Recommendation Process Started ---")

        print("\n1. Initial Retrieval (Cosine Similarity-based RAG)")
        profile_text = user_context.long_term_data + " " + user_context.session_data
        user_embedding = self.embedding_func(profile_text)
        scored_candidates = []
        for candidate in all_candidate_items:
            # Missing 'title' / 'description' keys fall back to empty strings.
            meta = candidate.metadata
            candidate_embedding = self.embedding_func(meta.get('title', '') + " " + meta.get('description', ''))
            if user_embedding is None or candidate_embedding is None:
                print(f"Skipping similarity for item {candidate.item_id} due to missing embeddings.")
                continue
            scored_candidates.append((candidate, self.similarity_func(candidate_embedding, user_embedding)))

        # Highest similarity first, then cut to the requested recall size.
        scored_candidates.sort(key=lambda pair: pair[1], reverse=True)
        initial_recall_set_items = [pair[0] for pair in scored_candidates[:top_k_initial_retrieval]]
        print(f"Initial recall set size: {len(initial_recall_set_items)}")

        print("\n2. Parallel Agent Execution (User Understanding & NLI)")
        s_user = self.user_understanding_agent.generate_summary(user_context)
        print(f"User Understanding Agent Summary (S_user): {s_user}")

        # Keep the recalled items whose alignment score reaches the threshold.
        accepted_items = [
            recalled for recalled in initial_recall_set_items
            if self.nli_agent.evaluate_alignment(recalled, user_context) >= self.nli_threshold
        ]
        print(f"NLI Agent filtered {len(initial_recall_set_items) - len(accepted_items)} items. Accepted: {len(accepted_items)}")

        print("\n3. Context Summary Agent")
        s_ctx = self.context_summary_agent.summarize_context(accepted_items)
        print(f"Context Summary Agent (S_ctx): {s_ctx}")

        print("\n4. Item Ranker Agent")
        # The ranker receives the whole recall set, not only the accepted items.
        final_ranked_list = self.item_ranker_agent.rank_items(s_user, s_ctx, initial_recall_set_items)
        print("--- ARAG Recommendation Process Finished ---")

        return final_ranked_list

# --- Conceptual Usage Example ---

# Demo: ingest sample documents, run the ARAG pipeline, then query pgvector directly.
if __name__ == "__main__":
    print("\n--- Data Ingestion Example ---")
    # Sample catalogue; 'tokens' values are stored as given.
    documents_to_ingest = [
    {"title": "Dasein Hobo Handbag", "url": "url_hobo", "content": "Classic hobo style, made of high-quality vegan leather. Perfect for everyday use.", "tokens": 20},
    {"title": "BUTIED Checkered Tote Shoulder Handbag", "url": "url_butied", "content": "A stylish and functional tote bag featuring a unique checkered pattern, crafted from durable PU vegan leather. Ideal for various occasions.", "tokens": 30},
    {"title": "GOWELL Checkered Tote", "url": "url_gowell", "content": "Spacious tote bag with a classic checkered design, vegan-friendly material. Great for carrying essentials.", "tokens": 25},
    {"title": "Women's Large Tote", "url": "url_large_tote", "content": "A basic large tote bag made of synthetic material. Simple design for casual outings.", "tokens": 18},
    {"title": "Leather Belt", "url": "url_belt", "content": "Genuine cowhide leather belt, available in various sizes and colors. Durable and classic.", "tokens": 20},
    {"title": "Casual Pants", "url": "url_pants", "content": "Comfortable cotton blend casual pants for everyday wear. Relaxed fit.", "tokens": 15},
    {"title": "Stylish Canvas Backpack", "url": "url_backpack1", "content": "Durable canvas backpack with multiple compartments, ideal for school or travel.", "tokens": 25},
    {"title": "Minimalist Leather Wallet", "url": "url_wallet1", "content": "Slim genuine leather wallet with RFID blocking technology, holds up to 8 cards.", "tokens": 22},
    {"title": "Wireless Bluetooth Earbuds", "url": "url_earbuds1", "content": "High-fidelity wireless earbuds with noise cancellation and 24-hour battery life.", "tokens": 30},
    {"title": "Ergonomic Office Chair", "url": "url_chair1", "content": "Adjustable ergonomic chair with lumbar support and mesh back for comfort during long hours.", "tokens": 35},
    {"title": "Stainless Steel Water Bottle", "url": "url_bottle1", "content": "Insulated stainless steel water bottle keeps drinks cold for 24 hours or hot for 12.", "tokens": 28},
    {"title": "Yoga Mat Non-Slip", "url": "url_yogamat1", "content": "Eco-friendly TPE yoga mat with excellent grip for all types of yoga and Pilates.", "tokens": 20},
    {"title": "Smart Home Hub", "url": "url_smarthub1", "content": "Central hub to control all your smart home devices, compatible with multiple protocols.", "tokens": 32},
    {"title": "Portable External Hard Drive", "url": "url_hdd1", "content": "1TB USB 3.0 portable external hard drive, fast data transfer speeds.", "tokens": 25},
    {"title": "Digital Kitchen Scale", "url": "url_scale1", "content": "High-precision digital kitchen scale with tare function, measures up to 5kg.", "tokens": 23},
    {"title": "Beginner Acoustic Guitar Kit", "url": "url_guitar1", "content": "Full-size acoustic guitar kit including gig bag, picks, strap, and tuner.", "tokens": 30},
    {"title": "Running Shoes Men's", "url": "url_shoes1", "content": "Lightweight and breathable running shoes for men, provides excellent cushioning and support.", "tokens": 28},
    {"title": "Coding for Beginners Book", "url": "url_book1", "content": "An introductory guide to programming with Python, perfect for absolute beginners.", "tokens": 25},
    {"title": "Indoor Plant Set (3)", "url": "url_plant1", "content": "Set of three easy-care indoor plants to beautify your living space.", "tokens": 20},
    {"title": "Resistance Band Set", "url": "url_bands1", "content": "Set of 5 resistance bands with varying levels of tension for strength training and physical therapy.", "tokens": 35},
    {"title": "Noise Cancelling Headphones", "url": "url_headphones1", "content": "Over-ear noise cancelling headphones with superior sound quality and comfortable fit.", "tokens": 30},
    {"title": "Desk Lamp with Wireless Charger", "url": "url_desklamp1", "content": "Modern desk lamp with adjustable brightness and integrated wireless phone charger.", "tokens": 32},
    {"title": "Travel Pillow Memory Foam", "url": "url_travelpillow1", "content": "Ergonomic memory foam travel pillow for comfortable sleep on flights or road trips.", "tokens": 25},
    {"title": "Art Drawing Kit", "url": "url_artkit1", "content": "Comprehensive art drawing kit including pencils, charcoal, erasers, and sketchpad.", "tokens": 28},
    {"title": "Electric Kettle Fast Boil", "url": "url_kettle1", "content": "1.7L electric kettle with fast boiling feature and automatic shut-off.", "tokens": 22},
    {"title": "Board Game Strategy", "url": "url_boardgame1", "content": "A popular strategy board game for 2-4 players, challenging and engaging gameplay.", "tokens": 26},
]

    # One embedding request and one INSERT per document. store_document_embedding
    # always inserts, so re-running this block adds the same rows again.
    for doc in documents_to_ingest:
        store_document_embedding(doc["title"], doc["url"], doc["content"], doc["tokens"])
    print("--- Data Ingestion Complete ---")

    # Both context fields are plain strings; recommend() concatenates them with " ".
    user_long_term = "User has a history of purchasing vegan leather handbags, specifically tote and crossbody styles. Likes checkered patterns."
    user_session = "Recently viewed several stylish, functional handbags."
    user_context = UserContext(user_long_term, user_session)

    # Convert ingested documents into Item objects for the ARAG framework
    # (the document title doubles as the item id).
    all_arag_items = [
        Item(doc["title"], {"title": doc["title"], "description": doc["content"]})
        for doc in documents_to_ingest
    ]

    # Passing len(all_arag_items) as the recall size means the top-k cut drops nothing.
    arag = ARAG_Framework()
    recommended_items = arag.recommend(user_context, all_arag_items, top_k_initial_retrieval=len(all_arag_items))

    print("\nConceptual Final Recommended Items (Ordered):")
    for item in recommended_items:
        print(f"- {item.item_id}: {item.metadata['title']}")

    print("\n--- Direct Search Example from PGVector ---")
    query = "stylish vegan leather checkered tote bag"
    search_results = search_similar_documents(query, top_k=2)
    # Each result row carries the 'distance' column selected by search_similar_documents.
    for res in search_results:
        print(f"Title: {res['title']}, Content: {res['content'][:70]}..., Distance: {res['distance']:.4f}")

Google Generative AI configured successfully using Colab Secrets.
Gemini API configured for real calls.
OpenAI client initialized.
Ensuring pgvector extension is created...
pgvector extension checked/created.
Creating 'embeddings' table if it doesn't exist...
Table 'embeddings' checked/created.
Database setup complete.

--- Data Ingestion Example ---
Stored document: 'Dasein Hobo Handbag' (content: 'Classic hobo style, made of high-quality vegan lea...')
Stored document: 'BUTIED Checkered Tote Shoulder Handbag' (content: 'A stylish and functional tote bag featuring a uniq...')
Stored document: 'GOWELL Checkered Tote' (content: 'Spacious tote bag with a classic checkered design,...')
Stored document: 'Women's Large Tote' (content: 'A basic large tote bag made of synthetic material....')
Stored document: 'Leather Belt' (content: 'Genuine cowhide leather belt, available in various...')
Stored document: 'Casual Pants' (content: 'Comfortable cotton blend casual pants for everyday...')
Store