# Environment Setup

In [1]:
# !rm -f "/content/drive/MyDrive/Colab Notebooks/bm25_index.pkl"
# !rm -f "/content/drive/MyDrive/Colab Notebooks/processed_metadata.parquet"

In [2]:
!pip install sentence-transformers pinecone transformers pandas torch rank_bm25 nltk --quiet
!pip install nest_asyncio --quiet
!pip install -q -U bitsandbytes  # Required for 4-bit quantization
# !pip install flash-attn --no-build-isolation
!pip install -U transformers

[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m421.9/421.9 kB[0m [31m20.2 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m363.4/363.4 MB[0m [31m1.3 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m13.8/13.8 MB[0m [31m108.5 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m24.6/24.6 MB[0m [31m84.1 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m883.7/883.7 kB[0m [31m54.4 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m664.8/664.8 MB[0m [31m1.2 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m211.5/211.5 MB[0m [31m5.5 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m56.3/56.3 MB[0m [31m13.4 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━

In [3]:
!nvidia-smi  # Check GPU memory
!free -h     # Check RAM usage

Wed Apr  2 13:01:27 2025       
+-----------------------------------------------------------------------------------------+
| NVIDIA-SMI 550.54.15              Driver Version: 550.54.15      CUDA Version: 12.4     |
|-----------------------------------------+------------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id          Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |           Memory-Usage | GPU-Util  Compute M. |
|                                         |                        |               MIG M. |
|   0  Tesla T4                       Off |   00000000:00:04.0 Off |                    0 |
| N/A   46C    P8              9W /   70W |       0MiB /  15360MiB |      0%      Default |
|                                         |                        |                  N/A |
+-----------------------------------------+------------------------+----------------------+
                                                

#Configuration & Imports


In [4]:
import pandas as pd
import numpy as np
import torch
import re
import ast
import nltk
import asyncio
import nest_asyncio
import gc
import time
import joblib
import os
from datetime import datetime
from pinecone import Pinecone, ServerlessSpec
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
from sentence_transformers import SentenceTransformer, CrossEncoder
from rank_bm25 import BM25Okapi
# from ragas.metrics import faithfulness, answer_relevance, context_precision
# from ragas import evaluate

In [5]:
# Free up memory before starting
torch.cuda.empty_cache()
gc.collect()

CONFIG = {
    # Core components
    "index_name": "movie-rag-final-final",
    "embedding_model": "sentence-transformers/all-mpnet-base-v2",
    "reranker": "cross-encoder/ms-marco-MiniLM-L-6-v2",
    "llm_model": "microsoft/Phi-3-mini-4k-instruct",

    # Processing parameters
    "chunk_size": 600,
    "chunk_overlap": 100,
    "top_k_retrieve": 50,
    "top_k_final": 12,

    # Response generation
    "max_response_tokens": 800,
    "min_imdb_rating": 6.5,
    "default_year_range": 10,

    # System settings
    "pinecone_env": "us-east1-gcp",
    "upload_batch_size": 150,
    "rate_limit_delay": 12,  # Seconds between Pinecone batches

    "bm25_path": "/content/drive/MyDrive/Colab Notebooks/bm25_index.pkl",
    "metadata_path": "/content/drive/MyDrive/Colab Notebooks/processed_metadata.parquet",
}

# Hardware configuration
device = "cuda" if torch.cuda.is_available() else "cpu"
quantization_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.float16
)

#Devices and Models

In [6]:
nltk.download(['stopwords', 'punkt', 'punkt_tab'], quiet=True)
reranker = CrossEncoder(CONFIG["reranker"], max_length=512).to(device)
tokenizer = AutoTokenizer.from_pretrained(CONFIG["llm_model"])
tokenizer.add_special_tokens({'pad_token': '[PAD]'})

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/794 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/316 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/3.44k [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/500k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.94M [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/306 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/599 [00:00<?, ?B/s]

1

In [7]:
# Model Loading with Error Recovery
try:
    model = AutoModelForCausalLM.from_pretrained(
        CONFIG["llm_model"],
        device_map="auto",
        torch_dtype=torch.float16,
        trust_remote_code=True,
        use_cache=False  # Disable cache at load time
    )
    model.eval()
except Exception as e:
    print(f"Model loading failed: {str(e)}")
    raise

del quantization_config
gc.collect()
torch.cuda.empty_cache()

config.json:   0%|          | 0.00/967 [00:00<?, ?B/s]

configuration_phi3.py:   0%|          | 0.00/11.2k [00:00<?, ?B/s]

A new version of the following files was downloaded from https://huggingface.co/microsoft/Phi-3-mini-4k-instruct:
- configuration_phi3.py
. Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.


modeling_phi3.py:   0%|          | 0.00/73.2k [00:00<?, ?B/s]

A new version of the following files was downloaded from https://huggingface.co/microsoft/Phi-3-mini-4k-instruct:
- modeling_phi3.py
. Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.


model.safetensors.index.json:   0%|          | 0.00/16.5k [00:00<?, ?B/s]

Fetching 2 files:   0%|          | 0/2 [00:00<?, ?it/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/4.97G [00:00<?, ?B/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/2.67G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/181 [00:00<?, ?B/s]

#Data Preprocessing Pipeline


In [8]:
# Data Processing Pipeline
def clean_text(text):
    """Sanitize text input with robust error handling"""
    try:
        text = str(text)
        # Remove HTML, URLs, and special characters
        text = re.sub(r'<[^>]+>|http\S+|[^a-zA-Z\s]', '', text)
        return text.lower().strip()[:2000]  # Limit to prevent OOM
    except Exception:
        return ""

def process_data(df):
    """Transform raw data into structured chunks with metadata"""
    df = df.copy()

    # Convert stringified lists to actual lists
    list_fields = ['genres', 'cast']
    for field in list_fields:
        df[field] = df[field].apply(
            lambda x: ast.literal_eval(str(x)) if pd.notnull(x) else [])

    # Calculate review helpfulness metrics
    df[['helpful_votes', 'total_votes']] = (
        df['helpful'].str.split('/', expand=True)
        .apply(pd.to_numeric, errors='coerce'))
    df['helpful_ratio'] = np.where(
        df['total_votes'] > 0,
        df['helpful_votes'] / df['total_votes'],
        0.0
    )

    # Text preprocessing
    df['clean_text'] = df['review_text'].apply(clean_text)

    # Generate chunks with full metadata
    chunks = []
    for _, row in df.iterrows():
        words = row['clean_text'].split()
        for i in range(0, len(words), CONFIG['chunk_size'] - CONFIG['chunk_overlap']):
            chunk = ' '.join(words[i:i+CONFIG['chunk_size']])

            chunks.append({
                "text": chunk,
                "title": str(row['title']),
                "year": int(row['year']) if pd.notnull(row['year']) else 0,
                "genres": ', '.join(row['genres']),
                "imdb_rating": float(row['imdb_rating']) if pd.notnull(row['imdb_rating']) else 0.0,
                "director": "Unknown" if pd.isnull(row['director']) else str(row['director']),
                "cast": ', '.join(row['cast']),
                "user_rating": float(row['user_rating']) if pd.notnull(row['user_rating']) else 0.0,
                "helpful_ratio": round(row['helpful_ratio'], 2),
                "raw_helpful": f"{int(row['helpful_votes'])}/{int(row['total_votes'])}"
            })

    return pd.DataFrame({"metadata": chunks})

#Load and process data

#Vector Store Initialization


In [9]:
def initialize_pinecone():
    pc = Pinecone(api_key="pcsk_8dnpj_HxyVoDGjVXjKKezSLxujhADXAtEJ7UPiBcPTBo9TcgkjhR8QbBihsRp6viwGwv1")

    if CONFIG["index_name"] not in pc.list_indexes().names():
        print("Creating new index...")
        encoder = SentenceTransformer(CONFIG["embedding_model"])
        pc.create_index(
            name=CONFIG["index_name"],
            dimension=encoder.get_sentence_embedding_dimension(),
            metric="cosine",
            spec=ServerlessSpec(cloud="aws", region="us-east-1")
        )
    return pc.Index(CONFIG["index_name"])

def upload_to_pinecone(index, chunks_df, start_id=0):
    """Batch upload with rate limiting and error handling"""
    encoder = SentenceTransformer(CONFIG["embedding_model"]).to(device)
    current_id = start_id

    for batch_idx in range(0, len(chunks_df), CONFIG["upload_batch_size"]):
        batch = chunks_df.iloc[batch_idx:batch_idx+CONFIG["upload_batch_size"]]
        texts = batch['metadata'].apply(lambda x: x['text']).tolist()

        try:
            # Generate embeddings with proper type handling
            with torch.no_grad(), torch.amp.autocast(device_type='cuda'):
                embeddings = encoder.encode(
                    texts,
                    batch_size=32,
                    show_progress_bar=True,
                    convert_to_tensor=True  # Ensure tensor output
                )
                # Move to CPU if needed and convert to numpy
                if isinstance(embeddings, torch.Tensor):
                    embeddings = embeddings.cpu().numpy()
                else:
                    embeddings = np.array(embeddings)

            # Prepare batch records
            records = [{
                "id": f"chunk-{current_id + idx}",
                "values": emb.tolist(),
                "metadata": {k: str(v) if not isinstance(v, (int, float)) else float(v)
                           for k, v in row['metadata'].items()}
            } for idx, ((_, row), emb) in enumerate(zip(batch.iterrows(), embeddings))]

            # Upsert with rate limiting
            index.upsert(vectors=records)
            current_id += len(records)

            print(f"Successfully uploaded batch {batch_idx//CONFIG['upload_batch_size']}")

        except Exception as e:
            print(f"Batch {batch_idx} failed: {str(e)}")
            continue

        time.sleep(CONFIG["rate_limit_delay"])

    return current_id

In [10]:
# pc = Pinecone(api_key="pcsk_4iMJMG_NbCDeiwiRz3Ln3b8fiRSVDaK7vab8TnDNyphawDtZFaM1LXzRUuWrSb7ydT771M")
# pc.delete_index(CONFIG["index_name"])

# RAG Core Functions

In [11]:
# Global dictionary to track user preferences
user_preferences = {
    "watch_history": [],
    "genre_preferences": {},
    "director_preferences": {},
    "actor_preferences": {},
    "conversation_history": []
}

def update_user_preferences(query, response, context):
    """Update user preferences based on interaction"""
    # Update genre preferences from context
    for ctx in context:
        if 'genres' in ctx:
            genres = [g.strip() for g in ctx.get('genres', '').split(',')]
            for genre in genres:
                if genre:
                    user_preferences["genre_preferences"][genre] = user_preferences["genre_preferences"].get(genre, 0) + 1

        # Add to watch history
        title = ctx.get('title', '')
        if title and title not in user_preferences["watch_history"]:
            user_preferences["watch_history"].append(title)

    # Extract the actual response content (not the system prompt)
    actual_response = response
    if "You are CinematicAI" in response and "\n\n" in response:
        # Skip the system prompt part
        actual_response = response.split("\n\n", 1)[-1].strip()

    # Add to conversation history with proper truncation
    user_preferences["conversation_history"].append({
        "query": query,
        "response": actual_response[:200]  # Store more of the response for context
    })

    # Keep the most recent 3 exchanges
    if len(user_preferences["conversation_history"]) > 3:
        user_preferences["conversation_history"] = user_preferences["conversation_history"][-3:]

In [12]:
def extract_query_filters(query):
    """Parse natural language queries into structured filters with helpfulness consideration"""
    filters = {"year": {}, "imdb_rating": {}}

    # Year extraction
    year_match = re.search(r'\b(20\d{2})s?\\b', query)
    current_year = datetime.now().year
    if year_match:
        target_year = int(year_match.group(1))
        filters["year"] = {"$gte": target_year - 2, "$lte": target_year + 2}
    else:
        filters["year"] = {"$gte": current_year - CONFIG["default_year_range"]}

    # Rating extraction
    rating_keywords = {
        'excellent': 8.5, 'great': 8.0, 'good': 7.5,
        'decent': 7.0, 'average': 6.5
    }
    for term, threshold in rating_keywords.items():
        if term in query.lower():
            filters["imdb_rating"] = {"$gte": threshold}
            break
    else:
        filters["imdb_rating"] = {"$gte": CONFIG["min_imdb_rating"]}

    # People extraction
    people = []
    if 'directed by' in query:
        people.extend(re.findall(r'directed by (\w+ \w+)', query, re.I))
    if 'starring' in query:
        people.extend(re.findall(r'starring (\w+ \w+)', query, re.I))

    if people:
        # Use simple equality match instead of unsupported $contains_any
        if len(people) == 1:
            filters["$or"] = [
                {"director": people[0]},
                {"cast": people[0]}
            ]

    # If the query mentions reviews or opinions, prioritize helpful reviews
    if any(term in query.lower() for term in ['review', 'opinion', 'liked', 'popular', 'recommended', 'highly rated']):
        filters["helpful_ratio"] = {"$gte": 0.7}

    # Detect if query is about TV shows (but without using preferences for now)
    is_tv_query = any(term in query.lower() for term in
                     ['tv', 'show', 'series', 'episode', 'season'])
    if is_tv_query:
        filters["content_type"] = "tv_show"

    return filters

In [13]:
def initialize_bm25(chunks_df):
    """Initialize BM25 index for keyword-based retrieval
    Required for hybrid search architecture
    """
    # Extract texts from all metadata entries
    texts = [row['metadata']['text'] for _, row in chunks_df.iterrows()]

    # Enhanced tokenization with stopword removal and lemmatization
    stop_words = set(stopwords.words('english'))
    tokenized = [
        [t.lower() for t in word_tokenize(text)
         if t not in stop_words and len(t) > 2]  # Filter short tokens
        for text in texts
    ]

    return BM25Okapi(tokenized)

def normalize_metadata(metadata):
    """Universal metadata format converter
    Critical for handling different Pinecone response formats
    """
    if isinstance(metadata, list):
        # Handle list format: [{'key': 'year', 'value': 2010}, ...]
        return {item.get('key', 'unknown'): item.get('value', '')
                for item in metadata}
    elif isinstance(metadata, dict):
        # Handle direct dict format
        return {k: v for k, v in metadata.items() if not k.startswith('_')}
    else:
        # Fallback for unexpected formats
        return dict(metadata)

In [14]:
async def hybrid_search(query, index, bm25_index, chunks_df):
    """Enhanced retrieval with semantic + lexical search"""
    # Query normalization
    query = re.sub(r"(\w)([A-Z][a-z])", r"\1 \2", query)
    query = re.sub(r'\s+', ' ', query).strip()

    # Extract basic filters (avoiding unsupported operators)
    filters = extract_query_filters(query)

    # Vector search with metadata filtering
    encoder = SentenceTransformer(CONFIG["embedding_model"])
    vector_results = index.query(
        vector=encoder.encode(query).tolist(),
        top_k=CONFIG["top_k_retrieve"],
        filter=filters,
        include_metadata=True
    ).matches

    # BM25 lexical search
    tokenized_query = [t.lower() for t in word_tokenize(query)]
    bm25_scores = bm25_index.get_scores(tokenized_query)
    bm25_indices = np.argsort(bm25_scores)[-CONFIG["top_k_retrieve"]:][::-1]

    # Combine results
    combined = []

    # Add vector results
    for match in vector_results:
        metadata = normalize_metadata(match.metadata)
        combined.append((metadata, match.score))

    # Add BM25 results
    for i in bm25_indices:
        metadata = chunks_df.iloc[i]["metadata"]
        combined.append((metadata, bm25_scores[i]))

    # Apply post-retrieval boosts
    boosted_combined = []
    for metadata, score in combined:
        boost = 1.0

        # Boost based on helpfulness ratio
        try:
            helpful_ratio = float(metadata.get('helpful_ratio', 0))
            boost *= (1 + helpful_ratio * 0.3)
        except (ValueError, TypeError):
            pass

        # Apply genre preference boost (if available and no errors in first query)
        if hasattr(user_preferences, "genre_preferences") and user_preferences["genre_preferences"]:
            try:
                genres = str(metadata.get('genres', '')).split(',')
                for genre in genres:
                    if genre.strip() in user_preferences["genre_preferences"]:
                        boost *= 1.1  # Small boost for preferred genres
            except:
                pass

        boosted_combined.append((metadata, score * boost))

    # Sort and return top results
    sorted_results = sorted(boosted_combined, key=lambda x: x[1], reverse=True)
    return [item[0] for item in sorted_results[:CONFIG["top_k_final"]]]

In [15]:
def extract_review_insights(context, max_reviews=3):
    """Extract meaningful insights from the most helpful reviews"""
    if not context:
        return ""

    # Sort context by helpful_ratio
    try:
        sorted_context = sorted(context,
                               key=lambda x: float(x.get('helpful_ratio', 0)),
                               reverse=True)
    except (ValueError, TypeError):
        # Fallback if sorting fails
        sorted_context = context

    # Get top reviews with high helpful ratio
    insights = []
    for review in sorted_context:
        # Only include if reviewed by multiple people and majority found it helpful
        try:
            total_votes = int(review.get('raw_helpful', '0/0').split('/')[1])
            helpful_ratio = float(review.get('helpful_ratio', 0))
        except (ValueError, IndexError):
            continue

        if total_votes >= 5 and helpful_ratio >= 0.7:
            review_text = review.get('text', '')

            # Find a meaningful excerpt (limited to 150 chars for conciseness)
            excerpt = review_text[:500].strip()
            if len(excerpt) < len(review_text):
                excerpt += "..."

            # Format with helpful information
            insight = f"'{excerpt}' - Review rated helpful by {int(helpful_ratio * 100)}% of {total_votes} viewers"
            insights.append(insight)

            if len(insights) >= max_reviews:
                break

    if insights:
        return "\n\nHelpful Review Highlights:\n• " + "\n• ".join(insights)
    else:
        return ""

In [16]:
def apply_diversity_reranking(results, scores, query_intent):
    """Promotes diversity in results based on query intent"""
    # Get initial top item
    top_indices = [np.argmax(scores)]
    remaining = list(range(len(scores)))
    remaining.remove(top_indices[0])

    # Set diversity factor based on query
    diversity_factor = 0.3
    if query_intent.get('wants_variety', False):
        diversity_factor = 0.5

    # Add items with penalty for similarity to already selected
    while len(top_indices) < min(CONFIG["top_k_final"], len(scores)):
        best_idx = -1
        best_score = -float('inf')

        for idx in remaining:
            # Base score
            score = scores[idx]

            # Diversity penalty based on similarity to selected items
            penalty = 0
            for selected in top_indices:
                sim = compute_film_similarity(results[idx][0], results[selected][0])
                penalty += sim

            penalty /= len(top_indices)
            adjusted_score = score * (1 - diversity_factor * penalty)

            if adjusted_score > best_score:
                best_score = adjusted_score
                best_idx = idx

        if best_idx != -1:
            top_indices.append(best_idx)
            remaining.remove(best_idx)

    return [results[i][0] for i in top_indices]

In [17]:
def build_response_prompt(query, context):
    """Structure the LLM prompt for coherent responses with review insights"""
    # Basic context information
    context_str = "\n\n".join(
        f"Title: {ctx['title']} ({ctx.get('year', 'N/A')})\n"
        f"Type: {ctx.get('content_type', 'movie')}\n"
        f"Director: {ctx.get('director', 'Unknown')}\n"
        f"Cast: {ctx.get('cast', 'N/A')}\n"
        f"IMDB: {ctx.get('imdb_rating', 0.0)}/10 | "
        f"User Rating: {ctx.get('user_rating', 0.0)}/10\n"
        f"Review Helpfulness: {ctx.get('raw_helpful', '0/0')} ({int(float(ctx.get('helpful_ratio', 0)) * 100)}% positive)\n"
        f"Excerpt: {ctx.get('text', '')[:200]}..."
        for ctx in context
    )

    # Extract review insights
    review_insights = extract_review_insights(context)

    # Add conversation history context
    conversation_context = ""
    if 'conversation_history' in user_preferences and user_preferences["conversation_history"]:
        recent = user_preferences["conversation_history"][-3:]  # Last 3 exchanges
        conversation_context = "Recent conversation:\n" + "\n".join(
            [f"User: {ex['query']}\nCinematicAI: {ex['response'][:150]}" for ex in recent]
        )

    # Add user preference context
    user_context = ""
    if 'genre_preferences' in user_preferences and user_preferences["genre_preferences"]:
        # Get top genres
        top_genres = sorted(user_preferences["genre_preferences"].items(),
                           key=lambda x: x[1], reverse=True)[:3]
        genre_str = ", ".join([g[0] for g in top_genres]) if top_genres else "Not enough data"

        # Get recent watches
        recent_watches = user_preferences["watch_history"][-3:] if user_preferences["watch_history"] else []
        watch_str = ", ".join(recent_watches) if recent_watches else "No recent history"

        user_context = f"""
User Preferences:
- Favorite Genres: {genre_str}
- Recently Discussed: {watch_str}
"""

    # Complete prompt
    return f"""<|system|>
You are CinematicAI, a conversational AI expert on films and TV shows. Provide personalized recommendations
and insights based on the context and user preferences. Maintain a natural, engaging conversational style.

{user_context}
{conversation_context}

<|user|>
{query}

Context Films and Shows:
{context_str}{review_insights}

<|assistant|>
"""

async def generate_response(query, context):
    """Generate response with conversation tracking"""
    prompt = build_response_prompt(query, context)

    inputs = tokenizer(prompt, return_tensors="pt").to(device)
    generation_config = {
        "max_new_tokens": 500,  # Increased from default
        "temperature": 0.4,
        "top_p": 0.85,
        "do_sample": True,
        "pad_token_id": tokenizer.eos_token_id,
        "use_cache": False  # Disable cache to avoid errors
    }

    with torch.no_grad():
        outputs = model.generate(**inputs, **generation_config)

    full_response = tokenizer.decode(outputs[0], skip_special_tokens=True)

    # Extract just the assistant's response
    if "<|assistant|>" in full_response:
        response = full_response.split("<|assistant|>")[-1].strip()
    else:
        response = full_response.split("Context Films and Shows:")[-1].strip()

    # Update user preferences with this interaction
    update_user_preferences(query, response, context)

    return response

#Main Execution


In [18]:
async def main():
    """End-to-end processing pipeline"""
    try:
        pinecone_index = initialize_pinecone()
        stats = pinecone_index.describe_index_stats()
        csv_path = "/content/drive/MyDrive/Colab Notebooks/reviews.csv"

        # Check if first run or missing files
        first_run = (
            stats['total_vector_count'] == 0 or
            not os.path.exists(CONFIG["bm25_path"]) or
            not os.path.exists(CONFIG["metadata_path"])
        )

        if first_run:
            # Your existing first-run data processing code
            pass
        else:
            print("Loading existing resources...")
            import joblib
            bm25_index = joblib.load(CONFIG["bm25_path"])
            full_data = pd.read_parquet(CONFIG["metadata_path"])
            print("Successfully loaded cached resources")

        # Initialize global preferences dict
        global user_preferences
        user_preferences = {
            "watch_history": [],
            "genre_preferences": {},
            "director_preferences": {},
            "actor_preferences": {},
            "conversation_history": []
        }

        # Set 1: Recommendation-Focused Queries
        recommendation_queries = [
            "I loved Inception. What other Christopher Nolan films might I enjoy and why?",
            "Tell me more about the cast of that movie",
            "I enjoy comedies with romance. What should I watch next?",
            "Are there any TV shows similar to that?",
            "What are the best sci-fi films from the last 5 years?",
            "I'm in the mood for something with great visual effects but also an emotional story",
            "What's a good thriller to watch with my family? Nothing too violent please",
            "I liked Breaking Bad. What other critically acclaimed TV dramas should I watch?",
            "What are some feel-good movies for a rainy day?",
            "Show me action movies with strong female leads",
            "I'm planning a movie marathon. What are the essential films from the Marvel universe?",
            "What scary movies are actually worth watching according to viewer ratings?"
        ]

        # Set 2: Film Knowledge and Analysis Queries
        film_knowledge_queries = [
            "How did Robert De Niro's acting style evolve throughout his career?",
            "Can you explain what makes Citizen Kane so influential?",
            "What were the major innovations in cinematography during the New Hollywood era?",
            "How do Quentin Tarantino's films use music?",
            "What's the difference between film noir and neo-noir?",
            "Explain the Bechdel test and its significance in film criticism",
            "How has CGI technology changed filmmaking since the 1990s?",
            "What are some examples of practical effects that still hold up today?",
            "Who were the most influential directors of the French New Wave and why?",
            "How do South Korean films like Parasite use social commentary?",
            "Compare the directing styles of Martin Scorsese and Steven Spielberg",
            "What makes a good screenplay according to film critics?"
        ]

        # Choose which query set to run (or run both)
        query_mode = "both"  # Options: "recommendations", "knowledge", "both"

        if query_mode in ["recommendations", "both"]:
            print("\n\n" + "="*20 + " RECOMMENDATION QUERIES " + "="*20 + "\n")
            # Reset preferences between sets
            user_preferences = {
                "watch_history": [],
                "genre_preferences": {},
                "director_preferences": {},
                "actor_preferences": {},
                "conversation_history": []
            }

            for query in recommendation_queries:
                print(f"\n{'='*40}\nUser: {query}")
                try:
                    context = await hybrid_search(query, pinecone_index, bm25_index, full_data)
                    response = await generate_response(query, context)
                    print(f"\nCinematicAI:\n{response}\n{'='*40}")
                except Exception as e:
                    print(f"Query failed: {str(e)}")
                    continue

        if query_mode in ["knowledge", "both"]:
            print("\n\n" + "="*20 + " FILM KNOWLEDGE QUERIES " + "="*20 + "\n")
            # Reset preferences between sets
            user_preferences = {
                "watch_history": [],
                "genre_preferences": {},
                "director_preferences": {},
                "actor_preferences": {},
                "conversation_history": []
            }

            for query in film_knowledge_queries:
                print(f"\n{'='*40}\nUser: {query}")
                try:
                    context = await hybrid_search(query, pinecone_index, bm25_index, full_data)
                    response = await generate_response(query, context)
                    print(f"\nCinematicAI:\n{response}\n{'='*40}")
                except Exception as e:
                    print(f"Query failed: {str(e)}")
                    continue

        print("\n\nFinal verification:")
        print(f"- Pinecone vectors: {pinecone_index.describe_index_stats().get('total_vector_count', 0)}")
        print(f"- BM25 exists: {os.path.exists(CONFIG['bm25_path'])}")
        print(f"- Metadata exists: {os.path.exists(CONFIG['metadata_path'])}")

    except Exception as e:
        print(f"Critical error: {str(e)}")

In [19]:
def evaluate_rag_system(test_queries, ground_truth=None):
    """Evaluate the RAG system performance"""
    results = {
        'retrieval_metrics': {
            'precision': [],
            'recall': [],
            'ndcg': []
        },
        'response_metrics': {
            'relevance': [],
            'factual_accuracy': [],
            'recommendation_quality': []
        },
        'efficiency_metrics': {
            'retrieval_time': [],
            'generation_time': []
        }
    }

    for query in test_queries:
        # Measure retrieval performance
        start_time = time.time()
        context = asyncio.run(hybrid_search(query, index, bm25_index, full_data))
        retrieval_time = time.time() - start_time

        # Evaluate retrieval against ground truth if available
        if ground_truth and query in ground_truth:
            retrieval_metrics = evaluate_retrieval(context, ground_truth[query])
            results['retrieval_metrics']['precision'].append(retrieval_metrics['precision'])
            results['retrieval_metrics']['recall'].append(retrieval_metrics['recall'])
            results['retrieval_metrics']['ndcg'].append(retrieval_metrics['ndcg'])

        # Measure response generation
        start_time = time.time()
        response = asyncio.run(generate_response(query, context))
        generation_time = time.time() - start_time

        # Record timing metrics
        results['efficiency_metrics']['retrieval_time'].append(retrieval_time)
        results['efficiency_metrics']['generation_time'].append(generation_time)

        # Evaluate response quality if ground truth available
        if ground_truth and query in ground_truth:
            response_metrics = evaluate_response_quality(response, ground_truth[query])
            for metric in response_metrics:
                results['response_metrics'][metric].append(response_metrics[metric])

    # Aggregate results
    for category in results:
        for metric in results[category]:
            if results[category][metric]:
                results[category][metric] = {
                    'mean': np.mean(results[category][metric]),
                    'std': np.std(results[category][metric]),
                    'min': min(results[category][metric]),
                    'max': max(results[category][metric])
                }

    return results

#Run

In [None]:
nest_asyncio.apply()
asyncio.run(main())

Loading existing resources...
Successfully loaded cached resources




User: I loved Inception. What other Christopher Nolan films might I enjoy and why?


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.4k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/571 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/438M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/363 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/239 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]




CinematicAI:
Title: Insomnia (2002)
Type: movie
Director: Christopher Nolan
Cast: Al Pacino, Robin Williams, Hilary Swank
IMDB: 7.2/10 | User Rating: 7.0/10
Review Helpfulness: 13/23 (56% positive)
Excerpt: as is evident by now anyone who has enjoyed christopher nolans movies would most probably insomnia at the bottom of their favorite from nolan while people who dont enjoy the nolan brand of cinema will...

Title: Tenet (2020)
Type: movie
Director: Christopher Nolan
Cast: John David Washington, Robert Pattinson, Elizabeth Debicki
IMDB: 7.3/10 | User Rating: 10.0/10
Review Helpfulness: 46/68 (68% positive)
Excerpt: this is as good as cinema gets i didnt expect that nolan would create such a original movie once again this is never been done before its incredible one of the best movies ive ever sen and i actually ...

Title: Oppenheimer (2023)
Type: movie
Director: Christopher Nolan
Cast: Cillian Murphy, Emily Blunt, Matt Damon
IMDB: 8.3/10 | User Rating: 10.0/10
Review Helpfulness: 883

#Streamlit

In [None]:
# # Install additional required packages for API
# !pip install flask-ngrok pyngrok flask flask-cors

# # Import necessary libraries
# from flask import Flask, request, jsonify
# from flask_cors import CORS
# from flask_ngrok import run_with_ngrok
# from pyngrok import ngrok
# import nest_asyncio

# # Apply nest_asyncio to allow nested async loops (needed for Flask + asyncio)
# nest_asyncio.apply()

# # Setup Flask app
# app = Flask(__name__)
# CORS(app)  # Enable CORS for all routes

# # Register authentication token for ngrok
# !ngrok authtoken 2v8IupkEuWyholoQI7LcZhRp5Ue_3kE9PJsEE8TL86mooKQAA

# # Initialize the models globally
# # These variables should already be defined in your notebook
# # pinecone_index = ...
# # encoder = ...
# # tokenizer = ...
# # model = ...

# @app.route('/api/query', methods=['POST'])
# def process_query():
#     """Process a query from the Streamlit app"""
#     data = request.json
#     query = data.get('query', '')

#     if not query:
#         return jsonify({'error': 'No query provided'}), 400

#     try:
#         # Call your existing async function but in a synchronous way
#         # Get the asyncio event loop
#         loop = asyncio.get_event_loop()

#         # First, get the context using hybrid_search
#         context = loop.run_until_complete(hybrid_search(query, pinecone_index, encoder))

#         # Then generate the response
#         if not context:
#             response = "I couldn't find any relevant movies to answer your query. Can you try rephrasing or asking about a different topic?"
#         else:
#             # Build prompt and generate response
#             prompt = build_response_prompt(query, context)
#             response = loop.run_until_complete(generate_response(query, context))

#         return jsonify({
#             'query': query,
#             'response': response
#         })

#     except Exception as e:
#         import traceback
#         traceback.print_exc()
#         return jsonify({'error': str(e)}), 500

# # Start the server with ngrok
# run_with_ngrok(app)
# app.run()

# # After running this cell, you'll get a public URL that looks like:
# # * Running on http://xxxx.ngrok.io
# # Copy this URL as it will be needed in the Streamlit app