# News Retrieval and Embedding System - RPP RSS Feed

This notebook implements a complete news retrieval system that:
1. Loads the RSS feed from RPP Perú
2. Tokenizes articles using tiktoken
3. Generates embeddings using SentenceTransformers
4. Stores the documents, metadata, and embeddings in ChromaDB
5. Provides similarity-based retrieval
6. Orchestrates the full workflow with LangChain

## Setup and Imports

In [None]:
import sys
from datetime import datetime
from pathlib import Path

import numpy as np
import pandas as pd

# Make the project's local modules in ../src importable
sys.path.append('../src')

# Import custom modules
from rss_loader import load_rss_feed, format_news_for_embedding
from tokenizer import tokenize_text, count_tokens, should_chunk
from embeddings import EmbeddingGenerator
from vector_store import ChromaDBStore
from langchain_pipeline import NewsRetrievalPipeline
from utils import create_results_dataframe, display_results

print("✅ All modules imported successfully!")

## Step 0: Load RSS Feed Data from RPP

Load up to 50 of the latest news items from the RPP Perú RSS feed.

In [None]:
# Load the RSS feed (at most 50 items are requested via max_items)
print("📡 Loading RSS feed from RPP Perú...")
news_items = load_rss_feed(url="https://rpp.pe/rss", max_items=50)

# Fail fast with a clear message: later cells index news_items[0] and
# embeddings[0], so an empty feed would otherwise only surface further down
# as an unrelated-looking IndexError.
if len(news_items) == 0:
    raise RuntimeError(
        "RSS feed returned no items; check the feed URL and the network connection"
    )

print(f"✅ Loaded {len(news_items)} news items")

# Preview the first three items; descriptions are cut to 100 characters
print("\nFirst 3 news items:")
for i, item in enumerate(news_items[:3], 1):
    print(f"\n{i}. {item['title']}")
    print(f"   Published: {item['published']}")
    print(f"   Link: {item['link']}")
    print(f"   Description: {item['description'][:100]}...")

In [None]:
# Build a DataFrame from the loaded news items so they can be inspected as a table
df_news = pd.DataFrame(news_items)

print("\n📊 News DataFrame:")
# Rich display renders the frame as a table (print() would emit a plain-text
# dump); this matches how the query-result frames are shown later on.
display(df_news.head())
print(f"\nShape: {df_news.shape}")

## Step 1: Tokenization with tiktoken

Tokenize a sample article to understand token counts and determine if chunking is needed.

In [None]:
# Use the first news item as the sample and render it as embedding-ready text
sample_article = format_news_for_embedding(news_items[0])

# Header and article body, one per line, followed by a separator rule
print("📝 Sample Article:", sample_article, sep="\n")
print("\n" + "=" * 80)

In [None]:
# Token threshold above which an article is reported as needing chunking.
# Kept in one place so the check and the printed message cannot drift apart.
# NOTE(review): 8192 may not match the input limit of the embedding model used
# later in this notebook (sentence-transformers/all-MiniLM-L6-v2); token counts
# from this tokenizer are presumably not in that model's own units either —
# confirm the intended threshold against the model card.
MAX_TOKENS = 8192

# Tokenize the sample article and count its tokens
tokens = tokenize_text(sample_article)
num_tokens = count_tokens(sample_article)

print("\n🔢 Token Analysis:")
print(f"   Number of tokens: {num_tokens}")
print(f"   First 10 token IDs: {tokens[:10]}")

# Check whether the sample exceeds the chunking threshold
needs_chunking = should_chunk(sample_article, max_tokens=MAX_TOKENS)
print(f"\n   Chunking needed (>{MAX_TOKENS} tokens): {needs_chunking}")

In [None]:
# Count tokens for the embedding-ready text of every loaded article
token_counts = []
for item in news_items:
    token_counts.append(count_tokens(format_news_for_embedding(item)))

# Summary statistics over the per-article token counts
avg_tokens = np.mean(token_counts)
min_tokens = np.min(token_counts)
max_tokens_seen = np.max(token_counts)
median_tokens = np.median(token_counts)

print("\n📊 Token Statistics Across All Articles:")
print(f"   Average tokens: {avg_tokens:.2f}")
print(f"   Min tokens: {min_tokens}")
print(f"   Max tokens: {max_tokens_seen}")
print(f"   Median tokens: {median_tokens:.2f}")

## Step 2: Generate Embeddings with SentenceTransformers

Use the `sentence-transformers/all-MiniLM-L6-v2` model to generate embeddings.

In [None]:
# Create the embedding generator for the model named in this section
print("🤖 Initializing SentenceTransformer model...")
embedding_generator = EmbeddingGenerator(
    model_name="sentence-transformers/all-MiniLM-L6-v2",
)
print("✅ Model loaded!")

In [None]:
# Embed the formatted text of every news item in a single call
print("\n🔄 Generating embeddings for all news items...")
texts = list(map(format_news_for_embedding, news_items))
embeddings = embedding_generator.embed_texts(texts)

print(f"✅ Generated {len(embeddings)} embeddings")

# Inspect the first vector: its length and its leading values
first_embedding = embeddings[0]
print(f"   Embedding dimension: {first_embedding.shape[0]}")
print(f"   Sample embedding (first 10 values): {first_embedding[:10]}")

## Step 3: Create ChromaDB Collection and Store Embeddings

Store documents, metadata, and embeddings in ChromaDB.

In [None]:
# Open the "rpp_news" collection, persisted under ../chroma_db
print("💾 Initializing ChromaDB store...")
chroma_store_options = {
    "collection_name": "rpp_news",
    "persist_directory": "../chroma_db",
}
chroma_store = ChromaDBStore(**chroma_store_options)
print("✅ ChromaDB store initialized!")

In [None]:
# Fields copied verbatim from each news item into its metadata record
metadata_fields = ('title', 'description', 'link', 'published')

# One metadata dict per news item, in feed order
metadatas = [
    {field: item[field] for field in metadata_fields}
    for item in news_items
]

# Positional IDs: "news_0", "news_1", ... (one per news item)
ids = [f"news_{i}" for i, _ in enumerate(news_items)]

print(f"📝 Prepared {len(metadatas)} metadata entries")

In [None]:
# Write texts, metadata, precomputed embeddings and IDs to the collection
print("\n⬆️  Upserting documents to ChromaDB...")
upsert_payload = {
    "documents": texts,
    "metadatas": metadatas,
    # converted to plain nested lists before being handed to the store
    "embeddings": embeddings.tolist(),
    "ids": ids,
}
chroma_store.upsert_documents(**upsert_payload)

# Report how many documents the collection holds after the upsert
collection_count = chroma_store.get_collection_count()
print(f"✅ Collection now contains {collection_count} documents")

## Step 4: Query and Retrieve Results

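Query the collection with "Últimas noticias de economía", display the results in a DataFrame, and save them to CSV. The same retrieval is then repeated for a sports query and a politics query.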

In [None]:
# Economy query: ask the store for the 10 nearest documents
query_text = "Últimas noticias de economía"
print(f"🔍 Querying: '{query_text}'")

# NOTE(review): documents were stored with precomputed embeddings, while the
# query is passed as raw text — confirm ChromaDBStore.query embeds it with the
# same model.
results = chroma_store.query(query_texts=[query_text], n_results=10)

# Results are nested per query; index 0 holds the hits for the single query sent
economy_hits = results['metadatas'][0]
print(f"✅ Found {len(economy_hits)} results")

In [None]:
# Turn the raw query results into a DataFrame and show it
df_results = create_results_dataframe(results)

print("\n📊 Query Results:")
display(df_results)

# Save to CSV. to_csv does not create missing parent directories, so make sure
# ../outputs exists first; otherwise a fresh checkout fails here with OSError.
output_path = "../outputs/query_results_economia.csv"
Path(output_path).parent.mkdir(parents=True, exist_ok=True)
df_results.to_csv(output_path, index=False)
print(f"\n💾 Results saved to: {output_path}")

In [None]:
# Sports query: same retrieval as before, 10 nearest documents
query_text_sports = "Noticias de deportes y fútbol"
print(f"\n🔍 Querying: '{query_text_sports}'")

results_sports = chroma_store.query(query_texts=[query_text_sports], n_results=10)

# Tabulate the hits and show them
df_results_sports = create_results_dataframe(results_sports)

print("\n📊 Sports Query Results:")
display(df_results_sports)

In [None]:
# Politics query: same retrieval as before, 10 nearest documents
query_text_politics = "Noticias de política y gobierno"
print(f"\n🔍 Querying: '{query_text_politics}'")

results_politics = chroma_store.query(query_texts=[query_text_politics], n_results=10)

# Tabulate the hits and show them
df_results_politics = create_results_dataframe(results_politics)

print("\n📊 Politics Query Results:")
display(df_results_politics)

## Step 5: LangChain Orchestration Pipeline

Implement the complete end-to-end pipeline using LangChain.

In [None]:
# Build the LangChain-based pipeline with the same model name and persist
# directory used in the manual steps above
print("🔗 Initializing LangChain Pipeline...")
langchain_pipeline_options = {
    "model_name": "sentence-transformers/all-MiniLM-L6-v2",
    "persist_directory": "../chroma_db",
}
langchain_pipeline = NewsRetrievalPipeline(**langchain_pipeline_options)
print("✅ LangChain pipeline initialized!")

In [None]:
# Fetch the feed again so the LangChain demo works on its own copy of the items
print("\n📡 Loading fresh RSS feed...")
fresh_news = load_rss_feed(
    url="https://rpp.pe/rss",
    max_items=50,
)
print(f"✅ Loaded {len(fresh_news)} fresh news items")

In [None]:
# Step 1: convert the fresh news items into LangChain documents
print("\n🔄 Step 1: Loading and processing documents...")
documents = langchain_pipeline.load_and_process(fresh_news)
print(f"✅ Created {len(documents)} LangChain documents")

# Peek at the first document: leading 100 characters of content plus metadata
first_document = documents[0]
print(f"   Sample document content: {first_document.page_content[:100]}...")
print(f"   Sample metadata: {first_document.metadata}")

In [None]:
# Step 2: Create Vector Store
# Hands the LangChain documents built in Step 1 to the pipeline, which builds
# its vector store from them. The call's return value is not used here.
# NOTE(review): the pipeline was configured with the same persist directory as
# the manual ChromaDB store above — confirm whether they share a collection.
print("\n🔄 Step 2: Creating vector store...")
langchain_pipeline.create_vectorstore(documents)
print("✅ Vector store created!")

In [None]:
# Step 3: query the LangChain vector store for the 10 closest documents.
# The pipeline's query() result is treated as a DataFrame below.
print("\n🔄 Step 3: Querying vector store...")
query = "Últimas noticias de economía"
df_langchain_results = langchain_pipeline.query(query, k=10)

print(f"\n📊 LangChain Query Results for: '{query}'")
display(df_langchain_results)

# Save results. to_csv does not create missing parent directories, so make
# sure ../outputs exists first; otherwise a fresh checkout fails with OSError.
output_path_lc = "../outputs/langchain_query_results.csv"
Path(output_path_lc).parent.mkdir(parents=True, exist_ok=True)
df_langchain_results.to_csv(output_path_lc, index=False)
print(f"\n💾 Results saved to: {output_path_lc}")

In [None]:
# End-to-end demo: a fresh pipeline instance runs ingestion and query in one call
print("\n🚀 Running Complete End-to-End Pipeline...\n")

# NOTE(review): this instance uses the same persist directory as the pipeline
# above and is fed the same fresh_news — confirm run_pipeline does not store
# duplicate documents.
complete_pipeline_options = {
    "model_name": "sentence-transformers/all-MiniLM-L6-v2",
    "persist_directory": "../chroma_db",
}
complete_pipeline = NewsRetrievalPipeline(**complete_pipeline_options)

# Technology query, top 10 results
query_complete = "Noticias sobre tecnología e innovación"
df_complete = complete_pipeline.run_pipeline(news_items=fresh_news, query_text=query_complete, k=10)

print(f"\n📊 Complete Pipeline Results for: '{query_complete}'")
display(df_complete)

print("\n✅ Complete pipeline executed successfully!")

## Summary and Statistics

In [None]:
# Final recap: key figures computed in earlier cells, then a checklist of steps
summary_rule = "=" * 80

print("\n" + summary_rule)
print("SUMMARY")
print(summary_rule)

# Figures come from variables defined in earlier cells
print("✅ RSS Feed Source: https://rpp.pe/rss")
print(f"✅ Total News Items Loaded: {len(news_items)}")
print("✅ Embedding Model: sentence-transformers/all-MiniLM-L6-v2")
print(f"✅ Embedding Dimension: {embeddings[0].shape[0]}")
print(f"✅ Documents in ChromaDB: {chroma_store.get_collection_count()}")
print(f"✅ Average Tokens per Article: {np.mean(token_counts):.2f}")

# Static checklist of the notebook's steps
print("\n🎯 All Steps Completed Successfully!")
for completed_step in (
    "   - Step 0: RSS Feed Loading ✓",
    "   - Step 1: Tokenization with tiktoken ✓",
    "   - Step 2: Embedding Generation ✓",
    "   - Step 3: ChromaDB Storage ✓",
    "   - Step 4: Query & Retrieval ✓",
    "   - Step 5: LangChain Orchestration ✓",
):
    print(completed_step)
print(summary_rule)