# News Aggregator App

Important Installations | Required

In [None]:
!pip install --upgrade --quiet  langchain langchain-community langchain-pinecone langchain-huggingface neo4j langchain-core tiktoken yfiles_jupyter_graphs newsapi-python requests huggingface_hub pinecone-client tqdm pinecone sentence_transformers

Importing libraries | Required

In [5]:
# Filter warnings
import warnings
warnings.filterwarnings('ignore')

# Standard library imports
import hashlib
import json
import os
import time
from uuid import uuid4
from datetime import datetime, timedelta
from typing import Dict, List, Optional, Tuple

# Third-party imports
import pinecone
import requests
import torch
from google.colab import output, userdata
from neo4j import GraphDatabase
from sentence_transformers import SentenceTransformer
from tqdm import tqdm
from yfiles_jupyter_graphs import GraphWidget
from pinecone import Pinecone, ServerlessSpec

# LangChain imports
from langchain.docstore.document import Document
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.graphs import Neo4jGraph
from langchain_community.vectorstores import Neo4jVector
from langchain_core.messages import AIMessage, HumanMessage
from langchain_core.output_parsers import StrOutputParser
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.prompts.prompt import PromptTemplate
from langchain_core.runnables import (
    ConfigurableField,
    RunnableBranch,
    RunnableLambda,
    RunnableParallel,
    RunnablePassthrough,
)
from langchain_huggingface.embeddings import HuggingFaceEmbeddings
from langchain_pinecone import PineconeVectorStore

# Google Colab widget configuration
try:
    # Best-effort: turn on Colab's custom widget manager for the notebook's widgets.
    output.enable_custom_widget_manager()
except Exception as exc:
    # Catch only ordinary errors (a bare `except:` would also swallow
    # KeyboardInterrupt/SystemExit) and report why the call failed instead of
    # hiding it; the notebook can still continue without the widget manager.
    print(f"Could not enable the custom widget manager: {exc}")

Loading environment variables | API keys

In [6]:
# API keys and the database password are read through google.colab's `userdata`
# helper instead of being hard-coded in the notebook.
HUGGINGFACE_TOKEN = userdata.get('HUGGINGFACE_TOKEN')
NEWSAPI_KEY = userdata.get('NEWSAPI_KEY')
PINECONE_API_KEY = userdata.get('PINECONE_API_KEY')
NEO4J_PASSWORD = userdata.get('NEO4J_PASSWORD')
# Neo4j connection target and user name are plain constants (only the password is a secret).
NEO4J_URI="neo4j+s://87dc4a97.databases.neo4j.io"
NEO4J_USERNAME="neo4j"

Validate HuggingFace access token   
*(run the command below to validate your access token from the terminal)*

In [7]:
!huggingface-cli login --token $HUGGINGFACE_TOKEN

The token has not been saved to the git credentials helper. Pass `add_to_git_credential=True` in this function directly or `--add-to-git-credential` if using via `huggingface-cli` if you want to set the git credential as well.
Token is valid (permission: read).
The token `giru-upgrad-news-agg-read-only` has been saved to /root/.cache/huggingface/stored_tokens
Your token has been saved to /root/.cache/huggingface/token
Login successful.
The current active token is: `giru-upgrad-news-agg-read-only`


Set up OS environment variables

In [8]:
# Mirror the loaded credentials and Neo4j settings into the process environment
# so that code reading configuration from environment variables can find them.
os.environ.update({
    'HUGGINGFACE_TOKEN': HUGGINGFACE_TOKEN,
    'NEWSAPI_KEY': NEWSAPI_KEY,
    'PINECONE_API_KEY': PINECONE_API_KEY,
    'NEO4J_PASSWORD': NEO4J_PASSWORD,
    'NEO4J_URI': NEO4J_URI,
    'NEO4J_USERNAME': NEO4J_USERNAME,
})

Fetch news articles and index them in Pinecone (this step can take a while, so wait for it!)

In [29]:
def count_words(text: str) -> int:
    """Return the number of whitespace-separated words in ``text``.

    Falsy input (empty string or ``None``) counts as zero words.
    """
    return len(text.split()) if text else 0

def create_safe_filename(topic: str, title: str) -> str:
    """Create a safe ``<topic>_<title>.txt`` filename from topic and title.

    Both parts are sanitized: characters that are invalid in filenames on
    common platforms (``<>:"/\\|?*``) and ASCII control characters (newlines,
    tabs, ...) are replaced with ``_``. The title part is limited to 100
    characters. A missing (``None``) topic or title is treated as an empty
    string instead of raising.

    Args:
        topic: Topic the article was fetched for.
        title: Article title.

    Returns:
        The sanitized filename, always ending in ``.txt``.
    """
    invalid_chars = set('<>:"/\\|?*')

    def _sanitize(text) -> str:
        # Replace rather than drop, so the result has one character per input character.
        return ''.join(
            '_' if (ch in invalid_chars or ord(ch) < 32) else ch
            for ch in str(text or '')
        )

    safe_topic = _sanitize(topic)
    safe_title = _sanitize(title)[:100]  # Limit length
    return f"{safe_topic}_{safe_title}.txt"

def fetch_multiple_topics(api_key: str, topics: List[str], database_folder: str = "database") -> Dict[str, int]:
    """
    Fetch articles for multiple topics and save each article to a separate file.

    For every topic, the NewsAPI ``/v2/everything`` endpoint is queried (sorted by
    popularity, up to 100 results). Each article with at least 10 words of combined
    content and description is written to ``<database_folder>/<topic>_<title>.txt``
    with a metadata header delimited by two lines of 50 ``=`` characters.

    Args:
        api_key: NewsAPI key
        topics: List of topics to fetch articles for
        database_folder: Folder to store article files

    Returns:
        Dictionary with topics and their saved article counts. A topic whose
        request fails is reported on stdout and keeps the count reached so far.
    """
    # Ensure database folder exists
    os.makedirs(database_folder, exist_ok=True)
    article_counts = {topic: 0 for topic in topics}

    for topic in topics:
        try:
            # Let requests URL-encode the query (topics may contain spaces or
            # reserved characters). The key goes in a header rather than the URL
            # so it is not echoed by the error message printed below.
            response = requests.get(
                "https://newsapi.org/v2/everything",
                params={"q": topic, "sortBy": "popularity", "pageSize": 100},
                headers={"X-Api-Key": api_key},
                timeout=30,  # do not hang the notebook on a stalled connection
            )
            response.raise_for_status()

            articles = response.json().get('articles') or []

            for article in articles:
                # The API may return JSON null for these fields; `or` maps None to a
                # usable default (dict.get's default only covers a missing key).
                title = article.get('title') or 'No title'
                content = article.get('content') or ''
                description = article.get('description') or ''

                # Combine content and description for word count
                full_text = f"{content}\n{description}".strip()
                word_count = count_words(full_text)

                # Skip if content is too short
                if word_count < 10:
                    continue

                # Create filename using topic and title
                filename = create_safe_filename(topic, title)
                filepath = os.path.join(database_folder, filename)

                published = article.get('publishedAt') or 'No date'
                source_name = (article.get('source') or {}).get('name') or 'Unknown'
                article_url = article.get('url') or 'No URL'

                # Write article to file
                with open(filepath, 'w', encoding='utf-8') as file:
                    # Write metadata header
                    file.write("=" * 50 + "\n")
                    file.write(f"Topic: {topic}\n")
                    file.write(f"Title: {title}\n")
                    file.write(f"Published: {published}\n")
                    file.write(f"Source: {source_name}\n")
                    file.write(f"URL: {article_url}\n")
                    file.write(f"Word Count: {word_count}\n")
                    file.write("=" * 50 + "\n\n")

                    # Write content
                    file.write(full_text)

                article_counts[topic] += 1

            # Sleep to respect API rate limits
            time.sleep(1)

        except requests.exceptions.RequestException as e:
            print(f"Error fetching articles for {topic}: {str(e)}")
            continue

    return article_counts

In [30]:
class ArticleRAG:
    """Chunk saved article files, embed them and store/query them in a Pinecone index."""

    # Separator line that delimits the metadata header inside each article file.
    _HEADER_SEPARATOR = "=" * 50

    def __init__(self, database_folder: str = "database", index_name: str = "articles-embeddings"):
        """
        Initialize RAG system using LangChain and HuggingFace embeddings with Pinecone integration.

        Creates the Pinecone index (cosine metric, serverless on aws/us-east-1) when it
        does not exist yet, sized to the embedding model's output dimension.

        Args:
            database_folder: Folder containing article files
            index_name: Name for the Pinecone index
        """
        self.database_folder = database_folder

        # Initialize HuggingFace Embeddings
        print("Initializing HuggingFace Embeddings...")
        self.embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-mpnet-base-v2")

        # Initialize Pinecone client
        print("Initializing Pinecone...")
        self.pc = Pinecone(api_key=PINECONE_API_KEY)
        self.index_name = index_name

        # Check if index exists and create if needed
        if self.index_name not in self.pc.list_indexes().names():
            print(f"Creating new Pinecone index: {self.index_name}")
            self.pc.create_index(
                name=self.index_name,
                dimension=self._embedding_dimension(),
                metric='cosine',
                spec=ServerlessSpec(
                    cloud='aws',
                    region='us-east-1'
                )
            )

        # Initialize vector store on top of the index
        self.vector_store = PineconeVectorStore(
            index=self.pc.Index(self.index_name),
            embedding=self.embeddings
        )

        # Text splitter for chunking
        self.text_splitter = RecursiveCharacterTextSplitter(
            chunk_size=500,  # Adjust for larger chunks
            chunk_overlap=100,
            length_function=len
        )

    def _embedding_dimension(self) -> int:
        """Return the length of the vectors produced by ``self.embeddings``."""
        # embed_query returns a plain sequence of floats, which has no `.shape`;
        # len() works for a list and for a 1-D array alike.
        return len(self.embeddings.embed_query(""))

    @classmethod
    def _split_article(cls, content: str):
        """Split raw file text into ``(metadata_text, article_body)``.

        Returns ``None`` when the two header separator lines are not both present.
        """
        # maxsplit=2: only the two header separators split the text, so a body that
        # itself contains a separator line is kept whole instead of being truncated.
        parts = content.split(cls._HEADER_SEPARATOR, 2)
        if len(parts) < 3:
            return None
        return parts[1].strip(), parts[2].strip()

    @staticmethod
    def _chunk_id(filename: str, chunk_index: int) -> str:
        """Return a deterministic vector id for chunk ``chunk_index`` of ``filename``."""
        # Stable ids let a re-run overwrite the same vectors rather than adding
        # a fresh duplicate of every chunk under a new random id.
        return hashlib.sha256(f"{filename}:{chunk_index}".encode("utf-8")).hexdigest()

    def process_articles(self):
        """
        Read and process articles from the database folder, storing embeddings in Pinecone.

        Every ``.txt`` file is split into its metadata header and body; the body is
        chunked and each chunk is stored together with the file name, its chunk
        position and the raw metadata header text. Malformed files are skipped.
        """
        print("Processing articles from database folder...")
        docs = []
        ids = []
        for filename in sorted(os.listdir(self.database_folder)):
            if not filename.endswith(".txt"):
                continue

            filepath = os.path.join(self.database_folder, filename)
            with open(filepath, "r", encoding="utf-8") as file:
                content = file.read()

            # Split content into metadata and body
            parsed = self._split_article(content)
            if parsed is None:
                print(f"Skipping malformed file: {filename}")
                continue
            metadata_text, article_content = parsed

            # Chunk the article content
            chunks = self.text_splitter.split_text(article_content)

            # Create Document objects with metadata
            for i, chunk in enumerate(chunks):
                docs.append(
                    Document(
                        page_content=chunk,
                        metadata={
                            "source_file": filename,
                            "chunk_index": i,
                            "total_chunks": len(chunks),
                            "metadata_text": metadata_text
                        }
                    )
                )
                ids.append(self._chunk_id(filename, i))

        if docs:
            # Add documents to Pinecone
            print("Adding documents to Pinecone...")
            self.vector_store.add_documents(documents=docs, ids=ids)
        else:
            print("No valid documents to process.")

    def query(self, query_text: str, k: int = 5) -> List[Dict]:
        """
        Query the Pinecone vector database.

        Args:
            query_text: Query string
            k: Number of top results to return

        Returns:
            List of dicts, one per hit, with the chunk text under "content" and the
            stored metadata under "metadata"
        """
        print("Querying Pinecone...")
        results = self.vector_store.similarity_search(query_text, k=k)
        return [
            {
                "content": res.page_content,
                "metadata": res.metadata
            }
            for res in results
        ]

In [31]:
# Driver cell: download articles for a fixed list of topics, report how many
# were saved per topic, then chunk/embed the saved files into Pinecone.
if __name__ == "__main__":
    # Fetch articles using fetch_multiple_topics
    API_KEY = NEWSAPI_KEY  # NewsAPI key loaded in the API-keys cell
    topics = ["technology", "artificial intelligence", "economics", "politics", "climate change"]

    print("Fetching articles...")
    # `results` maps each topic to the number of article files written for it.
    results = fetch_multiple_topics(api_key=API_KEY, topics=topics, database_folder="database")

    # Print summary
    print("\nArticles saved:")
    for topic, count in results.items():
        print(f"{topic}: {count} articles")

    # Initialize and process articles with ArticleRAG
    # `rag` is reused by the example query cell further down.
    rag = ArticleRAG(database_folder="database", index_name="articles-embeddings")
    rag.process_articles()

Fetching articles...

Articles saved:
technology: 86 articles
artificial intelligence: 96 articles
economics: 100 articles
politics: 98 articles
climate change: 96 articles
Initializing HuggingFace Embeddings...
Initializing Pinecone...
Processing articles from database folder...
Adding documents to Pinecone...




Querying Pinecone...

Query Results:


In [33]:
# Example query: retrieve the 5 most similar chunks via the `rag` object
# created in the driver cell, then print a short preview of each hit.
query = "What are the latest developments in artificial intelligence?"
results = rag.query(query, k=5)

print("\nQuery Results:")
for i, result in enumerate(results, 1):
    print(f"\nResult {i}:")
    # Only the first 200 characters of the chunk are shown to keep the output short.
    print(f"Content: {result['content'][:200]}...")
    print(f"Metadata: {result['metadata']}")

Querying Pinecone...

Query Results:

Result 1:
Content: OpenAI CEO Sam Altmanexpects AGI, or artificial general intelligenceAI that outperforms humans at most tasksaround 2027 or 2028. Elon Musks prediction is either 2025 or 2026, and he has claimed that …...
Metadata: {'chunk_index': 0.0, 'metadata_text': 'Topic: artificial intelligence\nTitle: Human Misuse Will Make Artificial Intelligence More Dangerous\nPublished: 2024-12-13T14:00:00Z\nSource: Wired\nURL: https://www.wired.com/story/human-misuse-will-make-artificial-intelligence-more-dangerous/\nWord Count: 58', 'source_file': 'artificial intelligence_Human Misuse Will Make Artificial Intelligence More Dangerous.txt', 'total_chunks': 1.0}

Result 2:
Content: Those who are worried that advancements in artificial intelligence could lead to the destruction of humanity have a new reason to be anxious.
New research on OpenAI's latest series of AI models, kno… ...
Metadata: {'chunk_index': 0.0, 'metadata_text': "Topic: artificial intell

Backend

In [2]:
!pip install py2neo

Collecting py2neo
  Downloading py2neo-2021.2.4-py2.py3-none-any.whl.metadata (9.9 kB)
Collecting interchange~=2021.0.4 (from py2neo)
  Downloading interchange-2021.0.4-py2.py3-none-any.whl.metadata (1.9 kB)
Collecting monotonic (from py2neo)
  Downloading monotonic-1.6-py2.py3-none-any.whl.metadata (1.5 kB)
Collecting pansi>=2020.7.3 (from py2neo)
  Downloading pansi-2024.11.0-py2.py3-none-any.whl.metadata (3.1 kB)
Downloading py2neo-2021.2.4-py2.py3-none-any.whl (177 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m177.2/177.2 kB[0m [31m5.6 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading interchange-2021.0.4-py2.py3-none-any.whl (28 kB)
Downloading pansi-2024.11.0-py2.py3-none-any.whl (26 kB)
Downloading monotonic-1.6-py2.py3-none-any.whl (8.2 kB)
Installing collected packages: monotonic, pansi, interchange, py2neo
Successfully installed interchange-2021.0.4 monotonic-1.6 pansi-2024.11.0 py2neo-2021.2.4


In [3]:
# Backend and Neo4j Integration in FastAPI
from fastapi import FastAPI, HTTPException
from py2neo import Graph, Node, Relationship
from pydantic import BaseModel
import uvicorn
import threading
# FastAPI News Endpoint
from typing import List, Dict

In [4]:
# FastAPI application object; the route below is registered on it.
app = FastAPI()

# Neo4j Connection
# graph = Graph("bolt://localhost:7687", auth=("neo4j", "password"))
# Neo4jGraph is imported in the notebook's main imports cell
# (langchain_community.graphs); run that cell first, otherwise this line
# raises NameError. No arguments are passed here.
# NOTE(review): presumably the connection settings come from the NEO4J_*
# environment variables set earlier — confirm.
# `graph` is not used by the code visible in this cell.
graph = Neo4jGraph()

class FetchNewsRequest(BaseModel):
    """Request body for ``POST /fetch_news/``."""

    # Identifier of the user requesting news.
    user_id: str
    # Category selected by the user.
    category: str
    # Free-text category; the UI labels it optional, so clients may omit it
    # (defaults to an empty string instead of failing request validation).
    custom_category: str = ""

@app.post("/fetch_news/")
def fetch_news(data: FetchNewsRequest):
    """Return two placeholder news items built from the requested categories.

    Args:
        data: Request body carrying the selected and the custom category.

    Returns:
        ``{"news": [...]}`` where each entry has a "title" and a "content" key.
    """
    category_item = {
        "title": f"Latest in {data.category}",
        "content": "News content about this category...",
    }
    custom_item = {
        "title": f"More on {data.custom_category}",
        "content": "Custom category-related news...",
    }
    return {"news": [category_item, custom_item]}

# Run FastAPI server in a thread
def run_fastapi():
    """Serve ``app`` with uvicorn on host 0.0.0.0, port 8000."""
    bind_options = {"host": "0.0.0.0", "port": 8000}
    uvicorn.run(app, **bind_options)

# Start the server on a daemon thread so the cell returns immediately and the
# thread does not keep the process alive on shutdown.
threading.Thread(daemon=True, target=run_fastapi).start()

NameError: name 'Neo4jGraph' is not defined

Frontend

In [35]:
!pip install gradio

Collecting gradio
  Downloading gradio-5.9.1-py3-none-any.whl.metadata (16 kB)
Collecting aiofiles<24.0,>=22.0 (from gradio)
  Downloading aiofiles-23.2.1-py3-none-any.whl.metadata (9.7 kB)
Collecting fastapi<1.0,>=0.115.2 (from gradio)
  Downloading fastapi-0.115.6-py3-none-any.whl.metadata (27 kB)
Collecting ffmpy (from gradio)
  Downloading ffmpy-0.5.0-py3-none-any.whl.metadata (3.0 kB)
Collecting gradio-client==1.5.2 (from gradio)
  Downloading gradio_client-1.5.2-py3-none-any.whl.metadata (7.1 kB)
Collecting markupsafe~=2.0 (from gradio)
  Downloading MarkupSafe-2.1.5-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (3.0 kB)
Collecting pydub (from gradio)
  Downloading pydub-0.25.1-py2.py3-none-any.whl.metadata (1.4 kB)
Collecting python-multipart>=0.0.18 (from gradio)
  Downloading python_multipart-0.0.20-py3-none-any.whl.metadata (1.8 kB)
Collecting ruff>=0.2.2 (from gradio)
  Downloading ruff-0.8.6-py3-none-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metad

In [36]:
import gradio as gr
import requests

# Backend URL: base address of the FastAPI server started in the Backend
# section (same port, 8000). Note the value ends with a trailing slash.
backend_url = "http://0.0.0.0:8000/"

# Function to send data to backend and fetch news
def fetch_news(user_id, category, custom_category):
    """Send the selection to the backend and format the returned news for display.

    Args:
        user_id: Selected user id.
        category: Selected category.
        custom_category: Free-text category (may be empty).

    Returns:
        The news items joined as ``**title**\\ncontent`` blocks separated by blank
        lines, or "Error fetching news!" when the backend cannot be reached or
        does not answer with HTTP 200.
    """
    payload = {
        "user_id": user_id,
        "category": category,
        "custom_category": custom_category,
    }
    # backend_url may end with "/"; strip it so the request path is
    # "/fetch_news/" rather than "//fetch_news/".
    endpoint = f"{backend_url.rstrip('/')}/fetch_news/"

    try:
        # The timeout keeps the UI callback from hanging if the backend is stuck.
        response = requests.post(endpoint, json=payload, timeout=10)
    except requests.exceptions.RequestException:
        # Backend down/unreachable: show the same message as for a non-200 reply
        # instead of letting the exception escape the UI callback.
        return "Error fetching news!"

    if response.status_code == 200:
        news = response.json().get("news", [])
        return "\n\n".join(f"**{n['title']}**\n{n['content']}" for n in news)
    return "Error fetching news!"

# Gradio UI
# Selectable user ids offered in the UI (hard-coded for the demo).
users = ["1", "2", "3"]

# Build the Gradio interface: three inputs, one button and one output box.
with gr.Blocks() as ui:
    # User selection
    user_dropdown = gr.Dropdown(users, label="Select User", value="1")
    # Category selection
    category_dropdown = gr.Dropdown(["Tech", "Sports", "Finance", "Health"], label="Select Category")
    # Custom category input
    custom_category_input = gr.Textbox(label="Enter Custom Category (optional)")
    # Button to fetch news
    fetch_button = gr.Button("Fetch News")
    # News display area
    news_display = gr.Textbox(label="News", interactive=False, placeholder="News will appear here")

    # Button click event: the three input widgets are passed, in this order, to
    # fetch_news(user_id, category, custom_category); its return value is shown
    # in the news box.
    fetch_button.click(
        fn=fetch_news,
        inputs=[user_dropdown, category_dropdown, custom_category_input],
        outputs=news_display
    )

# Start the Gradio app.
ui.launch()

Running Gradio in a Colab notebook requires sharing enabled. Automatically setting `share=True` (you can turn this off by setting `share=False` in `launch()` explicitly).

Colab notebook detected. To show errors in colab notebook, set debug=True in launch()
* Running on public URL: https://07525ad886d626aabc.gradio.live

This share link expires in 72 hours. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)


