# News Aggregator App | Upgrad Capstone Project

Important Installations | Required

In [None]:
!pip install --upgrade --quiet  langchain langchain-community langchain-pinecone langchain-huggingface neo4j langchain-core tiktoken yfiles_jupyter_graphs newsapi-python requests huggingface_hub pinecone-client tqdm pinecone sentence_transformers py2neo gradio fastapi

Importing libraries | Required

In [10]:
# Filter warnings
import warnings
warnings.filterwarnings('ignore')

# Standard library imports
import os
import time
import json
from uuid import uuid4
from datetime import datetime, timedelta
from typing import Dict, List, Optional, Tuple

# Third-party imports
import pinecone
import requests
import torch
from google.colab import output, userdata
from neo4j import GraphDatabase
from sentence_transformers import SentenceTransformer
from tqdm import tqdm
from yfiles_jupyter_graphs import GraphWidget
from pinecone import Pinecone, ServerlessSpec

# LangChain imports
from langchain.docstore.document import Document
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.graphs import Neo4jGraph
from langchain_community.vectorstores import Neo4jVector
from langchain_core.messages import AIMessage, HumanMessage
from langchain_core.output_parsers import StrOutputParser
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.prompts.prompt import PromptTemplate
from langchain_core.runnables import (
    ConfigurableField,
    RunnableBranch,
    RunnableLambda,
    RunnableParallel,
    RunnablePassthrough,
)
from langchain_huggingface.embeddings import HuggingFaceEmbeddings
from langchain_pinecone import PineconeVectorStore
from langchain import PromptTemplate, LLMChain
from langchain.chains import SimpleSequentialChain

# Frontend - Backend libraries
from fastapi import FastAPI, HTTPException
from py2neo import Graph, Node, Relationship
from pydantic import BaseModel
import uvicorn
import threading
from typing import List, Dict
import gradio as gr
import requests

# Google Colab widget configuration
# Enabling the custom widget manager is best-effort: a failure here must not
# stop the notebook, but it should be visible instead of silently discarded.
try:
    output.enable_custom_widget_manager()
except Exception as exc:  # deliberately not a bare except: let KeyboardInterrupt/SystemExit through
    print(f"Custom widget manager could not be enabled: {exc}")

Loading environment variables | API keys

In [3]:
# Secrets are looked up by name in the `userdata` store imported from google.colab.
HUGGINGFACE_TOKEN = userdata.get('HUGGINGFACE_TOKEN')
NEWSAPI_KEY = userdata.get('NEWSAPI_KEY')
PINECONE_API_KEY = userdata.get('PINECONE_API_KEY')
NEO4J_PASSWORD = userdata.get('NEO4J_PASSWORD')
# Neo4j endpoint and user name are fixed in the notebook; only the password comes from the secret store.
NEO4J_URI="neo4j+s://87dc4a97.databases.neo4j.io"
NEO4J_USERNAME="neo4j"

Validate HuggingFace access token   
*(run the command below to validate your access token in a terminal)*

In [4]:
!huggingface-cli login --token $HUGGINGFACE_TOKEN

The token has not been saved to the git credentials helper. Pass `add_to_git_credential=True` in this function directly or `--add-to-git-credential` if using via `huggingface-cli` if you want to set the git credential as well.
Token is valid (permission: read).
The token `giru-upgrad-news-agg-read-only` has been saved to /root/.cache/huggingface/stored_tokens
Your token has been saved to /root/.cache/huggingface/token
Login successful.
The current active token is: `giru-upgrad-news-agg-read-only`


Setup OS environment variables

In [5]:
# Publish every credential / connection setting as a process environment
# variable so libraries that read their configuration from the environment can find it.
os.environ.update({
    'HUGGINGFACE_TOKEN': HUGGINGFACE_TOKEN,
    'NEWSAPI_KEY': NEWSAPI_KEY,
    'PINECONE_API_KEY': PINECONE_API_KEY,
    'NEO4J_PASSWORD': NEO4J_PASSWORD,
    'NEO4J_URI': NEO4J_URI,
    'NEO4J_USERNAME': NEO4J_USERNAME,
})

In [6]:
def count_words(text: str) -> int:
    """Return how many whitespace-separated words ``text`` contains.

    Falsy input (empty string or ``None``) counts as zero words.
    """
    return len(text.split()) if text else 0

In [7]:
def create_safe_filename(topic: str, title: str) -> str:
    """Create a safe filename from topic and title.

    Both parts are sanitized: characters that are invalid in file names on
    common platforms, as well as ASCII control characters (newlines, tabs, ...),
    are replaced with ``_``. Previously only the title was cleaned, so a topic
    such as ``"AI/ML"`` produced a name containing a path separator.

    Args:
        topic: Topic the article was fetched for.
        title: Article title; ``None`` is treated as an empty title.

    Returns:
        ``"<topic>_<title>.txt"`` with the title part limited to 100 characters.
    """
    invalid_chars = set('<>:"/\\|?*')

    def _sanitize(value) -> str:
        # `value or ''` tolerates None coming from JSON null fields.
        return ''.join(
            '_' if (c in invalid_chars or ord(c) < 32) else c
            for c in (value or '')
        )

    safe_topic = _sanitize(topic)
    safe_title = _sanitize(title)[:100]  # Limit length
    return f"{safe_topic}_{safe_title}.txt"

In [8]:
def fetch_multiple_topics(api_key: str, topics: List[str], database_folder: str = "database") -> Dict[str, int]:
    """
    Fetch articles for multiple topics and save each article to a separate file.

    For every topic one request is sent to the NewsAPI ``/v2/everything``
    endpoint (sorted by popularity, up to 100 articles). Articles whose combined
    content and description have fewer than 10 words are skipped. Files are
    named from topic and title, so two articles with the same topic and title
    overwrite each other while still being counted.

    Args:
        api_key: NewsAPI key
        topics: List of topics to fetch articles for
        database_folder: Folder to store article files

    Returns:
        Dictionary with topics and their saved article counts. A topic whose
        request fails is reported on stdout and keeps the count reached so far.
    """
    # Ensure database folder exists
    os.makedirs(database_folder, exist_ok=True)
    article_counts = {topic: 0 for topic in topics}

    for topic in topics:
        try:
            # Pass the query as `params` so requests percent-encodes the topic
            # (spaces, '&', '#', ...) instead of splicing it into the URL by hand.
            response = requests.get(
                "https://newsapi.org/v2/everything",
                params={
                    "q": topic,
                    "sortBy": "popularity",
                    "pageSize": 100,
                    "apiKey": api_key,
                },
                timeout=30,  # never hang the notebook on a stalled connection
            )
            response.raise_for_status()

            articles = response.json().get('articles') or []

            for article in articles:
                # `.get(key, default)` only covers missing keys; the `or` also
                # covers fields that are present but null in the JSON payload.
                title = article.get('title') or 'No title'
                content = article.get('content') or ''
                description = article.get('description') or ''

                # Combine content and description for word count
                full_text = f"{content}\n{description}".strip()
                word_count = count_words(full_text)

                # Skip if content is too short
                if word_count < 10:
                    continue

                # Create filename using topic and title
                filename = create_safe_filename(topic, title)
                filepath = os.path.join(database_folder, filename)

                published = article.get('publishedAt') or 'No date'
                source_name = (article.get('source') or {}).get('name') or 'Unknown'
                article_url = article.get('url') or 'No URL'

                # Write article to file
                with open(filepath, 'w', encoding='utf-8') as file:
                    # Write metadata header
                    file.write("=" * 50 + "\n")
                    file.write(f"Topic: {topic}\n")
                    file.write(f"Title: {title}\n")
                    file.write(f"Published: {published}\n")
                    file.write(f"Source: {source_name}\n")
                    file.write(f"URL: {article_url}\n")
                    file.write(f"Word Count: {word_count}\n")
                    file.write("=" * 50 + "\n\n")

                    # Write content
                    file.write(full_text)

                article_counts[topic] += 1

            # Sleep to respect API rate limits
            time.sleep(1)

        except requests.exceptions.RequestException as e:
            print(f"Error fetching articles for {topic}: {str(e)}")
            continue

    return article_counts

In [9]:
class ArticleRAG:
    """Chunk saved article files, embed them and search them through a Pinecone index."""

    def __init__(self, database_folder: str = "database", index_name: str = "articles-embeddings"):
        """
        Initialize RAG system using LangChain and HuggingFace embeddings with Pinecone integration.

        The Pinecone index is created on first use with a dimension matching the
        embedding model.

        Args:
            database_folder: Folder containing article files
            index_name: Name for the Pinecone index
        """
        self.database_folder = database_folder

        # Initialize HuggingFace Embeddings
        print("Initializing HuggingFace Embeddings...")
        self.embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-mpnet-base-v2")

        # Initialize Pinecone with new pattern
        print("Initializing Pinecone...")
        self.pc = Pinecone(api_key=PINECONE_API_KEY)
        self.index_name = index_name

        # Check if index exists and create if needed
        if self.index_name not in self.pc.list_indexes().names():
            print(f"Creating new Pinecone index: {self.index_name}")
            # embed_query returns a plain list of floats, which has no `.shape`;
            # its length is the embedding dimension.
            embedding_dimension = len(self.embeddings.embed_query(""))
            self.pc.create_index(
                name=self.index_name,
                dimension=embedding_dimension,
                metric='cosine',
                spec=ServerlessSpec(
                    cloud='aws',
                    region='us-east-1'
                )
            )

        # Initialize vector store with the new index
        self.vector_store = PineconeVectorStore(
            index=self.pc.Index(self.index_name),
            embedding=self.embeddings
        )

        # Text splitter for chunking
        self.text_splitter = RecursiveCharacterTextSplitter(
            chunk_size=500,  # Adjust for larger chunks
            chunk_overlap=100,
            length_function=len
        )

    def process_articles(self):
        """
        Read and process articles from the database folder, storing embeddings in Pinecone.

        Each ``.txt`` file is expected to start with a metadata header enclosed
        by two lines of 50 ``=`` characters; files without that header are
        reported and skipped. Chunk IDs are derived from file name and chunk
        index, so re-running this method writes the same IDs again instead of
        adding a second copy of every chunk.
        """
        # Local import: only uuid4 is imported at file level.
        from uuid import NAMESPACE_URL, uuid5

        print("Processing articles from database folder...")
        separator = "=" * 50
        docs = []
        ids = []
        for filename in os.listdir(self.database_folder):
            if not filename.endswith(".txt"):
                continue

            filepath = os.path.join(self.database_folder, filename)
            with open(filepath, "r", encoding="utf-8") as file:
                content = file.read()

            # Split content into metadata and body. maxsplit=2 keeps the whole
            # body in parts[2] even if the article text itself contains the separator.
            parts = content.split(separator, 2)
            if len(parts) < 3:
                print(f"Skipping malformed file: {filename}")
                continue

            metadata_text = parts[1].strip()
            article_content = parts[2].strip()

            # Chunk the article content
            chunks = self.text_splitter.split_text(article_content)

            # Create Document objects with metadata
            for i, chunk in enumerate(chunks):
                docs.append(
                    Document(
                        page_content=chunk,
                        metadata={
                            "source_file": filename,
                            "chunk_index": i,
                            "total_chunks": len(chunks),
                            "metadata_text": metadata_text
                        }
                    )
                )
                # Deterministic ID: the same file/chunk pair always maps to the same vector ID.
                ids.append(str(uuid5(NAMESPACE_URL, f"{filename}#{i}")))

        if docs:
            # Add documents to Pinecone
            print("Adding documents to Pinecone...")
            self.vector_store.add_documents(documents=docs, ids=ids)
        else:
            print("No valid documents to process.")

    def query(self, query_text: str, k: int = 5):
        """
        Query the Pinecone vector database.

        Args:
            query_text: Query string
            k: Number of top results to return

        Returns:
            List of dicts, one per hit, with the chunk text under ``"content"``
            and the stored metadata dict under ``"metadata"``
        """
        print("Querying Pinecone...")
        results = self.vector_store.similarity_search(query_text, k=k)
        return [
            {
                "content": res.page_content,
                "metadata": res.metadata
            }
            for res in results
        ]

In [12]:
API_KEY = NEWSAPI_KEY  # NewsAPI key loaded from the notebook secrets in an earlier cell
topics = ["Artificial Intelligence", "Politics", "Business", "Technology", "Sports", "Entertainment", "Health"]

# Download articles for every topic; each article that passes the minimum
# word-count filter is written as a text file under the "database" folder.
print("Fetching articles...")
results = fetch_multiple_topics(api_key=API_KEY, topics=topics, database_folder="database")

# Print summary
print("\nArticles saved:")
for topic, count in results.items():
    print(f"{topic}: {count} articles")

# Initialize and process articles with ArticleRAG
# (process_articles chunks the saved files and adds them to the Pinecone vector store).
rag = ArticleRAG(database_folder="database", index_name="articles-embeddings")
rag.process_articles()

Fetching articles...

Articles saved:
Artificial Intelligence: 96 articles
Politics: 98 articles
Business: 90 articles
Technology: 86 articles
Sports: 93 articles
Entertainment: 83 articles
Health: 90 articles
Initializing HuggingFace Embeddings...


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.6k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/571 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/438M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/363 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/239 [00:00<?, ?B/s]

1_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Initializing Pinecone...
Processing articles from database folder...
Adding documents to Pinecone...


In [18]:
# Default news category of interest, stored as a plain string.
preference = 'Health'
# Notebook-level string, initialised empty; presumably meant to hold retrieved document text — TODO confirm usage.
docs_content = ''

In [19]:
def fetch_documents(query):
    """Retrieve the top 5 chunks for ``query`` and return them as one text block.

    For every hit the metadata (rendered with ``str``) and the chunk content
    are emitted on consecutive lines, each terminated by a newline.

    The text is built in a local list: the previous version applied ``+=`` to
    an undeclared global (UnboundLocalError) and concatenated the metadata
    dict directly onto a string (TypeError).

    Args:
        query: Natural-language query forwarded to ``rag.query``.

    Returns:
        The concatenated text; an empty string when nothing is found.
    """
    results = rag.query(query, k=5)
    lines = []
    for result in results:
        lines.append(str(result['metadata']))  # metadata is a dict, so render it explicitly
        lines.append(result['content'])
    return ''.join(f"{line}\n" for line in lines)

In [12]:
# Example query
# NOTE(review): `domanin` looks like a typo for `domain` and is not used in this cell.
domanin = ''
query = "What are the latest developments in artificial intelligence?"
# Ask the RAG helper for the five closest chunks.
results = rag.query(query, k=5)

# Show a 200-character preview of each hit together with its metadata.
print("\nQuery Results:")
for i, result in enumerate(results, 1):
    print(f"\nResult {i}:")
    print(f"Content: {result['content'][:200]}...")
    print(f"Metadata: {result['metadata']}")

NameError: name 'rag' is not defined

In [11]:
# FastAPI application object; routes are registered on it with decorators.
app = FastAPI()

# Neo4j Connection
# graph = Graph("bolt://localhost:7687", auth=("neo4j", "password"))
# Neo4jGraph() is created without arguments; presumably it picks up the
# NEO4J_* environment variables set earlier in the notebook — TODO confirm.
graph = Neo4jGraph()

class FetchNewsRequest(BaseModel):
    """JSON body accepted by the ``/fetch_news/`` endpoint."""
    # Identifier of the requesting user, sent as a string.
    user_id: str
    # News category the caller wants articles for.
    category: str

@app.post("/fetch_news/")
def fetch_news(data: FetchNewsRequest):
    """Return retrieved news text for the category named in the request.

    The query is built from the request's own category. The previous version
    called ``.append`` on the notebook-level ``preference`` string
    (AttributeError) and formatted an undefined name ``content`` into the
    response (NameError).

    Args:
        data: Request body carrying ``user_id`` and ``category``.

    Returns:
        ``{"news": [{"title": ..., "content": ...}]}`` with a single entry whose
        content is whatever ``fetch_documents`` returned, rendered as text.
    """
    category = str(data.category)
    query = f'Top news articles that match these {category}, including their titles, summaries, and publication dates and what is going on, nowadays, on the topic {category}. News around {category}. What is latest news in {category}?.'
    docs_content = fetch_documents(query)
    news = [
        {"title": f"Latest in {data.category}", "content": f"{docs_content}"}
    ]
    return {"news": news}

# Run FastAPI server in a thread
def run_fastapi():
    """Serve ``app`` with uvicorn on all interfaces, port 8000 (blocks the calling thread)."""
    uvicorn.run(app, host="0.0.0.0", port=8000)

# Daemon thread: the server runs in the background so the notebook cell returns,
# and the thread does not keep the process alive on shutdown.
threading.Thread(target=run_fastapi, daemon=True).start()

INFO:     Started server process [413]
INFO:     Waiting for application startup.
INFO:     Application startup complete.
INFO:     Uvicorn running on http://0.0.0.0:8000 (Press CTRL+C to quit)


In [None]:
# Step 1: Fetch user preferences (simulated)
def fetch_user_preferences(user_id):
    """Return a hard-coded preference record for ``user_id``.

    Stand-in for a real database or API lookup: every user gets the same
    three preferred categories.
    """
    preferred_categories = ["Technology", "Health", "Finance"]
    return {"user_id": user_id, "preferred_categories": preferred_categories}

# Step 2: Fetch documents (simulated)
def fetch_documents(query):
    """Return three canned documents regardless of ``query``.

    NOTE(review): a function with this name is already defined earlier in the
    notebook; running this cell rebinds the name to this simulated version.
    """
    canned = (
        ("Latest Tech Trends", "AI is transforming industries..."),
        ("Health Benefits of Meditation", "Meditation improves mental health..."),
        ("Stock Market Insights", "Investing in stocks requires knowledge..."),
    )
    return [{"title": heading, "content": body} for heading, body in canned]

# Step 3: Combine preferences and documents
def combine_preferences_and_documents(preferences, documents):
    """Bundle a user's preferred categories with the retrieved documents.

    The previous version ignored both arguments and read notebook globals
    instead, one of which (``result_docs_content``) is never defined.

    Args:
        preferences: Either the record returned by ``fetch_user_preferences``
            (a dict with a ``"preferred_categories"`` list), a single category
            string, or an iterable of category strings.
        documents: Documents to hand to the prompt builder; passed through unchanged.

    Returns:
        ``{"user_preferences": <list of categories>, "documents": documents}``.
    """
    if isinstance(preferences, dict):
        user_preferences = list(preferences.get("preferred_categories", []))
    elif isinstance(preferences, str):
        # A bare string would otherwise be joined character by character downstream.
        user_preferences = [preferences]
    else:
        user_preferences = list(preferences or [])

    combined_data = {
        "user_preferences": user_preferences,
        "documents": documents
    }
    return combined_data

# Step 4: Generate prompt using LangChain's PromptTemplate
def generate_prompt(combined_data):
    """Render the final prompt text from the combined preferences and documents.

    ``combined_data["documents"]`` is turned into one ``- title: content``
    bullet per document and ``combined_data["user_preferences"]`` into a
    comma-separated list; both are substituted into a fixed template.

    Returns:
        The formatted prompt string.
    """
    template = PromptTemplate(
        input_variables=["user_preferences", "documents"],
        template="""
        User Preferences: {user_preferences}

        Documents:
        {documents}

        Based on the above preferences and documents, please provide a summary or insights tailored to the user's interests.
        """
    )

    bullet_lines = []
    for doc in combined_data["documents"]:
        bullet_lines.append(f"- {doc['title']}: {doc['content']}")
    rendered_documents = "\n".join(bullet_lines)

    rendered_preferences = ", ".join(combined_data["user_preferences"])
    return template.format(
        user_preferences=rendered_preferences,
        documents=rendered_documents
    )

# Step 5: Send the finished prompt to an LLM
def feed_to_llm(prompt, llm=None):
    """Send an already-formatted prompt to a language model and return its text reply.

    ``prompt`` is final text, so it is passed to the model as-is. The previous
    version re-parsed it as a template (any ``{``/``}`` in article text would be
    treated as a placeholder), depended on the notebook global
    ``combined_data`` and referenced ``OpenAI`` without importing it.

    Args:
        prompt: Fully rendered prompt string.
        llm: Optional model object exposing ``invoke(text)``. When omitted, an
            OpenAI ``gpt-3.5-turbo`` model is created as before.
            NOTE(review): that default presumably needs the ``openai`` package
            and an API key, which this notebook does not set up — confirm.

    Returns:
        The model's reply as text. If the model returns a message object with a
        ``content`` attribute, that attribute is returned.
    """
    if llm is None:
        # Imported lazily: the notebook's import cell never brings OpenAI into scope.
        from langchain_community.llms import OpenAI
        llm = OpenAI(model_name="gpt-3.5-turbo")  # Specify your model here

    result = llm.invoke(prompt)
    # Plain LLMs return a str; chat models return a message carrying `.content`.
    return getattr(result, "content", result)

# Main execution flow
# Runs the five pipeline steps in order; every intermediate value is kept in a
# notebook-level variable so it can be inspected afterwards.
user_id = 1  # Example user ID
query = "latest news"  # Example query for RAG

# Step 1: Fetch user preferences
preferences = fetch_user_preferences(user_id)

# Step 2: Fetch documents based on RAG query
documents = fetch_documents(query)

# Step 3: Combine preferences and documents
combined_data = combine_preferences_and_documents(preferences, documents)

# Step 4: Generate prompt for LLM
prompt = generate_prompt(combined_data)

# Step 5: Feed prompt to LLM and get response
response = feed_to_llm(prompt)

# Output the final response
print(response)


Frontend

In [1]:
!pip --quiet install gradio

In [4]:
import gradio as gr
import requests

# Backend URL
# Loopback address of the FastAPI server started earlier in this notebook.
# No trailing slash, so endpoint paths can be appended with a single "/".
backend_url = "http://127.0.0.1:8000"

# Function to send data to backend and fetch news
def fetch_news(user_id, category, custom_category=None):
    """Request news for a user/category from the backend and format it for display.

    Args:
        user_id: Identifier of the selected user.
        category: Selected news category.
        custom_category: Optional free-text category; defaults to ``None`` so
            the function can be called with only the two values the UI provides.

    Returns:
        The news items as text, one ``**title**`` / content pair per item
        separated by blank lines, or ``"Error fetching news!"`` when the
        backend cannot be reached or does not answer with HTTP 200.
    """
    payload = {
        "user_id": user_id,
        "category": category,
        "custom_category": custom_category,
    }
    # rstrip guards against a configured base URL that ends with "/": a doubled
    # slash ("//fetch_news/") does not match the backend route.
    endpoint = f"{backend_url.rstrip('/')}/fetch_news/"

    try:
        response = requests.post(endpoint, json=payload, timeout=60)
    except requests.exceptions.RequestException:
        # Connection refused, timeout, ...: report it the same way as a bad status.
        return "Error fetching news!"

    if response.status_code == 200:
        news = response.json().get("news", [])
        cards = "\n\n".join([f"**{n['title']}**\n{n['content']}" for n in news])
        return cards
    else:
        return "Error fetching news!"

# Gradio UI
users = ["1", "2", "3"]


def _fetch_news_for_ui(user_id, category):
    """Adapter for the button: the form has no custom-category input, so pass None for it."""
    return fetch_news(user_id, category, None)


with gr.Blocks() as ui:
    # User selection
    user_dropdown = gr.Dropdown(users, label="Select User", value="1")
    # Category selection
    category_dropdown = gr.Dropdown(["Artificial Intelligence", "Politics", "Business", "Technology", "Sports", "Entertainment", "Health"], label="Select Category")
    # Button to fetch news
    fetch_button = gr.Button("Fetch News")
    # News display area
    news_display = gr.Textbox(label="News", interactive=False, placeholder="News will appear here")

    # Button click event
    # The handler takes exactly as many arguments as there are input components;
    # wiring the three-argument fetch_news directly to two inputs fails when clicked.
    fetch_button.click(
        fn=_fetch_news_for_ui,
        inputs=[user_dropdown, category_dropdown],
        outputs=news_display
    )

ui.launch()



Running Gradio in a Colab notebook requires sharing enabled. Automatically setting `share=True` (you can turn this off by setting `share=False` in `launch()` explicitly).

Colab notebook detected. To show errors in colab notebook, set debug=True in launch()
* Running on public URL: https://7eae8898f5caaadc62.gradio.live

This share link expires in 72 hours. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)


