In [1]:
!pip install feedparser

Collecting feedparser
  Using cached feedparser-6.0.11-py3-none-any.whl.metadata (2.4 kB)
Collecting sgmllib3k (from feedparser)
  Using cached sgmllib3k-1.0.0-py3-none-any.whl
Using cached feedparser-6.0.11-py3-none-any.whl (81 kB)
Installing collected packages: sgmllib3k, feedparser
Successfully installed feedparser-6.0.11 sgmllib3k-1.0.0



[notice] A new release of pip is available: 24.0 -> 24.2
[notice] To update, run: python.exe -m pip install --upgrade pip


In [10]:
import feedparser
from datetime import datetime

rss_feeds = [
    'http://rss.cnn.com/rss/cnn_topstories.rss',
    'http://qz.com/feed',
    'http://feeds.foxnews.com/foxnews/politics',
    'http://feeds.reuters.com/reuters/businessNews',
    'http://feeds.feedburner.com/NewshourWorld',
    'https://feeds.bbci.co.uk/news/world/asia/india/rss.xml'
]

In [11]:
import torch
from transformers import DistilBertTokenizer, DistilBertModel

# Check for GPU availability and set the device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Load the DistilBERT model and tokenizer
tokenizer = DistilBertTokenizer.from_pretrained("distilbert-base-uncased")
model = DistilBertModel.from_pretrained("distilbert-base-uncased").to(device)

# Define a padding token if it doesn't exist
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token  # Set the padding token to the end-of-sequence token

# Example articles
# articles = [
#     {
#         "title": "Protests Erupt in the Capital",
#         "description": "Citizens are rallying against new legislation."
#     },
#     {
#         "title": "Hurricane Causes Widespread Damage",
#         "description": "The hurricane has left thousands homeless."
#     },
#     {
#         "title": "A New Vaccine Offers Hope",
#         "description": "Scientists have developed a vaccine that is 90% effective."
#     },
#     {
#         "title": "Earthquake Rocks the City",
#         "description": "A major earthquake has struck, causing extensive damage."
#     }
# ]

# Define categories
categories = [
    "Terrorism / protest / political unrest / riot",
    "Positive / Uplifting",
    "Natural Disasters"
]

# Function to generate embeddings
def get_embeddings(text):
    inputs = tokenizer(text, return_tensors="pt", padding=True, truncation=True).to(device)  # Move inputs to GPU
    with torch.no_grad():
        outputs = model(**inputs)
        embeddings = outputs.last_hidden_state.mean(dim=1)  # Average pooling over the last hidden state
    return embeddings

# Generate embeddings for categories
category_embeddings = [get_embeddings(category) for category in categories]

# Function to classify articles
def classify_article(article):
    full_text = f"{article['title']}: {article['description']}"
    article_embedding = get_embeddings(full_text)

    # Calculate cosine similarity
    similarities = []
    for category_embedding in category_embeddings:
        sim = torch.nn.functional.cosine_similarity(article_embedding, category_embedding)
        similarities.append(sim.item())
    
    # Get the category with the highest similarity
    max_index = similarities.index(max(similarities))
    max_sim = max(similarities)
    if max_sim>0.6:
        return categories[max_index],max_sim
    else :
        return "Others",max_sim

In [12]:
def parse_feed(url):
    feed = feedparser.parse(url)
    articles = []
    for entry in feed.entries:
        # Extract the media content (if available)
        media_content = None
        if 'media_content' in entry:
            media_content = entry.media_content[0]['url'] if entry.media_content else None
        
        # Extract published date
        published_date = None
        if hasattr(entry, 'published_parsed'):
            published_date = datetime(*entry.published_parsed[:6])
        
        article = {
            'title': entry.title,
            'content': entry.summary if 'summary' in entry else '',  # Handle missing summary
            'published': published_date,
            'url': entry.link,
            'media_url': media_content  # Add media URL if present
        }
        articles.append(article)
    return articles

# Parse all feeds
for feed_url in rss_feeds:
    articles = parse_feed(feed_url)
    # Add logic to process/store articles
    for article in articles:
        print(article)

{'title': 'Some on-air claims about Dominion Voting Systems were false, Fox News acknowledges in statement after deal is announced', 'content': '', 'published': datetime.datetime(2023, 4, 19, 12, 44, 51), 'url': 'https://www.cnn.com/business/live-news/fox-news-dominion-trial-04-18-23/index.html', 'media_url': 'https://cdn.cnn.com/cnnnext/dam/assets/230418164538-02-dominion-fox-trial-settlement-0418-super-169.jpg'}
{'title': 'Dominion still has pending lawsuits against election deniers such as Rudy Giuliani and Sidney Powell', 'content': '', 'published': None, 'url': 'https://www.cnn.com/business/live-news/fox-news-dominion-trial-04-18-23/h_8d51e3ae2714edaa0dace837305d03b8', 'media_url': 'https://cdn.cnn.com/cnnnext/dam/assets/230417170417-fox-news-headquarters-0228-super-169.jpg'}
{'title': 'Here are the 20 specific Fox broadcasts and tweets Dominion says were defamatory', 'content': "• Fox-Dominion trial delay 'is not unusual,' judge says\n• Fox News' defamation battle isn't stopping 

In [15]:
articles_formatted = []
for article in articles:
    articles_formatted.append(
        {'title':article['title'],
        'description':article['content']},        
        )
    
articles_formatted

[{'title': "Modi's BJP ahead in Haryana election but trails in Kashmir",
  'description': "PM Modi's party is leading in the northern state of Haryana but may fall short in Indian-administered Kashmir."},
 {'title': "Indian financial aid opens 'new chapter' with Maldives",
  'description': 'The deal comes as the Maldives president visits Delhi after relations nosedived in recent months.'},
 {'title': 'Why are there so many dropped catches at T20 World Cup?',
  'description': "BBC Sport looks at the significant number of dropped catches at the Women's T20 World Cup so far and the possible explanations."},
 {'title': 'Climbers rescued after three days in the Himalayas',
  'description': 'A British climber who went missing in the Himalayas has spoken of her relief after surviving for two days in "brutal" conditions that put her life in danger.'},
 {'title': 'Maldives president visits India amid trouble in tourist paradise',
  'description': 'President Muizzu is visiting India at a time wh

In [17]:
for article in articles_formatted:
    category,max_sim = classify_article(article)
    print(f"Title: {article['title']}\nCategory: {category}\n")

Title: Modi's BJP ahead in Haryana election but trails in Kashmir
Category: Terrorism / protest / political unrest / riot

Title: Indian financial aid opens 'new chapter' with Maldives
Category: Terrorism / protest / political unrest / riot

Title: Why are there so many dropped catches at T20 World Cup?
Category: Others

Title: Climbers rescued after three days in the Himalayas
Category: Others

Title: Maldives president visits India amid trouble in tourist paradise
Category: Terrorism / protest / political unrest / riot

Title: The Polish artist who painted Hindu gods in Indian palaces
Category: Terrorism / protest / political unrest / riot

Title: India government says criminalising marital rape 'excessively harsh'
Category: Terrorism / protest / political unrest / riot

Title: India's foreign minister to visit Pakistan for the first time since 2015
Category: Terrorism / protest / political unrest / riot

Title: How India became a Test cricket powerhouse
Category: Others

Title: Indi

In [19]:
import feedparser
from datetime import datetime

rss_feeds = [
    'http://rss.cnn.com/rss/cnn_topstories.rss',
    'http://qz.com/feed',
    'http://feeds.foxnews.com/foxnews/politics',
    'http://feeds.reuters.com/reuters/businessNews',
    'http://feeds.feedburner.com/NewshourWorld',
    'https://feeds.bbci.co.uk/news/world/asia/india/rss.xml'
]

def parse_feed(url):
    feed = feedparser.parse(url)
    articles = []
    for entry in feed.entries:
        # Extract the media content (if available)
        media_content = None
        if 'media_content' in entry:
            media_content = entry.media_content[0]['url'] if entry.media_content else None
        
        # Extract published date
        published_date = None
        if hasattr(entry, 'published_parsed'):
            published_date = datetime(*entry.published_parsed[:6])
        
        article = {
            'title': entry.title,
            'content': entry.summary if 'summary' in entry else '',  # Handle missing summary
            'published': published_date,
            'url': entry.link,
            'media_url': media_content , 
            'category':""
        }
        articles_required = {
            'title': entry.title,
            'description': entry.summary if 'summary' in entry else '',
        }
        article_category,sim = classify_article(articles_required)
        article['category'] = article_category
        articles.append(article)
    return articles

print(parse_feed('http://rss.cnn.com/rss/cnn_topstories.rss'))

