## Mathrubhumi Scraper

In [3]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
from datetime import datetime, timedelta
import time
from urllib.parse import urlparse
import re

def get_recent_sitemaps(main_sitemap_url="https://www.mathrubhumi.com/sitemap.xml", days_back=15):
    """Extract recent sitemaps from the main index"""
    try:
        headers = {'User-Agent': 'Mozilla/5.0'}
        response = requests.get(main_sitemap_url, headers=headers, timeout=15)
        response.raise_for_status()
        
        soup = BeautifulSoup(response.content, 'xml')
        cutoff_date = datetime.now() - timedelta(days=days_back)
        recent_sitemaps = []
        
        for sitemap in soup.find_all('sitemap'):
            loc = sitemap.find('loc')
            lastmod = sitemap.find('lastmod')
            
            if not loc or not loc.text:
                continue
                
            # Check if sitemap is recent enough
            sitemap_date = None
            if lastmod and lastmod.text:
                try:
                    sitemap_date = datetime.strptime(lastmod.text[:19], "%Y-%m-%dT%H:%M:%S")
                except ValueError:
                    pass
            
            # Also check for date patterns in the URL itself
            url_date = None
            for segment in loc.text.split('/'):
                if segment.isdigit() and len(segment) == 8:  # YYYYMMDD format
                    try:
                        url_date = datetime.strptime(segment, "%Y%m%d")
                        break
                    except ValueError:
                        continue
            
            # Use the most recent date we found
            current_date = max(d for d in [sitemap_date, url_date] if d is not None)
            
            if current_date and current_date >= cutoff_date:
                recent_sitemaps.append({
                    'url': loc.text,
                    'lastmod': current_date.strftime("%Y-%m-%d")
                })
        
        return recent_sitemaps
        
    except Exception as e:
        print(f"Error fetching sitemap index: {str(e)}")
        return []

def is_article_url(url):
    """Check if URL is an actual article and not a directory listing"""
    # Check for common article URL patterns
    patterns = [
        r'-\d+\.\d+$',  # Ends with -1.12345678
        r'/\d+$',       # Ends with /12345678
        r'\.html$',      # Ends with .html
        r'\.php\?id=\d+' # Ends with .php?id=123456
    ]
    
    for pattern in patterns:
        if re.search(pattern, url):
            return True
    
    # Check if URL has more than 3 path segments (e.g., /news/kerala/article-1.123)
    path_segments = urlparse(url).path.split('/')
    if len([seg for seg in path_segments if seg]) > 3:
        return True
    
    return False

def parse_news_sitemap(sitemap_url, days_back=15):
    """Parse individual news sitemap and extract recent articles"""
    cutoff_date = datetime.now() - timedelta(days=days_back)
    articles = []
    
    try:
        headers = {'User-Agent': 'Mozilla/5.0'}
        response = requests.get(sitemap_url, headers=headers, timeout=15)
        response.raise_for_status()
        
        soup = BeautifulSoup(response.content, 'xml')
        
        for url in soup.find_all('url'):
            try:
                loc = url.find('loc')
                if not loc or not loc.text:
                    continue
                
                url_str = loc.text.lower()
                
                # Check if URL is from /news/ directory and is an actual article
                if '/news/' not in url_str or not is_article_url(url_str):
                    continue
                
                # Extract and validate date
                pub_date = None
                lastmod = url.find('lastmod')
                
                if lastmod and lastmod.text:
                    try:
                        pub_date = datetime.strptime(lastmod.text[:10], "%Y-%m-%d")
                    except ValueError:
                        pass
                
                # Fallback to URL date parsing
                if not pub_date:
                    path = urlparse(loc.text).path
                    for segment in path.split('/'):
                        if segment.isdigit() and len(segment) == 8:  # YYYYMMDD
                            try:
                                pub_date = datetime.strptime(segment, "%Y%m%d")
                                break
                            except ValueError:
                                continue
                
                # Skip if no valid date found or too old
                if not pub_date or pub_date < cutoff_date:
                    continue
                
                # Extract title from URL if not in XML
                title = url.find('news:title') or url.find('image:title')
                title = title.text if title else loc.text.split('/')[-1].replace('-', ' ').title()
                
                # Extract section from URL path (second part of path after /news/)
                path_parts = urlparse(loc.text).path.split('/')
                section = path_parts[2] if len(path_parts) > 2 else 'general'
                
                articles.append({
                    'title': title,
                    'url': loc.text,
                    'date': pub_date.strftime("%Y-%m-%d"),
                    'section': section,
                    'datetime': pub_date  # Adding datetime for sorting
                })
                
            except Exception as e:
                print(f"Error parsing article in {sitemap_url}: {str(e)}")
                continue
                
    except Exception as e:
        print(f"Error parsing sitemap {sitemap_url}: {str(e)}")
    
    return articles

def get_top_news_articles(days_back=15, articles_per_day=5):
    """Main function to get top news articles per day"""
    print("Fetching recent sitemaps...")
    recent_sitemaps = get_recent_sitemaps(days_back=days_back)
    
    if not recent_sitemaps:
        print("No recent sitemaps found")
        return pd.DataFrame()
    
    print(f"Found {len(recent_sitemaps)} recent sitemaps, parsing articles...")
    all_articles = []
    
    for sitemap in recent_sitemaps:
        print(f"  Processing {sitemap['url']}...")
        articles = parse_news_sitemap(sitemap['url'], days_back=days_back)
        all_articles.extend(articles)
        time.sleep(1)  # Be polite
    
    if not all_articles:
        print("No recent articles found")
        return pd.DataFrame()
    
    # Create DataFrame with strict date filtering
    df = pd.DataFrame(all_articles)
    df['date'] = pd.to_datetime(df['date'])
    
    # Final date filter to ensure no old articles slip through
    cutoff = datetime.now() - timedelta(days=days_back)
    df = df[df['date'] >= cutoff]
    
    if df.empty:
        print(f"No articles within last {days_back} days after final filtering")
        return pd.DataFrame()
    
    # Clean and sort
    df = df.drop_duplicates('url')
    
    # Get top articles per day
    df = df.sort_values(['date', 'datetime'], ascending=False)
    top_articles = df.groupby(df['date'].dt.date).head(articles_per_day)
    
    return top_articles.sort_values('date', ascending=False)

if __name__ == "__main__":
    days_back = 15
    articles_per_day = 5
    
    news_df = get_top_news_articles(days_back=days_back, articles_per_day=articles_per_day)
    
    if not news_df.empty:
        filename = f"mathrubhumi_news_top_{articles_per_day}_per_day_last_{days_back}_days_{datetime.now().strftime('%Y%m%d')}.csv"
        news_df.to_csv(filename, index=False, encoding='utf-8-sig')
        print(f"\nSaved {len(news_df)} articles to {filename}")
        
        print("\nSample of most recent articles:")
        print(news_df[['date', 'title', 'section', 'url']].head().to_string(index=False))
    else:
        print("\nNo recent articles found to save")

Fetching recent sitemaps...
Found 3 recent sitemaps, parsing articles...
  Processing https://www.mathrubhumi.com/polopoly_fs/1.7292585.1746433801!/file.xml...
  Processing https://www.mathrubhumi.com/polopoly_fs/1.10471980.1746036006!/file.xml...
  Processing https://www.mathrubhumi.com/polopoly_fs/1.10552540.1746433802!/file.xml...

Saved 75 articles to mathrubhumi_news_top_5_per_day_last_15_days_20250505.csv

Sample of most recent articles:
      date                                                                       title section                                                                                                                       url
2025-05-05                 Two Youth Dies In Accident At Thiruvananthapuram 1.10562960  kerala                       https://www.mathrubhumi.com/news/kerala/two-youth-dies-in-accident-at-thiruvananthapuram-1.10562960
2025-05-05 Listin Stephen Reacts On Controversies Nivin Pauli Sandra Thomas 1.10562952    news https://www.mathrubhumi

In [6]:
!pip install selenium

Collecting selenium
  Downloading selenium-4.32.0-py3-none-any.whl.metadata (7.5 kB)
Collecting trio~=0.17 (from selenium)
  Downloading trio-0.30.0-py3-none-any.whl.metadata (8.5 kB)
Collecting trio-websocket~=0.9 (from selenium)
  Downloading trio_websocket-0.12.2-py3-none-any.whl.metadata (5.1 kB)
Collecting attrs>=23.2.0 (from trio~=0.17->selenium)
  Downloading attrs-25.3.0-py3-none-any.whl.metadata (10 kB)
Collecting outcome (from trio~=0.17->selenium)
  Downloading outcome-1.3.0.post0-py2.py3-none-any.whl.metadata (2.6 kB)
Collecting wsproto>=0.14 (from trio-websocket~=0.9->selenium)
  Downloading wsproto-1.2.0-py3-none-any.whl.metadata (5.6 kB)
Collecting h11<1,>=0.9.0 (from wsproto>=0.14->trio-websocket~=0.9->selenium)
  Downloading h11-0.16.0-py3-none-any.whl.metadata (8.3 kB)
Downloading selenium-4.32.0-py3-none-any.whl (9.4 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m9.4/9.4 MB[0m [31m3.9 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
[?25hDownload

## Kerala Kaumudi Scraper

In [12]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
from datetime import datetime, timedelta, date
import time
import re

def scrape_kerala_kaumudi():
    # Configuration
    base_url = "https://keralakaumudi.com/news/inc/load-more-latest.php"
    headers = {
        "User-Agent": "Mozilla/5.0",
        "Content-Type": "application/x-www-form-urlencoded",
    }
    days_back = 15
    articles_per_day = 5
    max_requests = 100  # Safety limit to prevent infinite loops

    # Calculate date cutoff (as datetime object)
    cutoff_date = datetime.now() - timedelta(days=days_back)
    articles = []

    offset = 0
    requests_made = 0
    oldest_date_reached = False

    while not oldest_date_reached and requests_made < max_requests:
        # Prepare POST request data
        data = {
            "offset": offset,
            "tag": ""
        }

        try:
            # Make the request
            response = requests.post(base_url, headers=headers, data=data)
            response.raise_for_status()
            
            # Parse HTML response
            soup = BeautifulSoup(response.text.strip(), "html.parser")
            news_items = soup.find_all("div", class_="full-width no-padding cat-news")
            
            if not news_items:
                break

            # Process each news item
            for item in news_items:
                try:
                    # Extract URL
                    link = item.find("a", href=True)
                    if not link:
                        continue
                    url = link["href"]

                    # Extract title
                    title_tag = item.find("h5")
                    title = title_tag.get_text(strip=True) if title_tag else "No Title"

                    # Extract date
                    date_tag = item.find("span", class_="dt-info")
                    if date_tag:
                        date_str = date_tag.get_text(strip=True)
                        try:
                            article_date = datetime.strptime(date_str, "%b %d, %Y")
                        except ValueError:
                            try:
                                article_date = datetime.strptime(date_str, "%B %d, %Y")
                            except:
                                article_date = None
                    else:
                        article_date = None

                    # Skip if we couldn't parse the date
                    if not article_date:
                        continue

                    # Stop if we've reached articles older than our cutoff
                    if article_date.date() < cutoff_date.date():
                        oldest_date_reached = True
                        break

                    # Extract summary
                    summary_tag = item.find("div", class_="full-width no-padding cat-text").find("span")
                    summary = summary_tag.get_text(strip=True) if summary_tag else ""

                    # Add to our collection
                    articles.append({
                        "title": title,
                        "url": url,
                        "date": article_date,
                        "summary": summary,
                        "datetime": article_date  # For sorting
                    })

                except Exception as e:
                    print(f"Error processing article: {e}")
                    continue

            # Prepare for next request
            offset += len(news_items)
            requests_made += 1
            time.sleep(1)  # Be polite

        except Exception as e:
            print(f"Request failed at offset {offset}: {e}")
            break

    # Convert to DataFrame
    df = pd.DataFrame(articles)
    
    # Ensure we have dates in datetime format
    if not df.empty:
        # Convert all dates to datetime64[ns] if they aren't already
        df['date'] = pd.to_datetime(df['date'])
        
        # Filter to only keep articles within our date range
        df = df[df['date'] >= pd.to_datetime(cutoff_date)]
        
        # Sort by date (newest first)
        df = df.sort_values('datetime', ascending=False)
        
        # Get top articles per day
        # Convert datetime to date for grouping
        df['date_only'] = df['date'].dt.date
        top_articles = df.groupby('date_only').head(articles_per_day)
        
        # Clean up before returning
        top_articles = top_articles.drop(columns=['date_only'])
        return top_articles.sort_values('date', ascending=False)
    
    return pd.DataFrame()

if __name__ == "__main__":
    print("Scraping Kerala Kaumudi news...")
    news_df = scrape_kerala_kaumudi()
    
    if not news_df.empty:
        filename = f"kerala_kaumudi_top_5_per_day_last_15_days_{datetime.now().strftime('%Y%m%d')}.csv"
        # Convert datetime to string for CSV output
        news_df['date'] = news_df['date'].dt.strftime('%Y-%m-%d')
        news_df.to_csv(filename, index=False, encoding='utf-8-sig')
        print(f"\nSaved {len(news_df)} articles to {filename}")
        
        print("\nSample of most recent articles:")
        print(news_df[['date', 'title', 'url']].head(10).to_string(index=False))
    else:
        print("\nNo recent articles found to save")

Scraping Kerala Kaumudi news...

Saved 75 articles to kerala_kaumudi_top_5_per_day_last_15_days_20250505.csv

Sample of most recent articles:
      date                                                                                                                         title                                                                                                                                url
2025-05-05         'മമ്മൂട്ടിയും മോഹൻലാലും ഒന്നും തന്നില്ലേയെന്നാണ് ചോദിക്കുന്നത്, സുബി സിനിമകൾ മനഃപൂർവം ഒഴിവാക്കി'; വെളിപ്പെടുത്തി അമ്മ                                              https://keralakaumudi.com/news/news.php?id=1528547&u=life-of-late-actress-subi-suresh
2025-05-05                                                     പേവിഷബാധയേറ്റ് മരിച്ച നിയ ഫൈസലിന്റെ മൃതദേഹം ഖബറടക്കി; മാതാവ് ക്വാറന്റീനിൽ             https://keralakaumudi.com/news/news.php?id=1528494&u=body-of-nia-faisal-who-died-of-rabies-buried-mother-in-quarantine
2025-05-05                  ആദ്യം ആര് റൊട്ടി കഴിക്കുമെ

## Manorama News Scraper

In [14]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
from datetime import datetime, timedelta
import time
import re

def scrape_manorama_online():
    # Configuration
    base_url = "https://www.manoramaonline.com/news/latest-news.html"
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36"
    }
    days_back = 15
    articles_per_day = 5
    max_pages = 30  # Safety limit to prevent infinite loops

    # Calculate date cutoff
    cutoff_date = datetime.now() - timedelta(days=days_back)
    articles = []
    seen_urls = set()  # To track unique URLs

    page = 1
    oldest_date_reached = False

    while not oldest_date_reached and page <= max_pages:
        try:
            # Construct URL with pagination
            if page == 1:
                url = base_url
            else:
                url = f"{base_url}?page={page}"

            # Make the request
            response = requests.get(url, headers=headers)
            response.raise_for_status()
            
            # Parse HTML response
            soup = BeautifulSoup(response.text, 'html.parser')
            news_items = soup.find_all('li', class_='cmp-story-list__item')
            
            if not news_items:
                break

            # Process each news item
            for item in news_items:
                try:
                    # Extract URL
                    link = item.find('a', class_='cmp-story-list__title-link', href=True)
                    if not link:
                        continue
                    
                    # Construct full URL and check for duplicates
                    partial_url = link['href']
                    full_url = "https://www.manoramaonline.com" + partial_url
                    
                    if full_url in seen_urls:
                        continue  # Skip duplicate articles
                    seen_urls.add(full_url)

                    # Extract title
                    title_tag = item.find('h2', class_='cmp-story-list__title')
                    title = title_tag.get_text(strip=True) if title_tag else "No Title"

                    # Extract date from URL (format: /news/latest-news/2025/05/05/...)
                    date_match = re.search(r'/(\d{4})/(\d{2})/(\d{2})/', partial_url)
                    if date_match:
                        year, month, day = date_match.groups()
                        article_date = datetime(int(year), int(month), int(day))
                    else:
                        # Fallback to current date if no date in URL
                        article_date = datetime.now()

                    # Stop if we've reached articles older than our cutoff
                    if article_date.date() < cutoff_date.date():
                        oldest_date_reached = True
                        break

                    # Extract summary
                    summary_tag = item.find('p', class_='cmp-story-list__dispn')
                    summary = summary_tag.get_text(strip=True) if summary_tag else ""

                    # Add to our collection
                    articles.append({
                        "title": title,
                        "url": full_url,
                        "date": article_date,
                        "summary": summary,
                        "datetime": article_date  # For sorting
                    })

                except Exception as e:
                    print(f"Error processing article: {e}")
                    continue

            # Prepare for next page
            page += 1
            time.sleep(1)  # Be polite

        except Exception as e:
            print(f"Request failed at page {page}: {e}")
            break

    # Convert to DataFrame
    df = pd.DataFrame(articles)
    
    # Ensure we have dates in datetime format
    if not df.empty:
        df['date'] = pd.to_datetime(df['date'])
        
        # Filter to only keep articles within our date range
        df = df[df['date'] >= pd.to_datetime(cutoff_date)]
        
        # Final duplicate check (in case same URL appears on different pages)
        df = df.drop_duplicates(subset=['url'], keep='first')
        
        # Sort by date (newest first)
        df = df.sort_values('datetime', ascending=False)
        
        # Get top articles per day
        df['date_only'] = df['date'].dt.date
        top_articles = df.groupby('date_only').head(articles_per_day)
        
        # Clean up before returning
        top_articles = top_articles.drop(columns=['date_only'])
        return top_articles.sort_values('date', ascending=False)
    
    return pd.DataFrame()

if __name__ == "__main__":
    print("Scraping Manorama Online news...")
    news_df = scrape_manorama_online()
    
    if not news_df.empty:
        filename = f"manorama_top_5_per_day_last_15_days_{datetime.now().strftime('%Y%m%d')}.csv"
        # Convert datetime to string for CSV output
        news_df['date'] = news_df['date'].dt.strftime('%Y-%m-%d')
        news_df.to_csv(filename, index=False, encoding='utf-8-sig')
        print(f"\nSaved {len(news_df)} unique articles to {filename}")
        
        print("\nSample of most recent articles:")
        print(news_df[['date', 'title', 'url']].head(10).to_string(index=False))
    else:
        print("\nNo recent articles found to save")

Scraping Manorama Online news...

Saved 75 unique articles to manorama_top_5_per_day_last_15_days_20250505.csv

Sample of most recent articles:
      date                                                                                                                        title                                                                                                                                           url
2025-05-05                    തുർക്കി യുദ്ധക്കപ്പൽ കറാച്ചി തുറമുഖത്ത്, ബുധനാഴ്ച വരെ തുടരും; പാക്ക് നാവികസേന ഉദ്യോഗസ്ഥരുമായി കൂടിക്കാഴ്ച                       https://www.manoramaonline.com/news/latest-news/2025/05/05/turkey-pakistan-naval-cooperation-warship-karachi-visit.html
2025-05-05                                        അടിവയറ്റിലെ കൊഴുപ്പു നീക്കാൻ ശസ്ത്രക്രിയ; വനിതാ സോഫ്റ്റ്‌വെയർ എൻജിനീയർ ഗുരുതരാവസ്ഥയിൽ                      https://www.manoramaonline.com/district-news/thiruvananthapuram/2025/05/05/software-engineer-critical-after-surgery.html
2025-05-05            