In [3]:
import os
import json
import time
import random
import zipfile
import requests
import pandas as pd
from bs4 import BeautifulSoup

# Class Explanation: `NewsScraper`

## Overview
The `NewsScraper` class is designed for scraping news articles from three different Urdu news websites: Geo, Jang, and Express. The class has methods that cater to each site's unique structure and requirements. Below, we will go through the class and its methods, detailing what each function does, the input it takes, and the output it returns.

## Class Definition

```python
class NewsScraper:
    def __init__(self, id_=0):
        self.id = id_
```


## Method 1: `get_express_articles`

### Description
Scrapes news articles from the Express website across categories like saqafat (entertainment), business, sports, science-technology, and world. The method navigates through multiple pages for each category to gather a more extensive dataset.

### Input
- **`max_pages`**: The number of pages to scrape for each category (default is 1).

### Process
- Iterates over each category and page.
- Requests each category page and finds article cards within `<ul class='tedit-shortnews listing-page'>`.
- Extracts the article's headline, link, and content by navigating through `<div class='horiz-news3-caption'>` and `<span class='story-text'>`.

### Output
- **Returns**: A Pandas DataFrame containing the columns `id`, `title`, `link`, `content`, and `gold_label`.
  - The article text is stored in the `content` column of the row carrying the article's `id`, rather than in a separate `express_contents` dictionary.

### Data Structure
- Article cards are identified by `<li>` tags.
- Content is structured within `<span class='story-text'>` and `<p>` tags.



In [None]:
class NewsScraper:
    """Scrape category listing pages and the articles they link to.

    Every successfully scraped article receives the current value of
    ``self.id``, which is then incremented, so ids stay unique across
    successive calls on the same instance.
    """

    # Seconds to wait for any single HTTP response before giving up, so a
    # stalled server cannot hang the notebook cell indefinitely.
    REQUEST_TIMEOUT = 30

    # Exact-match category renames applied when writing `gold_label`.
    # Whole-string lookup (not substring replacement) keeps a category such
    # as 'health-science' from being rewritten to 'health-science-technology'.
    _LABEL_ALIASES = {'saqafat': 'entertainment', 'science': 'science-technology'}

    def __init__(self,id_=0):
        """Store the id that will be given to the first scraped article."""
        self.id = id_


    # write functions to scrape from other websites

    @staticmethod
    def _listing_url(base_url, category, page):
        """Build a category listing URL without doubling the slash after the host."""
        return f"{base_url.rstrip('/')}/{category}/latest-news{page}"

    @classmethod
    def _gold_label(cls, category):
        """Return the label stored for `category` (renamed only on an exact alias match)."""
        return cls._LABEL_ALIASES.get(category, category)

    def get_express_articles(self, max_pages=1):
        """Scrape up to `max_pages` listing pages for each hard-coded category.

        Args:
            max_pages: Number of listing pages to request per category.

        Returns:
            pandas.DataFrame with the columns `id`, `title`, `link`,
            `content` and `gold_label`, one row per scraped article.

        Raises:
            requests.RequestException: If a listing page cannot be fetched
                (including a timeout). Failures on individual articles are
                reported with a message and skipped.
        """
        express_df = {
            "id": [],
            "title": [],
            "link": [],
            "content": [],
            "gold_label": [],
        }
        # NOTE(review): the method name says "express" but the base URL points
        # at jang.com.pk, while the CSS classes below are the ones documented
        # for the Express layout -- confirm which site is intended.
        base_url = 'https://jang.com.pk/'
        categories = ['entertainment', 'business', 'sports', 'health-science', 'world']   # saqafat is entertainment category

        # Iterating over the specified number of pages
        for category in categories:
            for page in range(1, max_pages + 1):
                print(f"Scraping page {page} of category '{category}'...")
                url = self._listing_url(base_url, category, page)
                response = requests.get(url, timeout=self.REQUEST_TIMEOUT)
                response.raise_for_status()
                soup = BeautifulSoup(response.text, "html.parser")

                # Finding article cards; a page without the expected list is
                # skipped instead of raising AttributeError on None.
                listing = soup.find('ul', class_='tedit-shortnews listing-page')  # Adjust class as per actual site structure
                if listing is None:
                    print(f"\t--> No article listing found on page {page} of '{category}'; skipping.")
                    continue
                cards = listing.find_all('li')
                print(f"\t--> Found {len(cards)} articles on page {page} of '{category}'.")

                success_count = 0

                for card in cards:
                    try:
                        div = card.find('div',class_='horiz-news3-caption')
                        anchor = div.find('a')

                        # Article Title
                        headline = anchor.get_text(strip=True).replace('\xa0', ' ')

                        # Article link
                        link = anchor['href']

                        # Requesting the content from each article's link
                        article_response = requests.get(link, timeout=self.REQUEST_TIMEOUT)
                        article_response.raise_for_status()
                        content_soup = BeautifulSoup(article_response.text, "html.parser")

                        # Content arranged in paras inside <span> tags
                        paras = content_soup.find('span',class_='story-text').find_all('p')

                        # Drop empty paragraphs, non-breaking and zero-width spaces
                        combined_text = " ".join(
                            p.get_text(strip=True).replace('\xa0', ' ').replace('\u200b', '')
                            for p in paras if p.get_text(strip=True)
                        )

                        # Storing data (appended only after every step above
                        # succeeded, so the column lists stay the same length)
                        express_df['id'].append(self.id)
                        express_df['title'].append(headline)
                        express_df['link'].append(link)
                        express_df['gold_label'].append(self._gold_label(category))
                        express_df['content'].append(combined_text)

                        # Increment ID and success count
                        self.id += 1
                        success_count += 1

                    except Exception as e:
                        # Best effort: one broken article must not stop the page.
                        print(f"\t--> Failed to scrape an article on page {page} of '{category}': {e}")

                print(f"\t--> Successfully scraped {success_count} articles from page {page} of '{category}'.")
            print('')

        return pd.DataFrame(express_df)

In [6]:
import re
from urllib.parse import urljoin

import pandas as pd
import requests
from bs4 import BeautifulSoup

# Category titles to scrape; a link is followed only when its text equals one of these exactly
cats = [
    "World News",
    "Sports News",
    "Science & Technology News",
    "Business News",
    "Entertainment News",
]

class NewsScraper:
    """Scrape news articles category by category, starting from a site's footer links.

    Every stored article receives the current value of ``self.id``, which is
    then incremented, so ids stay unique across categories and pages.
    """

    # Seconds to wait for any single HTTP response before giving up, so a
    # stalled server cannot hang the notebook cell indefinitely.
    REQUEST_TIMEOUT = 30

    # Column names shared by every result dictionary / DataFrame.
    _COLUMNS = ("id", "title", "link", "content", "gold_label")

    def __init__(self, id_=0):
        """Store the id that will be given to the first scraped article."""
        self.id = id_

    @staticmethod
    def _descend(node, steps):
        """Follow a chain of (tag, css_class) lookups; return None as soon as one is missing."""
        for tag, css_class in steps:
            if node is None:
                return None
            node = node.find(tag, class_=css_class)
        return node

    @staticmethod
    def _label_from_url(category_url):
        """Return the last path segment of `category_url`, ignoring a trailing slash."""
        return category_url.rstrip('/').rsplit('/', 1)[-1]

    def _fetch_article(self, article_url):
        """Download one article page and return ``(title, content)``.

        Returns None when the page has no ``detail-right`` container, in which
        case the caller stores nothing. Title and content fall back to
        placeholder strings so a row never reuses values left over from the
        previously scraped article.
        """
        article_response = requests.get(article_url, timeout=self.REQUEST_TIMEOUT)
        article_response.raise_for_status()
        content_soup = BeautifulSoup(article_response.text, "html.parser")

        detail_right = self._descend(content_soup, [
            ('section', 'detail-page'),
            ('div', 'container'),
            ('div', 'detail-right'),
        ])
        if detail_right is None:
            return None

        # Get article title
        title = "No title found"
        detail_right_top = detail_right.find('div', class_='detail-right-top')
        if detail_right_top is not None:
            title_tag = detail_right_top.find('h1')
            if title_tag is not None:
                title = title_tag.get_text(strip=True)

        # Get article content
        article_content = "No content found"
        body = self._descend(detail_right, [
            ('div', 'detail-content'),
            ('div', 'description-area'),
            ('div', 'detail_view_content'),
        ])
        if body is not None:
            article_content = " ".join(para.get_text(strip=True) for para in body.find_all('p'))

        return title, article_content

    def get_category_links(self, base_url):
        """Collect the category page links listed in the home page footer.

        Args:
            base_url: URL of the page whose footer is inspected.

        Returns:
            dict mapping each category URL to its label (the last path
            segment of that URL). Only links whose text appears in the
            module-level `cats` list are kept. Empty when the expected
            footer structure is not present.

        Raises:
            requests.RequestException: If `base_url` cannot be fetched.
        """
        response = requests.get(base_url, timeout=self.REQUEST_TIMEOUT)
        response.raise_for_status()
        soup = BeautifulSoup(response.text, 'html.parser')

        footer_links = {}
        quick_links = self._descend(soup, [
            ('section', 'footer'),
            ('div', 'footer_content'),
            ('div', 'first_footer'),
            ('div', 'col-lg-3 col-md-3 col-sm-6 col-xs-6 h_footer'),
            ('ul', 'footer-list'),
            ('div', 'footer-quick1'),
        ])
        if quick_links is None:
            return footer_links

        for li in quick_links.find_all('li'):
            a_tag = li.find('a')
            if a_tag is None or not a_tag.get('href'):
                continue

            # Check if the category title is in the `cats` list
            if a_tag.get_text(strip=True) not in cats:
                continue

            # Absolute hrefs pass through unchanged; relative ones are resolved.
            category_url = urljoin(base_url, a_tag['href'])
            # Extract last part of the URL to use as the category label
            footer_links[category_url] = self._label_from_url(category_url)

        return footer_links

    def get_articles_from_category(self, category_url, label, max_pages=1):
        """Scrape the articles listed on the first `max_pages` pages of one category.

        Args:
            category_url: URL of the category listing; pages after the first
                are requested as ``<category_url>?page=<n>``.
            label: Value written to `gold_label` for every article found.
            max_pages: Number of listing pages to request.

        Returns:
            dict of equal-length lists keyed by `id`, `title`, `link`,
            `content` and `gold_label`.

        Raises:
            requests.RequestException: If a listing page cannot be fetched.
                A failure on an individual article is reported and skipped.
        """
        articles_data = {column: [] for column in self._COLUMNS}

        for page in range(1, max_pages + 1):
            page_url = f"{category_url}?page={page}" if page > 1 else category_url
            response = requests.get(page_url, timeout=self.REQUEST_TIMEOUT)
            response.raise_for_status()
            soup = BeautifulSoup(response.text, 'html.parser')

            # The listing is only looked for on pages that carry the
            # 'latest_page' section.
            if soup.find('section', class_='latest_page') is None:
                continue
            listing = self._descend(soup, [
                ('section', 'latest_page_list'),
                ('div', 'container'),
                ('div', 'latest_page_right'),
                ('ul', 'scrollPaginationNew__'),
            ])
            if listing is None:
                continue

            for li in listing.find_all('li'):
                main_heading = li.find('div', class_='main-heading')
                if main_heading is None:
                    continue
                a_tag = main_heading.find('a')
                if a_tag is None or not a_tag.get('href'):
                    continue

                full_url = urljoin(page_url, a_tag['href'])
                try:
                    article = self._fetch_article(full_url)
                except requests.RequestException as e:
                    # Best effort: one unreachable article must not discard
                    # everything collected so far.
                    print(f"Failed to fetch article {full_url}: {e}")
                    continue
                if article is None:
                    continue
                title, article_content = article

                # Add data to articles_data
                articles_data["id"].append(self.id)
                articles_data["title"].append(title)
                articles_data["link"].append(full_url)
                articles_data["content"].append(article_content)
                articles_data["gold_label"].append(label)  # Set label as gold label

                # Print statement after each article is added
                print(f"Added article: {title}")

                self.id += 1

        return articles_data

    def get_all_articles(self, base_url, max_pages=1):
        """Scrape every whitelisted category reachable from `base_url`.

        Args:
            base_url: Home page whose footer lists the category links.
            max_pages: Number of listing pages to request per category.

        Returns:
            pandas.DataFrame with the columns `id`, `title`, `link`,
            `content` and `gold_label`; empty when no category link matched.
        """
        category_links = self.get_category_links(base_url)

        all_articles = {column: [] for column in self._COLUMNS}

        for category_url, label in category_links.items():
            category_data = self.get_articles_from_category(category_url, label, max_pages)
            for key in all_articles:
                all_articles[key].extend(category_data[key])

        return pd.DataFrame(all_articles)

# Single source of truth for the output file, used for both writing and reporting
OUTPUT_CSV = "scraped_articles.csv"

# Initialize scraper
scraper = NewsScraper()

# Set the base URL and max pages to scrape
base_url = "https://jang.com.pk/"
articles_df = scraper.get_all_articles(base_url, max_pages=1)

# Save the data to CSV file
articles_df.to_csv(OUTPUT_CSV, index=False)

# Confirm that the data is saved
print(f"Data saved to {OUTPUT_CSV}")


Added article: امید ہے ٹرمپ اپنے دوسرے دور صدارت میں جنگیں رکوائیں گے، ترک صدر
Added article: اسرائیلی وزیراعظم کے دفتر پر نیتن یاہو کے قریبی ساتھیوں کی  حساس ویڈیوز جمع کرنے کا الزام
Added article: ‎امریکی مسلمانوں، عرب ووٹرز نے ڈیموکریٹس کو چھوڑ دیا: سی اے آئی آر
Added article: ایمسٹرڈیم: اسرائیلی فٹبال شائقین پر حملہ، 11 زخمی
Added article: ڈونلڈ ٹرمپ نے سوزی وائلز کو وائٹ ہاؤس کی چیف آف اسٹاف مقرر کر دیا
Added article: ڈونلڈ ٹرمپ کا روسی صدر پیوٹن سے بات کی خواہش کا اظہار
Added article: روسی صدر پیوٹن کی ٹرمپ کو انتخاب میں کامیابی پر مبارکباد
Added article: دنیا کی کوئی طاقت آرٹیکل 370 دوبارہ نہیں لاسکتی، نریندر مودی
Added article: ٹرمپ کی کامیابی کے بعد خارجہ پالیسی پر انتہاپسندوں کے غلبے کا خدشہ ہے، امریکی جریدہ
Added article: اسرائیل نے امریکا سے 25 جنگی طیارے خریدنے کا معاہدہ کرلیا
Added article: وزیر خزانہ کی برطرفی سے جرمن حکومت کو خطرہ لاحق، اپوزیشن کا اعتماد کا ووٹ لینے کا مطالبہ
Added article: فتح کے بعد فیملی تصویر، ٹرمپ کی اہلیہ غائب، ایلون مسک اہل خانہ کے ساتھ موجود
Ad

In [5]:
import pandas as pd

# Display names for the raw `gold_label` values; values not listed here are left unchanged
label_names = {
    "health-science": "Science & Technology",
    "world": "International",
    "sports": "Sports",
    "business": "Business",
    "entertainment": "Entertainment",
}

# Read the scraped articles and swap in the display labels in a single chained step
articles_df = pd.read_csv("scraped_articles.csv").assign(
    gold_label=lambda frame: frame["gold_label"].replace(label_names)
)

# Write the relabelled table back over the same CSV file
articles_df.to_csv("scraped_articles.csv", index=False)

# Report completion
print("CSV file updated with revised gold labels.")

CSV file updated with revised gold labels.


# Output
- Save a combined CSV of all 3 sites.