<a href="https://colab.research.google.com/github/DarmaCahya/CrawlerNews/blob/main/test_Crawling.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Web Scraping and Data Collection from News Websites

This notebook demonstrates how to scrape news articles from various websites using Python libraries such as `requests`, `BeautifulSoup`, and `Selenium`. The collected data is then stored in a JSON or CSV file (`scraped_news.json` / `scraped_news.csv`) for further analysis.


**Note:** For the best experience and ease of use, we recommend running this notebook in Visual Studio Code (VS Code).


## Install Libraries
First, we need to install the required libraries. You can install them by running the following commands:


In [1]:
!pip install requests beautifulsoup4 pandas
!pip install selenium

Collecting selenium
  Downloading selenium-4.22.0-py3-none-any.whl (9.4 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m9.4/9.4 MB[0m [31m42.2 MB/s[0m eta [36m0:00:00[0m
Collecting trio~=0.17 (from selenium)
  Downloading trio-0.26.0-py3-none-any.whl (475 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m475.7/475.7 kB[0m [31m31.1 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting trio-websocket~=0.9 (from selenium)
  Downloading trio_websocket-0.11.1-py3-none-any.whl (17 kB)
Collecting outcome (from trio~=0.17->selenium)
  Downloading outcome-1.3.0.post0-py2.py3-none-any.whl (10 kB)
Collecting wsproto>=0.14 (from trio-websocket~=0.9->selenium)
  Downloading wsproto-1.2.0-py3-none-any.whl (24 kB)
Collecting h11<1,>=0.9.0 (from wsproto>=0.14->trio-websocket~=0.9->selenium)
  Downloading h11-0.14.0-py3-none-any.whl (58 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m58.3/58.3 kB[0m [31m7.2 MB/s[0m eta [36m0:00:00[0m
[?25h

# Crawling news data on several websites in a general way

In [83]:
import json
import requests
from bs4 import BeautifulSoup
from datetime import datetime
import pandas as pd
import time

# Browser-like User-Agent header sent with every request issued by get_soup().
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36"
}

# Load the per-site crawler configuration (the list stored under "websites").
# The file is decoded as UTF-8 explicitly: without `encoding`, open() falls back
# to the platform default, which may not be UTF-8 (e.g. on Windows).
with open('news_websites.json', encoding='utf-8') as json_file:
    websites = json.load(json_file)['websites']

def get_soup(url, timeout=10):
    """Fetch ``url`` and return the response body parsed as HTML.

    Args:
        url: Address of the page to download.
        timeout: Seconds ``requests`` may wait on the server before giving up.
            Without a timeout a stalled connection would block the crawl
            indefinitely.

    Returns:
        A ``BeautifulSoup`` tree built with the built-in ``html.parser``.

    Raises:
        requests.HTTPError: The server answered with a 4xx/5xx status.
        requests.Timeout: No response arrived within ``timeout`` seconds.
    """
    response = requests.get(url, headers=headers, timeout=timeout)
    response.raise_for_status()
    return BeautifulSoup(response.text, 'html.parser')

def parse_article(link, website):
    """Return the body text of the article at ``link``.

    The page is fetched with ``get_soup``; the element whose class equals
    ``website['content_class']`` is located and the stripped text of all its
    ``<p>`` descendants is joined with single spaces.

    Any exception (download, lookup, parsing) is reported on stdout and the
    placeholder ``'No content found'`` is returned instead; the same placeholder
    is returned when the content element is missing.
    """
    fallback = 'No content found'
    try:
        page = get_soup(link)
        body = page.find(class_=website['content_class'])
        if not body:
            return fallback
        paragraphs = [node.get_text(strip=True) for node in body.find_all('p')]
        return ' '.join(paragraphs)
    except Exception as err:
        print(f"Error parsing article {link}: {err}")
        return fallback

def parse_page(url, website):
    """Collect one record per article teaser found on a listing page.

    Args:
        url: Listing page, downloaded through ``get_soup``.
        website: Site configuration. Reads ``article_tag``, ``article_class``,
            ``date_class``, ``name`` and ``platform``; ``title_class`` is
            optional.

    Returns:
        A list of dicts with the keys ``title``, ``link``, ``image``, ``date``,
        ``content``, ``is_fake`` (always 0) and ``media_bias``. A field that
        cannot be extracted holds its ``'No ... found'`` placeholder instead of
        raising, so one malformed teaser does not abort the whole page.

    Raises:
        Whatever ``get_soup`` raises for the listing page itself.
    """
    soup = get_soup(url)
    news_data = []
    articles = soup.find_all(website['article_tag'], class_=website['article_class'])
    print(f"Found {len(articles)} articles on page: {url}")

    for article in articles:
        link_tag = article.find('a')

        title_tag = article.find(class_=website.get('title_class'))
        if title_tag:
            title = title_tag.get_text(strip=True)
        elif link_tag:
            # Fall back to the anchor: prefer its title attribute, else its text.
            title = link_tag['title'] if 'title' in link_tag.attrs else link_tag.get_text(strip=True)
        else:
            # Teaser without any anchor: previously crashed on None.attrs.
            title = 'No title found'

        # .get() keeps the placeholder when the attribute itself is absent.
        link = link_tag.get('href', 'No link found') if link_tag else 'No link found'

        date_tag = article.find(class_=website['date_class'])
        date = date_tag.get_text(strip=True) if date_tag else 'No date found'

        # Antara images are read from data-src, every other site from src.
        image_attr = 'data-src' if website['name'] == "Antara" else 'src'
        image_tag = article.find('img')
        image = image_tag.get(image_attr, 'No image found') if image_tag else 'No image found'

        content = parse_article(link, website) if link != 'No link found' else 'No content found'

        news_data.append({
            'title': title,
            'link': link,
            "image": image,
            'date': date,
            'content': content,
            'is_fake': 0,
            'media_bias': website['platform']
        })
        print(f"Appended article: {title}")

    return news_data

def get_all_articles(base_url, website, max_pages=1):
    """Crawl up to ``max_pages`` listing pages of one site.

    Args:
        base_url: URL of the first listing page.
        website: Site configuration; reads ``name`` and, for sites without a
            computed page URL, ``next_page`` (class of the "next" link).
        max_pages: Maximum number of listing pages to visit.

    Returns:
        The concatenated records returned by ``parse_page`` for each page.
    """
    articles = []
    next_page = base_url
    current_page = 1

    while next_page and current_page <= max_pages:
        print(f"Crawling page {current_page}")
        articles.extend(parse_page(next_page, website))

        site_name = website['name']
        if site_name in ('Antara', 'Detik'):
            next_page = f"{base_url}/{current_page + 1}"
        elif site_name == 'Suara':
            next_page = f"{base_url}?page={current_page + 1}"
        elif site_name == 'Tribunnews':
            # Only the first page is crawled for Tribunnews.
            break
        else:
            # Only link-based pagination needs the page markup, so the extra
            # download happens here instead of once per page for every site.
            next_button = get_soup(next_page).find(class_=website['next_page'])
            # A button without href ends the crawl instead of raising KeyError.
            next_page = next_button.get("href") if next_button else None
        current_page += 1
        time.sleep(2)  # pause between listing pages
    return articles

def crawlerGeneral():
    """Crawl every configured website and dump the results to JSON.

    Iterates over the module-level ``websites`` list, collects the records
    returned by ``get_all_articles`` for each site and writes them to
    ``scraped_news.json`` (UTF-8, non-ASCII characters kept as-is).

    A site that fails with any ``requests`` error (HTTP status, timeout,
    connection failure) is reported and skipped, so the articles already
    collected from other sites are still written.

    Returns:
        The list of all collected article records.
    """
    all_news = []
    for website in websites:
        try:
            base_url = website['url']
            scraped_news = get_all_articles(base_url, website)
            print(f"Scraped {len(scraped_news)} articles from {website['name']}")
            all_news.extend(scraped_news)
            time.sleep(2)  # pause between sites
        except requests.RequestException as e:
            # RequestException is the common base of HTTPError, ConnectionError
            # and Timeout; catching only HTTPError let the other two abort the
            # run before anything was saved.
            print(f"Failed to scrape {website['name']}: {e}")

    with open('scraped_news.json', 'w', encoding='utf-8') as f:
        json.dump(all_news, f, ensure_ascii=False, indent=4)

    print(f"Total articles collected: {len(all_news)}")
    return all_news

# Entry point: run the general crawl over every configured website.
if __name__ == "__main__":
    crawlerGeneral()

Crawling page 1
Found 10 articles on page: https://www.cnnindonesia.com/politik/indeks/4
Appended article: Tito Sindir Bupati Petahana Susah Dicari: Sibuk Cari Tiket Pilkada
Appended article: Tito Ungkap 10 Pj Kepala Daerah Mundur untuk Maju Pilkada 2024
Appended article: Rencana Ubah Wantimpres Jadi DPA, DPR Tegaskan Takkan Ubah Fungsi
Appended article: Pemerintah Mulai Susun Draf RUU TNI, Polri, Imigrasi, dan Kementerian
Appended article: Pansus Haji DPR, Momen Ungkap Dugaan Pelanggaran Pemerintah
Appended article: Cak Imin Pastikan Pansus Angket Haji Jalan di Masa Reses DPR
Appended article: IDI Tak Tolak Dokter Asing tapi Desak Pemerintah Prioritaskan WNI
Appended article: Paripurna Sahkan RUU Kepariwisataan Jadi Usul Inisiatif DPR
Appended article: Paripurna DPR Resmi Sahkan RUU Sumber Daya Alam Hayati jadi UU
Appended article: Airlangga Ungkap Rencana Kaesang Akan Bertamu ke Golkar Kamis Esok
Scraped 10 articles from CNN Indonesia
Crawling page 1
Found 20 articles on page: https:

# Crawling news data using user input or by topic

In [1]:
# Import Required Libraries
import json
import requests
from bs4 import BeautifulSoup
from datetime import datetime
import pandas as pd
import time
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

# Define User-Agent for Requests (sent with every request issued by get_soup()).
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36"
}

# Load the per-site crawler configuration (the list stored under "websites").
# Decode explicitly as UTF-8: without `encoding`, open() uses the platform
# default, which may not be UTF-8 (e.g. on Windows).
with open('news_websites.json', encoding='utf-8') as json_file:
    websites = json.load(json_file)['websites']

# Function to Get BeautifulSoup Object from URL
def get_soup(url, timeout=10):
    """Download ``url`` and parse the response text into a BeautifulSoup tree.

    Args:
        url: Address of the page to download.
        timeout: Seconds to wait for the server; prevents a stalled connection
            from blocking the crawl indefinitely.

    Returns:
        A ``BeautifulSoup`` object created with ``html.parser``.

    Raises:
        requests.HTTPError: The response status was 4xx/5xx.
        requests.Timeout: The server did not respond within ``timeout`` seconds.
    """
    response = requests.get(url, headers=headers, timeout=timeout)
    response.raise_for_status()
    return BeautifulSoup(response.text, 'html.parser')

# Function to Parse Individual Article
def parse_article(link, website):
    """Download one article and return its paragraph text as a single string.

    Looks up the element with class ``website['content_class']`` and joins the
    stripped text of its ``<p>`` tags with spaces. Returns ``'No content found'``
    when that element is absent or when anything raises (the error is printed).
    """
    try:
        container = get_soup(link).find(class_=website['content_class'])
        if container:
            pieces = []
            for paragraph in container.find_all('p'):
                pieces.append(paragraph.get_text(strip=True))
            text = ' '.join(pieces)
        else:
            text = 'No content found'
    except Exception as error:
        print(f"Error parsing article {link}: {error}")
        text = 'No content found'
    return text

# Function to Parse Articles on a Page
def parse_page(url, website):
    """Collect one record per article teaser found on a listing page.

    Args:
        url: Listing page, downloaded through ``get_soup``.
        website: Site configuration. Reads ``article_tag``, ``article_class``,
            ``date_class`` and ``platform``; ``title_class`` is optional.

    Returns:
        A list of dicts with the keys ``title``, ``link``, ``date``,
        ``content``, ``is_fake`` (always 0) and ``media_bias``. Missing pieces
        are replaced by their ``'No ... found'`` placeholder.

    Raises:
        Whatever ``get_soup`` raises for the listing page itself.
    """
    soup = get_soup(url)
    news_data = []
    articles = soup.find_all(website['article_tag'], class_=website['article_class'])
    print(f"Found {len(articles)} articles on page: {url}")

    for article in articles:
        link_tag = article.find('a')

        title_tag = article.find(class_=website.get('title_class'))
        if title_tag:
            title = title_tag.get_text(strip=True)
        elif link_tag:
            # Fall back to the anchor: prefer its title attribute, else its text.
            title = link_tag['title'] if 'title' in link_tag.attrs else link_tag.get_text(strip=True)
        else:
            title = 'No title found'

        # An anchor without href yields the placeholder instead of a KeyError
        # that would abort the whole page.
        link = link_tag.get('href', 'No link found') if link_tag else 'No link found'

        date_tag = article.find(class_=website['date_class'])
        date = date_tag.get_text(strip=True) if date_tag else 'No date found'

        content = parse_article(link, website) if link != 'No link found' else 'No content found'

        news_data.append({
            'title': title,
            'link': link,
            'date': date,
            'content': content,
            'is_fake': 0,
            'media_bias': website['platform']
        })
        print(f"Appended article: {title}")

    return news_data

# Function to Get All Articles from a Website
def get_all_articles(base_url, website, max_pages=2):
    """Crawl up to ``max_pages`` result pages of one site.

    Args:
        base_url: URL of the first result page.
        website: Site configuration; reads ``name`` and, for sites without a
            computed page URL, ``next_page`` (class of the "next" link).
        max_pages: Maximum number of pages to visit.

    Returns:
        The concatenated records returned by ``parse_page`` for each page.
    """
    articles = []
    next_page = base_url
    current_page = 1

    while next_page and current_page <= max_pages:
        print(f"Crawling page {current_page}")
        articles.extend(parse_page(next_page, website))

        site_name = website['name']
        if site_name in ('Antara', 'Detik'):
            next_page = f"{base_url}&page={current_page + 1}"
        elif site_name == 'Suara':
            next_page = f"{base_url}?page={current_page + 1}"
        else:
            # Only link-based pagination needs the page markup, so the extra
            # download is limited to this branch instead of running for every
            # site on every page.
            next_button = get_soup(next_page).find(class_=website['next_page'])
            # A button without href ends the crawl instead of raising KeyError.
            next_page = next_button.get("href") if next_button else None
        current_page += 1
        time.sleep(2)  # pause between pages
    return articles

# Function to Execute the Crawling Process
def main():
    """Prompt for a topic, crawl every configured website and save a CSV.

    The topic typed by the user is appended to each site's ``url`` to form the
    first page to crawl. All records are written to ``scraped_news.csv``
    (without the index column).

    A site that fails with any ``requests`` error (HTTP status, timeout,
    connection failure) is reported and skipped so the remaining sites are
    still crawled and the CSV is still written.
    """
    all_news = []
    topik = input("Masukkan Topik: ")  # prompt text is Indonesian for "Enter topic"
    for website in websites:
        try:
            base_url = website['url'] + topik
            scraped_news = get_all_articles(base_url, website)
            print(f"Scraped {len(scraped_news)} articles from {website['name']}")
            all_news.extend(scraped_news)
            time.sleep(2)  # Respectful delay to avoid overwhelming the server
        except requests.RequestException as e:
            # RequestException also covers ConnectionError and Timeout, which
            # the narrower HTTPError handler let escape and abort the run.
            print(f"Failed to scrape {website['name']}: {e}")

    df = pd.DataFrame(all_news)
    print(f"Total articles collected: {len(all_news)}")

    # Save to CSV
    df.to_csv('scraped_news.csv', index=False)

# Entry point: run the interactive topic crawl (main() prompts for the topic).
if __name__ == "__main__":
    main()

ModuleNotFoundError: No module named 'selenium'

**TESTING SUCCESSFUL FOR CRAWLING WITH USER INPUT**

* To crawl news data based on user input, use the code below.

In [None]:
import json
import requests
from bs4 import BeautifulSoup
from datetime import datetime
import pandas as pd
import time
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

# Browser-like User-Agent header sent with every request issued by get_soup().
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36"
}

# Load the per-site crawler configuration (the list stored under "websites").
# Decode explicitly as UTF-8: without `encoding`, open() uses the platform
# default, which may not be UTF-8 (e.g. on Windows).
with open('news_websites.json', encoding='utf-8') as json_file:
    websites = json.load(json_file)['websites']

def get_soup(url, timeout=10):
    """Return the page at ``url`` parsed into a BeautifulSoup tree.

    Args:
        url: Address of the page to download.
        timeout: Upper bound, in seconds, on how long ``requests`` waits for
            the server. Without it a stalled connection never returns.

    Returns:
        A ``BeautifulSoup`` object created with ``html.parser``.

    Raises:
        requests.HTTPError: The response status was 4xx/5xx.
        requests.Timeout: The server did not respond within ``timeout`` seconds.
    """
    response = requests.get(url, headers=headers, timeout=timeout)
    response.raise_for_status()
    return BeautifulSoup(response.text, 'html.parser')

def parse_article(link, website):
    """Fetch the article behind ``link`` and return its text content.

    The text is the space-joined, stripped content of every ``<p>`` inside the
    element whose class is ``website['content_class']``. If that element does
    not exist, or if any step raises (the error is printed), the placeholder
    ``'No content found'`` is returned.
    """
    placeholder = 'No content found'
    try:
        page = get_soup(link)
        holder = page.find(class_=website['content_class'])
        if not holder:
            return placeholder
        return ' '.join(map(lambda p: p.get_text(strip=True), holder.find_all('p')))
    except Exception as exc:
        print(f"Error parsing article {link}: {exc}")
        return placeholder

def _render_page_source(url, wait_seconds=20):
    """Open ``url`` in Chrome via Selenium and return the rendered HTML.

    Waits up to ``wait_seconds`` for elements with class ``animate-pulse``
    (presumably the loading placeholder; verify against the site) to disappear.
    The browser is always closed, including when navigation or the wait fails.
    """
    driver = webdriver.Chrome()
    try:
        # Navigation is inside the try so a failing get() cannot leak the browser.
        driver.get(url)
        WebDriverWait(driver, wait_seconds).until(
            EC.invisibility_of_element_located((By.CLASS_NAME, 'animate-pulse'))
        )
        return driver.page_source
    finally:
        driver.quit()


def parse_page(url, website):
    """Collect one record per article teaser found on a listing page.

    ``CNN Indonesia`` pages are rendered in a Selenium-driven Chrome and parsed
    with fixed selectors; every other site is downloaded with ``get_soup`` and
    parsed with the selectors from ``website``.

    Args:
        url: Listing page to scrape.
        website: Site configuration. Always reads ``name``; for non-CNN sites
            also ``article_tag``, ``article_class``, ``date_class``,
            ``platform`` and the optional ``title_class``.

    Returns:
        A list of dicts with the keys ``title``, ``link``, ``date``,
        ``content``, ``image``, ``is_fake`` (always 0) and ``media_bias``.
        Pieces that cannot be extracted hold a placeholder string.

    Raises:
        Whatever ``get_soup`` or Selenium raise while loading the listing page.
    """
    no_link = 'No link found'
    no_image = 'No image found'
    no_content = 'No content found'
    news_data = []

    if website['name'] == 'CNN Indonesia':
        soup = BeautifulSoup(_render_page_source(url), 'html.parser')

        for article in soup.find_all('article'):
            heading = article.find('h2')
            title = heading.get_text(strip=True) if heading else 'No Title'

            date_tag = article.find('span', class_='text-xs text-cnn_black_light3')
            date = date_tag.get_text(strip=True) if date_tag else 'No date found'

            link_tag = article.find('a')
            link = link_tag.get('href', no_link) if link_tag else no_link

            image_tag = article.find('img')
            image = image_tag.get('src', no_image) if image_tag else no_image

            content = parse_article(link, website) if link != no_link else no_content

            news_data.append({
                'title': title,
                'content': content,
                'date': date,
                'link': link,
                'image': image,
                'is_fake': 0,
                'media_bias': 'CNN Indonesia'
            })
        return news_data

    soup = get_soup(url)
    articles = soup.find_all(website['article_tag'], class_=website['article_class'])
    print(f"Found {len(articles)} articles on page: {url}")

    for article in articles:
        link_tag = article.find('a')

        title_tag = article.find(class_=website.get('title_class'))
        if title_tag:
            title = title_tag.get_text(strip=True)
        elif link_tag:
            # Fall back to the anchor: prefer its title attribute, else its text.
            title = link_tag['title'] if 'title' in link_tag.attrs else link_tag.get_text(strip=True)
        else:
            title = 'No title found'

        link = link_tag.get('href', no_link) if link_tag else no_link

        date_tag = article.find(class_=website['date_class'])
        date = date_tag.get_text(strip=True) if date_tag else 'No date found'

        # Antara images are read from data-src; every other site from src.
        # Previously only Antara and Detik assigned `image`, so any other site
        # hit an unbound (or stale) variable.
        image_attr = 'data-src' if website['name'] == 'Antara' else 'src'
        image_tag = article.find('img')
        image = image_tag.get(image_attr, no_image) if image_tag else no_image

        content = parse_article(link, website) if link != no_link else no_content

        news_data.append({
            'title': title,
            'link': link,
            'date': date,
            'content': content,
            'image': image,
            'is_fake': 0,
            'media_bias': website['platform']
        })
        print(f"Appended article: {title}")

    return news_data

def get_all_articles(base_url, website, max_pages=2):
    """Walk through at most ``max_pages`` result pages and gather their articles.

    Starting at ``base_url``, each page is handed to ``parse_page``. For
    Antara, Suara and Detik the following page is ``base_url`` plus
    ``&page=<n>``; for any other site the page is downloaded with ``get_soup``
    and the ``href`` of the element with class ``website['next_page']`` is
    followed (crawling stops when no such element exists). The loop sleeps two
    seconds after every page.

    Returns:
        A list with the records of all visited pages, in visiting order.
    """
    collected = []
    page_url, page_no = base_url, 1

    while True:
        if not page_url or page_no > max_pages:
            break

        print(f"Crawling page {page_no}")
        collected += parse_page(page_url, website)

        page_no += 1
        if website['name'] in ('Antara', 'Suara', 'Detik'):
            page_url = f"{base_url}&page={page_no}"
        else:
            button = get_soup(page_url).find(class_=website['next_page'])
            page_url = button["href"] if button else None

        time.sleep(2)

    return collected

def crawlerWithTopik(topik):
    """Crawl every configured website for one topic and dump the results to JSON.

    Args:
        topik: Topic string; it is appended verbatim to each site's ``url`` to
            build the first page to crawl.
            NOTE(review): the value is not URL-encoded here; confirm whether
            the configured URLs expect an encoded query value.

    Returns:
        The list of all collected article records, which is also written to
        ``scraped_news.json`` (UTF-8, non-ASCII characters kept as-is).

    A site that fails with any ``requests`` error (HTTP status, timeout,
    connection failure) is reported and skipped, so results from the other
    sites are still saved.
    """
    all_news = []
    for website in websites:
        try:
            base_url = website['url'] + topik
            scraped_news = get_all_articles(base_url, website)
            print(f"Scraped {len(scraped_news)} articles from {website['name']}")
            all_news.extend(scraped_news)
            time.sleep(2)  # pause between sites
        except requests.RequestException as e:
            # RequestException is the common base of HTTPError, ConnectionError
            # and Timeout; catching only HTTPError let the other two abort the
            # run before anything was saved.
            print(f"Failed to scrape {website['name']}: {e}")

    with open('scraped_news.json', 'w', encoding='utf-8') as f:
        json.dump(all_news, f, ensure_ascii=False, indent=4)

    print(f"Total articles collected: {len(all_news)}")
    return all_news

# Entry point: ask the user for a topic (the prompt is Indonesian for
# "Enter topic") and crawl every configured website for it.
if __name__ == "__main__":
    topik = input("Masukkan Topik: ")
    crawlerWithTopik(topik)