In [24]:
import requests
from bs4 import BeautifulSoup
import re
import threading
from datetime import datetime
import pandas as pd
import time
from datetime import datetime, timedelta
from concurrent.futures import ThreadPoolExecutor
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.common.desired_capabilities import DesiredCapabilities
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import concurrent.futures
from selenium.common.exceptions import StaleElementReferenceException


In [34]:
data = {
    "keywords": "jokowi",
    "since_time": "2023-10-16",
    "until_time": "2023-10-19"
}

In [35]:
def scrape_pagination(keywords, since_time, until_time):
    driver = webdriver.Chrome()
    current_date = datetime.strptime(since_time, "%Y-%m-%d")
    until_date = datetime.strptime(until_time, "%Y-%m-%d")
    result_data = []  # Membuat list kosong untuk menyimpan data
    
    while current_date <= until_date:
        date_str = current_date.strftime('%Y/%m/%d')
        url = f"https://www.cnnindonesia.com/search?query={keywords}&date={date_str}"
        driver.get(url)
        
        try:
            total_search_element = WebDriverWait(driver, 300).until(
                EC.presence_of_element_located((By.ID, "total-search"))
            )

            total_search_text = total_search_element.text
            match = re.search(r'\d+', total_search_text)
            if match:
                numeric_value = int(match.group())
                hasil_divided = numeric_value / 10
                page_index = round(hasil_divided)
                result_data.append({
                        'keywords':keywords,
                        'tanggal_berita': date_str,
                        'jumlah_index': page_index
                 })
            else:
                print(f"No numeric value found in total_search_text for date {date_str}")
        except Exception as e:
            print(f"An error occurred for date {date_str}: {str(e)}")
            
        current_date += timedelta(days=1)

    driver.quit()
    return result_data 

In [36]:
data_tanggal = scrape_pagination(data["keywords"], data["since_time"], data["until_time"])
print(data_tanggal)

[{'keywords': 'jokowi', 'tanggal_berita': '2023/10/16', 'jumlah_index': 4}, {'keywords': 'jokowi', 'tanggal_berita': '2023/10/17', 'jumlah_index': 5}, {'keywords': 'jokowi', 'tanggal_berita': '2023/10/18', 'jumlah_index': 4}, {'keywords': 'jokowi', 'tanggal_berita': '2023/10/19', 'jumlah_index': 4}]


In [37]:
def scrape_links(page_number, keywords, date, link_list):
    driver = webdriver.Chrome()
    try:
        url = f"https://www.cnnindonesia.com/search?query={keywords}&date={date}&page={page_number}"
        driver.get(url)

        try:
            div_badan = WebDriverWait(driver, 300).until(
                EC.presence_of_element_located((By.CSS_SELECTOR, ".flex.flex-col.gap-5"))
            )

            if div_badan:
                page_links = []
                articles = div_badan.find_elements(By.CSS_SELECTOR, ".flex-grow")
                for article in articles:
                    # Wrap this element locating code in a try block
                    try:
                        link = WebDriverWait(article, 300).until(
                            EC.presence_of_element_located((By.TAG_NAME, 'a'))
                        )
                        href = link.get_attribute('href')
                        page_links.append(href)
                    except StaleElementReferenceException:
                        # Handle StaleElementReferenceException by re-locating the element
                        link = WebDriverWait(article, 300).until(
                            EC.presence_of_element_located((By.TAG_NAME, 'a'))
                        )
                        href = link.get_attribute('href')
                        page_links.append(href)
        except StaleElementReferenceException:
            # Handle StaleElementReferenceException by re-locating the div_badan element
            div_badan = WebDriverWait(driver, 300).until(
                EC.presence_of_element_located((By.CSS_SELECTOR, ".flex.flex-col.gap-5"))
            )
            page_links = []
            articles = div_badan.find_elements(By.CSS_SELECTOR, ".flex-grow")
            for article in articles:
                link = WebDriverWait(article, 60).until(
                    EC.presence_of_element_located((By.TAG_NAME, 'a'))
                )
                href = link.get_attribute('href')
                page_links.append(href)

        print(f"Scraped {len(link_list)} links from page {page_number} date {date}")
        link_list.extend(page_links)
    except Exception as e:
        print(f"Error while scraping page {page_number}: {str(e)}")


In [38]:
def scrape_all_links(data_tanggal):
    link_list = []  # Create an empty list to store all the links
    with ThreadPoolExecutor(max_workers=4) as executor:  # You can adjust the number of threads as needed
        for item in data_tanggal:
            keywords = item['keywords']
            date = item['tanggal_berita']
            max_pages = item['jumlah_index']
            for page_number in range(1, max_pages+1):  # Specify the range of pages you want to scrape
                executor.submit(scrape_links, page_number, keywords, date, link_list)
    
    return link_list

In [39]:
all_links = scrape_all_links(data_tanggal)
print(len(all_links))

Scraped 0 links from page 4 date 2023/10/16
Scraped 8 links from page 1 date 2023/10/16
Scraped 18 links from page 3 date 2023/10/16
Scraped 28 links from page 1 date 2023/10/17
Scraped 38 links from page 2 date 2023/10/16
Scraped 48 links from page 2 date 2023/10/17
Scraped 58 links from page 3 date 2023/10/17
Scraped 68 links from page 4 date 2023/10/17
Scraped 78 links from page 5 date 2023/10/17
Scraped 86 links from page 1 date 2023/10/18
Scraped 96 links from page 4 date 2023/10/18
Scraped 106 links from page 2 date 2023/10/18
Scraped 116 links from page 3 date 2023/10/18
Scraped 126 links from page 4 date 2023/10/19
Scraped 131 links from page 3 date 2023/10/19
Scraped 141 links from page 1 date 2023/10/19
Scraped 151 links from page 2 date 2023/10/19
161
