In [1]:
import requests
from bs4 import BeautifulSoup
import re
import threading
from datetime import datetime
import pandas as pd
import time
from datetime import datetime, timedelta
from concurrent.futures import ThreadPoolExecutor
import concurrent.futures

In [5]:
def scrape_links(date,page_number):
    format=datetime.strptime(date, "%Y-%m-%d")
    formatted_date_string = format.strftime("%Y-%m-%d")
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/117.0.0.0 Safari/537.36'
    }
    url = f"https://surabaya.inews.id/indeks/all/{formatted_date_string}/{page_number}"
    response = requests.get(url, headers=headers)
    soup = BeautifulSoup(response.text, 'html.parser')
    articles = soup.find_all('div', {"class": "box-list-news"})
    links = []
    for article in articles:
        link = article.find('a')['href']
        links.append(link)
    print(f"Scraped {len(links)} links from page {page_number} url {url}")
    
    return links

In [6]:
def scrape_link_per_day(date, max_threads=5):
    page_number = 0
    page_links = []

    with concurrent.futures.ThreadPoolExecutor(max_threads) as executor:
        futures = []

        while True:
            future = executor.submit(scrape_links, date, page_number)
            futures.append(future)
            page_number += 12

            # Break the loop if no more articles are found
            if not future.result():
                break

        for future in concurrent.futures.as_completed(futures):
            page_links.extend(future.result())

    return page_links

In [7]:
link=scrape_link_per_day('2023-11-01')

Scraped 12 links from page 0 url https://surabaya.inews.id/indeks/all/2023-11-01/0
Scraped 10 links from page 12 url https://surabaya.inews.id/indeks/all/2023-11-01/12
Scraped 0 links from page 24 url https://surabaya.inews.id/indeks/all/2023-11-01/24


In [8]:
def scrape_url(url,max_retries=2):
    retries = 0
    while retries < max_retries:
            try:
                headers = {
                    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/117.0.0.0 Safari/537.36'
                }
                response = requests.get(url, headers=headers)
                if response.status_code == 200:
                    soup = BeautifulSoup(response.text, 'html.parser')
                    
                    
                    # Judul Berita
                    title_elem = soup.find('div', {"class": "title"})
                    if title_elem:
                        title_text=title_elem.find('h1')
                        title_text = title_text.text.strip()
                    else:
                        title_text = "Title not found"  
                    # tanggal berita
                    date_elem = soup.find('div', {"class": "date"})
                    # print(date_elem)
                    if date_elem:
                        date_text= date_elem.text.strip()
                        date_text= date_text.replace('\n', '').replace('\r', '').replace('\t', '')
                        date_text=' '.join(date_text.split())
                        match = re.search(r'\b(\d{2} \w+ \d{4})', date_text)
                        if match:
                            extracted_date_str = match.group(1)
                        date_object = datetime.strptime(extracted_date_str, '%d %B %Y')
                        formatted_date = date_object.strftime('%Y-%m-%d')
                        # date_text = date_text
                    else:
                        date_text = "Date not found"
                    #     # Content Berita
                    body_elem = soup.find('div', {"class": "caption"})
                        
                    if body_elem:
                        content_elem = body_elem.find_all('p')
                        content_text = ""
                        for p in content_elem:
                                content_text += p.text.strip() + "\n"
                            
                        if content_text.strip():
                            content_text=content_text
                            content_text = content_text.replace('\n', '').replace('\r', '').replace('\t', '')
                            content_text = ' '.join(content_text.split())
                        else:
                            content_text="Content not found"
                    else:
                            content_text="Content not found"

                    return{
                        'title': title_text,
                        'date': formatted_date,
                        'content':content_text,
                        'link' : url}
                elif response.status_code == 429:
                    print(f"Received a 429 error for {url}. Retrying in 5 seconds...")
                    time.sleep(5)
                else:
                    print(f"Failed to retrieve data from {url}: Status Code {response.status_code}")
            except requests.exceptions.RequestException as e:
                print(f"Error fetching URL '{url}': {e}")
            except Exception as e:
                print(f"Error processing URL '{url}': {e}")
            retries += 1
            if retries < max_retries:
                print(f"Retrying {url} (Attempt {retries}/{max_retries})")
                time.sleep(5)  # You can adj|ust the delay as needed
    return None         

In [10]:
url='https://surabaya.inews.id/read/365197/bpjamsostek-surabaya-karimunjawa-serahkan-jkm-kepada-ahli-waris-pegawai-non-asn-ppls'
data_inews = scrape_url(url)

In [11]:
print(data_inews)

{'title': 'BPJamsostek Surabaya Karimunjawa Serahkan JKM Kepada Ahli Waris Pegawai Non ASN PPLS', 'date': '2023-11-01', 'content': 'SURABAYA, iNews.id – BPJS Ketenagakerjaan (BPJamsostek) Surabaya Karimunjawa menyerahkan santunan klaim Jaminan Kematian (JKM) kepada ahli waris pegawai Non ASN Pusat Pengendalian Lumpur Sidoarjo (PPLS). Simbolis penyerahan dilakukan bersamaan dengan kegiatan Sosialisasi Manfaat Program BPJS Ketenagakerjaan, di Kantor BA Porong PPLS, Rabu (01/11/2023).Selain penyerahan klaim JKM, pada momen ini BPJamsostek Surabaya Karimunjawa juga menyerahkan simbolis kartu peserta BPJS Ketenagakerjaan kepada pegawai Swakelola Pusat Pengendalian Lumpur Sidoarjo.Hadir dalam kegiatan ini Kepala Cabang BPJS Ketenagakerjaan Surabaya Karimunjawa, Adventus Edison Souhuwat, Kepala Bagian Tata Usaha PPLS, Hikmad Batara Reza dan seluruh pekerja Swakelola PPLS.Sonny panggilan akrab Adventus Edison Souhuwat menyampaikan duka cita atas meninggalnya salah satu pegawai Non ASN PPLS“Sem