In [1]:
import requests
from bs4 import BeautifulSoup
import re
import threading
from datetime import datetime
import pandas as pd
import time
from datetime import datetime, timedelta
from concurrent.futures import ThreadPoolExecutor
import concurrent.futures

In [2]:
def scrape_links(date,page_number):
    format=datetime.strptime(date, "%Y-%m-%d")
    formatted_date_string = format.strftime("%d-%m-%Y")
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/117.0.0.0 Safari/537.36'
    }
    url = f"https://jateng.inews.id/indeks/{formatted_date_string}/{page_number}"
    response = requests.get(url, headers=headers)
    soup = BeautifulSoup(response.text, 'html.parser')
    articles = soup.find_all('li', {"class": "padding-10px-all"})
    links = []
    for article in articles:
        link = article.find('a')['href']
        links.append(link)
    print(f"Scraped {len(links)} links from page {page_number} url {url}")
    
    return links

In [3]:
def scrape_link_per_day(date, max_threads=5):
    page_number = 1
    page_links = []

    with concurrent.futures.ThreadPoolExecutor(max_threads) as executor:
        futures = []

        while True:
            future = executor.submit(scrape_links, date, page_number)
            futures.append(future)
            page_number += 1

            # Break the loop if no more articles are found
            if not future.result():
                break

        for future in concurrent.futures.as_completed(futures):
            page_links.extend(future.result())

    return page_links

In [4]:
link=scrape_link_per_day('2023-11-01')

Scraped 15 links from page 1 url https://jateng.inews.id/indeks/01-11-2023/1
Scraped 12 links from page 2 url https://jateng.inews.id/indeks/01-11-2023/2
Scraped 0 links from page 3 url https://jateng.inews.id/indeks/01-11-2023/3


In [5]:
def scrape_url(url,max_retries=2):
    retries = 0
    while retries < max_retries:
            try:
                headers = {
                    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/117.0.0.0 Safari/537.36'
                }
                response = requests.get(url, headers=headers)
                if response.status_code == 200:
                    soup = BeautifulSoup(response.text, 'html.parser')
                    
                    
                    # Judul Berita
                    title_elem = soup.find('h1', {"style": "padding:0 0 10px; margin:0; font-weight:700; font-size:34px; color:#000;"})
                    if title_elem:
                        title_text = title_elem.text.strip()
                    else:
                        title_text = "Title not found"  
                    # tanggal berita
                    date_elem = soup.find('a', {"class": "author-profile"})
                    # print(date_elem)
                    if date_elem:
                        date_text= date_elem.text.strip()
                        date_text= date_text.replace('\n', '').replace('\r', '').replace('\t', '')
                        date_text=' '.join(date_text.split())
                        match = re.search(r'\b(\d{2} \w+ \d{4})', date_text)
                        if match:
                            extracted_date_str = match.group(1)
                        date_object = datetime.strptime(extracted_date_str, '%d %B %Y')
                        formatted_date = date_object.strftime('%Y-%m-%d')
                        # date_text = date_text
                    else:
                        date_text = "Date not found"
                    #     # Content Berita
                    body_elem = soup.find('div', {"itemprop": "articleBody"})
                        
                    if body_elem:
                        content_elem = body_elem.find_all('p')
                        content_text = ""
                        for p in content_elem:
                                content_text += p.text.strip() + "\n"
                            
                        if content_text.strip():
                            content_text=content_text
                            content_text = content_text.replace('\n', '').replace('\r', '').replace('\t', '')
                            content_text = ' '.join(content_text.split())
                        else:
                            content_text="Content not found"
                    else:
                            content_text="Content not found"

                    return{
                        'title': title_text,
                        'date': formatted_date,
                        'content':content_text,
                        'link' : url}
                elif response.status_code == 429:
                    print(f"Received a 429 error for {url}. Retrying in 5 seconds...")
                    time.sleep(5)
                else:
                    print(f"Failed to retrieve data from {url}: Status Code {response.status_code}")
            except requests.exceptions.RequestException as e:
                print(f"Error fetching URL '{url}': {e}")
            except Exception as e:
                print(f"Error processing URL '{url}': {e}")
            retries += 1
            if retries < max_retries:
                print(f"Retrying {url} (Attempt {retries}/{max_retries})")
                time.sleep(5)  # You can adjust the delay as needed
    return None         

In [6]:
url='https://jateng.inews.id/berita/4-prestasi-cemerlang-ganjar-pranowo-dalam-konservasi-lingkungan-di-jawa-tengah'
data_inews = scrape_url(url)

In [7]:
print(data_inews)

{'title': '4 Prestasi Cemerlang Ganjar Pranowo dalam Konservasi Lingkungan di Jawa Tengah', 'date': '2023-11-01', 'content': 'JAKARTA, iNews.id - Calon presiden, Ganjar Pranowo, menonjol sebagai sosok yang sangat peduli terhadap tantangan lingkungan di Indonesia. Di tengah eskalasi permasalahan lingkungan, kepemimpinan yang memperhatikan alam semakin berharga. Tidak hanya berbicara, Capres 2024 ini telah mencapai prestasi nyata di bidang konservasi lingkungan.Ketika masih menjabat sebagai Gubernur Jateng pada 2021, Ganjar Pranowo diakui oleh Kementerian Lingkungan Hidup dan Kehutanan dengan penghargaan Green Leadership Nirwasita Tantra, sebagai bentuk penghargaan atas dedikasinya dalam mengatasi isu-isu lingkungan.Prestasi ini bukan hasil kebetulan, melainkan akumulasi program konservasi lingkungan yang sukses yang telah diterapkan oleh Ganjar Pranowo selama masa jabatannya.Ini mencerminkan komitmennya yang mendalam terhadap pelestarian lingkungan serta usahanya yang nyata dalam memast