Scrapping tribunnews Jakarta

In [21]:
import requests
from bs4 import BeautifulSoup
import re
import threading
from datetime import datetime
import pandas as pd
import time
from datetime import datetime, timedelta
from concurrent.futures import ThreadPoolExecutor
import concurrent.futures
from urllib.parse import urlparse, parse_qs
from datetime import datetime
from dateutil import parser

In [22]:
def scrape_links(date, page_number):
    current_date = datetime.strptime(date, "%Y-%m-%d").strftime("%Y-%-m-%-d")
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/117.0.0.0 Safari/537.36'
    }

    url = f"https://jakarta.tribunnews.com/index-news?date={current_date}&page={page_number}"
    response = requests.get(url, headers=headers)
    soup = BeautifulSoup(response.text, 'html.parser')
    body = soup.find('ul', {"class": "lsi"})
    articles = body.find_all('li', {"class": "ptb15"})
    
    links = []
    for article in articles:
        header = article.find('h3', {"class": "f16 fbo"})
        if header:
            link = header.find('a')
            if link and 'href' in link.attrs:
                link_href = link['href']
                links.append(link_href)
                
    print(f"Scraped {len(links)} links from page {page_number} url {url}")

    return links


In [23]:
def scrape_link_per_day(date, max_threads=5):
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/117.0.0.0 Safari/537.36'
    }
    current_date = datetime.strptime(date, "%Y-%m-%d").strftime("%Y-%-m-%-d")
    url = f"https://jakarta.tribunnews.com/index-news?date={current_date}&page="
    response = requests.get(url + "1", headers=headers)
    soup = BeautifulSoup(response.text, 'html.parser')
    cek = soup.find('div', {"class": "paging"})
    links = cek.find_all('a')
    all_link = []
    for link in links:
        if link and 'href' in link.attrs:
            link_href = link['href']
            all_link.append(link_href)
    last_link = all_link[-1]

    # Extract the "page" parameter value from the last link
    parsed_url = urlparse(last_link)
    page_value = parse_qs(parsed_url.query).get('page', [])[0] if 'page' in parse_qs(parsed_url.query) else None
    print(page_value)
    # Check if page_value is an integer
    try:
        page_number = int(page_value)
    except (TypeError, ValueError):
        page_number = 1

    page_links = []

    with concurrent.futures.ThreadPoolExecutor(max_threads) as executor:
        # Use list comprehension to submit tasks to the thread pool
        futures = [executor.submit(scrape_links, current_date, index) for index in range(1, page_number + 1)]

        # Collect results from completed futures
        for future in concurrent.futures.as_completed(futures):
            page_links.extend(future.result())

    return page_links

In [24]:
link=scrape_link_per_day("2023-11-01")

5
Scraped 20 links from page 2 url https://jakarta.tribunnews.com/index-news?date=2023-11-1&page=2
Scraped 20 links from page 3 url https://jakarta.tribunnews.com/index-news?date=2023-11-1&page=3
Scraped 20 links from page 1 url https://jakarta.tribunnews.com/index-news?date=2023-11-1&page=1
Scraped 20 links from page 4 url https://jakarta.tribunnews.com/index-news?date=2023-11-1&page=4
Scraped 3 links from page 5 url https://jakarta.tribunnews.com/index-news?date=2023-11-1&page=5


In [25]:
def scrape_url(url,max_retries=2):
    retries = 0
    while retries < max_retries:
            try:
                headers = {
                    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/117.0.0.0 Safari/537.36'
                }
                response = requests.get(url, headers=headers)
                if response.status_code == 200:
                    soup = BeautifulSoup(response.text, 'html.parser')
                    
                    
                    # Judul Berita
                    title_elem = soup.find('h1', {"id": "arttitle"})
                    if title_elem:
                        title_text = title_elem.text.strip()
                    else:
                        title_text = "Title not found"   
                    # tanggal berita
                    date_elem = soup.find('div', {"class": "grey bdr3 pb10 pt10"})
                    if date_elem:
                        # date_text = date_elem.text.strip()
                        date_text=date_elem.find('time')
                        date_text = date_elem.text.strip()
                        date_part = ' '.join(date_text.split(',')[1:]).strip()
                        date_object = parser.parse(date_part)
                        formatted_date = date_object.strftime('%Y-%m-%d')
                    else:
                        date_text = "Date not found"
                    #     # Content Berita
                    body_elem = soup.find('div', {"class": "side-article txt-article multi-fontsize editcontent"})
                    
                    if body_elem:
                        content_elem = body_elem.find_all('p')
                        content_text = ""
                        for p in content_elem:
                            content_text += p.text.strip() + "\n"
                        
                        if content_text.strip():
                            content_text=content_text
                            content_text = content_text.replace('\n', '').replace('\r', '').replace('\t', '')
                            content_text = ' '.join(content_text.split())
                        else:
                            content_text ="Content not found"
                    else:
                        content_text ="Content not found"

                    return{
                        'title': title_text,
                        'date': formatted_date,
                        'content':content_text,
                        'link' : url}
                elif response.status_code == 429:
                    print(f"Received a 429 error for {url}. Retrying in 5 seconds...")
                    time.sleep(5)
                else:
                    print(f"Failed to retrieve data from {url}: Status Code {response.status_code}")
            except requests.exceptions.RequestException as e:
                print(f"Error fetching URL '{url}': {e}")
            except Exception as e:
                print(f"Error processing URL '{url}': {e}")
            retries += 1
            if retries < max_retries:
                print(f"Retrying {url} (Attempt {retries}/{max_retries})")
                time.sleep(5)  # You can adjust the delay as needed
    return None         

In [26]:
cek=scrape_url('https://jakarta.tribunnews.com/2023/11/01/kronologi-mertua-bunuh-menantu-hamil-7-bulan-suami-korban-syok-pulang-interview-buka-pintu-kamar')

In [27]:
print(cek)

{'title': 'Kronologi Mertua Bunuh Menantu Hamil 7 Bulan, Suami Korban Syok Pulang Interview Buka Pintu Kamar', 'date': '2023-11-01', 'content': 'TRIBUNJAKARTA.COM - Seorang suami bernama Sueb syok saat melihat istrinya Fitria Almuniroh Hafidloh Diana (23) tewas bersimbah darah di dalam kamarnya.Fitria yang tengah hamil 7 bulan ternyata dibunuh oleh mertuanya sendiri Khoiri alias Satir (53).Wakapolres Pasuruan Kompol Hari Aziz membeberkan kronologi pembunuhan sadis tersebut.Hari awalnya menjelaskan Sueb, Khoiri, dan Fitria, tinggal serumah di Dusun Blimbing, Desa Parerejo, Kecamatan Purwodadi, Kabupaten Pasuruan, Jawa Timur.Khoiri sendiri merupakan duda, istrinya meninggal dunia beberapa waktu yang lalu.Saat peristiwa pembunuhan terjadi Khoiri, dan Fitria hanya berdua di rumah, sementara Sueb sedang interview kerja."Mereka itu tinggal satu rumah, lalu saat putranya mencari pekerjaan atau interview," ucap Hari dikutip TribunJakarta dari YouTube Tv One, pada Rabu (1/11/2023)."Korban di ru