In [1]:
import requests
from bs4 import BeautifulSoup
import re
import threading
from datetime import datetime
import pandas as pd
import time
from datetime import datetime, timedelta
from concurrent.futures import ThreadPoolExecutor
import concurrent.futures

In [2]:
data = {
    "keywords": "anies",
    "since_time": "2023-09-01",
    "until_time": "2023-11-10"
}

In [3]:
def scrape_links(date, keywords,page_number):
    input_date = datetime.strptime(date, "%Y-%m-%d")
    formatted_date_string = input_date.strftime("%Y/%m/%d")
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/117.0.0.0 Safari/537.36'
    }
    url = f"https://www.cnbcindonesia.com/search?query={keywords}+&p={page_number}&kanal=&tipe=&date={formatted_date_string}"
    response = requests.get(url, headers=headers)
    soup = BeautifulSoup(response.text, 'html.parser')
    articles = soup.find_all('article')
        
    links = []
    for article in articles:
            link = article.find('a')['href']
            links.append(link)
        
    print(f"Scraped {len(links)} links from page {page_number}")
    return links

In [4]:
def scrape_link_per_day(date, keywords,max_threads=5):
    page_number = 1
    page_links = []

    with concurrent.futures.ThreadPoolExecutor(max_threads) as executor:
        futures = []

        while True:
            future = executor.submit(scrape_links, date,keywords, page_number)
            futures.append(future)
            page_number += 1

            # Break the loop if no more articles are found
            if not future.result():
                break

        for future in concurrent.futures.as_completed(futures):
            page_links.extend(future.result())

    return page_links

In [5]:
link=scrape_link_per_day(data["since_time"],data["keywords"])

Scraped 12 links from page 1
Scraped 3 links from page 2
Scraped 0 links from page 3


In [6]:
print(len(link))

15


In [11]:
def scrape_url(url,max_retries=2):
    retries = 0
    while retries < max_retries:
            try:
                headers = {
                    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/117.0.0.0 Safari/537.36'
                }
                response = requests.get(url, headers=headers)
                if response.status_code == 200:
                    soup = BeautifulSoup(response.text, 'html.parser')
                    
                    
                    # Judul Berita
                    title_elem = soup.find('h1')
                    if title_elem:
                        title_text = title_elem.text.strip()
                    else:
                        title_text = "Title not found" 
                    # tanggal berita
                    date_elem = soup.find('div', {"class": "date"})
                    if date_elem:
                        date_text = date_elem.text.strip()
                        datetime_object = datetime.strptime(date_text, '%d %B %Y %H:%M')
                        formatted_date = datetime_object.strftime('%Y/%m/%d')
                    else:
                        date_text = "Date not found"
                    #     # Content Berita
                    body_elem = soup.find('div', {"class": "read__content"})
                        
                    body_elem = soup.find('div', {"class": "detail_text"})
                    
                    if body_elem:
                        content_elem = body_elem.find_all('p')
                        content_text = ""
                        for p in content_elem:
                            content_text += p.text.strip() + "\n"
                        
                        if content_text.strip():
                            content_text=content_text
                            content_text = content_text.replace('\n', '').replace('\r', '').replace('\t', '')
                            content_text = ' '.join(content_text.split())
                            content_text= content_text.replace("ADVERTISEMENTSCROLL TO RESUME CONTENT","")
                        else:
                            content_text ="Content not found"
                    else:
                            content_text ="Content not found"

                    return{
                        'title': title_text,
                        'date': formatted_date,
                        'content':content_text,
                        'link' : url}
                elif response.status_code == 429:
                    print(f"Received a 429 error for {url}. Retrying in 5 seconds...")
                    time.sleep(5)
                else:
                    print(f"Failed to retrieve data from {url}: Status Code {response.status_code}")
            except requests.exceptions.RequestException as e:
                print(f"Error fetching URL '{url}': {e}")
            except Exception as e:
                print(f"Error processing URL '{url}': {e}")
            retries += 1
            if retries < max_retries:
                print(f"Retrying {url} (Attempt {retries}/{max_retries})")
                time.sleep(5)  # You can adjust the delay as needed
    return None         

In [12]:
url='https://www.cnbcindonesia.com/news/20231120085647-4-490296/jokowi-ri-akan-terus-mendukung-perjuangan-bangsa-palestina'
cek=scrape_url(url)

In [13]:
print(cek)

{'title': 'Jokowi: RI akan Terus Mendukung Perjuangan Bangsa Palestina!', 'date': '2023/11/20', 'content': 'Jakarta, CNBC Indonesia - Presiden Republik Indonesia Joko Widodo menegaskan sikap Indonesia yang akan terus bersama mendukung perjuangan bangsa Palestina. Penegasan itu disampaikan Jokowi saat melepas bantuan kemanusiaan untuk Palestina tahap kedua di Pangkalan Tentara Nasional Indonesia Angkatan Udara Halim Perdanakusuma, Jakarta Timur, Senin (20/11/2023).Dalam sambutannya, mantan gubernur DKI Jakarta itu mengatakan, selain bantuan kemanusiaan, Indonesia juga akan terus memberikan dukungan politik bagi Palestina.Sebagai salah satu utusan khusus OKI, Menteri Luar Negeri RI Retno Marsudi juga sedang berada di beberapa negara untuk menggalang dukungan agar kekejaman di Gaza segera dihentikan, dilakukan sesegera mungkin gencatan senjata dan bantuan kemanusiaan bisa masuk dengan baik untuk membantu saudara-saudara kita di Gaza."Sekali lagi saya tegaskan, Indonesia akan terus bersama

In [10]:
# for url in link:
#     data_cnbc = scrape_url(url)
#     print(data["keywords"])
#     keywords=data["keywords"]
#     if keywords.lower() in data_cnbc['title'].lower() or keywords.lower() in data_cnbc['content'].lower():
#         print("Data contains keywords:", data_cnbc)
#     else:
#         print("News does not contain the specified keywords and will not be inserted into the database. URL:", data_cnbc['link'])