In [1]:
import requests
from bs4 import BeautifulSoup
import re
import threading
from datetime import datetime
import pandas as pd
import time
from datetime import datetime, timedelta
from concurrent.futures import ThreadPoolExecutor
import concurrent.futures

In [2]:
data = {
    "keywords": "anies",
    "since_time": "2023-09-01",
    "until_time": "2023-11-10"
}

In [3]:
def scrape_links(date, keywords,page_number):
    input_date = datetime.strptime(date, "%Y-%m-%d")
    formatted_date_string = input_date.strftime("%Y/%m/%d")
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/117.0.0.0 Safari/537.36'
    }
    url = f"https://www.cnbcindonesia.com/search?query={keywords}+&p={page_number}&kanal=&tipe=&date={formatted_date_string}"
    response = requests.get(url, headers=headers)
    soup = BeautifulSoup(response.text, 'html.parser')
    articles = soup.find_all('article')
        
    links = []
    for article in articles:
            link = article.find('a')['href']
            links.append(link)
        
    print(f"Scraped {len(links)} links from page {page_number}")
    return links

In [4]:
def scrape_link_per_day(date, keywords,max_threads=5):
    page_number = 1
    page_links = []

    with concurrent.futures.ThreadPoolExecutor(max_threads) as executor:
        futures = []

        while True:
            future = executor.submit(scrape_links, date,keywords, page_number)
            futures.append(future)
            page_number += 1

            # Break the loop if no more articles are found
            if not future.result():
                break

        for future in concurrent.futures.as_completed(futures):
            page_links.extend(future.result())

    return page_links

In [5]:
link=scrape_link_per_day(data["since_time"],data["keywords"])

Scraped 12 links from page 1
Scraped 3 links from page 2
Scraped 0 links from page 3


In [6]:
print(link)

['https://www.cnbcindonesia.com/market/20230901172550-17-468310/santer-jadi-wakil-anies-ini-sederet-aset-mewah-cak-imin', 'https://www.cnbcindonesia.com/news/20230901174320-4-468316/sby-kata-netizen-demokrat-kena-prank-musang-berbulu-domba', 'https://www.cnbcindonesia.com/news/20230901172304-4-468308/sby-marah-dan-kecewa-ke-anies-sekarang-saja-tak-jujur', 'https://www.cnbcindonesia.com/news/20230901164950-4-468289/sby-soal-heboh-duet-anies-cak-imin-ini-bukan-kiamat', 'https://www.cnbcindonesia.com/news/20230901160631-4-468274/demokrat-marah-besar-ke-anies-ini-bentuk-pengkhianatan', 'https://www.cnbcindonesia.com/news/20230901132405-8-468203/video-anies-dipasangkan-dengan-cak-imin-demokrat-kecewa', 'https://www.cnbcindonesia.com/news/20230901131023-4-468195/beredar-diduga-surat-tangan-anies-meminang-ahy-jadi-cawapres', 'https://www.cnbcindonesia.com/entrepreneur/20230901083818-25-468079/terungkap-harta-surya-paloh-bisnisnya-gak-cuma-media', 'https://www.cnbcindonesia.com/news/2023090111

In [7]:
def scrape_url(url,max_retries=2):
    retries = 0
    while retries < max_retries:
            try:
                headers = {
                    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/117.0.0.0 Safari/537.36'
                }
                response = requests.get(url, headers=headers)
                if response.status_code == 200:
                    soup = BeautifulSoup(response.text, 'html.parser')
                    
                    
                    # Judul Berita
                    title_elem = soup.find('h1')
                    if title_elem:
                        title_text = title_elem.text.strip()
                    else:
                        title_text = "Title not found" 
                    # tanggal berita
                    date_elem = soup.find('div', {"class": "date"})
                    if date_elem:
                        date_text = date_elem.text.strip()
                    else:
                        date_text = "Date not found"
                    #     # Content Berita
                    body_elem = soup.find('div', {"class": "read__content"})
                        
                    body_elem = soup.find('div', {"class": "detail_text"})
                    
                    if body_elem:
                        content_elem = body_elem.find_all('p')
                        content_text = ""
                        for p in content_elem:
                            content_text += p.text.strip() + "\n"
                        
                        if content_text.strip():
                            content_text=content_text
                            content_text = content_text.replace('\n', '').replace('\r', '').replace('\t', '')
                            content_text = ' '.join(content_text.split())
                        else:
                            content_text ="Content not found"
                    else:
                            content_text ="Content not found"

                    return{
                        'title': title_text,
                        'date': date_text,
                        'content':content_text,
                        'link' : url}
                elif response.status_code == 429:
                    print(f"Received a 429 error for {url}. Retrying in 5 seconds...")
                    time.sleep(5)
                else:
                    print(f"Failed to retrieve data from {url}: Status Code {response.status_code}")
            except requests.exceptions.RequestException as e:
                print(f"Error fetching URL '{url}': {e}")
            except Exception as e:
                print(f"Error processing URL '{url}': {e}")
            retries += 1
            if retries < max_retries:
                print(f"Retrying {url} (Attempt {retries}/{max_retries})")
                time.sleep(5)  # You can adjust the delay as needed
    return None         

In [8]:
def is_keyword_in_text(keyword, text):
    # Case-insensitive search for whole word match
    pattern = re.compile(r'\b{}\b'.format(re.escape(keyword)), flags=re.IGNORECASE)

    # Check for a match with or without parentheses and hyphens
    match_results = [pattern.search(match) for match in [text, text.replace('(', '').replace(')', ''), text.replace('-', ' ')]]
    
    # print("Keyword:", keyword)
    # print("Match Results:", match_results)

    return any(match_results)

for url in link:
    data_cnbc = scrape_url(url)
    print(data["keywords"])
    
    # Assuming data_cnbc is a dictionary with 'title', 'content', and 'link' keys
    title_contains_keyword = any(is_keyword_in_text(keyword, data_cnbc['title']) for keyword in data["keywords"])
    content_contains_keyword = any(is_keyword_in_text(keyword, data_cnbc['content']) for keyword in data["keywords"])

    if title_contains_keyword or content_contains_keyword:
        matched_keywords = [keyword for keyword in data["keywords"] if is_keyword_in_text(keyword, data_cnbc['title']) or is_keyword_in_text(keyword, data_cnbc['content'])]
        print("Data contains keywords:", data_cnbc)
        print("Matched Keywords:", matched_keywords)
    else:
        print("News does not contain the specified keywords and will not be inserted into the database. URL:", data_cnbc)


anies
Data contains keywords: {'title': 'Santer Jadi Wakil Anies, Ini Sederet Aset Mewah Cak Imin', 'date': '01 September 2023 19:50', 'content': 'Jakarta, CNBC Indonesia - Abdul Muhaimin Iskandar atau Cak Imin dikabarkan dipilih bakal calon presiden (capres) Anies Baswedan sebagai calon wakil presiden (cawapres). Kabar ini terbilang mengejutkan mengingat Anies selama ini lebih dikaitkan dengan nama Ketua umum Demokrat Agus Harimurti Yudhoyono (AHY).Mengutip situs laporan e-lhkpn periodik 2022, Wakil Ketua DPR itu memiliki total harta sebesar Rp 27,28 miliar. Besaran harta itu terdiri dari tanah dan bangunan senilai Rp 24,70 miliar.Antara lain mencakup tanah seluas 386 m2 di Jakarta Selatan, tanah dan bangunan seluas 723 m2 di Jakarta Selatan, tanah dan bangunan seluas 1.070 m2, tanah dan bangunan seluas 300 m2 di Jakarta Selatan, dan tanah seluas 595 m2 di Jakarta Selatan. Semuanya tercatat sebagai hasil sendiri.ADVERTISEMENTSCROLL TO RESUME CONTENTSelanjutnya, Cak Imin tercatat memil