In [1]:
import requests
from bs4 import BeautifulSoup
import re
import threading
from datetime import datetime
import pandas as pd
import time
from datetime import datetime, timedelta
from concurrent.futures import ThreadPoolExecutor
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.common.desired_capabilities import DesiredCapabilities
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import concurrent.futures
from selenium.common.exceptions import StaleElementReferenceException


In [2]:
data = {
    "keywords": "jokowi",
    "since_time": "2023-10-23",
    "until_time": "2023-10-25"
}

In [3]:
def scrape_pagination(keywords, since_time, until_time):
    current_date = datetime.strptime(since_time, "%Y-%m-%d")
    until_date = datetime.strptime(until_time, "%Y-%m-%d")
    result_data = []  # Membuat list kosong untuk menyimpan data
    
    while current_date <= until_date:
        formatted_date = current_date.strftime("%d-%m-%Y")
        headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/117.0.0.0 Safari/537.36'
        }
        url = f"https://www.antaranews.com/search?q={keywords}&startDate={formatted_date}&endDate={formatted_date}&page=1"
        response = requests.get(url, headers=headers)
        soup = BeautifulSoup(response.text, "html.parser")
        cek = soup.find('h2')
        
        if cek:
            hasil_text = cek.get_text()
            result = re.search(r'\d+', hasil_text)
            hasil_number = int(result.group(0))
            hasil_divided = hasil_number / 15
            page_index = round(hasil_divided)
            result_data.append({
                'keywords':keywords,
                'tanggal_berita': formatted_date,
                'jumlah_index': page_index
            })
        else:
            print(f"No data found for {formatted_date}")
        current_date += timedelta(days=1)
    
    return result_data 

In [4]:
data_tanggal = scrape_pagination(data["keywords"], data["since_time"], data["until_time"])
print(data_tanggal)

[{'keywords': 'jokowi', 'tanggal_berita': '23-10-2023', 'jumlah_index': 2}, {'keywords': 'jokowi', 'tanggal_berita': '24-10-2023', 'jumlah_index': 3}, {'keywords': 'jokowi', 'tanggal_berita': '25-10-2023', 'jumlah_index': 4}]


In [5]:
def scrape_links(page_number, keywords, date, link_list):
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/117.0.0.0 Safari/537.36'
    }
    url = f"https://www.antaranews.com/search?q={keywords}&startDate={date}&endDate={date}&page={page_number}"
    response = requests.get(url, headers=headers)
    soup = BeautifulSoup(response.text, "html.parser")
    articles = soup.find_all('article',{"class": "simple-post simple-big clearfix"})

    page_links = []
    for article in articles:
        link = article.find('a')['href']
        page_links.append(link)

    print(f"Scraped {len(page_links)} links from page {page_number} cek tanggal {date}")
    
    # Extend the link_list with the links scraped on this page
    link_list.extend(page_links)

In [6]:
def scrape_all_links(data_tanggal):
    link_list = []  # Create an empty list to store all the links
    with ThreadPoolExecutor(max_workers=4) as executor:  # You can adjust the number of threads as needed
        for item in data_tanggal:
            keywords = item['keywords']
            date = item['tanggal_berita']
            max_pages = item['jumlah_index']
            for page_number in range(1, max_pages+1):  # Specify the range of pages you want to scrape
                executor.submit(scrape_links, page_number, keywords, date, link_list)
    
    return link_list

In [7]:
all_links = scrape_all_links(data_tanggal)
print(len(all_links))

Scraped 15 links from page 1 cek tanggal 23-10-2023
Scraped 15 links from page 1 cek tanggal 24-10-2023
Scraped 15 links from page 2 cek tanggal 23-10-2023
Scraped 15 links from page 2 cek tanggal 24-10-2023
Scraped 15 links from page 1 cek tanggal 25-10-2023
Scraped 15 links from page 3 cek tanggal 24-10-2023
Scraped 15 links from page 3 cek tanggal 25-10-2023
Scraped 15 links from page 2 cek tanggal 25-10-2023
Scraped 15 links from page 4 cek tanggal 25-10-2023
135


In [8]:
print(all_links)

['https://www.antaranews.com/berita/3788454/pm-australia-akan-berkunjung-ke-china-pekan-depan', 'https://www.antaranews.com/berita/3788430/kementan-gandeng-asosiasi-kejar-target-produksi-35-juta-ton-beras', 'https://www.antaranews.com/berita/3788361/kapolresta-mamuju-ingatkan-pentingnya-netralitas-polisi-saat-pemilu', 'https://www.antaranews.com/berita/3788346/pertemuan-dengan-pers-disiapkan-untuk-finalisasi-publisher-rights', 'https://www.antaranews.com/berita/3788202/kejagung-kinerja-aparatur-kejaksaan-harus-dirasakan-masyarakat', 'https://www.antaranews.com/berita/3788058/sri-mulyani-waspadai-dampak-kebijakan-as-ke-arus-modal-asing-di-ri', 'https://www.antaranews.com/berita/3787968/hoaks-anies-baswedan-tidak-lolos-tes-kesehatan-pada-23-oktober', 'https://www.antaranews.com/berita/3787938/pnm-gelar-studi-banding-nasabah-tingkatkan-kapasitas-umkm', 'https://www.antaranews.com/berita/3787911/prabowo-saya-dinasti-merah-putih-cinta-tanah-air', 'https://www.antaranews.com/berita/3787896/k

In [9]:
def scrape_url(url, keywords,max_retries=3):
    retries = 0
    while retries < max_retries:
            try:
                headers = {
                    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/117.0.0.0 Safari/537.36'
                }
                response = requests.get(url, headers=headers)
                if response.status_code == 200:
                    soup = BeautifulSoup(response.text, 'html.parser')

                    # Extract data from the web page
                    title_elem = soup.find('h1',{"class": "post-title"})
                    title_text = title_elem.text.strip() if title_elem else "Title not found"

                    date_elem = soup.find('span', {"class": "article-date"})
                    date_text = date_elem.text.strip() if date_elem else "Date not found"

                    body_elem = soup.find('div', {"class": "post-content clearfix"})
                    if body_elem:
                        content_text = body_elem.text
                        content_text = content_text.replace('\n', '').replace('\r', '').replace('\t', '')
                        content_text = ' '.join(content_text.split())
                    else:
                        content_text ="Content not found"

                    nama_berita_match = re.search(r'https://www\.(\w+)\.com/', url)
                    nama_berita = nama_berita_match.group(1) if nama_berita_match else "Nama_berita not found"

                    return {
                        'title': title_text,
                        'keywords': keywords,
                        'date': date_text,
                        'content': content_text,
                        'nama_berita': nama_berita,
                        'link': url
                    }
                else:
                    print(f"Failed to retrieve data from {url}: Status Code {response.status_code}")
            except requests.exceptions.RequestException as e:
                print(f"Error fetching URL '{url}': {e}")
            except Exception as e:
                print(f"Error processing URL '{url}': {e}")
            retries += 1
            if retries < max_retries:
                print(f"Retrying {url} (Attempt {retries}/{max_retries})")
                time.sleep(5)  # You can adjust the delay as needed
    return None         

In [10]:
# Create a list to store the results
results = []

# Define a function to scrape URLs and store results
def scrape_and_store_result(url):
    result = scrape_url(url, data['keywords'])
    if result:
        results.append(result)

# Create a ThreadPoolExecutor to run the scraping function concurrently
max_threads = 5  # You can adjust the number of threads as needed
with concurrent.futures.ThreadPoolExecutor(max_threads) as executor:
    executor.map(scrape_and_store_result, all_links)

# Print or process the results as needed
for result in results:
    print(result)

{'title': 'PM Australia akan berkunjung ke China pekan depan', 'keywords': 'jokowi', 'date': 'Senin, 23 Oktober 2023 21:28 WIB', 'content': 'Istanbul (ANTARA) - Perdana Menteri Australia Anthony Albanese akan mengunjungi China pada awal November 2023 untuk bertemu dengan Presiden China Xi Jinping dan timpalannya Li Qiang.Selama kunjungan Albanese ke China pada 4-7 November, para pemimpin kedua negara tersebut akan membahas kerja sama pada bidang ekonomi, perubahan iklim, dan hubungan antar masyarakat, kata Kantor Perdana Menteri Australia dalam sebuah pernyataan.Sementara itu, Kementerian Luar Negeri China mengonfirmasi kunjungan Albanese pada Senin.“Atas undangan Perdana Menteri China Li Qiang, Albanese akan menghadiri Pameran Impor Internasional China ke-6 pekan depan,” kata juru bicara Kementerian Luar Negeri Mao Ning.“Sebuah hubungan bilateral yang sehat dan stabil dibutuhkan untuk kepentingan mendasar kedua negara dan masyarakatnya, yang juga dapat membantu menjaga perdamaian dan 

In [11]:
results

[{'title': 'PM Australia akan berkunjung ke China pekan depan',
  'keywords': 'jokowi',
  'date': 'Senin, 23 Oktober 2023 21:28 WIB',
  'content': 'Istanbul (ANTARA) - Perdana Menteri Australia Anthony Albanese akan mengunjungi China pada awal November 2023 untuk bertemu dengan Presiden China Xi Jinping dan timpalannya Li Qiang.Selama kunjungan Albanese ke China pada 4-7 November, para pemimpin kedua negara tersebut akan membahas kerja sama pada bidang ekonomi, perubahan iklim, dan hubungan antar masyarakat, kata Kantor Perdana Menteri Australia dalam sebuah pernyataan.Sementara itu, Kementerian Luar Negeri China mengonfirmasi kunjungan Albanese pada Senin.“Atas undangan Perdana Menteri China Li Qiang, Albanese akan menghadiri Pameran Impor Internasional China ke-6 pekan depan,” kata juru bicara Kementerian Luar Negeri Mao Ning.“Sebuah hubungan bilateral yang sehat dan stabil dibutuhkan untuk kepentingan mendasar kedua negara dan masyarakatnya, yang juga dapat membantu menjaga perdamai

In [12]:
df = pd.DataFrame(results)
print('hasil scrapping',len(results))
df

hasil scrapping 135


Unnamed: 0,title,keywords,date,content,nama_berita,link
0,PM Australia akan berkunjung ke China pekan depan,jokowi,"Senin, 23 Oktober 2023 21:28 WIB",Istanbul (ANTARA) - Perdana Menteri Australia ...,antaranews,https://www.antaranews.com/berita/3788454/pm-a...
1,Kejagung: Kinerja aparatur Kejaksaan harus dir...,jokowi,"Senin, 23 Oktober 2023 19:43 WIB",Yang kami lakukan bagaimana bekerja untuk masy...,antaranews,https://www.antaranews.com/berita/3788202/keja...
2,Pertemuan dengan pers disiapkan untuk finalisa...,jokowi,"Senin, 23 Oktober 2023 20:34 WIB",Jadi dalam pertemuan itu kami mau menyampaikan...,antaranews,https://www.antaranews.com/berita/3788346/pert...
3,Kapolresta Mamuju ingatkan pentingnya netralit...,jokowi,"Senin, 23 Oktober 2023 20:39 WIB","Mamuju (ANTARA) - Kepala Polresta Mamuju, Komi...",antaranews,https://www.antaranews.com/berita/3788361/kapo...
4,Kementan gandeng asosiasi kejar target produks...,jokowi,"Senin, 23 Oktober 2023 21:16 WIB",Jakarta (ANTARA) - Kementerian Pertanian mengg...,antaranews,https://www.antaranews.com/berita/3788430/keme...
...,...,...,...,...,...,...
130,Presiden Jokowi lantik Jenderal Agus Subiyanto...,jokowi,"Rabu, 25 Oktober 2023 09:53 WIB",Jakarta (ANTARA) - Presiden Joko Widodo (Jokow...,antaranews,https://www.antaranews.com/berita/3790893/pres...
131,Presiden tunjuk Letjen Agus Subiyanto sebagai ...,jokowi,"Rabu, 25 Oktober 2023 09:37 WIB","Ya, jadi hari ini pelantikan Letjen TNI Agus S...",antaranews,https://www.antaranews.com/berita/3790878/pres...
132,Pendukung Prabowo-Gibran mulai padati jalan de...,jokowi,"Rabu, 25 Oktober 2023 08:48 WIB",Jakarta (ANTARA) - Ratusan pendukung bakal pas...,antaranews,https://www.antaranews.com/berita/3790770/pend...
133,Presiden Jokowi lantik Menteri Pertanian Amran...,jokowi,"Rabu, 25 Oktober 2023 09:30 WIB",Presiden Joko Widodo (Jokowi) melantik Menteri...,antaranews,https://www.antaranews.com/berita/3790866/pres...
