In [1]:
import requests
from bs4 import BeautifulSoup
import re
import threading
from datetime import datetime
import pandas as pd
import time

In [2]:
data = {
    "keywords": "jokowi",
    "since_time": "2023-10-01",
    "until_time": "2023-10-18"
}

In [3]:
def cek_totalPagination(keywords, since_time, until_time):
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/117.0.0.0 Safari/537.36'
    }
    since_time = datetime.strptime(since_time, "%Y-%m-%d").strftime("%d/%m/%Y")
    until_time = datetime.strptime(until_time, "%Y-%m-%d").strftime("%d/%m/%Y")
    url = f"https://www.detik.com/search/searchall?query={keywords}&siteid=2&sortby=time&fromdatex={since_time}&todatex={until_time}&page=1"
    
    response = requests.get(url,headers=headers) 
    soup = BeautifulSoup(response.text, "html.parser")
    cek = soup.find('span', {"class": "fl text"})
    hasil_text = cek.get_text()

    # Use regular expression to extract the number
    result = re.search(r'\d+', hasil_text)
    if result:
        hasil_number = int(result.group(0))
        hasil_divided = hasil_number / 9  # Divide the number by 9
        rounded_result = round(hasil_divided)  # Round to the nearest whole number
        return rounded_result
    else:
        return 0  # Return 0 if no number was found


In [4]:
jumlah_index = cek_totalPagination(data["keywords"], data["since_time"], data["until_time"])
print(f"Total results: {jumlah_index}")

Total results: 160


In [5]:
def scrape_links(page_number,keywords, since_time, until_time):
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/117.0.0.0 Safari/537.36'
    }
    since_time = datetime.strptime(since_time, "%Y-%m-%d").strftime("%d/%m/%Y")
    until_time = datetime.strptime(until_time, "%Y-%m-%d").strftime("%d/%m/%Y")
    url = f"https://www.detik.com/search/searchall?query={keywords}&siteid=2&sortby=time&fromdatex={since_time}&todatex={until_time}&page={page_number}"
    response = requests.get(url,headers=headers)
    soup = BeautifulSoup(response.text, "html.parser")
    articles = soup.find_all('article')
    
    page_links = []
    for article in articles:
        link = article.find('a')['href']
        page_links.append(link)
    
    print(f"Scraped {len(page_links)} links from page {page_number}")
    return page_links

In [6]:
# import concurrent.futures

# def scrape_links_with_threads(max_threads, jumlah_index, data):
#     links = []
#     with concurrent.futures.ThreadPoolExecutor(max_threads) as executor:
#         for page_number in range(1, jumlah_index + 1):
#             thread = executor.submit(scrape_links, page_number, data["keywords"], data["since_time"], data["until_time"])
#             links.extend(thread.result())

#     return links

# # Define the maximum number of concurrent threads you want to use
# max_threads = 8  # Adjust this number based on your system's capabilities

# # Call the function to scrape links
# scraped_links = scrape_links_with_threads(max_threads, jumlah_index, data)

# # Now you can use 'scraped_links' for further processing
# print("Total Links:", len(scraped_links))


In [7]:
def scrape_links_in_threads(data, jumlah_index):
    links = []
    threads_link = []
    
    def scrape_links_for_page(page_number):
        links.extend(scrape_links(page_number, data["keywords"], data["since_time"], data["until_time"]))

    for page_number in range(1, jumlah_index + 1):
        thread = threading.Thread(target=scrape_links_for_page, args=(page_number,))
        thread.start()
        threads_link.append(thread)

    for thread in threads_link:
        thread.join()
    
    print("Total Links:", len(links))
    return links

# Call the function to scrape links in multiple threads
scraped_links = scrape_links_in_threads(data, jumlah_index)

Scraped 9 links from page 1
Scraped 9 links from page 5
Scraped 9 links from page 4
Scraped 9 links from page 18
Scraped 9 links from page 17
Scraped 9 links from page 15
Scraped 9 links from page 12
Scraped 9 links from page 7
Scraped 9 links from page 8
Scraped 9 links from page 6
Scraped 9 links from page 2
Scraped 9 links from page 11
Scraped 9 links from page 19
Scraped 9 links from page 10
Scraped 9 links from page 3
Scraped 9 links from page 24
Scraped 9 links from page 27
Scraped 9 links from page 16
Scraped 9 links from page 38
Scraped 9 links from page 9
Scraped 9 links from page 35
Scraped 9 links from page 37
Scraped 9 links from page 57
Scraped 9 links from page 31
Scraped 9 links from page 25
Scraped 9 links from page 40
Scraped 9 links from page 13
Scraped 9 links from page 34
Scraped 9 links from page 14
Scraped 9 links from page 80
Scraped 9 links from page 22
Scraped 9 links from page 54
Scraped 9 links from page 45
Scraped 9 links from page 65
Scraped 9 links from pa

In [8]:
print(scraped_links)

['https://news.detik.com/pemilu/d-6988425/mahfud-saya-tak-pernah-kampanye-baru-kali-ini-nyatakan-bersedia-jadi-cawapres', 'https://oto.detik.com/berita/d-6988419/ehang-yang-dijajal-jokowi-dapat-sertifikat-taksi-terbang-otonom-pertama-di-dunia', 'https://finance.detik.com/berita-ekonomi-bisnis/d-6988402/jokowi-buka-trade-expo-indonesia-kemendag-bidik-transaksi-rp-172-t', 'https://www.detik.com/sumut/berita/d-6988398/ternyata-gibran-tak-diundang-ke-acara-pengumuman-cawapres-ganjar', 'https://news.detik.com/detiktv/d-6988354/deklarasi-capres-cawapres-pdip-tanpa-kehadiran-jokowi', 'https://www.detik.com/jateng/berita/d-6988349/masih-ngantor-di-solo-gibran-tak-diundang-deklarasi-cawapres-ganjar', 'https://news.detik.com/berita/d-6988266/puan-tegaskan-pdip-tak-pecah-kongsi-dengan-jokowi-semua-baik-baik-saja', 'https://finance.detik.com/berita-ekonomi-bisnis/d-6988228/ri-dan-china-teken-10-nota-kesepahaman-berantas-kemiskinan-hingga-investasi', 'https://finance.detik.com/berita-ekonomi-bisnis

In [9]:
# import concurrent.futures

# # Define the maximum number of concurrent threads you want to use
# max_threads = 8  # Adjust this number based on your system's capabilities

# links = []
# with concurrent.futures.ThreadPoolExecutor(max_threads) as executor:
#     for page_number in range(1, jumlah_index + 1):
#         thread = executor.submit(scrape_links, page_number, data["keywords"], data["since_time"], data["until_time"])
#         links.extend(thread.result())

# # No need for manual thread joining in this case

# print("Total Links:", len(links))


In [10]:
# import concurrent.futures

# # Define the maximum number of concurrent threads you want to use
# max_threads = 8  # Adjust this number based on your system's capabilities

# links = []
# with concurrent.futures.ThreadPoolExecutor(max_threads) as executor:
#     for page_number in range(1, jumlah_index + 1):
#         thread = executor.submit(scrape_links, page_number, data["keywords"], data["since_time"], data["until_time"])
#         links.extend(thread.result())

# # No need for manual thread joining in this case

# print("Total Links:", len(links))


In [11]:
# links = []
# threads_link = []
# for page_number in range(1, jumlah_index + 1):
#     thread = threading.Thread(target=lambda p=page_number: links.extend(scrape_links(p,data["keywords"], data["since_time"], data["until_time"])))
#     thread.start()
#     threads_link.append(thread)

# for thread in threads_link:
#     thread.join()
# print("Total Links:", len(links))


In [12]:
# results = []

In [13]:
# def scrape_url(url,keywords):
#     try:
#         headers = {
#         'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/117.0.0.0 Safari/537.36'
#         }
#         response = requests.get(url,headers=headers)
#         if response.status_code == 200:
#             soup = BeautifulSoup(response.text, 'html.parser')
            
            
#             # Judul Berita
#             title_elem = soup.find('h1', {"class": "detail__title"})
#             if title_elem:
#                 title_text = title_elem.text.strip()
#             else:
#                 title_text = "Title not found"
#             # Author berita
#             author_elem = soup.find('div', {"class": "detail__author"})
#             if author_elem:
#                 author_text = author_elem.get_text()
#                 author_text = author_text.split('-')[0].strip()
#             else:
#                 author_text = "Author not found"     
#             # tanggal berita
#             date_elem = soup.find('div', {"class": "detail__date"})
#             if date_elem:
#                 date_text = date_elem.text.strip()
#             else:
#                 date_text = "Date not found"
#             #     # Category berita
#             category_elements = soup.find('div',{"class": "page__breadcrumb"})
#             if category_elements:
#                 category_text = category_elements.find('a',{"dtr-sec": "breadcrumbkanal"})
#                 category_text= category_text.text.strip()
#             else:
#                 category_text = "Category not found"
#             #     # Content Berita
#             body_elem = soup.find('div', {"class": "detail__body"})
            
#             if body_elem:
#                 content_elem = body_elem.find_all('p')
#                 content_text = ""
#                 for p in content_elem:
#                     content_text += p.text.strip() + "\n"
                
#                 if content_text.strip():
#                     content_text=content_text
#                 else:
#                     content_text ="Content not found"
#             else:
#               content_text ="Content not found"

#             nama_berita_match = re.search(r'https://(?:www\.)?([a-zA-Z0-9.-]+)\.com', url)
#             if nama_berita_match:
#                 nama_berita = nama_berita_match.group(1)
#             else:
#                 nama_berita = "Nama_berita not found"
#             results.append({'title': title_text,
#                             'keywords':keywords,
#                             'author' : author_text,
#                             'category':category_text,
#                             'date': date_text,
#                             'content' : content_text,
#                             'nama_berita' : nama_berita,
#                             'link' : url})
#             print(f"Done Get data  {url}")
#         else:
#             print(f"Failed to retrieve data from {url}")

#     except requests.exceptions.RequestException as e:
#         print(f"Error fetching URL '{url}': {e}")
#     except Exception as e:
#         print(f"Error processing URL '{url}': {e}")

In [14]:
def scrape_url(url, keywords,max_retries=3):
    retries = 0
    while retries < max_retries:
            try:
                headers = {
                    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/117.0.0.0 Safari/537.36'
                }
                response = requests.get(url, headers=headers)
                if response.status_code == 200:
                    soup = BeautifulSoup(response.text, 'html.parser')

                    # Extract data from the web page
                    title_elem = soup.find('h1', {"class": "detail__title"})
                    title_text = title_elem.text.strip() if title_elem else "Title not found"

                    author_elem = soup.find('div', {"class": "detail__author"})
                    author_text = author_elem.get_text().split('-')[0].strip() if author_elem else "Author not found"

                    date_elem = soup.find('div', {"class": "detail__date"})
                    date_text = date_elem.text.strip() if date_elem else "Date not found"

                    category_elements = soup.find('div', {"class": "page__breadcrumb"})
                    category_text = category_elements.find('a', {"dtr-sec": "breadcrumbkanal"}).text.strip() if category_elements else "Category not found"

                    body_elem = soup.find('div', {"class": "detail__body"})
                    if body_elem:
                        content_elem = body_elem.find_all('p')
                        content_text = "\n".join(p.text.strip() for p in content_elem)
                    else:
                        content_text = "Content not found"

                    nama_berita_match = re.search(r'https://(?:www\.)?([a-zA-Z0-9.-]+)\.com', url)
                    nama_berita = nama_berita_match.group(1) if nama_berita_match else "Nama_berita not found"

                    return {
                        'title': title_text,
                        'keywords': keywords,
                        'author': author_text,
                        'category': category_text,
                        'date': date_text,
                        'content': content_text,
                        'nama_berita': nama_berita,
                        'link': url
                    }
                else:
                    print(f"Failed to retrieve data from {url}: Status Code {response.status_code}")
            except requests.exceptions.RequestException as e:
                print(f"Error fetching URL '{url}': {e}")
            except Exception as e:
                print(f"Error processing URL '{url}': {e}")
            retries += 1
            if retries < max_retries:
                print(f"Retrying {url} (Attempt {retries}/{max_retries})")
                time.sleep(5)  # You can adjust the delay as needed
    return None         

In [15]:
# def scrape_url(url, keywords):
#     try:
#         headers = {
#             'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/117.0.0.0 Safari/537.36'
#         }
#         response = requests.get(url, headers=headers)
#         if response.status_code == 200:
#             soup = BeautifulSoup(response.text, 'html.parser')

#             # Extract data from the web page
#             title_elem = soup.find('h1', {"class": "detail__title"})
#             title_text = title_elem.text.strip() if title_elem else "Title not found"

#             author_elem = soup.find('div', {"class": "detail__author"})
#             author_text = author_elem.get_text().split('-')[0].strip() if author_elem else "Author not found"

#             date_elem = soup.find('div', {"class": "detail__date"})
#             date_text = date_elem.text.strip() if date_elem else "Date not found"

#             category_elements = soup.find('div', {"class": "page__breadcrumb"})
#             category_text = category_elements.find('a', {"dtr-sec": "breadcrumbkanal"}).text.strip() if category_elements else "Category not found"

#             body_elem = soup.find('div', {"class": "detail__body"})
#             if body_elem:
#                 content_elem = body_elem.find_all('p')
#                 content_text = "\n".join(p.text.strip() for p in content_elem)
#             else:
#                 content_text = "Content not found"

#             nama_berita_match = re.search(r'https://(?:www\.)?([a-zA-Z0-9.-]+)\.com', url)
#             nama_berita = nama_berita_match.group(1) if nama_berita_match else "Nama_berita not found"

#             return {
#                 'title': title_text,
#                 'keywords': keywords,
#                 'author': author_text,
#                 'category': category_text,
#                 'date': date_text,
#                 'content': content_text,
#                 'nama_berita': nama_berita,
#                 'link': url
#             }
#         else:
#             print(f"Failed to retrieve data from {url}: Status Code {response.status_code}")
#     except requests.exceptions.RequestException as e:
#         print(f"Error fetching URL '{url}': {e}")
#     except Exception as e:
#         print(f"Error processing URL '{url}': {e}")

#     # Return None in case of an error or if data extraction fails
#     return None


In [16]:
import os

num_cores = os.cpu_count()

if num_cores is not None:
    print(f"Number of CPU cores available: {num_cores}")
else:
    print("Number of CPU cores could not be determined.")

Number of CPU cores available: 8


In [17]:
def scrape_urls_in_threads(scraped_links, data):
    results = []
    threads = []

    def scrape_url_and_store_result(url):
        result = scrape_url(url, data["keywords"])
        if result:
            results.append(result)

    for url in scraped_links:
        thread = threading.Thread(target=scrape_url_and_store_result, args=(url,))
        thread.start()
        threads.append(thread)

    for thread in threads:
        thread.join()

    return results

# Call the function to scrape URLs in multiple threads and store the results
results = scrape_urls_in_threads(scraped_links, data)

In [18]:
# import concurrent.futures

# def scrape_and_store_content_in_threads(scraped_links, max_threads, data):
#     results = []
#     with concurrent.futures.ThreadPoolExecutor(max_threads) as executor:
#         future_to_url = {executor.submit(scrape_url, url, data["keywords"]): url for url in scraped_links}
#         for future in concurrent.futures.as_completed(future_to_url):
#             url = future_to_url[future]
#             try:
#                 result = future.result()
#                 if result:
#                     results.append(result)
#                 else:
#                     print(f"Failed to scrape data from {url}")
#             except Exception as e:
#                 print(f"Error processing URL '{url}': {e}")

#     return results

# # Define the maximum number of concurrent threads you want to use
# max_threads = 8  # Adjust this number based on your system's capabilities

# # Call the function to scrape and store content in threads
# results = scrape_and_store_content_in_threads(scraped_links, max_threads, data)

# # Continue with any further processing after threads have completed


In [19]:
df = pd.DataFrame(results)
print('hasil scrapping',len(results))
df.head(10)

hasil scrapping 1440


Unnamed: 0,title,keywords,author,category,date,content,nama_berita,link
0,"Mahfud: Saya Tak Pernah Kampanye, Baru Kali In...",jokowi,Matius Hutajulu,Pemilu,"Rabu, 18 Okt 2023 11:25 WIB",Mahfud Md menjelaskan sebelum ditetapkan sebag...,news.detik,https://news.detik.com/pemilu/d-6988425/mahfud...
1,Momen Jokowi Ngobrol dan Foto Bareng Putin di ...,jokowi,Marlinda Erwanti,Berita,"Selasa, 17 Okt 2023 19:04 WIB",Presiden Joko Widodo (Jokowi) bertemu dengan P...,news.detik,https://news.detik.com/berita/d-6987543/momen-...
2,Kelompok Petani di Jateng Dukung Prabowo Capre...,jokowi,Tim detikcom,Pemilu,"Selasa, 17 Okt 2023 22:06 WIB",Kelompok petani di Jawa Tengah (Jateng) mendek...,news.detik,https://news.detik.com/pemilu/d-6987812/kelomp...
3,"2 Mantan Ketua Presidium PP PMKRI Gabung PSI, ...",jokowi,Muhammad Fardan Kaftaro,Pemilu,"Selasa, 17 Okt 2023 19:20 WIB",Dua mantan Ketua Presidium Pengurus Pusat Perh...,news.detik,https://news.detik.com/pemilu/d-6987588/2-mant...
4,Jokowi-Xi Jinping Bahas Pengiriman Beras China...,jokowi,Samuel Gading,Berita Ekonomi Bisnis,"Selasa, 17 Okt 2023 18:30 WIB",Presiden Joko Widodo diketahui melawat ke Repu...,finance.detik,https://finance.detik.com/berita-ekonomi-bisni...
5,"Masih Ngantor di Solo, Gibran Tak Diundang Dek...",jokowi,Tara Wahyu NV,Berita,"Rabu, 18 Okt 2023 11:00 WIB",Wali Kota Solo Gibran Rakabuming Raka batal be...,detik,https://www.detik.com/jateng/berita/d-6988349/...
6,Puan Tegaskan PDIP Tak Pecah Kongsi dengan Jok...,jokowi,Matius Hutajulu,Berita,"Rabu, 18 Okt 2023 10:34 WIB",Ketua DPP PDIP Puan Maharani menghadiri pengum...,news.detik,https://news.detik.com/berita/d-6988266/puan-t...
7,Projo Ungkap Hampir Pasti Gibran Cawapres Prabowo,jokowi,Helmy Akbar,Nusra,"Senin, 16 Okt 2023 19:11 WIB",Relawan Pro-Jokowi (Projo) mengungkapkan hampi...,detik,https://www.detik.com/bali/nusra/d-6985605/pro...
8,"Jokowi Buka Trade Expo Indonesia, Kemendag Bid...",jokowi,Anisa Indraini,Berita Ekonomi Bisnis,"Rabu, 18 Okt 2023 11:17 WIB",Presiden Joko Widodo (Jokowi) membuka pameran ...,finance.detik,https://finance.detik.com/berita-ekonomi-bisni...
9,Aturan Ekspor-Impor Barang Kiriman Berlaku Mul...,jokowi,Anisa Indraini,Berita Ekonomi Bisnis,"Selasa, 17 Okt 2023 18:45 WIB",Menteri Keuangan Sri Mulyani Indrawati menerap...,finance.detik,https://finance.detik.com/berita-ekonomi-bisni...


In [20]:
# import concurrent.futures

# # Define the maximum number of concurrent threads you want to use
# max_threads = 8  # Adjust this number based on your system's capabilities

# threads = []
# with concurrent.futures.ThreadPoolExecutor(max_threads) as executor:
#     for url in scraped_links:
#         thread = executor.submit(scrape_url, url, data["keywords"])
#         threads.append(thread)

# # Wait for all threads to complete
# for thread in threads:
#     thread.result()


In [21]:
# print(results)

In [22]:
# threads = []
# for url in links:
#     thread = threading.Thread(target=scrape_url, args=(url,data["keywords"]))
#     thread.start()
#     threads.append(thread)
    
# for thread in threads:
#     thread.join()