## Scraping Detik

In [1]:
from bs4 import BeautifulSoup
import requests
import pandas as pd
from datetime import datetime
import threading
import re

In [2]:
# Sanity check: confirm the Detik news index is reachable (expect HTTP 200).
# The timeout keeps this cell from hanging forever if the server never answers.
url = "https://news.detik.com/index"
response = requests.get(url, timeout=30)
response

<Response [200]>

In [3]:
# --- Scraper settings -------------------------------------------------
keywords = 'anies'    # search term sent to Detik; also used later to filter rows
jumlah_index = 1      # number of search-result pages to crawl

# --- Shared containers filled by the cells below ----------------------
# links: article URLs, results: parsed article records,
# threads_link: handles of the link-collecting threads.
links, results, threads_link = [], [], []

In [4]:
def scrape_links(page_number, keywords, timeout=30):
    """Collect article URLs from one page of Detik search results.

    Parameters
    ----------
    page_number : int
        Search-result page to request.
    keywords : str
        Search query. It is sent through ``params`` so that spaces, ``&`` and
        other reserved characters are URL-encoded instead of corrupting the
        query string.
    timeout : float, optional
        Seconds to wait for the server before giving up (default 30).

    Returns
    -------
    list of str
        The ``href`` of the first ``<a>`` inside every ``<article>`` element.
        Articles whose first anchor is missing or has no ``href`` are skipped.
        An empty list is returned when the server does not answer with 200.

    Raises
    ------
    requests.exceptions.RequestException
        If the request itself fails (connection error, timeout, ...).
    """
    response = requests.get(
        "https://www.detik.com/search/searchall",
        params={"query": keywords, "sortby": "time", "page": page_number},
        timeout=timeout,
    )
    if response.status_code != 200:
        # Do not try to parse an error page as if it were search results.
        print(f"Failed to retrieve data from {response.url}")
        return []

    soup = BeautifulSoup(response.text, "html.parser")

    page_links = []
    for article in soup.find_all('article'):
        anchor = article.find('a')
        # Guard against articles without an anchor, or an anchor without href,
        # which would otherwise raise and abort the whole page.
        href = anchor.get('href') if anchor is not None else None
        if href:
            page_links.append(href)

    print(f"Scraped {len(page_links)} links from page {page_number}")
    return page_links

In [5]:
# Reset shared state so re-running this cell does not append duplicate links
# or keep thread handles left over from a previous run.
links.clear()
threads_link.clear()
links_by_page = {}


def _collect_page(page_number):
    """Worker: store the links of one search page under its page number."""
    links_by_page[page_number] = scrape_links(page_number, keywords)


# One thread per search-result page.
for page_number in range(1, jumlah_index + 1):
    thread = threading.Thread(target=_collect_page, args=(page_number,))
    thread.start()
    threads_link.append(thread)

for thread in threads_link:
    thread.join()

# Flatten in page order so `links` does not depend on which thread finished
# first; a page whose worker raised simply contributes no links.
for page_number in range(1, jumlah_index + 1):
    links.extend(links_by_page.get(page_number, []))
print("Total Links:", len(links))


Scraped 9 links from page 1
Total Links: 9


In [6]:
def scrape_url(url, keywords, timeout=30):
    """Download one article page and append its parsed fields to ``results``.

    Each field falls back to a "... not found" placeholder when the matching
    element is absent, so a partially different page layout still produces a
    row instead of being dropped.

    Parameters
    ----------
    url : str
        Article URL to fetch.
    keywords : str
        Search term that produced this URL; stored verbatim in the record.
    timeout : float, optional
        Seconds to wait for the server before giving up (default 30).

    Returns
    -------
    None
        On success a dict with the keys ``title``, ``keywords``, ``author``,
        ``category``, ``date``, ``content``, ``nama_berita`` and ``link`` is
        appended to the module-level ``results`` list. Failures are printed
        and nothing is appended; no exception propagates to the caller.
    """
    try:
        response = requests.get(url, timeout=timeout)
        if response.status_code != 200:
            print(f"Failed to retrieve data from {url}")
            return

        soup = BeautifulSoup(response.text, 'html.parser')

        # Headline
        title_elem = soup.find('h1', {"class": "detail__title"})
        title_text = title_elem.text.strip() if title_elem else "Title not found"

        # Author: keep only the text before the first '-'
        # (presumably "Name - Channel"; confirm against the page markup).
        author_elem = soup.find('div', {"class": "detail__author"})
        if author_elem:
            author_text = author_elem.get_text().split('-')[0].strip()
        else:
            author_text = "Author not found"

        # Publication date, kept as the raw page text
        date_elem = soup.find('div', {"class": "detail__date"})
        date_text = date_elem.text.strip() if date_elem else "Date not found"

        # Category: the breadcrumb container may exist without the channel
        # anchor, so both lookups are guarded; previously that case raised
        # and the whole article was discarded by the generic handler below.
        breadcrumb_elem = soup.find('div', {"class": "page__breadcrumb"})
        category_elem = None
        if breadcrumb_elem:
            category_elem = breadcrumb_elem.find('a', {"dtr-sec": "breadcrumbkanal"})
        category_text = category_elem.text.strip() if category_elem else "Category not found"

        # Article body: every paragraph on its own line
        body_elem = soup.find('div', {"class": "detail__body"})
        content_text = ""
        if body_elem:
            content_text = "".join(
                p.text.strip() + "\n" for p in body_elem.find_all('p')
            )
        if not content_text.strip():
            content_text = "Content not found"

        # Host part of the URL (text between "https://" and the next "/")
        nama_berita_match = re.search(r'https://(.*?)/', url)
        if nama_berita_match:
            nama_berita = nama_berita_match.group(1)
        else:
            nama_berita = "Nama_berita not found"

        results.append({'title': title_text,
                        'keywords': keywords,
                        'author': author_text,
                        'category': category_text,
                        'date': date_text,
                        'content': content_text,
                        'nama_berita': nama_berita,
                        'link': url})

    except requests.exceptions.RequestException as e:
        print(f"Error fetching URL '{url}': {e}")
    except Exception as e:
        # Best-effort scraping: report the failure and let other URLs proceed.
        print(f"Error processing URL '{url}': {e}")

In [7]:
# Start from an empty result list so re-running this cell does not duplicate
# rows that a previous run already appended.
results.clear()

# One thread per article URL; each worker appends its record to `results`.
threads = []
for url in links:
    thread = threading.Thread(target=scrape_url, args=(url, keywords))
    thread.start()
    threads.append(thread)

# Wait until every article has been processed before building the table.
for thread in threads:
    thread.join()

In [8]:
# Pin the column schema (same keys and order as the records appended by
# scrape_url) so the frame still has these columns when nothing was scraped;
# otherwise the filter cell fails with KeyError on an empty result.
df = pd.DataFrame(
    results,
    columns=['title', 'keywords', 'author', 'category',
             'date', 'content', 'nama_berita', 'link'],
)
print('hasil scrapping', len(results))
df.head(10)

hasil scrapping 9


Unnamed: 0,title,keywords,author,category,date,content,nama_berita,link
0,Cak Imin Ungkap Ada Ilmuwan dari Perwakilan Di...,anies,Kadek Melda Luxiana,Pemilu,"Rabu, 27 Sep 2023 16:13 WIB",Bacawapres Koalisi Perubahan Muhaimin Iskandar...,news.detik.com,https://news.detik.com/pemilu/d-6953433/cak-im...
1,Cak Imin Klaim Hasil Survei Internal di Jatim ...,anies,Kadek Melda Luxiana,Pemilu,"Rabu, 27 Sep 2023 13:21 WIB",Bakal calon wakil presiden (Bacawapres) dari K...,news.detik.com,https://news.detik.com/pemilu/d-6953031/cak-im...
2,Tafsir Satu Meja Prabowo dan Mega,anies,Tim detikcom,Pemilu,"Rabu, 27 Sep 2023 08:24 WIB",Dalam acara Hari Nasional Kerajaan Arab Saudi ...,news.detik.com,https://news.detik.com/pemilu/d-6952502/tafsir...
3,Menerka Capres yang Didukung PSI Usai Kaesang ...,anies,Isal Mawardi,Pemilu,"Rabu, 27 Sep 2023 07:16 WIB","Putra bungsu Presiden Joko Widodo (Jokowi), Ka...",news.detik.com,https://news.detik.com/pemilu/d-6952451/menerk...
4,Hasrat Politik dan Eufemisme Kampanye Terselubung,anies,Muhammad Aufal Fresky,Kolom,"Rabu, 27 Sep 2023 14:30 WIB",Kontestasi politik lima tahunan sebentar lagi ...,news.detik.com,https://news.detik.com/kolom/d-6952955/hasrat-...
5,Eco City vs Ego City,anies,Ari Sukmayadi,Kolom,"Rabu, 27 Sep 2023 11:52 WIB",Kemajuan suatu peradaban paling mudah dilihat ...,news.detik.com,https://news.detik.com/kolom/d-6952849/eco-cit...
6,"Cak Imin Ngaku Rutin Survei Internal, Klaim Ja...",anies,Author not found,Category not found,"648 Views | Rabu, 27 Sep 2023 14:35 WIB",Bacawapres Koalisi Perubahan sekaligus Ketum P...,news.detik.com,https://news.detik.com/detiktv/d-6953242/cak-i...
7,Anggota DPRD Medan Mundur dari Golkar-Dukung A...,anies,Nizar Aldi,Berita,"Rabu, 27 Sep 2023 14:50 WIB",Anggota DPRD Medan Muhammad Afri Rizki Lubis m...,www.detik.com,https://www.detik.com/sumut/berita/d-6953222/a...
8,"Pilih Dukung Anies, Anggota DPRD Medan Mundur ...",anies,Nizar Aldi,Berita,"Rabu, 27 Sep 2023 09:41 WIB",Anggota DPRD Medan Fraksi Partai Golkar Muhamm...,www.detik.com,https://www.detik.com/sumut/berita/d-6952569/p...


In [9]:
# Keep rows that mention the keyword in the body or the headline.
# regex=False matches the keyword literally (characters such as "+", "(" or
# "." would otherwise be interpreted as a regular expression), and na=False
# treats missing text as "no match" instead of producing NaN in the mask.
in_content = df['content'].str.contains(keywords, case=False, regex=False, na=False)
in_title = df['title'].str.contains(keywords, case=False, regex=False, na=False)
filtered_df = df[in_content | in_title]
print('hasil filter', len(filtered_df))
filtered_df.head(10)

hasil filter 8


Unnamed: 0,title,keywords,author,category,date,content,nama_berita,link
0,Cak Imin Ungkap Ada Ilmuwan dari Perwakilan Di...,anies,Kadek Melda Luxiana,Pemilu,"Rabu, 27 Sep 2023 16:13 WIB",Bacawapres Koalisi Perubahan Muhaimin Iskandar...,news.detik.com,https://news.detik.com/pemilu/d-6953433/cak-im...
1,Cak Imin Klaim Hasil Survei Internal di Jatim ...,anies,Kadek Melda Luxiana,Pemilu,"Rabu, 27 Sep 2023 13:21 WIB",Bakal calon wakil presiden (Bacawapres) dari K...,news.detik.com,https://news.detik.com/pemilu/d-6953031/cak-im...
3,Menerka Capres yang Didukung PSI Usai Kaesang ...,anies,Isal Mawardi,Pemilu,"Rabu, 27 Sep 2023 07:16 WIB","Putra bungsu Presiden Joko Widodo (Jokowi), Ka...",news.detik.com,https://news.detik.com/pemilu/d-6952451/menerk...
4,Hasrat Politik dan Eufemisme Kampanye Terselubung,anies,Muhammad Aufal Fresky,Kolom,"Rabu, 27 Sep 2023 14:30 WIB",Kontestasi politik lima tahunan sebentar lagi ...,news.detik.com,https://news.detik.com/kolom/d-6952955/hasrat-...
5,Eco City vs Ego City,anies,Ari Sukmayadi,Kolom,"Rabu, 27 Sep 2023 11:52 WIB",Kemajuan suatu peradaban paling mudah dilihat ...,news.detik.com,https://news.detik.com/kolom/d-6952849/eco-cit...
6,"Cak Imin Ngaku Rutin Survei Internal, Klaim Ja...",anies,Author not found,Category not found,"648 Views | Rabu, 27 Sep 2023 14:35 WIB",Bacawapres Koalisi Perubahan sekaligus Ketum P...,news.detik.com,https://news.detik.com/detiktv/d-6953242/cak-i...
7,Anggota DPRD Medan Mundur dari Golkar-Dukung A...,anies,Nizar Aldi,Berita,"Rabu, 27 Sep 2023 14:50 WIB",Anggota DPRD Medan Muhammad Afri Rizki Lubis m...,www.detik.com,https://www.detik.com/sumut/berita/d-6953222/a...
8,"Pilih Dukung Anies, Anggota DPRD Medan Mundur ...",anies,Nizar Aldi,Berita,"Rabu, 27 Sep 2023 09:41 WIB",Anggota DPRD Medan Fraksi Partai Golkar Muhamm...,www.detik.com,https://www.detik.com/sumut/berita/d-6952569/p...


## Save Data

In [10]:
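# Optional export (currently disabled). Uncomment the lines below to write
# filtered_df to a timestamped Excel file; the ./excel directory must exist.
# current_datetime = datetime.now().strftime("%Y-%m-%d_%H-%M-%S")
# excel_file_name = f'./excel/detik_{keywords}_{current_datetime}.xlsx'
# filtered_df.to_excel(excel_file_name, index=False)

# print(f'Data has been saved to {excel_file_name}')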