## Scraping Detik

In [21]:
from bs4 import BeautifulSoup
import requests
import pandas as pd
from datetime import datetime
import threading
import re

In [22]:
# Check that the site is reachable (an OK status shows as <Response [200]>)
url = "https://detik.com/index"
# A timeout is passed so an unresponsive server raises instead of blocking the kernel forever.
response = requests.get(url, timeout=10)
response

<Response [200]>

In [23]:
# --- Search settings ---------------------------------------------------------
keywords = "anies"   # search term sent to the site and used later for filtering
jumlah_index = 1     # number of search-result pages to walk through

# --- Shared accumulators filled by the worker threads ------------------------
# links: article URLs found on the search pages
# results: one dict per scraped article
# threads_link: thread handles for the link-collection step
links, results, threads_link = [], [], []

In [24]:
def scrape_links(page_number, keywords, timeout=10):
    """Collect article links from one page of the Detik search results.

    Args:
        page_number: Search-result page to request.
        keywords: Search term; sent as the ``query`` parameter and URL-encoded
            by requests, so spaces and characters such as ``&`` are safe.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        A list with the ``href`` of the first linked anchor inside each
        ``<article>`` element. The list is empty when the request fails or
        the page contains no linked articles.
    """
    search_url = "https://www.detik.com/search/searchall"
    params = {"query": keywords, "sortby": "time", "page": page_number}

    try:
        response = requests.get(search_url, params=params, timeout=timeout)
    except requests.exceptions.RequestException as e:
        # Report and return nothing so one failed page does not kill its worker thread.
        print(f"Error fetching search page {page_number}: {e}")
        return []

    soup = BeautifulSoup(response.text, "html.parser")

    page_links = []
    for article in soup.find_all('article'):
        # href=True skips anchors without a target; articles with no link are ignored
        # instead of raising on a missing element or attribute.
        anchor = article.find('a', href=True)
        if anchor is not None:
            page_links.append(anchor['href'])

    print(f"Scraped {len(page_links)} links from page {page_number}")
    return page_links

In [25]:
# Start from empty lists so re-running this cell does not duplicate links
# or keep thread handles from a previous run.
links.clear()
threads_link.clear()

# One worker thread per search-result page; each one appends the links it finds to `links`.
for page_number in range(1, jumlah_index + 1):
    # The page number is bound as a default argument so every thread keeps its own value.
    thread = threading.Thread(target=lambda p=page_number: links.extend(scrape_links(p, keywords)))
    thread.start()
    threads_link.append(thread)

# Wait for every page to finish before reporting the total.
for thread in threads_link:
    thread.join()
print("Total Links:", len(links))


Scraped 9 links from page 1
Total Links: 9


In [26]:
def _element_text(element, default):
    """Return the stripped text of a parsed HTML element, or `default` if the element is missing."""
    return element.text.strip() if element is not None else default


def scrape_url(url, keywords, timeout=10):
    """Download one article page and append its extracted fields to the global `results` list.

    The appended record is a dict with the keys ``title``, ``keywords``,
    ``author``, ``category``, ``date``, ``content``, ``nama_berita`` and
    ``link``. Any field whose element is not present on the page is filled
    with a "... not found" placeholder, so a partially matching page still
    produces a record.

    Args:
        url: Address of the article page.
        keywords: Search term stored alongside the record.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        None. Nothing is appended when the response status is not 200 or when
        an exception occurs; a message is printed instead.
    """
    try:
        response = requests.get(url, timeout=timeout)
        if response.status_code != 200:
            print(f"Failed to retrieve data from {url}")
            return

        soup = BeautifulSoup(response.text, 'html.parser')

        # Headline
        title_text = _element_text(soup.find('h1', {"class": "detail__title"}), "Title not found")

        # Author: only the text before the first '-' in the byline is kept.
        author_elem = soup.find('div', {"class": "detail__author"})
        if author_elem is not None:
            author_text = author_elem.get_text().split('-')[0].strip()
        else:
            author_text = "Author not found"

        # Publication date (kept as the raw text shown on the page)
        date_text = _element_text(soup.find('div', {"class": "detail__date"}), "Date not found")

        # Category: the breadcrumb may exist without the channel link, so both
        # lookups fall back to the placeholder instead of failing on None.
        category_text = "Category not found"
        breadcrumb = soup.find('div', {"class": "page__breadcrumb"})
        if breadcrumb is not None:
            category_link = breadcrumb.find('a', {"dtr-sec": "breadcrumbkanal"})
            category_text = _element_text(category_link, "Category not found")

        # Article body: every <p> inside the body container, one paragraph per line.
        content_text = "Content not found"
        body_elem = soup.find('div', {"class": "detail__body"})
        if body_elem is not None:
            paragraphs = "".join(p.text.strip() + "\n" for p in body_elem.find_all('p'))
            if paragraphs.strip():
                content_text = paragraphs

        # Source name: the host part of the URL without "www." and ".com".
        nama_berita_match = re.search(r'https://(?:www\.)?([a-zA-Z0-9.-]+)\.com', url)
        if nama_berita_match:
            nama_berita = nama_berita_match.group(1)
        else:
            nama_berita = "Nama_berita not found"

        results.append({'title': title_text,
                        'keywords': keywords,
                        'author': author_text,
                        'category': category_text,
                        'date': date_text,
                        'content': content_text,
                        'nama_berita': nama_berita,
                        'link': url})

    except requests.exceptions.RequestException as e:
        print(f"Error fetching URL '{url}': {e}")
    except Exception as e:
        print(f"Error processing URL '{url}': {e}")

In [27]:
# Empty the shared list first so re-running this cell does not append duplicate rows.
results.clear()

# One worker thread per article link; each one appends its record to `results`.
threads = []
for url in links:
    thread = threading.Thread(target=scrape_url, args=(url, keywords))
    thread.start()
    threads.append(thread)

# Block until every article has been processed.
for thread in threads:
    thread.join()

In [28]:
# Fix the column set explicitly so the frame has the same schema even when no
# article was scraped; otherwise an empty `results` yields a frame without
# columns and later column lookups raise KeyError.
df = pd.DataFrame(
    results,
    columns=['title', 'keywords', 'author', 'category',
             'date', 'content', 'nama_berita', 'link'],
)
print('hasil scrapping', len(results))
df.head(10)

hasil scrapping 9


Unnamed: 0,title,keywords,author,category,date,content,nama_berita,link
0,PKB Harap Anies-Cak Imin Bertemu Habib Rizieq ...,anies,Rolando Fransiscus Sihombing,Pemilu,"Jumat, 29 Sep 2023 07:58 WIB","Bakal capres dan cawapres Koalisi Perubahan, A...",news.detik,https://news.detik.com/pemilu/d-6955676/pkb-ha...
1,Pihak Habib Rizieq Bicara Permintaan Restu Ter...,anies,Rolando Fransiscus Sihombing,Pemilu,"Jumat, 29 Sep 2023 06:44 WIB",Habib Rizieq Shihab mengundang bakal capres da...,news.detik,https://news.detik.com/pemilu/d-6955622/pihak-...
2,Gus Mujib Janjikan Anies-Cak Imin Menang Mutla...,anies,Muhajir Arifin,Berita,"Jumat, 29 Sep 2023 01:00 WIB","Dari Banyuwangi dan Jember, Anies Baswedan dan...",detik,https://www.detik.com/jatim/berita/d-6955568/g...
3,Kiai Ponpes Nilai Bawaslu Akan Kebingungan den...,anies,Tim detikJatim,Berita,"Kamis, 28 Sep 2023 23:59 WIB",Pasangan bakal calon presiden dan wakil presid...,detik,https://www.detik.com/sumut/berita/d-6955560/k...
4,Safari Politik Anies-Cak Imin: Doa Kiai Hisyam...,anies,Suki Nurhalim,Berita,"Jumat, 29 Sep 2023 08:02 WIB","Kamis (28/9/2023), pasangan bakal capres dan c...",detik,https://www.detik.com/jatim/berita/d-6955675/s...
5,2 Bacapres 'Berebut' Dukungan Kiai dan Suara N...,anies,Tim detikJatim,Berita,"Jumat, 29 Sep 2023 02:01 WIB",Dua bakal calon presiden hari ini melakukan sa...,detik,https://www.detik.com/sumut/berita/d-6955564/2...
6,Anies Pimpin Tahlil Ziarah di Makam Maulana Ma...,anies,Jemmi Purwodianto,Berita,"Jumat, 29 Sep 2023 03:00 WIB",Bacapres Anies Baswedan ziarah ke makam Sunan ...,detik,https://www.detik.com/jatim/berita/d-6955586/a...
7,"Di Pasuruan, Cak Imin Minta Semuanya Tak Raguk...",anies,Muhajir Arifin,Berita,"Jumat, 29 Sep 2023 02:00 WIB",Pasangan bacapres Anies Baswedan dan bacawapre...,detik,https://www.detik.com/jatim/berita/d-6955580/d...
8,"Anies-Cak Imin Bertemu Habib Rizieq, PKS: Suda...",anies,Rolando Fransiscus Sihombing,Pemilu,"Jumat, 29 Sep 2023 07:16 WIB","Bakal capres dan cawapres Koalisi Perubahan, A...",news.detik,https://news.detik.com/pemilu/d-6955657/anies-...


In [29]:
# Keep only the articles that mention the keyword in the body or the headline.
# regex=False matches the keyword literally, so characters such as "+" or "("
# in it are not interpreted as a pattern; na=False counts missing text as no match.
mentions_keyword = (
    df['content'].str.contains(keywords, case=False, regex=False, na=False)
    | df['title'].str.contains(keywords, case=False, regex=False, na=False)
)
filtered_df = df[mentions_keyword]
print('hasil filter', len(filtered_df))
filtered_df.head(10)

hasil filter 9


Unnamed: 0,title,keywords,author,category,date,content,nama_berita,link
0,PKB Harap Anies-Cak Imin Bertemu Habib Rizieq ...,anies,Rolando Fransiscus Sihombing,Pemilu,"Jumat, 29 Sep 2023 07:58 WIB","Bakal capres dan cawapres Koalisi Perubahan, A...",news.detik,https://news.detik.com/pemilu/d-6955676/pkb-ha...
1,Pihak Habib Rizieq Bicara Permintaan Restu Ter...,anies,Rolando Fransiscus Sihombing,Pemilu,"Jumat, 29 Sep 2023 06:44 WIB",Habib Rizieq Shihab mengundang bakal capres da...,news.detik,https://news.detik.com/pemilu/d-6955622/pihak-...
2,Gus Mujib Janjikan Anies-Cak Imin Menang Mutla...,anies,Muhajir Arifin,Berita,"Jumat, 29 Sep 2023 01:00 WIB","Dari Banyuwangi dan Jember, Anies Baswedan dan...",detik,https://www.detik.com/jatim/berita/d-6955568/g...
3,Kiai Ponpes Nilai Bawaslu Akan Kebingungan den...,anies,Tim detikJatim,Berita,"Kamis, 28 Sep 2023 23:59 WIB",Pasangan bakal calon presiden dan wakil presid...,detik,https://www.detik.com/sumut/berita/d-6955560/k...
4,Safari Politik Anies-Cak Imin: Doa Kiai Hisyam...,anies,Suki Nurhalim,Berita,"Jumat, 29 Sep 2023 08:02 WIB","Kamis (28/9/2023), pasangan bakal capres dan c...",detik,https://www.detik.com/jatim/berita/d-6955675/s...
5,2 Bacapres 'Berebut' Dukungan Kiai dan Suara N...,anies,Tim detikJatim,Berita,"Jumat, 29 Sep 2023 02:01 WIB",Dua bakal calon presiden hari ini melakukan sa...,detik,https://www.detik.com/sumut/berita/d-6955564/2...
6,Anies Pimpin Tahlil Ziarah di Makam Maulana Ma...,anies,Jemmi Purwodianto,Berita,"Jumat, 29 Sep 2023 03:00 WIB",Bacapres Anies Baswedan ziarah ke makam Sunan ...,detik,https://www.detik.com/jatim/berita/d-6955586/a...
7,"Di Pasuruan, Cak Imin Minta Semuanya Tak Raguk...",anies,Muhajir Arifin,Berita,"Jumat, 29 Sep 2023 02:00 WIB",Pasangan bacapres Anies Baswedan dan bacawapre...,detik,https://www.detik.com/jatim/berita/d-6955580/d...
8,"Anies-Cak Imin Bertemu Habib Rizieq, PKS: Suda...",anies,Rolando Fransiscus Sihombing,Pemilu,"Jumat, 29 Sep 2023 07:16 WIB","Bakal capres dan cawapres Koalisi Perubahan, A...",news.detik,https://news.detik.com/pemilu/d-6955657/anies-...


Save Data

In [30]:
# Export step (currently disabled): uncomment the lines below to write filtered_df
# to a timestamped Excel file and print the path it was saved to.
# current_datetime = datetime.now().strftime("%Y-%m-%d_%H-%M-%S")
# excel_file_name = f'../tempat_simpan_nasional/detik_{keywords}_{current_datetime}.xlsx'
# filtered_df.to_excel(excel_file_name, index=False)

# print(f'Data has been saved to {excel_file_name}')