In [12]:
import pandas as pd
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from webdriver_manager.chrome import ChromeDriverManager
from bs4 import BeautifulSoup
import time

# Read the list of article URLs; missing values and duplicates are discarded
urls_df = pd.read_csv('url_detik_tabrakan.csv')
urls = urls_df.loc[:, 'url'].dropna().unique()

# Accumulator for scraped results: two parallel lists, one entry per article
articles_data = {column: [] for column in ('url', 'article_text')}

# Start a Chrome WebDriver session; the Service is built from the path
# returned by ChromeDriverManager().install()
service = Service(ChromeDriverManager().install())
driver = webdriver.Chrome(service=service)

# Function to fetch article content
def fetch_article_text(url, scroll_pause_time=1, max_scrolls=50):
    """Load ``url`` in the shared browser and return the article body as text.

    The page is opened with the module-level ``driver``, scrolled down one
    screen-height at a time, and the rendered HTML is then parsed with
    BeautifulSoup. Paragraphs inside the article container are collected,
    skipping empty ones and ones containing promotional markers.

    Args:
        url: Address of the article page to load.
        scroll_pause_time: Seconds to sleep after each scroll step.
        max_scrolls: Upper bound on scroll steps, so a page that keeps
            growing (or a zero screen height) cannot loop forever.

    Returns:
        The paragraphs joined by single spaces (possibly an empty string),
        or ``None`` if the article container is not found or any exception
        is raised while loading/parsing (the error is printed, not raised).
    """
    # Paragraphs containing any of these (case-insensitive) are dropped.
    skip_markers = ("baca juga", "simak juga", "download apps detikcom")

    try:
        driver.get(url)

        # Scroll down step by step; stop once the next step would pass the
        # end of the document or the step budget is exhausted.
        screen_height = driver.execute_script("return window.screen.height;")
        if screen_height and screen_height > 0:
            for step in range(1, max_scrolls + 1):
                driver.execute_script(f"window.scrollTo(0, {screen_height * step});")
                time.sleep(scroll_pause_time)
                scroll_height = driver.execute_script("return document.body.scrollHeight;")
                if screen_height * (step + 1) > scroll_height:
                    break

        soup = BeautifulSoup(driver.page_source, "html.parser")

        # A CSS selector matches both classes regardless of their order or of
        # extra classes; find(class_="a b") only matches that exact string.
        article_body = soup.select_one('div.detail__body-text.itp_bodycontent')
        if article_body:
            kept_paragraphs = []
            for paragraph in article_body.find_all('p'):
                # get_text(strip=True) would glue words around inline tags
                # together; normalise whitespace ourselves instead.
                text = ' '.join(paragraph.get_text().split())
                if not text:
                    continue
                lowered = text.lower()
                if any(marker in lowered for marker in skip_markers):
                    continue
                kept_paragraphs.append(text)

            return ' '.join(kept_paragraphs)

    except Exception as e:
        # Best-effort scraping: report the failure and let the caller move on.
        print(f"Failed to retrieve article from {url}: {e}")

    return None

# Loop through all URLs and fetch articles
try:
    for url in urls:
        print(f"Fetching article from {url}")
        article_text = fetch_article_text(url)
        # Only keep URLs for which non-empty text was extracted
        if article_text:
            articles_data['url'].append(url)
            articles_data['article_text'].append(article_text)

        # Pause between page loads
        time.sleep(1)
finally:
    # Always close the browser, even if the loop is interrupted or raises,
    # so no Chrome process is left running.
    driver.quit()

# Save the articles data to a DataFrame and export to CSV
articles_df = pd.DataFrame(articles_data)
articles_df.to_csv('detik_article_tabrakan.csv', index=False)

print("Article Collection Complete, Data Saved to 'detik_article_tabrakan.csv'.")


Fetching article from https://news.detik.com/berita/d-6709913/anak-korban-berang-prada-mw-kabur-usai-tabrakan-maut-mana-jiwa-kesatrianya
Fetching article from https://news.detik.com/berita/d-7370729/pengendara-mobil-tabrak-dan-lindas-nenek-di-palembang-diamankan-polisi
Fetching article from https://news.detik.com/berita/d-7125507/8-perjalanan-ka-dialihkan-imbas-tabrakan-kereta-turangga-ka-lokal-bandung
Fetching article from https://news.detik.com/berita/d-6770135/ada-4-motor-yang-ditabrak-dan-terseret-truk-hingga-berapi-di-jambi
Fetching article from https://news.detik.com/berita/d-7156516/dinding-warung-toko-velg-mobil-rusak-imbas-tabrakan-beruntun-di-puncak-bogor
Fetching article from https://news.detik.com/berita/d-7126110/tabrakan-2-kereta-terjadi-di-single-track-ini-penjelasan-kai-soal-jadwal
Fetching article from https://news.detik.com/berita/d-7477264/polisi-amankan-sopir-angkot-penabrak-pejalan-kaki-kios-di-depok
Fetching article from https://news.detik.com/berita/d-6925383/kod

In [None]:
# Count how often each distinct (url, article_text) row occurs in the scraped
# data. NOTE(review): this displays the full article text in the index —
# presumably meant as a duplicate check; confirm intent.
articles_df.value_counts()