**Scraping Data**

In [1]:
import csv
import time

from selenium import webdriver
from selenium.common.exceptions import NoSuchElementException, StaleElementReferenceException
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from webdriver_manager.chrome import ChromeDriverManager

In [2]:
# Set up Selenium with a headless Chrome session.
options = Options()
# Request headless mode through a Chrome command-line switch. The
# `options.headless` attribute was deprecated and later removed in Selenium 4;
# on versions without it, the assignment just sets an unused attribute and the
# browser opens with a visible window.
options.add_argument("--headless=new")
# ChromeDriverManager().install() supplies the chromedriver path for the Service.
driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()), options=options)

In [3]:
# Listing pages are addressed by appending a page number to this URL prefix.
base_url = "https://www.weforum.org/agenda/?page="

# How many listing pages to visit, counting up from page 1.
num_pages = 10

In [4]:
def _text_or_na(parent, css_selector):
    """Return the stripped text of the first match of ``css_selector`` under ``parent``.

    Args:
        parent: The WebDriver or WebElement to search within.
        css_selector: CSS selector of the element whose text is wanted.

    Returns:
        The element's text with surrounding whitespace removed, or ``'N/A'``
        when the element is missing or has gone stale.
    """
    try:
        return parent.find_element(By.CSS_SELECTOR, css_selector).text.strip()
    except (NoSuchElementException, StaleElementReferenceException):
        # Only lookup failures fall back to the placeholder; anything else
        # (including KeyboardInterrupt) propagates instead of being swallowed.
        return 'N/A'


# Collect one record (dict) per article across all listing pages.
results = []
for page in range(1, num_pages + 1):
    url = base_url + str(page)
    print(f"Scraping page {page}: {url}")
    driver.get(url)

    # Wait briefly so the page can finish loading before querying it.
    time.sleep(3)

    # Grab the article elements on this listing page.
    articles = driver.find_elements(By.CSS_SELECTOR, 'article.wef-1rl6rhs')
    if not articles:
        # Make an empty page visible instead of silently contributing no rows.
        print(f"Warning: no articles matched on page {page}; the CSS selectors may be out of date")

    for article in articles:
        category = _text_or_na(article, 'a.chakra-badge')
        title = _text_or_na(article, 'a.chakra-link.wef-1c7l3mo')
        description = _text_or_na(article, 'div.wef-1c4gmja')

        try:
            author_info = article.find_element(By.CSS_SELECTOR, 'div.wef-1bf8j10')
        except (NoSuchElementException, StaleElementReferenceException):
            # No author/date container at all: both fields are unknown.
            authors = 'N/A'
            date = 'N/A'
        else:
            # Look the two fields up independently so that a missing date does
            # not discard an author that was found (and vice versa).
            authors = _text_or_na(author_info, 'p.chakra-text.wef-kefhaq')
            date = _text_or_na(author_info, 'p.chakra-text.wef-1iho44l')

        results.append({
            'kategori': category,
            'judul': title,
            'deskripsi': description,
            'penulis': authors,
            'tanggal': date
        })

Scraping page 1: https://www.weforum.org/agenda/?page=1
Scraping page 2: https://www.weforum.org/agenda/?page=2
Scraping page 3: https://www.weforum.org/agenda/?page=3
Scraping page 4: https://www.weforum.org/agenda/?page=4
Scraping page 5: https://www.weforum.org/agenda/?page=5
Scraping page 6: https://www.weforum.org/agenda/?page=6
Scraping page 7: https://www.weforum.org/agenda/?page=7
Scraping page 8: https://www.weforum.org/agenda/?page=8
Scraping page 9: https://www.weforum.org/agenda/?page=9
Scraping page 10: https://www.weforum.org/agenda/?page=10


In [5]:
# Persist the scraped records to a UTF-8 CSV file, one row per article.
csv_file = 'weforum_articles_all_pages.csv'
with open(csv_file, mode='w', newline='', encoding='utf-8') as file:
    writer = csv.DictWriter(
        file,
        fieldnames=['kategori', 'judul', 'deskripsi', 'penulis', 'tanggal'],
    )
    writer.writeheader()
    writer.writerows(results)

print(f"Data berhasil disimpan ke {csv_file}")

# Shut down the browser session now that the data has been written.
driver.quit()

Data berhasil disimpan ke weforum_articles_all_pages.csv


**Klasifikasi Kategori**

In [21]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report, accuracy_score

# Load the scraped articles.
# A path relative to the notebook's working directory is used instead of an
# absolute, machine-specific one so the notebook can run on other machines.
# The file name is the one the scraping section writes its CSV to.
data = pd.read_csv("weforum_articles_all_pages.csv")
data.head()

Unnamed: 0,kategori,judul,deskripsi,penulis,tanggal
0,BUSINESS,"5 ways to build innovation cultures at work, a...",Key structures support the most innovative tea...,Andrea Willige,"August 10, 2024"
1,"RESILIENCE, PEACE AND SECURITY",Sudan: Learning lessons from the past in searc...,For civil war ceasefire talks in Geneva to suc...,Mohamed Elshabik,"August 9, 2024"
2,HEALTH AND HEALTHCARE SYSTEMS,Innovative mouth-based touchpad enables people...,MouthPad enables paralyzed individuals to cont...,Zach Winn,"August 9, 2024"
3,FINANCIAL AND MONETARY SYSTEMS,"US jobless claims fall, easing market fears an...",Top economy stories: US recession fears abate ...,Rebecca Geldard and Joe Myers,"August 9, 2024"
4,FINANCIAL AND MONETARY SYSTEMS,"From Lagos to Abidjan: 1,000km of entrepreneur...",An improved digital payments framework along t...,Moustapha Mamy Diaby and Cynthia Gnassingbe,"August 9, 2024"


In [9]:
# Features and target.
# The scraper writes the literal 'N/A' for any field it could not find, and
# pandas' default CSV parsing reads that token back as a missing value (NaN).
# Rows without a title or a category cannot be used for classification, so
# they are dropped before X and y are built.
data_labeled = data.dropna(subset=['judul', 'kategori'])
X = data_labeled['judul']  # text column: article title
y = data_labeled['kategori']  # target column: article category

# The train/test split is not performed in this cell: the next cell splits
# X and y and would overwrite all four resulting variables anyway.


In [16]:
# Partition into training and test sets: 20% of the samples are held out for
# testing, and the fixed random_state keeps the partition reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=42,
    test_size=0.2,
)

In [19]:
# Display the training features.
# NOTE(review): the recorded output of this cell is a sparse float64 matrix,
# whereas the cells visible above build X from the raw title column. A text
# vectorization step (presumably the imported TfidfVectorizer) appears to have
# run in a cell that is not shown here. Confirm, and restore it so the notebook
# reproduces this output under Restart & Run All.
X_train

<Compressed Sparse Row sparse matrix of dtype 'float64'
	with 2148 stored elements and shape (119, 1443)>