In [None]:
pip install pandas



In [None]:
pip install numpy



In [None]:
pip install seaborn



In [None]:
pip install requests beautifulsoup4



In [None]:
import requests
import pandas as pd
from bs4 import BeautifulSoup
from datetime import datetime

In [None]:
class Scraper:
    """Collect article listings from a paginated search endpoint.

    Intended call order: construct with the search keywords and the number
    of result pages, call ``fetch`` once to set the endpoint and check that
    it responds, call ``get_articles`` to gather the results, then
    ``save_to`` or ``show_results``.

    Attributes:
        keywords: Search string sent as the ``query`` parameter.
        pages: Number of result pages to walk; anything ``int()`` accepts.
        articles: List of article dicts filled in by ``get_articles``.
    """

    # Seconds to wait on each HTTP request. ``requests`` applies no timeout
    # by default, so a stalled connection would otherwise block forever.
    REQUEST_TIMEOUT = 10

    # Characters replaced with "_" when the keywords are embedded in an
    # output file name (path separators and other reserved characters).
    _UNSAFE_FILENAME_CHARS = '\\/:*?"<>|'

    def __init__(self, keywords, pages):
        """Store the search keywords and page count; no network access."""
        self.keywords = keywords
        self.pages = pages
        self.articles = []

    def fetch(self, base_url):
        """Request the first result page and remember the request settings.

        Stores ``base_url``, the query parameters and the headers on the
        instance so that ``get_articles`` can reuse them.

        Args:
            base_url: URL of the search endpoint.

        Returns:
            The ``requests`` response for page 1. A non-200 status is
            reported on stdout but the response is still returned so the
            caller can inspect ``status_code``.

        Raises:
            requests.exceptions.RequestException: on connection failure or
                when the server does not answer within ``REQUEST_TIMEOUT``.
        """
        self.base_url = base_url
        self.params = {
            'query': self.keywords,
            'sortby': 'time',
            'page': 1  # Start from page 1
        }

        self.headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/99.0.4844.35 Safari/537.36'
        }

        self.response = requests.get(
            self.base_url,
            params=self.params,
            headers=self.headers,
            timeout=self.REQUEST_TIMEOUT,
        )
        if self.response.status_code != 200:
            print(f"[!] Failed to fetch data: {self.response.status_code}")
        return self.response

    @staticmethod
    def _text_or_default(element, default):
        """Return the stripped text of ``element``, or ``default`` if it is missing."""
        return element.get_text(strip=True) if element is not None else default

    @classmethod
    def _parse_article(cls, article):
        """Turn one ``<article>`` element into a flat dict of its fields."""
        link = article.find("a", href=True)
        return {
            "title": cls._text_or_default(article.find("h2"), "No Title"),
            "category": cls._text_or_default(
                article.find("span", {"class": "category"}), "No Category"
            ),
            "published_time": cls._text_or_default(
                article.find("span", {"class": "date"}), "No Date"
            ),
            "href": link['href'] if link is not None else "No URL",
            "description": cls._text_or_default(article.find("p"), "No Description"),
        }

    def get_articles(self):
        """Walk pages ``1..pages`` and collect every ``<article>`` found.

        ``fetch`` must have been called first: this method reuses the
        ``base_url``, ``params`` and ``headers`` it stored. Pages that
        return a non-200 status or contain no ``<article>`` elements are
        reported on stdout and skipped.

        Returns:
            The list of article dicts (also stored in ``self.articles``),
            each with the keys ``title``, ``category``, ``published_time``,
            ``href`` and ``description``.

        Raises:
            ValueError: if ``self.pages`` cannot be converted with ``int()``.
            requests.exceptions.RequestException: on connection failure or
                timeout.
        """
        article_lists = []

        for page_num in range(1, int(self.pages) + 1):
            self.params['page'] = page_num  # Update the page parameter
            page = requests.get(
                self.base_url,
                params=self.params,
                headers=self.headers,
                timeout=self.REQUEST_TIMEOUT,
            )
            # Do not parse error pages as if they were search results.
            if page.status_code != 200:
                print(f"[!] Failed to fetch page {page_num}: {page.status_code}")
                continue

            soup = BeautifulSoup(page.text, "html.parser")

            # Find all articles (adjust the selector to the actual HTML structure)
            articles = soup.find_all("article")
            if not articles:
                print(f"[!] No articles found on page {page_num}")
                continue

            article_lists.extend(self._parse_article(article) for article in articles)

        self.articles = article_lists

        print("[~] Scraping finished!")
        print(f"[~] Total Articles: {len(self.articles)}")
        return self.articles

    def save_to(self, file_format="csv"):
        """Write ``self.articles`` to ``result_<keywords>_<timestamp>.<ext>``.

        The file is created in the current working directory. Characters in
        the keywords that are unsafe in file names are replaced with "_".

        Args:
            file_format: ``"csv"`` or ``"excel"`` (case-insensitive). Any
                other value is reported on stdout and nothing is written.
        """
        normalized_format = str(file_format).strip().lower()
        if normalized_format not in ("csv", "excel"):
            print(f"[!] Unsupported file format '{file_format}'; choose 'csv' or 'excel'")
            return

        time_scrape = datetime.now().strftime("%m%d%Y_%H%M%S")
        df = pd.DataFrame(self.articles)

        # Keywords come from user input; a "/" would otherwise be read as a
        # directory separator and make the write fail.
        safe_keywords = "".join(
            "_" if ch in self._UNSAFE_FILENAME_CHARS else ch
            for ch in str(self.keywords)
        )
        file_name = f"result_{safe_keywords}_{time_scrape}"

        if normalized_format == "csv":
            file_name += ".csv"
            df.to_csv(file_name, index=False)
        else:
            file_name += ".xlsx"
            df.to_excel(file_name, index=False)
        print(f"[~] Result saved to '{file_name}'")

    def show_results(self, row=5):
        """Print the collected articles with a 1-based row index.

        Args:
            row: Number of leading rows to print. A falsy value (``0`` or
                ``None``) prints the whole table.
        """
        df = pd.DataFrame(self.articles)
        df.index = df.index + 1  # number rows from 1 for display
        if row:
            print(df.head(row))
        else:
            print(df)

# Interactive driver: ask for the search terms, scrape, then save or show.
if __name__ == '__main__':
    keywords = input("[~] Keywords     : ")

    # Scraper.get_articles() converts the page count with int(); validate it
    # here so a typo leads to a re-prompt instead of an uncaught ValueError.
    while True:
        pages = input("[~] Total Pages  : ")
        try:
            if int(pages) >= 1:
                break
        except ValueError:
            pass  # not a number; fall through to the message below
        print("[!] Total pages must be a whole number of at least 1")

    base_url = "https://www.detik.com/search/searchall"

    scrape = Scraper(keywords, pages)
    response = scrape.fetch(base_url)
    if response.status_code == 200:
        articles = scrape.get_articles()

        try:
            ask = input("[~] Do you want to save the results? [y/n]: ").strip().lower()
            if ask == 'y':
                file_format = input("[~] Save to file format? [csv/excel]: ").strip().lower()
                scrape.save_to(file_format=file_format)
            elif ask == 'n':
                scrape.show_results()
            else:
                # Previously any other answer was ignored without feedback.
                print(f"[!] Unrecognized answer '{ask}'; results were neither saved nor shown")
        except Exception as e:
            # Best-effort: report save/display problems without a traceback.
            print(e)
    else:
        print("[~] Program Finished")

[~] Keywords     : gunadarma
[~] Total Pages  : 3
[~] Scraping finished!
[~] Total Articles: 36
[~] Do you want to save the results? [y/n]: y
[~] Save to file format? [csv/excel]: csv
[~] Result saved to 'result_gunadarma_09272024_074751.csv'


In [None]:
# Location of the CSV produced by the scraping run above. The file name
# embeds the scrape timestamp, so it must be updated after a new run.
RESULT_CSV = '/content/result_gunadarma_09272024_074751.csv'

data = pd.read_csv(RESULT_CSV)
data.head()

Unnamed: 0,title,category,published_time,href,description
0,detikFinance,No Category,No Date,https://finance.detik.com/infrastruktur/d-7374...,No Description
1,detikEdu,No Category,No Date,https://www.detik.com/edu/edutainment/d-710206...,No Description
2,detikEdu,No Category,No Date,https://www.detik.com/edu/advertorial-news-blo...,No Description
3,20Detik,No Category,No Date,https://20.detik.com/detikupdate/20240604-2406...,No Description
4,detikEdu,No Category,No Date,https://www.detik.com/edu/advertorial-news-blo...,No Description


In [None]:
# Dimensions of the loaded results as (rows, columns).
data.shape

(36, 5)

In [None]:
# Print every article URL in full; the data.head() preview truncates long links.
for url in data["href"].tolist():
    print(url)

https://finance.detik.com/infrastruktur/d-7374020/proyek-diresmikan-jokowi-ini-kampus-pertama-yang-mau-dibangun-di-ikn
https://www.detik.com/edu/edutainment/d-7102061/jokowi-letakkan-batu-pertama-gedung-ii-kampus-gunadarma-di-penyangga-ikn
https://www.detik.com/edu/advertorial-news-block/d-6738493/gunadarma-kembangkan-agrotechno-eco-edutourism-lewat-technopark
https://20.detik.com/detikupdate/20240604-240604158/jokowi-groundbreaking-kampus-pertama-di-ikn
https://www.detik.com/edu/advertorial-news-block/d-6738510/gunadarma-kembangkan-agrotechno-eco-edutourism-lewat-technopark
https://news.detik.com/detiktv/d-6470755/babak-baru-pelecehan-di-gunadarma-kini-pelaku-lapor-soal-dipersekusi
https://news.detik.com/berita/d-6466269/dede-yusuf-minta-persekusi-pelaku-pelecehan-di-gunadarma-tetap-diusut
https://finance.detik.com/foto-bisnis/d-5878955/gunadarma-bangun-kampus-di-podomoro-golf-view
https://www.detik.com/jateng/hukum-dan-kriminal/d-6461153/mahasiswa-gunadarma-pelaku-pelecehan-ditelanja