# Data Collection

## Setup Session

In [2]:
import requests
from bs4 import BeautifulSoup
from datetime import datetime, timedelta
import time
import random
from requests.adapters import HTTPAdapter
from slugify import slugify
import uuid
from urllib3.util.retry import Retry

# Shared HTTP session so every request in this notebook uses the same retry policy and headers.
session = requests.Session()
# Retry up to 5 times, with a backoff factor of 0.1, when the server answers 500/502/503/504.
retry = Retry(total=5, backoff_factor=0.1, status_forcelist=[500, 502, 503, 504])
adapter = HTTPAdapter(max_retries=retry)
session.mount('http://', adapter)
session.mount('https://', adapter)

# Send a desktop-Chrome User-Agent string with every request made through the session.
session.headers.update({'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'})     

# Crawl configuration.
# Index pages are requested as f'{base_url}/{category}/all/{date}'.
base_url = 'https://www.tvonenews.com/indeks'
news_categories = ['ekonomi', 'sport']
# Text file that stores the collected article URLs, one per line.
all_links_file = '../data/all_article_links.txt'
# The crawl walks backwards one day at a time from current_date until end_date (365 days earlier).
current_date = datetime.now()
end_date = datetime.now() - timedelta(days=365)
# Per-category article budget; the crawl loop multiplies it by the category's
# position, so the cap is applied to the running total of collected links.
category_articles_limit = 2100
# Accumulates every article URL found, across all categories and dates (a set, so duplicates collapse).
all_article_links = set()

## Crawling

In [None]:
def get_articles_from_page(category, date):
    """Collect article links from one category index page.

    Fetches ``{base_url}/{category}/all/{date}`` and adds the ``href`` of every
    ``<a class="ali-title">`` anchor to the module-level ``all_article_links`` set.

    Args:
        category: Category path segment of the index URL.
        date: Date path segment of the index URL (the caller passes ``YYYY/MM/DD``).

    Returns:
        None. Request failures are printed and swallowed so that the caller's
        crawl loop can move on to the next date.
    """
    page_url_to_crawl = f'{base_url}/{category}/all/{date}'
    try:
        response = session.get(page_url_to_crawl, timeout=10)
        response.raise_for_status()
        soup = BeautifulSoup(response.content, 'html.parser')

        # href=True skips title anchors that carry no href attribute; indexing
        # such a tag with a['href'] would raise KeyError and abort the crawl.
        article_links = {
            a['href'] for a in soup.find_all('a', class_='ali-title', href=True)
        }
        all_article_links.update(article_links)

        if article_links:
            # The number printed is the running total of collected links,
            # not the count found on this page alone.
            print(f"Berhasil mendapatkan {len(all_article_links)} artikel pada date {date}")

        # Random pause between successful page fetches.
        time.sleep(random.uniform(1, 3))
    except requests.RequestException as e:
        print(f"Error fetching on date {date}: {e}")

# Crawl every category day by day, walking backwards from current_date until
# either end_date is reached or the cumulative link budget is used up.
for count, category in enumerate(news_categories, start=1):
    current_date_for_category = current_date
    # The budget is cumulative: all_article_links holds links from every
    # category crawled so far, so the n-th category may fill the set up to
    # n * category_articles_limit entries.
    limit = category_articles_limit * count
    print(f"\nCATEGORY: {category.upper()}\n")
    while current_date_for_category > end_date and len(all_article_links) < limit:
        formatted_date = current_date_for_category.strftime('%Y/%m/%d')
        get_articles_from_page(category, formatted_date)
        current_date_for_category -= timedelta(days=1)

# Persist the links one per line with no trailing newline. str.join produces
# exactly that layout and avoids the nested-quote f-string, which is a
# SyntaxError on Python versions before 3.12.
with open(all_links_file, 'w', encoding='utf-8') as f:
    f.write("\n".join(all_article_links))

## Scraping

In [3]:
def _text_or_default(soup, tag, css_class, default):
    """Return the text of the first ``tag`` with class ``css_class``, or ``default`` if none exists."""
    element = soup.find(tag, class_=css_class)
    return element.get_text() if element is not None else default


def get_article_details(article_url):
    """Download one article page and extract its fields.

    Args:
        article_url: URL of the article page to fetch.

    Returns:
        A dict with the keys ``title``, ``date``, ``url``, ``category``,
        ``summary``, ``author`` and ``content``. Any field whose element is
        missing from the page is filled with a ``'No ...'`` placeholder.
        Returns ``None`` if the HTTP request fails.
    """
    try:
        response = session.get(article_url, timeout=10)
        response.raise_for_status()
        soup = BeautifulSoup(response.content, 'html.parser')

        # Each lookup is guarded on the exact element it reads from, so a page
        # with an <h1> that lacks the detail-title class (or an author box
        # without a link) yields the placeholder instead of AttributeError.
        title = _text_or_default(soup, 'h1', 'detail-title', 'No title')
        date = _text_or_default(soup, 'div', 'detail-date', 'No date')
        summary = _text_or_default(soup, 'h2', 'detail-summary', 'No summary')
        content = _text_or_default(soup, 'div', 'detail-content', 'No content')

        author_box = soup.find('div', class_='detail-author-data')
        author_link = author_box.find('a') if author_box is not None else None
        author = author_link.get_text() if author_link is not None else 'No author'

        # Every URL that does not contain '/ekonomi/' is labelled 'Sport'.
        category = 'Ekonomi' if '/ekonomi/' in article_url else 'Sport'

        return {
            'title': title,
            'date': date,
            'url': article_url,
            'category': category,
            'summary': summary,
            'author': author,
            'content': content,
        }
    except requests.RequestException as e:
        print(f"Error fetching article {article_url}: {e}")
        return None
    
def scrape_news():
    """Fetch every article listed in ``all_links_file``.

    Reads the links file (one URL per line), downloads each article with
    ``get_article_details`` and pauses for a random 2-5 seconds after every
    30th link.

    Returns:
        A list of the article dicts that were fetched successfully; links
        whose request failed are skipped.
    """
    pause_every = 30
    all_articles = []

    with open(all_links_file, 'r', encoding='utf-8') as file:
        # Drop blank lines (empty file, trailing newline) so that empty
        # strings are never requested as URLs.
        article_links = [line.strip() for line in file if line.strip()]

    for i, link in enumerate(article_links, start=1):
        article_data = get_article_details(link)
        if article_data:
            print(f"{i}. Berhasil menyimpan article {article_data['title']}")
            all_articles.append(article_data)
        if i % pause_every == 0:
            time.sleep(random.uniform(2, 5))
    return all_articles

def save_to_array(news_data):
    """Re-key scraped article dicts with the Indonesian field names.

    Args:
        news_data: Iterable of dicts holding the keys ``title``, ``date``,
            ``url``, ``category``, ``summary``, ``author`` and ``content``.

    Returns:
        A new list with one dict per input item, keyed ``Judul``, ``Tanggal``,
        ``Url``, ``Kategori``, ``Ringkasan``, ``Pengarang`` and ``Isi Berita``
        (in that order). Any other keys on the input are ignored.
    """
    # (output key, input key) pairs, listed in output order.
    field_map = (
        ("Judul", 'title'),
        ("Tanggal", 'date'),
        ("Url", 'url'),
        ("Kategori", 'category'),
        ("Ringkasan", 'summary'),
        ("Pengarang", 'author'),
        ("Isi Berita", 'content'),
    )
    return [
        {target: item[source] for target, source in field_map}
        for item in news_data
    ]

# Download and parse every article listed in the links file, then re-key the
# records with the Indonesian field names that the export cells read.
news_data = scrape_news()
array_data = save_to_array(news_data)
print("Scrapping selesai!")

1. Berhasil menyimpan article 
		    4 Bintang Voli Indonesia yang Berkarier di Luar Negeri, Ada Megawati Hangestri hingga Terbaru Rendy Tamamilang
		  
2. Berhasil menyimpan article 
		    Bahlil Minta Warga Lokal Dilibatkan dalam Proyek Swasembada Gula di Merauke yang Buka 2 Juta Hektare Lahan: Tapi Pengusaha Juga Harus Siap
		  
3. Berhasil menyimpan article 
		    SEA V League: Tekad Timnas Voli Indonesia Taklukan Thailand di Laga Perdana
		  
4. Berhasil menyimpan article 
		    Padahal Sudah Pulang ke Indonesia, Megawati Hangestri Trending Lagi di Korea, Media Lokal Negeri Ginseng Bilang Megatron...
		  
5. Berhasil menyimpan article 
		    Megawati Hangestri Cetak Sejarah Baru Usai Jegal GS Caltex Tanpa Balas, Bahkan Para Pemain Red Sparks Sampai Ikut Lakukan Hal Ini...
		  
6. Berhasil menyimpan article 
		    Di Belakang Megawati Hangestri, Vanja Bukilic Ungkap Sosok Mega Dimatanya, Sampai Bilang Megatron itu...
		  
7. Berhasil menyimpan article 
		    Sudah Cicil Rp11 Triliu

## Save Data to CSV

In [None]:
import csv
import re

def clean_content(content):
    """Strip site boilerplate from article text and flatten it to one line.

    Removes (case-insensitively) the markers ``ADVERTISEMENT`` and
    ``SCROLL TO CONTINUE WITH CONTENT`` and the ``tvOnenews.com -`` byline
    prefix (optionally preceded by ``Jakarta,``), then trims every line,
    drops empty lines and joins the rest with single spaces.

    Args:
        content: Raw article text.

    Returns:
        The cleaned text as a single line with no leading or trailing whitespace.
    """
    # The dot is escaped so that only the literal "tvOnenews.com" is removed;
    # an unescaped "." would match any character in that position.
    boilerplate = r'ADVERTISEMENT|SCROLL TO CONTINUE WITH CONTENT|(Jakarta,)?\s*tvOnenews\.com -'
    content = re.sub(boilerplate, '', content, flags=re.IGNORECASE)
    return ' '.join(line.strip() for line in content.splitlines() if line.strip())

# Export the scraped articles to CSV, one row per article.
# NOTE(review): this path is relative to 'data/' while all_links_file points
# at '../data/' — confirm which directory is intended.
with open('data/result_data.csv', mode='w', newline='', encoding='utf-8') as file:
    writer = csv.writer(file)
    writer.writerow(["No", "Judul", "Tanggal", "Ringkasan", "Pengarang", "Isi Berita"])

    for i, article in enumerate(array_data, start=1):
        # Collapse whitespace runs in each short field; fall back to a
        # placeholder when the field is empty.
        judul = re.sub(r'\s+', ' ', article['Judul']).strip() if article['Judul'] else "Judul tidak ditemukan"
        tanggal = re.sub(r'\s+', ' ', article['Tanggal']).strip() if article['Tanggal'] else "Tanggal tidak ditemukan"
        # The summary gets its own variable and is guarded on its own key, so
        # it no longer overwrites the date.
        ringkasan = re.sub(r'\s+', ' ', article['Ringkasan']).strip() if article['Ringkasan'] else "Ringkasan tidak ditemukan"
        pengarang = re.sub(r'\s+', ' ', article['Pengarang']).strip() if article['Pengarang'] else "Pengarang tidak ditemukan"
        isi_berita = clean_content(article['Isi Berita'])
        # Six values per row, in the same order as the header.
        writer.writerow([i, judul, tanggal, ringkasan, pengarang, isi_berita])

print("CSV berhasil dibuat!")


CSV berhasil dibuat!


## Save Data to MongoDB

In [4]:
import re
from pymongo import MongoClient

# Connect to a MongoDB server on localhost, port 27017.
client = MongoClient('mongodb://localhost:27017/')
# NOTE(review): 'local' is normally MongoDB's built-in instance-local
# database — confirm that storing the scraped data there is intended.
db = client['local']
news_data_collection = db['news_data']
# First 8 characters of a random UUID4, generated once when this cell runs;
# the same value is appended to every slug built below.
uuid_for_slug = str(uuid.uuid4())[:8]


def clean_content(content):
    """Strip site boilerplate from article text and flatten it to one line.

    Removes (case-insensitively) the markers ``ADVERTISEMENT`` and
    ``SCROLL TO CONTINUE WITH CONTENT`` and the ``tvOnenews.com -`` byline
    prefix (optionally preceded by ``Jakarta,``), then trims every line,
    drops empty lines and joins the rest with single spaces.

    Args:
        content: Raw article text.

    Returns:
        The cleaned text as a single line with no leading or trailing whitespace.
    """
    # The dot is escaped so that only the literal "tvOnenews.com" is removed;
    # an unescaped "." would match any character in that position.
    boilerplate = r'ADVERTISEMENT|SCROLL TO CONTINUE WITH CONTENT|(Jakarta,)?\s*tvOnenews\.com -'
    content = re.sub(boilerplate, '', content, flags=re.IGNORECASE)
    return ' '.join(line.strip() for line in content.splitlines() if line.strip())

def _normalize_field(value, fallback):
    """Collapse whitespace runs in ``value`` to single spaces and trim it; return ``fallback`` if ``value`` is falsy."""
    return re.sub(r'\s+', ' ', value).strip() if value else fallback


# Build one MongoDB document per scraped article.
documents = []
for article in array_data:
    judul = _normalize_field(article['Judul'], "Judul tidak ditemukan")

    documents.append({
        "Judul": judul,
        "Tanggal": _normalize_field(article['Tanggal'], "Tanggal tidak ditemukan"),
        "Kategori": _normalize_field(article['Kategori'], "Kategori tidak ditemukan"),
        "Url": _normalize_field(article['Url'], "Url tidak ditemukan"),
        # NOTE(review): uuid_for_slug is generated once per run, so two
        # articles with the same title receive the same slug — confirm
        # whether slugs are required to be unique.
        "Slug": slugify(judul) + '-' + uuid_for_slug,
        "Ringkasan": _normalize_field(article['Ringkasan'], "Ringkasan tidak ditemukan"),
        "Pengarang": _normalize_field(article['Pengarang'], "Pengarang tidak ditemukan"),
        "Isi Berita": clean_content(article['Isi Berita']),
    })

# insert_many is only called with a non-empty list, and success is only
# reported when something was actually written.
if documents:
    news_data_collection.insert_many(documents)
    print("Data berhasil disimpan ke MongoDB!")
else:
    print("Tidak ada data untuk disimpan ke MongoDB.")


Data berhasil disimpan ke MongoDB!
