# Scrapping Tribunnews 

In [1]:
import requests
from bs4 import BeautifulSoup
import re
import threading
from datetime import datetime
import pandas as pd
import time
from datetime import datetime, timedelta
from concurrent.futures import ThreadPoolExecutor
import concurrent.futures

In [2]:
data = {
    "keywords": "prabowo",
    "since_time": "2023-01-01",
    "until_time": "2023-11-10"
}

In [3]:

def scrape_link_per_day_worker(date, page_number):
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/117.0.0.0 Safari/537.36'
    }
    url = f"https://www.tribunnews.com/index-news?date={date}&page={page_number}"
    response = requests.get(url, headers=headers)
    soup = BeautifulSoup(response.text, 'html.parser')
    body = soup.find('ul', {"class": "lsi"})
    articles = body.find_all('li', {"class": "ptb15"})

    page_links = []

    for article in articles:
        header = article.find('h3', {"class": "f16 fbo"})
        if header:
            link = header.find('a')
            if link and 'href' in link.attrs:
                link_href = link['href']
                page_links.append(link_href)
                
    print(f"Scraped {len(page_links)} links from page {page_number}")

    return page_links

In [4]:
def scrape_links(date, page_number):
    current_date = datetime.strptime(date, "%Y-%m-%d").strftime("%Y-%-m-%-d")
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/117.0.0.0 Safari/537.36'
    }

    url = f"https://www.tribunnews.com/index-news?date={current_date}&page={page_number}"
    response = requests.get(url, headers=headers)
    soup = BeautifulSoup(response.text, 'html.parser')
    body = soup.find('ul', {"class": "lsi"})
    articles = body.find_all('li', {"class": "ptb15"})
    
    links = []
    for article in articles:
        header = article.find('h3', {"class": "f16 fbo"})
        if header:
            link = header.find('a')
            if link and 'href' in link.attrs:
                link_href = link['href']
                links.append(link_href)
                
    print(f"Scraped {len(links)} links from page {page_number}")

    return links


In [5]:
def scrape_link_per_day(date, max_threads=5):
    page_number = 1
    page_links = []

    with concurrent.futures.ThreadPoolExecutor(max_threads) as executor:
        futures = []

        while True:
            future = executor.submit(scrape_links, date, page_number)
            futures.append(future)
            page_number += 1

            # Break the loop if no more articles are found
            if not future.result():
                break

        for future in concurrent.futures.as_completed(futures):
            page_links.extend(future.result())

    return page_links

In [6]:
link=scrape_link_per_day(data["since_time"])

Scraped 20 links from page 1
Scraped 20 links from page 2
Scraped 20 links from page 3
Scraped 20 links from page 4
Scraped 20 links from page 5
Scraped 20 links from page 6
Scraped 20 links from page 7
Scraped 20 links from page 8
Scraped 20 links from page 9
Scraped 20 links from page 10
Scraped 20 links from page 11
Scraped 20 links from page 12
Scraped 20 links from page 13
Scraped 20 links from page 14
Scraped 20 links from page 15
Scraped 20 links from page 16
Scraped 13 links from page 17
Scraped 0 links from page 18


In [7]:
print(link)

['https://www.tribunnews.com/mata-lokal-memilih/2023/01/01/ganjar-erick-dinilai-mirip-seperti-jokowi-di-2014-yang-muncul-dengan-segudang-prestasi', 'https://www.tribunnews.com/seleb/2023/01/01/agensi-mengonfirmasi-shindong-super-junior-pacaran-dengan-non-selebriti', 'https://www.tribunnews.com/pendidikan/2023/01/01/kunci-jawaban-bahasa-indonesia-kelas-10-halaman-237-penggunaan-kata-ganti-pada-teks-biografi', 'https://www.tribunnews.com/sport/2023/01/01/jadwal-motogp-2023-mulai-bulan-maret-di-portugal-motogp-mandalika-bulan-oktober', 'https://www.tribunnews.com/superskor/2023/01/01/rekor-pertemuan-timnas-indonesia-vs-filipina-garuda-pernah-bantai-12-0-the-azkals', 'https://www.tribunnews.com/metropolitan/2023/01/01/pelaku-penculik-bocah-di-gunung-sahari-merupakan-residivis-pencabulan-dan-pernah-dipenjara-7-tahun', 'https://www.tribunnews.com/regional/2023/01/01/pamit-beli-petasan-2-bocah-laki-laki-ditemukan-tewas-di-kubangan-galian-proyek-tol-cijago-depok', 'https://www.tribunnews.com/s

In [8]:
results=[]

In [9]:
def scrape_url(url,max_retries=2):
    retries = 0
    while retries < max_retries:
            try:
                headers = {
                    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/117.0.0.0 Safari/537.36'
                }
                response = requests.get(url, headers=headers)
                if response.status_code == 200:
                    soup = BeautifulSoup(response.text, 'html.parser')
                    
                    
                    # Judul Berita
                    title_elem = soup.find('h1', {"id": "arttitle"})
                    if title_elem:
                        title_text = title_elem.text.strip()
                    else:
                        title_text = "Title not found"   
                    # tanggal berita
                    date_elem = soup.find('time', {"class": "grey"})
                    if date_elem:
                        date_text = date_elem.text.strip()
                    else:
                        date_text = "Date not found"
                    #     # Content Berita
                    body_elem = soup.find('div', {"class": "side-article txt-article multi-fontsize"})
                    
                    if body_elem:
                        content_elem = body_elem.find_all('p')
                        content_text = ""
                        for p in content_elem:
                            content_text += p.text.strip() + "\n"
                        
                        if content_text.strip():
                            content_text=content_text
                        else:
                            content_text ="Content not found"
                    else:
                        content_text ="Content not found"

                    return{
                        'title': title_text,
                        'date': date_text,
                        'content':content_text,
                        'link' : url}
                elif response.status_code == 429:
                    print(f"Received a 429 error for {url}. Retrying in 5 seconds...")
                    time.sleep(5)
                else:
                    print(f"Failed to retrieve data from {url}: Status Code {response.status_code}")
            except requests.exceptions.RequestException as e:
                print(f"Error fetching URL '{url}': {e}")
            except Exception as e:
                print(f"Error processing URL '{url}': {e}")
            retries += 1
            if retries < max_retries:
                print(f"Retrying {url} (Attempt {retries}/{max_retries})")
                time.sleep(5)  # You can adjust the delay as needed
    return None         

In [10]:
def is_keyword_in_text(keyword, text):
    # Case-sensitive search for whole word match
    return re.search(r'\b{}\b'.format(re.escape(keyword)), text)

for url in link:
    data_tribunnews = scrape_url(url)
    print(data["keywords"])
    
    # Assuming data_tribunnews is a dictionary with 'title', 'content', and 'link' keys
    title_contains_keyword = any(is_keyword_in_text(keyword, data_tribunnews['title']) for keyword in data["keywords"])
    content_contains_keyword = any(is_keyword_in_text(keyword, data_tribunnews['content']) for keyword in data["keywords"])

    if title_contains_keyword or content_contains_keyword:
        print("Data contains keywords:", data_tribunnews['link'])
    else:
        print("News does not contain the specified keywords and will not be inserted into the database. URL:", data_tribunnews['link'])

prabowo
News does not contain the specified keywords and will not be inserted into the database. URL: https://www.tribunnews.com/mata-lokal-memilih/2023/01/01/ganjar-erick-dinilai-mirip-seperti-jokowi-di-2014-yang-muncul-dengan-segudang-prestasi
prabowo
News does not contain the specified keywords and will not be inserted into the database. URL: https://www.tribunnews.com/seleb/2023/01/01/agensi-mengonfirmasi-shindong-super-junior-pacaran-dengan-non-selebriti
prabowo
News does not contain the specified keywords and will not be inserted into the database. URL: https://www.tribunnews.com/pendidikan/2023/01/01/kunci-jawaban-bahasa-indonesia-kelas-10-halaman-237-penggunaan-kata-ganti-pada-teks-biografi
prabowo
News does not contain the specified keywords and will not be inserted into the database. URL: https://www.tribunnews.com/sport/2023/01/01/jadwal-motogp-2023-mulai-bulan-maret-di-portugal-motogp-mandalika-bulan-oktober
prabowo
News does not contain the specified keywords and will not 

KeyboardInterrupt: 