## Scraping CNN

In [4]:
import requests
from bs4 import BeautifulSoup
import re
import threading
from datetime import datetime
import pandas as pd
import time
import concurrent.futures

In [11]:
jumlah_index = 20
threads_link = []
links = []
results = []
lock = threading.Lock()



In [12]:
def scrape_links(page_number):
    try:
        headers = {
            'User-Agent': 'Chrome/118.0.0.0'
        }
        url = f"https://www.cnnindonesia.com/indeks/2/{page_number}"
        response = requests.get(url, headers=headers)
        soup = BeautifulSoup(response.text, "html.parser")
        div_badan = soup.find('div', {"class": "flex flex-col gap-5"})
        if div_badan:
            articles = div_badan.findAll('article', {"class": "flex-grow"})
            page_links = []
            for article in articles:
                link = article.find('a')['href']
                page_links.append(link)

            with lock:  # Lock to protect the shared 'links' list
                links.extend(page_links)

        else:
            print(f"Div 'flex flex-col gap-5' not found on page {page_number}")

        print(f"Scraped {len(page_links)} links from page {page_number}")
    except Exception as e:
        print(f"Error while scraping page {page_number}: {str(e)}")

In [13]:
if __name__ == "__main__":
    # Create a ThreadPoolExecutor with a number of threads matching CPU cores
    with concurrent.futures.ThreadPoolExecutor() as executor:
        # Submit scraping tasks
        for page_number in range(1, jumlah_index + 1):
            executor.submit(scrape_links, page_number)

    # Print the total number of links
    print("Total Links:", len(links))

Scraped 10 links from page 2
Scraped 10 links from page 5
Scraped 10 links from page 4
Scraped 10 links from page 1
Scraped 10 links from page 3
Scraped 10 links from page 7
Scraped 10 links from page 10
Scraped 10 links from page 11
Scraped 10 links from page 8
Scraped 10 links from page 12
Scraped 10 links from page 9
Scraped 10 links from page 6
Scraped 10 links from page 13
Scraped 10 links from page 14
Scraped 10 links from page 15
Scraped 10 links from page 16
Scraped 10 links from page 17
Scraped 10 links from page 18
Scraped 10 links from page 20
Scraped 10 links from page 19
Total Links: 200


In [14]:
def parse_indonesian_date(indonesian_date):
    # Split the date string into components
    components = indonesian_date.split(', ')[1].split(' ')
    day = int(components[0])
    month_mapping = {
        'Jan': 1, 'Feb': 2, 'Mar': 3, 'Apr': 4, 'Mei': 5, 'Jun': 6,
        'Jul': 7, 'Agu': 8, 'Sep': 9, 'Okt': 10, 'Nov': 11, 'Des': 12
    }
    month = month_mapping[components[1]]
    year = int(components[2])
    time_components = components[3].split(':')
    hour = int(time_components[0])
    minute = int(time_components[1])
    
    # Return a datetime object
    return datetime(year, month, day, hour, minute)

In [15]:
def scrape_url(url,max_retries=2):
    retries = 0
    while retries < max_retries:
            try:
                headers = {
                    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/117.0.0.0 Safari/537.36'
                }
                response = requests.get(url, headers=headers)
                if response.status_code == 200:
                    soup = BeautifulSoup(response.text, 'html.parser')
                    
                    
                    # Judul Berita
                    title_elem = soup.find('h1', {"class": "mb-2 text-[28px] leading-9 text-cnn_black"})
                    if title_elem:
                        title_text = title_elem.text.strip()
                    else:
                        title_text = "Title not found"
                    # Author berita
                    author_elem = soup.find('span', {"class": "text-cnn_red"})
                    if author_elem:
                        author_text = author_elem.get_text()
                        author_text = author_text.split('-')[0].strip()
                    else:
                        author_text = "Author not found"     
                    # tanggal berita
                    date_elem = soup.find('div', {"class": "text-cnn_grey text-sm mb-4"})
                    if date_elem:
                        date_text = date_elem.text.strip()
                        date_object = parse_indonesian_date(date_text)
                        formatted_date = date_object.strftime('%Y-%m-%d')
                    else:
                        date_text = "Date not found"
                    #     # Category berita
                    category_elements = soup.find('a', {"dtr-sec": "subkanal"})
                    if category_elements:
                        category_text= category_elements.text.strip()
                    else:
                        category_text = "Category not found"
                    #     # Content Berita
                    body_elem = soup.find('div', {"class": "detail-text text-cnn_black text-sm grow min-w-0"})
                    
                    if body_elem:
                        content_elem = body_elem.find_all('p')
                        content_text = ""
                        for p in content_elem:
                            content_text += p.text.strip() + "\n"
                        
                        if content_text.strip():
                            content_text=content_text
                        else:
                            content_text ="Content not found"
                    else:
                        content_text ="Content not found"

                    nama_berita_match = re.search(r'https://www\.(\w+)\.com/', url)
                    if nama_berita_match:
                        nama_berita = nama_berita_match.group(1)
                    else:
                        nama_berita = "Nama_berita not found"
                    results.append({
                        'title': title_text,
                        'tanggal': formatted_date,
                        'penulis':author_text,
                        'category':category_text,
                        'link' : url})
                elif response.status_code == 429:
                    print(f"Received a 429 error for {url}. Retrying in 5 seconds...")
                    time.sleep(5)
                else:
                    print(f"Failed to retrieve data from {url}: Status Code {response.status_code}")
            except requests.exceptions.RequestException as e:
                print(f"Error fetching URL '{url}': {e}")
            except Exception as e:
                print(f"Error processing URL '{url}': {e}")
            retries += 1
            if retries < max_retries:
                print(f"Retrying {url} (Attempt {retries}/{max_retries})")
                time.sleep(5)  # You can adjust the delay as needed
    return None         

In [6]:
# def scrape_url(url):
#     try:
#         response = requests.get(url)
#         if response.status_code == 200:
#             soup = BeautifulSoup(response.text, 'html.parser')
            
            
#             # Judul Berita
#             title_elem = soup.find('h1', {"class": "mb-2 text-[28px] leading-9 text-cnn_black"})
#             if title_elem:
#                 title_text = title_elem.text.strip()
#             else:
#                 title_text = "Title not found"
#             # Author berita
#             author_elem = soup.find('span', {"class": "text-cnn_red"})
#             if author_elem:
#                 author_text = author_elem.get_text()
#                 author_text = author_text.split('-')[0].strip()
#             else:
#                 author_text = "Author not found"     
#             # tanggal berita
#             date_elem = soup.find('div', {"class": "text-cnn_grey text-sm mb-4"})
#             if date_elem:
#                 date_text = date_elem.text.strip()
#             else:
#                 date_text = "Date not found"
#             #     # Category berita
#             category_elements = soup.find('a', {"dtr-sec": "subkanal"})
#             if category_elements:
#                 category_text= category_elements.text.strip()
#             else:
#                 category_text = "Category not found"
#             #     # Content Berita
#             body_elem = soup.find('div', {"class": "detail-text text-cnn_black text-sm grow min-w-0"})
            
#             if body_elem:
#                 content_elem = body_elem.find_all('p')
#                 content_text = ""
#                 for p in content_elem:
#                     content_text += p.text.strip() + "\n"
                
#                 if content_text.strip():
#                     content_text=content_text
#                 else:
#                     content_text ="Content not found"
#             else:
#               content_text ="Content not found"

#             nama_berita_match = re.search(r'https://www\.(\w+)\.com/', url)
#             if nama_berita_match:
#                 nama_berita = nama_berita_match.group(1)
#             else:
#                 nama_berita = "Nama_berita not found"
#             results.append({
#                     'title': title_text,
#                     'category':category_text,
#                             'link' : url}
#                            )
#         else:
#             print(f"Failed to retrieve data from {url}")

#     except requests.exceptions.RequestException as e:
#         print(f"Error fetching URL '{url}': {e}")
#     except Exception as e:
#         print(f"Error processing URL '{url}': {e}")

In [16]:
with concurrent.futures.ThreadPoolExecutor() as executor:
    # Submit the URL scraping tasks to the executor
    executor.map(scrape_url, links)

# Now, the results list contains the scraped data from all URLs
for result in results:
    print(result)

Retrying https://www.cnnindonesia.com/nasional/20231211183818-20-1035851/daftar-rekayasa-lalu-lintas-di-sekitar-kpu-saat-debat-capres-besok (Attempt 1/2)
Retrying https://www.cnnindonesia.com/internasional/20231211172411-124-1035811/video-serangan-bom-asap-bikin-panik-pengungsi-di-jabalia (Attempt 1/2)
Retrying https://www.cnnindonesia.com/ekonomi/20231211181542-92-1035838/ganjar-nilai-ekonomi-ri-bisa-tumbuh-7-persen-asal-hukum-tegak (Attempt 1/2)
Retrying https://www.cnnindonesia.com/gaya-hidup/20231211164555-269-1035797/mau-kunjungi-taiwan-cek-dulu-syarat-wisatawan-pakai-e-visa (Attempt 1/2)
Retrying https://www.cnnindonesia.com/nasional/20231211175328-36-1035823/video-jokowi-respons-kritik-bem-ugm-kita-punya-etika-ketimuran (Attempt 1/2)
Retrying https://www.cnnindonesia.com/hiburan/20231211142507-220-1035693/sinopsis-superman-returns-bioskop-trans-tv-11-desember-2023 (Attempt 1/2)
Retrying https://www.cnnindonesia.com/nasional/20231211182910-20-1035844/tilang-manual-tidak-berlaku-s

In [17]:
df = pd.DataFrame(results)
df = df.drop_duplicates()
total_data = len(df)
print('Total data:', total_data)
df.head(20)


Total data: 161


Unnamed: 0,title,tanggal,penulis,category,link
0,Daftar Rekayasa Lalu Lintas di Sekitar KPU Saa...,2023-12-11,CNN Indonesia,Peristiwa,https://www.cnnindonesia.com/nasional/20231211...
1,VIDEO: Serangan Bom Asap Bikin Panik Pengungsi...,2023-12-11,CNN Indonesia,Timur Tengah,https://www.cnnindonesia.com/internasional/202...
2,Ganjar Nilai Ekonomi RI Bisa Tumbuh 7 Persen A...,2023-12-11,CNN Indonesia,Bisnis,https://www.cnnindonesia.com/ekonomi/202312111...
3,Mau Kunjungi Taiwan? Cek Dulu Syarat Wisatawan...,2023-12-11,CNN Indonesia,Travel,https://www.cnnindonesia.com/gaya-hidup/202312...
4,VIDEO: Jokowi Respons Kritik BEM UGM: Kita Pun...,2023-12-11,CNN Indonesia,Politik,https://www.cnnindonesia.com/nasional/20231211...
5,"Sinopsis Superman Returns, Bioskop Trans TV 11...",2023-12-11,CNN Indonesia,Film,https://www.cnnindonesia.com/hiburan/202312111...
6,Tilang Manual Tidak Berlaku Saat Libur Nataru,2023-12-11,CNN Indonesia,Peristiwa,https://www.cnnindonesia.com/nasional/20231211...
7,Ivar Jenner dan Marselino Pulih Cedera Jelang ...,2023-12-11,CNN Indonesia,Sepakbola,https://www.cnnindonesia.com/olahraga/20231211...
8,"Gempa Bumi M 5,8 Guncang Riau, Getaran Terasa ...",2023-12-11,CNN Indonesia,Peristiwa,https://www.cnnindonesia.com/nasional/20231211...
9,Asal Usul Pengungsi Rohingya yang Mengundang P...,2023-12-11,CNN Indonesia,Timur Tengah,https://www.cnnindonesia.com/internasional/202...


Cetak dan simpan

In [18]:
current_datetime = datetime.now().strftime("%Y-%m-%d_%H-%M-%S")
excel_file_name = f'../tempat_simpan_nasional/cnn_data_{current_datetime}.xlsx'
df.to_excel(excel_file_name, index=False)

print(f'Data has been saved to {excel_file_name}')

Data has been saved to ../tempat_simpan_nasional/cnn_data_2023-12-11_20-06-46.xlsx
