## Scraping CNN

In [1]:
import requests
from bs4 import BeautifulSoup
import re
import threading
from datetime import datetime
import pandas as pd
import time
import concurrent.futures

In [2]:
jumlah_index = 200
threads_link = []
links = []
results = []
lock = threading.Lock()


In [3]:
def scrape_links(page_number):
    try:
        headers = {
            'User-Agent': 'Chrome/118.0.0.0'
        }
        url = f"https://www.cnnindonesia.com/olahraga/indeks/7/{page_number}"
        response = requests.get(url, headers=headers)
        soup = BeautifulSoup(response.text, "html.parser")
        div_badan = soup.find('div', {"class": "flex flex-col gap-5"})
        if div_badan:
            articles = div_badan.findAll('article', {"class": "flex-grow"})
            page_links = []
            for article in articles:
                link = article.find('a')['href']
                page_links.append(link)

            with lock:  # Lock to protect the shared 'links' list
                links.extend(page_links)

        else:
            print(f"Div 'flex flex-col gap-5' not found on page {page_number}")

        print(f"Scraped {len(page_links)} links from page {page_number}")
    except Exception as e:
        print(f"Error while scraping page {page_number}: {str(e)}")

In [4]:
if __name__ == "__main__":
    # Create a ThreadPoolExecutor with a number of threads matching CPU cores
    with concurrent.futures.ThreadPoolExecutor() as executor:
        # Submit scraping tasks
        for page_number in range(1, jumlah_index + 1):
            executor.submit(scrape_links, page_number)

    # Print the total number of links
    print("Total Links:", len(links))

Scraped 10 links from page 3
Scraped 10 links from page 9
Scraped 10 links from page 12
Scraped 10 links from page 4
Scraped 10 links from page 1
Scraped 10 links from page 13
Scraped 10 links from page 7
Scraped 10 links from page 6
Scraped 10 links from page 2
Scraped 10 links from page 8
Scraped 10 links from page 11
Scraped 10 links from page 10
Scraped 10 links from page 5
Scraped 10 links from page 17
Scraped 10 links from page 25
Scraped 10 links from page 18
Scraped 10 links from page 14
Scraped 10 links from page 15
Scraped 10 links from page 21
Scraped 10 links from page 22
Scraped 10 links from page 23
Scraped 10 links from page 20
Scraped 10 links from page 16
Scraped 10 links from page 35
Scraped 10 links from page 19
Scraped 10 links from page 24
Scraped 10 links from page 37
Scraped 10 links from page 28
Scraped 10 links from page 26
Scraped 10 links from page 27
Scraped 10 links from page 40
Scraped 10 links from page 41
Scraped 10 links from page 29
Scraped 10 links fr

In [5]:
def scrape_url(url,max_retries=2):
    retries = 0
    while retries < max_retries:
            try:
                headers = {
                    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/117.0.0.0 Safari/537.36'
                }
                response = requests.get(url, headers=headers)
                if response.status_code == 200:
                    soup = BeautifulSoup(response.text, 'html.parser')
                    
                    
                    # Judul Berita
                    title_elem = soup.find('h1', {"class": "mb-2 text-[28px] leading-9 text-cnn_black"})
                    if title_elem:
                        title_text = title_elem.text.strip()
                    else:
                        title_text = "Title not found"
                    # Author berita
                    author_elem = soup.find('span', {"class": "text-cnn_red"})
                    if author_elem:
                        author_text = author_elem.get_text()
                        author_text = author_text.split('-')[0].strip()
                    else:
                        author_text = "Author not found"     
                    # tanggal berita
                    date_elem = soup.find('div', {"class": "text-cnn_grey text-sm mb-4"})
                    if date_elem:
                        date_text = date_elem.text.strip()
                    else:
                        date_text = "Date not found"
                    #     # Category berita
                    category_elements = soup.find('a', {"dtr-sec": "subkanal"})
                    if category_elements:
                        category_text= category_elements.text.strip()
                    else:
                        category_text = "Category not found"
                    #     # Content Berita
                    body_elem = soup.find('div', {"class": "detail-text text-cnn_black text-sm grow min-w-0"})
                    
                    if body_elem:
                        content_elem = body_elem.find_all('p')
                        content_text = ""
                        for p in content_elem:
                            content_text += p.text.strip() + "\n"
                        
                        if content_text.strip():
                            content_text=content_text
                        else:
                            content_text ="Content not found"
                    else:
                        content_text ="Content not found"

                    nama_berita_match = re.search(r'https://www\.(\w+)\.com/', url)
                    if nama_berita_match:
                        nama_berita = nama_berita_match.group(1)
                    else:
                        nama_berita = "Nama_berita not found"
                    results.append({
                        'title': title_text,
                                    'category':category_text,
                                    'link' : url})
                elif response.status_code == 429:
                    print(f"Received a 429 error for {url}. Retrying in 5 seconds...")
                    time.sleep(5)
                else:
                    print(f"Failed to retrieve data from {url}: Status Code {response.status_code}")
            except requests.exceptions.RequestException as e:
                print(f"Error fetching URL '{url}': {e}")
            except Exception as e:
                print(f"Error processing URL '{url}': {e}")
            retries += 1
            if retries < max_retries:
                print(f"Retrying {url} (Attempt {retries}/{max_retries})")
                time.sleep(5)  # You can adjust the delay as needed
    return None         

In [6]:
# def scrape_url(url):
#     try:
#         response = requests.get(url)
#         if response.status_code == 200:
#             soup = BeautifulSoup(response.text, 'html.parser')
            
            
#             # Judul Berita
#             title_elem = soup.find('h1', {"class": "mb-2 text-[28px] leading-9 text-cnn_black"})
#             if title_elem:
#                 title_text = title_elem.text.strip()
#             else:
#                 title_text = "Title not found"
#             # Author berita
#             author_elem = soup.find('span', {"class": "text-cnn_red"})
#             if author_elem:
#                 author_text = author_elem.get_text()
#                 author_text = author_text.split('-')[0].strip()
#             else:
#                 author_text = "Author not found"     
#             # tanggal berita
#             date_elem = soup.find('div', {"class": "text-cnn_grey text-sm mb-4"})
#             if date_elem:
#                 date_text = date_elem.text.strip()
#             else:
#                 date_text = "Date not found"
#             #     # Category berita
#             category_elements = soup.find('a', {"dtr-sec": "subkanal"})
#             if category_elements:
#                 category_text= category_elements.text.strip()
#             else:
#                 category_text = "Category not found"
#             #     # Content Berita
#             body_elem = soup.find('div', {"class": "detail-text text-cnn_black text-sm grow min-w-0"})
            
#             if body_elem:
#                 content_elem = body_elem.find_all('p')
#                 content_text = ""
#                 for p in content_elem:
#                     content_text += p.text.strip() + "\n"
                
#                 if content_text.strip():
#                     content_text=content_text
#                 else:
#                     content_text ="Content not found"
#             else:
#               content_text ="Content not found"

#             nama_berita_match = re.search(r'https://www\.(\w+)\.com/', url)
#             if nama_berita_match:
#                 nama_berita = nama_berita_match.group(1)
#             else:
#                 nama_berita = "Nama_berita not found"
#             results.append({
#                     'title': title_text,
#                     'category':category_text,
#                             'link' : url}
#                            )
#         else:
#             print(f"Failed to retrieve data from {url}")

#     except requests.exceptions.RequestException as e:
#         print(f"Error fetching URL '{url}': {e}")
#     except Exception as e:
#         print(f"Error processing URL '{url}': {e}")

In [7]:
with concurrent.futures.ThreadPoolExecutor() as executor:
    # Submit the URL scraping tasks to the executor
    executor.map(scrape_url, links)

# Now, the results list contains the scraped data from all URLs
for result in results:
    print(result)

Retrying https://www.cnnindonesia.com/olahraga/20231024042535-142-1015001/klasemen-liga-inggris-tottenham-gusur-man-city-dari-puncak (Attempt 1/2)
Retrying https://www.cnnindonesia.com/olahraga/20231023200259-178-1014962/khamzat-disebut-tak-pantas-dapat-duel-gelar-juara-ufc (Attempt 1/2)
Retrying https://www.cnnindonesia.com/olahraga/20231024035337-142-1015000/hasil-liga-inggris-son-cetak-gol-tottenham-hajar-fulham-2-0 (Attempt 1/2)
Retrying https://www.cnnindonesia.com/olahraga/20231024011106-156-1014990/binder-merasa-kasihan-martin-gagal-menang-di-motogp-australia (Attempt 1/2)
Retrying https://www.cnnindonesia.com/olahraga/20231024043610-144-1015003/foto-tottenham-jaga-rekor-belum-terkalahkan-usai-hajar-fulham (Attempt 1/2)
Retrying https://www.cnnindonesia.com/olahraga/20231022124429-145-1014441/infografis-jadwal-timnas-indonesia-di-kualifikasi-piala-dunia-2026 (Attempt 1/2)
Retrying https://www.cnnindonesia.com/olahraga/20231024022839-142-1014995/messi-banggakan-capaian-bersama-in

In [11]:
df = pd.DataFrame(results)
df = df.drop_duplicates()
total_data = len(df)
print('Total data:', total_data)
df.head(20)


Total data: 2000


Unnamed: 0,title,category,link
0,Klasemen Liga Inggris: Tottenham Gusur Man Cit...,Sepakbola,https://www.cnnindonesia.com/olahraga/20231024...
1,Khamzat Disebut Tak Pantas Dapat Duel Gelar Ju...,Olahraga Lainnya,https://www.cnnindonesia.com/olahraga/20231023...
2,"Hasil Liga Inggris: Son Cetak Gol, Tottenham H...",Sepakbola,https://www.cnnindonesia.com/olahraga/20231024...
3,Binder Merasa Kasihan Martin Gagal Menang di M...,Moto GP,https://www.cnnindonesia.com/olahraga/20231024...
4,FOTO: Tottenham Jaga Rekor Belum Terkalahkan U...,Sepakbola,https://www.cnnindonesia.com/olahraga/20231024...
5,INFOGRAFIS: Jadwal Timnas Indonesia di Kualifi...,Infografis,https://www.cnnindonesia.com/olahraga/20231022...
6,Messi Banggakan Capaian Bersama Inter Miami,Sepakbola,https://www.cnnindonesia.com/olahraga/20231024...
7,Erick Thohir Soal Persiapan Piala Dunia U-17: ...,Sepakbola,https://www.cnnindonesia.com/olahraga/20231024...
8,Bagnaia Mulai Pongah Jelang MotoGP Thailand da...,Moto GP,https://www.cnnindonesia.com/olahraga/20231023...
9,Sidang Komdis PSSI: Hugo Samir Pukul Pemain Pe...,Sepakbola,https://www.cnnindonesia.com/olahraga/20231023...


Cetak dan simpan

In [12]:
# current_datetime = datetime.now().strftime("%Y-%m-%d_%H-%M-%S")
# excel_file_name = f'../tempat_simpan_nasional/cnn_sport_{current_datetime}.xlsx'
# df.to_excel(excel_file_name, index=False)

# print(f'Data has been saved to {excel_file_name}')

Data has been saved to ../tempat_simpan_nasional/cnn_sport_2023-10-24_14-34-10.xlsx
