In [1]:
import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin  # Import urljoin

def scrape_cbr_anime():
    """Print the title, link, and date of each article card on CBR's anime page.

    Downloads the category listing, parses it with BeautifulSoup, and prints
    one ``Title`` / ``Link`` / ``Date`` record per article card, followed by a
    blank line. Cards without a linked title are skipped; a card without a
    usable ``datetime`` attribute is printed with ``N/A`` as its date.

    A request error (connection failure, timeout, ...) or a non-200 response
    is reported on stdout and the function returns without printing records.

    Returns:
        None. All results are written to stdout.
    """
    print("--------------------------------")
    print("CBR/Anime")
    print("--------------------------------")

    url = 'https://www.cbr.com/category/anime/'

    try:
        # requests applies no timeout by default, so an unresponsive server
        # would otherwise block the script forever.
        response = requests.get(url, timeout=10)
    except requests.RequestException as exc:
        print('Failed to retrieve the webpage. Error:', exc)
        return

    if response.status_code != 200:
        print('Failed to retrieve the webpage. Status code:', response.status_code)
        return

    soup = BeautifulSoup(response.text, 'html.parser')

    for block in soup.find_all('div', class_='w-display-card-content'):
        # find() returns None when the markup differs from what is expected,
        # so every lookup is checked before it is dereferenced.
        heading = block.find('h5', class_='display-card-title')
        title_element = heading.find('a', href=True) if heading else None
        if title_element is None:
            continue

        title = title_element.text.strip()
        # hrefs on the page may be relative; resolve them against the page URL.
        full_link = urljoin(url, title_element['href'])

        time_element = block.find('time', class_='display-card-date')
        timestamp = time_element.get('datetime') if time_element else None
        # Keep only the part before 'T' (the date portion of the timestamp).
        date = timestamp.split('T')[0] if timestamp else "N/A"

        print(f"Title: {title}")
        print(f"Link: {full_link}")
        print(f"Date: {date}")
        print()  # Blank line between records

def scrape_hashnode_data_science():
    """Print the title, link, and date of each post on Hashnode's data-science feed.

    Downloads the tag page, parses it with BeautifulSoup, and prints one
    ``Title`` / ``Link`` / ``Date`` record per post section, followed by a
    blank line. Sections without a title heading or without an enclosing
    anchor carrying an ``href`` are skipped; a missing date paragraph is
    printed as ``N/A``.

    A request error (connection failure, timeout, ...) or a non-200 response
    is reported on stdout and the function returns without printing records.

    Returns:
        None. All results are written to stdout.
    """
    print("--------------------------------")
    print("Hashnode/Data Science")
    print("--------------------------------")

    url = 'https://hashnode.com/n/data-science'

    try:
        # requests applies no timeout by default, so an unresponsive server
        # would otherwise block the script forever.
        response = requests.get(url, timeout=10)
    except requests.RequestException as exc:
        print('Failed to retrieve the webpage. Error:', exc)
        return

    if response.status_code != 200:
        print('Failed to retrieve the webpage. Status code:', response.status_code)
        return

    soup = BeautifulSoup(response.text, 'html.parser')

    for section in soup.find_all('section', class_='flex flex-col gap-2 sm:gap-4'):
        # find()/find_parent() return None when nothing matches, so each
        # result is checked before it is dereferenced.
        title_element = section.find('h1', class_='font-heading text-base sm:text-xl font-semibold sm:font-bold text-slate-700 dark:text-slate-200 hn-break-words cursor-pointer')
        if title_element is None:
            continue

        # The post URL lives on the anchor that wraps the heading.
        link_element = title_element.find_parent('a', href=True)
        if link_element is None:
            continue

        title = title_element.text.strip()
        link = link_element['href']

        date_element = section.find('p', class_='text-sm text-slate-500 dark:text-slate-400 font-normal')
        date = date_element.text.strip() if date_element else "N/A"

        print(f"Title: {title}")
        print(f"Link: {link}")
        print(f"Date: {date}")
        print()  # Blank line between records

# Scrapers for the remaining sites (TechCrunch, Interesting Engineering, Wired/Science)

def scrape_techcrunch_startups():
    """Print the title, link, and date of each post on TechCrunch's startups page.

    Downloads the category listing, parses it with BeautifulSoup, and prints
    one ``Title`` / ``Link`` / ``Date`` record per post block, followed by a
    blank line. Blocks without a title heading or a title link carrying an
    ``href`` are skipped; a missing ``<time>`` element is printed as ``N/A``.

    A request error (connection failure, timeout, ...) or a non-200 response
    is reported on stdout and the function returns without printing records.

    Returns:
        None. All results are written to stdout.
    """
    print("--------------------------------")
    print("TechCrunch/Startups")
    print("--------------------------------")

    url = 'https://techcrunch.com/category/startups/'

    try:
        # requests applies no timeout by default, so an unresponsive server
        # would otherwise block the script forever.
        response = requests.get(url, timeout=10)
    except requests.RequestException as exc:
        print('Failed to retrieve the webpage. Error:', exc)
        return

    if response.status_code != 200:
        print('Failed to retrieve the webpage. Status code:', response.status_code)
        return

    soup = BeautifulSoup(response.text, 'html.parser')

    for block in soup.find_all('div', class_='post-block'):
        # find() returns None when nothing matches, so each result is
        # checked before it is dereferenced.
        title_element = block.find('h2', class_='post-block__title')
        link_element = block.find('a', class_='post-block__title__link', href=True)
        if title_element is None or link_element is None:
            continue

        title = title_element.text.strip()
        link = link_element['href']

        # The date is taken from the visible text of the <time> element and
        # stripped so it prints like the dates from the other scrapers.
        date_element = block.find('time')
        date = date_element.text.strip() if date_element else "N/A"

        print(f"Title: {title}")
        print(f"Link: {link}")
        print(f"Date: {date}")
        print()  # Blank line between records

def scrape_interesting_engineering():
    """Print title, link, author, and date of each Interesting Engineering news item.

    Downloads the first news listing page, parses it with BeautifulSoup, and
    prints one ``Title`` / ``Link`` / ``Author`` / ``Date`` record per result
    block, followed by a blank line. Blocks without an anchor carrying an
    ``href`` or without the headline inside that anchor are skipped; a
    missing author or publish time is printed as ``N/A``.

    A request error (connection failure, timeout, ...) or a non-200 response
    is reported on stdout and the function returns without printing records.

    Returns:
        None. All results are written to stdout.
    """
    print("--------------------------------")
    print("Interesting Engineering")
    print("--------------------------------")

    # Site root that relative article links are resolved against
    base_url = 'https://interestingengineering.com/'
    url = 'https://interestingengineering.com/news/page/1'

    try:
        # requests applies no timeout by default, so an unresponsive server
        # would otherwise block the script forever.
        response = requests.get(url, timeout=10)
    except requests.RequestException as exc:
        print('Failed to retrieve the webpage. Error:', exc)
        return

    if response.status_code != 200:
        print('Failed to retrieve the webpage. Status code:', response.status_code)
        return

    soup = BeautifulSoup(response.text, 'html.parser')

    for block in soup.find_all('div', class_='Category_result__description__iz_rw'):
        # find() returns None when nothing matches, so each result is
        # checked before it is dereferenced.
        title_link_element = block.find('a', href=True)
        if title_link_element is None:
            continue
        heading = title_link_element.find('h2', class_='Category_result__header__HQgVv')
        if heading is None:
            continue

        title = heading.text.strip()
        link = urljoin(base_url, title_link_element['href'])

        author_element = block.find('a', class_='Category_result__author__name__In7jd')
        author = author_element.text.strip() if author_element else "N/A"

        date_element = block.find('span', class_='Category_result__author__publishTime__nwLBU')
        date = date_element.text.strip() if date_element else "N/A"

        print(f"Title: {title}")
        print(f"Link: {link}")
        print(f"Author: {author}")
        print(f"Date: {date}")
        print()  # Blank line between records

def scrape_wired_science():
    """Print the title, link, and category of each article on Wired's science page.

    Downloads the category listing, parses it with BeautifulSoup, and prints
    one ``Title`` / ``Link`` / ``Category`` record per summary item, followed
    by a blank line. Items without a headline or without a headline link
    carrying an ``href`` are skipped; a missing rubric is printed as ``N/A``.

    A request error (connection failure, timeout, ...) or a non-200 response
    is reported on stdout and the function returns without printing records.

    Returns:
        None. All results are written to stdout.
    """
    print("--------------------------------")
    print("Wired/Science")
    print("--------------------------------")

    url = 'https://www.wired.com/category/science/'

    try:
        # requests applies no timeout by default, so an unresponsive server
        # would otherwise block the script forever.
        response = requests.get(url, timeout=10)
    except requests.RequestException as exc:
        print('Failed to retrieve the webpage. Error:', exc)
        return

    if response.status_code != 200:
        print('Failed to retrieve the webpage. Status code:', response.status_code)
        return

    soup = BeautifulSoup(response.text, 'html.parser')

    for block in soup.find_all('div', class_='SummaryItemContent-eiDYMl'):
        # find() returns None when nothing matches, so title and link get
        # the same guard the category lookup below uses.
        title_element = block.find('h3', class_='SummaryItemHedBase-hiFYpQ')
        link_element = block.find('a', class_='SummaryItemHedLink-civMjp', href=True)
        if title_element is None or link_element is None:
            continue

        title = title_element.text.strip()
        # hrefs on the page may be relative; resolve them against the page URL.
        full_link = urljoin(url, link_element['href'])

        category_element = block.find('span', class_='RubricName-fVtemz')
        category = category_element.text.strip() if category_element else "N/A"

        print(f"Title: {title}")
        print(f"Link: {full_link}")
        print(f"Category: {category}")
        print()  # Blank line between records

# Run every site scraper once, in a fixed order; each prints its own report.
for run_scraper in (
    scrape_cbr_anime,
    scrape_hashnode_data_science,
    scrape_interesting_engineering,
    scrape_wired_science,
    scrape_techcrunch_startups,
):
    run_scraper()


--------------------------------
CBR/Anime
--------------------------------
Title: Dragon Ball: Every Eternal Dragon, Ranked by Coolness
Link: https://www.cbr.com/dragon-ball-eternal-dragons-ranked/
Date: 2023-09-23

Title: Boruto: Two Blue Vortex Confirms [Spoiler] Is Darker — And It May Be Sasuke's Fault
Link: https://www.cbr.com/boruto-two-blue-vortex-sasuke-dark-mentorship-kill-code/
Date: 2023-09-23

Title: Every Bleach Filler Arc (In Chronological Order)
Link: https://www.cbr.com/bleach-anime-filler-arc-chronological-order/
Date: 2023-09-23

Title: 10 Most Powerful Anime Dragons of All Time, Ranked
Link: https://www.cbr.com/strongest-anime-dragons-ranked/
Date: 2023-09-23

Title: Saint Seiya: Where the Spin-Offs Fit in the Timeline Chronologically
Link: https://www.cbr.com/saint-seiya-where-spin-offs-fit-chronologically/
Date: 2023-09-23

Title: Lupin III Universe Expands Further With Kabuki Stage Play
Link: https://www.cbr.com/lupin-iii-kabuki-stage-play-announced/
Date: 2023-09