In [60]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import re

# Base URL and range of pages to scrape
base_url = "https://arstechnica.com/page/"
start_page = 2
end_page = 2

# Set to store filtered links (to automatically remove duplicates)
filtered_links = set()

# Iterate through each page
for page_num in range(start_page, end_page + 1):
    url = f"{base_url}{page_num}/"
    
    
    # Fetch the page content
    response = requests.get(url)
    if response.status_code != 200:
        print(f"Failed to retrieve page {url}")
        continue
    
    # Parse HTML with BeautifulSoup
    soup = BeautifulSoup(response.content, "html.parser")
    
    # Find all divs with the specified class
    divs = soup.find_all('div', class_='listing listing-latest')
    
    # Pattern to match 'https://arstechnica.com/xxxx/2024/' where xxxx can be any string
    pattern = re.compile(r'https://arstechnica\.com/.+/2024/')
    
    # Extract links from each div
    for div in divs:
        links = div.find_all('a', href=True)
        for link in links:
            href = link['href']
            if pattern.match(href):
                filtered_links.add(href)

# Convert set to list and store in links_to_iterate
links_to_iterate = list(filtered_links)

# Print all collected unique links
#for link in links_to_iterate:
    #print(link)





In [61]:
links_to_iterate

['https://arstechnica.com/information-technology/2024/07/in-bid-to-loosen-nvidias-grip-on-ai-amd-to-buy-finnish-startup-for-665-million/?comments=1',
 'https://arstechnica.com/space/2024/07/feds-who-forced-ukrainian-investor-to-sell-rocket-company-backtrack-years-later/?comments=1',
 'https://arstechnica.com/science/2024/07/antikythera-mechanisms-calendar-ring-likely-tracked-the-lunar-calendar/',
 'https://arstechnica.com/gadgets/2024/07/three-betas-in-ios-18-testers-still-cant-try-out-apple-intelligence-features/',
 'https://arstechnica.com/information-technology/2024/07/openai-board-shakeup-microsoft-out-apple-backs-away-amid-ai-partnership-scrutiny/',
 'https://arstechnica.com/information-technology/2024/07/in-bid-to-loosen-nvidias-grip-on-ai-amd-to-buy-finnish-startup-for-665-million/',
 'https://arstechnica.com/space/2024/07/europes-first-ariane-6-flight-achieved-most-of-its-goals-but-ended-prematurely/',
 'https://arstechnica.com/culture/2024/07/its-another-bloody-power-struggle-

In [64]:

# Function to scrape content, Headings, Author, Date, Category, and Links from a given URL
def scrape_techcrunch_content(url):
    try:
        # Send an HTTP request to the URL
        response = requests.get(url)
        response.raise_for_status()  # Raise an exception for bad status codes
        
        # Parse the HTML content
        soup = BeautifulSoup(response.content, 'html.parser')
        
        # Find the specific <div> class containing the content
        content_div = soup.find('div', class_='article-content post-page')
        
        # Find the <h1> heading with specific class
        heading_h1 = soup.find('h1', class_='headline')
        
        # Find the author
        author_div = soup.find('p', class_='byline')
        
        # Find the date
        date_div = soup.find('p', class_='date')
        
        # Find the category in the <a> tag
        category_a = soup.find('a', class_=' ')
        
        # Find the category in the <p> tag
        category_p = soup.find('p', class_='')
        
        # Find the div class containing the links
        links_div = soup.find('div', class_='article-guts')
        
        # Extract links from the links div
        links = [a['href'] for a in links_div.find_all('a', href=True)] if links_div else []
        
        # Extract text content
        if content_div and heading_h1 and author_div and date_div:
            # Extract text from heading
            heading_text = heading_h1.get_text(strip=True)
            
            # Extract text content from the content div
            content_text = content_div.get_text(separator='\n')
            
            # Extract author and date
            author = author_div.get_text(strip=True)
            date = date_div.get_text(strip=True)
            
            # Extract category from the <a> tag and <p> tag
            category = []
            if category_a:
                category.append(category_a.get_text(strip=True))
            else:
                category.append('Featured Articles')
            
            category_text = ', '.join(category) if category else "No category found"
            
            return heading_text, content_text, author, date, category_text, links
        else:
            return "No heading found", "No content found", "No author found", "No date found", "No category found", []
    
    except requests.RequestException as e:
        print(f"Error scraping {url}: {e}")
        return None, None, None, None, None, []

# List of URLs to scrape
urls=links_to_iterate  




# Initialize a list to store the data
data = []

# Scrape each URL and store the data
for url in urls:
    
    heading, content, author, date, category, links = scrape_techcrunch_content(url)
    if heading and content and author and date and category:
        data.append({
            'URL': url,
            'Heading': heading,
            'Content': content,
            'Author': author,
            'Date': date,
            'Category': category,
            'Links': ', '.join(links)  # Join links with a comma separator
        })

# Create a DataFrame from the data
df = pd.DataFrame(data)

# Save the DataFrame to a CSV file
output_file = 'scraped_arstechnica_data.csv'
df.to_csv(output_file, index=False)
#print(f"Data saved to {output_file}")

In [65]:
df


Unnamed: 0,URL,Heading,Content,Author,Date,Category,Links
0,https://arstechnica.com/information-technology...,"In bid to loosen Nvidia’s grip on AI, AMD to b...",\n\n\n\n\n\n\n\nEnlarge\nAkos Stiller/Bloomber...,No author found,No date found,Featured Articles,https://cdn.arstechnica.net/wp-content/uploads...
1,https://arstechnica.com/space/2024/07/feds-who...,Feds who forced Ukrainian investor to sell roc...,\n\n\n\n\n\n\n\nEnlarge\n \n/\n Firefly Aerosp...,No author found,No date found,Featured Articles,https://cdn.arstechnica.net/wp-content/uploads...
2,https://arstechnica.com/science/2024/07/antiky...,New Antikythera mechanism analysis challenges ...,\n\n\n\n\n\n\n\nEnlarge\n \n/\n Fragment of th...,No author found,No date found,Featured Articles,https://cdn.arstechnica.net/wp-content/uploads...
3,https://arstechnica.com/gadgets/2024/07/three-...,"Three betas in, iOS 18 testers still can’t try...",\n\n\n\n\n\n\n\nEnlarge\nApple\n \n\n\n\n\n\n\...,No author found,No date found,Featured Articles,https://cdn.arstechnica.net/wp-content/uploads...
4,https://arstechnica.com/information-technology...,"OpenAI board shake-up: Microsoft out, Apple ba...",\n\n\n\n\n\n\n\nEnlarge\nBenj Edwards / OpenAI...,No author found,No date found,Featured Articles,https://cdn.arstechnica.net/wp-content/uploads...
...,...,...,...,...,...,...,...
67,https://arstechnica.com/gaming/2024/07/microso...,Microsoft asks many Game Pass subscribers to p...,\n\n\n\n\n\n\n\nEnlarge\n \n/\n Artist's conce...,No author found,No date found,Featured Articles,https://cdn.arstechnica.net/wp-content/uploads...
68,https://arstechnica.com/gaming/2024/07/fallout...,Fallout: Londondevs will “downgrade”Fallout 4t...,\n\n\n\n\n\n\n\nreader comments\n\n\n120\n\n\n...,No author found,No date found,Featured Articles,https://arstechnica.com/gaming/2024/07/fallout...
69,https://arstechnica.com/tech-policy/2024/07/re...,Report: Z-Library admins on the lam ahead of U...,\n\n\n\n\n\n\n\nEnlarge\nfcscafeine | iStock /...,No author found,No date found,Featured Articles,https://cdn.arstechnica.net/wp-content/uploads...
70,https://arstechnica.com/gadgets/2024/07/report...,"After two rejections, Apple approves Epic Game...",\n\n\n\n\n\n\n\nEnlarge\n \n/\n Epic Games fou...,No author found,No date found,Featured Articles,https://cdn.arstechnica.net/wp-content/uploads...
