In [2]:
# Importing necessary libraries
import requests
from bs4 import BeautifulSoup
import pandas as pd
from tqdm.notebook import tqdm
import os
from google.colab import drive

# Attach Google Drive to the Colab runtime so files on it can be read and written
drive_mount_point = '/content/drive'
drive.mount(drive_mount_point)

# CSV on the mounted Drive that the scraping functions below read and write
csv_file_path = '/content/drive/My Drive/aws_data.csv'

# Function to scrape a single page
def scrape_aws_blog_page(page_url, timeout=30):
    """Scrape the article summaries listed on one blog listing page.

    Args:
        page_url: URL of the listing page to download.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests.get`` can block indefinitely.

    Returns:
        A list with one dict per ``<article class="blog-post">`` element, with
        the keys ``'Title'``, ``'Detail'`` (excerpt text, ``''`` when the
        article has no excerpt paragraph), ``'URL'`` and ``'Date'``.

    Raises:
        requests.HTTPError: If the server answers with a 4xx/5xx status.
            Failing loudly keeps an error page from being recorded as a
            listing page that simply contains no articles.
        AttributeError, TypeError: If an article lacks the title, link or
            date element (unchanged from the original behaviour, so layout
            changes are noticed instead of producing partial rows).
    """
    response = requests.get(page_url, timeout=timeout)
    response.raise_for_status()
    soup = BeautifulSoup(response.content, 'html.parser')
    articles = soup.find_all('article', class_='blog-post')

    data = []
    for article in articles:
        title = article.find('span', property='name headline').text.strip()

        # The excerpt is optional: tolerate a missing section as well as a
        # section that contains no <p> element.
        detail_section = article.find('section', class_='blog-post-excerpt')
        excerpt_paragraph = detail_section.p if detail_section is not None else None
        detail = excerpt_paragraph.text.strip() if excerpt_paragraph is not None else ''

        url = article.find('a', property='url')['href']
        date = article.find('time', property='datePublished')['datetime']
        data.append({'Title': title, 'Detail': detail, 'URL': url, 'Date': date})

    return data

# Function to scrape multiple pages and save to CSV
def scrape_aws_blog(start_page, end_page):
    """Scrape a range of listing pages and checkpoint the rows to ``csv_file_path``.

    If the CSV already exists, its rows are kept and scraping resumes from the
    page after the highest value in its ``Page`` column; in that case the
    ``start_page`` argument is ignored. The CSV is rewritten after every page
    so an interrupted run loses at most the page in progress.

    Args:
        start_page: First listing page to scrape when there is nothing to
            resume from.
        end_page: Last listing page to scrape (inclusive).

    Returns:
        None. Results are written to ``csv_file_path``.

    NOTE(review): later cells of this notebook rename ``Detail`` to
    ``Excerpt`` in the same CSV and then reuse ``Detail`` for the full article
    text. Resuming after those cells have run would mix the two schemas.
    """
    all_data = []

    # Resume from a previous run if the CSV holds usable data.
    if os.path.exists(csv_file_path):
        try:
            existing_data = pd.read_csv(csv_file_path)
        except pd.errors.EmptyDataError:
            # A file with no header and no rows: nothing to resume from.
            existing_data = pd.DataFrame()

        all_data.extend(existing_data.to_dict('records'))

        # Only trust the resume point when a real page number was recorded;
        # a missing or all-empty 'Page' column would otherwise yield
        # KeyError / NaN and break range() below.
        if 'Page' in existing_data.columns and existing_data['Page'].notna().any():
            start_page = int(existing_data['Page'].max()) + 1

    # Scrape each page and update the CSV file
    for page in tqdm(range(start_page, end_page + 1), desc='Scraping pages'):
        page_url = f'https://aws.amazon.com/blogs/architecture/page/{page}/'
        page_data = scrape_aws_blog_page(page_url)
        for item in page_data:
            item['Page'] = page
        all_data.extend(page_data)

        # Save data to CSV after each page
        pd.DataFrame(all_data).to_csv(csv_file_path, index=False)

    print("Data saved to", csv_file_path)

# Scrape listing pages 1 through 69 (inclusive) of the blog
FIRST_PAGE, LAST_PAGE = 1, 69
scrape_aws_blog(start_page=FIRST_PAGE, end_page=LAST_PAGE)

Mounted at /content/drive


Scraping pages:   0%|          | 0/69 [00:00<?, ?it/s]

Data saved to /content/drive/My Drive/aws_data.csv


In [1]:
import pandas as pd
from google.colab import drive

# Mount Google Drive
drive.mount('/content/drive')

# Define the path to the CSV file in Google Drive
csv_file_path = '/content/drive/My Drive/aws_data.csv'

# Read the existing CSV file
df = pd.read_csv(csv_file_path)

# Rename "Detail" to "Excerpt" only when that is safe to do.
# A later cell of this notebook adds a new "Detail" column holding the full
# article text, so re-running this cell blindly would turn that column into a
# second "Excerpt" column. The guards below make the cell safe to re-run and
# keep the printed message truthful.
if 'Detail' not in df.columns:
    print("Column 'Detail' not found; nothing to rename")
elif 'Excerpt' in df.columns:
    print("Column 'Excerpt' already exists; leaving 'Detail' unchanged")
else:
    df = df.rename(columns={'Detail': 'Excerpt'})

    # Save the updated DataFrame back to the CSV file
    df.to_csv(csv_file_path, index=False)

    print("Column 'Detail' has been renamed to 'Excerpt'")

Mounted at /content/drive
Column 'Detail' has been renamed to 'Excerpt'


In [2]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
from tqdm.notebook import tqdm
import os
from google.colab import drive

# Attach Google Drive to the Colab runtime (already-mounted drives are reported, not remounted)
drive_mount_point = '/content/drive'
drive.mount(drive_mount_point)

# CSV on the mounted Drive that this cell reads and updates
csv_file_path = '/content/drive/My Drive/aws_data.csv'

# Function to scrape the full article content
def scrape_article_content(article_url, timeout=30):
    """Download one article and return the text of its body paragraphs.

    Args:
        article_url: URL of the article page.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests.get`` can block indefinitely.

    Returns:
        The stripped text of every ``<p>`` inside the
        ``<section class="blog-post-content" property="articleBody">``
        element, joined with single spaces. Returns ``''`` when the server
        answers with a non-2xx status or the page has no such section, so one
        unavailable article does not abort a long scraping loop.

    Raises:
        requests.RequestException: On connection failures or timeouts.
    """
    response = requests.get(article_url, timeout=timeout)

    # Do not parse error pages as if they were articles.
    if not response.ok:
        return ''

    soup = BeautifulSoup(response.content, 'html.parser')
    article_body = soup.find('section', class_='blog-post-content', property='articleBody')
    if article_body is None:
        return ''
    return ' '.join(p.text.strip() for p in article_body.find_all('p'))

# Read the existing CSV file
df = pd.read_csv(csv_file_path)

# Create the "Detail" column on the first run; on later runs it already holds
# whatever was scraped before the previous run stopped.
if 'Detail' not in df.columns:
    df['Detail'] = None

# Force object dtype: a "Detail" column that is read back entirely empty is
# parsed as float64, which is the wrong container for the strings written below.
df['Detail'] = df['Detail'].astype(object)

# Work only on the rows that still lack content. This makes the cell a no-op
# when everything is already scraped (taking the minimum index of an empty
# selection gives NaN, which range() rejects) and avoids re-downloading rows
# that were completed after an earlier gap. Rows saved as '' are read back as
# missing by read_csv and are therefore retried on the next run.
pending_indices = df.index[df['Detail'].isna()]

# Scrape the full article content for each pending URL and update the CSV file
for index in tqdm(pending_indices, desc='Scraping article details'):
    url = df.at[index, 'URL']
    df.at[index, 'Detail'] = scrape_article_content(url)

    # Save progress after each article
    df.to_csv(csv_file_path, index=False)

print("Details updated and data saved to", csv_file_path)

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


Scraping article details:   0%|          | 0/677 [00:00<?, ?it/s]

Details updated and data saved to /content/drive/My Drive/aws_data.csv
