In [6]:
import requests
from bs4 import BeautifulSoup
import json

def scrape_content(url, timeout=30):
    """Download a page and return the text of its article paragraphs.

    The page at ``url`` is fetched and parsed with BeautifulSoup's
    ``html.parser``. The text of every ``<p>`` found inside the
    ``<div class="body__inner-container">`` located by ``soup.find`` is
    joined with single spaces.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds passed to ``requests.get`` as its ``timeout``.
            Defaults to 30.

    Returns:
        The joined paragraph text, or ``None`` when the container div is not
        found or when any exception occurs (the error is printed, not raised).
    """
    try:
        print(f"Scraping content from: {url}")
        # requests applies no timeout by default, so a stalled server would
        # otherwise block the whole scraping run indefinitely.
        response = requests.get(url, timeout=timeout)
        response.raise_for_status()
        soup = BeautifulSoup(response.text, 'html.parser')

        # Extracting only paragraphs within the specified class
        content_div = soup.find('div', class_='body__inner-container')
        if not content_div:
            print(f"No content found in specified class for {url}")
            return None

        paragraphs = content_div.find_all('p')
        text = ' '.join(p.get_text() for p in paragraphs)
        print(f"Scraping successful for: {url}")
        return text
    except Exception as e:
        # Deliberately best-effort: report the failure and let the caller
        # keep the original value instead of aborting the run.
        print(f"Error scraping {url}: {e}")
        return None

def complete_dataset(dataset):
    """Return a copy of ``dataset`` with URL contents replaced by scraped text.

    For each record whose ``'content'`` value is a string starting with
    ``'http'``, ``scrape_content`` is called on it; a truthy result replaces
    the URL. Records whose content is not such a string, or whose scrape
    returns nothing, are passed through with their content unchanged.

    Args:
        dataset: Iterable of mappings, each with a ``'story_name'`` key and
            (normally) a ``'content'`` key.

    Returns:
        A new list of shallow-copied records, in the input order. The
        caller's records are not modified.
    """
    completed_dataset = []
    for item in dataset:
        print(f"Processing story: {item['story_name']}")
        # Work on a shallow copy so the caller's records are left untouched.
        record = dict(item)
        content = record.get('content')
        # Guard the type: a null / non-string content would otherwise raise
        # AttributeError on .startswith and abort the whole run.
        if isinstance(content, str) and content.startswith('http'):
            scraped_text = scrape_content(content)
            if scraped_text:
                record['content'] = scraped_text
        completed_dataset.append(record)
        print(f"Finished processing: {item['story_name']}")
    return completed_dataset

In [7]:
# Load the story records, fill in any URL-only contents, and save the result.
print("Loading dataset from file...")
# Read explicitly as UTF-8 rather than relying on the platform's default
# encoding, which may not be UTF-8.
with open('input/ttcw_short_stories.json', 'r', encoding='utf-8') as f:
    dataset = json.load(f)

print("Starting the scraping process...")
completed_dataset = complete_dataset(dataset)
# ensure_ascii=False writes non-ASCII characters verbatim, so the output file
# must be opened with an encoding that can represent them.
with open('completed_dataset.json', 'w', encoding='utf-8') as f:
    json.dump(completed_dataset, f, indent=4, ensure_ascii=False)
print("Dataset completed and saved to 'completed_dataset.json'")

Loading dataset from file...
Starting the scraping process...
Processing story: Maintenance, Hvidovre
Scraping content from: https://www.newyorker.com/magazine/2023/05/15/maintenance-hvidovre-fiction-olga-ravn
Scraping successful for: https://www.newyorker.com/magazine/2023/05/15/maintenance-hvidovre-fiction-olga-ravn
Finished processing: Maintenance, Hvidovre
Processing story: Maintenance, Hvidovre
Finished processing: Maintenance, Hvidovre
Processing story: Maintenance, Hvidovre
Finished processing: Maintenance, Hvidovre
Processing story: Maintenance, Hvidovre
Finished processing: Maintenance, Hvidovre
Processing story: Listening For the Click
Scraping content from: https://www.newyorker.com/books/flash-fiction/listening-for-the-click
Scraping successful for: https://www.newyorker.com/books/flash-fiction/listening-for-the-click
Finished processing: Listening For the Click
Processing story: Listening For the Click
Finished processing: Listening For the Click
Processing story: Listenin