In [63]:
import requests
from bs4 import BeautifulSoup
import json
import os
import array as arr
import glob

def extract_bbc_article_data(url):
    """Extract 'title', 'text', and images from a BBC article."""
    response = requests.get(url)
    response.raise_for_status()

    soup = BeautifulSoup(response.text, 'html.parser')
    
    # Extract the title
    title_tag = soup.find('h1')
    title = title_tag.get_text(strip=True) if title_tag else 'No Title Found'

    # Extract the text
    container = soup.find("article")
    text_sections = container.find_all('p')
    text = ' '.join(section.get_text(strip=True) for section in text_sections)

    # Extract images
    images = []
    for img_tag in soup.find_all('img'):
        src = img_tag.get('src', '')
        alt = img_tag.get('alt', '')
        
        # Filter images that start with the specified URL prefix
        if src.startswith('https://ichef.bbci.co.uk/'):
            # Attempt to find a caption, if it is within a figure or another tag
            figure = img_tag.find_parent('figure')
            caption = ''
            if figure:
                caption_tag = figure.find('figcaption')
                caption = caption_tag.get_text(strip=True) if caption_tag else ''
            
            # Make sure we only append images meeting the criteria
            images.append({
                'url': src,
                'caption': caption,
                'altText': alt
            })
    
    return {
        'title': title,
        'text': text,
        'images': images
    }

def save_json(article_data, filename='article.json'):
    """Saves the article data to a JSON file."""
    with open(filename, 'w', encoding='utf-8') as file:
        json.dump(article_data, file, ensure_ascii=False, indent=4)

def extract_bbc_article_urls(main_url):
    """Extracts article URLs from the main page of the BBC website."""
    response = requests.get(main_url)
    response.raise_for_status()

    soup = BeautifulSoup(response.text, 'html.parser')
    
    # Initialize an empty list to hold article URLs
    article_urls = []

    # Find all anchor tags
    for a_tag in soup.find_all('a', href=True):
        href = a_tag['href']
        # BBC article URLs generally contain '/news/' or '/sport/', consider adding more
        if '/news/articles/' in href:
            # Construct full URL if necessary
            if not href.startswith('http'):
                href = 'https://www.bbc.com' + href
            article_urls.append(href)
    
    # Removing duplicates by converting to a set and back to a list
    unique_article_urls = list(set(article_urls))
    #print(str(unique_article_urls))
    return unique_article_urls
    

def main():
    # Example BBC article URL (this would need user input or a list of URLs for a complete pipeline)
    article_url = 'https://www.bbc.com/news/articles/c93l8j1j8yvo'  # Replace with a valid BBC article URL

    # Scrape the article and save to JSON
    article_data = extract_bbc_article_data(article_url)
    #print(f"{article_data}")

    output_dir = 'bbc_articles'
    os.makedirs(output_dir, exist_ok=True)
    files = glob.glob('bbc_articles')
    for f in files:
        try:
            os.remove(f)
        except:
            print("Failed to delete "+str(f))

    all_articles = extract_bbc_article_urls('https://www.bbc.com')
    for articles in all_articles:
        article_data=extract_bbc_article_data(articles)
        safe_title = ''.join(e if e.isalnum() else '_' for e in article_data['title'])
        json_filename = os.path.join(output_dir, f"{safe_title}.json")
        isExist = os.path.exists(output_dir)
        save_json(article_data, json_filename)

if __name__ == '__main__':
    main()

Failed to delete <built-in method title of str object at 0x10829d530>
