In [2]:
import requests
from bs4 import BeautifulSoup
import json
import os
import array as arr
import glob

def extract_bbc_article_data(url):
    """Extract 'title', 'text', and images from a BBC article."""
    response = requests.get(url)
    response.raise_for_status()

    soup = BeautifulSoup(response.text, 'html.parser')
    
    # Extract the title
    title_tag = soup.find('h1')
    title = title_tag.get_text(strip=True) if title_tag else 'No Title Found'

    # Extract the text
    container = soup.find("article")
    text_sections = container.find_all('p')
    text = ' '.join(section.get_text(strip=True) for section in text_sections)

    # Extract images
    images = []
    for img_tag in soup.find_all('img'):
        src = img_tag.get('src', '')
        alt = img_tag.get('alt', '')
        
        # Filter images that start with the specified URL prefix
        if src.startswith('https://ichef.bbci.co.uk/'):
            # Attempt to find a caption, if it is within a figure or another tag
            figure = img_tag.find_parent('figure')
            caption = ''
            if figure:
                caption_tag = figure.find('figcaption')
                caption = caption_tag.get_text(strip=True) if caption_tag else ''
            
            # Make sure we only append images meeting the criteria
            images.append({
                'url': src,
                'caption': caption,
                'altText': alt
            })
    
    return {
        'title': title,
        'text': text,
        'images': images
    }

def save_json(article_data, filename='article.json'):
    """Saves the article data to a JSON file."""
    with open(filename, 'w', encoding='utf-8') as file:
        json.dump(article_data, file, ensure_ascii=False, indent=4)

def extract_bbc_article_urls(main_url):
    """Extracts article URLs from the main page of the BBC website."""
    response = requests.get(main_url)
    response.raise_for_status()

    soup = BeautifulSoup(response.text, 'html.parser')
    
    # Initialize an empty list to hold article URLs
    article_urls = []

    # Find all anchor tags
    for a_tag in soup.find_all('a', href=True):
        href = a_tag['href']
        # BBC article URLs generally contain '/news/' or '/sport/', consider adding more
        if '/news/articles/' in href:
            # Construct full URL if necessary
            if not href.startswith('http'):
                href = 'https://www.bbc.com' + href
            article_urls.append(href)
    
    # Removing duplicates by converting to a set and back to a list
    unique_article_urls = list(set(article_urls))
    #print(str(unique_article_urls))
    return unique_article_urls
    

def main():
    # Example BBC article URL (this would need user input or a list of URLs for a complete pipeline)
    article_url = 'https://www.bbc.com/news/articles/c93l8j1j8yvo'  # Replace with a valid BBC article URL

    # Scrape the article and save to JSON
    article_data = extract_bbc_article_data(article_url)
    #print(f"{article_data}")

    output_dir = '/Users/ram/bbcbot/bbc_articles'
    os.makedirs(output_dir, exist_ok=True)
    files = glob.glob('/Users/ram/bbcbot/bbc_articles')
    for f in files:
        try:
            os.remove(f)
        except:
            print("Failed to delete "+str(f))

    all_articles = extract_bbc_article_urls('https://www.bbc.com')
    for articles in all_articles:
        article_data=extract_bbc_article_data(articles)
        safe_title = ''.join(e if e.isalnum() else '_' for e in article_data['title'])
        json_filename = os.path.join(output_dir, f"{safe_title}.json")
        isExist = os.path.exists(output_dir)
        save_json(article_data, json_filename)

if __name__ == '__main__':
    main()

Failed to delete /Users/ram/bbcbot/bbc_articles


In [28]:
ENCODER = None

def encode_string_by_tiktoken(content: str, model_name: str = "gpt-4o"):
    global ENCODER
    if ENCODER is None:
        ENCODER = tiktoken.encoding_for_model(model_name)
    tokens = ENCODER.encode(content)
    return tokens

In [30]:
def chunking_by_token_size_with_images(
    title: str,
    text: str,
    images: list,
    overlap_token_size=128,
    max_token_size=1024,
    tiktoken_model="gpt-4o",
    **kwargs,
):
    # Function to encode and decode tokens
    def encode_string(content):
        return encode_string_by_tiktoken(content, model_name=tiktoken_model)

    def decode_tokens(tokens):
        return decode_tokens_by_tiktoken(tokens, model_name=tiktoken_model)

    # Encode the entire text
    tokens = encode_string(text)
    results = []

    # Chunk the text
    text_chunks = []
    for index, start in enumerate(range(0, len(tokens), max_token_size - overlap_token_size)):
        chunk_content = decode_tokens(tokens[start:start + max_token_size])
        text_chunks.append(chunk_content.strip())

    # Add text chunks with title
    for index, chunk in enumerate(text_chunks):
        results.append({
            "tokens": len(encode_string(chunk)),
            "content": f"{title}: {chunk}",
            "chunk_order_index": index
        })

    # Add image chunks with title
    for image_url in images:
        results.append({
            "tokens": len(encode_string(image_url)),  # Assuming you want to count tokens for the URL
            "content": f"{title}: {image_url}",
            "chunk_order_index": len(text_chunks) + len(results)  # Sequential index
        })

    return results

In [32]:
title = "Sample Title"
text = "This is a long text that needs to be chunked into smaller parts based on token size."
images = ["image1_url", "image2_url"]

chunks = chunking_by_token_size_with_images(title, text, images)
for chunk in chunks:
    print(chunk)

NameError: name 'tiktoken' is not defined

In [34]:
def read_json_file(file_path):
    try:
        with open(file_path, 'r') as file:
            data = json.load(file)
            return data
    except Exception as e:
        print(f"An error occurred: {e}")
        return None

In [36]:
# Ask the user for the file path
# file_path = input("Enter the path to the JSON file: ")
file_path = "/Users/ram/bbcbot/bbc_articles/Three_reasons_Trump_tariffs_aren_t_China_s_only_problem.json"

# Check if the file exists
if not os.path.isfile(file_path):
    print("The specified file does not exist.")
else:
    # Read and parse the JSON file
    json_data = read_json_file(file_path)
    
    # If successful, print the data
    if json_data is not None:
        print(json.dumps(json_data, indent=4))


{
    "title": "Three reasons Trump tariffs aren't China's only problem",
    "images": [
        {
            "url": "https://ichef.bbci.co.uk/news/480/cpsprodpb/1e3a/live/777c5e90-d3b7-11ef-9da7-c1c79bfc6e92.jpg.webp",
            "caption": "Trump says he been talking to China's Xi through aides since his election",
            "altText": "Getty Images US President Donald Trump, right, and Xi Jinping, China's president, greet attendees waving American and Chinese national flags during a welcome ceremony outside the Great Hall of the People in Beijing, China, on Thursday, 9 November, 2017"
        },
        {
            "url": "https://ichef.bbci.co.uk/news/480/cpsprodpb/1b73/live/268c82a0-d3ec-11ef-b7cd-a9b221bd937e.jpg.webp",
            "caption": "Experts say deep issues in China's economy need to be addressed to fuel spending",
            "altText": "Getty Images Pedestrians walk past a shopping mall decorated with red lanterns and a sign reading 2025 Happy New Year to celeb

In [42]:
def decode_tokens_by_tiktoken(tokens: list[int], model_name: str = "gpt-4o"):
    global ENCODER
    if ENCODER is None:
        ENCODER = tiktoken.encoding_for_model(model_name)
    content = ENCODER.decode(tokens)
    return content

In [50]:
import tiktoken
title = json_data["title"]
text = json_data["text"]
images = json_data["images"]

print(images)

#chunks = chunking_by_token_size_with_images(title, text, images)
#for chunk in chunks:
#    print(chunk)

[{'url': 'https://ichef.bbci.co.uk/news/480/cpsprodpb/1e3a/live/777c5e90-d3b7-11ef-9da7-c1c79bfc6e92.jpg.webp', 'caption': "Trump says he been talking to China's Xi through aides since his election", 'altText': "Getty Images US President Donald Trump, right, and Xi Jinping, China's president, greet attendees waving American and Chinese national flags during a welcome ceremony outside the Great Hall of the People in Beijing, China, on Thursday, 9 November, 2017"}, {'url': 'https://ichef.bbci.co.uk/news/480/cpsprodpb/1b73/live/268c82a0-d3ec-11ef-b7cd-a9b221bd937e.jpg.webp', 'caption': "Experts say deep issues in China's economy need to be addressed to fuel spending", 'altText': 'Getty Images Pedestrians walk past a shopping mall decorated with red lanterns and a sign reading 2025 Happy New Year to celebrate the upcoming Chinese New Year on January 14, 2025 in Chongqing, China.'}, {'url': 'https://ichef.bbci.co.uk/news/480/cpsprodpb/7073/live/297f6760-d3ed-11ef-b7cd-a9b221bd937e.jpg.webp'