#Create Environment

In [1]:
# Access to Google Drive

from google.colab import drive
drive.mount('/content/drive')

!pip install snowflake
import os
import time
import requests
import pandas as pd
import concurrent.futures
import snowflake.connector
import concurrent.futures
import json

from bs4 import BeautifulSoup
from datetime import datetime

# Root data directory on the mounted Google Drive; passed as `base_path` to the
# corpus functions in the cells below.
BASE_PATH = '/content/drive/Othercomputers/My Mac/CSCI_104/Week_Project/Datasets/'

Mounted at /content/drive
Collecting snowflake
  Downloading snowflake-1.4.0-py3-none-any.whl.metadata (2.0 kB)
Collecting snowflake-core==1.4.0 (from snowflake)
  Downloading snowflake_core-1.4.0-py3-none-any.whl.metadata (2.0 kB)
Collecting snowflake-legacy (from snowflake)
  Downloading snowflake_legacy-1.0.0-py3-none-any.whl.metadata (2.5 kB)
Collecting snowflake-connector-python (from snowflake-core==1.4.0->snowflake)
  Downloading snowflake_connector_python-3.15.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (70 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m70.8/70.8 kB[0m [31m2.1 MB/s[0m eta [36m0:00:00[0m
Collecting asn1crypto<2.0.0,>0.24.0 (from snowflake-connector-python->snowflake-core==1.4.0->snowflake)
  Downloading asn1crypto-1.5.1-py2.py3-none-any.whl.metadata (13 kB)
Collecting boto3>=1.24 (from snowflake-connector-python->snowflake-core==1.4.0->snowflake)
  Downloading boto3-1.38.15-py3-none-any.whl.metadata (6.6 kB)
Co

#Fetch Data


In [2]:
# Fetch data from forum
def read_csv_safely(file_path):
    """
    Read a CSV file, falling back to progressively more lenient parsing.

    Strategy, in order:
      1. Plain ``pd.read_csv``.
      2. On a ParserError: skip malformed lines and disable quote handling.
         ``on_bad_lines`` is tried first; if that keyword is rejected
         (older pandas) the legacy ``error_bad_lines`` keyword is used.
      3. If the lenient read fails for any reason: read every column as
         strings with the Python engine.

    Args:
        file_path: Path of the CSV file to read

    Returns:
        DataFrame with the parsed contents

    Raises:
        Any non-parser error from the first read (e.g. a missing file), or
        whatever the last-resort read raises if it fails as well.
    """
    import csv  # local import: only needed for the QUOTE_NONE constant

    try:
        # First attempt with normal settings
        return pd.read_csv(file_path)
    except pd.errors.ParserError:
        print(f"CSV parsing error detected in {file_path}. Attempting to read with error handling...")

    lenient_options = {'quoting': csv.QUOTE_NONE, 'escapechar': '\\'}
    try:
        try:
            # Newer pandas versions
            return pd.read_csv(file_path, on_bad_lines='skip', **lenient_options)
        except (TypeError, AttributeError):
            # Fallback for older pandas versions that lack on_bad_lines
            return pd.read_csv(file_path, error_bad_lines=False, **lenient_options)
    except Exception as e:
        # Any failure of the lenient read (not only in the legacy branch)
        # must still reach the last-resort read below.
        print(f"Advanced CSV handling failed: {e}. Trying simple approach...")

    return pd.read_csv(file_path, dtype=str, engine='python')

# --- Data Fetching Functions ---

def create_urls(base_path: str, filename: str = 'e9_forum_thread_ids.csv', threads: int = 1):
    """
    Generates and appends new thread IDs to a CSV file for later fetching.

    The next ID continues from the last row of the existing file. A missing,
    zero-byte, or header-only file is treated as "no IDs yet" and numbering
    starts at 1.

    Args:
        base_path: Directory to store data files
        filename: File to store thread IDs
        threads: Number of new thread IDs to create

    Returns:
        DataFrame with newly created thread IDs (empty when threads <= 0)
    """
    file_path = os.path.join(base_path, filename)

    # A zero-byte file counts as "no content" so that a header gets written below
    has_content = os.path.exists(file_path) and os.path.getsize(file_path) > 0
    existing_ids = pd.read_csv(file_path) if has_content else pd.DataFrame(columns=['thread_id'])

    if not existing_ids.empty:
        last_thread_id = int(existing_ids['thread_id'].iloc[-1])
        print(f"Existing thread_ids found. Last thread_id: {last_thread_id}")
    else:
        # Covers the header-only file as well, where iloc[-1] would raise IndexError
        last_thread_id = 0
        print(f"No existing thread_ids. Starting from {last_thread_id}")

    # Generate the next sequence of thread IDs
    next_ids = list(range(last_thread_id + 1, last_thread_id + threads + 1))
    new_thread_ids = pd.DataFrame({'thread_id': next_ids}, dtype='int64')

    if new_thread_ids.empty:
        print("No new thread_ids requested.")
        return new_thread_ids

    # Append new IDs; emit the header only when the file has no content yet
    new_thread_ids.to_csv(file_path, mode='a', header=not has_content, index=False)
    print(f"Added {threads} new thread_ids. Ending at {next_ids[-1]}")
    return new_thread_ids


def fetch_full_thread_data(df, base_path: str, posts_filename: str = 'e9_forum_posts.csv', decorated_filename: str = 'e9_forum_threads_decorated.csv', request_timeout: float = 30.0, delay_seconds: float = 1.0):
    """
    Downloads forum thread HTML data, parses content, and extracts post metadata.

    Only page 1 of each thread is requested. Threads whose IDs already appear
    in either output file are skipped. A thread that fails to download or
    parse is reported and skipped; it does not abort the run.

    Args:
        df: DataFrame with thread_id column
        base_path: Directory to store output files
        posts_filename: File to store individual post content
        decorated_filename: File to store thread metadata (title + first post)
        request_timeout: Seconds to wait for each HTTP request before giving up
        delay_seconds: Pause after every request (successful or not)

    Returns:
        None. Results are written to the two CSV files.
    """
    os.makedirs(base_path, exist_ok=True)  # Ensure directory exists

    posts_file = os.path.join(base_path, posts_filename)
    decorated_file = os.path.join(base_path, decorated_filename)

    # Load existing datasets if they exist
    existing_posts = pd.read_csv(posts_file) if os.path.exists(posts_file) else pd.DataFrame(columns=['thread_id', 'post_timestamp', 'post_raw'])
    existing_decorated = pd.read_csv(decorated_file) if os.path.exists(decorated_file) else pd.DataFrame(columns=['thread_id', 'thread_title', 'thread_first_post'])

    # Identify which threads are new and need to be fetched
    existing_thread_ids = set(existing_posts['thread_id'].tolist()) | set(existing_decorated['thread_id'].tolist())
    new_threads = df[~df['thread_id'].isin(existing_thread_ids)]

    if new_threads.empty:
        print("No new threads to fetch.")
        return

    print(f"Fetching data for {len(new_threads)} new threads...")

    post_data = []         # List to store individual posts
    decorated_data = []    # List to store thread metadata

    # Iterate over each thread ID to fetch
    for thread_id in new_threads['thread_id']:
        thread_url = f"https://e9coupe.com/forum/threads/{thread_id}/?page=1"
        try:
            print(f"Fetching thread {thread_id}...")
            # Without a timeout a stalled connection would block the run forever
            response = requests.get(thread_url, timeout=request_timeout)

            if response.status_code != 200:
                print(f"Error {response.status_code} fetching {thread_url}")
                continue

            soup = BeautifulSoup(response.text, 'html.parser')
            articles = soup.find_all('article', class_='message--post')

            if not articles:
                print(f"No posts found for thread {thread_id}. Skipping.")
                continue

            post_count = len(articles)
            print(f"Found {post_count} posts in thread {thread_id}")

            # Extract thread title (text before the first '|' of the <title> tag)
            title_element = soup.find('title')
            thread_title = title_element.get_text().split('|')[0].strip() if title_element else "No Title"

            # Extract first post (from message body class)
            first_post_element = soup.find('article', class_='message-body')
            first_post = first_post_element.get_text(strip=True) if first_post_element else "No content"

            # Save thread-level metadata
            decorated_data.append({
                'thread_id': thread_id,
                'thread_title': thread_title,
                'thread_first_post': first_post
            })

            # Loop through all articles/posts in the thread
            for article in articles:
                timestamp_element = article.find('time')
                content_element = article.find('div', class_='bbWrapper')

                post_data.append({
                    'thread_id': thread_id,
                    'post_timestamp': timestamp_element['datetime'] if timestamp_element else "N/A",
                    'post_raw': content_element.get_text(strip=True) if content_element else "No content"
                })

        except Exception as e:
            print(f"Error fetching thread {thread_id}: {e}")
        finally:
            # Throttle after EVERY request, including the error/skip paths,
            # so a run of failing thread IDs does not hammer the site.
            time.sleep(delay_seconds)

    # Save post-level data to file
    if post_data:
        new_posts_df = pd.DataFrame(post_data)
        combined_posts = pd.concat([existing_posts, new_posts_df], ignore_index=True)
        combined_posts.to_csv(posts_file, index=False)
        print(f"Saved {len(new_posts_df)} new posts. Total posts: {len(combined_posts)}")

    # Save thread-level metadata to file
    if decorated_data:
        new_decorated_df = pd.DataFrame(decorated_data)
        combined_decorated = pd.concat([existing_decorated, new_decorated_df], ignore_index=True)
        combined_decorated.to_csv(decorated_file, index=False)
        print(f"Saved {len(new_decorated_df)} new decorated threads. Total threads: {len(combined_decorated)}")


def _append_batch_to_main_corpus(batch_corpus, main_corpus_file):
    """
    Fold a batch corpus into the persistent main corpus file.

    Returns the combined corpus when new threads were added and saved;
    in every other case (main file just created, nothing new, any error)
    the batch corpus itself is returned.
    """
    if not os.path.exists(main_corpus_file):
        # No main corpus yet: the batch becomes the main corpus
        try:
            batch_corpus.to_csv(main_corpus_file, index=False)
            print(f"Created new main corpus with {len(batch_corpus)} threads")
        except Exception as e:
            print(f"Error creating main corpus: {e}")
            print("Continuing with batch file only.")
        return batch_corpus

    try:
        # Use safe reading for main corpus
        main_corpus = read_csv_safely(main_corpus_file)
        print(f"Loaded existing main corpus with {len(main_corpus)} threads")

        # Only threads not yet present in the main corpus are appended
        known_ids = set(main_corpus['thread_id'].tolist())
        additions = batch_corpus[~batch_corpus['thread_id'].isin(known_ids)]
        if additions.empty:
            print("No new threads to add to main corpus")
            return batch_corpus

        merged_corpus = pd.concat([main_corpus, additions], ignore_index=True)
    except Exception as e:
        print(f"Error updating main corpus: {e}")
        print("Continuing with batch file only.")
        return batch_corpus

    try:
        merged_corpus.to_csv(main_corpus_file, index=False)
        print(f"Added {len(additions)} new threads to main corpus. Total: {len(merged_corpus)}")
        return merged_corpus
    except Exception as e:
        print(f"Error saving combined corpus: {e}")
        print("Continuing with batch file only.")
        return batch_corpus


def create_forum_corpus(base_path: str, posts_filename: str = 'e9_forum_posts.csv',
                       decorated_filename: str = 'e9_forum_threads_decorated.csv',
                       corpus_filename: str = 'e9_forum_corpus.csv',
                       append_to_main_corpus: bool = True):
    """
    Build a per-thread text corpus by joining post text onto thread metadata.

    Args:
        base_path: Directory containing data files
        posts_filename: File containing individual posts
        decorated_filename: File containing thread metadata
        corpus_filename: Output file for the batch corpus
        append_to_main_corpus: Whether to also fold the batch into the
            persistent main corpus ('e9_forum_corpus.csv')

    Returns:
        The combined main corpus when new threads were appended and saved,
        otherwise the batch corpus; an empty DataFrame if an input file is
        missing.
    """
    posts_file, decorated_file, corpus_file, main_corpus_file = (
        os.path.join(base_path, name)
        for name in (posts_filename, decorated_filename, corpus_filename, 'e9_forum_corpus.csv')
    )

    # Guard: both input files must be present
    missing_inputs = [path for path in (posts_file, decorated_file) if not os.path.exists(path)]
    if missing_inputs:
        print("ERROR: Required input files not found. Cannot create corpus.")
        for path in missing_inputs:
            print(f"Missing: {path}")
        return pd.DataFrame()

    # Load input files safely
    print(f"Reading posts from {posts_file}")
    posts_df = read_csv_safely(posts_file)
    print(f"Reading thread metadata from {decorated_file}")
    decorated_df = read_csv_safely(decorated_file)

    distinct_threads = posts_df['thread_id'].nunique()
    print(f"Found {len(posts_df)} posts across {distinct_threads} threads")
    print(f"Found {len(decorated_df)} threads with metadata")

    # One row per thread: all non-null post texts joined with single spaces
    print("Aggregating posts by thread ID...")

    def _join_posts(posts):
        return ' '.join(str(post) for post in posts if pd.notna(post))

    aggregated = (
        posts_df.groupby('thread_id')['post_raw']
        .agg(_join_posts)
        .reset_index()
        .rename(columns={'post_raw': 'thread_all_posts'})
    )

    # Both merge keys must share the same integer dtype
    for frame in (decorated_df, aggregated):
        frame['thread_id'] = frame['thread_id'].astype('int64')

    # Keep only threads that have both metadata and posts
    common_thread_ids = set(decorated_df['thread_id']).intersection(aggregated['thread_id'])
    print(f"Found {len(common_thread_ids)} threads with both metadata and posts")

    metadata_rows = decorated_df[decorated_df['thread_id'].isin(common_thread_ids)]
    text_rows = aggregated[aggregated['thread_id'].isin(common_thread_ids)]

    # Merge into a single corpus and persist the batch file
    batch_corpus = metadata_rows.merge(text_rows, on='thread_id', how='inner')
    print(f"Created corpus with {len(batch_corpus)} threads")
    batch_corpus.to_csv(corpus_file, index=False)
    print(f"Saved batch corpus to {corpus_file}")

    if not append_to_main_corpus:
        return batch_corpus
    return _append_batch_to_main_corpus(batch_corpus, main_corpus_file)

def update_local_corpus(base_path: str, threads_to_add: int = 5, corpus_filename: str = 'e9_forum_corpus_batch.csv'):
    """
    Run one fetch-and-merge cycle: allocate thread IDs, download them,
    then rebuild the batch corpus and fold it into the main corpus.

    Args:
        base_path: Directory to store all data files
        threads_to_add: Number of new threads to fetch
        corpus_filename: Filename for the current batch

    Returns:
        DataFrame returned by create_forum_corpus for this cycle
    """
    print("\n=== Starting Local Forum Corpus Update ===\n")
    os.makedirs(base_path, exist_ok=True)

    # Allocate the next IDs and download exactly those threads
    fetch_full_thread_data(create_urls(base_path, threads=threads_to_add), base_path)

    # Rebuild the corpus from everything stored so far
    updated_corpus = create_forum_corpus(
        base_path,
        corpus_filename=corpus_filename,
        append_to_main_corpus=True,
    )

    print("\n=== Local Forum Corpus Update Complete ===\n")
    return updated_corpus


def create_corpus_backup(base_path: str, corpus_filename: str = 'e9_forum_corpus.csv'):
    """
    Copy the corpus CSV into '<base_path>/backups' under a timestamped name.

    Prints a message and does nothing else when the corpus file is absent.

    Args:
        base_path: Directory of corpus file
        corpus_filename: Filename to back up
    """
    import shutil

    source_path = os.path.join(base_path, corpus_filename)
    if os.path.exists(source_path):
        backups_root = os.path.join(base_path, 'backups')
        os.makedirs(backups_root, exist_ok=True)

        # Backup name: <original stem>_<YYYYmmdd_HHMMSS>.csv
        stem, _extension = os.path.splitext(corpus_filename)
        stamp = f"{datetime.now():%Y%m%d_%H%M%S}"
        destination = os.path.join(backups_root, f"{stem}_{stamp}.csv")

        shutil.copy2(source_path, destination)
        print(f"Created backup: {destination}")
    else:
        print(f"Corpus file not found: {source_path}")


def save_corpus_to_json(base_path: str, corpus_filename: str = 'e9_forum_corpus.csv'):
    """
    Converts the CSV corpus into a JSON file for use in downstream applications.

    The JSON file is written next to the CSV with the same stem and a
    '.json' extension, as a list of one object per corpus row. Missing CSV
    values are written as JSON ``null``.

    Args:
        base_path: Directory containing the CSV
        corpus_filename: Input CSV to convert
    """
    corpus_path = os.path.join(base_path, corpus_filename)
    if not os.path.exists(corpus_path):
        print(f"Corpus file not found: {corpus_path}")
        return

    corpus_df = pd.read_csv(corpus_path)
    if corpus_df.empty:
        print("No data to save.")
        return

    json_filename = f"{os.path.splitext(corpus_filename)[0]}.json"
    json_path = os.path.join(base_path, json_filename)

    # pandas represents empty cells as NaN, which json.dump would emit as the
    # bare token NaN (not valid JSON). Map missing values to None -> null.
    records = [
        {column: (None if pd.isna(value) else value) for column, value in row.items()}
        for row in corpus_df.to_dict(orient='records')
    ]
    with open(json_path, 'w', encoding='utf-8') as f:
        json.dump(records, f, indent=2, ensure_ascii=False)

    print(f"Saved corpus to JSON: {json_path} ({len(records)} threads)")


def fetch_forum_data_in_batches(base_path: str, num_batches: int = 2, threads_per_batch: int = 10):
    """
    Run several fetch batches back to back, then export the main corpus to JSON.

    An existing main corpus is backed up before the first batch runs.

    Args:
        base_path: Directory to store all data
        num_batches: Number of batches to run
        threads_per_batch: Threads to fetch in each batch

    Returns:
        The main corpus as read back from disk, or an empty DataFrame when
        no main corpus file exists after the run
    """
    os.makedirs(base_path, exist_ok=True)

    main_corpus_path = os.path.join(base_path, 'e9_forum_corpus.csv')
    if os.path.exists(main_corpus_path):
        create_corpus_backup(base_path)

    print(f"\n=== Starting Forum Data Fetching: {num_batches} batches, {threads_per_batch} threads per batch ===\n")

    # Batches are numbered from 1 in both the log output and the file names
    for batch_index in range(1, num_batches + 1):
        print(f"\n=== Processing Batch {batch_index}/{num_batches} ===\n")
        update_local_corpus(
            base_path,
            threads_to_add=threads_per_batch,
            corpus_filename=f"e9_forum_corpus_batch_{batch_index}.csv",
        )

    save_corpus_to_json(base_path)

    if not os.path.exists(main_corpus_path):
        print("\n=== Forum Data Fetching Complete, but no corpus was created ===\n")
        return pd.DataFrame()

    final_corpus = pd.read_csv(main_corpus_path)
    print(f"\n=== Forum Data Fetching Complete: {len(final_corpus)} total threads in corpus ===\n")
    return final_corpus


#Orchestration

In [3]:
# Orchestration Fetch and Storage


import threading  # stdlib; used to serialize access to the shared CSV files

NUM_BATCHES = 1
THREADS_PER_BATCH = 10
MAX_WORKERS = 3

# Every batch reads, appends to and rewrites the SAME files under BASE_PATH
# (thread-ID list, posts, decorated threads, main corpus). Running two batches
# at once would hand out duplicate thread IDs and overwrite each other's rows,
# so the file-touching part of a batch is executed under this lock.
_corpus_files_lock = threading.Lock()


def process_batch(base_path, threads_to_add, filename):
    """Fetch one batch and save it locally; log and re-raise on failure."""
    try:
        with _corpus_files_lock:
            forum_corpus_df = update_local_corpus(base_path, threads_to_add=threads_to_add, corpus_filename=filename)
        print(f"Batch {filename} completed successfully")
        return forum_corpus_df
    except Exception as e:
        print(f"ERROR processing batch {filename}: {e}")
        raise


def create_callback(filename):
    """Build a done-callback that reports success or failure for `filename`."""
    def handle_batch_result(fut):
        try:
            fut.result()  # This will raise any exception that occurred during execution
            print(f"Processing completed for {filename}")
        except Exception as e:
            print(f"PROCESSING FAILED for {filename}: {e}")
    return handle_batch_result


futures = []

# The context manager waits for all submitted batches and always shuts the
# pool down, even if submitting a batch raises.
with concurrent.futures.ThreadPoolExecutor(max_workers=MAX_WORKERS) as executor:
    # Process each batch
    for batch_num in range(NUM_BATCHES):
        print(f"\n=== Starting batch {batch_num + 1} ===\n")

        # Generate batch filename
        batch_filename = f"e9_forum_corpus_batch_{batch_num + 1}.csv"

        # Submit the batch processing task and attach result reporting
        future = executor.submit(process_batch, BASE_PATH, THREADS_PER_BATCH, batch_filename)
        future.add_done_callback(create_callback(batch_filename))
        futures.append(future)

print("\n=== All scraping and local saving complete ===\n")


=== Starting batch 1 ===


=== Starting Local Forum Corpus Update ===

Existing thread_ids found. Last thread_id: 15400
Added 10 new thread_ids. Ending at 15410
Fetching data for 10 new threads...
Fetching thread 15401...
Found 9 posts in thread 15401
Fetching thread 15402...
Found 6 posts in thread 15402
Fetching thread 15403...
Found 5 posts in thread 15403
Fetching thread 15404...
Found 12 posts in thread 15404
Fetching thread 15405...
Found 4 posts in thread 15405
Fetching thread 15406...
Found 1 posts in thread 15406
Fetching thread 15407...
Found 15 posts in thread 15407
Fetching thread 15408...
Found 1 posts in thread 15408
Fetching thread 15409...
Found 1 posts in thread 15409
Fetching thread 15410...
Found 14 posts in thread 15410
Saved 68 new posts. Total posts: 950
Saved 10 new decorated threads. Total threads: 143
Reading posts from /content/drive/Othercomputers/My Mac/CSCI_104/Week_Project/Datasets/e9_forum_posts.csv
Reading thread metadata from /content/drive/Othercomput

# Check files are correctly synched

In [4]:
# File synchronization

def initialize_local_corpus(base_path):
    """
    Initialize the local corpus file structure.

    - Ensures the 'backups' directory exists.
    - If the main corpus file exists, backs it up (the file itself is left
      untouched); otherwise creates a header-only corpus file.
    - Creates a header-only thread ID file if none exists.

    Args:
        base_path: Directory holding the corpus files

    Returns:
        True once the structure is in place
    """
    print("\n=== Initializing Local Corpus ===\n")

    # Paths to files
    corpus_file = os.path.join(base_path, 'e9_forum_corpus.csv')
    thread_id_file = os.path.join(base_path, 'e9_forum_thread_ids.csv')

    # Create backup directory
    backup_dir = os.path.join(base_path, 'backups')
    os.makedirs(backup_dir, exist_ok=True)

    # Create or backup existing corpus file
    if os.path.exists(corpus_file):
        # Create backup
        create_corpus_backup(base_path)
        print(f"Backed up existing corpus file")
    else:
        # Header must match the columns create_forum_corpus writes; otherwise
        # the first append leaves extra, permanently empty columns behind.
        empty_corpus = pd.DataFrame(columns=[
            'thread_id', 'thread_title', 'thread_first_post', 'thread_all_posts',
        ])
        empty_corpus.to_csv(corpus_file, index=False)
        print(f"Created new empty corpus file at {corpus_file}")

    # Initialize thread ID file if it doesn't exist
    if not os.path.exists(thread_id_file):
        empty_thread_ids = pd.DataFrame({'thread_id': []})
        empty_thread_ids.to_csv(thread_id_file, index=False)
        print(f"Created new empty thread ID file at {thread_id_file}")

    return True

def sync_thread_ids_with_corpus(base_path: str):
    """
    Synchronize the thread_id tracking file with the main corpus file.

    The tracking file is rewritten to contain every ID from 1 up to the
    highest thread ID found in the corpus (not only the IDs present in the
    corpus). The previous tracking file is copied to 'backups' first.

    Args:
        base_path: Directory holding the corpus and tracking files

    Returns:
        True when the tracking file was (re)written, False when the corpus
        is missing, holds no usable thread IDs, or an error occurred
    """
    import shutil

    print("\n=== Syncing thread ID tracking file with main corpus ===\n")

    # Paths to files
    thread_id_file = os.path.join(base_path, 'e9_forum_thread_ids.csv')
    main_corpus_file = os.path.join(base_path, 'e9_forum_corpus.csv')

    if not os.path.exists(main_corpus_file):
        print("Main corpus file not found. Cannot sync thread IDs.")
        return False

    try:
        # Read the main corpus
        corpus_df = read_csv_safely(main_corpus_file)

        if len(corpus_df) == 0:
            print("Corpus file is empty. Creating empty thread ID file.")
            empty_thread_ids = pd.DataFrame({'thread_id': []})
            empty_thread_ids.to_csv(thread_id_file, index=False)
            return True

        # Get the highest thread ID. The column may arrive as float (blank
        # rows) or as strings (all-strings fallback read), so coerce first;
        # range() below needs a real int.
        thread_id_col = 'thread_id' if 'thread_id' in corpus_df.columns else 'THREAD_ID'
        numeric_ids = pd.to_numeric(corpus_df[thread_id_col], errors='coerce').dropna()
        if numeric_ids.empty:
            print("No valid thread IDs found in corpus. Cannot sync thread IDs.")
            return False
        highest_id = int(numeric_ids.max())

        # Create a new thread ID tracking file with all IDs from 1 to highest
        all_thread_ids = pd.DataFrame({'thread_id': range(1, highest_id + 1)})

        # Backup the existing file if it exists
        if os.path.exists(thread_id_file):
            backup_dir = os.path.join(base_path, 'backups')
            os.makedirs(backup_dir, exist_ok=True)
            timestamp = datetime.now().strftime('%Y%m%d_%H%M%S')
            backup_file = os.path.join(backup_dir, f"e9_forum_thread_ids_{timestamp}.csv")
            shutil.copy2(thread_id_file, backup_file)
            print(f"Backed up thread ID file to {backup_file}")

        # Save the new thread ID file
        all_thread_ids.to_csv(thread_id_file, index=False)
        print(f"Updated thread ID file with {len(all_thread_ids)} IDs (1 to {highest_id})")

        return True

    except Exception as e:
        print(f"Error syncing thread IDs: {e}")
        return False

# Initialize the local corpus structure if needed
# (backs up an existing corpus file; otherwise creates empty corpus and thread ID files)
initialize_local_corpus(BASE_PATH)

# Then sync the thread ID file with the local corpus
# (rewrites the tracking file with every ID from 1 up to the highest thread ID in the corpus)
sync_thread_ids_with_corpus(BASE_PATH)


=== Initializing Local Corpus ===

Created backup: /content/drive/Othercomputers/My Mac/CSCI_104/Week_Project/Datasets/backups/e9_forum_corpus_20250514_013520.csv
Backed up existing corpus file

=== Syncing thread ID tracking file with main corpus ===

Backed up thread ID file to /content/drive/Othercomputers/My Mac/CSCI_104/Week_Project/Datasets/backups/e9_forum_thread_ids_20250514_013522.csv
Updated thread ID file with 15410 IDs (1 to 15410)


True

# Export to HTML

In [5]:
# Export notebook as HTML

from nbconvert import HTMLExporter
import nbformat
import codecs
import os
import copy

notebook_path = '/content/drive/Othercomputers/My Mac/CSCI_104/Week_Project/Notebooks/LLM_RAG_ELGASDAVID_Corpus.ipynb'
html_path = '/content/drive/Othercomputers/My Mac/CSCI_104/Week_Project/Notebooks/LLM_RAG_ELGASDAVID_Corpus.html'


def _strip_widget_metadata(notebook):
    """Delete 'widgets' metadata from the notebook, its cells and code-cell outputs (in place)."""
    if 'widgets' in notebook.get('metadata', {}):
        del notebook['metadata']['widgets']

    for cell in notebook.cells:
        if 'metadata' in cell and 'widgets' in cell['metadata']:
            del cell['metadata']['widgets']

        # Also clean outputs
        if cell.get('cell_type') == 'code' and 'outputs' in cell:
            for output in cell['outputs']:
                if 'metadata' in output and 'widgets' in output['metadata']:
                    del output['metadata']['widgets']

    return notebook


def _render_notebook_html(exporter, source_path):
    """Read the notebook from disk, strip widget metadata, and return (html, resources)."""
    # The notebook is re-read on every call, so the in-place cleanup never
    # touches an object that is reused elsewhere.
    with open(source_path, 'r', encoding='utf-8') as notebook_file:
        notebook = nbformat.read(notebook_file, as_version=4)
    return exporter.from_notebook_node(_strip_widget_metadata(notebook))


def _write_html(target_path, html_data):
    """Write the rendered HTML as UTF-8."""
    with open(target_path, 'w', encoding='utf-8') as f:
        f.write(html_data)


# Verify the file exists
if not os.path.exists(notebook_path):
    print(f"Error: File not found at {notebook_path}")
else:
    # Primary attempt: 'classic' template with images embedded in the HTML
    html_exporter = HTMLExporter()
    html_exporter.embed_images = True
    html_exporter.template_name = 'classic'

    # Keep the In[ ]/Out[ ] prompts in the exported page
    html_exporter.exclude_input_prompt = False
    html_exporter.exclude_output_prompt = False

    try:
        html_data, resources = _render_notebook_html(html_exporter, notebook_path)

        # Check if there are resources to embed
        if resources and 'outputs' in resources:
            print(f"Found {len(resources['outputs'])} resources to embed")

        _write_html(html_path, html_data)
        print(f"HTML file with embedded resources saved to {html_path}")
    except Exception as e:
        print(f"Error during conversion: {e}")

        # Fallback to basic template
        try:
            print("Attempting fallback method with basic template...")
            html_exporter = HTMLExporter(template_name='basic')
            html_exporter.embed_images = True  # Still try to embed images in fallback

            html_data, resources = _render_notebook_html(html_exporter, notebook_path)
            _write_html(html_path, html_data)

            print(f"Fallback method: HTML file saved to {html_path}")
        except Exception as e2:
            print(f"Fallback method also failed: {e2}")

HTML file with embedded resources saved to /content/drive/Othercomputers/My Mac/CSCI_104/Week_Project/Notebooks/LLM_RAG_ELGASDAVID_Corpus.html
