In [None]:
from google.colab import drive
drive.mount('/content/drive')

!pip install -q sentence-transformers faiss-cpu pandas openai --upgrade
!pip install snowflake
!pip install hf_xet

import snowflake.connector
import os
import pandas as pd
from sentence_transformers import SentenceTransformer
import faiss
from openai import OpenAI
import pickle

api_key_path = "/content/drive/Othercomputers/My Mac/CSCI_104/credentials/openaikey.txt"
sf_creds_path = '/content/drive/Othercomputers/My Mac/Git/credentials/snowflake_credentials.txt'

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# End-to-end RAG demo: load the forum corpus from Snowflake, embed it, build a
# FAISS index, retrieve the closest threads for a sample question, persist the
# index and thread data to Drive, then compare an LLM answer with and without
# the retrieved context.

# === Read OpenAI API key from file ===
with open(api_key_path, 'r') as file:
    openai_api_key = file.read().strip()

# Create OpenAI client
client = OpenAI(api_key=openai_api_key)

# === Load Snowflake credentials ===
# The file is parsed as KEY=VALUE lines; lines without '=' are ignored and only
# the first '=' splits, so values may themselves contain '='.
sf_env = {}
with open(sf_creds_path, 'r') as f:
    for line in f:
        if '=' in line:
            key, value = line.strip().split('=', 1)
            sf_env[key.strip()] = value.strip()

# Fail early with a readable message instead of a bare KeyError inside connect().
missing_keys = [k for k in ('USER', 'PASSWORD', 'ACCOUNT') if k not in sf_env]
if missing_keys:
    raise KeyError(f"Missing Snowflake credential(s) in {sf_creds_path}: {missing_keys}")

# === Connect to Snowflake ===
conn = snowflake.connector.connect(
    user=sf_env['USER'],
    password=sf_env['PASSWORD'],
    account=sf_env['ACCOUNT'],
    database='E9_CORPUS',
    schema='E9_CORPUS_SCHEMA',
    warehouse='COMPUTE_WH'
)

# === Fetch Forum Data ===
# The cursor and connection are only needed for this one query, so release
# them as soon as the rows are in memory, even if the query raises.
try:
    cur = conn.cursor()
    try:
        cur.execute("""
    SELECT THREAD_ID, THREAD_TITLE, THREAD_FIRST_POST, THREAD_ALL_POSTS
    FROM E9_CORPUS.E9_CORPUS_SCHEMA.E9_FORUM_CORPUS
""")
        rows = cur.fetchall()
    finally:
        cur.close()
finally:
    conn.close()

df = pd.DataFrame(rows, columns=['thread_id', 'thread_title', 'thread_first_post', 'thread_all_posts'])
print(f"Loaded {len(df)} threads from Snowflake.")

# An empty corpus would otherwise fail later with an opaque IndexError on
# corpus_embeddings.shape[1].
if df.empty:
    raise ValueError("No threads returned from Snowflake; nothing to embed.")

# === Prepare Combined Text ===
# Title, first post and all posts are concatenated into one document per
# thread; missing fields become empty strings.
df["full_text"] = (
    df["thread_title"].fillna("") + "\n\n" +
    df["thread_first_post"].fillna("") + "\n\n" +
    df["thread_all_posts"].fillna("")
)

# === Embed Corpus ===
model = SentenceTransformer("all-MiniLM-L6-v2")
corpus_embeddings = model.encode(df["full_text"].tolist(), show_progress_bar=True)

# === Create FAISS Index ===
# Flat (exact) L2 index; row i of the index corresponds to row i of df.
dimension = corpus_embeddings.shape[1]
index = faiss.IndexFlatL2(dimension)
index.add(corpus_embeddings)

# === Ask a Sample Question ===
question = "How do I remove the steering wheel in an E9?"
question_embedding = model.encode([question])

top_k = 3
distances, indices = index.search(question_embedding, top_k)

print("\nRetrieved top threads:\n")
retrieved_texts = []
for i, (idx, dist) in enumerate(zip(indices[0], distances[0])):
    # FAISS pads the result with -1 when the index holds fewer than top_k
    # vectors; df.iloc[-1] would silently return the last thread instead.
    if idx < 0:
        continue
    title = df.iloc[idx]["thread_title"]
    print(f"[{i+1}] {title} (Distance: {dist:.4f})")
    retrieved_texts.append(df.iloc[idx]["full_text"])

# === Export FAISS Index and Data ===
# Persist before calling the LLM so a failed API call does not throw away the
# embedding work done above.
print("\n[Step] Exporting FAISS index and thread data to disk...")

output_dir = '/content/drive/Othercomputers/My Mac/CSCI_104/Week_Project/App/'
os.makedirs(output_dir, exist_ok=True)

faiss_path = os.path.join(output_dir, "bmw_e9_index.faiss")
df_path = os.path.join(output_dir, "bmw_e9_threads.pkl")
sample_csv_path = os.path.join(output_dir, "bmw_e9_sample.csv")

# Save index and data
faiss.write_index(index, faiss_path)
with open(df_path, "wb") as f:
    pickle.dump(df, f)
# Small human-readable sample (ids and titles of the first 100 threads).
df[['thread_id', 'thread_title']].head(100).to_csv(sample_csv_path, index=False)

# === Report Export Status ===
if os.path.exists(faiss_path) and os.path.exists(df_path):
    faiss_size = os.path.getsize(faiss_path) / 1024 / 1024
    pkl_size = os.path.getsize(df_path) / 1024 / 1024

    print(f"[Success] Files exported to:\n  {output_dir}")
    print(f"  - FAISS index:  {os.path.basename(faiss_path)} ({faiss_size:.2f} MB)")
    print(f"  - Thread data:  {os.path.basename(df_path)} ({pkl_size:.2f} MB)")
else:
    print("[Error] Export failed. One or more files were not created.")

# === Build Prompt for RAG ===
context = "\n\n".join([f"Thread {i+1}:\n{text}" for i, text in enumerate(retrieved_texts)])
rag_prompt = f"""You are an expert on BMW E9 maintenance. Use the following forum threads to answer the question.

{context}

Question: {question}
Answer:"""

# Same question without retrieved context, used as the comparison baseline.
baseline_prompt = f"""You are an expert on BMW E9 maintenance.

Question: {question}
Answer:"""

# === Get LLM Responses ===
response_with_context = client.chat.completions.create(
    model="gpt-3.5-turbo",
    messages=[{"role": "user", "content": rag_prompt}],
    temperature=0.2
)

response_without_context = client.chat.completions.create(
    model="gpt-3.5-turbo",
    messages=[{"role": "user", "content": baseline_prompt}],
    temperature=0.2
)

# === Show LLM Answers ===
print("\n=== ANSWER WITH RAG CONTEXT ===\n")
print(response_with_context.choices[0].message.content)

print("\n=== BASELINE ANSWER (No RAG) ===\n")
print(response_without_context.choices[0].message.content)


Loaded 15047 threads from Snowflake.


Batches:   0%|          | 0/471 [00:00<?, ?it/s]

## Export

In [None]:
from nbconvert import HTMLExporter
import nbformat
import codecs
import os
import copy

notebook_path = '/content/drive/Othercomputers/My Mac/CSCI_104/Project/Notebooks/LLM_RAG_ELGASDAVID_RAG.ipynb'
html_path = '/content/drive/Othercomputers/My Mac/CSCI_104/Project/Notebooks/LLM_RAG_ELGASDAVID_RAG.html'


def _strip_widget_metadata(notebook):
    """Return a deep copy of ``notebook`` with all 'widgets' metadata removed.

    The 'widgets' key is dropped from the notebook-level metadata, from every
    cell's metadata, and from the metadata of every output of code cells.
    The notebook passed in is left untouched.
    """
    cleaned = copy.deepcopy(notebook)
    cleaned.get('metadata', {}).pop('widgets', None)
    for cell in cleaned.cells:
        cell.get('metadata', {}).pop('widgets', None)
        if cell.get('cell_type') == 'code':
            for output in cell.get('outputs', []):
                output.get('metadata', {}).pop('widgets', None)
    return cleaned


def _export_notebook_to_html(source_path, target_path, template_name):
    """Convert the notebook at ``source_path`` to HTML at ``target_path``.

    Args:
        source_path: Path of the .ipynb file to read (parsed as nbformat v4).
        target_path: Path of the HTML file to write (UTF-8).
        template_name: nbconvert template to render with.

    Returns:
        The ``resources`` mapping returned by the exporter.

    Raises:
        Whatever reading, converting or writing raises; the caller decides
        whether to fall back to another template.
    """
    exporter = HTMLExporter(template_name=template_name)
    # Inline images into the HTML so the exported file is self-contained.
    exporter.embed_images = True
    # Keep the In[ ]/Out[ ] prompts in the rendered page.
    exporter.exclude_input_prompt = False
    exporter.exclude_output_prompt = False

    with open(source_path, 'r', encoding='utf-8') as notebook_file:
        notebook = nbformat.read(notebook_file, as_version=4)

    html_data, resources = exporter.from_notebook_node(_strip_widget_metadata(notebook))

    with codecs.open(target_path, 'w', encoding='utf-8') as f:
        f.write(html_data)
    return resources


# Verify the file exists
if not os.path.exists(notebook_path):
    print(f"Error: File not found at {notebook_path}")
else:
    try:
        # First attempt: the 'classic' template.
        resources = _export_notebook_to_html(notebook_path, html_path, 'classic')

        # Check if there are resources to embed
        if resources and 'outputs' in resources:
            print(f"Found {len(resources['outputs'])} resources to embed")

        print(f"HTML file with embedded resources saved to {html_path}")
    except Exception as e:
        print(f"Error during conversion: {e}")

        # Fallback to basic template; the notebook is re-read from disk inside
        # the helper, so a partially processed copy is never reused.
        try:
            print("Attempting fallback method with basic template...")
            _export_notebook_to_html(notebook_path, html_path, 'basic')
            print(f"Fallback method: HTML file saved to {html_path}")
        except Exception as e2:
            print(f"Fallback method also failed: {e2}")