<a href="https://colab.research.google.com/github/davidelgas/DataSciencePortfolio/blob/main/Language_Models/LLM_RAG/LLM_RAG_Semantic.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# === 1. Mount Google Drive ===
from google.colab import drive
drive.mount('/content/drive')


# === 2. Install Required Packages ===
!pip install -q sentence-transformers faiss-cpu pandas openai --upgrade
!pip install snowflake
!pip install hf_xet  # Add this line to silence warnings and improve performance


# === 3. RAG Pipeline ===
import snowflake.connector
import os
import pandas as pd
from sentence_transformers import SentenceTransformer
import faiss
from openai import OpenAI
import pickle


# === 4. Configuration ===
# Plain-text file on the mounted Drive that holds nothing but the OpenAI API key.
api_key_path = "/content/drive/Othercomputers/My Mac/CSCI_104/credentials/openaikey.txt"

# Load the key, trimming surrounding whitespace such as a trailing newline.
with open(api_key_path, mode='r') as key_file:
    openai_api_key = key_file.read().strip()

# Single client instance reused by both chat-completion calls further down.
client = OpenAI(api_key=openai_api_key)

# === 5. Load Corpus from Snowflake ===
# Reads Snowflake credentials from Drive, pulls every forum thread from
# E9_FORUM_CORPUS, and materialises the result as the DataFrame `df`.
import snowflake.connector

# Load credentials from a text file (format: KEY=VALUE per line)
sf_creds_path = '/content/drive/Othercomputers/My Mac/Git/credentials/snowflake_credentials.txt'

sf_env = {}
with open(sf_creds_path, 'r') as f:
    for line in f:
        # Lines without '=' (blank lines, stray text) are skipped. Only the
        # first '=' splits, so a value may itself contain '='.
        if '=' in line:
            key, value = line.strip().split('=', 1)
            sf_env[key.strip()] = value.strip()

# Fail early with a message that names the file and the absent keys, instead
# of a bare KeyError raised from inside the connect() call.
missing_keys = [name for name in ('USER', 'PASSWORD', 'ACCOUNT') if name not in sf_env]
if missing_keys:
    raise KeyError(
        f"Missing Snowflake credential(s) in {sf_creds_path}: {', '.join(missing_keys)}"
    )

# Connect to Snowflake
conn = snowflake.connector.connect(
    user=sf_env['USER'],
    password=sf_env['PASSWORD'],
    account=sf_env['ACCOUNT'],
    database='E9_CORPUS',
    schema='E9_CORPUS_SCHEMA',
    warehouse='COMPUTE_WH'  # or use your default warehouse
)

# The rows are fetched eagerly, so nothing below needs the live session.
# Close the cursor and the connection even if the query fails, so the
# Snowflake session is not left open.
try:
    cur = conn.cursor()
    try:
        cur.execute("SELECT THREAD_ID, THREAD_TITLE, THREAD_FIRST_POST, THREAD_ALL_POSTS FROM E9_CORPUS.E9_CORPUS_SCHEMA.E9_FORUM_CORPUS")
        rows = cur.fetchall()
    finally:
        cur.close()
finally:
    conn.close()

# Convert to DataFrame (column order matches the SELECT list above)
df = pd.DataFrame(rows, columns=['thread_id', 'thread_title', 'thread_first_post', 'thread_all_posts'])
print(f"Loaded {len(df)} threads from Snowflake.")

# Build one searchable document per thread: title, first post, and all posts,
# separated by blank lines. Missing (null) fields contribute an empty string.
section_break = "\n\n"
title_part = df["thread_title"].fillna("")
first_post_part = df["thread_first_post"].fillna("")
all_posts_part = df["thread_all_posts"].fillna("")

df["full_text"] = title_part + section_break + first_post_part + section_break + all_posts_part


# === 6. Embed Corpus ===
# One embedding row per thread, in the same order as the rows of `df`, so a
# FAISS row number can be used directly as a positional index into `df`.
# NOTE(review): the all-MiniLM-L6-v2 model card describes truncating long
# inputs — confirm that embedding only the start of each thread is intended.
model = SentenceTransformer("all-MiniLM-L6-v2")
thread_documents = df["full_text"].tolist()
corpus_embeddings = model.encode(thread_documents, show_progress_bar=True)

# === 7. Create FAISS Index ===
# Flat (exhaustive) L2 index sized to the embedding width.
dimension = corpus_embeddings.shape[1]
index = faiss.IndexFlatL2(dimension)
index.add(corpus_embeddings)

# === 8. Ask a Question ===
question = "How do I remove the steering wheel in an E9?"
# encode() receives a one-element list, so the result is a single-row batch.
question_embedding = model.encode([question])

# === 9. Retrieve Top Matches + Distances ===
# Look up the `top_k` nearest threads; a smaller L2 distance is a closer match.
top_k = 3
distances, indices = index.search(question_embedding, top_k)

retrieved_texts = []
print("\n=== Retrieved Threads with Distances ===\n")
for i, (idx, dist) in enumerate(zip(indices[0], distances[0])):
    # FAISS pads the result with label -1 when the index holds fewer than
    # `top_k` vectors. df.iloc[-1] would silently return the LAST thread as a
    # bogus hit, so padded slots are skipped.
    if idx < 0:
        continue
    row = df.iloc[idx]  # positional lookup: FAISS ids are row numbers of df
    title = row["thread_title"]
    print(f"[{i+1}] Distance: {dist:.4f} | Title: {title}")
    retrieved_texts.append(row["full_text"])

# === 10. Format Prompt with Context ===
# Number each retrieved thread (1-based) and separate them with blank lines.
thread_sections = []
for position, thread_text in enumerate(retrieved_texts, start=1):
    thread_sections.append(f"Thread {position}:\n{thread_text}")
context = "\n\n".join(thread_sections)

rag_prompt = f"""You are an expert on BMW E9 maintenance. Use the following forum threads to answer the question.

{context}

Question: {question}
Answer:"""

# Model and sampling settings shared by both requests, so the only difference
# between the two answers is whether the retrieved context is in the prompt.
chat_settings = {"model": "gpt-3.5-turbo", "temperature": 0.2}

# === 11a. Generate Answer WITH context ===
rag_messages = [{"role": "user", "content": rag_prompt}]
response_with_context = client.chat.completions.create(
    messages=rag_messages,
    **chat_settings,
)

# === 11b. Generate Answer WITHOUT context (baseline) ===
baseline_prompt = f"""You are an expert on BMW E9 maintenance.

Question: {question}
Answer:"""

baseline_messages = [{"role": "user", "content": baseline_prompt}]
response_without_context = client.chat.completions.create(
    messages=baseline_messages,
    **chat_settings,
)

# === 12. Export FAISS Index and Thread Data ===
print("\n=== Exporting FAISS Index and Thread Data ===\n")

# Destination folder on the mounted Drive; created if it is not there yet.
output_dir = "/content/drive/Othercomputers/My Mac/Git/Language_Models/streamlit_rag/"
os.makedirs(output_dir, exist_ok=True)

# Artifact locations. The index and pickle filenames are the exact ones the
# Streamlit app expects.
faiss_path = os.path.join(output_dir, "bmw_e9_index.faiss")
df_path = os.path.join(output_dir, "bmw_e9_threads.pkl")
sample_csv_path = os.path.join(output_dir, "bmw_e9_sample.csv")

# 1) The vector index.
faiss.write_index(index, faiss_path)
print(f"FAISS index exported to: {faiss_path}")

# 2) The full thread DataFrame, pickled.
with open(df_path, "wb") as pickle_file:
    pickle.dump(df, pickle_file)
print(f"Thread data exported to: {df_path}")

# 3) A small CSV for easy inspection: ids and titles of the first 100 rows.
sample_columns = ['thread_id', 'thread_title']
df[sample_columns].head(100).to_csv(sample_csv_path, index=False)
print(f"Sample data exported to: {sample_csv_path}")

print("\nExport complete! Files are now saved to the Git repository directory.")
for label, path in (
    ("FAISS index", faiss_path),
    ("Thread data", df_path),
    ("Sample CSV", sample_csv_path),
):
    print(f"{label}: {path}")

# Verify the two artifacts the app needs exist on disk, and report their sizes.
required_files = (faiss_path, df_path)
if not all(os.path.exists(path) for path in required_files):
    print("\n ERROR: Files were not created successfully")
else:
    print(f"\n SUCCESS: Files successfully created in {output_dir}")
    for label, path in (("FAISS index", faiss_path), ("Thread data", df_path)):
        size_mb = os.path.getsize(path) / 1024 / 1024
        print(f"{label} size: {size_mb:.2f} MB")

# === 13. Output both answers ===
# Print the context-grounded answer first, then the no-context baseline, each
# under its own banner so the two can be compared side by side.
for banner, response in (
    ("\n=== ANSWER WITH RAG CONTEXT ===\n", response_with_context),
    ("\n=== BASELINE ANSWER (No RAG) ===\n", response_without_context),
):
    print(banner)
    print(response.choices[0].message.content)