# Create Environment

In [1]:
# Mount Google Drive at /content/drive; the dataset and notebook paths used
# later in this file all live under that mount point.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# E9 Forum Corpus t-SNE Visualization - Using corrected columns

import pandas as pd
import os
import tensorflow as tf
from tensorboard.plugins import projector
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import numpy as np
import re
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans
import shutil

# Fetch the NLTK data packages used by this notebook, quietly, one by one
for nltk_resource in ('punkt', 'stopwords', 'wordnet'):
    nltk.download(nltk_resource, quiet=True)
print("NLTK resources downloaded/verified")

# Read the single main CSV; engine='python' with on_bad_lines='skip' drops
# malformed rows instead of aborting the whole load
print("Loading CSV file...")
data_path = '/content/drive/Othercomputers/My Mac/Git/Language_Models/datasets/e9/e9_forum_corpus.csv'
df = pd.read_csv(data_path, engine='python', on_bad_lines='skip')
print(f"Successfully loaded {len(df)} rows from {data_path}")

# Show which columns were actually parsed
print("\nActual column names in the CSV file:")
print(df.columns.tolist())

# Report, column by column, whether the expected schema is present
# (this only prints; it does not stop execution when a column is missing)
expected_columns = ['thread_id', 'thread_title', 'thread_first_post', 'thread_all_posts']
available_columns = set(df.columns)
for col in expected_columns:
    if col not in available_columns:
        print(f"Warning: Expected column '{col}' not found in data")
        continue
    print(f"Found expected column: {col}")

# Basic preprocessing
print("\nPreprocessing text...")
lemmatizer = WordNetLemmatizer()

# Corpus-specific terms and URL fragments to drop in addition to the
# standard English stop-word list
domain_stop_words = {
    'e9', 'bmw', 'car', 'cars', 'coupe', 'csi', 'cs', 'csl', 'http', 'https',
    'www', 'com', 'ebay', 'post', 'thread', 'forum', 'html',
}
stop_words = set(stopwords.words('english')) | domain_stop_words

def preprocess_text(text):
    """Clean, tokenize and lemmatize a piece of forum text.

    Args:
        text: Raw text. Anything that is not a ``str`` (e.g. NaN from a
            missing CSV cell) is treated as an empty document.

    Returns:
        A list of lemmatized, lower-cased tokens. Tokens of two characters
        or fewer are dropped, as are tokens found in the module-level
        ``stop_words`` set, either as written or after lemmatization.
    """
    if not isinstance(text, str):
        return []

    # Clean text
    text = re.sub(r'https?://\S+|www\.\S+', '', text)  # Remove URLs
    text = re.sub(r'<.*?>', '', text)  # Remove HTML tags
    text = re.sub(r'[^\w\s]', ' ', text.lower())  # Remove special chars
    text = re.sub(r'\d+', '', text)  # Remove numbers
    text = re.sub(r'\s+', ' ', text).strip()  # Remove extra whitespace

    # Tokenize and lemmatize
    tokens = []
    for word in text.split():
        if len(word) <= 2 or word in stop_words:
            continue
        lemma = lemmatizer.lemmatize(word)
        # Re-check after lemmatizing: inflected forms such as "threads" or
        # "coupes" pass the first filter but reduce to a stop word.
        if lemma in stop_words:
            continue
        tokens.append(lemma)
    return tokens

# Create combined text field for better embeddings
print("Creating combined text field...")
# Title + first post; missing cells become '' so the concatenation never yields NaN
df['combined_text'] = df['thread_title'].fillna('') + ' ' + df['thread_first_post'].fillna('')
df['processed'] = df['combined_text'].apply(preprocess_text)

# Remove empty documents
empty_docs = df['processed'].apply(len) == 0
if empty_docs.any():
    print(f"Removing {empty_docs.sum()} documents with empty processed text")
    # .copy() makes df an independent frame, so the column assignments made
    # further down do not write into a slice (avoids SettingWithCopyWarning)
    df = df[~empty_docs].copy()

print(f"Final dataset size: {len(df)} documents")

# Create TF-IDF embeddings
print("\nCreating TF-IDF document embeddings...")
# Re-join each token list into one space-separated string for the vectorizer
df['processed_text'] = df['processed'].map(' '.join)
# Vocabulary cap: 3000 terms, or the document count when that is smaller
max_features = min(len(df), 3000)
vectorizer = TfidfVectorizer(max_features=max_features)
sparse_tfidf = vectorizer.fit_transform(df['processed_text'])
tfidf_matrix = sparse_tfidf.toarray()  # dense array, one row per document
print(f"TF-IDF matrix shape: {tfidf_matrix.shape}")

# Cluster for visualization
print("\nClustering documents...")
# KMeans cannot fit more clusters than there are samples, so cap the
# requested 10 clusters at the number of documents
n_clusters = min(10, tfidf_matrix.shape[0])
kmeans = KMeans(n_clusters=n_clusters, random_state=42, n_init=10)
clusters = kmeans.fit_predict(tfidf_matrix)
# One cluster label per row, in the same order as df
df['cluster'] = clusters

# Set up TensorBoard directory: wipe any previous run, then recreate it empty
log_dir = './tensorboard_logs_e9/'
if os.path.exists(log_dir):
    shutil.rmtree(log_dir)
os.makedirs(log_dir)

# Save embeddings as a TensorFlow variable inside a checkpoint under log_dir
print("\nSaving document embeddings for TensorBoard...")
embeddings_var = tf.Variable(tfidf_matrix, name='document_embeddings')
checkpoint_prefix = os.path.join(log_dir, "embedding.ckpt")
checkpoint = tf.train.Checkpoint(embedding=embeddings_var)
checkpoint.save(checkpoint_prefix)

# Create metadata file (one row per document, in the same order as the embeddings)
metadata_path = os.path.join(log_dir, 'metadata.tsv')
with open(metadata_path, 'w', encoding='utf-8') as f:
    # Header row naming the two metadata columns
    f.write('Title\tCluster\n')
    # Walk titles and cluster labels in lockstep; both follow df's row order
    for title, cluster_id in zip(df['thread_title'], clusters):
        # Truncate, then neutralise characters that would break the TSV layout
        # (tabs split columns; CR/LF split rows)
        safe_title = (
            str(title)[:150]
            .replace('\t', ' ')
            .replace('\r', ' ')
            .replace('\n', ' ')
        )
        f.write(f"{safe_title}\t{cluster_id}\n")

# Configure TensorBoard Projector
config = projector.ProjectorConfig()
embedding = config.embeddings.add()
# The leading "embedding" matches the keyword used when the checkpoint was
# created above: tf.train.Checkpoint(embedding=embeddings_var)
embedding.tensor_name = "embedding/.ATTRIBUTES/VARIABLE_VALUE"
# File name only; metadata.tsv was written into log_dir above
embedding.metadata_path = 'metadata.tsv'
projector.visualize_embeddings(log_dir, config)

print(f"\nDocument embeddings configuration saved to {log_dir}")

# Load TensorBoard (best effort: failure here is reported, not fatal)
try:
    # get_ipython is only defined inside IPython/Jupyter; elsewhere the call
    # raises NameError, which is handled below
    get_ipython().run_line_magic('load_ext', 'tensorboard')
    print("\nTensorBoard extension loaded")
except Exception:
    # Exception rather than a bare except, so KeyboardInterrupt and
    # SystemExit still propagate
    print("\nCould not load TensorBoard extension automatically")

# Print instructions: one entry per output line, emitted in order
separator = "=" * 80
instruction_lines = (
    "\n" + separator,
    "VISUALIZATION INSTRUCTIONS:",
    separator,
    "\nTo visualize in TensorBoard, run this command in your notebook:",
    "%load_ext tensorboard",
    f"%tensorboard --logdir {log_dir}",
    "\nIn TensorBoard Projector, select t-SNE visualization from the left panel",
    "Adjust t-SNE parameters as needed:",
    "- Perplexity: 5-50 (default 30)",
    "- Learning rate: 10-1000 (default 100)",
    "- Iterations: 1000+ for better results",
)
for instruction in instruction_lines:
    print(instruction)

# Visualization

In [None]:
# Load the TensorBoard notebook extension and open it on the log directory
# that the previous cell wrote (./tensorboard_logs_e9/)
%load_ext tensorboard
%tensorboard --logdir ./tensorboard_logs_e9/

#Export to HTML

In [None]:
from nbconvert import HTMLExporter
import nbformat
import codecs
import os
import copy

notebook_path = '/content/drive/Othercomputers/My Mac/CSCI_104/Week_Project/Notebooks/LLM_RAG_ELGASDAVID_TSNE.ipynb'
html_path = '/content/drive/Othercomputers/My Mac/CSCI_104/Week_Project/Notebooks/LLM_RAG_ELGASDAVID_TSNE.html'


def _strip_widget_metadata(notebook):
    """Remove every 'widgets' metadata entry from *notebook*, in place.

    Covers notebook-level metadata, each cell's metadata, and the metadata of
    each output of code cells.
    """
    notebook.get('metadata', {}).pop('widgets', None)
    for cell in notebook.cells:
        cell.get('metadata', {}).pop('widgets', None)
        if cell.get('cell_type') == 'code':
            for output in cell.get('outputs', []):
                output.get('metadata', {}).pop('widgets', None)


def _render_notebook_html(source_path, exporter):
    """Read the notebook at *source_path*, clean it, and render it with *exporter*.

    Returns the ``(html_data, resources)`` pair produced by
    ``exporter.from_notebook_node``. The notebook is read fresh from disk on
    every call, so cleaning it in place does not affect any other object.
    """
    with open(source_path, 'r', encoding='utf-8') as notebook_file:
        notebook = nbformat.read(notebook_file, as_version=4)
    _strip_widget_metadata(notebook)
    return exporter.from_notebook_node(notebook)


# Verify the file exists
if not os.path.exists(notebook_path):
    print(f"Error: File not found at {notebook_path}")
else:
    # Primary attempt: 'classic' template with images embedded and the
    # input/output prompts kept visible
    html_exporter = HTMLExporter(template_name='classic')
    html_exporter.embed_images = True
    html_exporter.exclude_input_prompt = False
    html_exporter.exclude_output_prompt = False

    try:
        html_data, resources = _render_notebook_html(notebook_path, html_exporter)

        # Check if there are resources to embed
        if resources and 'outputs' in resources:
            print(f"Found {len(resources['outputs'])} resources to embed")

        # Write the HTML file
        with open(html_path, 'w', encoding='utf-8') as f:
            f.write(html_data)

        print(f"HTML file with embedded resources saved to {html_path}")
    except Exception as e:
        # Deliberately broad: any conversion failure triggers the fallback
        print(f"Error during conversion: {e}")

        # Fallback to basic template
        try:
            print("Attempting fallback method with basic template...")
            html_exporter = HTMLExporter(template_name='basic')
            html_exporter.embed_images = True  # Still try to embed images in fallback

            html_data, resources = _render_notebook_html(notebook_path, html_exporter)

            with open(html_path, 'w', encoding='utf-8') as f:
                f.write(html_data)

            print(f"Fallback method: HTML file saved to {html_path}")
        except Exception as e2:
            print(f"Fallback method also failed: {e2}")