In [5]:
from datasets import load_dataset
from dotenv import load_dotenv

# Pull settings from a local .env file into the process environment.
load_dotenv()

# Download (or reuse the cached copy of) the labeled research-abstracts
# configuration of the human-vs-machine dataset.
dataset = load_dataset(
    path="NicolaiSivesind/human-vs-machine",
    name="research_abstracts_labeled",
)


In [8]:
import psycopg2
from dotenv import load_dotenv
import os

# Load environment variables from .env file
load_dotenv()

# Retrieve PostgreSQL credentials from .env
DB_NAME = os.getenv("DB_NAME")
DB_USER = os.getenv("DB_USER")
DB_PASSWORD = os.getenv("DB_PASSWORD")
DB_HOST = os.getenv("DB_HOST")
DB_PORT = os.getenv("DB_PORT")

# Connect to PostgreSQL
conn = psycopg2.connect(
    dbname=DB_NAME,
    user=DB_USER,
    password=DB_PASSWORD,
    host=DB_HOST,
    port=DB_PORT
)
print("✅ Successfully connected to PostgreSQL!")

# Open exactly one cursor on the connection (the original opened two and
# orphaned the first).
cursor = conn.cursor()

# Create the table if it does not exist.
# `text` is UNIQUE so each abstract is stored at most once; `embedding` holds
# the vector serialized as a comma-separated string.
cursor.execute('''
    CREATE TABLE IF NOT EXISTS text_embeddings (
        id SERIAL PRIMARY KEY,
        text TEXT UNIQUE,
        label INTEGER,
        embedding TEXT
    );
''')

conn.commit()
print("✅ Table 'text_embeddings' created successfully.")


✅ Successfully connected to PostgreSQL!
✅ Table 'text_embeddings' created successfully.


In [14]:
import torch
import ollama

def get_existing_embedding(text):
    """
    Look up a previously stored embedding for `text` in PostgreSQL.

    Uses the module-level `cursor` to query the `text_embeddings` table.

    Returns:
        A float32 tensor parsed from the stored comma-separated string when a
        row for `text` exists, otherwise None.
    """
    lookup_sql = "SELECT embedding FROM text_embeddings WHERE text = %s"
    cursor.execute(lookup_sql, (text,))
    row = cursor.fetchone()

    # No matching row: nothing has been stored for this text yet.
    if not row:
        return None

    # The single selected column holds the vector as "v1,v2,...".
    values = [float(piece) for piece in row[0].split(",")]
    return torch.tensor(values, dtype=torch.float32)

def get_embedding(text, label):
    """
    Return the embedding for `text`, generating and storing it if necessary.

    The database is consulted first via `get_existing_embedding`; a stored
    embedding is returned as-is. Otherwise the embedding is requested from
    Ollama (model "mxbai-embed-large"), inserted into `text_embeddings`
    together with `label`, committed, and returned.

    Args:
        text: The text to embed; also the lookup key in the table.
        label: The label value stored alongside the embedding.

    Returns:
        A float32 tensor with the embedding, or None if generating or storing
        the embedding raised an exception (the error is printed).
    """
    existing_embedding = get_existing_embedding(text)
    if existing_embedding is not None:
        return existing_embedding  # Reuse stored embedding

    try:
        embedding_data = ollama.embeddings(model="mxbai-embed-large", prompt=text)
        embedding_tensor = torch.tensor(embedding_data["embedding"], dtype=torch.float32)

        # Convert tensor to string format for PostgreSQL storage
        embedding_str = ",".join(map(str, embedding_tensor.tolist()))

        # Insert the new embedding into the database
        cursor.execute('''
            INSERT INTO text_embeddings (text, label, embedding) 
            VALUES (%s, %s, %s)
        ''', (text, label, embedding_str))
        conn.commit()

        return embedding_tensor
    except Exception as e:
        # A failed statement leaves the psycopg2 transaction in an aborted
        # state; without a rollback every later query on this connection
        # (including the lookup at the top of this function) would raise.
        conn.rollback()
        print(f"❌ Error generating embedding: {e}")
        return None


In [10]:
import pandas as pd

# Materialize the training split as a DataFrame and pull out the two columns
# needed for embedding generation.
df = pd.DataFrame(dataset['train'])
texts = df['text'].tolist()
labels = df['label'].tolist()

total_samples = len(texts)
failed_count = 0  # samples for which get_embedding returned None

for idx, (text, label) in enumerate(zip(texts, labels), start=1):
    embedding = get_embedding(text, label)

    # get_embedding signals failure by returning None; count these so the
    # final message does not claim success when embeddings are missing.
    if embedding is None:
        failed_count += 1

    # Print progress every 100 samples
    if idx % 100 == 0 or idx == total_samples:
        percent_done = (idx / total_samples) * 100
        print(f"✅ Processed {idx}/{total_samples} ({percent_done:.2f}%)")

if failed_count:
    # Already-stored texts are skipped by get_embedding, so re-running this
    # cell only retries the missing ones.
    print(f"⚠️ {failed_count}/{total_samples} embeddings could not be generated or saved; re-run this cell to retry them.")
else:
    print("✅ All embeddings saved successfully!")


✅ Processed 100/14000 (0.71%)
✅ Processed 200/14000 (1.43%)
✅ Processed 300/14000 (2.14%)
✅ Processed 400/14000 (2.86%)
✅ Processed 500/14000 (3.57%)
✅ Processed 600/14000 (4.29%)
✅ Processed 700/14000 (5.00%)
✅ Processed 800/14000 (5.71%)
✅ Processed 900/14000 (6.43%)
✅ Processed 1000/14000 (7.14%)
✅ Processed 1100/14000 (7.86%)
✅ Processed 1200/14000 (8.57%)
✅ Processed 1300/14000 (9.29%)
✅ Processed 1400/14000 (10.00%)
✅ Processed 1500/14000 (10.71%)
✅ Processed 1600/14000 (11.43%)
✅ Processed 1700/14000 (12.14%)
✅ Processed 1800/14000 (12.86%)
✅ Processed 1900/14000 (13.57%)
✅ Processed 2000/14000 (14.29%)
✅ Processed 2100/14000 (15.00%)
✅ Processed 2200/14000 (15.71%)
✅ Processed 2300/14000 (16.43%)
✅ Processed 2400/14000 (17.14%)
✅ Processed 2500/14000 (17.86%)
✅ Processed 2600/14000 (18.57%)
✅ Processed 2700/14000 (19.29%)
✅ Processed 2800/14000 (20.00%)
✅ Processed 2900/14000 (20.71%)
✅ Processed 3000/14000 (21.43%)
✅ Processed 3100/14000 (22.14%)
✅ Processed 3200/14000 (22.86%

In [21]:
import torch.nn as nn
import torch.optim as optim

class SparseAutoencoder(nn.Module):
    """
    Single-hidden-layer autoencoder: Linear -> LeakyReLU -> Linear -> output activation.

    Note that this module applies no sparsity penalty itself; any sparsity
    regularization has to be added to the loss by the training code.

    Args:
        input_dim: Size of each input (and reconstructed) vector.
        hidden_dim: Size of the encoded representation.
        negative_slope: Slope LeakyReLU applies to negative pre-activations.
            Defaults to 0.01.
        output_activation: Module applied after the decoder's linear layer.
            Defaults to ``nn.Sigmoid()``, which bounds every reconstructed
            value to (0, 1); targets with components outside that range
            (e.g. negative values) cannot be reconstructed exactly with it.
            Pass ``nn.Identity()`` for an unbounded output.
    """

    def __init__(self, input_dim, hidden_dim, negative_slope=0.01, output_activation=None):
        super().__init__()
        if output_activation is None:
            output_activation = nn.Sigmoid()  # default output activation

        self.encoder = nn.Sequential(
            nn.Linear(input_dim, hidden_dim),
            # Negative pre-activations are scaled by negative_slope, not zeroed.
            nn.LeakyReLU(negative_slope=negative_slope),
        )
        # The activation stays at index 1 of the Sequential so state_dict keys
        # ("decoder.0.weight", "decoder.0.bias") are the same for any choice.
        self.decoder = nn.Sequential(
            nn.Linear(hidden_dim, input_dim),
            output_activation,
        )

    def forward(self, x):
        """Return ``(decoded, encoded)`` for input ``x``."""
        encoded = self.encoder(x)
        decoded = self.decoder(encoded)
        return decoded, encoded  # Return both


In [22]:
# Open a fresh connection using the credentials loaded earlier.
conn = psycopg2.connect(
    dbname=DB_NAME,
    user=DB_USER,
    password=DB_PASSWORD,
    host=DB_HOST,
    port=DB_PORT
)
cursor = conn.cursor()

# Retrieve embeddings from PostgreSQL.
# ORDER BY id pins the row order: without it PostgreSQL may return rows in any
# order, which would make the seeded train/validation split downstream
# non-reproducible between runs.
cursor.execute("SELECT embedding, label FROM text_embeddings ORDER BY id")
rows = cursor.fetchall()
print(f"✅ Retrieved {len(rows)} embeddings from PostgreSQL.")

# torch.stack fails on an empty list with an unhelpful message; fail early
# with a clear one instead.
if not rows:
    raise RuntimeError("No rows found in 'text_embeddings'; generate the embeddings before loading them.")

# Convert embeddings back into tensors
embeddings, labels = [], []

for embedding_str, label in rows:
    embedding_list = list(map(float, embedding_str.split(",")))  # Convert string to list
    embeddings.append(torch.tensor(embedding_list, dtype=torch.float32))
    labels.append(label)

# Stack embeddings into a single (num_rows, embedding_dim) tensor
residual_embeddings = torch.stack(embeddings)
labels_tensor = torch.tensor(labels, dtype=torch.float32)

print(f"✅ Loaded {len(residual_embeddings)} embeddings from PostgreSQL.")
print(f"✅ Residual embeddings shape: {residual_embeddings.shape}")


✅ Retrieved 14000 embeddings from PostgreSQL.
✅ Loaded 14000 embeddings from PostgreSQL.
✅ Residual embeddings shape: torch.Size([14000, 1024])


In [23]:
# Train on the GPU when one is available, otherwise fall back to the CPU.
device = "cuda" if torch.cuda.is_available() else "cpu"

# Initialize model
input_dim = residual_embeddings.shape[1]  # Embedding dimension
hidden_dim = 64  # Compressed representation
model = SparseAutoencoder(input_dim, hidden_dim).to(device)

# Move the data to the training device once, before the loop; the transfer is
# loop-invariant and does not need to be repeated every epoch.
residual_embeddings = residual_embeddings.to(device)

# Define optimizer and loss function
optimizer = optim.Adam(model.parameters(), lr=1e-3)
loss_fn = nn.MSELoss()

# Training loop: one full-batch gradient step per epoch
epochs = 100
for epoch in range(epochs):
    optimizer.zero_grad()
    reconstructed, encoded = model(residual_embeddings)

    loss = loss_fn(reconstructed, residual_embeddings)
    loss.backward()
    optimizer.step()

    print(f"Epoch [{epoch+1}/{epochs}], Loss: {loss.item():.6f}")


Epoch [1/100], Loss: 0.543013
Epoch [2/100], Loss: 0.535490
Epoch [3/100], Loss: 0.526721
Epoch [4/100], Loss: 0.515521
Epoch [5/100], Loss: 0.502123
Epoch [6/100], Loss: 0.486873
Epoch [7/100], Loss: 0.470118
Epoch [8/100], Loss: 0.452234
Epoch [9/100], Loss: 0.433662
Epoch [10/100], Loss: 0.414844
Epoch [11/100], Loss: 0.396184
Epoch [12/100], Loss: 0.378023
Epoch [13/100], Loss: 0.360641
Epoch [14/100], Loss: 0.344251
Epoch [15/100], Loss: 0.329009
Epoch [16/100], Loss: 0.315015
Epoch [17/100], Loss: 0.302327
Epoch [18/100], Loss: 0.290963
Epoch [19/100], Loss: 0.280894
Epoch [20/100], Loss: 0.272066
Epoch [21/100], Loss: 0.264403
Epoch [22/100], Loss: 0.257806
Epoch [23/100], Loss: 0.252159
Epoch [24/100], Loss: 0.247341
Epoch [25/100], Loss: 0.243246
Epoch [26/100], Loss: 0.239788
Epoch [27/100], Loss: 0.236889
Epoch [28/100], Loss: 0.234474
Epoch [29/100], Loss: 0.232468
Epoch [30/100], Loss: 0.230807
Epoch [31/100], Loss: 0.229432
Epoch [32/100], Loss: 0.228299
Epoch [33/100], L

In [25]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the embeddings for validation; the fixed seed keeps the
# partition stable for a given input order.
split_parts = train_test_split(residual_embeddings, test_size=0.2, random_state=42)

# Place both partitions on the GPU.
train_embeddings, val_embeddings = (part.to("cuda") for part in split_parts)

n_train, n_val = len(train_embeddings), len(val_embeddings)
print(f"✅ Training samples: {n_train}, Validation samples: {n_val}")


✅ Training samples: 11200, Validation samples: 2800


In [None]:
# Put the model on whatever device the training data already lives on, so the
# two can never disagree.
device = train_embeddings.device

# Initialize model
input_dim = residual_embeddings.shape[1]  # Embedding dimension
hidden_dim = 64  # Compressed representation
model = SparseAutoencoder(input_dim, hidden_dim).to(device)

# Define optimizer and loss function
optimizer = optim.Adam(model.parameters(), lr=1e-3, weight_decay=1e-5)  # Add weight decay for regularization
loss_fn = nn.MSELoss()

# Training loop with validation loss
epochs = 10000
best_val_loss = float("inf")  # Track best validation loss as a plain float

for epoch in range(epochs):
    model.train()  # Set to training mode
    optimizer.zero_grad()

    reconstructed, _ = model(train_embeddings)
    train_loss = loss_fn(reconstructed, train_embeddings)

    train_loss.backward()
    optimizer.step()

    # Compute validation loss
    model.eval()  # Set to evaluation mode
    with torch.no_grad():
        val_reconstructed, _ = model(val_embeddings)
        val_loss = loss_fn(val_reconstructed, val_embeddings)

    # Read the scalar once; keeping best_val_loss a float avoids mixing a
    # Python float with a device tensor across iterations.
    val_loss_value = val_loss.item()

    # Save best model based on validation loss
    if val_loss_value < best_val_loss:
        best_val_loss = val_loss_value
        # Report the 1-based epoch so it matches the progress line below.
        print(f"Saving best model with validation loss: {best_val_loss:.6f} .. {epoch+1}")
        torch.save(model.state_dict(), "best_sparse_autoencoder.pth")

    print(f"Epoch [{epoch+1}/{epochs}] - Train Loss: {train_loss.item():.6f} | Validation Loss: {val_loss_value:.6f}")

print("✅ Training completed. Best model saved!")


Saving best model with validation loss: 0.536983
Epoch [1/10000] - Train Loss: 0.543122 | Validation Loss: 0.536983
Saving best model with validation loss: 0.529981
Epoch [2/10000] - Train Loss: 0.536917 | Validation Loss: 0.529981
Saving best model with validation loss: 0.520905
Epoch [3/10000] - Train Loss: 0.529883 | Validation Loss: 0.520905
Saving best model with validation loss: 0.509896
Epoch [4/10000] - Train Loss: 0.520775 | Validation Loss: 0.509896
Saving best model with validation loss: 0.497228
Epoch [5/10000] - Train Loss: 0.509736 | Validation Loss: 0.497228
Saving best model with validation loss: 0.483191
Epoch [6/10000] - Train Loss: 0.497039 | Validation Loss: 0.483191
Saving best model with validation loss: 0.468089
Epoch [7/10000] - Train Loss: 0.482972 | Validation Loss: 0.468089
Saving best model with validation loss: 0.452243
Epoch [8/10000] - Train Loss: 0.467840 | Validation Loss: 0.452243
Saving best model with validation loss: 0.435952
Epoch [9/10000] - Train

In [27]:
import torch
# Define the filename
# Destination file for the weights of the model as it stands right now.
MODEL_PATH = "sparse_autoencoder.pth"

# Persist only the parameters (state_dict), not the full module object.
current_weights = model.state_dict()
torch.save(current_weights, MODEL_PATH)

print(f"✅ Model saved successfully at {MODEL_PATH}!")


✅ Model saved successfully at sparse_autoencoder.pth!
