In [1]:
import torch
import psycopg2
from dotenv import load_dotenv
import os
load_dotenv()

conn = psycopg2.connect(
    dbname=os.getenv("DB_NAME"),
    user=os.getenv("DB_USER"),
    password=os.getenv("DB_PASSWORD"),
    host=os.getenv("DB_HOST"),
    port=os.getenv("DB_PORT")
)
cursor = conn.cursor()
cursor = conn.cursor()

# Retrieve embeddings from PostgreSQL
cursor.execute("SELECT embedding, label FROM text_embeddings")
rows = cursor.fetchall()
print(f"âœ… Retrieved {len(rows)} embeddings from PostgreSQL.")

# Convert embeddings back into tensors
embeddings, labels = [], []

for embedding_str, label in rows:
    embedding_list = list(map(float, embedding_str.split(",")))  # Convert string to list
    embeddings.append(torch.tensor(embedding_list, dtype=torch.float32))
    labels.append(label)

# Stack embeddings into a single tensor
residual_embeddings = torch.stack(embeddings)
labels_tensor = torch.tensor(labels, dtype=torch.float32)

print(f"âœ… Loaded {len(residual_embeddings)} embeddings from PostgreSQL.")
print(f"âœ… Residual embeddings shape: {residual_embeddings.shape}")


âœ… Retrieved 14000 embeddings from PostgreSQL.
âœ… Loaded 14000 embeddings from PostgreSQL.
âœ… Residual embeddings shape: torch.Size([14000, 1024])


In [2]:
from sklearn.model_selection import train_test_split
import torch
import torch.nn as nn
import torch.optim as optim

# Split into 80% train and 20% validation
train_embeddings, val_embeddings, train_labels, val_labels = train_test_split(
    residual_embeddings, labels_tensor, test_size=0.2, random_state=42
)

# Move to GPU
train_embeddings, val_embeddings = train_embeddings.to("cuda"), val_embeddings.to("cuda")

# Define SAE model
class SparseAutoencoder(nn.Module):
    def __init__(self, input_dim, hidden_dim):
        super(SparseAutoencoder, self).__init__()
        self.encoder = nn.Sequential(
            nn.Linear(input_dim, hidden_dim),
            nn.ReLU()
        )
        self.decoder = nn.Sequential(
            nn.Linear(hidden_dim, input_dim),
            nn.Sigmoid()
        )

    def forward(self, x):
        encoded = self.encoder(x)
        decoded = self.decoder(encoded)
        return decoded, encoded

# Initialize SAE
input_dim = residual_embeddings.shape[1]  # Embedding size
hidden_dim = 128  # Feature extraction size
sae = SparseAutoencoder(input_dim, hidden_dim).to("cuda")

# Define optimizer and loss function
optimizer = optim.Adam(sae.parameters(), lr=1e-3, weight_decay=1e-5)
loss_fn = nn.MSELoss()

# Define L1 regularization (sparsity constraint)
l1_lambda = 0.0001  # Adjust this value based on tuning

# Train SAE
epochs = 500
for epoch in range(epochs):
    sae.train()
    optimizer.zero_grad()
    
    reconstructed, encoded = sae(train_embeddings)
    train_loss = loss_fn(reconstructed, train_embeddings)
    
    # Add L1 penalty
    l1_penalty = l1_lambda * torch.norm(encoded, 1)  
    total_loss = train_loss + l1_penalty  


    train_loss.backward()
    optimizer.step()

    # Compute validation loss
    sae.eval()
    with torch.no_grad():
        val_reconstructed, val_encoded = sae(val_embeddings)
        val_loss = loss_fn(val_reconstructed, val_embeddings)

    print(f"Epoch [{epoch+1}/{epochs}] - Train Loss: {train_loss.item():.6f} | Val Loss: {val_loss.item():.6f}")

print("âœ… SAE training completed!")


Epoch [1/500] - Train Loss: 0.542243 | Val Loss: 0.530349
Epoch [2/500] - Train Loss: 0.530320 | Val Loss: 0.514546
Epoch [3/500] - Train Loss: 0.514485 | Val Loss: 0.492822
Epoch [4/500] - Train Loss: 0.492728 | Val Loss: 0.466045
Epoch [5/500] - Train Loss: 0.465920 | Val Loss: 0.435853
Epoch [6/500] - Train Loss: 0.435699 | Val Loss: 0.404141
Epoch [7/500] - Train Loss: 0.403957 | Val Loss: 0.372778
Epoch [8/500] - Train Loss: 0.372566 | Val Loss: 0.343405
Epoch [9/500] - Train Loss: 0.343175 | Val Loss: 0.317249
Epoch [10/500] - Train Loss: 0.317004 | Val Loss: 0.295005
Epoch [11/500] - Train Loss: 0.294749 | Val Loss: 0.276895
Epoch [12/500] - Train Loss: 0.276625 | Val Loss: 0.262737
Epoch [13/500] - Train Loss: 0.262454 | Val Loss: 0.252053
Epoch [14/500] - Train Loss: 0.251760 | Val Loss: 0.244254
Epoch [15/500] - Train Loss: 0.243952 | Val Loss: 0.238718
Epoch [16/500] - Train Loss: 0.238407 | Val Loss: 0.234817
Epoch [17/500] - Train Loss: 0.234497 | Val Loss: 0.232048
Epoch 

In [3]:
# Extract features from train and validation sets
sae.eval()
with torch.no_grad():
    train_features = sae.encoder(train_embeddings).cpu().numpy()
    val_features = sae.encoder(val_embeddings).cpu().numpy()

# Convert labels to NumPy arrays
train_labels_np = train_labels.cpu().numpy()
val_labels_np = val_labels.cpu().numpy()

print(f"âœ… Extracted features shape: {train_features.shape}")
print(f"âœ… Train labels shape: {train_labels_np.shape}")

âœ… Extracted features shape: (11200, 128)
âœ… Train labels shape: (11200,)


In [4]:
import xgboost as xgb
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# Train XGBoost classifier
xgb_model = xgb.XGBClassifier(
objective="binary:logistic", 
    eval_metric="logloss", 
    use_label_encoder=False,
    max_depth=7,  # Increase depth
    learning_rate=0.01,  # Lower learning rate for better optimization
    n_estimators=500  # More trees for improved accuracy
)

xgb_model.fit(train_features, train_labels_np)

# Predict on validation set
val_preds = xgb_model.predict(val_features)

# Compute evaluation metrics
accuracy = accuracy_score(val_labels_np, val_preds)
precision = precision_score(val_labels_np, val_preds)
recall = recall_score(val_labels_np, val_preds)
f1 = f1_score(val_labels_np, val_preds)

print("\nðŸ“Š XGBoost Classification Performance:")
print(f"âœ… Accuracy: {accuracy:.4f}")
print(f"âœ… Precision: {precision:.4f}")
print(f"âœ… Recall: {recall:.4f}")
print(f"âœ… F1 Score: {f1:.4f}")
print("\nâœ… XGBoost model training completed!")

Parameters: { "use_label_encoder" } are not used.




ðŸ“Š XGBoost Classification Performance:
âœ… Accuracy: 0.7704
âœ… Precision: 0.7694
âœ… Recall: 0.7786
âœ… F1 Score: 0.7740

âœ… XGBoost model training completed!


In [5]:
import joblib

# Save SAE encoder
torch.save(sae.state_dict(), "sparse_autoencoder.pth")

# Save XGBoost model
joblib.dump(xgb_model, "xgboost_classifier.pkl")

print("âœ… Models saved successfully!")


âœ… Models saved successfully!
