In [1]:
import sys
from pathlib import Path

# Add project root to Python path
project_root = Path.cwd().parent
sys.path.insert(0, str(project_root))

import numpy as np
from src.models import DeepLogModel, DeepLogTrainer, create_data_loaders, evaluate_model, print_metrics

from src.utils.data_loader import create_train_val_test_split, filter_normal_samples

In [2]:
# Define paths
DATA_DIR = '../data/hdfs/preprocessed'

In [3]:
# Load HDFS data
hdfs_data = np.load(f'{DATA_DIR}/HDFS.npz', allow_pickle=True)
X = hdfs_data['x_data']  # Sequences [575061, variable_length]
y = hdfs_data['y_data']  # Labels [575061]

unique_events = set()
for seq in X:
    unique_events.update(seq)

# Create event vocabulary from training data
event_to_id = {event: idx+1 for idx, event in enumerate(sorted(unique_events))}
event_to_id['<PAD>'] = 0  # Padding token
vocab_size = len(event_to_id)
print(f"Vocab size: {vocab_size}")
print(f"Events: {sorted(unique_events)[:10]}")  # See first 10

Vocab size: 30
Events: ['E1', 'E10', 'E11', 'E12', 'E13', 'E14', 'E15', 'E16', 'E17', 'E18']


In [4]:
#  Split data (70/15/15)
splits = create_train_val_test_split(X, y, train_ratio=0.7, val_ratio=0.15, random_state=42)
(X_train, y_train), (X_val, y_val), (X_test, y_test) = splits['train'], splits['val'], splits['test']
X_train, y_train = filter_normal_samples(X_train, y_train, verbose=True)

INFO:src.utils.data_loader:Splitting data: train=0.7, val=0.15, test=0.15
INFO:src.utils.data_loader:Split complete:
INFO:src.utils.data_loader:  - Train: 402542 samples (11786 anomalies)
INFO:src.utils.data_loader:  - Val:   86259 samples (2526 anomalies)
INFO:src.utils.data_loader:  - Test:  86260 samples (2526 anomalies)
INFO:src.utils.data_loader:FILTERING TRAINING DATA FOR SEMI-SUPERVISED LEARNING
INFO:src.utils.data_loader:Original training size: 402542 samples
INFO:src.utils.data_loader:  Normal samples: 390,756 (97.07%)
INFO:src.utils.data_loader:Split complete:
INFO:src.utils.data_loader:  - Train: 402542 samples (11786 anomalies)
INFO:src.utils.data_loader:  - Val:   86259 samples (2526 anomalies)
INFO:src.utils.data_loader:  - Test:  86260 samples (2526 anomalies)
INFO:src.utils.data_loader:FILTERING TRAINING DATA FOR SEMI-SUPERVISED LEARNING
INFO:src.utils.data_loader:Original training size: 402542 samples
INFO:src.utils.data_loader:  Normal samples: 390,756 (97.07%)
INFO:s

In [5]:
#  Convert strings to integers
def convert_to_ids(sequences, event_to_id):
    return [[event_to_id[event] for event in seq] for seq in sequences]

X_train_ids = convert_to_ids(X_train, event_to_id)
X_val_ids = convert_to_ids(X_val, event_to_id)
X_test_ids = convert_to_ids(X_test, event_to_id)


In [6]:
# pad sequences (integers, not strings!)
from tensorflow.keras.preprocessing.sequence import pad_sequences

max_len = int(np.percentile([len(s) for s in X_train_ids], 95))

X_train_padded = pad_sequences(X_train_ids, maxlen=max_len, padding='post', value=0)
X_val_padded = pad_sequences(X_val_ids, maxlen=max_len, padding='post', value=0)
X_test_padded = pad_sequences(X_test_ids, maxlen=max_len, padding='post', value=0)

print(f"Max length: {max_len}")
print(f"Train shape: {X_train_padded.shape}")

Max length: 28
Train shape: (390756, 28)


In [7]:
# Create data loaders
train_loader, val_loader, test_loader = create_data_loaders(
    X_train_padded, y_train,
    X_val_padded, y_val,
    X_test_padded, y_test,
    batch_size=64
)

In [8]:
#  Create model
model = DeepLogModel(
    vocab_size=vocab_size,
    embedding_dim=64,
    hidden_dim=128,
    num_layers=2,
    dropout=0.3
)

In [9]:
# 6. Train
import torch
device = 'mps' if torch.backends.mps.is_available() else 'cpu'
trainer = DeepLogTrainer(model, device=device, learning_rate=0.001)

history = trainer.fit(
    train_loader, val_loader,
    num_epochs=50,
    early_stopping_patience=5,
    print_every=5
)

Training: 100%|██████████| 6106/6106 [00:29<00:00, 205.64it/s]




Epoch 1/50 - 37.31s
  Train Loss: 0.2393
  Val Loss:   0.2396
  ✓ New best model (val_loss: 0.2396)


Training: 100%|██████████| 6106/6106 [00:28<00:00, 213.93it/s]

Training: 100%|██████████| 6106/6106 [00:28<00:00, 211.50it/s]
Training: 100%|██████████| 6106/6106 [00:28<00:00, 211.50it/s]
Training: 100%|██████████| 6106/6106 [00:28<00:00, 210.81it/s]

Training: 100%|██████████| 6106/6106 [00:28<00:00, 212.02it/s]




Epoch 5/50 - 37.56s
  Train Loss: 0.2114
  Val Loss:   0.2349
  ✓ New best model (val_loss: 0.2349)


Training: 100%|██████████| 6106/6106 [00:29<00:00, 209.89it/s]

Training: 100%|██████████| 6106/6106 [00:28<00:00, 211.40it/s]

Training: 100%|██████████| 6106/6106 [00:28<00:00, 210.79it/s]

Training: 100%|██████████| 6106/6106 [00:29<00:00, 210.46it/s]

Training: 100%|██████████| 6106/6106 [00:29<00:00, 210.42it/s]




Epoch 10/50 - 37.80s
  Train Loss: 0.2106
  Val Loss:   0.2336
  ✓ New best model (val_loss: 0.2336)


Training: 100%|██████████| 6106/6106 [00:29<00:00, 209.56it/s]
Training: 100%|██████████| 6106/6106 [00:29<00:00, 209.56it/s]
Training: 100%|██████████| 6106/6106 [00:28<00:00, 211.55it/s]

Training: 100%|██████████| 6106/6106 [00:28<00:00, 211.06it/s]

Training: 100%|██████████| 6106/6106 [00:28<00:00, 212.44it/s]

Training: 100%|██████████| 6106/6106 [00:29<00:00, 207.82it/s]




Epoch 15/50 - 38.07s
  Train Loss: 0.2102
  Val Loss:   0.2349
  No improvement (3/5)


Training: 100%|██████████| 6106/6106 [00:29<00:00, 209.94it/s]

Training: 100%|██████████| 6106/6106 [00:28<00:00, 210.76it/s]




Early stopping triggered after 17 epochs

✓ Loaded best model (val_loss: 0.2334)
Total training time: 640.28s


In [15]:
#  Detect anomalies
predictions = trainer.detect_anomalies(test_loader, top_k=4)

100%|██████████| 1348/1348 [00:08<00:00, 153.41it/s]
100%|██████████| 1348/1348 [00:08<00:00, 153.41it/s]


In [13]:
# Evaluate (get labels from test data)
metrics = evaluate_model(predictions, y_test)
print_metrics(metrics)


EVALUATION METRICS
Accuracy:  0.9829 (98.29%)
Precision: 0.8183
Recall:    0.5348
F1-Score:  0.6469

Confusion Matrix:
              Predicted
              Normal  Anomaly
Actual Normal    83434      300
       Anomaly    1175     1351


In [18]:
# Save results to file
results = {
    'dataset': 'HDFS',
    'model': 'DeepLog',
    'architecture': {
        'vocab_size': vocab_size,
        'embedding_dim': model.embedding_dim,
        'hidden_dim': model.hidden_dim,
        'num_layers': model.num_layers,
        'dropout': model.dropout
    },
    'training': {
        'num_epochs': len(history['train_losses']),
        'batch_size': 64,
        'learning_rate': 0.001,
        'early_stopping_patience': 5,
        'best_val_loss': float(history['best_val_loss']),
        'training_time_seconds': float(history['total_time'])
    },
    'data': {
        'train_size': len(y_train),
        'val_size': len(y_val),
        'test_size': len(y_test),
        'max_sequence_length': max_len,
        'train_normal_only': True  # Critical: trained only on normal samples
    },
    'detection': {
        'top_k': 4,
        'method': 'next_event_prediction'
    },
    'metrics': metrics
}

import json
with open('../results/hdfs_deeplog_results.json', 'w') as f:
    json.dump(results, f, indent=2)

print("✓ Results saved to results/hdfs_deeplog_results.json")

# Display summary table
print("\n" + "=" * 70)
print("EXPERIMENT SUMMARY")
print("=" * 70)
print(f"Dataset:              HDFS")
print(f"Model:                DeepLog (LSTM-based)")
print(f"\nArchitecture:")
print(f"  Vocabulary size:    {vocab_size}")
print(f"  Embedding dim:      {model.embedding_dim}")
print(f"  Hidden dim:         {model.hidden_dim}")
print(f"  LSTM layers:        {model.num_layers}")
print(f"  Dropout:            {model.dropout}")
print(f"  Total parameters:   {sum(p.numel() for p in model.parameters()):,}")
print(f"\nTraining:")
print(f"  Train samples:      {len(y_train):,} (NORMAL ONLY)")
print(f"  Val samples:        {len(y_val):,}")
print(f"  Test samples:       {len(y_test):,}")
print(f"  Epochs trained:     {len(history['train_losses'])}")
print(f"  Best val loss:      {history['best_val_loss']:.4f}")
print(f"  Training time:      {history['total_time']:.1f}s")
print(f"\nDetection:")
print(f"  Method:             Top-k next event prediction")
print(f"  Top-k:              9")
print(f"\nTest Performance:")
print(f"  Accuracy:           {metrics['accuracy']:.4f}")
print(f"  Precision:          {metrics['precision']:.4f}")
print(f"  Recall:             {metrics['recall']:.4f}")
print(f"  F1 Score:           {metrics['f1']:.4f}")
if 'auc' in metrics:
    print(f"  AUC:                {metrics['auc']:.4f}")
print("=" * 70)


✓ Results saved to results/hdfs_deeplog_results.json

EXPERIMENT SUMMARY
Dataset:              HDFS
Model:                DeepLog (LSTM-based)

Architecture:
  Vocabulary size:    30
  Embedding dim:      64
  Hidden dim:         128
  LSTM layers:        2
  Dropout:            0.3
  Total parameters:   237,214

Training:
  Train samples:      390,756 (NORMAL ONLY)
  Val samples:        86,259
  Test samples:       86,260
  Epochs trained:     17
  Best val loss:      0.2334
  Training time:      640.3s

Detection:
  Method:             Top-k next event prediction
  Top-k:              9

Test Performance:
  Accuracy:           0.9838
  Precision:          0.9358
  Recall:             0.4790
  F1 Score:           0.6337


In [22]:
save_dir = Path('../mdls')
save_dir.mkdir(parents=True, exist_ok=True)
save_path = save_dir / 'deeplog_checkpoint.pt'
trainer.save_model(str(save_path))

✓ Model saved to ../mdls/deeplog_checkpoint.pt
