# Log Feature Engineering + LSTM Autoencoder (Starter)
This starter notebook builds synthetic log features and trains a tiny LSTM autoencoder to score anomalies by reconstruction error. Replace the synthetic data with your real logs, e.g. fetched via OpenSearch queries.

In [None]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler
import torch
from torch import nn
from torch.utils.data import DataLoader, TensorDataset

# Reproducible synthetic hourly log table: 50 hosts x 1000 hours each.
np.random.seed(42)
hosts = [f'host{i:03d}' for i in range(50)]


def _synthetic_row(host, hour):
    """Draw one hour of synthetic counters for ``host``.

    The four draws are made in a fixed order (failed_logins, unique_dst_ips,
    bytes, process_spawn) so the seeded global random stream is consumed
    the same way on every run.
    """
    failed_logins = np.random.poisson(0.2)
    unique_dst_ips = np.random.poisson(1.2)
    byte_count = np.random.exponential(300)
    process_spawn = np.random.poisson(0.6)
    return {
        'host': host,
        'hour': hour,
        'failed_logins': failed_logins,
        'unique_dst_ips': unique_dst_ips,
        'bytes': byte_count,
        'process_spawn': process_spawn,
    }


# Host-major, hour-minor ordering: all hours of host000, then host001, ...
rows = [_synthetic_row(host, hour) for host in hosts for hour in range(1000)]
df = pd.DataFrame(rows)
df.head()

In [None]:
# Standardise the numeric features (zero mean, unit variance across all hosts)
# so no single counter dominates the reconstruction loss.
# NOTE: this overwrites the raw columns in place; re-running the cell re-fits
# the scaler on already-scaled data, so run it once per fresh `df`.
features = ['failed_logins', 'unique_dst_ips', 'bytes', 'process_spawn']
scaler = StandardScaler()
df[features] = scaler.fit_transform(df[features])

# Build overlapping fixed-length windows for a single host, in time order.
h = 'host000'
sub = df[df.host == h].sort_values('hour')
seq_len = 20

# Convert to NumPy once; slicing an array per window is far cheaper than
# slicing the DataFrame with .iloc inside the loop.
values = sub[features].to_numpy()

# A series of n rows holds n - seq_len + 1 windows. The "+ 1" keeps the final
# window so the most recent hour is also covered by a scored sequence.
n_windows = len(values) - seq_len + 1
if n_windows < 1:
    raise ValueError(
        f'host {h!r} has {len(values)} rows; need at least seq_len={seq_len}'
    )
sequences = [values[i:i + seq_len] for i in range(n_windows)]
seqs = np.stack(sequences)  # (n_windows, seq_len, n_features)
print('seqs shape', seqs.shape)

# Shuffled mini-batches of windows for training.
X = torch.tensor(seqs, dtype=torch.float32)
loader = DataLoader(TensorDataset(X), batch_size=64, shuffle=True)

In [None]:
class LSTMAutoencoder(nn.Module):
    """LSTM autoencoder that reconstructs a fixed-length feature sequence.

    The encoder LSTM reads the sequence; its last time-step output is
    projected to a latent vector. That vector is projected back up, repeated
    once per time step, run through the decoder LSTM, and finally mapped to
    feature space by a linear output layer.

    The linear output layer matters: an LSTM's hidden state is confined to
    (-1, 1), so using the decoder's hidden state directly as the
    reconstruction cannot reproduce standardised features whose magnitude
    exceeds 1.

    Args:
        n_features: Number of features per time step.
        latent_dim: Size of the bottleneck vector.
        hidden_dim: Hidden size of the encoder and decoder LSTMs.
        num_layers: Number of stacked LSTM layers in encoder and decoder.
    """

    def __init__(self, n_features, latent_dim=64, hidden_dim=128, num_layers=2):
        super().__init__()
        self.encoder = nn.LSTM(input_size=n_features, hidden_size=hidden_dim,
                               num_layers=num_layers, batch_first=True)
        self.fc = nn.Linear(hidden_dim, latent_dim)
        self.decoder_fc = nn.Linear(latent_dim, hidden_dim)
        self.decoder = nn.LSTM(input_size=hidden_dim, hidden_size=hidden_dim,
                               num_layers=num_layers, batch_first=True)
        # Unbounded projection from decoder hidden states back to features.
        self.output_layer = nn.Linear(hidden_dim, n_features)

    def forward(self, x):
        """Reconstruct ``x``.

        Args:
            x: Tensor of shape (batch, seq_len, n_features).

        Returns:
            Tensor with the same shape as ``x``.
        """
        encoded, _ = self.encoder(x)
        # Summarise the sequence by the top layer's output at the last step.
        last_step = encoded[:, -1, :]
        z = self.fc(last_step)
        # Feed the same expanded latent vector to the decoder at every step.
        dec_in = self.decoder_fc(z).unsqueeze(1).repeat(1, x.size(1), 1)
        decoded, _ = self.decoder(dec_in)
        return self.output_layer(decoded)

# Model, optimiser and reconstruction loss.
model = LSTMAutoencoder(n_features=len(features))
opt = torch.optim.Adam(model.parameters(), lr=1e-3)
loss_fn = nn.MSELoss()

# Train for a few epochs, printing the mean per-batch loss after each one.
n_epochs = 5
for epoch in range(n_epochs):
    epoch_loss = 0
    for (window_batch,) in loader:
        # Clear stale gradients, then do one forward/backward/update step.
        opt.zero_grad()
        batch_loss = loss_fn(model(window_batch), window_batch)
        batch_loss.backward()
        opt.step()
        epoch_loss += batch_loss.item()
    print('epoch', epoch, 'loss', epoch_loss / len(loader))

In [None]:
# Score every sequence in X by its mean squared reconstruction error,
# averaged over the time and feature axes. Gradients are not needed here.
with torch.no_grad():
    recon = model(X)
    squared_error = (recon - X).pow(2)
    mse = squared_error.mean(dim=(1, 2)).numpy()

# argsort is ascending, so the last ten indices are the ten highest scores.
idx = np.argsort(mse)[-10:]
(mse[idx], idx[:10])