<a href="https://colab.research.google.com/github/efrat-dev/insider-threat-detector/blob/main/First_Model.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [3]:
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.metrics import roc_auc_score, classification_report
import warnings
# Suppress every Python warning from here on.
warnings.filterwarnings('ignore')

# Seed NumPy and PyTorch with the same fixed value so runs are repeatable.
_SEED = 42
np.random.seed(_SEED)
torch.manual_seed(_SEED)

def load_and_preprocess_data(filepath):
    """Load the activity CSV and make every feature column numeric.

    Args:
        filepath: Path of the CSV file to read. When it is empty or does not
            point at an existing file, the Google Drive location that this
            notebook originally hard-coded is tried instead.

    Returns:
        ``(processed_df, feature_cols)`` on success. ``feature_cols`` lists
        every column except ``employee_id``, ``is_malicious`` and ``date``.
        ``(None, None)`` when the file cannot be read, so callers can always
        unpack the result into two names.
    """
    import os  # local import keeps this block self-contained

    default_path = '/content/drive/MyDrive/processed_data.csv'

    print("Loading data...")

    # Honour the caller's path when it is a real file; otherwise fall back to
    # the Drive location so existing notebook runs keep working.
    if (
        isinstance(filepath, (str, os.PathLike))
        and filepath
        and os.path.exists(filepath)
    ):
        csv_path = filepath
    else:
        csv_path = default_path

    # Best-effort boundary: any read/parse failure is reported, not raised.
    try:
        df = pd.read_csv(csv_path)
    except Exception as e:
        print(f"Error loading data: {e}")
        return None, None

    print(f"Loaded data with shape: {df.shape}")
    print(f"Columns: {df.columns.tolist()}")

    # Basic preprocessing
    print("Preprocessing data...")

    # Identifier, label and date columns are never used as model features.
    exclude_cols = {'employee_id', 'is_malicious', 'date'}
    feature_cols = [col for col in df.columns if col not in exclude_cols]
    print(f"Found {len(feature_cols)} feature columns")

    processed_df = df.copy()
    for col in feature_cols:
        column = processed_df[col]
        # Text columns may be reported as ``object`` or as a dedicated string
        # dtype depending on the pandas version/configuration.
        if column.dtype == 'object' or pd.api.types.is_string_dtype(column):
            le = LabelEncoder()
            processed_df[col] = le.fit_transform(column.astype(str))
        else:
            numeric = pd.to_numeric(column, errors='coerce')
            # NOTE(review): the mean is taken over every row in the file,
            # i.e. before any train/test split happens downstream.
            fill_value = numeric.mean()
            if pd.isna(fill_value):
                # Column is entirely missing: its mean is NaN, so use 0.
                fill_value = 0.0
            # Assign the result back instead of a chained in-place fillna,
            # which does not update the frame under pandas copy-on-write.
            processed_df[col] = numeric.fillna(fill_value)

    return processed_df, feature_cols

class SimpleDataset(Dataset):
    """One fixed-length feature sequence and one label per employee.

    Rows are grouped by ``employee_id``. Each group is ordered by ``date``
    when that column exists, then truncated to its last ``seq_len`` rows or
    padded at the end by repeating its final row. The label is the first
    ``is_malicious`` value of the (sorted) group.

    Args:
        df: Frame containing ``employee_id``, ``is_malicious``, the columns
            named in ``feature_cols`` and optionally ``date``.
        feature_cols: Names of the columns used as model inputs.
        seq_len: Number of time steps in every produced sequence.
        scaler: Optional fitted object with a ``transform`` method, applied
            to each employee's ``(seq_len, n_features)`` array.
    """

    def __init__(self, df, feature_cols, seq_len=180, scaler=None):
        self.seq_len = seq_len
        self.feature_cols = feature_cols
        self.samples = []
        self.labels = []

        print(f"Processing {df['employee_id'].nunique()} employees...")

        has_date = 'date' in df.columns

        # A single groupby pass replaces one full boolean scan per employee.
        # sort=False keeps employees in order of first appearance; rows whose
        # employee_id is missing form no group, so no empty slice can occur.
        for _, emp_df in df.groupby('employee_id', sort=False):
            if has_date:
                emp_df = emp_df.sort_values('date')

            features = emp_df[feature_cols].values
            label = emp_df['is_malicious'].iloc[0]

            if len(features) >= seq_len:
                # Keep only the most recent seq_len rows.
                features = features[-seq_len:]
            else:
                # Pad at the end by repeating the last known row.
                padding = np.tile(features[-1], (seq_len - len(features), 1))
                features = np.vstack([features, padding])

            if scaler is not None:
                features = scaler.transform(features)

            self.samples.append(torch.tensor(features, dtype=torch.float32))
            self.labels.append(torch.tensor(label, dtype=torch.float32))

        print(f"Created {len(self.samples)} samples")
        pos_samples = sum(self.labels)
        print(f"Positive samples: {pos_samples}, Negative samples: {len(self.labels) - pos_samples}")

    def __len__(self):
        """Return the number of employee sequences."""
        return len(self.samples)

    def __getitem__(self, idx):
        """Return ``(sequence, label)`` for the employee at position ``idx``."""
        return self.samples[idx], self.labels[idx]

class SimpleLSTM(nn.Module):
    """Single-layer LSTM followed by dropout and a one-unit linear head.

    Args:
        input_dim: Number of features per time step.
        hidden_dim: Size of the LSTM hidden state.
    """

    def __init__(self, input_dim, hidden_dim=128):
        super().__init__()
        # No ``dropout`` argument here: nn.LSTM applies it only between
        # stacked layers, so on a single layer it does nothing but warn.
        self.lstm = nn.LSTM(input_dim, hidden_dim, batch_first=True)
        self.fc = nn.Linear(hidden_dim, 1)
        self.dropout = nn.Dropout(0.3)

    def forward(self, x):
        """Return one raw logit per sequence.

        Args:
            x: Tensor laid out as ``(batch, time, input_dim)``, as required
                by ``batch_first=True``.

        Returns:
            Tensor of shape ``(batch,)`` holding un-normalised logits.
        """
        lstm_out, _ = self.lstm(x)
        out = self.dropout(lstm_out[:, -1, :])  # Take last time step
        # Squeeze only the trailing unit dimension; a bare squeeze() would
        # also drop the batch dimension when the batch holds one sample.
        return self.fc(out).squeeze(-1)

def main():
    """Run the end-to-end pipeline: load, split, train, evaluate.

    Employees are split 70/30 into train and test (stratified on their
    label), a scaler is fitted on the training rows only, a ``SimpleLSTM``
    is trained for 20 epochs and then scored on the held-out employees.

    Returns:
        ``(model, auc_score)`` on success, or ``(None, None)`` when the data
        cannot be loaded or no training sample can be built. ``auc_score``
        is ``0.0`` when the test set contains a single class.
    """
    print("=== Simple Pipeline Test with Real Data ===")

    # Load your real data. Check the result before unpacking so a loader
    # that signals failure with a bare None does not raise here.
    loaded = load_and_preprocess_data('processed_data.csv')
    if loaded is None or loaded[0] is None:
        print("Failed to load data!")
        return None, None
    df, feature_cols = loaded

    # Use all features
    print(f"Using all {len(feature_cols)} features")

    # Split employees. unique() lists ids in order of first appearance while
    # groupby() sorts them, so the labels are re-ordered to match the ids
    # before being used for stratification.
    employee_ids = df['employee_id'].dropna().unique()
    employee_labels = (
        df.groupby('employee_id')['is_malicious']
        .first()
        .reindex(employee_ids)
        .to_numpy()
    )

    train_ids, test_ids = train_test_split(
        employee_ids, test_size=0.3,
        stratify=employee_labels, random_state=42
    )

    train_df = df[df['employee_id'].isin(train_ids)]
    test_df = df[df['employee_id'].isin(test_ids)]

    print(f"Train employees: {len(train_ids)}, Test employees: {len(test_ids)}")

    # Fit scaler on training data only so test statistics do not leak in.
    scaler = StandardScaler()
    scaler.fit(train_df[feature_cols])

    # Create datasets (full 180 days sequence)
    train_dataset = SimpleDataset(train_df, feature_cols, seq_len=180, scaler=scaler)
    test_dataset = SimpleDataset(test_df, feature_cols, seq_len=180, scaler=scaler)

    if len(train_dataset) == 0:
        print("No training samples found!")
        return None, None

    # Create data loaders
    train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)
    test_loader = DataLoader(test_dataset, batch_size=32, shuffle=False)

    print(f"Train samples: {len(train_dataset)}, Test samples: {len(test_dataset)}")

    # Initialize model
    model = SimpleLSTM(input_dim=len(feature_cols))
    criterion = nn.BCEWithLogitsLoss()
    optimizer = optim.Adam(model.parameters(), lr=0.001)

    # Quick training (20 epochs for better results)
    print("\n=== Training ===")
    model.train()
    for epoch in range(20):
        total_loss = 0
        for batch_x, batch_y in train_loader:
            optimizer.zero_grad()
            # Flatten to (batch,) so a final batch of one sample still
            # matches the target shape expected by the loss.
            outputs = model(batch_x).reshape(-1)
            loss = criterion(outputs, batch_y)
            loss.backward()
            optimizer.step()
            total_loss += loss.item()

        print(f"Epoch {epoch+1}: Loss = {total_loss/len(train_loader):.4f}")

    # Evaluate
    print("\n=== Evaluation ===")
    model.eval()
    all_preds = []
    all_probs = []
    all_labels = []

    with torch.no_grad():
        for batch_x, batch_y in test_loader:
            # Same flattening as in training: keeps the arrays 1-D so they
            # can be extended into the flat result lists.
            outputs = model(batch_x).reshape(-1)
            probs = torch.sigmoid(outputs)
            preds = (probs > 0.5).float()

            all_preds.extend(preds.numpy())
            all_probs.extend(probs.numpy())
            all_labels.extend(batch_y.numpy())

    # Calculate metrics
    if len(set(all_labels)) > 1:  # AUC is undefined with a single class
        auc_score = roc_auc_score(all_labels, all_probs)
        print(f"AUC Score: {auc_score:.4f}")
    else:
        print("Only one class found in test set")
        auc_score = 0.0

    print("\nClassification Report:")
    print(classification_report(all_labels, all_preds))

    print("\n=== Pipeline Test Complete ===")
    print("✓ Real data loading works")
    print("✓ Data preprocessing works")
    print("✓ Model training works")
    print("✓ Evaluation works")
    print(f"✓ Final AUC: {auc_score:.4f}")

    return model, auc_score

# Entry point: run the pipeline and keep the trained model and its AUC in
# module-level names so they remain available after the run.
if __name__ == "__main__":
    model, auc_score = main()

=== Simple Pipeline Test with Real Data ===
Loading data...
Loaded data with shape: (180000, 161)
Columns: ['employee_id', 'employee_seniority_years', 'is_malicious', 'num_print_commands', 'num_bw_prints', 'ratio_color_prints', 'day', 'day_of_year', 'entry_minute', 'exit_hour', 'exit_minute', 'print_intensity', 'print_efficiency', 'avg_presence_per_entry', 'behavioral_risk_advanced', 'time_consistency_score', 'media_risk_score', 'employee_department_label', 'employee_department_target', 'employee_department_freq', 'employee_campus_cat_Campus A', 'employee_campus_cat_Campus B', 'employee_campus_cat_Campus C', 'employee_campus_label', 'employee_position_label', 'employee_position_target', 'employee_position_freq', 'employee_classification_cat_2', 'employee_classification_cat_3', 'employee_classification_label', 'employee_origin_country_label', 'employee_origin_country_target', 'employee_origin_country_freq', 'print_campuses_cat_0', 'num_exits_cat_1', 'num_exits_label', 'weekday_label', '