NameError: name 'data_path' is not defined

In [10]:
import pandas as pd
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
import torch
from torch.utils.data import DataLoader, TensorDataset
import torch.nn as nn
import torch.optim as optim

# Load the datasets
data_path = "../data/"

# Load only the first 1000 rows of each dataset

dos_data = pd.read_csv(os.path.join(data_path, 'DoS_dataset.csv'), nrows=50000)
fuzzy_data = pd.read_csv(os.path.join(data_path, 'Fuzzy_dataset.csv'), nrows=50000)
gear_data = pd.read_csv(os.path.join(data_path, 'gear_dataset.csv'), nrows=50000)
rpm_data = pd.read_csv(os.path.join(data_path, 'RPM_dataset.csv'), nrows=50000)


# Concatenate datasets
data = pd.concat([dos_data, fuzzy_data, gear_data, rpm_data], axis=0)

# Separate numeric and categorical columns
numeric_features = data.select_dtypes(include=['float64', 'int64']).columns
categorical_features = data.select_dtypes(include=['object']).columns

# Define preprocessing for numeric features (impute missing values with mean, scale them)
numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='mean')),  # Fill missing values with the mean
    ('scaler', StandardScaler())])  # Normalize the numeric features

# Define preprocessing for categorical features (impute missing values with mode, one-hot encode them)
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),  # Fill missing values with the most frequent value (mode)
    ('onehot', OneHotEncoder(handle_unknown='ignore', sparse_output=False))])  # One-hot encode the categorical features

# Combine both transformers into a ColumnTransformer
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features)])

# Apply transformations to the data
data_preprocessed = preprocessor.fit_transform(data)

# Convert the processed data into a PyTorch tensor
data_tensor = torch.tensor(data_preprocessed, dtype=torch.float32)

# Create DataLoader
batch_size = 64
dataset = TensorDataset(data_tensor)
dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=True)

print(f"Preprocessed Data Shape: {data_preprocessed.shape}")

# Define the Autoencoder model
class Autoencoder(nn.Module):
    def __init__(self, input_size):
        super(Autoencoder, self).__init__()
        # Encoder: Compressing input data
        self.encoder = nn.Sequential(
            nn.Linear(input_size, 64),
            nn.ReLU(),
            nn.Linear(64, 32),
            nn.ReLU(),
            nn.Linear(32, 16)
        )
        # Decoder: Reconstructing the original data
        self.decoder = nn.Sequential(
            nn.Linear(16, 32),
            nn.ReLU(),
            nn.Linear(32, 64),
            nn.ReLU(),
            nn.Linear(64, input_size),
            nn.Sigmoid()  # Using sigmoid to bring values between 0 and 1
        )

    def forward(self, x):
        x = self.encoder(x)
        x = self.decoder(x)
        return x

# Initialize the Autoencoder
input_size = data_preprocessed.shape[1]  # Number of features (columns) after preprocessing
model = Autoencoder(input_size)

# Loss function and optimizer
criterion = nn.MSELoss()  # Mean Squared Error Loss for reconstruction error
optimizer = optim.Adam(model.parameters(), lr=0.001)

# Training the Autoencoder
num_epochs = 50
for epoch in range(num_epochs):
    for batch in dataloader:
        inputs = batch[0]  # DataLoader returns a tuple
        # Zero the parameter gradients
        optimizer.zero_grad()
        # Forward pass
        outputs = model(inputs)
        loss = criterion(outputs, inputs)
        # Backward pass and optimization
        loss.backward()
        optimizer.step()
    
    print(f"Epoch {epoch+1}/{num_epochs}, Loss: {loss.item()}")

# Anomaly Detection: Detect anomalies based on reconstruction error
def detect_anomalies(data_loader, model, threshold):
    model.eval()  # Set the model to evaluation mode
    anomalies = []
    for batch in data_loader:
        inputs = batch[0]
        outputs = model(inputs)
        loss = criterion(outputs, inputs)
        # If the reconstruction error exceeds the threshold, flag as anomaly
        if loss.item() > threshold:
            anomalies.append(inputs)
    return anomalies

# Set a threshold based on training loss (you can adjust this threshold)
threshold = 0.02  # Example threshold, fine-tune based on your data
anomalies = detect_anomalies(dataloader, model, threshold)

# Display the number of anomalies detected
print(f"Number of anomalies detected: {len(anomalies)}")


Preprocessed Data Shape: (200000, 5951)
Epoch 1/50, Loss: 0.0023746995721012354
Epoch 2/50, Loss: 0.0020012937020510435
Epoch 3/50, Loss: 0.0014616844709962606
Epoch 4/50, Loss: 0.0013969638384878635
Epoch 5/50, Loss: 0.0015908514615148306
Epoch 6/50, Loss: 0.001697818166576326
Epoch 7/50, Loss: 0.0015258279163390398
Epoch 8/50, Loss: 0.0015907399356365204
Epoch 9/50, Loss: 0.0017025051638484001
Epoch 10/50, Loss: 0.001883962075226009
Epoch 11/50, Loss: 0.0018172279233112931
Epoch 12/50, Loss: 0.0015567620284855366
Epoch 13/50, Loss: 0.0016923604998737574
Epoch 14/50, Loss: 0.0022242595441639423
Epoch 15/50, Loss: 0.0016540784854441881
Epoch 16/50, Loss: 0.0018038831185549498
Epoch 17/50, Loss: 0.001801273669116199
Epoch 18/50, Loss: 0.0016077545005828142
Epoch 19/50, Loss: 0.0018933367682620883
Epoch 20/50, Loss: 0.0014081960543990135
Epoch 21/50, Loss: 0.0018347245641052723
Epoch 22/50, Loss: 0.0019409796223044395
Epoch 23/50, Loss: 0.0019131222506985068
Epoch 24/50, Loss: 0.00151197