In [114]:
import os
import pandas as pd
import numpy as np
from sklearn.preprocessing import MinMaxScaler

In [126]:
# Directory paths
# The defaults below are machine-specific. They can be overridden without editing
# the notebook by setting HAI_DATASET_DIR / HAI_DISTANCES_DIR in the environment
# before the kernel starts; when the variables are unset the original paths are used.
DATASET_DIR = os.environ.get(
    "HAI_DATASET_DIR", "/Users/darth/Dev/stuProj/datasets/hai-21.03"
)  # Update with your dataset path
DISTANCES_DIR = os.environ.get(
    "HAI_DISTANCES_DIR", "/Users/darth/Dev/stuProj/distances/hai-21.03"
)  # Update with your desired output path

In [118]:
# Step 1: Keep only the rows in which every attack flag equals 0
def filter_attack_data(df):
    """Return a copy of ``df`` restricted to attack-free rows.

    Every column whose label contains the substring ``'attack'`` is treated
    as an attack flag. A row is retained only when all of those flags are
    equal to 0; if there are no such columns, every row is retained.

    Parameters
    ----------
    df : pandas.DataFrame
        Input frame. Column labels must support the ``'attack' in label``
        substring test (i.e. be strings).

    Returns
    -------
    pandas.DataFrame
        Independent copy of the retained rows, with their original index labels.
    """
    flag_names = [name for name in df.columns if 'attack' in name]
    is_normal = df[flag_names].eq(0).all(axis=1)
    return df.loc[is_normal].copy()

In [120]:
# Step 2: Min-Max Scaling with Sklearn
def min_max_normalize_sklearn(df):
    """Return a copy of ``df`` with its sensor columns min-max scaled.

    Sensor ("physical") columns are all columns except ``'time'`` and those
    whose label contains ``'attack'``. A fresh ``MinMaxScaler`` is fitted on
    the given frame on every call, so the scaling is relative to this
    frame's own per-column minimum and maximum.

    The input frame is left untouched; the scaled values are written into a
    copy. When there is nothing to scale (no sensor columns, or zero rows) the
    copy is returned as-is instead of letting the scaler raise on empty input.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with string column labels.

    Returns
    -------
    pandas.DataFrame
        New frame with the same columns and index; ``'time'`` and attack
        columns are carried over unchanged.
    """
    physical_columns = [col for col in df.columns if 'attack' not in col and col != 'time']

    # Work on a copy so the caller's frame keeps its raw values.
    normalized = df.copy()

    # Fitting a scaler on zero rows or zero features is an error; return early.
    if not physical_columns or normalized.empty:
        return normalized

    scaler = MinMaxScaler()
    normalized[physical_columns] = scaler.fit_transform(normalized[physical_columns])
    return normalized

In [122]:
# Step 3: Euclidean distance between each row and the row that follows it
def calculate_euclidean_distances(df):
    """Compute the Euclidean distance between consecutive rows of ``df``.

    Only sensor columns take part in the distance: ``'time'`` and any column
    whose label contains ``'attack'`` are excluded. Rows are compared in the
    order in which they appear in the frame.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with string column labels and numeric sensor columns.

    Returns
    -------
    numpy.ndarray
        One-dimensional array with ``len(df) - 1`` entries (empty when the
        frame has fewer than two rows); entry ``i`` is the distance between
        row ``i`` and row ``i + 1``.
    """
    feature_names = []
    for name in df.columns:
        if 'attack' in name or name == 'time':
            continue
        feature_names.append(name)

    points = df[feature_names].to_numpy()
    step = points[1:] - points[:-1]
    return np.sqrt(np.square(step).sum(axis=1))

In [104]:
# Drop attack rows from both splits (test first, then train)
# NOTE(review): test_data and train_data are not assigned in any cell shown
# here; they must already exist in the kernel for this cell to run.
filtered_test_data, filtered_train_data = (
    filter_attack_data(split) for split in (test_data, train_data)
)

In [106]:
# Normalize the data
# Copies are passed in so that filtered_test_data / filtered_train_data are
# guaranteed to keep their raw (unscaled) values and stay distinct objects from
# the normalized frames, regardless of whether the normalizer modifies its argument.
# NOTE(review): each call fits its own scaler, so the test and train splits are
# scaled by their own min/max rather than a shared one — confirm this is intended.
normalized_test_data = min_max_normalize_sklearn(filtered_test_data.copy())
normalized_train_data = min_max_normalize_sklearn(filtered_train_data.copy())

In [108]:
# Consecutive-row Euclidean distances for the normalized test and train frames
test_distances, train_distances = map(
    calculate_euclidean_distances,
    (normalized_test_data, normalized_train_data),
)

In [110]:
# Display first few calculated distances
# Prints the first five entries of each distance sequence.
# NOTE(review): the saved output under this cell is comma-separated like a Python
# list, whereas calculate_euclidean_distances returns a NumPy array — the output
# may come from an earlier version of the function; re-run to refresh.
print("Test distances sample:", test_distances[:5])
print("Train distances sample:", train_distances[:5])

Test distances sample: [0.5760887448355914, 0.46091448708332977, 0.30962094756359804, 0.3396309496557332, 0.37353499415228125]
Train distances sample: [0.2333060293184356, 0.28988407724969073, 0.3318059489611643, 0.451542016021968, 0.3878137728864617]


In [112]:
# Summary statistics of the normalized test frame; as the last expression in the
# cell, the resulting table is rendered as the cell output.
normalized_test_data.describe()

Unnamed: 0,P1_B2004,P1_B2016,P1_B3004,P1_B3005,P1_B4002,P1_B4005,P1_B400B,P1_B4022,P1_FCV01D,P1_FCV01Z,...,P4_ST_GOV,P4_ST_LD,P4_ST_PO,P4_ST_PS,P4_ST_PT01,P4_ST_TT01,attack,attack_P1,attack_P2,attack_P3
count,42572.0,42572.0,42572.0,42572.0,42572.0,42572.0,42572.0,42572.0,42572.0,42572.0,...,42572.0,42572.0,42572.0,42572.0,42572.0,42572.0,42572.0,42572.0,42572.0,42572.0
mean,0.566855,0.379597,0.380418,0.691455,0.431554,0.626656,0.650963,0.482464,0.559034,0.55618,...,0.380108,0.367503,0.379139,0.419971,0.310929,0.795182,0.0,0.0,0.0,0.0
std,0.373375,0.138846,0.410179,0.356456,0.397155,0.402515,0.407916,0.199274,0.422634,0.423504,...,0.138665,0.144954,0.13913,0.49356,0.047718,0.275137,0.0,0.0,0.0,0.0
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.465116,0.29877,0.098685,0.553125,0.0,0.213502,0.232829,0.3204,0.148044,0.140693,...,0.298955,0.268052,0.297872,0.0,0.313492,0.647727,0.0,0.0,0.0,0.0
50%,0.465116,0.379122,0.098685,0.776771,0.536136,0.812618,0.962887,0.494547,0.521847,0.521536,...,0.379848,0.369788,0.379149,0.0,0.321429,0.977273,0.0,0.0,0.0,0.0
75%,0.992248,0.459452,0.772198,1.0,0.679155,1.0,0.991586,0.63932,1.0,0.998011,...,0.459712,0.459538,0.45911,1.0,0.321429,1.0,0.0,0.0,0.0,0.0
max,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0


In [128]:
# Process each file
# Walks DATASET_DIR, and for every file whose name contains "test" or "train":
# load it, drop attack rows, min-max scale the sensor columns, compute the
# consecutive-row Euclidean distances and write them to a one-column CSV.
for root, dirs, files in os.walk(DATASET_DIR):
    # os.walk yields entries in arbitrary filesystem order; sorting makes the
    # traversal (and the log printed below) reproducible between runs.
    dirs.sort()
    for file in sorted(files):
        if "test" in file.lower() or "train" in file.lower():
            # Full path to the file
            file_path = os.path.join(root, file)
            # Load data
            df = pd.read_csv(file_path)

            # Filter and normalize data
            filtered_data = filter_attack_data(df)
            if filtered_data.empty:
                # A scaler cannot be fitted on zero rows; skip this file
                # instead of aborting the whole batch.
                print(f"Skipped {file}: no attack-free rows to process")
                continue
            normalized_data = min_max_normalize_sklearn(filtered_data)

            # Calculate distances
            distances = calculate_euclidean_distances(normalized_data)

            # Save distances to CSV in the "distances" directory
            # NOTE(review): if DISTANCES_DIR already ends with the dataset
            # version, appending the basename again nests it twice
            # (e.g. .../hai-21.03/hai-21.03/). Left unchanged so existing
            # output locations stay valid — confirm the intended layout.
            dataset_version = os.path.basename(root)  # e.g., hai-21.03
            output_dir = os.path.join(DISTANCES_DIR, dataset_version)
            os.makedirs(output_dir, exist_ok=True)

            output_file_path = os.path.join(output_dir, f"{file}_distances.csv")
            pd.DataFrame(distances, columns=["Euclidean_Distance"]).to_csv(output_file_path, index=False)

            print(f"Processed {file} and saved distances to {output_file_path}")

Processed train1.csv and saved distances to /Users/darth/Dev/stuProj/distances/hai-21.03/hai-21.03/train1.csv_distances.csv
Processed train3.csv and saved distances to /Users/darth/Dev/stuProj/distances/hai-21.03/hai-21.03/train3.csv_distances.csv
Processed train2.csv and saved distances to /Users/darth/Dev/stuProj/distances/hai-21.03/hai-21.03/train2.csv_distances.csv
Processed test4.csv and saved distances to /Users/darth/Dev/stuProj/distances/hai-21.03/hai-21.03/test4.csv_distances.csv
Processed test5.csv and saved distances to /Users/darth/Dev/stuProj/distances/hai-21.03/hai-21.03/test5.csv_distances.csv
Processed test2.csv and saved distances to /Users/darth/Dev/stuProj/distances/hai-21.03/hai-21.03/test2.csv_distances.csv
Processed test3.csv and saved distances to /Users/darth/Dev/stuProj/distances/hai-21.03/hai-21.03/test3.csv_distances.csv
Processed test1.csv and saved distances to /Users/darth/Dev/stuProj/distances/hai-21.03/hai-21.03/test1.csv_distances.csv
