In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import auc, precision_recall_curve, accuracy_score, precision_score, recall_score
import os
import glob
from helpers import trainer_factory 


def eval_model(model, X_test, y_test):
    score = {}
    predictions = model.predict(X_test)

    # Convert -1 (anomaly) to 0 for consistency with your labels
    predictions[predictions == -1] = 0

    precision, recall, _ = precision_recall_curve(y_test, predictions)
    score["auc"] = auc(recall, precision)
    score["precision"] = precision_score(y_test, predictions)
    score["recall"] = recall_score(y_test, predictions)
    score["accuracy"] = accuracy_score(y_test, predictions)

    return score

In [2]:
folder_path = '/Users/nima/Downloads/archive-2' 


csv_files = glob.glob(os.path.join(folder_path, '*.csv'))


dataframes = []


for file in csv_files:
    df = pd.read_csv(file)  
    dataframes.append(df) 



In [3]:
for i, df in enumerate(dataframes):
    print(f"File {i+1}:")
    # Count the total occurrences of each unique label value
    label_counts = df[' Label'].value_counts()
    # Get the total number of rows (total labels)
    total_labels = df[' Label'].count()

    benign_count = label_counts.get('BENIGN', 0)  
    print(f"  Total Labels: {total_labels}")
    print(f"  BENIGN Labels: {benign_count}")
    print(f"  Anomalous Labels: {total_labels - benign_count}")
    print("-" * 30)

File 1:
  Total Labels: 288602
  BENIGN Labels: 288566
  Anomalous Labels: 36
------------------------------
File 2:
  Total Labels: 529918
  BENIGN Labels: 529918
  Anomalous Labels: 0
------------------------------
File 3:
  Total Labels: 191033
  BENIGN Labels: 189067
  Anomalous Labels: 1966
------------------------------
File 4:
  Total Labels: 286467
  BENIGN Labels: 127537
  Anomalous Labels: 158930
------------------------------
File 5:
  Total Labels: 225745
  BENIGN Labels: 97718
  Anomalous Labels: 128027
------------------------------
File 6:
  Total Labels: 445909
  BENIGN Labels: 432074
  Anomalous Labels: 13835
------------------------------
File 7:
  Total Labels: 692703
  BENIGN Labels: 440031
  Anomalous Labels: 252672
------------------------------
File 8:
  Total Labels: 170366
  BENIGN Labels: 168186
  Anomalous Labels: 2180
------------------------------


In [4]:
from sklearn.preprocessing import LabelEncoder


combined_df = pd.concat(dataframes, ignore_index=True)


combined_df.columns = combined_df.columns.str.strip()


X_combined = combined_df.drop(columns=['Label'])  # Drop the 'Label' column
y_combined = combined_df['Label'].apply(lambda x: 'BENIGN' if x == 'BENIGN' else 'MALICIOUS')  # Set the label to 0 for 'BENIGN' and 1 for 'MALICIOUS'
 


label_encoder = LabelEncoder() # Initialize the label encoder
y_encoded = label_encoder.fit_transform(y_combined) # Encode the labels to 0 and 1


In [5]:
# Ensure X_combined and y_encoded are DataFrames
X_combined = pd.DataFrame(X_combined)
y_encoded = pd.DataFrame(y_encoded)

# Replace infinity values and drop NaNs
X_combined.replace([float('inf'), float('-inf')], float('nan'), inplace=True)
X_combined.dropna(inplace=True)

# Align y_encoded with the index of X_combined
y_encoded = y_encoded.loc[X_combined.index]

# Reset index for DataFrames
X_combined.reset_index(drop=True, inplace=True)
y_encoded.reset_index(drop=True, inplace=True)

# Train/test split
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X_combined, y_encoded, test_size=0.2, random_state=42)

# Standardization
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()

X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)


X_train = pd.DataFrame(X_train)
X_test = pd.DataFrame(X_test)


In [6]:
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score
from sklearn.ensemble import IsolationForest


param_grid = {
    'n_estimators': [50, 100, 150],
    'max_samples': [128, 256, 'auto'],
    'contamination': ['auto', 0.01, 0.05],
    'max_features': [0.6, 0.8, 1.0],
}


grid_search = GridSearchCV(
    IsolationForest(random_state=0),
    param_grid,
    scoring='roc_auc',  
    cv=3,                
    n_jobs=-1            
)


grid_search.fit(X_train, y_train)


best_params = grid_search.best_params_
print("Best Parameters Found:", best_params)


isolation_trainer = trainer_factory(IsolationForest, X_train, y_train)
isoforest, _ = isolation_trainer(**best_params)


score = eval_model(isoforest, X_test, y_test)
print("Evaluation Metrics:", score)


Best Parameters Found: {'contamination': 'auto', 'max_features': 1.0, 'max_samples': 128, 'n_estimators': 100}
Evaluation Metrics: {'auc': 0.44604828643517214, 'precision': 0.1551950319079267, 'recall': 0.6721003471352396, 'accuracy': 0.21217130854208807}
