In [1]:
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.ensemble import IsolationForest
from sklearn.metrics import classification_report

# Load dataset
df = pd.read_csv("dataset.csv")

# Separate features and target
X = df.drop(columns=["Unnamed: 0", "label"])
y = df["label"]

# Normalize
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Use majority class (0) as 'normal'
X_majority = X_scaled[y == 0]
X_rest = X_scaled
y_true = y

# Train Isolation Forest on majority class only
iso_forest = IsolationForest(contamination=0.01, random_state=42)
iso_forest.fit(X_majority)

# Predict outliers for all data
y_pred_outlier = iso_forest.predict(X_rest)  # -1 for outliers, 1 for inliers

# Map to label-like prediction: assume -1 (outlier) = minority, 1 (inlier) = majority
y_pred_label = [0 if p == 1 else 1 for p in y_pred_outlier]

# Evaluate how well it detects non-majority as "outliers"
y_binary_true = [0 if label == 0 else 1 for label in y_true]
print(classification_report(y_binary_true, y_pred_label))


              precision    recall  f1-score   support

           0       0.78      0.99      0.88    335847
           1       0.07      0.00      0.01     91778

    accuracy                           0.78    427625
   macro avg       0.43      0.50      0.44    427625
weighted avg       0.63      0.78      0.69    427625

