In [18]:
import pandas as pd
import numpy as np

from sklearn.ensemble import IsolationForest
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score, precision_score, recall_score, f1_score
from sklearn.model_selection import train_test_split

import matplotlib.pyplot as plt
import seaborn as sns

In [9]:
# Load the processed dataset (presumably already scaled, judging by the file
# name -- confirm against the preprocessing notebook) and separate the
# 'Class' target column from the remaining feature columns.
df = pd.read_csv('../../data/processed/df_scaled.csv')

y = df['Class']
X = df.drop(columns='Class')

In [12]:
# Hold out 20% of the rows for testing; stratify on the label so that both
# parts of the split keep the class proportions of the full dataset.
X_train_full, X_test, y_train_full, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)

# The detector is trained on Class == 0 rows only; the test split is left
# untouched and therefore still contains both classes.
X_train = X_train_full.loc[y_train_full == 0].copy()
y_train = y_train_full.loc[y_train_full == 0].copy()

print(
    f"Train size (Class=0 only): {X_train.shape}",
    f"Test size (mixed): {X_test.shape}",
    sep="\n",
)

Train size (Class=0 only): (227451, 16)
Test size (mixed): (56962, 16)


In [16]:
# Fit the Isolation Forest on the Class == 0 training rows; fit() returns the
# estimator itself, so construction and fitting can be chained.
iso_forest = IsolationForest(
    n_estimators=100,
    contamination='auto',
    random_state=42,
    verbose=0,
).fit(X_train)

# predict() labels outliers as -1 and inliers as +1; remap that to the
# dataset's convention (1 = anomaly, 0 = normal) so it is comparable to y_test.
y_pred_test = iso_forest.predict(X_test)
y_pred_labels = np.where(y_pred_test == -1, 1, 0)

# Confusion matrix first, then the per-class report, one after the other.
print(
    confusion_matrix(y_test, y_pred_labels),
    classification_report(y_test, y_pred_labels, digits=4),
    sep="\n",
)

[[53288  3576]
 [    9    89]]
              precision    recall  f1-score   support

           0     0.9998    0.9371    0.9675     56864
           1     0.0243    0.9082    0.0473        98

    accuracy                         0.9371     56962
   macro avg     0.5121    0.9226    0.5074     56962
weighted avg     0.9982    0.9371    0.9659     56962



In [21]:
# Threshold tuning: sweep evenly spaced cut-offs over the Isolation Forest
# decision scores and keep the one with the highest F1 among those that reach
# a minimum recall on the anomaly class.
#
# precision_score / recall_score / f1_score are provided by the notebook's
# import cell, so they are not re-imported here.
#
# NOTE(review): the threshold is selected on the same X_test / y_test that the
# metrics are reported on, so the printed numbers are optimistic. For an
# unbiased estimate, pick the threshold on a separate validation split and
# report the metrics once on the untouched test set.
MIN_RECALL = 0.75    # lowest acceptable recall for the anomaly class
N_THRESHOLDS = 200   # number of evenly spaced candidate thresholds

scores_test = iso_forest.decision_function(X_test)
thresholds = np.linspace(scores_test.min(), scores_test.max(), N_THRESHOLDS)

# One (threshold, precision, recall, f1) tuple per admissible threshold.
results = []

for t in thresholds:
    # Lower decision scores mean "more abnormal", so rows scoring below the
    # candidate threshold are flagged as anomalies (label 1).
    y_pred = (scores_test < t).astype(int)

    recall = recall_score(y_test, y_pred)
    if recall < MIN_RECALL:
        continue  # recall floor not met -- discard this threshold

    # zero_division=0 keeps both metrics well-defined (and warning-free) even
    # if MIN_RECALL is lowered far enough to admit thresholds that flag nothing.
    precision = precision_score(y_test, y_pred, zero_division=0)
    f1 = f1_score(y_test, y_pred, zero_division=0)
    results.append((t, precision, recall, f1))

# Make sure at least one threshold satisfied the recall floor.
if results:
    # Rank by F1 (index 3); switch the key to index 1 to rank by precision.
    best_t, best_p, best_r, best_f1 = max(results, key=lambda row: row[3])

    print(f"✅ Best threshold with recall ≥ {MIN_RECALL}:")
    print(f"Threshold: {best_t:.5f}")
    print(f"Precision: {best_p:.4f}, Recall: {best_r:.4f}, F1 Score: {best_f1:.4f}")
else:
    print(f"❌ Не найдено порогов с recall ≥ {MIN_RECALL}.")


✅ Best threshold with recall ≥ 0.75:
Threshold: -0.12173
Precision: 0.1434, Recall: 0.7551, F1 Score: 0.2410
