In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import joblib

from sklearn.ensemble import IsolationForest
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score, precision_score, recall_score, f1_score
from sklearn.model_selection import train_test_split
from sklearn.svm import OneClassSVM

import sys
import os

# Go one level up (into src/) and register that directory on sys.path so the
# local `utils` module imported right below can be resolved.
module_path = os.path.abspath(os.pardir)
if module_path not in sys.path:
    sys.path.append(module_path)
    
from utils import find_best_threshold
from utils import add_anomaly_score

In [3]:
# CSV read by the data-loading cell below (presumably already scaled, judging
# by the file name — confirm against the preprocessing step).
input_path = '../../data/processed/df_scaled.csv'
# NOTE(review): output_path and model_path are not referenced by any cell
# visible in this notebook; the save cell at the end writes to its own
# hard-coded locations instead. Confirm whether these two are still needed.
output_path = '../../data/processed/df_for_classifier.csv'
model_path = '../../models/oneclass_svm_model.joblib'

In [4]:
# Load the dataset and separate the `Class` label column from the feature columns.
df = pd.read_csv(input_path)

y = df['Class']
X = df.drop(columns=['Class'])

In [5]:
# 2. Split into train / val / test, stratified on the label.
# First split: 70% train, 30% held out as `temp`.
X_train, X_temp, y_train, y_temp = train_test_split(
    X, y, test_size=0.3, stratify=y, random_state=42
)

# Second split: the 30% `temp` part is halved into validation and test
# (0.5 * 0.3 = 0.15 of the original data each).
X_val, X_test, y_val, y_test = train_test_split(
    X_temp, y_temp, test_size=0.5, stratify=y_temp, random_state=42
)

# Keep only the Class == 0 training rows; the one-class model is fitted on these.
is_class_0 = y_train == 0
X_train_0 = X_train[is_class_0].copy()
y_train_0 = y_train[is_class_0].copy()

# Report the size of each subset.
print(f"Train size (Class=0 only): {X_train_0.shape}")
print(f"Validation size (mixed): {X_val.shape}")
print(f"Test size (mixed): {X_test.shape}")

Train size (Class=0 only): (199020, 16)
Validation size (mixed): (42721, 16)
Test size (mixed): (42722, 16)


In [6]:
# RBF one-class SVM fitted on the Class == 0 training rows only.
# NOTE(review): nu=0.00173 presumably mirrors the positive-class share of the
# data (the validation report shows 74 positives out of 42721 rows) — confirm.
ocsvm = OneClassSVM(kernel='rbf', gamma='auto', nu=0.00173).fit(X_train_0)

# Decision-function values for the validation rows; the cells below flag a row
# as anomalous when its value falls below a chosen threshold.
ocsvm_scores = ocsvm.decision_function(X_val)

In [7]:
# Baseline using the model's built-in decision boundary: predict() marks
# outliers with -1, which is recoded to 1 so it lines up with the positive
# label in y_val.
y_pred_val = ocsvm.predict(X_val)
outlier_mask = y_pred_val == -1
y_pred_labels = outlier_mask.astype(int)

print(confusion_matrix(y_val, y_pred_labels))
print(classification_report(y_val, y_pred_labels, digits=4))

[[42369   278]
 [   17    57]]
              precision    recall  f1-score   support

           0     0.9996    0.9935    0.9965     42647
           1     0.1701    0.7703    0.2787        74

    accuracy                         0.9931     42721
   macro avg     0.5849    0.8819    0.6376     42721
weighted avg     0.9982    0.9931    0.9953     42721



In [8]:
# Choose a decision threshold from the validation scores via the project helper.
# NOTE(review): find_best_threshold comes from utils and is not shown here.
# From its use below it returns a mapping with 'threshold', 'precision',
# 'recall' and 'f1' entries; target_recall=0.74 presumably is the recall level
# the threshold must reach — confirm against the helper.
ocsvm_result = find_best_threshold(y_val, ocsvm_scores, target_recall=0.74)
print(f"Threshold: {ocsvm_result['threshold']:.5f}")
print(f"Precision: {ocsvm_result['precision']:.4f}, Recall: {ocsvm_result['recall']:.4f}, F1 Score: {ocsvm_result['f1']:.4f}")

Threshold: -0.21708
Precision: 0.4074, Recall: 0.7432, F1 Score: 0.5263


In [9]:
# Apply the selected threshold: validation rows whose decision value is
# strictly below it are labelled 1 (anomaly), all others 0.
tuned_threshold = ocsvm_result['threshold']
ocsvm_preds = (ocsvm_scores < tuned_threshold).astype(int)

print(confusion_matrix(y_val, ocsvm_preds))
print(classification_report(y_val, ocsvm_preds, digits=4))

[[42567    80]
 [   19    55]]
              precision    recall  f1-score   support

           0     0.9996    0.9981    0.9988     42647
           1     0.4074    0.7432    0.5263        74

    accuracy                         0.9977     42721
   macro avg     0.7035    0.8707    0.7626     42721
weighted avg     0.9985    0.9977    0.9980     42721



In [10]:
# Keep the selected threshold under a constant-style name and print it without
# rounding.
# NOTE(review): the value is only printed, not written to disk, in the cells
# visible here.
ANOMALY_THRESHOLD = ocsvm_result['threshold']
print(ANOMALY_THRESHOLD)

-0.2170772639326034


In [11]:
# Run every split through the project helper add_anomaly_score together with
# the fitted detector (presumably it returns the features plus a score column
# — confirm in utils).
# NOTE(review): the detector was fitted on the Class == 0 rows of X_train, so
# the scores for those training rows are in-sample.
df_train, df_val, df_test = (
    add_anomaly_score(split_features, ocsvm)
    for split_features in (X_train, X_val, X_test)
)

# Re-attach the labels positionally (.values bypasses index alignment).
for scored_split, split_labels in (
    (df_train, y_train),
    (df_val, y_val),
    (df_test, y_test),
):
    scored_split['Class'] = split_labels.values

In [12]:
# Write each scored split to its own CSV, without the index column.
for scored_split, csv_path in (
    (df_train, '../../data/processed/df_train_with_anomaly.csv'),
    (df_val, '../../data/processed/df_val_with_anomaly.csv'),
    (df_test, '../../data/processed/df_test_with_anomaly.csv'),
):
    scored_split.to_csv(csv_path, index=False)

# Persist the fitted detector next to the notebook (path is relative to the
# current working directory).
# NOTE(review): model_path from the configuration cell is not used here —
# confirm which location downstream code expects.
joblib.dump(ocsvm, 'oneclasssvm_anomaly_detector.joblib')

['oneclasssvm_anomaly_detector.joblib']