In [1]:
import sys
from pathlib import Path

project_root = Path.cwd().parent
sys.path.append(str(project_root))

In [37]:
import numpy as np
import pandas as pd

from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier, VotingClassifier

from sklearn.metrics import classification_report, roc_auc_score, confusion_matrix

from src.utils import find_best_threshold

In [3]:
train_path = "../data/processed/df_train_with_anomaly.csv"
val_path = "../data/processed/df_val_with_anomaly.csv"
test_path = "../data/processed/df_test_with_anomaly.csv"

In [4]:
train = pd.read_csv(train_path)
val = pd.read_csv(val_path)
test = pd.read_csv(test_path)

In [5]:
X_train = train.drop(columns=['Class'])
y_train = train['Class']

X_val = val.drop(columns=['Class'])
y_val = val['Class']

X_test= test.drop(columns=['Class'])
y_test = test['Class']

In [6]:
y_train.value_counts()

Class
0    199020
1       344
Name: count, dtype: int64

In [7]:
logreg = LogisticRegression(
    random_state=42,
    class_weight='balanced',
    max_iter=1000,
    solver='liblinear'
)

logreg.fit(X_train, y_train)

In [8]:
val_preds = logreg.predict(X_val)
val_probs = logreg.predict_proba(X_val)[:, 1]  # вероятности класса 1

print("Validation Results:")
print(classification_report(y_val, val_preds, digits=4))
print("AUC:", roc_auc_score(y_val, val_probs))

print("Confusion matrix:")
print(confusion_matrix(y_val, val_preds))

Validation Results:
              precision    recall  f1-score   support

           0     0.9997    0.9760    0.9877     42647
           1     0.0581    0.8514    0.1087        74

    accuracy                         0.9758     42721
   macro avg     0.5289    0.9137    0.5482     42721
weighted avg     0.9981    0.9758    0.9862     42721

AUC: 0.9743174482663779
Confusion matrix:
[[41625  1022]
 [   11    63]]


In [9]:
coeff_df = pd.DataFrame({
    'Feature': X_train.columns,
    'Coefficient': logreg.coef_[0]
}).sort_values(by='Coefficient', key=abs, ascending=False)

print(coeff_df)

          Feature  Coefficient
8              V4     1.331481
1             V14    -0.697042
2             V12    -0.652928
3             V10    -0.651235
4             V16    -0.441401
7             V11     0.333611
12             V5     0.231304
14           Hour     0.192423
5              V3    -0.178657
6              V7    -0.128430
10             V1     0.107953
9             V18     0.099892
16  anomaly_score    -0.097969
13             V2    -0.050311
15      LogAmount    -0.045113
0             V17     0.015786
11             V9    -0.008364


In [10]:
best_thresh_result = find_best_threshold(y_val, val_probs, target_recall=0.817, direction='higher')
print(f"Best threshold: {best_thresh_result['threshold']:.4f}")
print(f"Precision: {best_thresh_result['precision']:.4f}, Recall: {best_thresh_result['recall']:.4f}, F1: {best_thresh_result['f1']:.4f}")

Best threshold: 0.9620
Precision: 0.4296, Recall: 0.8243, F1: 0.5648


In [11]:
final_preds = (val_probs >= best_thresh_result['threshold']).astype(int)

print("Validation Results:")
print(classification_report(y_val, final_preds, digits=4))
print("AUC:", roc_auc_score(y_val, final_preds))

print("Confusion matrix:")
print(confusion_matrix(y_val, final_preds))

Validation Results:
              precision    recall  f1-score   support

           0     0.9997    0.9981    0.9989     42647
           1     0.4296    0.8243    0.5648        74

    accuracy                         0.9978     42721
   macro avg     0.7146    0.9112    0.7819     42721
weighted avg     0.9987    0.9978    0.9981     42721

AUC: 0.9112125056798774
Confusion matrix:
[[42566    81]
 [   13    61]]


In [12]:
rf = RandomForestClassifier(
    n_estimators=100,
    class_weight='balanced',
    max_depth=10,
    random_state=42
)
rf.fit(X_train, y_train)

In [52]:
knn = KNeighborsClassifier(n_neighbors=5)
knn.fit(X_train, y_train)

In [53]:
voting_clf = VotingClassifier(
    estimators=[
        ('rf', rf),
        ('knn', knn),
        ('logreg', logreg)
    ],
    voting='soft'
)

voting_clf.fit(X_train, y_train)

In [54]:
probs = voting_clf.predict_proba(X_val)[:, 1]
preds = voting_clf.predict(X_val)

print("Validation Results:")
print(classification_report(y_val, preds, digits=4))
print("AUC:", roc_auc_score(y_val, probs))

print("Confusion matrix:")
print(confusion_matrix(y_val, preds))

Validation Results:
              precision    recall  f1-score   support

           0     0.9997    0.9992    0.9994     42647
           1     0.6316    0.8108    0.7101        74

    accuracy                         0.9989     42721
   macro avg     0.8156    0.9050    0.8547     42721
weighted avg     0.9990    0.9989    0.9989     42721

AUC: 0.9766214029819912
Confusion matrix:
[[42612    35]
 [   14    60]]


In [55]:
ensemble_threshold = find_best_threshold(y_val, probs, target_recall=0.82, direction='higher')

print(f"Best threshold: {ensemble_threshold['threshold']:.4f}")
print(f"Precision: {ensemble_threshold['precision']:.4f}, Recall: {ensemble_threshold['recall']:.4f}, F1: {ensemble_threshold['f1']:.4f}")

Best threshold: 0.4303
Precision: 0.5495, Recall: 0.8243, F1: 0.6595


In [56]:
preds = (probs > ensemble_threshold['threshold']).astype(int)

print("Validation Results:")
print(classification_report(y_val, preds, digits=4))
print("AUC:", roc_auc_score(y_val, probs))

print("Confusion matrix:")
print(confusion_matrix(y_val, preds))

Validation Results:
              precision    recall  f1-score   support

           0     0.9997    0.9988    0.9993     42647
           1     0.5495    0.8243    0.6595        74

    accuracy                         0.9985     42721
   macro avg     0.7746    0.9116    0.8294     42721
weighted avg     0.9989    0.9985    0.9987     42721

AUC: 0.9766214029819912
Confusion matrix:
[[42597    50]
 [   13    61]]
