In [1]:
import os
from pathlib import Path

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import yaml

import pdcast as pdc

# Walk up from the current working directory until reaching the directory that
# directly contains a "data" entry, so the relative paths used by later cells
# (e.g. "config/conf.yaml") resolve. At the filesystem root os.chdir("..") is
# a no-op, so fail loudly there instead of looping forever.
while Path("data") not in Path(".").iterdir():
    if Path.cwd() == Path.cwd().parent:
        raise FileNotFoundError(
            'No "data" entry found in the working directory or any of its parents.'
        )
    os.chdir("..")

import pycaret.classification as clf
import sklearn.preprocessing as pre

from sklearn.metrics import average_precision_score

In [2]:
# Forwarded as `turbo=` to `clf.compare_models` in the model comparison cell.
# NOTE(review): with False, PyCaret presumably also evaluates its slower
# estimators (the recorded comparison lists gpc, rbfsvm and mlp) — confirm
# against the PyCaret version in use.
turbo_mode = False

In [3]:
# Parse the YAML project configuration; it maps logical names to file paths.
conf_dict = yaml.safe_load(Path("config/conf.yaml").read_text())

# Node and edge tables, read (in this order) from the parquet files whose
# paths are stored under the corresponding configuration keys.
persons_df, companies_df, edges_df = (
    pd.read_parquet(conf_dict[config_key])
    for config_key in ("persons_nodes", "companies_nodes", "edges")
)

# Root directory that is handed to load_features as `path_root`.
features_path = conf_dict["preprocessed_features_path"]

In [4]:
# Load features for data split.
def load_features(path_root, split):
    """Read the company and person feature tables of one data split.

    Args:
        path_root: Root directory of the feature files.
        split: Name of the sub-directory of ``path_root`` to read
            (for example ``"train"``).

    Returns:
        A ``(companies_df, persons_df)`` tuple read from ``companies.parquet``
        and ``persons.parquet`` inside ``path_root / split``. Rows containing
        any missing value are dropped from both frames.
    """
    split_dir = Path(path_root).joinpath(split)
    # Companies are read first, then persons.
    frames = [
        pd.read_parquet(split_dir / file_name).dropna()
        for file_name in ("companies.parquet", "persons.parquet")
    ]
    return tuple(frames)

In [5]:
# Feature tables of the "train" split (rows with missing values are already
# removed inside load_features).
companies_train_df, persons_train_df = load_features(
    path_root=features_path,
    split="train",
)

In [6]:
# Columns present in both the company and the person feature tables.
select_cols = set(companies_train_df.columns) & set(persons_train_df.columns)
# The same columns as a list, in the order of the companies table. A set has
# no stable order, and indexing a DataFrame with a set is deprecated in
# pandas 1.5 and raises TypeError in pandas 2.0, so the list is used for
# everything that needs an ordering.
ordered_select_cols = [col for col in companies_train_df.columns if col in select_cols]
# Shared columns whose name ends with "__processed", and the same names cut
# at the first "__processed" occurrence.
processed_feature_cols = [col for col in ordered_select_cols if col.endswith("__processed")]
raw_feature_cols = [col.split("__processed")[0] for col in processed_feature_cols]
target = "is_anomalous"

# Stack companies on top of persons, restricted to the shared columns.
entities_df = pd.concat([companies_train_df, persons_train_df], axis=0)[ordered_select_cols]
# Sample entities_df so that half of the entities are anomalous.
def balanced_sample(entities_df, target_col=None, random_state=None) -> pd.DataFrame:
    """Down-sample so that anomalous and normal rows are equally frequent.

    Args:
        entities_df: Frame containing the label column.
        target_col: Name of the label column. When ``None`` the module-level
            ``target`` variable is used.
        random_state: Forwarded to ``DataFrame.sample``; pass a value to make
            the sample reproducible. ``None`` keeps the draw unseeded.

    Returns:
        A frame with ``n`` rows whose label equals ``False`` followed by ``n``
        rows whose label equals ``True``, where ``n`` is the size of the
        smaller of the two groups.
    """
    if target_col is None:
        target_col = target
    labels = entities_df[target_col]
    # Explicit comparisons (rather than ~labels) so that rows whose label is
    # neither True nor False end up in neither group.
    anomalous_df = entities_df[labels == True]  # noqa: E712
    normal_df = entities_df[labels == False]  # noqa: E712
    # Size both draws from the groups actually selected; deriving the normal
    # count by subtraction would overcount when other label values exist and
    # make .sample() fail.
    n_sample = min(len(anomalous_df), len(normal_df))
    return pd.concat(
        [
            normal_df.sample(n_sample, random_state=random_state),
            anomalous_df.sample(n_sample, random_state=random_state),
        ],
        axis=0,
    )


# Columns removed from the entity table before modelling.
drop_cols = ["id", "name", "component"]

# Entity table without the dropped columns, plus a class-balanced sample of it.
entities_features_df = entities_df.drop(columns=drop_cols)
balanced_sample_df = balanced_sample(entities_df=entities_features_df)

In [7]:
# Model selection with PyCaret, set up on entities_features_df (the table that
# has not been class-balanced).

categorical_features = ["isCompany"]

# NOTE(review): no session_id is passed; the recorded run shows a generated
# one, so results presumably vary between runs — confirm whether that is
# intended.
s = clf.setup(
    data=entities_features_df,
    target="is_anomalous",
    categorical_features=categorical_features,
    feature_selection=True,
    feature_selection_method="boruta",
    silent=True,
)

# Register sklearn's average_precision_score, computed on predicted
# probabilities, under the display name "APC" so it shows up as a column in
# the comparison table. Kept as the cell's last expression so its result is
# displayed.
clf.add_metric(
    id="apc",
    name="APC",
    score_func=average_precision_score,
    target="pred_proba",
)

Unnamed: 0,Description,Value
0,session_id,8630
1,Target,is_anomalous
2,Target Type,Binary
3,Label Encoded,"False: 0, True: 1"
4,Original Data,"(19562, 25)"
5,Missing Values,False
6,Numeric Features,21
7,Categorical Features,3
8,Ordinal Features,False
9,High Cardinality Features,False


Name                                                               APC
Display Name                                                       APC
Score Function       <function average_precision_score at 0x7f0f4f4...
Scorer               make_scorer(average_precision_score, needs_pro...
Target                                                      pred_proba
Args                                                                {}
Greater is Better                                                 True
Multiclass                                                        True
Custom                                                            True
Name: apc, dtype: object

In [8]:
# Rank the candidate estimators by the custom "APC" metric and keep the result
# that compare_models returns for the current (imbalanced) setup.
best_model_imbalanced = clf.compare_models(
    turbo=turbo_mode,
    sort="APC",
)

Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,APC,TT (Sec)
catboost,CatBoost Classifier,0.9313,0.838,0.3477,0.8563,0.4931,0.4634,0.5187,0.5947,4.552
lightgbm,Light Gradient Boosting Machine,0.932,0.8322,0.3636,0.8468,0.5071,0.4769,0.5273,0.5871,0.039
mlp,MLP Classifier,0.9305,0.8154,0.3401,0.8577,0.4836,0.454,0.5119,0.5736,0.825
gbc,Gradient Boosting Classifier,0.9296,0.8173,0.2948,0.9284,0.4462,0.4194,0.4999,0.5693,0.198
rf,Random Forest Classifier,0.9269,0.8208,0.3734,0.7454,0.496,0.4613,0.4943,0.5674,0.192
ada,Ada Boost Classifier,0.9272,0.8042,0.2622,0.9435,0.4092,0.3835,0.4752,0.5297,0.074
et,Extra Trees Classifier,0.9253,0.8172,0.3635,0.7333,0.4839,0.4486,0.482,0.5286,0.148
gpc,Gaussian Process Classifier,0.9279,0.7745,0.359,0.7773,0.4883,0.4554,0.4963,0.5182,456.188
rbfsvm,SVM - Radial Kernel,0.9278,0.7424,0.2744,0.9312,0.4221,0.3958,0.4821,0.5179,2.385
knn,K Neighbors Classifier,0.9255,0.7597,0.3612,0.7359,0.4823,0.4473,0.4815,0.4615,0.024
