In [1]:
import os
from pathlib import Path

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import yaml

import pdcast as pdc

# Walk up from the current working directory until a directory that directly
# contains a "data" entry is reached; the relative paths used further down
# (e.g. "config/conf.yaml") are resolved against that directory.
while Path("data") not in Path(".").iterdir():
    # At the filesystem root chdir("..") is a no-op, so without this guard the
    # loop would spin forever when the notebook is started outside the project.
    if Path.cwd().parent == Path.cwd():
        raise FileNotFoundError(
            "Could not find a 'data' entry in the working directory or any of its parents."
        )
    os.chdir("..")

import pycaret.classification as clf
import sklearn.preprocessing as pre

from sklearn.metrics import average_precision_score


In [2]:
# Forwarded as `turbo=` to clf.compare_models() in the model-comparison cell.
# NOTE(review): per the PyCaret docs, turbo=False also evaluates the estimators
# with longer training times — confirm against the installed version.
turbo_mode = False

In [3]:
# Parse the YAML configuration. The path is relative, so this depends on the
# current working directory.
with Path("config/conf.yaml").open() as conf_file:
    conf_dict = yaml.safe_load(conf_file)

# Node and edge tables, each read from the parquet location named in the config.
persons_df = pd.read_parquet(conf_dict["persons_nodes"])
companies_df = pd.read_parquet(conf_dict["companies_nodes"])
edges_df = pd.read_parquet(conf_dict["edges"])

# Root under which load_features() looks for one sub-directory per split.
features_path = conf_dict["preprocessed_features_path"]

In [4]:
# Load the company and person feature tables of one data split.
def load_features(path_root, split):
    """Read the feature tables stored for ``split`` under ``path_root``.

    Args:
        path_root: Directory that contains one sub-directory per split.
        split: Name of the split sub-directory to read from.

    Returns:
        A ``(companies, persons)`` tuple of DataFrames read from
        ``companies.parquet`` and ``persons.parquet`` in that sub-directory,
        with every row containing a missing value removed.
    """
    split_dir = Path(path_root).joinpath(split)

    def read_complete_rows(file_name):
        # Rows with any missing value are discarded right after loading.
        return pd.read_parquet(split_dir / file_name).dropna()

    # Tuple elements are evaluated left to right: companies first, then persons.
    return read_complete_rows("companies.parquet"), read_complete_rows("persons.parquet")

In [5]:
# Read the "train" split; load_features() returns (companies, persons) with
# rows containing missing values already dropped.
companies_train_df, persons_train_df = load_features(features_path, "train") 

In [6]:
# Columns present in BOTH the company and the person feature tables; only
# these are kept after stacking the two tables.
select_cols = set(companies_train_df.columns) & set(persons_train_df.columns)
# A set has no defined iteration order, so derive a sorted list once and use it
# wherever order matters (column order, feature lists) to keep runs repeatable.
select_cols_sorted = sorted(select_cols)
processed_feature_cols = [x for x in select_cols_sorted if x.endswith("__processed")]
# Name of each processed feature with the "__processed" marker cut off.
raw_feature_cols = [x.split("__processed")[0] for x in processed_feature_cols]
target = "is_anomalous"

# Stack companies and persons row-wise. Index with a list rather than the set:
# pandas deprecated set indexers in 1.5 and raises TypeError for them from 2.0.
entities_df = pd.concat([companies_train_df, persons_train_df], axis=0)[select_cols_sorted]
# Sample entities_df so that half of the entities are anomalous.
def balanced_sample(entities_df, target="is_anomalous", random_state=None) -> pd.DataFrame:
    """Down-sample ``entities_df`` to equally many anomalous and normal rows.

    Args:
        entities_df: Frame holding a boolean label column named ``target``.
        target: Name of the label column. Defaults to ``"is_anomalous"``.
        random_state: Forwarded to ``DataFrame.sample`` for both classes; pass
            a seed for a repeatable draw. The default ``None`` keeps the draw
            non-deterministic.

    Returns:
        A new DataFrame with ``2 * min(n_anomalous, n_normal)`` rows: the
        sampled normal rows followed by the sampled anomalous rows. It is
        empty when either class has no rows.
    """
    labels = entities_df[target]
    # Explicit comparisons (rather than truthiness) so that rows whose label is
    # neither True nor False are excluded from both classes.
    anomalous_df = entities_df[labels == True]  # noqa: E712
    normal_df = entities_df[labels == False]  # noqa: E712

    # Size both draws by the rows actually available in the smaller class, so
    # .sample() can never be asked for more rows than a class contains.
    n_sample = min(len(anomalous_df), len(normal_df))

    return pd.concat(
        [
            normal_df.sample(n_sample, random_state=random_state),
            anomalous_df.sample(n_sample, random_state=random_state),
        ],
        axis=0,
    )
    
# Columns removed from the entity table before sampling.
drop_cols = ["id", "name", "component"]

# Remove those columns, then draw the class-balanced sample from what is left.
entities_features_df = entities_df.drop(columns=drop_cols)
balanced_sample_df = balanced_sample(entities_features_df)


In [7]:
# Perform model selection with PyCaret.

categorical_features = ["isCompany"]

# Seed for the PyCaret experiment (its `session_id`). Without it PyCaret draws a
# fresh pseudo-random id on every run; 8621 is the id shown in the recorded
# setup output. Note that the balanced sample itself is drawn without a seed,
# so results are only fully repeatable if that draw is pinned as well.
SESSION_ID = 8621

s = clf.setup(
    balanced_sample_df,
    # Reuse the label column name defined with the feature columns instead of
    # repeating the literal here.
    target=target,
    categorical_features=categorical_features,
    silent=True,
    feature_selection=True,
    feature_selection_method="boruta",
    session_id=SESSION_ID,
)

# Register average precision as a custom metric with id 'apc' and display name
# 'APC'; target='pred_proba' makes it score predicted probabilities. The call
# is kept as the last expression so the metric summary is shown as cell output.
clf.add_metric('apc', 'APC', average_precision_score, target='pred_proba')


Unnamed: 0,Description,Value
0,session_id,8621
1,Target,is_anomalous
2,Target Type,Binary
3,Label Encoded,"False: 0, True: 1"
4,Original Data,"(3804, 25)"
5,Missing Values,False
6,Numeric Features,18
7,Categorical Features,6
8,Ordinal Features,False
9,High Cardinality Features,False


Name                                                               APC
Display Name                                                       APC
Score Function       <function average_precision_score at 0x7f1a36f...
Scorer               make_scorer(average_precision_score, needs_pro...
Target                                                      pred_proba
Args                                                                {}
Greater is Better                                                 True
Multiclass                                                        True
Custom                                                            True
Name: apc, dtype: object

In [8]:
# Compare the candidate estimators on the balanced sample, ranked by "APC" (the
# display name of the custom average-precision metric added via clf.add_metric).
# `turbo_mode` is passed through as `turbo=`.
best_model_balanced = clf.compare_models(sort="APC", turbo=turbo_mode)

Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,APC,TT (Sec)
catboost,CatBoost Classifier,0.7536,0.8388,0.6831,0.7966,0.7351,0.5073,0.5129,0.8629,1.301
gbc,Gradient Boosting Classifier,0.7494,0.8307,0.6614,0.8043,0.7254,0.4991,0.5077,0.8572,0.047
lightgbm,Light Gradient Boosting Machine,0.7498,0.826,0.6861,0.7887,0.7331,0.4998,0.5049,0.8512,0.018
mlp,MLP Classifier,0.7539,0.8233,0.6711,0.8081,0.731,0.5081,0.5175,0.8509,0.153
rf,Random Forest Classifier,0.7536,0.8233,0.6988,0.787,0.7396,0.5073,0.5112,0.8453,0.082
ada,Ada Boost Classifier,0.7231,0.8006,0.6487,0.7647,0.7012,0.4465,0.4524,0.831,0.029
rbfsvm,SVM - Radial Kernel,0.7397,0.7969,0.6561,0.7904,0.7164,0.4796,0.4873,0.8232,0.123
gpc,Gaussian Process Classifier,0.7235,0.7866,0.6449,0.7669,0.7003,0.4473,0.4534,0.8173,5.94
et,Extra Trees Classifier,0.7412,0.812,0.6876,0.772,0.7265,0.4825,0.4862,0.8084,0.068
knn,K Neighbors Classifier,0.7216,0.7986,0.6838,0.7417,0.7111,0.4434,0.4452,0.7871,0.008
