In [73]:
import os
from pathlib import Path

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import yaml

import pdcast as pdc

while not Path("data") in Path(".").iterdir():
    os.chdir("..")

import pycaret.classification as clf
import sklearn.preprocessing as pre

from sklearn.metrics import average_precision_score

In [74]:
turbo_mode = False

In [75]:
# Read config.
conf_dict = yaml.safe_load(Path("config/conf.yaml").read_text())

features_path = conf_dict["features_path"]

In [76]:
# Load features for data split.
def load_features(path_root):
    companies_df = pd.read_parquet(conf_dict["companies_features"])
    persons_df = pd.read_parquet(conf_dict["persons_features"])
    return companies_df, persons_df

In [77]:
companies_df, persons_df = load_features(features_path)

In [78]:
companies_df.columns

Index(['id', 'component', 'isCompany', 'name', 'foundingDate',
       'dissolutionDate', 'countryCode', 'companiesHouseID',
       'openCorporatesID', 'openOwnershipRegisterID', 'CompanyCategory',
       'CompanyStatus', 'Accounts_AccountCategory', 'SICCode_SicText_1',
       'is_anomalous', 'indegree', 'outdegree', 'closeness', 'clustering',
       'pagerank', 'neighbourhood_count', 'neighbourhood_indegree_min',
       'neighbourhood_outdegree_min', 'neighbourhood_closeness_min',
       'neighbourhood_clustering_min', 'neighbourhood_pagerank_min',
       'neighbourhood_indegree_max', 'neighbourhood_outdegree_max',
       'neighbourhood_closeness_max', 'neighbourhood_clustering_max',
       'neighbourhood_pagerank_max', 'neighbourhood_indegree_sum',
       'neighbourhood_outdegree_sum', 'neighbourhood_closeness_sum',
       'neighbourhood_clustering_sum', 'neighbourhood_pagerank_sum',
       'neighbourhood_indegree_mean', 'neighbourhood_outdegree_mean',
       'neighbourhood_closeness_

In [92]:
common_cols = set(companies_df.columns) & set(persons_df.columns)
drop_cols = ["id", "name", "component"]
select_cols = sorted(common_cols.difference(drop_cols))

target = "is_anomalous"

entities_df = pd.concat([companies_df, persons_df], axis=0)[
    list(common_cols)
].reset_index(drop=True)

In [93]:
entities_df[select_cols].info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 124934 entries, 0 to 124933
Data columns (total 33 columns):
 #   Column                         Non-Null Count   Dtype  
---  ------                         --------------   -----  
 0   closeness                      124934 non-null  float64
 1   clustering                     124934 non-null  float64
 2   indegree                       124934 non-null  int64  
 3   isCompany                      124934 non-null  bool   
 4   is_anomalous                   124934 non-null  bool   
 5   neighbourhood_closeness_max    124934 non-null  float64
 6   neighbourhood_closeness_mean   124934 non-null  float64
 7   neighbourhood_closeness_min    124934 non-null  float64
 8   neighbourhood_closeness_std    124934 non-null  float64
 9   neighbourhood_closeness_sum    124934 non-null  float64
 10  neighbourhood_clustering_max   124934 non-null  float64
 11  neighbourhood_clustering_mean  124934 non-null  float64
 12  neighbourhood_clustering_min  

In [94]:
entities_df["indegree"] = entities_df["indegree"].astype(float)
entities_df["outdegree"] = entities_df["outdegree"].astype(float)

In [95]:
def get_data_split_masks(df: pd.DataFrame):
    component_mod = df["component"].to_numpy() % 10
    train_mask = component_mod <= 7
    val_mask = component_mod == 8
    test_mask = component_mod == 9
    assert not np.any(train_mask & val_mask)
    assert not np.any(train_mask & test_mask)
    assert not np.any(val_mask & test_mask)
    return train_mask, val_mask, test_mask

In [96]:
train_mask, val_mask, test_mask = get_data_split_masks(entities_df)

In [98]:
train_df = entities_df.loc[train_mask].drop(drop_cols, axis=1)
val_df = entities_df.loc[val_mask].drop(drop_cols, axis=1)
test_df = entities_df.loc[test_mask].drop(drop_cols, axis=1)

print("Anomaly proportions")
train_proportion = train_df["is_anomalous"].sum() / len(train_df)
print(f"Train: {train_proportion:.3f}")
val_proportion = val_df["is_anomalous"].sum() / len(val_df)
print(f"Val: {val_proportion:.3f}")
test_proportion = test_df["is_anomalous"].sum() / len(test_df)
print(f"Test: {test_proportion:.3f}")

Anomaly proportions
Train: 0.094
Val: 0.073
Test: 0.073


In [102]:
# Get indexes in both train and test.
train_idx = train_df.index
test_idx = test_df.index
train_test_idx = train_idx.intersection(test_idx)
assert len(train_test_idx) == 0

In [90]:
from imblearn.under_sampling import RandomUnderSampler

In [91]:
# Join val to train.
train_df = pd.concat([train_df, val_df], axis=0)

In [15]:
# Perform model selection with PyCaret.

categorical_features = ["isCompany"]

s = clf.setup(
    train_df,
    test_data=test_df,
    target="is_anomalous",
    categorical_features=categorical_features,
    silent=True,
    normalize=True,
    # feature_selection=True,
    # feature_selection_method="boruta",
    fix_imbalance=True,  # Uses SMOTE.
    fix_imbalance_method=RandomUnderSampler(sampling_strategy="auto"),
)

clf.add_metric("aprc", "Avg. Prec.", average_precision_score, target="pred_proba")

Unnamed: 0,Description,Value
0,session_id,4151
1,Target,is_anomalous
2,Target Type,Binary
3,Label Encoded,"False: 0, True: 1"
4,Original Data,"(117272, 33)"
5,Missing Values,False
6,Numeric Features,31
7,Categorical Features,1
8,Ordinal Features,False
9,High Cardinality Features,False


Name                                                        Avg. Prec.
Display Name                                                Avg. Prec.
Score Function       <function average_precision_score at 0x7f88a52...
Scorer               make_scorer(average_precision_score, needs_pro...
Target                                                      pred_proba
Args                                                                {}
Greater is Better                                                 True
Multiclass                                                        True
Custom                                                            True
Name: aprc, dtype: object

In [16]:
exclude = [
    "rbfsvm",
    "mlp",
    "gbc",
    "gpc",
]

In [17]:
best_model = clf.compare_models(sort="aprc", turbo=turbo_mode, fold=5, exclude=exclude)

Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,Avg. Prec.,TT (Sec)
catboost,CatBoost Classifier,0.4544,0.6272,0.7139,0.1214,0.2018,0.054,0.0922,0.1815,6.274
lightgbm,Light Gradient Boosting Machine,0.4564,0.6299,0.7154,0.1218,0.2027,0.055,0.0937,0.1776,0.174
xgboost,Extreme Gradient Boosting,0.4865,0.6213,0.6699,0.1216,0.2006,0.0539,0.0867,0.1735,0.748
ada,Ada Boost Classifier,0.4179,0.6047,0.7314,0.1169,0.1949,0.0453,0.0794,0.1417,0.276
lr,Logistic Regression,0.4062,0.5904,0.7177,0.1112,0.1864,0.035,0.0652,0.137,0.572
qda,Quadratic Discriminant Analysis,0.2838,0.5992,0.889,0.1084,0.1912,0.0325,0.0827,0.1367,0.072
lda,Linear Discriminant Analysis,0.4061,0.586,0.7133,0.1099,0.1845,0.0329,0.062,0.1315,0.076
rf,Random Forest Classifier,0.5453,0.6061,0.5945,0.121,0.1987,0.0538,0.0815,0.1298,0.576
knn,K Neighbors Classifier,0.735,0.571,0.333,0.1341,0.1902,0.067,0.0762,0.1251,2.684
et,Extra Trees Classifier,0.5576,0.5937,0.5605,0.1197,0.194,0.0498,0.0724,0.1212,0.474


In [18]:
Path("reports").mkdir(exist_ok=True)
model_selection_grid = clf.pull()
model_selection_grid.to_csv("reports/pycaret-model-selection.csv", index=False)

In [19]:
# Highlight highest value in each column
def highlight_max(s):
    is_max = s == s.max()
    return ["font-weight: bold" if v else "" for v in is_max]


model_selection_grid.style.apply(
    highlight_max, axis=0, subset=model_selection_grid.columns[1:]
).format("{:.3f}", subset=model_selection_grid.columns[1:]).to_html(
    "reports/pycaret-model-selection.html"
)

In [20]:
tuned_model = clf.tune_model(
    best_model,
    optimize="aprc",
    fold=3,
    search_library="optuna",
    n_iter=20,
    choose_better=True,
)

Unnamed: 0_level_0,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,Avg. Prec.
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
0,0.2756,0.5798,0.8706,0.1014,0.1817,0.0194,0.0612,0.1628
1,0.3125,0.5884,0.8391,0.1033,0.184,0.0233,0.0655,0.166
2,0.5068,0.6317,0.6947,0.1212,0.2064,0.0584,0.1058,0.1605
Mean,0.365,0.6,0.8014,0.1086,0.1907,0.0337,0.0775,0.1631
Std,0.1014,0.0227,0.0766,0.0089,0.0112,0.0175,0.0201,0.0023


In [21]:
predictions = clf.predict_model(tuned_model, data=test_df, raw_score=True)
predictions["pred_proba"] = predictions["Score_True"]
predictions["actual"] = predictions["is_anomalous"].astype(int)
predictions

Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,Avg. Prec.
0,CatBoost Classifier,0.4469,0.6204,0.79,0.0973,0.1732,0.049,0.1114,0.0955


Unnamed: 0,neighbourhood_closeness_std,neighbourhood_pagerank_max,neighbourhood_closeness_min,neighbourhood_pagerank_mean,neighbourhood_clustering_mean,neighbourhood_outdegree_min,neighbourhood_indegree_mean,neighbourhood_indegree_std,isCompany,neighbourhood_pagerank_sum,...,neighbourhood_indegree_sum,clustering,neighbourhood_clustering_std,neighbourhood_clustering_min,indegree,Label,Score_False,Score_True,pred_proba,actual
17,0.000000,0.000006,0.000008,0.000006,0.0,0.0,1.000000,0.000000,True,0.000006,...,1.0,0.0,0.0,0.0,1.0,True,0.4771,0.5229,0.5229,0
24,0.000000,0.000006,0.000016,0.000006,0.0,0.0,2.000000,0.000000,True,0.000006,...,2.0,0.0,0.0,0.0,2.0,True,0.4927,0.5073,0.5073,0
40,0.000000,0.000006,0.000014,0.000006,0.0,0.0,1.000000,0.000000,True,0.000006,...,1.0,0.0,0.0,0.0,1.0,True,0.4651,0.5349,0.5349,0
50,0.000000,0.000007,0.000014,0.000007,0.0,0.0,1.000000,0.000000,True,0.000007,...,1.0,0.0,0.0,0.0,1.0,True,0.4766,0.5234,0.5234,0
66,0.000000,0.000008,0.000008,0.000008,0.0,0.0,1.000000,0.000000,True,0.000008,...,1.0,0.0,0.0,0.0,1.0,True,0.4993,0.5007,0.5007,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
30792,0.000004,0.000013,0.000016,0.000010,0.0,0.0,2.333333,0.471404,False,0.000029,...,7.0,0.0,0.0,0.0,0.0,False,0.6533,0.3467,0.3467,0
30800,0.000006,0.000015,0.000016,0.000014,0.0,0.0,2.500000,0.500000,False,0.000028,...,5.0,0.0,0.0,0.0,0.0,False,0.6896,0.3104,0.3104,0
30807,0.000000,0.000013,0.000016,0.000013,0.0,1.0,2.000000,0.000000,False,0.000013,...,2.0,0.0,0.0,0.0,0.0,False,0.6441,0.3559,0.3559,0
30854,0.000000,0.000009,0.000016,0.000009,0.0,0.0,2.000000,0.000000,False,0.000018,...,4.0,0.0,0.0,0.0,0.0,False,0.9132,0.0868,0.0868,0


In [22]:
best_model_class = model_selection_grid.iloc[0]["Model"].replace(" ", "")
best_model_class

'CatBoostClassifier'

In [23]:
output_path = Path(f"data/predictions/{best_model_class}.csv")
output_path.parent.mkdir(exist_ok=True)
predictions.to_csv(output_path, index=False)