In [5]:
import os
from pathlib import Path

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import yaml

import pdcast as pdc

while not Path("data") in Path(".").iterdir():
    os.chdir("..")

import pycaret.classification as clf
import sklearn.preprocessing as pre

from sklearn.metrics import average_precision_score

In [6]:
turbo_mode = False

In [7]:
# Read config.
conf_dict = yaml.safe_load(Path("config/conf.yaml").read_text())

features_path = conf_dict["features_path"]

In [8]:
# Load features for data split.
def load_features(path_root):
    companies_df = pd.read_parquet(conf_dict["companies_features"])
    persons_df = pd.read_parquet(conf_dict["persons_features"])
    return companies_df, persons_df

In [9]:
companies_df, persons_df = load_features(features_path)

In [10]:
companies_df.columns

Index(['id', 'component', 'isCompany', 'name', 'foundingDate',
       'dissolutionDate', 'countryCode', 'companiesHouseID',
       'openCorporatesID', 'openOwnershipRegisterID', 'CompanyCategory',
       'CompanyStatus', 'Accounts_AccountCategory', 'SICCode_SicText_1',
       'is_anomalous', 'indegree', 'outdegree', 'closeness', 'clustering',
       'pagerank', 'neighbourhood_count', 'neighbourhood_indegree_min',
       'neighbourhood_outdegree_min', 'neighbourhood_closeness_min',
       'neighbourhood_clustering_min', 'neighbourhood_pagerank_min',
       'neighbourhood_indegree_max', 'neighbourhood_outdegree_max',
       'neighbourhood_closeness_max', 'neighbourhood_clustering_max',
       'neighbourhood_pagerank_max', 'neighbourhood_indegree_sum',
       'neighbourhood_outdegree_sum', 'neighbourhood_closeness_sum',
       'neighbourhood_clustering_sum', 'neighbourhood_pagerank_sum',
       'neighbourhood_indegree_mean', 'neighbourhood_outdegree_mean',
       'neighbourhood_closeness_

In [11]:
common_cols = set(companies_df.columns) & set(persons_df.columns)
drop_cols = ["id", "name", "component"]
select_cols = sorted(common_cols.difference(drop_cols))

target = "is_anomalous"

entities_df = pd.concat([companies_df, persons_df], axis=0)[list(common_cols)]

In [12]:
entities_df[select_cols].info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 129139 entries, 0 to 32608
Data columns (total 33 columns):
 #   Column                         Non-Null Count   Dtype  
---  ------                         --------------   -----  
 0   closeness                      129139 non-null  float64
 1   clustering                     129139 non-null  float64
 2   indegree                       129139 non-null  int64  
 3   isCompany                      129139 non-null  bool   
 4   is_anomalous                   129139 non-null  bool   
 5   neighbourhood_closeness_max    129139 non-null  float64
 6   neighbourhood_closeness_mean   129139 non-null  float64
 7   neighbourhood_closeness_min    129139 non-null  float64
 8   neighbourhood_closeness_std    129139 non-null  float64
 9   neighbourhood_closeness_sum    129139 non-null  float64
 10  neighbourhood_clustering_max   129139 non-null  float64
 11  neighbourhood_clustering_mean  129139 non-null  float64
 12  neighbourhood_clustering_min   

In [13]:
entities_df["indegree"] = entities_df["indegree"].astype(float)
entities_df["outdegree"] = entities_df["outdegree"].astype(float)

In [14]:
def get_data_split_masks(df: pd.DataFrame):
    component_mod = df["component"].to_numpy() % 10
    train_mask = component_mod <= 7
    val_mask = component_mod == 8
    test_mask = component_mod == 9
    return train_mask, val_mask, test_mask

In [15]:
train_mask, val_mask, test_mask = get_data_split_masks(entities_df)

In [16]:
train_df = entities_df.loc[train_mask].drop(drop_cols, axis=1)
val_df = entities_df.loc[val_mask].drop(drop_cols, axis=1)
test_df = entities_df.loc[test_mask].drop(drop_cols, axis=1)

# Join val to train.
train_df = pd.concat([train_df, val_df], axis=0)

In [17]:
# Perform model selection with PyCaret.

categorical_features = ["isCompany"]

s = clf.setup(
    train_df,
    test_data=test_df,
    target="is_anomalous",
    categorical_features=categorical_features,
    silent=True,
    normalize=True,
    # feature_selection=True,
    # feature_selection_method="boruta",
    # fix_imbalance=True,  # Uses SMOTE.
)

clf.add_metric("aprc", "Avg. Prec.", average_precision_score, target="pred_proba")

Unnamed: 0,Description,Value
0,session_id,8746
1,Target,is_anomalous
2,Target Type,Binary
3,Label Encoded,"False: 0, True: 1"
4,Original Data,"(121125, 33)"
5,Missing Values,False
6,Numeric Features,31
7,Categorical Features,1
8,Ordinal Features,False
9,High Cardinality Features,False


Name                                                        Avg. Prec.
Display Name                                                Avg. Prec.
Score Function       <function average_precision_score at 0x7f27005...
Scorer               make_scorer(average_precision_score, needs_pro...
Target                                                      pred_proba
Args                                                                {}
Greater is Better                                                 True
Multiclass                                                        True
Custom                                                            True
Name: aprc, dtype: object

In [18]:
exclude = [
    "rbfsvm",
    "mlp",
]

In [20]:
best_model = clf.compare_models(sort="aprc", turbo=turbo_mode, fold=5, exclude=exclude)

Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,Avg. Prec.,TT (Sec)
catboost,CatBoost Classifier,0.9069,0.658,0.1031,0.7286,0.1737,0.1564,0.241,0.2842,17.158
lightgbm,Light Gradient Boosting Machine,0.907,0.6534,0.0953,0.7748,0.1623,0.1472,0.2376,0.2793,0.554
xgboost,Extreme Gradient Boosting,0.9066,0.6514,0.1025,0.7123,0.1723,0.1548,0.2371,0.278,3.252
gbc,Gradient Boosting Classifier,0.9054,0.6319,0.0682,0.7643,0.1185,0.1081,0.1891,0.2489,3.956
rf,Random Forest Classifier,0.899,0.6308,0.1188,0.4403,0.1844,0.1531,0.1899,0.2437,2.79
et,Extra Trees Classifier,0.8969,0.6244,0.1264,0.4122,0.1903,0.1553,0.1859,0.2213,2.038
knn,K Neighbors Classifier,0.9022,0.5949,0.1199,0.5012,0.1884,0.1614,0.2078,0.2085,22.898
dt,Decision Tree Classifier,0.8888,0.5788,0.1444,0.3323,0.1978,0.1519,0.1668,0.1718,0.318
ada,Ada Boost Classifier,0.8992,0.6026,0.0006,0.5467,0.0011,0.0009,0.0151,0.1628,0.964
qda,Quadratic Discriminant Analysis,0.2259,0.6029,0.9465,0.111,0.1986,0.0221,0.0771,0.1574,0.192


In [23]:
Path("reports").mkdir(exist_ok=True)
model_selection_grid = clf.pull()
model_selection_grid.to_csv("reports/pycaret-model-selection.csv", index=False)

In [26]:
tuned_model = clf.tune_model(
    best_model,
    optimize="aprc",
    fold=3,
    search_library="optuna",
    n_iter=20,
    choose_better=True,
)

Unnamed: 0_level_0,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,Avg. Prec.
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
0,0.9007,0.5871,0.0305,0.6703,0.0583,0.0499,0.1283,0.1869
1,0.9012,0.5912,0.0393,0.6667,0.0742,0.0637,0.1453,0.1902
2,0.9087,0.685,0.2351,0.6259,0.3418,0.3034,0.346,0.3698
Mean,0.9035,0.6211,0.1016,0.6543,0.1581,0.139,0.2065,0.249
Std,0.0037,0.0452,0.0944,0.0201,0.1301,0.1164,0.0988,0.0855


In [None]:
predictions = clf.predict_model(tuned_model, data=test_df, raw_score=True)
predictions["pred_proba"] = predictions["Score_True"]
predictions["actual"] = predictions["is_anomalous"].astype(int)
predictions

In [31]:
output_path = Path("data/predictions/CatBoost.csv")
output_path.parent.mkdir(exist_ok=True)
predictions.to_csv(output_path, index=False)