In [1]:
import os
from pathlib import Path

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import yaml

import pdcast as pdc

# Walk up the directory tree until the working directory contains a `data`
# entry, so that the relative paths used below resolve from that directory.
# At the filesystem root `os.chdir("..")` is a no-op, so bail out there
# instead of looping forever when no ancestor contains `data`.
_start_dir = Path.cwd()
while Path("data") not in Path(".").iterdir():
    if Path.cwd().parent == Path.cwd():
        # Restore the original directory so a re-run starts from the same place.
        os.chdir(_start_dir)
        raise FileNotFoundError(
            f"No 'data' entry found in {_start_dir} or any of its parent directories."
        )
    os.chdir("..")

import sklearn.preprocessing as pre

In [3]:
# Parse the YAML configuration file.
conf_dict = yaml.safe_load(Path("config/conf.yaml").read_text())

# Load the company, person and edge tables from the parquet paths that the
# configuration stores under the corresponding "*_preprocessed" keys.
companies_df, persons_df, edges_df = (
    pd.read_parquet(conf_dict[config_key])
    for config_key in (
        "companies_preprocessed",
        "persons_preprocessed",
        "edges_preprocessed",
    )
)

In [7]:
# Show column names, non-null counts and dtypes of the company table.
companies_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 96530 entries, 0 to 96529
Data columns (total 65 columns):
 #   Column                                                                                                                 Non-Null Count  Dtype  
---  ------                                                                                                                 --------------  -----  
 0   id                                                                                                                     96530 non-null  object 
 1   component                                                                                                              96530 non-null  int64  
 2   isCompany                                                                                                              96530 non-null  bool   
 3   name                                                                                                                   96069 non-null  object 
 4 

In [6]:
# Summary statistics for the numeric columns of the company table.
companies_df.describe()

Unnamed: 0,component,indegree,outdegree,closeness,clustering,pagerank,neighbour_count,neighbourhood_indegree,neighbourhood_outdegree,neighbourhood_closeness,...,outdegree__processed,closeness__processed,clustering__processed,pagerank__processed,neighbourhood_indegree__processed,neighbourhood_outdegree__processed,neighbourhood_closeness__processed,neighbourhood_clustering__processed,neighbourhood_pagerank__processed,neighbourhood_neighbour_count__processed
count,96530.0,96530.0,96530.0,96530.0,96530.0,96530.0,96530.0,96530.0,96530.0,96530.0,...,96530.0,96530.0,96530.0,96530.0,96530.0,96530.0,96530.0,96530.0,96530.0,96530.0
mean,2053.295442,1.396809,0.565793,1.4e-05,0.004973,9e-06,1.0,21.708164,23.031441,0.000212,...,-4.180962e-17,-9.966589000000001e-17,2.795282e-17,-1.347771e-16,-3.533207e-18,-1.354396e-17,-6.33033e-18,6.771981e-18,-2.0610380000000003e-17,3.179887e-17
std,2260.014936,0.663277,3.065651,6e-06,0.044165,4e-06,0.0,71.331619,64.89238,0.000714,...,1.000005,1.000005,1.000005,1.000005,1.000005,1.000005,1.000005,1.000005,1.000005,1.000005
min,0.0,0.0,0.0,0.0,0.0,5e-06,1.0,1.0,1.0,8e-06,...,-0.2578285,-2.26782,-0.1125923,-0.890713,-0.2903098,-0.3395092,-0.2863155,-0.1712746,-0.3074106,-0.2932483
25%,6.0,1.0,0.0,8e-06,0.0,6e-06,1.0,6.0,7.0,6.2e-05,...,-0.2578285,-0.9676828,-0.1125923,-0.6659312,-0.2202143,-0.2470479,-0.21042,-0.1712746,-0.2130911,-0.2142477
50%,1134.5,1.0,0.0,1.4e-05,0.0,7e-06,1.0,10.0,10.0,9e-05,...,-0.2578285,0.07242657,-0.1125923,-0.3681705,-0.1641379,-0.2008173,-0.1706652,-0.1712746,-0.1622241,-0.1549972
75%,3986.0,2.0,0.0,1.5e-05,0.0,1e-05,1.0,15.0,16.0,0.000142,...,-0.2578285,0.332454,-0.1125923,0.3578131,-0.09404242,-0.108356,-0.09838375,-0.1712746,-0.08898733,-0.0957468
max,6863.0,19.0,460.0,0.000147,1.0,8.7e-05,1.0,1319.0,2536.0,0.013067,...,94.0296,22.49669,22.5299,21.52689,18.18687,38.72538,17.99914,26.69804,17.22286,15.82287


In [22]:
def select_processed_columns(
    df: pd.DataFrame, suffix: str = "__processed"
) -> pd.DataFrame:
    """Return only the columns of ``df`` whose name ends with ``suffix``.

    The default suffix is ``"__processed"``, the same marker the
    feature-selection code further down in this notebook filters on, so
    names that merely end in ``processed`` (for example ``unprocessed``)
    are not picked up by accident.

    Args:
        df: Frame to select columns from.
        suffix: Column-name ending that marks a column as a processed feature.

    Returns:
        A frame holding the matching columns in their original order; it has
        no columns when nothing matches.
    """
    # Non-string labels (e.g. integer column names) cannot carry the suffix,
    # so skip them instead of failing on the missing ``endswith`` method.
    columns = [
        col for col in df.columns if isinstance(col, str) and col.endswith(suffix)
    ]
    return df[columns]

In [5]:
# Read the three preprocessed tables again under separate names, so they can
# be filtered without touching the frames loaded earlier.
companies_preprocessed_df, persons_preprocessed_df, edges_preprocessed_df = (
    pd.read_parquet(conf_dict[config_key])
    for config_key in (
        "companies_preprocessed",
        "persons_preprocessed",
        "edges_preprocessed",
    )
)

In [23]:
# Narrow each table to the columns picked by select_processed_columns.
companies_preprocessed_df, persons_preprocessed_df, edges_preprocessed_df = map(
    select_processed_columns,
    (companies_preprocessed_df, persons_preprocessed_df, edges_preprocessed_df),
)

In [24]:
# Inspect which columns the company table currently holds, with their dtypes.
companies_preprocessed_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 96530 entries, 0 to 96529
Data columns (total 38 columns):
 #   Column                                                                                                                 Non-Null Count  Dtype  
---  ------                                                                                                                 --------------  -----  
 0   onehotencoder__CompanyStatus_Active - Proposal to Strike off__processed                                                96530 non-null  float64
 1   onehotencoder__CompanyStatus_None__processed                                                                           96530 non-null  float64
 2   onehotencoder__CompanyStatus_infrequent_sklearn__processed                                                             96530 non-null  float64
 3   onehotencoder__Accounts_AccountCategory_DORMANT__processed                                                             96530 non-null  float64
 4 

In [26]:
# Summary statistics, transposed so that each column is shown as a row.
companies_preprocessed_df.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
onehotencoder__CompanyStatus_Active - Proposal to Strike off__processed,96530.0,1.442726e-17,1.000005,-0.148435,-0.148435,-0.148435,-0.148435,6.736939
onehotencoder__CompanyStatus_None__processed,96530.0,2.3628320000000002e-17,1.000005,-0.30495,-0.30495,-0.30495,-0.30495,3.279225
onehotencoder__CompanyStatus_infrequent_sklearn__processed,96530.0,-5.1065890000000004e-17,1.000005,-0.107855,-0.107855,-0.107855,-0.107855,9.271675
onehotencoder__Accounts_AccountCategory_DORMANT__processed,96530.0,3.452238e-17,1.000005,-0.46519,-0.46519,-0.46519,-0.46519,2.149659
onehotencoder__Accounts_AccountCategory_FULL__processed,96530.0,4.2398490000000003e-17,1.000005,-0.180966,-0.180966,-0.180966,-0.180966,5.525889
onehotencoder__Accounts_AccountCategory_GROUP__processed,96530.0,-4.5048390000000006e-17,1.000005,-0.12401,-0.12401,-0.12401,-0.12401,8.06387
onehotencoder__Accounts_AccountCategory_MICRO ENTITY__processed,96530.0,-1.055546e-16,1.000005,-0.426626,-0.426626,-0.426626,-0.426626,2.343974
onehotencoder__Accounts_AccountCategory_NO ACCOUNTS FILED__processed,96530.0,-5.2409240000000005e-17,1.000005,-0.392097,-0.392097,-0.392097,-0.392097,2.550386
onehotencoder__Accounts_AccountCategory_SMALL__processed,96530.0,-1.221901e-17,1.000005,-0.239388,-0.239388,-0.239388,-0.239388,4.177311
onehotencoder__Accounts_AccountCategory_TOTAL EXEMPTION FULL__processed,96530.0,-3.4448770000000003e-17,1.000005,-0.660019,-0.660019,-0.660019,1.515108,1.515108


In [13]:
# First five rows, transposed, limited to the first 20 columns.
# NOTE(review): the saved output lists raw columns (id, name, ...), so it was
# presumably produced before the processed-column filter ran — re-run to confirm.
companies_preprocessed_df.head().T[:20]

Unnamed: 0,0,1,2,3,4
id,10012349682331830451,10040124645537466196,10054280345851297628,10144755671479781292,10161948772964784983
component,177,5777,6,1531,6
isCompany,True,True,True,True,True
name,TRIBECA UK PROPERTY LIMITED,LANDMARK BELLEVUE LIMITED,DW FILM LIMITED,NETLIGHTING LIMITED,BANKSIDE CONSULTING LIMITED
foundingDate,2017-02-01,2021-02-26,2019-12-16,2007-05-17,2009-08-18
dissolutionDate,,,,,
countryCode,GB,GB,GB,GB,GB
companiesHouseID,10595410,13229753,12366116,06250885,06993940
openCorporatesID,https://opencorporates.com/companies/gb/10595410,https://opencorporates.com/companies/gb/13229753,https://opencorporates.com/companies/gb/12366116,https://opencorporates.com/companies/gb/06250885,https://opencorporates.com/companies/gb/06993940
openOwnershipRegisterID,http://register.openownership.org/entities/59b...,http://register.openownership.org/entities/604...,http://register.openownership.org/entities/5e2...,http://register.openownership.org/entities/59b...,http://register.openownership.org/entities/59b...


In [21]:
# Last five rows, transposed so that each column is shown as a row.
# NOTE(review): the saved output still includes raw columns (id, name, ...),
# so it is presumably stale — re-run to confirm.
companies_preprocessed_df.tail().T

Unnamed: 0,96525,96526,96527,96528,96529
id,9634847840062845537,9734575849797190222,9759580217616288489,9929201675266395062,9983322336905260724
component,5390,1594,6,2505,6361
isCompany,True,True,True,True,True
name,AEROSPACE MANUFACTURING LIMITED,ALBURY CANONGATE HOLDCO LIMITED,HEAT PUMP AND CLEAR AIR LTD,HHFC DEVELOPMENT LIMITED,KB PROPERTY DEVELOPMENT LTD
foundingDate,2018-07-05,2020-12-08,2020-03-19,2019-09-19,2015-02-23
...,...,...,...,...,...
neighbourhood_outdegree__processed,-0.277868,-0.262458,0.35395,-0.123766,-0.185407
neighbourhood_closeness__processed,-0.205492,-0.1733,0.255795,-0.039113,-0.167051
neighbourhood_clustering__processed,-0.171275,-0.171275,-0.171275,-0.171275,-0.171275
neighbourhood_pagerank__processed,-0.157013,-0.087815,0.432449,0.017503,-0.139358


In [None]:
# Feature loading for a single data split.
def load_features(path_root, split):
    """Read the company and person feature tables of one data split.

    Both tables are read from ``<path_root>/<split>/`` and every row that
    contains a missing value is dropped.

    Args:
        path_root: Directory that contains one sub-directory per split.
        split: Name of the split sub-directory, e.g. ``"train"``.

    Returns:
        A ``(companies_df, persons_df)`` tuple of data frames.
    """
    split_dir = Path(path_root).joinpath(split)
    tables = [
        pd.read_parquet(split_dir / file_name).dropna()
        for file_name in ("companies.parquet", "persons.parquet")
    ]
    return tables[0], tables[1]

In [None]:
# Load the company and person feature tables of the "train" split.
# NOTE(review): `features_path` is not defined in any cell visible here —
# confirm where it is set before relying on a fresh-kernel run.
companies_train_df, persons_train_df = load_features(features_path, "train")

In [None]:
# Columns present in both the company and the person training tables.
select_cols = set(companies_train_df.columns) & set(persons_train_df.columns)
# Iteration order of a set of strings is not stable between interpreter
# sessions, so sort the names: the feature matrix then has the same column
# order on every run, which the seeded models below need to be reproducible.
processed_feature_cols = sorted(x for x in select_cols if x.endswith("__processed"))
# Strip only the trailing "__processed" marker to recover the raw column name.
raw_feature_cols = [x.rsplit("__processed", 1)[0] for x in processed_feature_cols]
# Name of the boolean label column.
target = "is_anomalous"

# Stack companies and persons row-wise into a single entity table.
entities_df = pd.concat([companies_train_df, persons_train_df], axis=0)

# Sample entities_df so that half of the entities are anomalous.
def balanced_sample(
    entities_df, target_col="is_anomalous", random_state=None
) -> pd.DataFrame:
    """Down-sample ``entities_df`` to equal numbers of normal and anomalous rows.

    The size of each class in the result equals the size of the smaller class
    in the input. Rows whose label is neither ``True`` nor ``False`` (e.g.
    missing) are ignored.

    Args:
        entities_df: Frame with a boolean label column.
        target_col: Name of the label column. Defaults to ``"is_anomalous"``.
        random_state: Optional seed forwarded to ``DataFrame.sample`` so the
            draw can be reproduced. ``None`` keeps the unseeded behaviour.

    Returns:
        A frame with the sampled normal rows followed by the sampled
        anomalous rows.
    """
    labels = entities_df[target_col]
    anomalous_mask = labels == True  # noqa: E712 - element-wise comparison
    normal_mask = labels == False  # noqa: E712 - element-wise comparison

    # Count each class from its own mask rather than by subtraction, so rows
    # with a missing label cannot inflate the "normal" count.
    n_sample = min(int(anomalous_mask.sum()), int(normal_mask.sum()))

    # Normal rows are drawn first, then anomalous rows, and concatenated in
    # that order.
    normal_df = entities_df[normal_mask].sample(n_sample, random_state=random_state)
    anomalous_df = entities_df[anomalous_mask].sample(
        n_sample, random_state=random_state
    )
    return pd.concat([normal_df, anomalous_df], axis=0)


# Class-balanced subset of the combined company/person table.
balanced_sample_df = balanced_sample(entities_df)

In [None]:
# Feature matrix: the processed feature columns of the balanced sample.
# Selecting a list of columns already yields a new frame, so it is used directly.
X = balanced_sample_df[processed_feature_cols]
# Label vector, row-aligned with X.
y = balanced_sample_df[target]

In [None]:
# Count how many rows carry each label value.
y.value_counts()

False    1902
True     1902
Name: is_anomalous, dtype: int64

In [None]:
# Train logistic regression model.
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for evaluation; the seed keeps the split fixed.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, random_state=42, test_size=0.2
)

# Fit the classifier on the training part and predict the held-out labels.
logreg = LogisticRegression(max_iter=1000, solver="lbfgs").fit(X_train, y_train)
y_pred = logreg.predict(X_test)

# Report accuracy, the per-class metrics and the confusion matrix.
acc = accuracy_score(y_test, y_pred)
print("Accuracy:", acc)
print(classification_report(y_test, y_pred))
print(confusion_matrix(y_test, y_pred))

Accuracy: 0.5177398160315374
              precision    recall  f1-score   support

       False       0.52      0.56      0.54       386
        True       0.51      0.47      0.49       375

    accuracy                           0.52       761
   macro avg       0.52      0.52      0.52       761
weighted avg       0.52      0.52      0.52       761

[[217 169]
 [198 177]]


In [None]:
## Train random forest model.
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# Random forest of 100 depth-2 trees, fitted on the existing train/test split.
rf = RandomForestClassifier(random_state=0, max_depth=2, n_estimators=100).fit(
    X_train, y_train
)
y_pred = rf.predict(X_test)

# Report accuracy and the per-class metrics on the held-out rows.
acc = accuracy_score(y_test, y_pred)
print("Accuracy:", acc)
print(classification_report(y_test, y_pred))

Accuracy: 0.5137976346911958
              precision    recall  f1-score   support

       False       0.52      0.55      0.54       386
        True       0.51      0.47      0.49       375

    accuracy                           0.51       761
   macro avg       0.51      0.51      0.51       761
weighted avg       0.51      0.51      0.51       761

