In [1]:
import os
from pathlib import Path

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import pyspark.sql.functions as F
import yaml

import pandas_profiling as pp

import graphframes as gf
from pyspark import SparkContext, SparkConf
from pyspark.sql import SparkSession, Column

import pdcast as pdc

# Walk up the directory tree until the working directory contains an entry
# named "data"; the relative paths used afterwards (e.g. "config/conf.yaml")
# are resolved against that directory.
while Path("data") not in Path(".").iterdir():
    if Path.cwd() == Path.cwd().parent:
        # At the filesystem root chdir("..") is a no-op, so without this
        # guard the loop would spin forever when "data" is never found.
        raise FileNotFoundError(
            'Could not find a "data" entry in the working directory '
            "or any of its parents."
        )
    os.chdir("..")

import sklearn.preprocessing as pre


In [2]:
# Load the YAML configuration, then read the three parquet files it points to.
config_path = Path("config/conf.yaml")
conf_dict = yaml.safe_load(config_path.read_text())

persons_df, companies_df, edges_df = (
    pd.read_parquet(conf_dict[table_key])
    for table_key in ("persons_nodes", "companies_nodes", "edges")
)

In [3]:
# Count the person rows for each value of "component", largest groups first.
(
    persons_df
    .groupby("component")["id"]
    .count()
    .sort_values(ascending=False)
)

component
1555            252
8589938228      111
14328            80
9784             72
6950             70
               ... 
60129570143       1
17179899118       1
137439014439      1
266288017586      1
137438985685      1
Name: id, Length: 8586, dtype: int64

In [4]:
# Draw a reproducible (fixed seed) random 10% sample of the person nodes.
anomalous_persons_df = persons_df.sample(
    frac=0.1,
    random_state=42,
)

In [5]:
# Draw a reproducible (fixed seed) random 10% sample of the company nodes.
anomalous_companies_df = companies_df.sample(
    frac=0.1,
    random_state=42,
)

In [6]:
# Mark every sampled row: "is_anomalous" is True exactly for the rows whose
# index label belongs to the corresponding sample, False for all others.
persons_df["is_anomalous"] = persons_df.index.isin(anomalous_persons_df.index)
companies_df["is_anomalous"] = companies_df.index.isin(anomalous_companies_df.index)


In [7]:
# Keep the edges whose source ("src") is one of the sampled person or
# company ids, as an independent deep copy of those rows.
anomalous_ids = set(anomalous_persons_df["id"].to_list()).union(
    anomalous_companies_df["id"].to_list()
)
has_anomalous_src = edges_df["src"].isin(anomalous_ids)
anomalous_edges_df = edges_df[has_anomalous_src].copy(deep=True)

In [10]:

def indexes_not_shuffled(a1, a2):
    """Return the indexes at which ``a1`` and ``a2`` hold the same value.

    The result comes from ``np.argwhere``, so it has one row per matching
    position and one column per array dimension.
    """
    unchanged_mask = a1 == a2
    return np.argwhere(unchanged_mask)


def efficient_shuffle(a1):
    """Return a shuffled copy of ``a1`` in which no position keeps its value.

    Within one attempt, the positions that still hold their original value
    are repeatedly permuted among themselves (at most 100 passes). If some
    positions are still unmoved after that, the shuffle is restarted from a
    fresh copy with the next seed (42, 43, ...) and the result of that
    restarted attempt is returned.

    Relies on the module-level helpers ``indexes_not_shuffled`` and
    ``all_shuffled``; both must be defined before this function is called.

    Parameters
    ----------
    a1 : numpy.ndarray
        Array to shuffle. It is copied, never modified.

    Returns
    -------
    numpy.ndarray
        A rearrangement of ``a1`` for which ``all_shuffled(a1, result)``
        holds.

    Notes
    -----
    Restarts are unbounded. If no valid arrangement exists (for example when
    all values are identical), the recursion ends in ``RecursionError``.
    """
    def inner(a1, _i):
        # Progress indicator: number of the current attempt.
        print(_i, end="\r")
        a2 = a1.copy()
        rng = np.random.default_rng(42 + _i)
        for _ in range(100):
            to_shuffle = indexes_not_shuffled(a1, a2)
            a2[to_shuffle] = rng.permutation(a2[to_shuffle])
            if all_shuffled(a1, a2):
                return a2
        # This attempt got stuck (e.g. a single unmoved position can never be
        # fixed by permuting it with itself). Hand back the restarted
        # attempt's result, not this attempt's partially shuffled array.
        return inner(a1, _i + 1)
    return inner(a1, 0)


In [11]:
# Permute the sources of the anomalous edges until every one of them has moved.
# The shuffling happens on a separate copy so anomalous_edges_df keeps the
# original sources.
rng = np.random.default_rng(seed=42)

shuffled_edges_df = anomalous_edges_df.copy(deep=True)

def all_shuffled(a1, a2):
    """Tell whether ``a1`` and ``a2`` differ at every position."""
    moved_mask = a1 != a2
    return np.all(moved_mask)

i = 0  # NOTE(review): not read by any visible cell — confirm before removing

# Shuffle the "src" values of the anomalous edges and write them back into
# the working copy.
original_edges = shuffled_edges_df["src"].to_numpy()
shuffled_edges = efficient_shuffle(a1=original_edges)
shuffled_edges_df["src"] = shuffled_edges


24

In [13]:
# Splice the shuffled edges into the full edge list: remove the rows that
# were shuffled, append their shuffled versions, then restore index order.
untouched_edges_df = edges_df.drop(shuffled_edges_df.index)
edges_anomalised_df = pd.concat([untouched_edges_df, shuffled_edges_df]).sort_index()

In [14]:
# Sanity checks: the anomalised edge table keeps the original shape but is
# no longer identical to the original table.
same_shape = edges_anomalised_df.shape == edges_df.shape
assert same_shape
unchanged = edges_anomalised_df.equals(edges_df)
assert not unchanged

In [19]:
# Summary statistics of the person nodes table.
persons_df.describe()

Unnamed: 0,component,class
count,32609.0,32609.0
mean,106961800000.0,0.100003
std,117067900000.0,0.300009
min,39.0,0.0
25%,25769810000.0,0.0
50%,68719510000.0,0.0
75%,154618800000.0,0.0
max,979252600000.0,1.0


In [20]:
# Summary statistics of the company nodes table.
companies_df.describe()

Unnamed: 0,component,class
count,96530.0,96530.0
mean,95221320000.0,0.1
std,111261700000.0,0.300002
min,39.0,0.0
25%,17179900000.0,0.0
50%,60129550000.0,0.0
75%,137439000000.0,0.0
max,979252600000.0,1.0


In [None]:
# Preview the first rows of the edge table that contains the shuffled edges.
edges_anomalised_df.head()

Unnamed: 0,component,src,dst,interestedPartyIsPerson,minimumShare
0,7225,2356236782051912119,3732317247976753020,True,75.0
1,7225,2356236782051912119,14047622054401208865,True,75.0
2,7225,692314493058510508,390416379365304942,True,25.0
3,7225,15829769449001705952,3732317247976753020,True,75.0
4,7225,15829769449001705952,17654996330473534901,True,75.0


In [None]:
# Column dtypes, non-null counts and memory usage of the original edge table.
edges_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 134835 entries, 0 to 134834
Data columns (total 5 columns):
 #   Column                   Non-Null Count   Dtype  
---  ------                   --------------   -----  
 0   component                134835 non-null  int64  
 1   src                      134835 non-null  object 
 2   dst                      134835 non-null  object 
 3   interestedPartyIsPerson  134835 non-null  bool   
 4   minimumShare             134835 non-null  float64
dtypes: bool(1), float64(1), int64(1), object(2)
memory usage: 4.2+ MB
