In [4]:
import os
from pathlib import Path

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import pyspark.sql.functions as F
import yaml

import pandas_profiling as pp

import graphframes as gf
from pyspark import SparkContext, SparkConf
from pyspark.sql import SparkSession, Column

import pdcast as pdc

# Walk up from the kernel's working directory until the current directory
# contains a "data" entry, so the relative paths used later in the notebook
# resolve regardless of where the kernel was started.
while Path("data") not in Path(".").iterdir():
    if Path.cwd().parent == Path.cwd():
        # At the filesystem root chdir("..") is a no-op; without this guard the
        # loop would spin forever when no ancestor directory holds "data".
        raise FileNotFoundError(
            'No "data" entry found in the working directory or any of its parents.'
        )
    os.chdir("..")

import sklearn.preprocessing as pre


In [79]:
# Load the YAML configuration; its entries are used below as parquet locations.
conf_dict = yaml.safe_load(Path("config/conf.yaml").read_text())

# Read the person-node, company-node and edge tables, in that order.
persons_df, companies_df, edges_df = (
    pd.read_parquet(conf_dict[table_key])
    for table_key in ("persons_nodes", "companies_nodes", "edges")
)

In [80]:
def convert_to_datetime(df, cols):
    """Return a copy of ``df`` with the given columns parsed as datetimes.

    The input frame is left untouched, so re-running a cell that calls this
    function always starts from the same data.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the columns to convert.
    cols : str or iterable of column labels
        Column(s) to pass through ``pd.to_datetime``. A single label given as
        a string is treated as a one-element list.

    Returns
    -------
    pandas.DataFrame
        New frame with the columns in ``cols`` converted and all others
        unchanged.

    Raises
    ------
    KeyError
        If a requested column is not present in ``df``.
    """
    if isinstance(cols, str):
        # A bare string would otherwise be iterated character by character.
        cols = [cols]
    converted = df.copy()
    for col in cols:
        converted[col] = pd.to_datetime(converted[col])
    return converted


In [81]:
# Parse birth dates into datetimes; the column is fed to a scaler further down.
persons_df = convert_to_datetime(df=persons_df, cols=["birthDate"])

In [82]:
# Parse founding dates, then discard the dissolution date column.
# errors="ignore" keeps this cell re-runnable: on a second execution the column
# is already gone and a plain drop would raise KeyError.
companies_df = convert_to_datetime(companies_df, ["foundingDate"]).drop(
    columns=["dissolutionDate"], errors="ignore"
)

In [None]:
# Company columns flagged as practically invariant (both entries).
company_drop_cols = ["dissolutionDate", "CompanyCategory"]

In [83]:
# Keyword arguments for the one-hot encoders of the two node types. Both share
# every setting except ``min_frequency``.
# NOTE(review): ``sparse`` was renamed ``sparse_output`` in scikit-learn 1.2 and
# removed in 1.4 — confirm the installed scikit-learn version before upgrading.
persons_one_hot_encoder_kwargs = {
    "drop": "first",
    "sparse": False,
    "dtype": int,
    "handle_unknown": "infrequent_if_exist",
    "min_frequency": 50,
}

# Same settings as for persons, with a higher frequency threshold.
companies_one_hot_encoder_kwargs = {
    **persons_one_hot_encoder_kwargs,
    "min_frequency": 1000,
}


In [84]:
# Show column dtypes and non-null counts of the persons table.
persons_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 32609 entries, 0 to 32608
Data columns (total 6 columns):
 #   Column       Non-Null Count  Dtype         
---  ------       --------------  -----         
 0   id           32609 non-null  object        
 1   component    32609 non-null  int64         
 2   isCompany    32609 non-null  bool          
 3   birthDate    32608 non-null  datetime64[ns]
 4   name         32609 non-null  object        
 5   nationality  30695 non-null  object        
dtypes: bool(1), datetime64[ns](1), int64(1), object(3)
memory usage: 1.3+ MB


In [85]:
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.compose import ColumnTransformer

In [88]:
# Feature transformer for company nodes: one-hot encode the listed categorical
# columns, standardise the founding date, and drop every other column.
companies_ct = ColumnTransformer(
    transformers=[
        (
            "one_hot_encoder",
            pre.OneHotEncoder(**companies_one_hot_encoder_kwargs),
            ["countryCode", "CompanyStatus", "Accounts_AccountCategory", "SICCode_SicText_1"],
        ),
        (
            "standard_scaler",
            pre.StandardScaler(),
            ["foundingDate"],
        ),
    ],
    remainder="drop",
)


In [89]:
# Feature transformer for person nodes: one-hot encode nationality, standardise
# the birth date, and drop every other column.
persons_ct = ColumnTransformer(
    transformers=[
        (
            "one_hot_encoder",
            pre.OneHotEncoder(**persons_one_hot_encoder_kwargs),
            ["nationality"],
        ),
        (
            "standard_scaler",
            pre.StandardScaler(),
            ["birthDate"],
        ),
    ],
    remainder="drop",
)


In [90]:
# Fit the persons transformer on persons_df and display the transformed array.
persons_ct.fit_transform(persons_df)

array([[ 0.        ,  0.        ,  0.        , ...,  0.        ,
         0.        ,  0.82851403],
       [ 0.        ,  0.        ,  0.        , ...,  0.        ,
         0.        , -0.86265711],
       [ 0.        ,  0.        ,  0.        , ...,  0.        ,
         0.        ,  0.87907438],
       ...,
       [ 0.        ,  0.        ,  0.        , ...,  0.        ,
         0.        ,  0.51807759],
       [ 0.        ,  0.        ,  0.        , ...,  0.        ,
         0.        , -0.24823433],
       [ 0.        ,  0.        ,  0.        , ...,  0.        ,
         0.        , -0.62192323]])

In [91]:
# List the output column names produced by the fitted persons transformer.
persons_ct.get_feature_names_out()

array(['one_hot_encoder__nationality_BE',
       'one_hot_encoder__nationality_CA',
       'one_hot_encoder__nationality_CH',
       'one_hot_encoder__nationality_DE',
       'one_hot_encoder__nationality_ES',
       'one_hot_encoder__nationality_GB',
       'one_hot_encoder__nationality_IE',
       'one_hot_encoder__nationality_PH',
       'one_hot_encoder__nationality_PL',
       'one_hot_encoder__nationality_ZA',
       'one_hot_encoder__nationality_None',
       'one_hot_encoder__nationality_infrequent_sklearn',
       'standard_scaler__birthDate'], dtype=object)

In [92]:
# Fit the companies transformer on companies_df and display the transformed array.
companies_ct.fit_transform(companies_df)

array([[0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.25971065],
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.4062732 ],
       [0.        , 0.        , 0.        , ..., 0.        , 1.        ,
        0.36307374],
       ...,
       [0.        , 0.        , 1.        , ..., 0.        , 0.        ,
        0.37234486],
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.3543944 ],
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.18978276]])

In [93]:
# List the output column names produced by the fitted companies transformer.
# NOTE(review): the recorded output below starts with CompanyCategory features,
# whereas the companies_ct definition in this notebook selects countryCode —
# the saved output looks stale; re-run the notebook to refresh it.
companies_ct.get_feature_names_out()

array(['one_hot_encoder__CompanyCategory_None',
       'one_hot_encoder__CompanyCategory_infrequent_sklearn',
       'one_hot_encoder__CompanyStatus_Active - Proposal to Strike off',
       'one_hot_encoder__CompanyStatus_None',
       'one_hot_encoder__CompanyStatus_infrequent_sklearn',
       'one_hot_encoder__Accounts_AccountCategory_DORMANT',
       'one_hot_encoder__Accounts_AccountCategory_FULL',
       'one_hot_encoder__Accounts_AccountCategory_GROUP',
       'one_hot_encoder__Accounts_AccountCategory_MICRO ENTITY',
       'one_hot_encoder__Accounts_AccountCategory_NO ACCOUNTS FILED',
       'one_hot_encoder__Accounts_AccountCategory_SMALL',
       'one_hot_encoder__Accounts_AccountCategory_TOTAL EXEMPTION FULL',
       'one_hot_encoder__Accounts_AccountCategory_UNAUDITED ABRIDGED',
       'one_hot_encoder__Accounts_AccountCategory_None',
       'one_hot_encoder__Accounts_AccountCategory_infrequent_sklearn',
       'one_hot_encoder__SICCode_SicText_1_41202 - Construction of dome