In [135]:
import os
from pathlib import Path

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import yaml

import pdcast as pdc

# Walk up the directory tree until the working directory contains an entry
# named "data" (presumably the project root, so relative paths resolve).
while Path("data") not in Path(".").iterdir():
    if Path.cwd().parent == Path.cwd():
        # At the filesystem root "cd .." no longer moves, so the search would
        # spin forever; fail loudly instead.
        raise FileNotFoundError(
            "No 'data' entry found in the working directory or any of its parents."
        )
    os.chdir("..")

import sklearn.preprocessing as pre

In [136]:
# Load the YAML configuration, then read each parquet table whose path it lists.
conf_dict = yaml.safe_load(Path("config/conf.yaml").read_text())

persons_df, companies_df, edges_df = (
    pd.read_parquet(conf_dict[table_key])
    for table_key in ("persons_nodes", "companies_nodes", "edges")
)

In [137]:
def convert_to_datetime(df, cols):
    """Convert the given columns of ``df`` to pandas datetimes.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose columns are converted. It is modified in place.
    cols : str or iterable of str
        Column name(s) to convert. A single name may be passed as a plain
        string; it is treated as one column rather than iterated per character.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, so the call can be used in a reassignment.
    """
    if isinstance(cols, str):
        cols = [cols]
    for col in cols:
        df[col] = pd.to_datetime(df[col])
    return df



In [138]:
# Parse the birthDate column to datetime (the helper also modifies persons_df in place).
persons_df = convert_to_datetime(persons_df, ["birthDate"])

In [139]:
# Parse both company date columns to datetime (the helper also modifies companies_df in place).
companies_df = convert_to_datetime(companies_df, ["foundingDate", "dissolutionDate"])

In [140]:
# Company columns listed for dropping: each one is practically invariant.
company_drop_cols = ["dissolutionDate", "CompanyCategory"]

In [141]:
# Settings shared by both one-hot encoders: drop the first category, return a
# dense uint8 array, and use the "infrequent_if_exist" policy for unknowns.
# NOTE(review): scikit-learn >= 1.2 renames `sparse` to `sparse_output` -- confirm
# against the installed version before upgrading.
_shared_one_hot_encoder_kwargs = {
    "drop": "first",
    "sparse": False,
    "dtype": np.uint8,
    "handle_unknown": "infrequent_if_exist",
}

# The two node types differ only in the min_frequency threshold.
persons_one_hot_encoder_kwargs = {**_shared_one_hot_encoder_kwargs, "min_frequency": 50}
companies_one_hot_encoder_kwargs = {**_shared_one_hot_encoder_kwargs, "min_frequency": 1000}

In [142]:
# Summarise the persons table: column dtypes, non-null counts and memory usage.
persons_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 32609 entries, 0 to 32608
Data columns (total 6 columns):
 #   Column       Non-Null Count  Dtype         
---  ------       --------------  -----         
 0   id           32609 non-null  object        
 1   component    32609 non-null  int64         
 2   isCompany    32609 non-null  bool          
 3   birthDate    32608 non-null  datetime64[ns]
 4   name         32609 non-null  object        
 5   nationality  30695 non-null  object        
dtypes: bool(1), datetime64[ns](1), int64(1), object(3)
memory usage: 1.3+ MB


In [143]:
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.compose import ColumnTransformer

In [144]:
def identity(x):
    """Return the argument unchanged; used as the `func` of a pass-through step."""
    return x


# Keyword arguments pairing the no-op function with one-to-one output feature names.
passthough_kwargs = {"func": identity, "feature_names_out": "one-to-one"}

In [145]:
# Feature transformer for company nodes: the categorical columns are one-hot
# encoded and foundingDate is standardised. All other columns (including "id"
# and "component") are dropped. An optional pass-through of "id"/"component"
# via pre.FunctionTransformer(**passthough_kwargs) is left disabled.
companies_ct = ColumnTransformer(
    transformers=[
        (
            "one_hot_encoder",
            pre.OneHotEncoder(**companies_one_hot_encoder_kwargs),
            ["countryCode", "CompanyStatus", "Accounts_AccountCategory", "SICCode_SicText_1"],
        ),
        (
            "standard_scaler",
            pre.StandardScaler(),
            ["foundingDate"],
        ),
    ],
    remainder="drop",
)

In [146]:
# Feature transformer for person nodes: nationality is one-hot encoded and
# birthDate is standardised. All other columns (including "id" and
# "component") are dropped. An optional pass-through of "id"/"component"
# via pre.FunctionTransformer(**passthough_kwargs) is left disabled.
persons_ct = ColumnTransformer(
    transformers=[
        (
            "one_hot_encoder",
            pre.OneHotEncoder(**persons_one_hot_encoder_kwargs),
            ["nationality"],
        ),
        (
            "standard_scaler",
            pre.StandardScaler(),
            ["birthDate"],
        ),
    ],
    remainder="drop",
)

In [147]:
def remove_from_col_names(df, s):
    """Strip every occurrence of the substring ``s`` from the column names of ``df``.

    The columns are renamed in place on ``df`` and the same frame is returned.
    Non-string column labels (e.g. integers) are left untouched rather than
    raising, since only strings support substring replacement.
    """
    df.columns = [
        col.replace(s, "") if isinstance(col, str) else col
        for col in df.columns
    ]
    return df

In [148]:
# Fit the persons transformer and wrap its output in a DataFrame. The frame
# reuses persons_df's index so the index-based join below aligns row-for-row
# even when that index is not the default 0..n-1 range.
persons_processed_df = pd.DataFrame(
    persons_ct.fit_transform(persons_df),
    columns=persons_ct.get_feature_names_out(),
    index=persons_df.index,
)
# Shrink the column dtypes to smaller NumPy types.
persons_processed_df = pdc.downcast(persons_processed_df, numpy_dtypes_only=True)
# The transformer drops "id" and "component", so re-attach them by index.
persons_processed_df = persons_df[["id", "component"]].join(persons_processed_df)
persons_processed_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 32609 entries, 0 to 32608
Data columns (total 15 columns):
 #   Column                                           Non-Null Count  Dtype  
---  ------                                           --------------  -----  
 0   id                                               32609 non-null  object 
 1   component                                        32609 non-null  int64  
 2   one_hot_encoder__nationality_BE                  32609 non-null  bool   
 3   one_hot_encoder__nationality_CA                  32609 non-null  bool   
 4   one_hot_encoder__nationality_CH                  32609 non-null  bool   
 5   one_hot_encoder__nationality_DE                  32609 non-null  bool   
 6   one_hot_encoder__nationality_ES                  32609 non-null  bool   
 7   one_hot_encoder__nationality_GB                  32609 non-null  bool   
 8   one_hot_encoder__nationality_IE                  32609 non-null  bool   
 9   one_hot_encoder__nationality

In [150]:
# Fit the companies transformer and wrap its output in a DataFrame. The frame
# reuses companies_df's index so the index-based join below aligns row-for-row
# even when that index is not the default 0..n-1 range.
companies_processed_df = pd.DataFrame(
    companies_ct.fit_transform(companies_df),
    columns=companies_ct.get_feature_names_out(),
    index=companies_df.index,
)
# Strips the "passthrough__" prefix from feature names; this has no effect
# unless a transformer step named "passthrough" is enabled.
companies_processed_df = remove_from_col_names(companies_processed_df, "passthrough__")
# Shrink the column dtypes to smaller NumPy types.
companies_processed_df = pdc.downcast(companies_processed_df, numpy_dtypes_only=True)
# The transformer drops "id" and "component", so re-attach them by index.
companies_processed_df = companies_df[["id", "component"]].join(companies_processed_df)
companies_processed_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 96530 entries, 0 to 96529
Data columns (total 29 columns):
 #   Column                                                                                                        Non-Null Count  Dtype  
---  ------                                                                                                        --------------  -----  
 0   id                                                                                                            96530 non-null  object 
 1   component                                                                                                     96530 non-null  int64  
 2   one_hot_encoder__CompanyStatus_Active - Proposal to Strike off                                                96530 non-null  bool   
 3   one_hot_encoder__CompanyStatus_None                                                                           96530 non-null  bool   
 4   one_hot_encoder__CompanyStatus_infrequent_sklearn   

In [153]:
# Drop the interestedPartyIsPerson flag and standardise minimumShare in one
# chained expression; the scaler receives the column as an (n, 1) array.
edges_processed_df = edges_df.drop(columns=["interestedPartyIsPerson"]).assign(
    minimumShare=lambda frame: pre.StandardScaler().fit_transform(
        frame["minimumShare"].to_numpy().reshape(-1, 1)
    )
)

In [154]:
# Display the processed edge table (the rendered output is truncated by pandas).
edges_processed_df

Unnamed: 0,component,src,dst,minimumShare
0,7225,2356236782051912119,3732317247976753020,1.085958
1,7225,2356236782051912119,14047622054401208865,1.085958
2,7225,692314493058510508,390416379365304942,-0.976339
3,7225,15829769449001705952,3732317247976753020,1.085958
4,7225,15829769449001705952,17654996330473534901,1.085958
...,...,...,...,...
134830,455266566409,18192837036067908255,15233613116661558738,-0.976339
134831,455266566409,18168561485814806981,7879326061664287605,-0.976339
134832,455266566409,2056455430524085329,7879326061664287605,-0.976339
134833,455266566409,15391449340824074448,15233613116661558738,-0.976339
