In [42]:
import os
from pathlib import Path

import pandas as pd
import networkx as nx
import scipy.sparse as sp
import yaml

# Move up the directory tree until the current working directory contains an
# entry named "data" (presumably the project root), so that relative paths
# used later in the notebook, such as "config/conf.yaml", resolve from there.
while Path("data") not in Path(".").iterdir():
    if Path.cwd() == Path.cwd().parent:
        # At the filesystem root, chdir("..") is a no-op; fail loudly instead
        # of looping forever when no ancestor contains "data".
        raise FileNotFoundError(
            'Could not find a "data" entry in the starting directory '
            "or any of its parents."
        )
    os.chdir("..")

import mscproject.dataprep as dp
import mscproject.features as feat
import mscproject.simulate as sim


In [43]:
# Load IPython's autoreload extension and switch it to mode 2, so edits to
# imported modules are picked up without restarting the kernel.
# NOTE(review): the recorded output below shows the extension was already
# loaded when this cell ran; `%reload_ext autoreload` would avoid that message.
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [44]:
# Parse the YAML configuration; its entries give the parquet locations read below.
conf_dict = yaml.safe_load(Path("config/conf.yaml").read_text())

# Read the three feature tables (companies, persons, edges), in that order.
companies_features, persons_features, edges_features = (
    pd.read_parquet(conf_dict[table_key])
    for table_key in ("companies_features", "persons_features", "edges_features")
)

In [45]:
# Summarise the company feature table: column names, non-null counts and dtypes.
companies_features.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 96530 entries, 0 to 96529
Data columns (total 27 columns):
 #   Column                         Non-Null Count  Dtype  
---  ------                         --------------  -----  
 0   id                             96530 non-null  object 
 1   component                      96530 non-null  int64  
 2   isCompany                      96530 non-null  bool   
 3   name                           96069 non-null  object 
 4   foundingDate                   95996 non-null  object 
 5   dissolutionDate                87 non-null     object 
 6   countryCode                    96530 non-null  object 
 7   companiesHouseID               96525 non-null  object 
 8   openCorporatesID               96013 non-null  object 
 9   openOwnershipRegisterID        96530 non-null  object 
 10  CompanyCategory                88317 non-null  object 
 11  CompanyStatus                  88317 non-null  object 
 12  Accounts_AccountCategory       88317 non-null 

In [46]:
# Summarise the person feature table: column names, non-null counts and dtypes.
persons_features.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 32609 entries, 0 to 32608
Data columns (total 19 columns):
 #   Column                         Non-Null Count  Dtype  
---  ------                         --------------  -----  
 0   id                             32609 non-null  object 
 1   component                      32609 non-null  int64  
 2   isCompany                      32609 non-null  bool   
 3   birthDate                      32608 non-null  object 
 4   name                           32609 non-null  object 
 5   nationality                    30695 non-null  object 
 6   is_anomalous                   32609 non-null  bool   
 7   indegree                       32609 non-null  int64  
 8   outdegree                      32609 non-null  int64  
 9   closeness                      32609 non-null  float64
 10  clustering                     32609 non-null  float64
 11  pagerank                       32609 non-null  float64
 12  neighbour_count                32609 non-null 

In [37]:
def rename_columns(df):
    """Give the positional neighbourhood columns descriptive names.

    The frame is expected to contain a run of graph-feature columns starting
    at ``"indegree"``, followed by placeholder columns that start at
    ``"neighbourhood_0"`` and run to the last column. Placeholders and graph
    features are paired by position, and each placeholder is renamed to
    ``"neighbourhood_<graph feature name>"``.

    The function is idempotent: a frame whose placeholders were already
    renamed (no ``"neighbourhood_0"`` but a ``"neighbourhood_indegree"``
    column) is returned as an unchanged copy, so re-running the notebook on
    previously saved output does not fail.

    Parameters
    ----------
    df : pandas.DataFrame
        Feature table to rename. It is never modified.

    Returns
    -------
    pandas.DataFrame
        A copy of ``df`` with the renamed columns.

    Raises
    ------
    ValueError
        If ``"indegree"`` is missing, if neither the placeholder nor the
        renamed neighbourhood columns are present, or if the number of
        placeholder columns differs from the number of graph features.
    """
    column_names = df.columns.tolist()
    if "indegree" not in column_names:
        raise ValueError(
            'Expected an "indegree" column marking the first graph feature.'
        )

    if "neighbourhood_0" not in column_names:
        if "neighbourhood_indegree" in column_names:
            # The placeholders were replaced by an earlier call; nothing to do.
            return df.copy()
        raise ValueError(
            'Expected a "neighbourhood_0" column marking the first '
            "neighbourhood feature."
        )

    first = column_names.index("indegree")
    last = column_names.index("neighbourhood_0")
    graph_feats = column_names[first:last]
    placeholders = column_names[last:]
    if len(placeholders) != len(graph_feats):
        # Pairing is positional, so the two runs must have the same length.
        raise ValueError(
            f"Cannot pair {len(placeholders)} neighbourhood column(s) with "
            f"{len(graph_feats)} graph feature column(s)."
        )

    renamed = df.copy()
    renamed.columns = column_names[:last] + [
        f"neighbourhood_{feature}" for feature in graph_feats
    ]
    return renamed


In [38]:
# Rename columns for each dataframe.
# The results are bound back to the same variable names, so the frames as
# loaded from disk are no longer available in the kernel after this cell.
companies_features = rename_columns(companies_features)
persons_features = rename_columns(persons_features)
# NOTE(review): the edges_features call is disabled, presumably because that
# table has no "indegree"/"neighbourhood_0" columns. TODO confirm.
# edges_features = rename_columns(edges_features)

In [41]:
# Overwrite old features with new features.
# NOTE: these are the same config entries the frames were read from, so the
# parquet files are replaced in place. edges_features is not written back.
companies_features.to_parquet(conf_dict["companies_features"])
persons_features.to_parquet(conf_dict["persons_features"])