In [1]:
import os
from pathlib import Path

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import pyspark.sql.functions as F
import yaml

import networkx as nx

# Make the project root the working directory so that relative paths such as
# "config/conf.yaml" and the `import src.features` below resolve no matter
# where the notebook was launched from. The root is the nearest ancestor of
# the current directory (itself included) that contains a `data` entry.
#
# The search is bounded by the filesystem root: an unbounded
# `while ...: os.chdir("..")` loop spins forever once it reaches "/" because
# changing to ".." there is a no-op.
_start_dir = Path.cwd()
for _candidate in (_start_dir, *_start_dir.parents):
    if (_candidate / "data").exists():
        os.chdir(_candidate)
        break
else:
    raise FileNotFoundError(
        f"No 'data' entry found in {_start_dir} or any of its parent directories"
    )

import src.features as ft

# Matplotlib 3.6 renamed the bundled seaborn styles to "seaborn-v0_8-*" and the
# old aliases were dropped in later releases. Use the legacy name when this
# Matplotlib still ships it, otherwise fall back to the renamed style.
_legacy_style = "seaborn-white"
plt.style.use(
    _legacy_style if _legacy_style in plt.style.available else "seaborn-v0_8-white"
)

# Load the project configuration (a mapping of dataset names to paths, as used
# by the loading cell below). The path is relative to the project root.
# Read as UTF-8 explicitly so the result does not depend on the OS locale.
conf_dict = yaml.safe_load(Path("config/conf.yaml").read_text(encoding="utf-8"))

In [2]:
# Enable IPython's autoreload extension. Mode 2 reloads imported modules before
# each cell executes, so edits to project code (e.g. src.features) are picked
# up without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [3]:
# Read the three parquet inputs whose locations are listed in the config:
# company nodes, person nodes and the edge list, in that order.
companies_df, persons_df, edges_df = (
    pd.read_parquet(conf_dict[dataset_key])
    for dataset_key in ("companies_nodes", "persons_nodes", "edges")
)

In [4]:
# Build the graph object from the edge list using the project helper.
# NOTE(review): the in/out-degree features computed later suggest a directed
# graph — confirm in src.features.make_graph.
graph = ft.make_graph(edges=edges_df)

In [5]:
# Compute per-node features for the graph. The helper logs its progress and,
# per the recorded output, produces indegree, outdegree, closeness, clustering
# and pagerank columns.
node_features = ft.get_node_features(graph=graph)

Getting indegree...
Getting outdegree...
Getting closeness...
Getting clustering...
Getting pagerank...


In [6]:
# Summary statistics (count, mean, std, quartiles, extremes) for every node
# feature column — a quick sanity check of the value ranges.
node_features.describe()

Unnamed: 0,indegree,outdegree,closeness,clustering,pagerank
count,129139.0,129139.0,129139.0,129139.0,129139.0
mean,1.044108,1.044108,1e-05,0.005842,8e-06
std,0.834937,3.152014,8e-06,0.047806,4e-06
min,0.0,0.0,0.0,0.0,5e-06
25%,0.0,0.0,0.0,0.0,5e-06
50%,1.0,0.0,1e-05,0.0,6e-06
75%,2.0,1.0,1.5e-05,0.0,9e-06
max,19.0,460.0,0.000147,1.0,9.1e-05


In [10]:
# Derive neighbourhood-level features from the per-node features. Going by the
# helper's name these are averages over each node's neighbourhood; see
# src.features for the exact neighbourhood definition.
# NOTE(review): the execution count jumps from 6 to 10 here — re-run with
# "Restart Kernel & Run All" to confirm nothing depends on removed cells.
neighbourhood_features = ft.get_average_neighbourhood_features(graph, node_features)

In [11]:
# Show the resulting frame (the notebook truncates the middle rows).
# NOTE(review): several rows are all zeros, including a pagerank of 0 that is
# below the minimum node pagerank reported by describe(). Presumably these are
# nodes without neighbours, filled with 0 — confirm in
# ft.get_average_neighbourhood_features.
neighbourhood_features

Unnamed: 0,neighbourhood_indegree,neighbourhood_outdegree,neighbourhood_closeness,neighbourhood_clustering,neighbourhood_pagerank
2356236782051912119,1.272727,0.0,0.000010,0.0,0.000006
3732317247976753020,0.000000,0.0,0.000000,0.0,0.000000
14047622054401208865,0.000000,0.0,0.000000,0.0,0.000000
692314493058510508,2.000000,0.0,0.000015,0.0,0.000010
390416379365304942,0.000000,0.0,0.000000,0.0,0.000000
...,...,...,...,...,...
18192837036067908255,3.000000,0.0,0.000023,0.0,0.000014
3003051962805999676,0.000000,0.0,0.000000,0.0,0.000000
2056455430524085329,3.000000,0.0,0.000023,0.0,0.000014
18168561485814806981,3.000000,0.0,0.000023,0.0,0.000016
