In [1]:
import os
from pathlib import Path

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import pyspark.sql.functions as F
import yaml

import networkx as nx

# Make the project root the working directory, wherever the notebook was launched
# from, so that relative paths such as "config/conf.yaml" resolve.
# The root is the closest directory (starting at the current one and walking up)
# that contains an entry named "data".
# NOTE(review): the `src` import that follows presumably relies on this chdir too — confirm.
_launch_dir = Path.cwd()
while Path("data") not in Path(".").iterdir():
    if Path.cwd() == Path.cwd().parent:
        # At the filesystem root ".." points back to the root itself, so without
        # this guard the loop would spin forever. Restore the original directory
        # and fail loudly instead.
        os.chdir(_launch_dir)
        raise FileNotFoundError(
            f'no "data" entry found in {_launch_dir} or any of its parent directories'
        )
    os.chdir("..")

import src.features as ft

# matplotlib 3.6 renamed its bundled seaborn style sheets to "seaborn-v0_8-*" and
# later releases dropped the old names, so prefer the new name and fall back to
# the legacy one on older installations.
plt.style.use(
    "seaborn-v0_8-white"
    if "seaborn-v0_8-white" in plt.style.available
    else "seaborn-white"
)
# Project configuration, read relative to the working directory. The keys used
# in this notebook ("companies_nodes", "persons_nodes", "edges") are passed to
# pd.read_parquet as dataset locations.
conf_dict = yaml.safe_load(Path("config/conf.yaml").read_text())

In [2]:
# Enable IPython's autoreload extension; mode 2 re-imports modules before each
# cell is executed, so edits to project code are picked up without a kernel restart.
%load_ext autoreload
%autoreload 2

In [3]:
# Load the company nodes, person nodes and edge list from the parquet
# locations named in the project config (read in that order).
companies_df, persons_df, edges_df = (
    pd.read_parquet(conf_dict[table_key])
    for table_key in ("companies_nodes", "persons_nodes", "edges")
)

In [4]:
# Build the graph from the edge table; `graph` is handed to networkx functions
# (ego_graph, all_neighbors) and to the feature helpers further down.
graph = ft.make_graph(edges=edges_df)

In [5]:
# Per-node feature table; the describe() output in the next cell shows the columns
# indegree, outdegree, closeness, clustering and pagerank.
node_features = ft.get_node_features(graph=graph)

In [6]:
# Summary statistics of every node feature column, as a quick sanity check.
node_features.describe()

Unnamed: 0,indegree,outdegree,closeness,clustering,pagerank
count,129139.0,129139.0,129139.0,129139.0,129139.0
mean,1.044108,1.044108,1e-05,0.005842,8e-06
std,0.834937,3.152014,8e-06,0.047806,4e-06
min,0.0,0.0,0.0,0.0,5e-06
25%,0.0,0.0,0.0,0.0,5e-06
50%,1.0,0.0,1e-05,0.0,6e-06
75%,2.0,1.0,1.5e-05,0.0,9e-06
max,19.0,460.0,0.000147,1.0,9.1e-05


In [7]:
# Nodes of the radius-1 ego network around one example node, ignoring edge
# direction (undirected=True); the centre node itself is part of the result.
list(
    nx.ego_graph(
        graph,
        "2356236782051912119",
        radius=1,
        undirected=True,
    ).nodes
)

['10741860318944961270',
 '10757660430204686758',
 '14047622054401208865',
 '9902298899698290018',
 '3732317247976753020',
 '2356236782051912119',
 '8747669870869825129',
 '16341034140672858559',
 '16543695269686124450',
 '18186132645391932375',
 '7820636420844813709',
 '390416379365304942']

In [8]:
# Cross-check for the ego network of the same example node: nx.all_neighbors
# returns both predecessors and successors when the graph is directed.
# NOTE(review): expected to equal the ego-network nodes minus the centre node — confirm.
list(nx.all_neighbors(graph, "2356236782051912119"))

['3732317247976753020',
 '14047622054401208865',
 '10757660430204686758',
 '16543695269686124450',
 '8747669870869825129',
 '16341034140672858559',
 '10741860318944961270',
 '9902298899698290018',
 '7820636420844813709',
 '18186132645391932375',
 '390416379365304942']

In [9]:
# Neighbourhood-level feature table derived from the graph and the per-node features;
# the table displayed in the next cell shows its neighbourhood_* columns.
# NOTE(review): how neighbours are selected and combined is defined in src.features — confirm there.
neighbourhood_features = ft.get_local_neighbourhood_features(graph, node_features)

In [11]:
# Show the ten rows with the highest neighbourhood closeness.
neighbourhood_features.sort_values("neighbourhood_closeness", ascending=False).iloc[:10]

Unnamed: 0,neighbourhood_indegree,neighbourhood_outdegree,neighbourhood_closeness,neighbourhood_clustering,neighbourhood_pagerank,neighbourhood_num_neighbours
9198120535972248050,686.0,1.0,0.007088,0.5,0.003535,456.0
2286052163500911970,501.0,2.0,0.005049,0.0,0.002614,460.0
4513814456581281117,407.0,4.0,0.004052,0.083333,0.002117,359.0
9842794450934916335,222.0,13.0,0.002214,0.0,0.001205,153.0
7149306621246736401,48.0,7.0,0.001922,0.0,0.000292,47.0
2291515699414010435,171.0,0.0,0.001793,0.0,0.000872,125.0
17490528357289809058,165.0,2.0,0.001745,0.0,0.000839,105.0
13939501595514082250,205.0,0.0,0.001587,0.0,0.001061,176.0
8679954106469580161,75.0,1.0,0.001256,0.0,0.000381,49.0
12334461834142745614,119.0,38.0,0.001103,0.166667,0.000568,80.0
