In [1]:
import os
from pathlib import Path

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import pyspark.sql.functions as F
import yaml

import graphframes as gf
from pyspark import SparkContext, SparkConf
from pyspark.sql import SparkSession, Column

while not Path("data") in Path(".").iterdir():
    os.chdir("..")

import src.features as ft

plt.style.use("seaborn-white")
conf_dict = yaml.safe_load(Path("config/conf.yaml").read_text())

In [2]:
%load_ext autoreload
%autoreload 2

In [3]:
companies_df = pd.read_parquet(conf_dict["companies_nodes"])
persons_df = pd.read_parquet(conf_dict["persons_nodes"])
edges_df = pd.read_parquet(conf_dict["edges"])

In [4]:
edges_df = edges_df.sample(frac=0.01)
graph = ft.make_graph(edges=edges_df)

In [5]:
persons_df

Unnamed: 0,id,component,isCompany,birthDate,name,nationality
0,10033113650630221480,51539636259,False,1982-01-01,Maria Cristina Jalos,PH
1,1003532205566492230,154618822792,False,1959-10-01,Vanessa Lillian Brady,GB
2,10050008744520342618,34359802212,False,1982-09-01,Dave Kirby,GB
3,10118290751555540644,128849030701,False,1986-09-01,James Claffey,GB
4,10169105295523201820,326417527307,False,1984-12-01,Nikita Parekh,GB
...,...,...,...,...,...,...
32604,9840995851442736767,8589998816,False,1979-07-01,John Edward Barraclough,GB
32605,9864692745201654962,240518188755,False,1986-02-01,Selina Anne Ben-Yoav,GB
32606,9873850331556472653,17179887487,False,1977-12-01,Andrew Flintoff,GB
32607,9905019640801255009,34359787673,False,1967-11-01,Dale Russell Shaun Priestley,GB


In [16]:
ft.add_network_features(graph=graph)

                        id data
0      7891884204866369429   {}
1     13434926265158885364   {}
2     10331775550734659850   {}
3        66147880586722570   {}
4      4767622847371897074   {}
...                    ...  ...
2644  16329924650850821525   {}
2645  15071484163255013799   {}
2646   7301509594611767722   {}
2647   6157110392702229510   {}
2648  15743935692503982339   {}

[2649 rows x 2 columns]


NodeNotFound: Source 0        7891884204866369429
1       13434926265158885364
2       10331775550734659850
3          66147880586722570
4        4767622847371897074
                ...         
2644    16329924650850821525
2645    15071484163255013799
2646     7301509594611767722
2647     6157110392702229510
2648    15743935692503982339
Name: id, Length: 2649, dtype: object is not in G

In [10]:
import networkx as nx
nx.closeness_centrality(graph)

{'7891884204866369429': 0.0,
 '13434926265158885364': 0.00037764350453172205,
 '10331775550734659850': 0.0,
 '66147880586722570': 0.0007552870090634441,
 '4767622847371897074': 0.0,
 '5418878984254378759': 0.00037764350453172205,
 '15391758754681071602': 0.0,
 '13646403226720930885': 0.00037764350453172205,
 '50141818656404986': 0.0,
 '5815515800424087364': 0.00037764350453172205,
 '8679249462771218579': 0.0,
 '6509253124213453403': 0.00037764350453172205,
 '10025579008844477037': 0.0,
 '16046662958077953477': 0.00037764350453172205,
 '16743500227890898082': 0.0,
 '3351911789797807096': 0.00037764350453172205,
 '14190350008895016105': 0.0,
 '10324867166571390734': 0.00037764350453172205,
 '8651306729001194833': 0.0,
 '9870272800378954992': 0.00037764350453172205,
 '8621634361727569248': 0.0,
 '350983854228196849': 0.00037764350453172205,
 '1752745239525170502': 0.0,
 '10406999450759449578': 0.00037764350453172205,
 '14592012817923510054': 0.0,
 '7590343840458183967': 0.0003776435045317