In [1]:
import os
from pathlib import Path

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import pyspark.sql.functions as F
import yaml

from pyspark.sql import SparkSession

# Make the project root the working directory so that relative paths such as
# "config/conf.yaml" resolve regardless of where the kernel was started. The
# root is taken to be the nearest directory (starting from the current one and
# moving upwards) that contains an entry named "data".
# Searching a finite list of ancestors avoids looping forever at the filesystem
# root, where os.chdir("..") no longer changes the directory.
_start_dir = Path.cwd()
for _candidate in (_start_dir, *_start_dir.parents):
    if (_candidate / "data").exists():
        os.chdir(_candidate)
        break
else:
    raise FileNotFoundError(
        f"No 'data' entry found in {_start_dir} or any of its parent directories"
    )

# matplotlib 3.6 renamed the bundled seaborn styles to "seaborn-v0_8-*" and
# later releases removed the old names. Use the original name when this
# matplotlib still ships it, otherwise the renamed equivalent.
_style = "seaborn-white" if "seaborn-white" in plt.style.available else "seaborn-v0_8-white"
plt.style.use(_style)

# Project configuration; later cells look up the "*_processed" parquet
# locations in this dictionary.
conf_dict = yaml.safe_load(Path("config/conf.yaml").read_text())

# Reuse the active Spark session if there is one, otherwise start a new one
# with default settings.
spark = SparkSession.builder.getOrCreate()

22/05/25 23:09:24 WARN Utils: Your hostname, domvwt-XPS-13-9305 resolves to a loopback address: 127.0.1.1; using 192.168.0.24 instead (on interface wlp164s0)
22/05/25 23:09:24 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
22/05/25 23:09:24 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
Using Spark's default log4j profile: org/apache/spark/log4j-defaults.properties
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).


## TODO

- [ ] work out how to link persons to companies via relationships
- [ ] identify useful node features
- [ ] decide whether to model as a heterogeneous graph
- [ ] confirm whether the `identifiers` column can be dropped (it looks unnecessary)

In [2]:
# Load the three processed tables as Spark DataFrames. The parquet locations
# come from the project configuration and are read in the order
# companies -> relationships -> persons.
companies_sdf, relationships_sdf, persons_sdf = (
    spark.read.parquet(conf_dict[conf_key])
    for conf_key in (
        "companies_processed",
        "relationships_processed",
        "persons_processed",
    )
)

In [6]:
# Print the nested schema of the relationships table to see which struct and
# array fields are available before flattening them in the cells below.
relationships_sdf.printSchema()

root
 |-- interestedParty: struct (nullable = true)
 |    |-- describedByEntityStatement: string (nullable = true)
 |    |-- describedByPersonStatement: string (nullable = true)
 |    |-- unspecified: struct (nullable = true)
 |    |    |-- description: string (nullable = true)
 |    |    |-- reason: string (nullable = true)
 |-- interests: array (nullable = true)
 |    |-- element: struct (containsNull = true)
 |    |    |-- details: string (nullable = true)
 |    |    |-- endDate: string (nullable = true)
 |    |    |-- share: struct (nullable = true)
 |    |    |    |-- exact: double (nullable = true)
 |    |    |    |-- exclusiveMaximum: boolean (nullable = true)
 |    |    |    |-- exclusiveMinimum: boolean (nullable = true)
 |    |    |    |-- maximum: double (nullable = true)
 |    |    |    |-- minimum: double (nullable = true)
 |    |    |-- startDate: string (nullable = true)
 |    |    |-- type: string (nullable = true)
 |-- source: struct (nullable = true)
 |    |-- descrip

In [41]:
# Show both ends of each relationship side by side: the interested party
# (expanded from its struct) and the subject it holds an interest in.
# Both structs contain a field named `describedByEntityStatement`, so expanding
# both with ".*" produced two identically named columns. The subject's field is
# aliased so the two can be told apart.
# NOTE(review): the displayed output shows `describedByEntityStatement` as the
# only field of `subject` - confirm against the full schema.
relationships_sdf.select(
    "interestedParty.*",
    F.col("subject.describedByEntityStatement").alias("subjectEntityStatement"),
).show(truncate=False)

+------------------------------------------+-------------------------------------------+-----------+-------------------------------------------+
|describedByEntityStatement                |describedByPersonStatement                 |unspecified|describedByEntityStatement                 |
+------------------------------------------+-------------------------------------------+-----------+-------------------------------------------+
|null                                      |openownership-register-7780563163888271715 |null       |openownership-register-10001903462476671266|
|null                                      |openownership-register-4299764521413946178 |null       |openownership-register-10005700944398140823|
|null                                      |openownership-register-16055575199624308533|null       |openownership-register-10006249071307456609|
|null                                      |openownership-register-17430426542224702560|null       |openownership-register-1002778

In [20]:
# One row per (interested party, interest): expand the interestedParty struct,
# explode the `interests` array, then keep only the person statement id.
# NOTE(review): F.explode emits no row for a null/empty array, so relationships
# without interests would not appear here - confirm that is intended.
rels_df01 = (
    relationships_sdf
    .select(
        "interestedParty.*",
        F.explode("interests").alias("interests"),
    )
    .drop("unspecified", "describedByEntityStatement")
)
rels_df01.show(truncate=False)

+-------------------------------------------+-------------------------------------------------------------------------------------------------------------------+
|describedByPersonStatement                 |interests                                                                                                          |
+-------------------------------------------+-------------------------------------------------------------------------------------------------------------------+
|openownership-register-7780563163888271715 |{ownership-of-shares-75-to-100-percent, null, {null, false, false, 100.0, 75.0}, 2018-12-19, shareholding}         |
|openownership-register-7780563163888271715 |{ownership-of-shares-75-to-100-percent-as-trust, null, {null, false, false, 100.0, 75.0}, 2018-12-19, shareholding}|
|openownership-register-7780563163888271715 |{ownership-of-shares-75-to-100-percent-as-firm, null, {null, false, false, 100.0, 75.0}, 2018-12-19, shareholding} |
|openownership-register-7780

In [26]:
# Flatten the exploded `interests` struct into top-level columns, keeping the
# person statement id alongside them.
rels_df02 = rels_df01.select(
    F.col("describedByPersonStatement"),
    F.col("interests.*"),
)
rels_df02.show(truncate=False)

+-------------------------------------------+----------------------------------------------+----------+---------------------------------+----------+--------------------+
|describedByPersonStatement                 |details                                       |endDate   |share                            |startDate |type                |
+-------------------------------------------+----------------------------------------------+----------+---------------------------------+----------+--------------------+
|openownership-register-7780563163888271715 |ownership-of-shares-75-to-100-percent         |null      |{null, false, false, 100.0, 75.0}|2018-12-19|shareholding        |
|openownership-register-7780563163888271715 |ownership-of-shares-75-to-100-percent-as-trust|null      |{null, false, false, 100.0, 75.0}|2018-12-19|shareholding        |
|openownership-register-7780563163888271715 |ownership-of-shares-75-to-100-percent-as-firm |null      |{null, false, false, 100.0, 75.0}|2018-12-19|sh

In [28]:
# Expand the nested `share` struct to inspect its individual fields.
(
    rels_df02
    .select(F.col("share.*"))
    .show(truncate=False)
)

+-----+----------------+----------------+-------+-------+
|exact|exclusiveMaximum|exclusiveMinimum|maximum|minimum|
+-----+----------------+----------------+-------+-------+
|null |false           |false           |100.0  |75.0   |
|null |false           |false           |100.0  |75.0   |
|null |false           |false           |100.0  |75.0   |
|null |false           |false           |100.0  |75.0   |
|null |false           |false           |100.0  |75.0   |
|null |false           |false           |100.0  |75.0   |
|null |null            |null            |null   |null   |
|null |null            |null            |null   |null   |
|null |null            |null            |null   |null   |
|null |null            |null            |null   |null   |
|null |null            |null            |null   |null   |
|null |null            |null            |null   |null   |
|null |false           |true            |50.0   |25.0   |
|null |false           |false           |100.0  |75.0   |
|null |false  

In [34]:
# Look up a single person record by its statement id (the id is one of the
# interested parties shown in the relationships output).
person = persons_sdf.where(
    persons_sdf["statementID"] == "openownership-register-7780563163888271715"
)
person.show()



+--------------------+----------+--------------------+-----------------+--------------------+----------------+--------------------+---------+
|           addresses| birthDate|         identifiers|missingInfoReason|               names|   nationalities|         statementID|partition|
+--------------------+----------+--------------------+-----------------+--------------------+----------------+--------------------+---------+
|[{Shop/Office # 6...|1983-09-01|[{/company/111907...|             null|[{Ansar Mahmood, ...|[{PK, Pakistan}]|openownership-reg...|       15|
+--------------------+----------+--------------------+-----------------+--------------------+----------------+--------------------+---------+



                                                                                

In [39]:
# One row per address of the selected person; the exploded struct lands in the
# default column `col`, which is then expanded into its fields.
(
    person
    .select(F.explode(F.col("addresses")))
    .select("col.*")
    .show(truncate=False)
)

[Stage 38:>                                                         (0 + 7) / 7]

+----------------------------------------------+-------+----+
|address                                       |country|type|
+----------------------------------------------+-------+----+
|Shop/Office # 60, Rushey Green, London, SE64JD|PK     |null|
+----------------------------------------------+-------+----+



                                                                                

In [37]:
# One row per name of the selected person; the exploded struct lands in the
# default column `col`, which is then expanded into its fields.
(
    person
    .select(F.explode(F.col("names")))
    .select("col.*")
    .show()
)

+-------------+----------+
|     fullName|      type|
+-------------+----------+
|Ansar Mahmood|individual|
+-------------+----------+



In [45]:
# Look up a single company record by its statement id (the id is one of the
# subjects shown in the relationships output).
company = companies_sdf.where(
    companies_sdf["statementID"] == "openownership-register-10001903462476671266"
)
company.show(truncate=False)



+---------------------------------------------------------------------------+---------------+------------+-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+---------------------+-------------------------------------------+
|addresses                                                                  |dissolutionDate|foundingDate|identifiers                                                                                                                                                                                                                                                                                                                                      |name                 |statementID            

                                                                                