In [1]:
import os
from pathlib import Path

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import pyspark.sql.functions as F
import yaml

from pyspark.sql import SparkSession

# Climb towards the filesystem root until the working directory contains an
# entry named "data", so the relative paths used below resolve from the
# project root regardless of where the notebook was started.
while Path("data") not in Path(".").iterdir():
    here = Path.cwd()
    if here.parent == here:
        # At the filesystem root os.chdir("..") does not move any further, so
        # without this guard the loop would never terminate when the notebook
        # is started outside the project tree.
        raise FileNotFoundError(
            'No entry named "data" found in the working directory or any of its parents'
        )
    os.chdir("..")

# "seaborn-white" was renamed to "seaborn-v0_8-white" in matplotlib 3.6 and the
# old name was later removed; use whichever name this installation provides.
style_name = "seaborn-white" if "seaborn-white" in plt.style.available else "seaborn-v0_8-white"
plt.style.use(style_name)
conf_dict = yaml.safe_load(Path("config/conf.yaml").read_text())

spark = SparkSession.builder.getOrCreate()

In [4]:
def repartition(sdf):
    """Repartition ``sdf`` on a key taken from the tail of ``statementID``.

    A column named ``partition`` holding the last two characters of
    ``statementID`` is added, and the frame is then repartitioned by it.

    Args:
        sdf: Spark DataFrame that has a ``statementID`` column.

    Returns:
        The DataFrame with the extra ``partition`` column, repartitioned on
        that column.
    """
    # A negative start position counts from the end of the string.
    id_suffix = F.substring("statementID", -2, 2)
    keyed_sdf = sdf.withColumn("partition", id_suffix)
    return keyed_sdf.repartition("partition")

In [6]:
def _read_interim(conf_key):
    """Read the parquet dataset located at ``conf_dict[conf_key]`` and repartition it."""
    raw_sdf = spark.read.parquet(conf_dict[conf_key])
    return repartition(raw_sdf)


# One Spark DataFrame per interim dataset named in the configuration file.
companies_sdf = _read_interim("companies_interim")
relationships_sdf = _read_interim("relationships_interim")
persons_sdf = _read_interim("persons_interim")

## Companies

In [7]:
# Number of rows in the companies DataFrame (count() runs a Spark job).
companies_sdf.count()

                                                                                

7246475

In [8]:
# Bring at most 10,000 rows to the driver as a pandas DataFrame for inspection.
# NOTE(review): limit() without an ordering is not a random sample — confirm
# the rows are representative before generalising from them.
companies_sample_pdf = companies_sdf.limit(10_000).toPandas()

                                                                                

In [9]:
# Print the Spark schema (column names, types, nullability) of the companies data.
companies_sdf.printSchema()

root
 |-- addresses: array (nullable = true)
 |    |-- element: struct (containsNull = true)
 |    |    |-- address: string (nullable = true)
 |    |    |-- country: string (nullable = true)
 |    |    |-- type: string (nullable = true)
 |-- birthDate: string (nullable = true)
 |-- dissolutionDate: string (nullable = true)
 |-- entityType: string (nullable = true)
 |-- foundingDate: string (nullable = true)
 |-- identifiers: array (nullable = true)
 |    |-- element: struct (containsNull = true)
 |    |    |-- id: string (nullable = true)
 |    |    |-- scheme: string (nullable = true)
 |    |    |-- schemeName: string (nullable = true)
 |    |    |-- uri: string (nullable = true)
 |-- incorporatedInJurisdiction: struct (nullable = true)
 |    |-- code: string (nullable = true)
 |    |-- name: string (nullable = true)
 |-- interestedParty: struct (nullable = true)
 |    |-- describedByEntityStatement: string (nullable = true)
 |    |-- describedByPersonStatement: string (nullable = tru

In [10]:
# pandas dtype of every column in the sample after the toPandas() conversion.
companies_sample_pdf.dtypes

addresses                     object
birthDate                     object
dissolutionDate               object
entityType                    object
foundingDate                  object
identifiers                   object
incorporatedInJurisdiction    object
interestedParty               object
interests                     object
missingInfoReason             object
name                          object
names                         object
nationalities                 object
personType                    object
source                        object
statementDate                 object
statementID                   object
statementType                 object
subject                       object
partition                     object
dtype: object

In [11]:
# Summary statistics of the sample, transposed so each column is shown as a row.
companies_sample_pdf.describe().T

Unnamed: 0,count,unique,top,freq
addresses,9996,9026,"[(20-22 Wenlock Road, London, N1 7GU, GB, regi...",42.0
birthDate,0,0,,
dissolutionDate,3290,306,2018-01-02,103.0
entityType,10000,1,registeredEntity,10000.0
foundingDate,9981,4134,2018-03-26,38.0
identifiers,10000,10000,"[(10649951, GB-COH, Companies House, None), (h...",1.0
incorporatedInJurisdiction,10000,1,"(GB, United Kingdom of Great Britain and North...",10000.0
interestedParty,0,0,,
interests,0,0,,
missingInfoReason,0,0,,


In [12]:
# First five sampled rows, transposed so the wide table reads top to bottom.
companies_sample_pdf.head().T

Unnamed: 0,0,1,2,3,4
addresses,"[(Unit 19e Eley Road, Edmonton, London, N18 3B...","[(9 Wellington Gate, East Grinstead, RH19 3GE,...","[(3 The Lee, Northwood, Middlesex, HA6 3HT, GB...","[(First Floor Right 440 George Street, Aberdee...","[(Flat 4 Prudance House, 22 Smithsland Road, R..."
birthDate,,,,,
dissolutionDate,,,,2019-04-16,2018-08-07
entityType,registeredEntity,registeredEntity,registeredEntity,registeredEntity,registeredEntity
foundingDate,2017-03-03,2015-02-26,2000-02-24,2017-03-03,2017-03-03
identifiers,"[(10649951, GB-COH, Companies House, None), (h...","[(09460241, GB-COH, Companies House, None), (h...","[(03936630, GB-COH, Companies House, None), (h...","[(SC559379, GB-COH, Companies House, None), (h...","[(10650588, GB-COH, Companies House, None), (h..."
incorporatedInJurisdiction,"(GB, United Kingdom of Great Britain and North...","(GB, United Kingdom of Great Britain and North...","(GB, United Kingdom of Great Britain and North...","(GB, United Kingdom of Great Britain and North...","(GB, United Kingdom of Great Britain and North..."
interestedParty,,,,,
interests,,,,,
missingInfoReason,,,,,


In [13]:
# Identifier entries attached to the sample row labelled 0.
companies_sample_pdf.loc[0, "identifiers"]

[Row(id='10649951', scheme='GB-COH', schemeName='Companies House', uri=None),
 Row(id='https://opencorporates.com/companies/gb/10649951', scheme=None, schemeName='OpenCorporates', uri='https://opencorporates.com/companies/gb/10649951'),
 Row(id='http://register.openownership.org/entities/59c5026967e4ebf34060b471', scheme=None, schemeName='OpenOwnership Register', uri='http://register.openownership.org/entities/59c5026967e4ebf34060b471')]

In [14]:
# Identifier entries attached to the sample row labelled 1, for comparison.
companies_sample_pdf.loc[1, "identifiers"]

[Row(id='09460241', scheme='GB-COH', schemeName='Companies House', uri=None),
 Row(id='https://opencorporates.com/companies/gb/09460241', scheme=None, schemeName='OpenCorporates', uri='https://opencorporates.com/companies/gb/09460241'),
 Row(id='http://register.openownership.org/entities/59c5026a67e4ebf34060b67a', scheme=None, schemeName='OpenOwnership Register', uri='http://register.openownership.org/entities/59c5026a67e4ebf34060b67a')]

In [15]:
# Display the incorporatedInJurisdiction column of the sample.
companies_sample_pdf["incorporatedInJurisdiction"]

0       (GB, United Kingdom of Great Britain and North...
1       (GB, United Kingdom of Great Britain and North...
2       (GB, United Kingdom of Great Britain and North...
3       (GB, United Kingdom of Great Britain and North...
4       (GB, United Kingdom of Great Britain and North...
                              ...                        
9995    (GB, United Kingdom of Great Britain and North...
9996    (GB, United Kingdom of Great Britain and North...
9997    (GB, United Kingdom of Great Britain and North...
9998    (GB, United Kingdom of Great Britain and North...
9999    (GB, United Kingdom of Great Britain and North...
Name: incorporatedInJurisdiction, Length: 10000, dtype: object

Each entity in the 10,000-row sample has at most one address.

In [18]:
def _address_count(addresses):
    """Return the number of addresses in a cell, treating a missing or empty value as 0."""
    if not addresses:
        return 0
    return len(addresses)


# Largest number of addresses held by any single row of the sample.
companies_sample_pdf["addresses"].map(_address_count).max()

1

A single entity can have many identifiers.

In [19]:
def _identifier_count(identifiers):
    """Return the number of identifiers in a cell, treating a missing or empty value as 0."""
    if not identifiers:
        return 0
    return len(identifiers)


# Largest number of identifiers held by any single row of the sample.
companies_sample_pdf["identifiers"].map(_identifier_count).max()

762

In [20]:
# Columns of the companies data that are carried over to the processed output.
# Listed but not selected in the original cell: incorporatedInJurisdiction,
# interestedParty, interests, missingInfoReason, names, nationalities,
# personType, source, statementDate, statementType, subject.
# NOTE(review): unlike the relationships and persons selections further down,
# the helper column "partition" is not retained here — confirm this is intended.
keep_cols = [
    "addresses", "dissolutionDate", "foundingDate",
    "identifiers", "name", "statementID",
]

In [22]:
# Write the selected company columns to the processed parquet location.
# The default save mode raises when the target path already exists, which made
# this cell fail on every re-run; "overwrite" replaces the previous export.
# keep_cols is rebound later in the notebook, so run this right after the
# companies column list is defined.
(
    companies_sdf
    .select(keep_cols)
    .write
    .mode("overwrite")
    .parquet(conf_dict["companies_processed"])
)

22/05/25 22:38:04 WARN MemoryManager: Total allocation exceeds 95.00% (919,941,928 bytes) of heap memory
Scaling row group sizes to 97.92% for 7 writers
22/05/25 22:38:04 WARN MemoryManager: Total allocation exceeds 95.00% (919,941,928 bytes) of heap memory
Scaling row group sizes to 85.68% for 8 writers
22/05/25 22:38:05 WARN MemoryManager: Total allocation exceeds 95.00% (919,941,928 bytes) of heap memory
Scaling row group sizes to 97.92% for 7 writers
22/05/25 22:38:05 WARN MemoryManager: Total allocation exceeds 95.00% (919,941,928 bytes) of heap memory
Scaling row group sizes to 85.68% for 8 writers
22/05/25 22:38:05 WARN MemoryManager: Total allocation exceeds 95.00% (919,941,928 bytes) of heap memory
Scaling row group sizes to 97.92% for 7 writers
22/05/25 22:38:05 WARN MemoryManager: Total allocation exceeds 95.00% (919,941,928 bytes) of heap memory
Scaling row group sizes to 97.92% for 7 writers
22/05/25 22:38:05 WARN MemoryManager: Total allocation exceeds 95.00% (919,941,928

## Relationships

In [23]:
# Number of rows in the relationships DataFrame (count() runs a Spark job).
relationships_sdf.count()

                                                                                

9996335

In [24]:
# Bring at most 10,000 rows to the driver as a pandas DataFrame for inspection.
# NOTE(review): limit() without an ordering is not a random sample — confirm
# the rows are representative before generalising from them.
relationships_sample_pdf = relationships_sdf.limit(10_000).toPandas()

                                                                                

In [25]:
# Summary statistics of the sample, transposed so each column is shown as a row.
relationships_sample_pdf.describe().T

Unnamed: 0,count,unique,top,freq
addresses,0,0,,
birthDate,0,0,,
dissolutionDate,0,0,,
entityType,0,0,,
foundingDate,0,0,,
identifiers,0,0,,
incorporatedInJurisdiction,0,0,,
interestedParty,10000,9533,"(None, None, (No person, no-beneficial-owners))",409.0
interests,10000,5800,[],450.0
missingInfoReason,0,0,,


In [27]:
# Columns of the relationships data that are carried over to the processed output.
# Listed but not selected in the original cell: addresses, birthDate,
# dissolutionDate, entityType, foundingDate, identifiers,
# incorporatedInJurisdiction, missingInfoReason, name, names, nationalities,
# personType, statementType.
keep_cols = [
    "interestedParty", "interests",
    "source", "statementDate", "statementID",
    "subject", "partition",
]

In [28]:
# Write the selected relationship columns to the processed parquet location.
# The default save mode raises when the target path already exists, which made
# this cell fail on every re-run; "overwrite" replaces the previous export.
# keep_cols is rebound elsewhere in the notebook, so run this right after the
# relationships column list is defined.
(
    relationships_sdf
    .select(keep_cols)
    .write
    .mode("overwrite")
    .parquet(conf_dict["relationships_processed"])
)

22/05/25 22:42:58 WARN MemoryManager: Total allocation exceeds 95.00% (919,941,928 bytes) of heap memory
Scaling row group sizes to 97.92% for 7 writers
22/05/25 22:42:58 WARN MemoryManager: Total allocation exceeds 95.00% (919,941,928 bytes) of heap memory
Scaling row group sizes to 85.68% for 8 writers
22/05/25 22:42:59 WARN MemoryManager: Total allocation exceeds 95.00% (919,941,928 bytes) of heap memory
Scaling row group sizes to 97.92% for 7 writers
22/05/25 22:42:59 WARN MemoryManager: Total allocation exceeds 95.00% (919,941,928 bytes) of heap memory
Scaling row group sizes to 97.92% for 7 writers
22/05/25 22:42:59 WARN MemoryManager: Total allocation exceeds 95.00% (919,941,928 bytes) of heap memory
Scaling row group sizes to 85.68% for 8 writers
22/05/25 22:43:00 WARN MemoryManager: Total allocation exceeds 95.00% (919,941,928 bytes) of heap memory
Scaling row group sizes to 97.92% for 7 writers
22/05/25 22:43:00 WARN MemoryManager: Total allocation exceeds 95.00% (919,941,928

## Persons

In [29]:
# Number of rows in the persons DataFrame (count() runs a Spark job).
persons_sdf.count()

                                                                                

7861347

In [33]:
# Bring at most 10,000 rows to the driver as a pandas DataFrame for inspection.
# NOTE(review): limit() without an ordering is not a random sample — confirm
# the rows are representative before generalising from them.
persons_sample_pdf = persons_sdf.limit(10_000).toPandas()

                                                                                

In [34]:
# Summary statistics of the sample, transposed so each column is shown as a row.
persons_sample_pdf.describe().T

Unnamed: 0,count,unique,top,freq
addresses,10000,9505,"[(20-22, Wenlock Road, London, N1 7GU, GB, None)]",76.0
birthDate,9987,820,1974-07-01,34.0
dissolutionDate,0,0,,
entityType,0,0,,
foundingDate,0,0,,
identifiers,10000,9989,[],12.0
incorporatedInJurisdiction,0,0,,
interestedParty,0,0,,
interests,0,0,,
missingInfoReason,12,5,The company knows or has reasonable cause to b...,6.0


In [37]:
# Columns of the persons data that are carried over to the processed output.
# Listed but not selected in the original cell: dissolutionDate, entityType,
# foundingDate, incorporatedInJurisdiction, interestedParty, interests, name,
# personType, source, statementDate, statementType, subject.
keep_cols = [
    "addresses", "birthDate", "identifiers",
    "missingInfoReason", "names", "nationalities",
    "statementID", "partition",
]

In [38]:
# Write the selected person columns to the processed parquet location.
# The default save mode raises when the target path already exists, which made
# this cell fail on every re-run; "overwrite" replaces the previous export.
# keep_cols is rebound earlier in the notebook for other tables, so run this
# right after the persons column list is defined.
(
    persons_sdf
    .select(keep_cols)
    .write
    .mode("overwrite")
    .parquet(conf_dict["persons_processed"])
)

22/05/25 22:48:46 WARN MemoryManager: Total allocation exceeds 95.00% (919,941,928 bytes) of heap memory
Scaling row group sizes to 97.92% for 7 writers
22/05/25 22:48:46 WARN MemoryManager: Total allocation exceeds 95.00% (919,941,928 bytes) of heap memory
Scaling row group sizes to 85.68% for 8 writers
22/05/25 22:48:46 WARN MemoryManager: Total allocation exceeds 95.00% (919,941,928 bytes) of heap memory
Scaling row group sizes to 97.92% for 7 writers
22/05/25 22:48:46 WARN MemoryManager: Total allocation exceeds 95.00% (919,941,928 bytes) of heap memory
Scaling row group sizes to 97.92% for 7 writers
22/05/25 22:48:46 WARN MemoryManager: Total allocation exceeds 95.00% (919,941,928 bytes) of heap memory
Scaling row group sizes to 85.68% for 8 writers
22/05/25 22:48:47 WARN MemoryManager: Total allocation exceeds 95.00% (919,941,928 bytes) of heap memory
Scaling row group sizes to 97.92% for 7 writers
22/05/25 22:48:47 WARN MemoryManager: Total allocation exceeds 95.00% (919,941,928