In [16]:
import os
from pathlib import Path

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import pyspark.sql.functions as F
import yaml

from pyspark.sql import SparkSession

while not Path("data") in Path(".").iterdir():
    os.chdir("..")

plt.style.use("seaborn-white")
conf_dict = yaml.safe_load(Path("config/dataprep.yaml").read_text())

spark = SparkSession.builder.config("spark.driver.memory", "8g").getOrCreate()

In [19]:
with Path(conf_dict["companies_house_data_raw"]).open() as f:
    column_names = f.readline().split(",")
column_names_clean = [s.strip().replace(".", "_") for s in column_names]

In [21]:
df00 = spark.read.csv(conf_dict["companies_house_data_raw"], header=False)
cols_renamed = [
    F.col(colname).alias(colname_new)
    for colname, colname_new in zip(df00.columns, column_names_clean)
]
df01 = df00.select(cols_renamed).filter(F.col("CompanyName") != "CompanyName")

In [22]:
df01.show()

+--------------------+-------------+-----------------+----------------+-----------------------+-----------------------+-------------------+-----------------+------------------+-------------------+--------------------+-------------+---------------+---------------+-----------------+----------------------+------------------------+--------------------+-----------------------+------------------------+-------------------+----------------------+------------------------+----------------------------+------------------------------+--------------------------+--------------------+--------------------+--------------------+--------------------+----------------------------------+----------------------------------+--------------------+----------------------+--------------------------+----------------------+--------------------------+----------------------+--------------------------+----------------------+--------------------------+----------------------+--------------------------+---------------------

22/06/01 10:13:58 WARN package: Truncated the string representation of a plan since it was too large. This behavior can be adjusted by setting 'spark.sql.debug.maxToStringFields'.


In [10]:
keep_cols = [
    "CompanyName",
    "CompanyNumber",
    "CompanyCategory",
    "CompanyStatus",
    "Accounts_AccountCategory",
    "Returns_NextDueDate",
    "Returns_LastMadeUpDate",
    "Mortgages_NumMortCharges",
    "Mortgages_NumMortOutstanding",
    "Mortgages_NumMortPartSatisfied",
    "Mortgages_NumMortSatisfied",
    "SICCode_SicText_1",
    "URI",
    "ConfStmtNextDueDate",
    "ConfStmtLastMadeUpDate",
]
df02 = df01.select(keep_cols)
sample = df02.limit(10).toPandas()
sample.T

Unnamed: 0,0,1,2,3,4,5,6,7,8,9
CompanyName,! LTD,!? LTD,!BIG IMPACT GRAPHICS LIMITED,!GOBERUB LTD,!NFOGENIE LTD,!NNOV8 LIMITED,!NSPIRED INVESTMENTS LTD,!NSPIRED LTD,!NVERTD DESIGNS LIMITED,!OBAC LIMITED
CompanyNumber,08209948,11399177,11743365,13404790,13522064,11006939,SC606050,SC421617,09152972,FC031362
CompanyCategory,Private Limited Company,Private Limited Company,Private Limited Company,Private Limited Company,Private Limited Company,Private Limited Company,Private Limited Company,Private Limited Company,Private Limited Company,Other company type
CompanyStatus,Active,Active,Active,Active,Active,Active,Active,Liquidation,Active,Active
Accounts_AccountCategory,DORMANT,TOTAL EXEMPTION FULL,DORMANT,NO ACCOUNTS FILED,NO ACCOUNTS FILED,MICRO ENTITY,TOTAL EXEMPTION FULL,TOTAL EXEMPTION FULL,MICRO ENTITY,FULL
Returns_NextDueDate,09/10/2016,03/07/2019,25/01/2020,14/06/2022,18/08/2022,08/11/2018,19/09/2019,09/05/2017,27/08/2016,
Returns_LastMadeUpDate,11/09/2015,,,,,,,11/04/2016,30/07/2015,
Mortgages_NumMortCharges,0,0,0,0,0,0,5,0,0,0
Mortgages_NumMortOutstanding,0,0,0,0,0,0,5,0,0,0
Mortgages_NumMortPartSatisfied,0,0,0,0,0,0,0,0,0,0
