In [0]:
def dirty_address_data(address_row, rng=None):
    """
        Return a dict row with lightly dirtied address data.

        One randomly chosen modification is applied to the postcode and one
        to the full address. The dict is modified in place and the same
        object is returned. A chosen modification can leave the value
        unchanged (e.g. replacing a character that is not present).

        Accepts:
            address_row (py dict): must contain the keys "postcode" and
                "fulladdress"; each value is a str or None.
            rng (optional): any object with a choice(seq) method, e.g.
                random.Random(seed), for reproducible output. Defaults to
                the module-level random module.
        Returns dirtied address_row (py_dict)
    """
    if rng is None:
        rng = random

    postcode_mods = [
        lambda x: x.replace("9", "E"),
        lambda x: x.replace("E", "9"),
        lambda x: x.replace("0", "o"),
        lambda x: x.replace("o", "0"),
        lambda x: x.replace("1", "l"),
        lambda x: x.replace("l", "1"),
    ]
    mods = [
        lambda x: x.replace("Street", "St"),                           # abbreviate
        lambda x: x.replace("Road", "Rd"),                             # abbreviate
        lambda x: x.replace("Road", ""),                               # drop words
        lambda x: x.replace("Street", ""),                             # drop words
        lambda x: x.replace("o", "0"),                                 # typo
        lambda x: x.replace("1", "l"),                                 # typo
        lambda x: x.replace("l", "1"),                                 # typo
        lambda x: x.upper(),                                           # all caps
        lambda x: x.lower(),                                           # all lowercase
        lambda x: x + " Apt 1B",                                       # add noise
        lambda x: x[:len(x) // 2],                                     # truncate
    ]

    def apply_random_mod(candidates, value):
        # Always draw, so the random sequence does not depend on the data.
        mod = rng.choice(candidates)
        # Missing values have nothing to corrupt and would otherwise raise
        # AttributeError/TypeError inside the string-based mods.
        return value if value is None else mod(value)

    # Postcode is drawn first, then the address (same order as before).
    address_row["postcode"] = apply_random_mod(postcode_mods, address_row["postcode"])
    address_row["fulladdress"] = apply_random_mod(mods, address_row["fulladdress"])
    return address_row

def destroy_address_data(address_row, rng=None):
    """
        Return a dict with destroyed (unmatchable) address data.

        The postcode is replaced with a blank or a fixed invalid value, the
        full address is replaced with a fake address, blanked, or reversed,
        and "expected_uprn" is set to None. The dict is modified in place
        and the same object is returned.

        Accepts:
            address_row (py dict): must contain the keys "postcode" and
                "fulladdress"; each value is a str or None.
            rng (optional): any object with a choice(seq) method, e.g.
                random.Random(seed), for reproducible output. Defaults to
                the module-level random module.
        Returns destroyed address_row (py_dict)
    """
    if rng is None:
        rng = random

    postcode_mods = [
        lambda x: "",
        lambda x: "NOTAPOSTC"
    ]
    mods = [
        lambda x: "123 Madeup Road, Imaginary Town",                   # fully fake
        lambda x: "",                                                  # blank out
        # reverse string; a missing address cannot be sliced, so leave it as None
        lambda x: x if x is None else x[::-1],
    ]

    # destroy postcode
    address_row["postcode"] = rng.choice(postcode_mods)(address_row["postcode"])
    # destroy address line
    address_row["fulladdress"] = rng.choice(mods)(address_row["fulladdress"])
    # the original UPRN is cleared along with the address data
    address_row["expected_uprn"] = None
    return address_row
    
def transform_row(row):
    """
        Convert a row to a dict and corrupt it according to its unique_id.
        Accepts: row (any object with asDict() returning a dict that has a
                 "unique_id" key)
        Returns: py dict
            - even unique_id: passed through dirty_address_data
            - odd unique_id divisible by 7: passed through destroy_address_data
            - anything else: the unmodified dict
    """
    record = row.asDict()
    key = record["unique_id"]

    # Even ids are checked first, so even multiples of 7 are dirtied, not destroyed.
    if key % 2 == 0:
        return dirty_address_data(record)
    if key % 7 == 0:
        return destroy_address_data(record)
    return record
    


In [0]:
# Load the reference addresses (currently a subset of GB addresses) and keep
# only the fields required for address matching.
all_address_sdf = (
    spark.read
    .option("header", True)
    .csv([
        # Further parts of the extract; currently disabled.
        # "dbfs:/Volumes/catalog-sbx-uks-ctdp-001/schema-sbx-uks-ctdp-001-acquire-clickops/test_ctdp/part_0.csv",
        # "dbfs:/Volumes/catalog-sbx-uks-ctdp-001/schema-sbx-uks-ctdp-001-acquire-clickops/test_ctdp/part_1.csv",
        # "dbfs:/Volumes/catalog-sbx-uks-ctdp-001/schema-sbx-uks-ctdp-001-acquire-clickops/test_ctdp/part_2.csv",
        "dbfs:/Volumes/catalog-sbx-uks-ctdp-001/schema-sbx-uks-ctdp-001-acquire-clickops/test_ctdp/part_3.csv",
    ])
    .select("uprn", "fulladdress", "postcode")
)

In [0]:
import pyspark.sql.functions as F
from pyspark.sql import Row
import random

number_of_test_rows = 50

# Generate SAMPLE of addresses to dirty: a random selection of reference rows,
# keeping the true UPRN as 'expected_uprn'.
address_samples = (
    all_address_sdf
    .orderBy(F.rand())
    .limit(number_of_test_rows)
    .withColumnRenamed("uprn", "expected_uprn")
    # give each test address a key of 'unique_id' to join on in the matcher output
    # NOTE(review): monotonically_increasing_id() guarantees unique, increasing ids
    # but not consecutive ones, and transform_row branches on unique_id % 2 / % 7.
    .withColumn("unique_id", F.monotonically_increasing_id())
)

address_samples = address_samples.rdd.map(transform_row).toDF()

# Spark evaluates lazily, and both the random ordering above and the random
# corruption in transform_row are unseeded. Without materializing the result,
# each later action could recompute a different sample, so the written matcher
# input would no longer line up with the expected_uprn / unique_id values kept
# here. An eager local checkpoint computes the rows once and cuts the lineage.
# Local checkpoints live in executor storage, so they are lost if executors are.
address_samples = address_samples.localCheckpoint(eager=True)

# we don't want the UPRN potentially interfering with matching
input_address_samples = address_samples.drop("expected_uprn")




In [0]:
# Rename columns to the names the MoJ address matcher expects.
moj_input_address_samples = input_address_samples.withColumnRenamed(
    "fulladdress", "address_concat"
)
moj_all_address_subset_sdf = (
    all_address_sdf
    .withColumnRenamed("uprn", "unique_id")
    .withColumnRenamed("fulladdress", "address_concat")
)

# The MoJ algorithm reads its inputs from Parquet, so write both data sets out,
# replacing any previous output at these locations.
(
    moj_input_address_samples.write
    .mode("overwrite")
    .parquet("dbfs:/tmp/address_data/input/")
)
(
    moj_all_address_subset_sdf.write
    .mode("overwrite")
    .parquet("dbfs:/tmp/address_data/all")
)