In [32]:
import hashlib
import io
import secrets
import time

import pandas as pd
from google.cloud import bigquery
from google.cloud import storage

In [45]:
class SingletonIDGenerator:
    """A class that generates unique IDs using a singleton pattern.

    Only the first construction configures the sequence. Every later call to
    ``SingletonIDGenerator(...)`` returns the same object and ignores the
    ``start``/``step`` arguments, so the sequence is never rewound by
    accident. Use :meth:`reset` to change the next value explicitly.

    Attributes
    ----------
    start : int
        The next value that will be handed out by ``next()``.
    step : int
        The increment value of the ID sequence.

    Examples
    --------
    >>> id_gen = SingletonIDGenerator(start=100, step=10)
    >>> next(id_gen)
    100
    >>> next(id_gen)
    110
    >>> id_gen2 = SingletonIDGenerator(start=200, step=20)
    >>> next(id_gen2)
    120
    """

    _instance = None

    def __new__(cls, start: int = 0, step: int = 1):
        """Create the singleton on first use, otherwise return the existing one.

        Parameters
        ----------
        start : int, optional
            The starting value of the ID sequence, by default 0.
            Only honoured by the first construction.
        step : int, optional
            The increment value of the ID sequence, by default 1.
            Only honoured by the first construction.

        Returns
        -------
        SingletonIDGenerator
            The singleton instance of the class.
        """
        if cls._instance is None:
            instance = super().__new__(cls)
            # State is configured here, exactly once, rather than in __init__.
            instance.start = start
            instance.step = step
            cls._instance = instance
        return cls._instance

    def __init__(self, start: int = 0, step: int = 1) -> None:
        """Accept the constructor arguments without touching instance state.

        Python calls ``__init__`` after every ``SingletonIDGenerator(...)``
        expression, including those that return the already-existing
        instance. Assigning ``start``/``step`` here would therefore reset the
        shared sequence on each construction, so the state is set only in
        ``__new__``.

        Parameters
        ----------
        start : int, optional
            Ignored here; see ``__new__``.
        step : int, optional
            Ignored here; see ``__new__``.
        """

    def __iter__(self):
        """Return the iterator object of the instance.

        Returns
        -------
        SingletonIDGenerator
            The instance itself.
        """
        return self

    def __next__(self):
        """Return the next ID in the sequence and advance by ``step``.

        Returns
        -------
        int
            The next ID in the sequence.
        """
        value: int = self.start
        self.start += self.step
        return value

    def reset(self, reset_val: int = 0) -> None:
        """Set the value that the next call to ``next()`` will return.

        Parameters
        ----------
        reset_val : int, optional
            The value to restart the sequence from, by default 0.
            ``step`` is left unchanged.
        """
        self.start = reset_val

In [46]:
# Shared ID source: CustomerRequest.__init__ takes its req_id from next(id_gen).
id_gen = SingletonIDGenerator()


1709735613.7387638

In [30]:
# Load every object in the bucket into a DataFrame, keyed by the part of the
# blob name that precedes its first underscore ("AI_mockdata.csv" -> "AI").
bucket_name = "rdmf_mock_data"
blob_name = "AI_mockdata.csv"

client = storage.Client()
blobs = list(client.list_blobs(bucket_name))

data = {}

for blob in blobs:
    dataset_key = blob.name.split("_")[0]
    gcs_uri = f"gs://{bucket_name}/{blob.name}"
    data[dataset_key] = pd.read_csv(gcs_uri, encoding="latin-1")

data

{'AI':                   AI_ID          UPRN
 0       109883574750268  950698168954
 1       109256760223223  950320937277
 2       109893804250876  950186421264
 3       109805082504690  950680091532
 4       109403802151520  950396831655
 ...                 ...           ...
 100995  109984358090240  950178626481
 100996  109065983079912  950672927828
 100997  109186696116433  950696064737
 100998  109788394420272  950334094218
 100999  109260804272677  950187498346
 
 [101000 rows x 2 columns],
 'BI':                BI_ID             Business_name  PAYE_scheme_ref  SIC_code  \
 0    606162501213544     Infinidum Enterprises           100000     25143   
 1    606948361742718              Nitzsche PLC           100001     55303   
 2    606089685526234         Schamberger-Smith           100002     52512   
 3    606885102242594      Heidenreich-Schimmel           100003     68588   
 4    606517518834848   Turner, Davis and Weber           100004     85252   
 ..               ... 

In [31]:
# Display the frame stored under the "BI" key of the loaded-data dict.
data["BI"]

Unnamed: 0,BI_ID,Business_name,PAYE_scheme_ref,SIC_code,UPRN
0,606162501213544,Infinidum Enterprises,100000,25143,950187498346
1,606948361742718,Nitzsche PLC,100001,55303,950698168954
2,606089685526234,Schamberger-Smith,100002,52512,950320937277
3,606885102242594,Heidenreich-Schimmel,100003,68588,950186421264
4,606517518834848,"Turner, Davis and Weber",100004,85252,950680091532
...,...,...,...,...,...
995,606575633746410,Kris-Howe,100995,5704,950585454519
996,606360731007987,"Lesch, Legros and Herzog",100996,55448,950887629315
997,606804296291086,Roob Ltd,100997,20309,950515544317
998,606950340031983,Daniel Inc,100998,54995,950244942559


In [172]:
from typing import List
import numpy as np

class CustomerRequest:
    """A single request to link two datasets on a shared column.

    Attributes
    ----------
    gov_prefix : str
        Prefix identifying the requester; also used as hashing salt and as
        the seed for ``password_prefix``.
    dataset1, dataset2 : str
        Keys of the two datasets to merge (the part of the blob name before
        its first underscore).
    merge_col : str
        Column name both datasets are joined on.
    req_id : int
        Taken from the module-level ``id_gen`` at construction time.
    epoch : float
        ``time.time()`` at construction time.
    password_prefix : str
        The decimal code point of every character in ``gov_prefix``
        concatenated, followed by ``req_id``.
    data : dict
        Populated by :meth:`set_data`; dataset key -> DataFrame.
    merged_data : pandas.DataFrame
        Populated by :meth:`set_data`; the un-hashed merge result.
    """

    def __init__(self, gov_prefix: str, dataset1: str, dataset2: str, merge_col: str):
        self.gov_prefix = gov_prefix
        self.dataset1: str = dataset1
        self.dataset2: str = dataset2
        self.merge_col: str = merge_col
        # Relies on the module-level `id_gen` iterator being defined.
        self.req_id: int = next(id_gen)
        self.epoch = time.time()
        self.password_prefix = "".join(
            str(ord(ch)) for ch in self.gov_prefix
        ) + str(self.req_id)

    def set_data(self, client, bucket_name: str):
        """Load the bucket's CSV blobs and merge the two requested datasets.

        Parameters
        ----------
        client
            Object exposing ``list_blobs(bucket_name)`` whose items have a
            ``name`` attribute (e.g. a ``google.cloud.storage.Client``).
        bucket_name : str
            Name of the bucket to read from.

        Raises
        ------
        KeyError
            If ``dataset1`` or ``dataset2`` was not found among the loaded
            blobs.
        """
        self.data = {}

        for blob in client.list_blobs(bucket_name):
            if ".csv" in blob.name:
                self.data[blob.name.split("_")[0]] = pd.read_csv(
                    f"gs://{bucket_name}/{blob.name}", encoding="latin-1"
                )

        missing = [
            name for name in (self.dataset1, self.dataset2) if name not in self.data
        ]
        if missing:
            raise KeyError(
                f"Dataset(s) {missing} not found in bucket {bucket_name!r}; "
                f"available: {sorted(self.data)}"
            )

        # Merge from the frames loaded above, not from a notebook-global
        # `data` variable that may be missing, stale, or rebound.
        self.merged_data = pd.merge(
            self.data[self.dataset1], self.data[self.dataset2], on=self.merge_col
        )

    def get_data(self, hashing_obj, password, hashed_cols: List[str]):
        """Return the merged data with the given columns hashed.

        The hashing is applied to a copy, so ``self.merged_data`` keeps the
        original values and calling this method repeatedly does not hash
        already-hashed values again.

        Parameters
        ----------
        hashing_obj
            Callable (typically a class) invoked as ``hashing_obj(self)``;
            the result must provide ``hash_data(series)``.
        password
            Currently not used by this method.
        hashed_cols : List[str]
            Names of the columns of the merged data to replace with hashes.

        Returns
        -------
        pandas.DataFrame
            A copy of ``self.merged_data`` with ``hashed_cols`` hashed.
        """
        cr_hasher = hashing_obj(self)
        hashed = self.merged_data.copy()
        for col in hashed_cols:
            hashed[col] = cr_hasher.hash_data(hashed[col])
        return hashed

    def create_password(self):
        """Return ``password_prefix`` followed by a random 8-digit number.

        The random part is drawn from the operating system's CSPRNG via
        ``secrets`` rather than NumPy's general-purpose generator, and covers
        the same range as before (10000000 to 99999998 inclusive).

        Returns
        -------
        str
            The generated password.
        """
        low, high_exclusive = 10000000, 99999999
        return self.password_prefix + str(
            secrets.randbelow(high_exclusive - low) + low
        )
    
    
class Hasher:
    """Salted SHA-256 hasher bound to a single customer request."""

    def __init__(self, customer_request):
        # The request supplies the three salt components used for hashing.
        self.customer_request = customer_request

    def generate_hash_key(self, input_data):
        """
        Generates a hash key for secure record linkage.

        The value is converted with str(), then the request's gov_prefix,
        str(epoch) and str(req_id) are appended in that order before the
        SHA-256 digest is taken.

        Parameters:
        input_data: The value to be hashed.

        Returns:
        str: The hexadecimal SHA-256 digest.
        """
        request = self.customer_request
        salt_bytes = b"".join(
            [
                request.gov_prefix.encode(),
                str(request.epoch).encode(),
                str(request.req_id).encode(),
            ]
        )
        digest = hashlib.sha256(str(input_data).encode() + salt_bytes)
        return digest.hexdigest()

    def hash_data(self, series):
        """
        Hashes every element of a pandas Series with generate_hash_key.

        Parameters:
        series: The Series whose values are to be hashed.

        Returns:
        The Series of hexadecimal digests produced by series.apply.
        """
        return series.apply(self.generate_hash_key)
    
    


In [173]:
# Request with gov_prefix "MOD" that merges datasets "AI" and "BI" on "UPRN".
# Constructing it consumes one value from id_gen and records time.time().
cr = CustomerRequest( "MOD", "AI", "BI", "UPRN")
cr

<__main__.CustomerRequest at 0x7fbb8583e410>

In [174]:
# password_prefix plus a random 8-digit suffix; the value differs on every run.
cr.create_password()

'7779681713503145'

In [175]:
# Inspect the request's attributes (data/merged_data appear only after set_data).
vars(cr)

{'gov_prefix': 'MOD',
 'dataset1': 'AI',
 'dataset2': 'BI',
 'merge_col': 'UPRN',
 'req_id': 17,
 'epoch': 1709739983.6859555,
 'password_prefix': '77796817'}

In [156]:
# Read the bucket's CSV blobs and build cr.merged_data.
cr.set_data(client, bucket_name)

In [157]:
# get_data takes (hashing_obj, password, hashed_cols); pass all three by name so
# the column list cannot be bound to `password` by position.
# The result gets its own name so the `data` dict of raw frames is not
# overwritten (later cells still index it as data["AI"], data["BI"], ...).
hashed_data = cr.get_data(
    Hasher,
    password=cr.create_password(),
    hashed_cols=["AI_ID", "BI_ID"],
)
hashed_data

Unnamed: 0,AI_ID,UPRN,BI_ID,Business_name,PAYE_scheme_ref,SIC_code
0,4fdfa1a99fdce0dac8cd01346c7cabb6439366361073bd...,950698168954,0b9e423e3a7f3cd4efe8540d6bd8bd4eff7902c86903a8...,Nitzsche PLC,100001,55303
1,cc245093e8ed895db7a28dad124e22924ea6979241f3ab...,950320937277,eed1e6335fd7217ad36ddc661b8bf14aa9f19479d1dd4b...,Schamberger-Smith,100002,52512
2,afa5ec9176177f6500de4a2e632272e258426422b0f22a...,950186421264,6462d2b8ca65a67d9f76c05728d74fcccf237c95633701...,Heidenreich-Schimmel,100003,68588
3,624c3a44741296d774643b6e662e633d3f7e88a7bc4f94...,950680091532,f9eb09e37415a51a231003eb5e9f47bb02063b26d0db8f...,"Turner, Davis and Weber",100004,85252
4,012b7b12b8d22fbe4dd53c5dc82a445a1cc6133cef2069...,950396831655,844f7273bf715c849f645864988ff56b871365504c1597...,"Schmidt, Pouros and Bauch",100005,94309
...,...,...,...,...,...,...
995,d941597bfbedb146f334a60e830543d5a48c74c28550c1...,950887629315,d8393f0fe50454cc373c95c682fe47c7ea2b0421937e91...,"Lesch, Legros and Herzog",100996,55448
996,1c3990a1926aa088c68d1b2bd445b4032297da27696916...,950515544317,06d7d6620aaebff6c74e5b8db13da49f5b005ea6292be5...,Roob Ltd,100997,20309
997,8cf2330561c2a02e9141f10feb82ca92bd33faec5c4e90...,950244942559,276df1c70fa55f1111b1c476aacd87a272fbf953cf850f...,Daniel Inc,100998,54995
998,3d14eac61e8c8170fc2a9b411f09290e0e808e181e0f25...,950966683600,495328e0c1b75f3ba65044090a4e740688ca86f6db029e...,Tromp LLC,100999,62591


In [118]:
# Row counts of each pairwise inner join on UPRN.
# Requires `data` to be the dict of frames keyed by blob-name prefix, and the
# bucket to contain a blob whose name starts with "DI_".
print(len(pd.merge(data["AI"], data["BI"], on="UPRN")))
print(len(pd.merge(data["AI"], data["DI"], on="UPRN")))
print(len(pd.merge(data["BI"], data["DI"], on="UPRN")))


1000
100000
0


In [133]:
def generate_hash_key(customer_request: "CustomerRequest", input_data):
    """
    Generates a hash key for secure record linkage.

    Standalone equivalent of Hasher.generate_hash_key: the value is converted
    with str(), then the request's gov_prefix, str(epoch) and str(req_id) are
    appended in that order before the SHA-256 digest is taken.

    Parameters:
    customer_request (CustomerRequest): Any object exposing gov_prefix (str),
        epoch and req_id; these three values form the salt.
    input_data: The value to be hashed.

    Returns:
    str: The hexadecimal SHA-256 digest.
    """
    # The annotation above is a string so that defining this function does not
    # require CustomerRequest to exist yet (cell execution order).
    salt = customer_request.gov_prefix.encode()
    salt2 = str(customer_request.epoch).encode()
    salt3 = str(customer_request.req_id).encode()
    input_data_with_salt = str(input_data).encode() + salt + salt2 + salt3
    return hashlib.sha256(input_data_with_salt).hexdigest()


In [134]:
# Replace both ID columns with salted hashes derived from `cr`.
# NOTE(review): `merged_df` is not defined in any visible cell — presumably a
# pd.merge of the AI and BI frames left over from earlier kernel state; confirm
# or define it explicitly. Re-running this cell hashes the already-hashed values.
merged_df["BI_ID"] = merged_df["BI_ID"].apply(lambda x: generate_hash_key(cr, x))
merged_df["AI_ID"] = merged_df["AI_ID"].apply(lambda x: generate_hash_key(cr, x))
merged_df

Unnamed: 0,AI_ID,UPRN,BI_ID,Business_name,PAYE_scheme_ref,SIC_code
0,7c9610d2e4af85faca5da93b0e57b6e301678eb7c8f1de...,950698168954,8d2afcf5e56b0412f538bedca8ebdcefc7201ac2d3f4ec...,Nitzsche PLC,100001,55303
1,607ceaf273bb8fa458049c9d673d4b92cfe7aa4eb66877...,950320937277,6a510beae0640270b951305b427f1072d1426e8329d1f8...,Schamberger-Smith,100002,52512
2,a85ed54d7c82f307c9e08ff18491114a12aa0c9bb40137...,950186421264,1e59084e2bcc4e0d37243a97a650bedf5fdc2508494362...,Heidenreich-Schimmel,100003,68588
3,bc6d75d919f799e4e83f349d5f1b54057cc579642a766d...,950680091532,04a194f96b8f6ce61b4390efe7507fdf871aeacd6bcb4f...,"Turner, Davis and Weber",100004,85252
4,21369aa7fbd2f5ef7a80e2ae9162d29b474ad5e37106f7...,950396831655,21080257184923a5df7cda8e7c5ef938d6f028fd5ccf1c...,"Schmidt, Pouros and Bauch",100005,94309
...,...,...,...,...,...,...
995,ec8ac4af2f8135bdb4426820db22b39ee4da1c2fe38729...,950887629315,6e2d44bde962345355ca280c0f8dbf107b70b5875feeff...,"Lesch, Legros and Herzog",100996,55448
996,c27ca11ecbd9900990bc82825c4b328431e8f2eaad1d1e...,950515544317,a9ba6f47d32a881a96ee5773e2ed32ab7c24ca33f58f79...,Roob Ltd,100997,20309
997,364425a9534f987bed25f5390d9619cdab042f7c26665d...,950244942559,cf729c5088b1fbb5d8b563c3dec56ddaa824d20709dae6...,Daniel Inc,100998,54995
998,4cefc16068002a4cd02f7bb9314690ce0c76fbe51e035a...,950966683600,f88c242319e4ffa744241cb4181d2edaef60b5fd43995c...,Tromp LLC,100999,62591


In [135]:
# The exact string form of the epoch that the hashing code encodes as salt.
str(cr.epoch)

'1709737387.235804'