Skip to content

Commit

Permalink
Added an option to generate pseudo-CHI numbers as UUIDs
Browse files Browse the repository at this point in the history
  • Loading branch information
gherka committed Sep 25, 2023
1 parent 4ef5c5e commit e46ef3e
Show file tree
Hide file tree
Showing 3 changed files with 88 additions and 1 deletion.
36 changes: 36 additions & 0 deletions exhibit/core/generate/tests/test_uuid.py
Original file line number Diff line number Diff line change
Expand Up @@ -79,6 +79,42 @@ def test_uuid_column_generation_with_more_uuids_than_requested(self):

self.assertEqual(len(result), 100)

def test_uuid_column_generation_with_pseudo_chis_standalone(self):
    '''
    Calling the pseudo-CHI helper directly must return as many distinct
    values as were requested, i.e. the generated identifiers never repeat.
    '''

    requested = 1000
    generated = tm._generate_pseudo_chis(n=requested)

    # collapsing to a set drops duplicates, so the sizes only match
    # when every generated pseudo-CHI is unique
    distinct_count = len(set(generated))

    self.assertEqual(distinct_count, requested)

def test_uuid_column_generation_with_pseudo_chis(self):
    '''
    Generating a uuid column with uuid_type="pseudo_chi" must honour the
    frequency distribution: the number of distinct pseudo-CHIs in the
    result is determined by how many rows each frequency bucket covers.
    '''

    freq_dist = [
        "frequency | probability_vector",
        "1 | 0.800",
        "2 | 0.200"
    ]

    num_rows = 1000

    # 800 CHIs to appear once and 100 CHIs appear twice.
    expected_unique = 900

    column = tm.generate_uuid_column(
        "pseudo_chi", num_rows, 0, freq_dist, 0, uuid_type="pseudo_chi")

    self.assertEqual(len(set(column)), expected_unique)

if __name__ == "__main__" and __package__ is None:
#overwrite __package__ builtin as per PEP 366
__package__ = "exhibit"
Expand Down
51 changes: 51 additions & 0 deletions exhibit/core/generate/uuids.py
Original file line number Diff line number Diff line change
Expand Up @@ -40,6 +40,18 @@ def generate_uuid_column(
uuids = []
range_max = 0

# we need to know the total number of pseudo-chis ahead of generation time
# to ensure consistency with random seed and to avoid duplicates so that the first
# N of pseudo-chis for a given seed will always be the same.

if uuid_type == "pseudo_chi":
pseudo_chi_total = 0
for row in freq_df.itertuples():
_num_rows = int(np.ceil(
num_rows * float(row.probability_vector) / int(row.frequency)))
pseudo_chi_total = pseudo_chi_total + _num_rows
pseudo_chis = _generate_pseudo_chis(n=pseudo_chi_total, seed=seed)

for row in freq_df.itertuples():

# always round up the number of generated rows before casting to int
Expand All @@ -53,6 +65,12 @@ def generate_uuid_column(
range_max = range_max + _num_rows
continue

if uuid_type == "pseudo_chi":
_uuids = pseudo_chis[range_max: range_max + _num_rows] * int(row.frequency)
uuids.extend(_uuids)
range_max = range_max + _num_rows
continue

_uuids = []

for _ in range(_num_rows):
Expand Down Expand Up @@ -87,3 +105,36 @@ def generate_uuid_column(
)

return uuid_series

def _generate_pseudo_chis(n, seed=0):
    '''
    Generate pseudo CHI numbers that consist of 10 digits, including
    a possible zero as the first digit.

    The logic of CHIs is not preserved to avoid potential collisions with
    the real CHIs. In addition, the month part of the CHI (3rd and 4th
    digits) is fixed at the impossible 13.

    Parameters
    ----------
    n    : int
        the number of dummy CHIs to generate.
    seed : int
        seed for the random number generator; the same (n, seed) pair
        always produces the same list.

    Returns
    -------
    A sorted list with unique dummy CHI numbers (as strings)

    Raises
    ------
    ValueError
        if n is larger than the number of distinct pseudo CHIs that the
        format can represent.
    '''

    # 32 possible "days" (0-31) * 80 possible "years" (20-99) * 10000 suffixes
    max_unique = 32 * 80 * 10000

    if n > max_unique:
        raise ValueError(
            f"Cannot generate {n} unique pseudo CHIs; "
            f"the maximum is {max_unique}."
        )

    # Use a dedicated generator instance: it makes the output reproducible
    # for a given seed without touching (or clobbering) the global state of
    # the random module.
    rng = random.Random(seed)
    result = set()

    while len(result) < n:
        pseudo_chi = (
            str(rng.randint(0, 31)) +  # day will be zero padded if total length < 10
            '13' +  # ensure no accidental collisions
            str(rng.randint(20, 99)) +
            str(rng.randint(0, 9999)).zfill(4)  # no specific logic for 9th digit
        ).zfill(10)

        result.add(pseudo_chi)

    return sorted(result)
2 changes: 1 addition & 1 deletion exhibit/core/generate/yaml.py
Original file line number Diff line number Diff line change
Expand Up @@ -126,7 +126,7 @@ def generate_YAML_string(spec_dict):
# miss_probability: 0.0
# anonymising_set: uuid
#
# You can choose between uuid and range anonymising sets.
# You can choose between uuid, range and pseudo_chi anonymising sets.
# ----------------------------------------------------------
""")

Expand Down

0 comments on commit e46ef3e

Please sign in to comment.