In [75]:
%pip install pandas

Note: you may need to restart the kernel to use updated packages.


In [76]:
import pandas as pd
import numpy as np
import hashlib

In [77]:
data = pd.read_json('sampleHealthData.json')

# Set the value of k for k-anonymity
k = 500

# age_bins = [0, 20, 40, 60, 80, 120]
age_bins = np.arange(0, 101, 1)

In [78]:
# testing the hashing function
test_df = data.copy()
test_df['Hashed Value'] = test_df['Name'].apply(lambda x:hashlib.sha256(x.encode()).hexdigest())

test_df = test_df.groupby("Diagnosis").agg({'Hashed Value':'count'}, inplace = True)
test_df.rename(columns={'Hashed Value': 'Count'}, inplace=True)

print(test_df)


              Count
Diagnosis          
Arthritis      2029
Asthma         1994
Depression     1952
Diabetes       1991
Hypertension   2034


In [79]:
print(data.info())

<class 'pandas.core.frame.DataFrame'>
Int64Index: 10000 entries, 0 to 9999
Data columns (total 10 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   Patient ID     10000 non-null  object
 1   Name           10000 non-null  object
 2   Date of Birth  10000 non-null  object
 3   Age            10000 non-null  int64 
 4   Gender         10000 non-null  object
 5   Address        10000 non-null  object
 6   ZIP Code       10000 non-null  int64 
 7   Diagnosis      10000 non-null  object
 8   Medication     10000 non-null  object
 9   Lab Results    10000 non-null  object
dtypes: int64(2), object(8)
memory usage: 859.4+ KB
None


In [80]:
# function to bin the ages
def generalize_age(dataframe, bins):
    dataframe['Age Bin'], retbins = pd.cut(dataframe['Age'], bins, retbins = True, ordered = True)
    return dataframe, retbins

# calling function
data, bins = generalize_age(data, age_bins)

print(bins)
print(data.head())
print(data.info())

[  0   1   2   3   4   5   6   7   8   9  10  11  12  13  14  15  16  17
  18  19  20  21  22  23  24  25  26  27  28  29  30  31  32  33  34  35
  36  37  38  39  40  41  42  43  44  45  46  47  48  49  50  51  52  53
  54  55  56  57  58  59  60  61  62  63  64  65  66  67  68  69  70  71
  72  73  74  75  76  77  78  79  80  81  82  83  84  85  86  87  88  89
  90  91  92  93  94  95  96  97  98  99 100]
  Patient ID             Name Date of Birth  Age  Gender  \
0      P0001     David Peters    1950-08-11   73   Other   
1      P0002    Brendan Moody    1970-02-01   54    Male   
2      P0003        Adam Reed    1955-12-26   68  Female   
3      P0004    Nicole Holmes    1994-10-03   29    Male   
4      P0005  Kenneth Mendoza    1989-02-02   35  Female   

                                             Address  ZIP Code   Diagnosis  \
0           1485 Chen Port\nEast Laurafurt, VA 29311     83142  Depression   
1  927 Raymond Ports Suite 331\nJenniferside, PW ...     42179  Depressi

In [81]:
#  Check if any bin violates k-anonymity
def violation_checker(k, data, bins):
    age_value_counts = data['Age Bin'].value_counts()
    # dont check if the bin value is 0
    violating_bins = age_value_counts[(age_value_counts < k) & (age_value_counts > 0)].index
    # print(violating_bins)
    if len(violating_bins) > 0:
        return True
    else:
        return False
    
# Group the data by age bins and count the records
violation_checker(k, data, bins)
age_value_counts = data['Age Bin'].value_counts()
print(age_value_counts)


(66, 67]     181
(71, 72]     178
(33, 34]     177
(72, 73]     174
(63, 64]     174
            ... 
(12, 13]       0
(13, 14]       0
(14, 15]       0
(15, 16]       0
(99, 100]      0
Name: Age Bin, Length: 100, dtype: int64


In [82]:
# If ANY bin violates k-anonymity, increment the size of ALL bins
for i in range(1, (len(age_bins) - 1)):
    while violation_checker(k, data, bins) == True:
        data, bins = generalize_age(data, np.arange(0,101,i))
        age_value_counts = data['Age Bin'].value_counts()
        print(age_value_counts) 
        i+=1
        # print(bins)

# Write the anonymized dataset to a new CSV file
# data.to_csv('anonymized_dataset.csv', index=False)
age_value_counts = data['Age Bin'].value_counts()
print(age_value_counts.sum())

# print(data['Age'].size())
print(data)

(66, 67]     181
(71, 72]     178
(33, 34]     177
(72, 73]     174
(63, 64]     174
            ... 
(12, 13]       0
(13, 14]       0
(14, 15]       0
(15, 16]       0
(99, 100]      0
Name: Age Bin, Length: 100, dtype: int64
(72, 74]     335
(52, 54]     327
(22, 24]     327
(62, 64]     324
(70, 72]     322
(30, 32]     319
(66, 68]     319
(78, 80]     317
(34, 36]     314
(80, 82]     312
(54, 56]     312
(38, 40]     311
(46, 48]     309
(58, 60]     307
(32, 34]     306
(50, 52]     303
(40, 42]     303
(24, 26]     302
(44, 46]     301
(64, 66]     299
(60, 62]     299
(56, 58]     296
(26, 28]     295
(42, 44]     294
(48, 50]     292
(76, 78]     290
(18, 20]     289
(20, 22]     288
(28, 30]     287
(68, 70]     277
(74, 76]     266
(36, 38]     265
(82, 84]     182
(16, 18]     111
(86, 88]       0
(90, 92]       0
(92, 94]       0
(94, 96]       0
(84, 86]       0
(96, 98]       0
(88, 90]       0
(0, 2]         0
(2, 4]         0
(14, 16]       0
(12, 14]       0
(10, 12

10000
     Patient ID              Name Date of Birth  Age  Gender  \
0         P0001      David Peters    1950-08-11   73   Other   
1         P0002     Brendan Moody    1970-02-01   54    Male   
2         P0003         Adam Reed    1955-12-26   68  Female   
3         P0004     Nicole Holmes    1994-10-03   29    Male   
4         P0005   Kenneth Mendoza    1989-02-02   35  Female   
...         ...               ...           ...  ...     ...   
9995      P9996    Laurie Sanchez    1951-12-29   72   Other   
9996      P9997       Jared Young    1945-11-05   78   Other   
9997      P9998     Robin Vazquez    1970-02-22   54  Female   
9998      P9999  Justin Carpenter    1958-04-08   65    Male   
9999     P10000      William Wood    1994-02-21   30    Male   

                                                Address  ZIP Code  \
0              1485 Chen Port\nEast Laurafurt, VA 29311     83142   
1     927 Raymond Ports Suite 331\nJenniferside, PW ...     42179   
2     877 Delacruz