<a href="https://colab.research.google.com/github/desstaw/Seminar_DataManagement23/blob/main/K_Anonymity_generalized_heart_k%3D5.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [39]:
import pandas as pd
from sklearn.preprocessing import StandardScaler
import numpy as np
import matplotlib.pyplot as plt
import random


# Load the generalized heart dataset
url = "https://raw.githubusercontent.com/desstaw/Seminar_DataManagement23/main/datasets/v1_heart.csv"
df = pd.read_csv(url)

import warnings
warnings.simplefilter('ignore')

In [40]:
# Choose the quasi-identifier attributes
quasi_identifiers = ['sex', 'cp', 'trestbps', 'fbs', 'restecg', 'thalach', 'exang', 'slope',
       'ca', 'thal', 'age_range', 'chol_range', 'oldpeak_range']

# Set the value of k
k = 5

# Group the records based on the quasi-identifier attributes
groups = {}
for i, row in df.iterrows():
    key = tuple(row[quasi_identifiers].values)
    if key not in groups:
        groups[key] = []
    groups[key].append(i)

# Ensure that each group has at least k-1 records
for key in groups:
    if len(groups[key]) < k:
        # If the group is too small, add records from other groups
        other_groups = [g for g in groups if g != key]
        for other_key in other_groups:
            if len(groups[other_key]) > k-1:
                record_indices = groups[other_key][:k-len(groups[key])]
                groups[key] += record_indices
                groups[other_key] = [i for i in groups[other_key] if i not in record_indices]

# Create a new DataFrame with the anonymized data
anonymized_data = df.copy()
for key in groups:
    for i in groups[key]:
        anonymized_data.loc[i, quasi_identifiers] = pd.Series(key, index=quasi_identifiers)
df = anonymized_data
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1025 entries, 0 to 1024
Data columns (total 14 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   sex            1025 non-null   int64  
 1   cp             1025 non-null   int64  
 2   trestbps       1025 non-null   float64
 3   fbs            1025 non-null   int64  
 4   restecg        1025 non-null   int64  
 5   thalach        1025 non-null   float64
 6   exang          1025 non-null   int64  
 7   slope          1025 non-null   int64  
 8   ca             1010 non-null   object 
 9   thal           1025 non-null   object 
 10  age_range      1019 non-null   object 
 11  chol_range     1021 non-null   object 
 12  oldpeak_range  1025 non-null   object 
 13  target         1025 non-null   int64  
dtypes: float64(2), int64(7), object(5)
memory usage: 112.2+ KB


### Test for the k-anonymity

In [41]:
from collections import Counter

# Define the quasi-identifiers
qi_cols = [col for col in df.columns if col != sensitive_attribute]

# Define the value of k for k-anonymity
k_anonymity = 5

# Check k-anonymity for each group of records
for qi_vals, group in df.groupby(qi_cols):
    if len(group) < k_anonymity:
        print("k-anonymity is not satisfied for the group:", qi_vals)

# Get the total number of records that do not satisfy k-anonymity
counts = Counter(df[qi_cols].apply(tuple, axis=1))
num_violations = len([count for count in counts.values() if count < k_anonymity])
print("Number of records that do not satisfy k-anonymity:", num_violations)

k-anonymity is not satisfied for the group: (0, 0, 100.0, 0, 0, 120.0, 0, 1, '[0,1]', '[1,2]', '(53.0, 58.0]', '(214.0, 258.0]', '(-0.001, 1.0]')
k-anonymity is not satisfied for the group: (0, 0, 100.0, 0, 0, 120.0, 0, 1, '[0,1]', '[2,3]', '(39.0, 44.0]', '(258.0, 302.0]', '(-0.001, 1.0]')
k-anonymity is not satisfied for the group: (0, 0, 100.0, 0, 0, 120.0, 0, 1, '[1,2]', '[1,2]', '(39.0, 44.0]', '(258.0, 302.0]', '(-0.001, 1.0]')
k-anonymity is not satisfied for the group: (0, 0, 100.0, 0, 0, 120.0, 0, 1, '[1,2]', '[1,2]', '(53.0, 58.0]', '(214.0, 258.0]', '(-0.001, 1.0]')
k-anonymity is not satisfied for the group: (0, 0, 100.0, 0, 0, 120.0, 0, 1, '[1,2]', '[2,3]', '(39.0, 44.0]', '(258.0, 302.0]', '(-0.001, 1.0]')
k-anonymity is not satisfied for the group: (0, 0, 105.0, 0, 1, 140.0, 0, 2, '[1,2]', '[2,3]', '(63.0, 68.0]', '(214.0, 258.0]', '(-0.001, 1.0]')
k-anonymity is not satisfied for the group: (0, 0, 105.0, 0, 1, 140.0, 0, 2, '[2,3]', '[2,3]', '(63.0, 68.0]', '(214.0, 258.

### Also test for the l-diversity

In [42]:
from collections import Counter

sensitive_attribute = 'target'

l_diversity = 2

# Initialize counter
count = 0

# Define the quasi-identifiers
qi_cols = [col for col in df.columns if col != sensitive_attribute]

# Check l-diversity for each group of records
for qi_vals, group in df.groupby(qi_cols):
    if len(group[sensitive_attribute].unique()) < l_diversity:
        count += len(group)
        #print("l-diversity is not satisfied for the group:", qi_vals)
# Print the total number of records that do not satisfy l-diversity
print(f"Total number of records that do not satisfy l-diversity: {count}")

Total number of records that do not satisfy l-diversity: 690


In [43]:
#from google.colab import drive
#drive.mount('/content/drive')

Mounted at /content/drive


In [44]:
#df.to_csv('/content/drive/MyDrive/Colab Notebooks/Sepsis/v3_heart.csv', index=False)