<a href="https://colab.research.google.com/github/desstaw/PrivacyPreservingTechniques/blob/main/K_Anonymity_kidney_k%3D2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pandas as pd
from sklearn.preprocessing import StandardScaler
import numpy as np
import matplotlib.pyplot as plt
import random


# Read the imputed kidney dataset directly from the project's GitHub repository
url = "https://raw.githubusercontent.com/desstaw/PrivacyPreservingTechniques/main/datasets/imputed_kidney.csv"
df = pd.read_csv(filepath_or_buffer=url)

# Silence all warnings for the rest of the session
import warnings
warnings.simplefilter(action='ignore')

### K-anonymity on the imputed kidney dataset

**Explanation**:

1. Apply generalization to the quasi-identifiers:
This step applies the generalization hierarchy to each quasi-identifier in the dataset using the pandas "cut" function. This function cuts a Series into bins and then labels the bins with the provided categories. In this code, each quasi-identifier is cut into bins based on the corresponding generalization hierarchy defined earlier.

2. Define "k" that represents the minimum number of individuals that must be in a group to avoid suppression.

3. Group the dataset by the quasi-identifiers with the pandas "groupby" function, then loop over the groups: whenever a group contains fewer than k individuals, the indices of its rows are added to the "suppressed_indices" list.

4. Drop the rows whose indices are in the "suppressed_indices" list, so that every remaining combination of quasi-identifier values is shared by at least k records (i.e. the dataset is k-anonymous).

In [None]:
# Disabled pre-filtering, kept for reference: mask negative numeric values,
# drop incomplete rows, and keep only rows with 'pot' <= 10.
# NOTE(review): df.info() in this notebook reports 378 rows on a non-contiguous
# index (0 to 399), which suggests a filter like this one was executed before it
# was commented out -- confirm, otherwise a fresh Restart & Run All will not
# reproduce the saved outputs.
#numeric_columns = df.select_dtypes(include='number').columns
#df[numeric_columns] = df[numeric_columns][df[numeric_columns] >= 0]
#df = df.dropna()

#df = df[df['pot'] <= 10]

In [None]:
# Summarize the loaded frame: column names, non-null counts and dtypes
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 378 entries, 0 to 399
Data columns (total 26 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   id              378 non-null    float64
 1   age             378 non-null    float64
 2   bp              378 non-null    float64
 3   sg              378 non-null    float64
 4   al              378 non-null    float64
 5   su              378 non-null    float64
 6   bgr             378 non-null    float64
 7   bu              378 non-null    float64
 8   sc              378 non-null    float64
 9   sod             378 non-null    float64
 10  pot             378 non-null    float64
 11  hemo            378 non-null    float64
 12  rbc             378 non-null    object 
 13  pc              378 non-null    object 
 14  pcc             378 non-null    object 
 15  ba              378 non-null    object 
 16  wc              378 non-null    float64
 17  htn             378 non-null    obj

In [None]:
# Sensitive attribute and the quasi-identifier columns that will be generalized
sensitive_attribute = 'classification'

quasi_identifiers = [
    'age', 'bp', 'bgr', 'bu',
    'sc', 'hemo', 'wc', 'rc',
]

# Snap 'sg' onto four representative values; readings outside
# [1.0000, 1.0275] keep their original value (the np.select default).
sg = df['sg']
conditions = [
    (sg >= 1.0000) & (sg < 1.0125),
    (sg >= 1.0125) & (sg < 1.0175),
    (sg >= 1.0175) & (sg < 1.0225),
    (sg >= 1.0225) & (sg <= 1.0275),
]
choices = [1.010, 1.015, 1.020, 1.025]
df['sg'] = np.select(conditions, choices, default=sg)

# Round 'al' and 'su' to the nearest whole number and clamp them to 0..4
for graded_col in ('al', 'su'):
    df[graded_col] = df[graded_col].round().clip(lower=0, upper=4)

# Bin edges for each quasi-identifier.
# 'sod', 'pot' and 'pcv' are currently not generalized; the edges that were
# tried for them are kept here for reference:
#   'sod': [0, 130, 140, 170]
#   'pot': [0, 3.6, 3.7, 3.8, 3.9, 4, 10]
#   'pcv': [0, 8, 10, 12, 14, 16, 18]
bin_edges = {
    'age': [0, 40, 50, 60, 100],
    'bp': [0, 80, 180],
    'bgr': [0, 200, 500],
    'bu': [0, 50, 400],
    'sc': [0, 1.3, 32],
    'hemo': [0, 12, 18],
    'wc': [0, 8000, 28000],
    'rc': [0, 11, 14, 19],
}

# Generalization hierarchy: each quasi-identifier column cut into its intervals
generalization_hierarchy = {
    col: pd.cut(df[col], bins=edges) for col, edges in bin_edges.items()
}

<class 'pandas.core.frame.DataFrame'>
Int64Index: 378 entries, 0 to 399
Data columns (total 26 columns):
 #   Column          Non-Null Count  Dtype   
---  ------          --------------  -----   
 0   id              378 non-null    float64 
 1   age             378 non-null    category
 2   bp              378 non-null    category
 3   sg              378 non-null    float64 
 4   al              378 non-null    float64 
 5   su              378 non-null    float64 
 6   bgr             378 non-null    category
 7   bu              378 non-null    category
 8   sc              378 non-null    category
 9   sod             378 non-null    float64 
 10  pot             378 non-null    float64 
 11  hemo            378 non-null    category
 12  rbc             378 non-null    object  
 13  pc              378 non-null    object  
 14  pcc             378 non-null    object  
 15  ba              378 non-null    object  
 16  wc              378 non-null    category
 17  htn             

In [None]:
# Print how many records fall into each generalized interval.
# The counts are read from `generalization_hierarchy` (the binned columns)
# rather than from `df`: at this point of a top-to-bottom run `df` still holds
# the raw numeric values, because the generalization is only written back to
# `df` in a later cell, so `df[col].value_counts()` would list raw readings
# instead of intervals.
# Not shown (not part of the hierarchy): sg, al, su, sod, pot, pcv.
for col, binned in generalization_hierarchy.items():
    print(binned.value_counts().sort_index())

(0, 40]       88
(40, 50]      77
(50, 60]      91
(60, 100]    122
Name: age, dtype: int64
(0, 80]      297
(80, 180]     81
Name: bp, dtype: int64
(0, 200]      301
(200, 500]     77
Name: bgr, dtype: int64
(0, 50]      254
(50, 400]    124
Name: bu, dtype: int64
(0.0, 1.3]     198
(1.3, 32.0]    180
Name: sc, dtype: int64
(0, 12]     164
(12, 18]    214
Name: hemo, dtype: int64
(0, 8000]        145
(8000, 28000]    233
Name: wc, dtype: int64
(0, 11]     111
(11, 14]    145
(14, 19]    122
Name: rc, dtype: int64


In [None]:
# Apply generalization to the quasi-identifiers.
# Only columns that are still numeric are binned, so re-running this cell on an
# already generalized frame is a no-op instead of an error.
for col, hierarchy in generalization_hierarchy.items():
    if pd.api.types.is_numeric_dtype(df[col]):
        # The categories of `hierarchy` are the target intervals. No `labels`
        # are passed: pd.cut ignores them when `bins` is an IntervalIndex.
        df[col] = pd.cut(df[col], bins=hierarchy.cat.categories)

# Define the privacy parameter
k = 2

# Compute, for every row, the size of its equivalence class (rows sharing the
# same quasi-identifier values). The grouping uses the string form of the
# values, exactly like the k-anonymity check further down: a value that fell
# outside every bin (NaN) then forms its own class instead of being silently
# dropped by groupby and escaping suppression.
qi_keys = df[quasi_identifiers].astype(str)
grouped = qi_keys.groupby(quasi_identifiers)
class_sizes = grouped[quasi_identifiers[0]].transform("size")

# Suppress every row whose equivalence class has fewer than k members
suppressed_indices = class_sizes.index[class_sizes < k].tolist()
df = df.drop(suppressed_indices)
df.info()

# Remember the surviving original row labels, then renumber the rows 0..n-1
df_index = df.index
df = df.reset_index(drop=True)

counter = len(suppressed_indices)
print('sum of supressed records:', counter)

<class 'pandas.core.frame.DataFrame'>
Int64Index: 310 entries, 0 to 399
Data columns (total 26 columns):
 #   Column          Non-Null Count  Dtype   
---  ------          --------------  -----   
 0   id              310 non-null    float64 
 1   age             310 non-null    category
 2   bp              310 non-null    category
 3   sg              310 non-null    float64 
 4   al              310 non-null    float64 
 5   su              310 non-null    float64 
 6   bgr             310 non-null    category
 7   bu              310 non-null    category
 8   sc              310 non-null    category
 9   sod             310 non-null    float64 
 10  pot             310 non-null    float64 
 11  hemo            310 non-null    category
 12  rbc             310 non-null    object  
 13  pc              310 non-null    object  
 14  pcc             310 non-null    object  
 15  ba              310 non-null    object  
 16  wc              310 non-null    category
 17  htn             

In [None]:
# Disabled duplicate of the suppression step: the code below is wrapped in a
# string literal, so none of it runs; the cell only echoes the string as output.
'''
# Define the privacy parameter
k = 2

# Group the dataset by the quasi-identifiers and suppress the groups with less than k rows
grouped = df.groupby(quasi_identifiers)
suppressed_indices = []
for group_name, group in grouped:
    if len(group) < k:
        suppressed_indices.extend(group.index)
df = df.drop(suppressed_indices)

# Restore the original index
df_index = df.index
df = df.reset_index(drop=True)
'''

'\n# Define the privacy parameter\nk = 2\n\n# Group the dataset by the quasi-identifiers and suppress the groups with less than k rows\ngrouped = df.groupby(quasi_identifiers)\nsuppressed_indices = []\nfor group_name, group in grouped:\n    if len(group) < k:\n        suppressed_indices.extend(group.index)\ndf = df.drop(suppressed_indices)\n\n# Restore the original index\ndf_index = df.index\ndf = df.reset_index(drop=True)\n'

In [None]:
# Re-inspect the frame after the suppression step and the index reset
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 310 entries, 0 to 309
Data columns (total 26 columns):
 #   Column          Non-Null Count  Dtype   
---  ------          --------------  -----   
 0   id              310 non-null    float64 
 1   age             310 non-null    category
 2   bp              310 non-null    category
 3   sg              310 non-null    float64 
 4   al              310 non-null    float64 
 5   su              310 non-null    float64 
 6   bgr             310 non-null    category
 7   bu              310 non-null    category
 8   sc              310 non-null    category
 9   sod             310 non-null    float64 
 10  pot             310 non-null    float64 
 11  hemo            310 non-null    category
 12  rbc             310 non-null    object  
 13  pc              310 non-null    object  
 14  pcc             310 non-null    object  
 15  ba              310 non-null    object  
 16  wc              310 non-null    category
 17  htn             

In [None]:
# Preview the first 20 rows of the generalized frame
df.head(20)

Unnamed: 0,id,age,bp,sg,al,su,bgr,bu,sc,sod,...,wc,htn,dm,cad,appet,pe,ane,rc,pcv,classification
0,0.0,"(40, 50]","(0, 80]",1.02,1.0,0.0,"(0, 200]","(0, 50]","(0.0, 1.3]",136.841184,...,"(0, 8000]",yes,yes,no,good,no,no,"(14, 19]",15.4,ckd
1,2.0,"(60, 100]","(0, 80]",1.01,2.0,3.0,"(200, 500]","(50, 400]","(1.3, 32.0]",132.244944,...,"(0, 8000]",no,yes,no,poor,no,yes,"(0, 11]",9.6,ckd
2,4.0,"(50, 60]","(0, 80]",1.01,2.0,0.0,"(0, 200]","(0, 50]","(1.3, 32.0]",134.895939,...,"(0, 8000]",no,no,no,good,no,no,"(11, 14]",11.6,ckd
3,6.0,"(60, 100]","(0, 80]",1.01,0.0,0.0,"(0, 200]","(50, 400]","(1.3, 32.0]",104.0,...,"(8000, 28000]",no,no,no,good,no,no,"(11, 14]",12.4,ckd
4,8.0,"(50, 60]","(80, 180]",1.015,3.0,0.0,"(0, 200]","(50, 400]","(1.3, 32.0]",134.767891,...,"(8000, 28000]",yes,yes,no,good,no,yes,"(0, 11]",10.8,ckd
5,9.0,"(50, 60]","(80, 180]",1.02,2.0,0.0,"(0, 200]","(50, 400]","(1.3, 32.0]",114.0,...,"(8000, 28000]",yes,yes,no,poor,no,yes,"(0, 11]",9.5,ckd
6,10.0,"(40, 50]","(0, 80]",1.01,2.0,4.0,"(200, 500]","(50, 400]","(1.3, 32.0]",128.380739,...,"(8000, 28000]",yes,yes,no,good,no,yes,"(0, 11]",9.4,ckd
7,11.0,"(60, 100]","(0, 80]",1.01,3.0,0.0,"(200, 500]","(50, 400]","(1.3, 32.0]",131.0,...,"(0, 8000]",yes,yes,no,poor,yes,no,"(0, 11]",10.8,ckd
8,12.0,"(60, 100]","(0, 80]",1.015,3.0,1.0,"(200, 500]","(50, 400]","(1.3, 32.0]",138.0,...,"(8000, 28000]",yes,yes,yes,poor,yes,no,"(0, 11]",9.7,ckd
9,14.0,"(60, 100]","(0, 80]",1.01,3.0,2.0,"(0, 200]","(50, 400]","(1.3, 32.0]",130.0,...,"(8000, 28000]",yes,yes,yes,poor,yes,no,"(0, 11]",5.6,ckd


In [None]:
# Show the number of missing values per column. (As a bare expression in the
# middle of the cell its result was computed but never displayed.)
print(df.isnull().sum())

# Treat empty strings as missing and drop incomplete rows, without mutating the
# frame in place.
# NOTE(review): any row removed here shrinks an equivalence class after the
# suppression step, so k-anonymity should be re-checked if the row count changes.
df = df.replace('', np.nan).dropna()
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 310 entries, 0 to 309
Data columns (total 26 columns):
 #   Column          Non-Null Count  Dtype   
---  ------          --------------  -----   
 0   id              310 non-null    float64 
 1   age             310 non-null    category
 2   bp              310 non-null    category
 3   sg              310 non-null    float64 
 4   al              310 non-null    float64 
 5   su              310 non-null    float64 
 6   bgr             310 non-null    category
 7   bu              310 non-null    category
 8   sc              310 non-null    category
 9   sod             310 non-null    float64 
 10  pot             310 non-null    float64 
 11  hemo            310 non-null    category
 12  rbc             310 non-null    object  
 13  pc              310 non-null    object  
 14  pcc             310 non-null    object  
 15  ba              310 non-null    object  
 16  wc              310 non-null    category
 17  htn             

Test the k-anonymity:

**Explanation**

`for qi_vals, group in df.groupby(qi_cols):` groups the records in `df` by the values of the quasi-identifiers. For each group, the variable `qi_vals` contains the group's quasi-identifier values and the variable `group` contains the records that share them.

`counts = Counter(df[qi_cols].apply(tuple, axis=1))`: creates a Counter object that counts the number of occurrences of each combination of quasi-identifiers in the dataframe df. The apply() function is used to apply the tuple() function to each row of the dataframe, which converts the values of the quasi-identifiers in each row to a tuple.

`num_violations = len([count for count in counts.values() if count < k_anonymity])`: This line counts the quasi-identifier combinations (equivalence classes) in `df` that do not satisfy k-anonymity. It does this by iterating over the values of the Counter object `counts` and counting how many of them are less than k. For k = 2 every violating combination contains exactly one record, so the result also equals the number of violating records.

In [None]:
from collections import Counter

from collections import Counter
# Define the quasi-identifiers
qi_cols = ['age', 'bp','bu', 'bgr', 'wc', 'hemo', 'sc', 'rc' ]

# Convert the interval (categorical) columns to plain strings so that the
# quasi-identifier values of a row can be combined into a hashable tuple
for col in qi_cols:
    df[col] = df[col].astype(str)

k_anonymity = 2

# Calculate the frequency count of unique combinations of quasi-identifiers
qi_counts = df[qi_cols].apply(tuple, axis=1).value_counts()

# Equivalence classes that contain fewer than k records
violating_classes = qi_counts[qi_counts < k_anonymity]

# Number of *records* in those classes. Counting the classes themselves only
# matches the number of records when k = 2 (each violating class then has
# exactly one record); summing the class sizes is correct for any k.
num_violations = violating_classes.sum()

print("Number of records that do not satisfy k-anonymity:", num_violations)

Number of records that do not satisfy k-anonymity: 26


Test l-diversity

In [None]:

# Disabled l-diversity check: the code below is wrapped in a string literal, so
# none of it runs; the cell only echoes the string as output.
# NOTE(review): it refers to the columns 'target' and 'sex', which do not appear
# among the columns of this notebook's frame (the sensitive attribute defined
# earlier is 'classification') -- presumably copied from another notebook;
# adapt the column names before enabling it.
'''
from collections import Counter

sensitive_attribute = 'target'

l_diversity = 2

# Initialize counter
count = 0

# Define the quasi-identifiers
qi_cols = ['age', 'sex']

# Check l-diversity for each group of records
for qi_vals, group in df.groupby(qi_cols):
    if len(group[sensitive_attribute].unique()) < l_diversity:
        count += len(group)
        #print("l-diversity is not satisfied for the group:", qi_vals)
# Print the total number of records that do not satisfy l-diversity
print(f"Total number of records that do not satisfy l-diversity: {count}")
'''

'\nfrom collections import Counter\n\nsensitive_attribute = \'target\'\n\nl_diversity = 2\n\n# Initialize counter\ncount = 0\n\n# Define the quasi-identifiers\nqi_cols = [\'age\', \'sex\']\n\n# Check l-diversity for each group of records\nfor qi_vals, group in df.groupby(qi_cols):\n    if len(group[sensitive_attribute].unique()) < l_diversity:\n        count += len(group)\n        #print("l-diversity is not satisfied for the group:", qi_vals)\n# Print the total number of records that do not satisfy l-diversity\nprint(f"Total number of records that do not satisfy l-diversity: {count}")\n'

In [None]:
#from google.colab import drive
#drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Disabled export of the anonymized frame to Google Drive.
# NOTE(review): the file name says k=3, but this notebook sets k = 2 -- confirm
# the intended name before enabling the export.
#df.to_csv('/content/drive/MyDrive/Colab Notebooks/Kidney_DS/k=3_anonymized_kidney.csv', index=False)