<a href="https://colab.research.google.com/github/desstaw/PrivacyPreservingTechniques/blob/main/Wrapper_Instance_KNN_with_k_Anonymization.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
from sklearn.neighbors import KNeighborsClassifier
import pandas as pd
from sklearn.model_selection import train_test_split
from functools import wraps
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import accuracy_score
from sklearn.metrics import recall_score
from sklearn.metrics import f1_score 
import warnings
warnings.simplefilter('ignore')

In [None]:
def train_and_evaluate_classifier(classifier, X_train, y_train, X_test, y_test):
    """
    Fit a classifier on the training split and report its test-set metrics.

    The classifier is fitted in place on ``X_train``/``y_train`` and then used
    to predict ``X_test``. Accuracy, recall and F1 score are computed against
    ``y_test``, printed, and returned.

    :param classifier: Estimator exposing ``fit`` and ``predict``
    :param X_train: Training features
    :param y_train: Training labels
    :param X_test: Testing features
    :param y_test: Testing labels
    :return: The tuple ``(accuracy, recall, f1)``
    """
    classifier.fit(X_train, y_train)
    predictions = classifier.predict(X_test)

    # Compute every metric first, then report them in a fixed order
    scores = (
        accuracy_score(y_test, predictions),
        recall_score(y_test, predictions),
        f1_score(y_test, predictions),
    )
    for caption, value in zip(("Accuracy:", "Recall:", "F1 score:"), scores):
        print(caption, value)

    return scores

In [None]:
def anonymize_decorator(classifier, random_state=None):
    """
    Build a decorator that benchmarks a classifier before and after anonymization.

    The decorated function must take the dataset (a pandas DataFrame whose
    last column holds the labels) as its first parameter and return the
    anonymized DataFrame, again with the labels in the last column. Each call
    of the decorated function:

    1. trains/evaluates ``classifier`` on an 80/20 split of the original data,
    2. runs the anonymization function,
    3. trains/evaluates ``classifier`` on an 80/20 split of the anonymized data,
    4. returns the anonymized dataset unchanged.

    :param classifier: Estimator exposing ``fit`` and ``predict``; it is
        re-fitted for each of the two evaluations
    :param random_state: Seed forwarded to both ``train_test_split`` calls.
        With the default ``None`` the two splits are drawn independently, so
        part of the metric difference is split noise. Pass an int to make the
        run reproducible and, when the anonymized dataset keeps the same
        number of rows, to evaluate both datasets on the same rows.
    :return: A decorator for anonymization functions
    """
    import inspect

    def _evaluate(dataset, title):
        # Features are every column but the last; the last column is the label
        features = dataset.iloc[:, :-1]
        labels = dataset.iloc[:, -1]
        X_train, X_test, y_train, y_test = train_test_split(
            features, labels, test_size=0.2, random_state=random_state
        )
        print(title)
        train_and_evaluate_classifier(classifier, X_train, y_train, X_test, y_test)

    def decorator(func):
        @wraps(func)
        def wrapper(*args, **kwargs):
            # Locate the dataset whether it was passed positionally or by
            # keyword (it is always the first parameter of ``func``)
            if args:
                dataset = args[0]
            else:
                first_param = next(iter(inspect.signature(func).parameters), None)
                if first_param not in kwargs:
                    raise TypeError(
                        f"{func.__name__}() must be given the dataset as its first argument"
                    )
                dataset = kwargs[first_param]

            # Baseline: evaluate the classifier on the original dataset
            _evaluate(dataset, "Results on original dataset:")

            # Apply the privacy preserving technique to the dataset
            anonym_dataset = func(*args, **kwargs)

            # Evaluate the classifier on the anonymized dataset
            _evaluate(anonym_dataset, "Results on anonymized dataset:")

            # Return the anonymized dataset
            return anonym_dataset
        return wrapper
    return decorator

In [None]:
# Classifier under evaluation: k-nearest neighbours with k=6.
# Any other estimator exposing fit()/predict() can be substituted here, since
# train_and_evaluate_classifier only calls those two methods.
knn_classifier = KNeighborsClassifier(n_neighbors=6)

In [None]:
# Privacy preserving function decorator
@anonymize_decorator(knn_classifier)
def k_anonymize(dataset, columns):
    """
    Anonymize certain columns of the dataset
    using suppression and generalization

    Works on a copy; the input DataFrame is left untouched. With the built-in
    column specification the "sex" column is suppressed (and then removed) and
    the "age" column is generalized into the ranges "0-40", "40-60" and
    "> 60", which are stored as the float codes 0.0, 1.0 and 2.0. Ages that
    cannot be binned (missing values) are encoded as -1.0.

    Because of the decorator, calling this function also prints the
    classifier metrics on the original and on the anonymized dataset.

    :param dataset: The set of columns and rows composing the dataset
        (pandas DataFrame containing at least "sex" and "age")
    :param columns: Specific columns which entries are considered sensitive.
        NOTE: this argument is currently ignored; the hard-coded specification
        below is always used.
    :return: The anonymized copy of the dataset
    """
    # Describe the columns and their respective anonymization type.
    # Kept under its own name so the (ignored) ``columns`` argument is no
    # longer silently overwritten.
    column_specs = [
        {"label": "sex", "type": "suppressed"},
        {"label": "age", "type": "generalized"},
    ]
    # Ordered range labels; a value's position in this list becomes its code
    range_labels = ['0-40', '40-60', '> 60']
    # Right-inclusive bin edges: x <= 40, 40 < x <= 60, x > 60.
    # The last bin is open-ended so that values above 100 still land in "> 60"
    range_edges = [float('-inf'), 40, 60, float('inf')]

    anon_dataset = dataset.copy()

    # Target all sensitive columns
    for spec in column_specs:
        column_label = spec['label']
        column_type = spec['type']

        if column_type == 'suppressed':
            # Replace every entry with an asterisk
            anon_dataset[column_label] = '*'

        elif column_type == 'semi-suppressed':
            # Replace the first 70% of the characters with asterisks
            anon_dataset[column_label] = [
                '*' * round(len(value) * .7) + value[round(len(value) * .7):]
                for value in anon_dataset[column_label]
            ]

        elif column_type == 'generalized':
            # Summarize the data using ranges. Done in one vectorized pass
            # instead of per-row chained assignment, which depends on the
            # index labels and does not write through under copy-on-write.
            numeric_values = pd.to_numeric(anon_dataset[column_label])
            binned = pd.cut(numeric_values, bins=range_edges, labels=range_labels)
            # Store the category codes as floats so the column stays numeric
            anon_dataset[column_label] = binned.cat.codes.astype(float)

    # Suppressed columns now hold only '*' and carry no information for the
    # classifier, so drop them (with the built-in specification: "sex")
    suppressed_labels = [
        spec['label'] for spec in column_specs if spec['type'] == 'suppressed'
    ]
    anon_dataset = anon_dataset.drop(columns=suppressed_labels)
    return anon_dataset


In [None]:
def preprocess(data):
    """
    Cast column types and min-max scale the continuous features.

    Operates on a copy of ``data``; the DataFrame passed in is not modified.

    :param data: pandas DataFrame containing the columns "oldpeak", "sex",
        "cp", "fbs", "restecg", "exang", "slope", "ca", "thal", "target",
        "trestbps", "chol" and "thalach"
    :return: A new DataFrame with converted dtypes in which "trestbps",
        "chol", "thalach" and "oldpeak" are scaled with ``MinMaxScaler``
    """
    categorical_columns = ['sex', 'cp', 'fbs', 'restecg', 'exang', 'slope', 'ca', 'thal']
    scaled_columns = ['trestbps', 'chol', 'thalach', 'oldpeak']

    # Work on a copy of the argument rather than on a global DataFrame
    df_norm = data.copy()

    # oldpeak to int (the cast drops any fractional part)
    df_norm['oldpeak'] = df_norm['oldpeak'].astype(int)
    # categorical to object
    for column_name in categorical_columns:
        df_norm[column_name] = df_norm[column_name].astype(object)
    df_norm['target'] = df_norm['target'].astype(int)

    # Rescale the continuous columns with a freshly fitted scaler
    scaler = MinMaxScaler()
    df_norm[scaled_columns] = scaler.fit_transform(df_norm[scaled_columns])
    return df_norm


In [None]:
# Read the heart dataset (CSV) from the project's GitHub repository
url = "https://raw.githubusercontent.com/desstaw/PrivacyPreservingTechniques/main/datasets/heart.csv"
df = pd.read_csv(url)

# Preprocess the data: cast column dtypes and min-max scale the continuous columns
df_preprocessed = preprocess(df)

# Run the decorated anonymizer. The decorator prints the classifier metrics on
# the original and on the anonymized data, then returns the anonymized DataFrame.
# NOTE: the second argument is currently ignored by k_anonymize, which uses
# its own hard-coded column specification.
anonym_dataset = k_anonymize(df_preprocessed, df_preprocessed.columns)

Results on original dataset:
Accuracy: 0.8878048780487805
Recall: 0.84
F1 score: 0.8795811518324608
Results on anonymized dataset:
Accuracy: 0.8634146341463415
Recall: 0.8214285714285714
F1 score: 0.8679245283018867
