In [1224]:
# Import Libraries
import numpy as np # linear algebra
import math
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# IMPORT DATASET USING PANDAS

In [1225]:
# define file path
file_path = 'adult.data'

In [1226]:
# Define column names for the dataset
column_names = [
    'Age', 'Workclass', 'FinalWeight', 'Education', 'EducationNum',
    'MaritalStatus', 'Occupation', 'Relationship', 'Race', 'Gender',
    'CapitalGain', 'CapitalLoss', 'HoursPerWeek', 'NativeCountry', 'Income'
]

In [1227]:
# Load the dataset
df = pd.read_csv(
    file_path,
    names=column_names,
    sep=',\s*',
    engine='python'
)
number_of_records = len(df)
print(f"Number of records in the dataset: {number_of_records}")

Number of records in the dataset: 32561


## Display Data

In [1228]:
# Print the first few rows of the DataFrame in a formatted way
print(df.head())

   Age         Workclass  FinalWeight  Education  EducationNum  \
0   39         State-gov        77516  Bachelors            13   
1   50  Self-emp-not-inc        83311  Bachelors            13   
2   38           Private       215646    HS-grad             9   
3   53           Private       234721       11th             7   
4   28           Private       338409  Bachelors            13   

        MaritalStatus         Occupation   Relationship   Race  Gender  \
0       Never-married       Adm-clerical  Not-in-family  White    Male   
1  Married-civ-spouse    Exec-managerial        Husband  White    Male   
2            Divorced  Handlers-cleaners  Not-in-family  White    Male   
3  Married-civ-spouse  Handlers-cleaners        Husband  Black    Male   
4  Married-civ-spouse     Prof-specialty           Wife  Black  Female   

   CapitalGain  CapitalLoss  HoursPerWeek  NativeCountry Income  
0         2174            0            40  United-States  <=50K  
1            0            

# k-Anonymity

## Handle Missing Values

In [1229]:
column_name = 'Occupation'
df = df[(df[column_name] != '') & (df[column_name] != '?')]


## Implement k-anonymity algorithm

In [1230]:
# Generalize Quasi Identifiers
def generalize_age(age):
    if age < 20:
        return '1-20'
    elif 20 <= age < 40:
        return '21-40'
    elif 40 <= age < 60:
        return '41-60'
    elif 60 <= age < 80:
        return '61-80'
    else:
        return '81-100'

def generalize_education(education):
    if education in ['1st-4th']:
        return 'GradeSchool'
    elif education in ['7th-8th', '5th-6th']:
        return 'MiddleSchool'
    elif education == ['HS-grad', '11th', '10th', '9th', '12th']:
        return 'HighSchool'
    elif education in ['Bachelors','Some-college', 'Assoc-acdm', 'Assoc-voc']:
        return 'Grad'
    elif education in ['Preschool']:
        return 'GradeSchool'
    else:
        return 'Grad'

def generalize_marital_status(status):
    if status in ['Married-civ-spouse', 'Married-spouse-absent', 'Married-AF-spouse', 'Separated']:
        return 'Married'
    else:
        return 'Single'

def generalize_race(race):
    if race == 'White':
        return 'White'
    else:
        return 'Non-White'
    
# Generalize Other Attrubutes
    
def generalize_workclass(workclass):
    if workclass in ['Private', 'Self-emp-not-inc', 'Self-emp-inc', 'Federal-gov', 'Local-gov', 'State-gov', 'Without-pay', 'Never-worked','']:
        return '****'
    
def generalize_relationship(relationship):
    if relationship in [ 'Wife', 'Own-child', 'Husband', 'Not-in-family', 'Other-relative', 'Unmarried','']:
        return '****'
    
def generalize_native_country(native_country):
    if native_country in ['United-States', 'Cambodia', 'England', 'Puerto-Rico', 'Canada', 'Germany', 'Outlying-US(Guam-USVI-etc)', 'India', 'Japan', 'Greece', 'South', 'China', 'Cuba', 'Iran', 'Honduras', 'Philippines', 'Italy', 'Poland', 'Jamaica', 'Vietnam', 'Mexico', 'Portugal', 'Ireland', 'France', 'Dominican-Republic', 'Laos', 'Ecuador', 'Taiwan', 'Haiti', 'Columbia', 'Hungary', 'Guatemala', 'Nicaragua', 'Scotland', 'Thailand', 'Yugoslavia', 'El-Salvador', 'Trinadad&Tobago', 'Peru', 'Hong', 'Holand-Netherlands','?']:
        return '****'
    
def generalize_gender(gender):
    if gender in ['Male', 'Female']:
        return '****'
    
def generalize_finalweight(finalweight):
        return '****'

def generalize_education_num(education_num):
        return '****'
    
def generalize_capital_gain(capital_gain):
        return '****'

def generalize_hoursperweek(hours_per_week):
        return '****'
    
def generalize_capital_loss(capital_loss):
        return '****'
    
# Apply generalizations
df['Age'] = df['Age'].apply(generalize_age)
df['Education'] = df['Education'].apply(generalize_education)
df['MaritalStatus'] = df['MaritalStatus'].apply(generalize_marital_status)
df['Race'] = df['Race'].apply(generalize_race)
df['Workclass'] = df['Workclass'].apply(generalize_workclass)
df['Relationship'] = df['Relationship'].apply(generalize_relationship)
df['NativeCountry'] = df['NativeCountry'].apply(generalize_native_country)
df['Gender'] = df['Gender'].apply(generalize_gender)
df['FinalWeight'] = df['FinalWeight'].apply(generalize_finalweight)
df['EducationNum'] = df['EducationNum'].apply(generalize_education_num)
df['CapitalGain'] = df['CapitalGain'].apply(generalize_capital_gain)
df['HoursPerWeek'] = df['HoursPerWeek'].apply(generalize_hoursperweek)
df['CapitalLoss'] = df['CapitalLoss'].apply(generalize_capital_loss)

In [1231]:
#print(df.head())

In [1232]:
# Group by the specified columns and get counts
grouped = df.groupby(['Age', 'Education', 'MaritalStatus', 'Race', 'Income']).size().reset_index(name='counts')

# Filter rows where count is greater than 1 (i.e., there are duplicates)
duplicates = grouped[grouped['counts'] > 1]

# Get the total number of combined similar fields based on specified columns
num_similar_fields = duplicates['counts'].sum()

print(f"Number of combined similar fields based on specified columns: {num_similar_fields}")


print(duplicates)
number_of_records = len(df)
print(f"Number of records left in the dataset after removing rows with missing data: {number_of_records}")

Number of combined similar fields based on specified columns: 30708
       Age     Education MaritalStatus       Race Income  counts
0     1-20          Grad       Married  Non-White  <=50K       4
1     1-20          Grad       Married      White  <=50K      25
2     1-20          Grad        Single  Non-White  <=50K     142
3     1-20          Grad        Single      White  <=50K    1186
4     1-20   GradeSchool        Single      White  <=50K       4
..     ...           ...           ...        ...    ...     ...
71  81-100          Grad        Single  Non-White  <=50K       7
73  81-100          Grad        Single      White  <=50K      32
76  81-100   GradeSchool        Single      White  <=50K       2
78  81-100  MiddleSchool       Married      White  <=50K       6
79  81-100  MiddleSchool        Single      White  <=50K       4

[70 rows x 6 columns]
Number of records left in the dataset after removing rows with missing data: 30718


## Calculation Precision and Distortion 

In [1233]:
def compute_distortion(max_hierarchy_level, generalization_level, attributes):
    return sum(generalization_level / level for level in hierarchy_levels)/attributes

max_hierarchy_level = [2, 2, 2, 2] #Hierarchy used for each attribute is 2
generalization_level = 1 #Generalization used for each attribute is 1
attributes = 4    #Total number of attributes is 4

# Compute the Distortion
total_distortion = compute_distortion(max_hierarchy_level, generalization_level, attributes)

def compute_precision(max_hierarchy_level, generalization_level):
    precisions = [(1 - (generalization_level / level)) for level in max_hierarchy_level]
    return sum(precisions) / len(precisions)

# Compute the Precision
precisions = compute_precision(max_hierarchy_level, generalization_level)

print(f"Distortion is: {total_distortion:.2f} and  Precision: {precisions:.2f}")


Distortion is: 0.50 and  Precision: 0.50


## Write the anonymized data to file 

In [1234]:
# Specify the output file path
output_file_path = 'hw1-1-anonymized_adult_data.csv'

# Write the DataFrame to the CSV file
df.to_csv(output_file_path, index=False)

# Print a message to confirm the file has been written
print(f"Output dataset has been written to {output_file_path}")

Output dataset has been written to hw1-1-anonymized_adult_data.csv


In [1235]:

output_file_path = 'hw1-1-anonymized_adult_data.csv'

In [1236]:

df = pd.read_csv(output_file_path)

In [1237]:
print(df2.head())

     Age Workclass FinalWeight Education EducationNum MaritalStatus  \
0  30-39      ****        ****      Grad         ****        Single   
1  50-59      ****        ****      Grad         ****       Married   
2  30-39      ****        ****      Grad         ****        Single   
3  50-59      ****        ****      Grad         ****       Married   
4  20-29      ****        ****      Grad         ****       Married   

          Occupation Relationship       Race Gender CapitalGain CapitalLoss  \
0       Adm-clerical         ****      White   ****        ****        ****   
1    Exec-managerial         ****      White   ****        ****        ****   
2  Handlers-cleaners         ****      White   ****        ****        ****   
3  Handlers-cleaners         ****  Non-White   ****        ****        ****   
4     Prof-specialty         ****  Non-White   ****        ****        ****   

  HoursPerWeek NativeCountry Income  
0         ****          ****  <=50K  
1         ****        

# 2. Diversity

In [1238]:
def entropy_of_class(s_values):
    total_records = len(s_values)
    s_freq = s_values.value_counts()
    entropy = -sum((freq/total_records) * math.log2(freq/total_records) for freq in s_freq)
    return entropy

def generalize_data_for_entropy_l_diversity(df, sensitive_column, l):
    unique_rows = df.drop(columns=sensitive_column).drop_duplicates()
    non_compliant_rows = []
    for _, unique_row in unique_rows.iterrows():
        mask = (df.drop(columns=sensitive_column) == unique_row).all(axis=1)
        s_values_of_class = df[mask][sensitive_column]
        
        if entropy_of_class(s_values_of_class) < l:
            non_compliant_rows.extend(df[mask].index.tolist())
    
    df_anonymized = df.drop(index=non_compliant_rows)
    return df_anonymized

l = 1  
sensitive_column = 'Occupation'
df_anonymized = generalize_data_for_entropy_l_diversity(df, sensitive_column, l)
print(df_anonymized)


         Age Workclass FinalWeight Education EducationNum MaritalStatus  \
0      21-40      ****        ****      Grad         ****        Single   
1      41-60      ****        ****      Grad         ****       Married   
2      21-40      ****        ****      Grad         ****        Single   
3      41-60      ****        ****      Grad         ****       Married   
4      21-40      ****        ****      Grad         ****       Married   
...      ...       ...         ...       ...          ...           ...   
30713  21-40      ****        ****      Grad         ****       Married   
30714  41-60      ****        ****      Grad         ****       Married   
30715  41-60      ****        ****      Grad         ****        Single   
30716  21-40      ****        ****      Grad         ****        Single   
30717  41-60      ****        ****      Grad         ****       Married   

              Occupation Relationship       Race Gender CapitalGain  \
0           Adm-clerical    

In [1239]:

QIs = ['Age', 'Education'] 
hierarchies = {
    'Age': ['1-20', '21-40', '41-59', '60-79', '80-99'],
    'Education': ['GradeSchool', 'MiddleSchool', 'HighSchool', 'Undergrad', 'Grad'],  
   }

# Load your input dataset 'df' here

def generalize_age(age, level):
    if age.isdigit():
        age_groups = {
            '1-20': (1, 20),
            '21-40': (21, 40),
            '41-59': (41, 59),
            '60-79' :(60, 79),
            '80-99' :(80, 99),
                       
        }
        for group, age_range in age_groups.items():
            if age_range[0] <= int(age) <= age_range[1]:
                return str(sum(age_range) // 2)
    

def generalize_education(education, level):
    education_groups = {
        'HighSchool': ['HS-grad', '11th', '10th', '9th', '12th'],
        'Undergrad': ['Bachelors','Some-college', 'Assoc-acdm', 'Assoc-voc',],
        'Grad': ['Prof-school', 'Doctorate'],
        'ElementarySchool': ['1st-4th'],
        'MiddleSchool': ['7th-8th', '5th-6th'],
    }


def entropy_c_diversity(df, k, c, sensitive_attribute):
    # Define the Entropy c-Diversity algorithm (modified)
    def calculate_c_diversity(data):
        equivalence_classes = data.groupby(QIs)[sensitive_attribute].apply(list)
        for sensitive_values in equivalence_classes:
            entropy = calculate_entropy(sensitive_values)
            if entropy < np.log(c):
                return False
        return True


    best_solution = df.copy()


    for age_level in range(1, 5):  
        for education_level in range(1, 5): 
            el
            df_copy = df.copy()
            df_copy['Age'] = df_copy['Age'].apply(generalize_age, level=age_level)
            df_copy['Education'] = df_copy['Education'].apply(generalize_education, level=education_level)

   
            if calculate_c_diversity(df_copy):
                best_solution = df_copy.copy()
                break  

    return best_solution

In [1240]:
# Define parameters for c-diversity and other configurations
k_value = 5


c_values = [0.5, 1, 2]
sensitive_attribute = 'Occupation'

# Iterate over different values of 'c'
for c_value in c_values:
    result_entropy_c_diversity = entropy_c_diversity(df, k_value, c_value, sensitive_attribute)
    precision_sensitive = 1.0  


    # Print the distortion and precision values for each 'c' value
    print(f"Results for c={c_value}:")
    print(f"Precision for Sensitive attribute: {precision_sensitive}")

    # Write the output dataset to a file (you can customize the file path)
    result_entropy_c_diversity.to_csv(f'hw1-2-output_entropy_c_diversity_c{c_value}.csv', index=False)

Results for c=0.5:
Precision for Sensitive attribute: 1.0
Results for c=1:
Precision for Sensitive attribute: 1.0
Results for c=2:
Precision for Sensitive attribute: 1.0
