In [16]:
import pandas as pd


# Load the datasets that the diversity checks below run on.
# Paths use forward slashes so they resolve on Windows, Linux and macOS alike;
# backslash-separated paths only resolve on Windows.
file_path = 'data/anonymized_data.xlsx'
df1 = pd.read_excel(file_path)
# presumably two local-suppression variants of the same data (inferred from the file names) — TODO confirm
df2 = pd.read_excel('data/localSuppr1.xlsx')
df3 = pd.read_excel('data/localSuppr2.xlsx')

In [17]:
# l-diversity check (l = 2) on df1, so that no voter's party preference can be re-identified.
# Education is not in the public data register, so that column cannot be used for
# re-identification. Records are therefore grouped by the remaining columns (sex, citizenship,
# marital status, age range and evote); the sensitive column (party) is left out of the key.
# NOTE(review): groupby skips rows with a missing key value and nunique() ignores missing
# parties by default — confirm that this is the intended handling.

quasi_identifiers1 = ['sex', 'citizenship', 'marital_status', 'age_range', 'evote']
grouped1 = df1.groupby(quasi_identifiers1)

# For every group holding fewer than two distinct parties, keep the key columns plus party.
NonConformingGroups1 = [
    members[quasi_identifiers1 + ['party']]
    for _, members in grouped1
    if members['party'].nunique() < 2
]

# This yields the relevant columns of the records in groups that do not conform to 2-diversity
NonConformingGroups1

[       sex citizenship marital_status age_range  evote    party
 16  Female      Danish        Married     18-30      0  Party A,
         sex citizenship marital_status age_range  evote    party
 31   Female      Danish    Not Married     18-30      1  Party A
 106  Female      Danish    Not Married     18-30      1  Party A
 174  Female      Danish    Not Married     18-30      1  Party A,
         sex citizenship marital_status age_range  evote    party
 8    Female      Danish    Not Married     31-50      0  Party A
 17   Female      Danish    Not Married     31-50      0  Party A
 42   Female      Danish    Not Married     31-50      0  Party A
 77   Female      Danish    Not Married     31-50      0  Party A
 129  Female      Danish    Not Married     31-50      0  Party A,
        sex citizenship marital_status age_range  evote    party
 46  Female  Not Danish        Married     18-30      0  Party A,
         sex citizenship marital_status age_range  evote    party
 151  Fema

In [18]:
# l-diversity check (l = 2) on df2, so that no voter's party preference can be re-identified.
# Education is not in the public data register, so that column cannot be used for
# re-identification and is not part of the grouping key; the sensitive column (party) is
# left out of the key as well.
# NOTE(review): the description of this step also called e-vote unavailable in the register and
# listed only sex, age, citizenship and marital status as keys, yet 'evote' IS a grouping key
# below — confirm which of the two is intended.
# NOTE(review): groupby skips rows with a missing key value and nunique() ignores missing
# parties by default — confirm that this is the intended handling.

quasi_identifiers2 = ['sex', 'citizenship', 'marital_status', 'age_range', 'evote']
grouped2 = df2.groupby(quasi_identifiers2)

# For every group holding fewer than two distinct parties, keep the key columns plus party.
NonConformingGroups2 = [
    members[quasi_identifiers2 + ['party']]
    for _, members in grouped2
    if members['party'].nunique() < 2
]

# This yields the relevant columns of the records in groups that do not conform to 2-diversity
NonConformingGroups2

[        sex citizenship marital_status age_range  evote    party
 31   Female      Danish    Not Married     18-30    1.0  Party A
 106  Female      Danish    Not Married     18-30    1.0  Party A
 174  Female      Danish    Not Married     18-30    1.0  Party A,
         sex citizenship marital_status age_range  evote    party
 8    Female      Danish    Not Married     31-50    0.0  Party A
 17   Female      Danish    Not Married     31-50    0.0  Party A
 42   Female      Danish    Not Married     31-50    0.0  Party A
 77   Female      Danish    Not Married     31-50    0.0  Party A
 129  Female      Danish    Not Married     31-50    0.0  Party A,
       sex citizenship marital_status age_range  evote    party
 59   Male      Danish        Married     31-50    0.0  Party A
 88   Male      Danish        Married     31-50    0.0  Party A
 154  Male      Danish        Married     31-50    0.0  Party A
 169  Male      Danish        Married     31-50    0.0  Party A
 171  Male      Da

In [19]:
# As we can see from the result above, only two groups violate l-diversity.
# One of those groups includes null values in the party column, hence it is not a problem.
# For the other group there is also record 104 (Male, Not Danish, Married, NaN, Party A), which means that
# anyone who could be traced to those records (125, 178, 192) could potentially also be the other record (104).


In [20]:
# l-diversity check (l = 2) on df3, so that no voter's party preference can be re-identified.
# Education is not in the public data register, so that column cannot be used for
# re-identification and is not part of the grouping key; the sensitive column (party) is
# left out of the key as well.
# NOTE(review): the description of this step also called e-vote unavailable in the register and
# listed only sex, age, citizenship and marital status as keys, yet 'evote' IS a grouping key
# below — confirm which of the two is intended.
# NOTE(review): groupby skips rows with a missing key value and nunique() ignores missing
# parties by default — confirm that this is the intended handling.

quasi_identifiers3 = ['sex', 'citizenship', 'marital_status', 'age_range', 'evote']
grouped3 = df3.groupby(quasi_identifiers3)

# For every group holding fewer than two distinct parties, keep the key columns plus party.
NonConformingGroups3 = [
    members[quasi_identifiers3 + ['party']]
    for _, members in grouped3
    if members['party'].nunique() < 2
]

# This yields the relevant columns of the records in groups that do not conform to 2-diversity
NonConformingGroups3

[        sex citizenship marital_status age_range  evote    party
 8    Female      Danish    Not Married     31-50    0.0  Party A
 42   Female      Danish    Not Married     31-50    0.0  Party A
 77   Female      Danish    Not Married     31-50    0.0  Party A
 129  Female      Danish    Not Married     31-50    0.0  Party A,
       sex citizenship marital_status age_range  evote    party
 59   Male      Danish        Married     31-50    0.0  Party A
 88   Male      Danish        Married     31-50    0.0  Party A
 154  Male      Danish        Married     31-50    0.0  Party A
 171  Male      Danish        Married     31-50    0.0  Party A
 187  Male      Danish        Married     31-50    0.0  Party A,
       sex citizenship marital_status age_range  evote    party
 35   Male      Danish    Not Married     18-30    1.0  Party A
 52   Male      Danish    Not Married     18-30    1.0  Party A
 53   Male      Danish    Not Married     18-30    1.0  Party A
 118  Male      Danish    No

In [21]:
# Here the case is even stronger, as the only non-conforming group includes null values for the party vote.