In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import pickle
import os
import tqdm

In [2]:
def drop_columns(df, columns):
    """Return a new frame equal to ``df`` minus ``columns``; ``df`` itself is left untouched."""
    remaining = df.drop(columns=columns)
    return remaining

def discretize_columns(df, columns):
    """Replace the values of the given columns with integer codes.

    For each column, the distinct values are ordered with ``sorted_with_nans``
    and numbered from 0 in that order. Cells holding a missing value are
    written back as ``None`` instead of a code; a missing value that appears
    among the distinct values still occupies a position in the numbering and
    in the returned mapping.

    The encoding is applied to a copy, so the frame passed in is not modified
    and callers must use the returned frame.

    Args:
        df: DataFrame holding the columns to encode.
        columns: Iterable of column names to encode.

    Returns:
        A tuple ``(encoded_df, column2discrete_value2value)``; the second
        element maps each column name to a ``{code: original value}`` dict.
    """
    # Work on a copy so the caller's frame is never silently rewritten.
    df = df.copy()
    column2discrete_value2value = {}
    for column in columns:
        value2code = {value: code for code, value in enumerate(sorted_with_nans(df[column].unique()))}
        # Inverse lookup, kept so the codes can be translated back later.
        column2discrete_value2value[column] = {code: value for value, code in value2code.items()}
        df[column] = [None if pd.isnull(value) else value2code[value] for value in df[column]]
    return df, column2discrete_value2value

def sorted_with_nans(l):
    """Return the elements of ``l`` in ascending order with missing values first.

    Anything ``pd.isnull`` reports as null is placed at the front; the other
    elements follow in their natural order. Sorting on an
    ``(is_present, value)`` pair means a null is never compared against a real
    value, so numeric inputs containing NaN/None sort without a TypeError.

    Args:
        l: Iterable of mutually comparable values, possibly containing nulls.

    Returns:
        A new list with the sorted elements.
    """
    def sort_key(x):
        # All nulls share one key; real values are ranked after them.
        return (0, '') if pd.isnull(x) else (1, x)

    return sorted(l, key=sort_key)

def process_column_names(column_names):
    """Normalise column names: trim surrounding whitespace, lowercase, and remove every hyphen."""
    processed = []
    for raw_name in column_names:
        processed.append(raw_name.strip().lower().replace('-', ''))
    return processed

# 1. Adult

The dataset is available here: https://archive.ics.uci.edu/dataset/2/adult.
Download it into the `datasets/adult` folder; the code below reads `adult.data` and `adult.test` from there.

In [3]:
# Adult schema: every column in file order, and the subset handled as continuous.
# The raw names are normalised by process_column_names (which also removes the
# stray leading space in ' native-country').
column_names = process_column_names([
    'age', 'workclass', 'fnlwgt', 'education', 'education-num', 'marital-status', 'occupation',
    'relationship', 'race', 'sex', 'capital-gain', 'capital-loss', 'hours-per-week',
    ' native-country', 'income',
])
continuous_columns = process_column_names([
    'age', 'fnlwgt', 'education-num', 'capital-gain', 'capital-loss', 'hours-per-week',
])

# Every column that is not continuous is treated as non-continuous (categorical).
non_continuous_columns = [name for name in column_names if name not in continuous_columns]

In [4]:
# Read both downloaded files with the shared column names (the first line of
# adult.test is skipped) and stack them into a single frame with a fresh index.
adult_train = pd.read_csv('../datasets/adult/adult.data', names=column_names)
adult_test = pd.read_csv('../datasets/adult/adult.test', names=column_names, skiprows=1)
adult = pd.concat([adult_train, adult_test], ignore_index=True)

In [5]:
# Trim surrounding whitespace from every string cell, then replace the '?' placeholder
# for missing values in the categorical columns with an explicit "Unknown" category.
# Series.map is applied column by column instead of DataFrame.applymap, which is
# deprecated since pandas 2.1.
adult = adult.apply(
    lambda column: column.map(lambda x: x.strip() if isinstance(x, str) else x)
)
# Strings are already stripped at this point, so a plain comparison with '?' is enough.
adult[non_continuous_columns] = adult[non_continuous_columns].apply(
    lambda column: column.map(lambda x: 'Unknown' if isinstance(x, str) and x == '?' else x)
)

## 1.1 Adult: Original dataset for attribute inference attacks

In [6]:
# The attribute-inference version of Adult is built without 'fnlwgt' and 'income'.
columns_to_drop = ['fnlwgt', 'income']
adult_aia = drop_columns(df=adult, columns=columns_to_drop)

# Keep the column bookkeeping in step with the reduced frame.
column_names_aia = [name for name in column_names if name not in columns_to_drop]
continuous_columns_aia = [name for name in continuous_columns if name not in columns_to_drop]
non_continuous_columns_aia = [name for name in column_names_aia if name not in continuous_columns_aia]

In [7]:
# Integer-encode the non-continuous columns; the second result maps, per column,
# each code back to its original value.
adult_aia, column2discrete_value2value_aia = discretize_columns(
    df=adult_aia,
    columns=non_continuous_columns_aia,
)

In [8]:
# Write the encoded attribute-inference dataset, without the index column.
adult_aia.to_csv(path_or_buf='../datasets/adult/final_dataset.csv', index=False)

In [9]:
# Pickle the metadata that accompanies final_dataset.csv: the continuous column
# names and the per-column code -> original value mapping.
for pickle_path, payload in (
    ('../datasets/adult/final_continuous_columns.pickle', continuous_columns_aia),
    ('../datasets/adult/final_column2discrete_value2value.pickle', column2discrete_value2value_aia),
):
    with open(pickle_path, 'wb') as file:
        pickle.dump(payload, file)

## 1.2 Adult: Original dataset for membership inference attacks

In [10]:
# Make sure the output folder exists. exist_ok=True turns the call into a no-op when
# the folder is already there, so no separate existence check is needed.
os.makedirs('../datasets/adult_with_sensitive', exist_ok=True)

In [11]:
# The membership-inference version only loses 'fnlwgt'; 'income' stays in the frame.
columns_to_drop = ['fnlwgt']
adult_mia = drop_columns(df=adult, columns=columns_to_drop)

# Remove one trailing '.' from income labels that end with it, so that a label
# written with and without the period becomes the same value.
income_labels = []
for value in adult_mia['income']:
    if value[-1] == '.':
        income_labels.append(value[:-1])
    else:
        income_labels.append(value)
adult_mia['income'] = income_labels

# Keep the column bookkeeping in step with the reduced frame.
column_names_mia = [name for name in column_names if name not in columns_to_drop]
continuous_columns_mia = [name for name in continuous_columns if name not in columns_to_drop]
non_continuous_columns_mia = [name for name in column_names_mia if name not in continuous_columns_mia]

In [12]:
# Integer-encode the non-continuous columns (here including 'income'); the second
# result maps, per column, each code back to its original value.
adult_mia, column2discrete_value2value_mia = discretize_columns(
    df=adult_mia,
    columns=non_continuous_columns_mia,
)

In [13]:
# Write the encoded membership-inference dataset, without the index column.
adult_mia.to_csv(path_or_buf='../datasets/adult_with_sensitive/final_dataset.csv', index=False)

In [14]:
# Pickle the metadata that accompanies final_dataset.csv: the continuous column
# names and the per-column code -> original value mapping.
for pickle_path, payload in (
    ('../datasets/adult_with_sensitive/final_continuous_columns.pickle', continuous_columns_mia),
    ('../datasets/adult_with_sensitive/final_column2discrete_value2value.pickle', column2discrete_value2value_mia),
):
    with open(pickle_path, 'wb') as file:
        pickle.dump(payload, file)

## 1.3 Adult: Synthetic dataset from Adult

In [15]:
# Make sure the output folder exists. exist_ok=True turns the call into a no-op when
# the folder is already there, so no separate existence check is needed.
os.makedirs('../datasets/synadult', exist_ok=True)

In [16]:
# Empirical marginal of every column: a list of (value, relative frequency) pairs
# taken from value_counts(normalize=True).
column_name2distribution = {}
for column_name in adult_aia.columns:
    frequencies = adult_aia[column_name].value_counts(normalize=True)
    column_name2distribution[column_name] = list(zip(frequencies.index.values, frequencies.values))

In [17]:
np.random.seed(0)

# A column's value and probability arrays do not change from row to row, so they are
# built once here instead of being rebuilt from the (value, probability) pairs for
# every single draw.
column_choices = []
for column_name in adult_aia:
    distribution = column_name2distribution[column_name]
    values = np.asarray([value for value, probability in distribution])
    probabilities = np.asarray([probability for value, probability in distribution])
    column_choices.append((values, probabilities))

# Each synthetic row draws every column independently from its marginal. Draws are
# issued row by row and, within a row, in column order, so the seeded random stream
# is consumed in the same order as a per-cell loop.
new_dataset = []
for i in range(len(adult_aia)):
    instance = [np.random.choice(values, p=probabilities) for values, probabilities in column_choices]
    new_dataset.append(instance)

In [18]:
# Assemble the sampled rows into a frame with the same column labels as adult_aia.
syn_adult = pd.DataFrame(data=new_dataset, columns=adult_aia.columns.values)

In [19]:
# Write the synthetic dataset, without the index column.
syn_adult.to_csv(path_or_buf='../datasets/synadult/final_dataset.csv', index=False)

In [20]:
# The synthetic data shares the schema of adult_aia, so the same metadata objects
# are pickled next to it.
for pickle_path, payload in (
    ('../datasets/synadult/final_continuous_columns.pickle', continuous_columns_aia),
    ('../datasets/synadult/final_column2discrete_value2value.pickle', column2discrete_value2value_aia),
):
    with open(pickle_path, 'wb') as file:
        pickle.dump(payload, file)

# 2. Census

The dataset is available here: https://archive.ics.uci.edu/dataset/117/census+income+kdd.
Download it into the `datasets/census` folder; the code below reads `census-income.data` and `census-income.test` from there.

In [21]:
# One "(column name) type" entry per line, in column order.
column_description = """(age) continuous
(class of worker) nominal
(detailed industry recode) nominal
(detailed occupation recode) nominal
(education) nominal
(wage per hour) continuous
(enroll in edu inst last wk) nominal
(marital stat) nominal
(major industry code) nominal
(major occupation code) nominal
(race) nominal
(hispanic origin) nominal
(sex) nominal
(member of a labor union) nominal
(reason for unemployment) nominal
(full or part time employment stat) nominal
(capital gains) continuous
(capital losses) continuous
(dividends from stocks) continuous
(tax filer stat) nominal
(region of previous residence) nominal
(state of previous residence) nominal
(detailed household and family stat) nominal
(detailed household summary in household) nominal
(instance weights) continuous
(migration code-change in msa) nominal
(migration code-change in reg) nominal
(migration code-move within reg) nominal
(live in this house 1 year ago) nominal
(migration prev res in sunbelt) nominal
(num persons worked for employer) continuous
(family members under 18) nominal
(country of birth father) nominal
(country of birth mother) nominal
(country of birth self) nominal
(citizenship) nominal
(own business or self employed) nominal
(fill inc questionnaire for veterans admin) nominal
(veterans benefits) nominal
(weeks worked in year) continuous
(year) nominal
(income) nominal"""

# Parse each entry into a space-free column name and record which names are
# declared continuous.
column_names = []
continuous_columns = set()

for entry in column_description.split('\n'):
    # "(name) type" -> text between the parentheses, and the text after them.
    name_part, type_part = entry.split('(')[1].split(')')
    name = name_part.replace(' ', '')
    column_names.append(name)
    if type_part.strip() == 'continuous':
        continuous_columns.add(name)

In [22]:
# Lowercase the parsed names and remove hyphens; the list keeps its order and the
# continuous columns stay a set.
column_names = [name.replace('-', '').lower() for name in column_names]
continuous_columns = set(name.replace('-', '').lower() for name in continuous_columns)

In [23]:
# Read both downloaded files with the parsed column names and stack them into a
# single frame with a fresh index.
census_train = pd.read_csv('../datasets/census/census-income.data', names=column_names)
census_test = pd.read_csv('../datasets/census/census-income.test', names=column_names)
census = pd.concat([census_train, census_test], ignore_index=True)

In [24]:
# Clean every string cell in one pass: trim surrounding whitespace and turn the '?'
# placeholder into an explicit "Unknown" value; non-string cells are left as they are.
# Series.map is applied column by column instead of DataFrame.applymap, which is
# deprecated since pandas 2.1; merging the two steps also avoids a second full pass.
census = census.apply(
    lambda column: column.map(
        lambda x: ('Unknown' if x.strip() == '?' else x.strip()) if isinstance(x, str) else x
    )
)

## 2.1 Census: Original dataset for attribute inference attacks

In [25]:
# The attribute-inference version of Census is built without 'instanceweights' and 'income'.
columns_to_be_dropped = ['instanceweights', 'income']
census_aia = drop_columns(df=census, columns=columns_to_be_dropped)

In [26]:
# Column bookkeeping for the reduced frame: remaining names (list, in order), the
# remaining continuous names (set), and the remaining non-continuous names.
column_names_aia = [name for name in column_names if name not in columns_to_be_dropped]
continuous_columns_aia = set(continuous_columns).difference(columns_to_be_dropped)
non_continuous_columns_aia = [name for name in column_names_aia if name not in continuous_columns]

# Integer-encode the non-continuous columns and keep the code -> value mapping.
census_aia, column2discrete_value2value_aia = discretize_columns(
    df=census_aia,
    columns=non_continuous_columns_aia,
)

In [27]:
# Write the encoded attribute-inference dataset, without the index column.
census_aia.to_csv(path_or_buf='../datasets/census/final_dataset.csv', index=False)

In [28]:
# Pickle the metadata that accompanies final_dataset.csv: the continuous column
# names (a set here) and the per-column code -> original value mapping.
for pickle_path, payload in (
    ('../datasets/census/final_continuous_columns.pickle', continuous_columns_aia),
    ('../datasets/census/final_column2discrete_value2value.pickle', column2discrete_value2value_aia),
):
    with open(pickle_path, 'wb') as file:
        pickle.dump(payload, file)

## 2.2 Census: Original dataset for membership inference attacks

In [29]:
# Make sure the output folder exists. exist_ok=True turns the call into a no-op when
# the folder is already there, so no separate existence check is needed.
os.makedirs('../datasets/census_with_sensitive', exist_ok=True)

In [30]:
# The membership-inference version only loses 'instanceweights'; 'income' stays in the frame.
columns_to_be_dropped = ['instanceweights']
census_mia = drop_columns(df=census, columns=columns_to_be_dropped)

In [31]:
# Column bookkeeping for the reduced frame: remaining names (list, in order), the
# remaining continuous names (set), and the remaining non-continuous names.
column_names_mia = [name for name in column_names if name not in columns_to_be_dropped]
continuous_columns_mia = set(continuous_columns).difference(columns_to_be_dropped)
non_continuous_columns_mia = [name for name in column_names_mia if name not in continuous_columns]
# Integer-encode the non-continuous columns and keep the code -> value mapping.
census_mia, column2discrete_value2value_mia = discretize_columns(
    df=census_mia,
    columns=non_continuous_columns_mia,
)

In [32]:
# Write the encoded membership-inference dataset, without the index column.
census_mia.to_csv(path_or_buf='../datasets/census_with_sensitive/final_dataset.csv', index=False)

In [33]:
# Pickle the metadata that accompanies final_dataset.csv: the continuous column
# names (a set here) and the per-column code -> original value mapping.
for pickle_path, payload in (
    ('../datasets/census_with_sensitive/final_continuous_columns.pickle', continuous_columns_mia),
    ('../datasets/census_with_sensitive/final_column2discrete_value2value.pickle', column2discrete_value2value_mia),
):
    with open(pickle_path, 'wb') as file:
        pickle.dump(payload, file)

## 2.3 Census: Synthetic dataset from Census

In [34]:
# Make sure the output folder exists. exist_ok=True turns the call into a no-op when
# the folder is already there, so no separate existence check is needed.
os.makedirs('../datasets/syncensus', exist_ok=True)

In [35]:
# Empirical marginal of every column: a list of (value, relative frequency) pairs
# taken from value_counts(normalize=True).
column_name2distribution = {}
for column_name in census_aia.columns:
    frequencies = census_aia[column_name].value_counts(normalize=True)
    column_name2distribution[column_name] = list(zip(frequencies.index.values, frequencies.values))

In [36]:
np.random.seed(0)

# A column's value and probability arrays do not change from row to row, so they are
# built once here instead of being rebuilt from the (value, probability) pairs for
# every single draw.
column_choices = []
for column_name in census_aia:
    distribution = column_name2distribution[column_name]
    values = np.asarray([value for value, probability in distribution])
    probabilities = np.asarray([probability for value, probability in distribution])
    column_choices.append((values, probabilities))

# Each synthetic row draws every column independently from its marginal. Draws are
# issued row by row and, within a row, in column order, so the seeded random stream
# is consumed in the same order as a per-cell loop.
new_dataset = []
for i in tqdm.tqdm(range(len(census_aia))):
    instance = [np.random.choice(values, p=probabilities) for values, probabilities in column_choices]
    new_dataset.append(instance)

100%|██████████████████████████████████████████████████████████████████████████| 299285/299285 [05:39<00:00, 880.61it/s]


In [37]:
# Assemble the sampled rows into a frame with the same column labels as census_aia.
syn_census = pd.DataFrame(data=new_dataset, columns=census_aia.columns.values)

In [38]:
# Write the synthetic dataset, without the index column.
syn_census.to_csv(path_or_buf='../datasets/syncensus/final_dataset.csv', index=False)

In [39]:
# The synthetic data shares the schema of census_aia, so the same metadata objects
# are pickled next to it.
for pickle_path, payload in (
    ('../datasets/syncensus/final_continuous_columns.pickle', continuous_columns_aia),
    ('../datasets/syncensus/final_column2discrete_value2value.pickle', column2discrete_value2value_aia),
):
    with open(pickle_path, 'wb') as file:
        pickle.dump(payload, file)

# 3. Insurance

The dataset is available here: https://archive.ics.uci.edu/dataset/125/insurance+company+benchmark+coil+2000.
Download it into the `datasets/insurance` folder; the code below reads `ticdata2000.txt`, `ticeval2000.txt` and `tictgts2000.txt` from there.

In [40]:
# Attribute table with a header row; only the second whitespace-separated token of
# each following row (the attribute name) is used.
column_description = """Nr Name Description Domain
1 MOSTYPE Customer Subtype see L0
2 MAANTHUI Number of houses 1   10
3 MGEMOMV Avg size household 1   6
4 MGEMLEEF Avg age see L1
5 MOSHOOFD Customer main type see L2
6 MGODRK Roman catholic see L3
7 MGODPR Protestant ...
8 MGODOV Other religion
9 MGODGE No religion
10 MRELGE Married
11 MRELSA Living together
12 MRELOV Other relation
13 MFALLEEN Singles
14 MFGEKIND Household without children
15 MFWEKIND Household with children
16 MOPLHOOG High level education
17 MOPLMIDD Medium level education
18 MOPLLAAG Lower level education
19 MBERHOOG High status
20 MBERZELF Entrepreneur
21 MBERBOER Farmer
22 MBERMIDD Middle management
23 MBERARBG Skilled labourers
24 MBERARBO Unskilled labourers
25 MSKA Social class A
26 MSKB1 Social class B1
27 MSKB2 Social class B2
28 MSKC Social class C
29 MSKD Social class D
30 MHHUUR Rented house
31 MHKOOP Home owners
32 MAUT1 1 car
33 MAUT2 2 cars
34 MAUT0 No car
35 MZFONDS National Health Service
36 MZPART Private health insurance
37 MINKM30 Income < 30.000
38 MINK3045 Income 30-45.000
39 MINK4575 Income 45-75.000
40 MINK7512 Income 75-122.000
41 MINK123M Income >123.000
42 MINKGEM Average income
43 MKOOPKLA Purchasing power class
44 PWAPART Contribution private third party insurance see L4
45 PWABEDR Contribution third party insurance (firms) ...
46 PWALAND Contribution third party insurane (agriculture)
47 PPERSAUT Contribution car policies
48 PBESAUT Contribution delivery van policies
49 PMOTSCO Contribution motorcycle/scooter policies
50 PVRAAUT Contribution lorry policies
51 PAANHANG Contribution trailer policies
52 PTRACTOR Contribution tractor policies
53 PWERKT Contribution agricultural machines policies 
54 PBROM Contribution moped policies
55 PLEVEN Contribution life insurances
56 PPERSONG Contribution private accident insurance policies
57 PGEZONG Contribution family accidents insurance policies
58 PWAOREG Contribution disability insurance policies
59 PBRAND Contribution fire policies
60 PZEILPL Contribution surfboard policies
61 PPLEZIER Contribution boat policies
62 PFIETS Contribution bicycle policies
63 PINBOED Contribution property insurance policies
64 PBYSTAND Contribution social security insurance policies
65 AWAPART Number of private third party insurance 1 - 12
66 AWABEDR Number of third party insurance (firms) ...
67 AWALAND Number of third party insurane (agriculture)
68 APERSAUT Number of car policies
69 ABESAUT Number of delivery van policies
70 AMOTSCO Number of motorcycle/scooter policies
71 AVRAAUT Number of lorry policies
72 AAANHANG Number of trailer policies
73 ATRACTOR Number of tractor policies
74 AWERKT Number of agricultural machines policies
75 ABROM Number of moped policies
76 ALEVEN Number of life insurances
77 APERSONG Number of private accident insurance policies
78 AGEZONG Number of family accidents insurance policies
79 AWAOREG Number of disability insurance policies
80 ABRAND Number of fire policies
81 AZEILPL Number of surfboard policies
82 APLEZIER Number of boat policies
83 AFIETS Number of bicycle policies
84 AINBOED Number of property insurance policies
85 ABYSTAND Number of social security insurance policies
86 CARAVAN Number of mobile home policies 0 - 1"""

# Skip the header row, take the attribute name from each remaining row, and
# normalise the names with process_column_names.
column_names = process_column_names(
    [row.split()[1] for row in column_description.split('\n')[1:]]
)

In [41]:
# Read the two tab-separated files with the parsed column names and stack them into
# a single frame with a fresh index.
insurance_train = pd.read_csv('../datasets/insurance/ticdata2000.txt', names=column_names, sep='\t')
insurance_eval = pd.read_csv('../datasets/insurance/ticeval2000.txt', names=column_names, sep='\t')
insurance = pd.concat([insurance_train, insurance_eval], ignore_index=True)

# Keep the first 43 columns and the last one; everything in between is removed.
columns_to_drop = column_names[43:-1]
insurance = insurance.drop(columns=columns_to_drop)

# From here on column_names only lists the kept columns, all of which are
# recorded as continuous.
column_names = [name for name in column_names if name not in columns_to_drop]
continuous_columns = list(column_names)

## 3.1 Insurance: Original dataset for attribute inference attacks

In [42]:
# The attribute-inference version of Insurance is built without 'caravan'.
columns_to_be_dropped = ['caravan']
insurance_aia = drop_columns(df=insurance, columns=columns_to_be_dropped)

# Keep the column bookkeeping in step with the reduced frame.
column_names_aia = [name for name in column_names if name not in columns_to_be_dropped]
continuous_columns_aia = [name for name in continuous_columns if name not in columns_to_be_dropped]

In [43]:
# Write the attribute-inference dataset (no index column) and pickle the list of
# continuous column names next to it.
insurance_aia.to_csv(path_or_buf='../datasets/insurance/final_dataset.csv', index=False)
continuous_columns_path = '../datasets/insurance/final_continuous_columns.pickle'
with open(continuous_columns_path, 'wb') as file:
    pickle.dump(continuous_columns_aia, file)

## 3.2 Insurance: Original dataset for membership inference attacks

In [44]:
# Make sure the output folder exists. exist_ok=True turns the call into a no-op when
# the folder is already there, so no separate existence check is needed.
os.makedirs('../datasets/insurance_with_sensitive', exist_ok=True)

In [45]:
# The 'caravan' values for the rows that came from the evaluation file are read from
# a separate single-column file and written into those rows (positions at or after
# len(insurance_train)); the column is then cast to int.
insurance_eval_sensitive_attribute = pd.read_csv('../datasets/insurance/tictgts2000.txt', names=['caravan'])
is_eval_row = insurance.index >= len(insurance_train)
insurance.loc[is_eval_row, 'caravan'] = insurance_eval_sensitive_attribute['caravan'].values
insurance['caravan'] = insurance['caravan'].astype(int)

In [46]:
# Every entry of column_names except the last one is stored as continuous for this variant.
continuous_columns_mia = list(column_names[:-1])

In [47]:
# Write the membership-inference dataset (no index column) and pickle the list of
# continuous column names next to it.
insurance.to_csv(path_or_buf='../datasets/insurance_with_sensitive/final_dataset.csv', index=False)
continuous_columns_path = '../datasets/insurance_with_sensitive/final_continuous_columns.pickle'
with open(continuous_columns_path, 'wb') as file:
    pickle.dump(continuous_columns_mia, file)

## 3.3 Insurance: Synthetic dataset from Insurance

In [48]:
# Make sure the output folder exists. exist_ok=True turns the call into a no-op when
# the folder is already there, so no separate existence check is needed.
os.makedirs('../datasets/syninsurance', exist_ok=True)

In [49]:
# Empirical marginal of every column: a list of (value, relative frequency) pairs
# taken from value_counts(normalize=True).
column_name2distribution = {}
for column_name in insurance_aia.columns:
    frequencies = insurance_aia[column_name].value_counts(normalize=True)
    column_name2distribution[column_name] = list(zip(frequencies.index.values, frequencies.values))

In [50]:
np.random.seed(0)

# A column's value and probability arrays do not change from row to row, so they are
# built once here instead of being rebuilt from the (value, probability) pairs for
# every single draw.
column_choices = []
for column_name in insurance_aia:
    distribution = column_name2distribution[column_name]
    values = np.asarray([value for value, probability in distribution])
    probabilities = np.asarray([probability for value, probability in distribution])
    column_choices.append((values, probabilities))

# Each synthetic row draws every column independently from its marginal. Draws are
# issued row by row and, within a row, in column order, so the seeded random stream
# is consumed in the same order as a per-cell loop.
new_dataset = []
for i in tqdm.tqdm(range(len(insurance_aia))):
    instance = [np.random.choice(values, p=probabilities) for values, probabilities in column_choices]
    new_dataset.append(instance)

100%|█████████████████████████████████████████████████████████████████████████████| 9822/9822 [00:07<00:00, 1257.75it/s]


In [51]:
# Assemble the sampled rows into a frame with the same column labels as insurance_aia.
syn_insurance = pd.DataFrame(data=new_dataset, columns=insurance_aia.columns.values)

In [52]:
# Write the synthetic dataset (no index column); it shares the schema of
# insurance_aia, so the same list of continuous column names is pickled next to it.
syn_insurance.to_csv(path_or_buf='../datasets/syninsurance/final_dataset.csv', index=False)
continuous_columns_path = '../datasets/syninsurance/final_continuous_columns.pickle'
with open(continuous_columns_path, 'wb') as file:
    pickle.dump(continuous_columns_aia, file)