# Dataset simulation

For every new dataset simulation the gradations have to be manually added for the specific quasi-identifiers.

### Choosing the dataset that should be simulated

In [1]:
# Dataset to simulate; one of 'diabetes_130', 'diabetes_012' or 'adult'.
simu_dataset = 'adult'

### Loading Data

1. loading data into pandas

In [2]:
import pandas as pd
import numpy as np
import random as rd

# Read the CSV file that belongs to the selected dataset.
dataset_files = {
    'diabetes_130': './diabetes_130/diabetic_data.csv',
    'diabetes_012': './diabetes_012/diabetes_012_cleaned.csv',
    'adult': './adult/adult_cleaned.csv',
}
if simu_dataset not in dataset_files:
    # Fail here with a clear message instead of a NameError on `df` later on.
    raise ValueError('Unknown simu_dataset: ' + repr(simu_dataset))
df = pd.read_csv(dataset_files[simu_dataset])

# print


3. selected attributes for simulation

In [3]:
# Personal attributes (quasi-identifiers) that the simulation may change,
# depending on the data set.
dataset_attributes = {
    'diabetes_130': ['age', 'weight', 'gender', 'race'],
    'diabetes_012': ['Sex', 'Age', 'Education', 'Income'],
    'adult': ['age', 'native-country', 'sex', 'relationship'],
}
if simu_dataset not in dataset_attributes:
    # Fail here with a clear message instead of a NameError on `attributes`
    # once the simulation starts.
    raise ValueError('Unknown simu_dataset: ' + repr(simu_dataset))
attributes = dataset_attributes[simu_dataset]

### Prepare Dataframe

In [None]:
# Inspect the structure of the loaded data set.

# Number of rows and columns.
print('(rows, columns) =', df.shape)

# Column overview. DataFrame.info() prints its report itself and returns
# None, so wrapping it in print() would add a stray "None" line.
df.info()

- adding label
- all unknown values are "?"

In [5]:
# Marker used for every unknown value.
unknown = '?'

df.replace('Unknown/Invalid', unknown, inplace=True)

# Label column that counts how many changes were applied to each row; every
# row starts at 0. The scalar is assigned directly because the former
# "empty pd.Series() + chained fillna(inplace=True)" pair acts on a temporary
# object and leaves df untouched when pandas copy-on-write is active.
df['amount_changed'] = 0

## Simulations

#### Gradations

- have to be adapted manually, because every dataset provides different possibilities to classify the attributes

**Gradation:** for diabetes_130

age:
- 10-year age brackets
- 3 groups
    - Children & Young Adults [0-30)
    - Middle-aged Adults [30-60)
    - Old-aged Adults [60-100)
- Unknown

weight:
- 25 lbs weight chunks
- < or > 100 lbs
- Unknown

race:
- no further gradation possible
- either known or unknown

sex:
- no further gradation possible
- either known or unknown

In [5]:
# outsourced if conditions in extra functions
def attribute_changes_diabetes130(dataframe, attribute, index, row):
    """Apply one generalisation step to `attribute` of a diabetes_130 row.

    Dispatches to the handler that belongs to `attribute` and passes all
    arguments through unchanged.

    Returns the handler's status ('changed' or 'again') so that the caller can
    pick another attribute on 'again'; returns None when `attribute` is not
    one of 'age', 'weight', 'race' or 'gender'.
    """
    if attribute == 'age':
        return age_changes(dataframe, attribute, index, row)
    if attribute == 'weight':
        return weight_changes(dataframe, attribute, index, row)
    if attribute == 'race':
        return race_changes(dataframe, attribute, index, row)
    if attribute == 'gender':
        return sex_changes(dataframe, attribute, index, row)
    return None
    


def age_changes(dataframe, attribute, index, row):
    """Generalise the age bracket of one diabetes_130 row by one level.

    Reads the current value from `row[attribute]` and writes the next, coarser
    value to `dataframe.at[index, attribute]`:
    10-year bracket -> 30/40-year group -> '?'.
    Any value that is none of these is overwritten with 'Error'.

    Returns 'again' (nothing is written) when the value is already '?',
    otherwise 'changed' - including the 'Error' case, like the other handlers.
    """
    current = row[attribute]
    if current == '?':
        # Nothing left to generalise; the caller should pick another attribute.
        return 'again'

    if current in ['[0-10)', '[10-20)', '[20-30)']:
        replacement = '[0-30)'
    elif current in ['[30-40)', '[40-50)', '[50-60)']:
        replacement = '[30-60)'
    elif current in ['[60-70)', '[70-80)', '[80-90)', '[90-100)']:
        replacement = '[60-100)'
    elif current in ['[0-30)', '[30-60)', '[60-100)']:
        replacement = '?'
    else:
        replacement = 'Error'

    dataframe.at[index, attribute] = replacement
    return 'changed'

def weight_changes(dataframe, attribute, index, row):
    """Generalise the weight bracket of one row by one level.

    Reads the current value from `row[attribute]` and writes the next, coarser
    value to `dataframe.at[index, attribute]`:
    25-lbs bracket -> '< 100' / '>= 100' -> '?'.
    Any value that is none of these is overwritten with 'Error'.

    Returns 'again' (nothing is written) when the value is already '?',
    otherwise 'changed'.
    """
    current = row[attribute]
    if current == '?':
        return 'again'

    below_100 = ('[0-25)', '[25-50)', '[50-75)', '[75-100)')
    from_100 = ('[100-125)', '[125-150)', '[150-175)', '[175-200)', '>200')
    if current in below_100:
        replacement = '< 100'
    elif current in from_100:
        replacement = '>= 100'
    elif current in ('< 100', '>= 100'):
        replacement = '?'
    else:
        replacement = 'Error'

    dataframe.at[index, attribute] = replacement
    return 'changed'

def race_changes(dataframe, attribute, index, row):
    """Hide the race of one row.

    A known race in `row[attribute]` is replaced by '?' in
    `dataframe.at[index, attribute]`; any other value except '?' is replaced
    by 'Error'.

    Returns 'again' (nothing is written) when the value is already '?',
    otherwise 'changed'.
    """
    current = row[attribute]
    if current == '?':
        return 'again'

    known_races = ('Caucasian', 'AfricanAmerican', 'Hispanic', 'Asian', 'Other')
    dataframe.at[index, attribute] = '?' if current in known_races else 'Error'
    return 'changed'

def sex_changes(dataframe, attribute, index, row):
    """Hide the sex of one row (values spelled 'Female' / 'Male').

    A known value in `row[attribute]` is replaced by '?' in
    `dataframe.at[index, attribute]`; any other value except '?' is replaced
    by 'Error'.

    Returns 'again' (nothing is written) when the value is already '?',
    otherwise 'changed'.
    """
    current = row[attribute]
    if current == '?':
        return 'again'

    dataframe.at[index, attribute] = '?' if current in ('Female', 'Male') else 'Error'
    return 'changed'

**Gradation**: for diabetes_012

Sex
- no further gradation possible
- either known or unknown

Age
- 13-level 5 year age category (_AGEG5YR see codebook) 
- 3 groups
    - Children & Young Adults [0-30)
    - Middle-aged Adults [30-60)
    - Old-aged Adults [60-100)
- Unknown
 
Education Level
- Education level (EDUCA see codebook) scale 1-6
- 3 Groups
    - did not graduate
    - High School Graduate
    - College Graduate 
- Unknown

Income
- Income scale (INCOME2 see codebook) scale 1-8 
- 2 groups
    - less than $50,000
    - $50,000 or more
- Unknown

In [7]:
def attribute_changes_diabetes012(dataframe, attribute, index, row):
    """Apply one generalisation step to `attribute` of a diabetes_012 row.

    Forwards all arguments to the handler that belongs to `attribute` and
    returns that handler's status. Returns None when `attribute` is not one
    of 'Age', 'Sex', 'Education' or 'Income'.
    """
    if attribute == 'Age':
        return age_changes_2(dataframe, attribute, index, row)
    if attribute == 'Sex':
        return sex_changes_2(dataframe, attribute, index, row)
    if attribute == 'Education':
        return edu_changes(dataframe, attribute, index, row)
    if attribute == 'Income':
        return inc_changes(dataframe, attribute, index, row)
    return None

def age_changes_2(dataframe, attribute, index, row):
    """Generalise the age category of one diabetes_012 row by one level.

    Reads the current value from `row[attribute]` and writes the next, coarser
    value to `dataframe.at[index, attribute]`:
    5-year category -> 30/40-year group -> '?'.
    Any value that is none of these is overwritten with 'Error'.

    Returns 'again' (nothing is written) when the value is already '?',
    otherwise 'changed'.
    """
    current = row[attribute]
    if current == '?':
        return 'again'

    if current in ('18-24', '25-29'):
        replacement = '[0-30)'
    elif current in ('30-34', '35-39', '40-44', '45-49', '50-54', '55-59'):
        replacement = '[30-60)'
    elif current in ('60-64', '65-69', '70-74', '75-79', '80+'):
        replacement = '[60-100)'
    elif current in ('[0-30)', '[30-60)', '[60-100)'):
        replacement = '?'
    else:
        replacement = 'Error'

    dataframe.at[index, attribute] = replacement
    return 'changed'

def sex_changes_2(dataframe, attribute, index, row):
    """Hide the sex of one row (values spelled 'male' / 'female').

    A known value in `row[attribute]` is replaced by '?' in
    `dataframe.at[index, attribute]`; any other value except '?' is replaced
    by 'Error'.

    Returns 'again' (nothing is written) when the value is already '?',
    otherwise 'changed'.
    """
    current = row[attribute]
    if current == '?':
        return 'again'

    dataframe.at[index, attribute] = '?' if current in ('male', 'female') else 'Error'
    return 'changed'

def edu_changes(dataframe, attribute, index, row):
    """Generalise the education level of one row by one level.

    Reads the current value from `row[attribute]` and writes the next, coarser
    value to `dataframe.at[index, attribute]`:
    detailed level -> one of three graduation groups -> '?'.
    Any value that is none of these is overwritten with 'Error'.

    Returns 'again' (nothing is written) when the value is already '?',
    otherwise 'changed'.
    """
    current = row[attribute]
    if current == '?':
        return 'again'

    groups = ('did not graduate high school', 'graduated high school', 'graduated college')
    if current in ('no school/only kindergarten', 'elementary', 'some high school'):
        replacement = 'did not graduate high school'
    elif current in ('high school graduate', 'some college/technical school'):
        replacement = 'graduated high school'
    elif current == 'college graduate':
        replacement = 'graduated college'
    elif current in groups:
        replacement = '?'
    else:
        replacement = 'Error'

    dataframe.at[index, attribute] = replacement
    return 'changed'

def inc_changes(dataframe, attribute, index, row):
    """Generalise the income bracket of one row by one level.

    Reads the current value from `row[attribute]` and writes the next, coarser
    value to `dataframe.at[index, attribute]`:
    raw bracket -> '< $50,000 ' / '>= $50,000' -> '?'.
    Any value that is none of these is overwritten with 'Error'.

    Returns 'again' (nothing is written) when the value is already '?',
    otherwise 'changed'.
    """
    income = row[attribute]
    if income == '?':
        return 'again'

    # '< $75,000' belongs to the upper group only: it used to be listed in
    # both groups, so it was always labelled as below $50,000.
    below_50k = ('< $10,000', '< $15,000', '< $20,000', '< $25,000', '< $35,000', '< $50,000')
    from_50k = ('< $75,000', '>= $75,000')
    # Labels exactly as written below. The trailing space keeps the lower
    # group label distinct from the raw bracket '< $50,000'.
    group_labels = ('< $50,000 ', '>= $50,000')

    if income in below_50k:
        replacement = '< $50,000 '
    elif income in from_50k:
        replacement = '>= $50,000'
    elif income in group_labels:
        replacement = '?'
    else:
        replacement = 'Error'

    dataframe.at[index, attribute] = replacement
    return 'changed'

**Gradation**: for adult

Sex
- Female, Male
- no further gradation possible
- either known or unknown

Native-Country
- native country
    - (United-States, Cambodia, England, Puerto-Rico, Canada, Germany, Outlying-US(Guam-USVI-etc), India, Japan, Greece, South, China, Cuba, Iran, Honduras, Philippines, Italy, Poland, Jamaica, Vietnam, Mexico, Portugal, Ireland, France, Dominican-Republic, Laos, Ecuador, Taiwan, Haiti, Columbia, Hungary, Guatemala, Nicaragua, Scotland, Thailand, Yugoslavia, El-Salvador, Trinadad&Tobago, Peru, Hong, Holand-Netherlands)
- native continent
    - **North America:** ['United-States', 'Puerto-Rico', 'Canada', 'Outlying-US(Guam-USVI-etc)', 'Cuba', 'Honduras', 'Jamaica', 'Mexico', 'Dominican-Republic', 'Haiti', 'Guatemala', 'Nicaragua', 'El-Salvador', 'Trinadad&Tobago']
    - **Asia:** ['Cambodia', 'India', 'Japan', 'China', 'Iran', 'Philippines', 'Vietnam', 'Laos', 'Taiwan', 'Thailand', 'Hong']
    - **Europe:** ['England', 'Germany', 'Greece', 'Italy', 'Poland', 'Portugal', 'Ireland', 'France', 'Scotland', 'Hungary', 'Yugoslavia', 'Holand-Netherlands']
    - **South America:** ['Ecuador', 'Columbia', 'Peru']
- unknown

Age
- exact age given
- 3 groups
    - Children & Young Adults [0-30)
    - Middle-aged Adults [30-60)
    - Old-aged Adults [60-100)
- Unknown
 
Relationship
- Wife, Own-child, Husband, Not-in-family, Other-relative, Unmarried.
- married or unmarried
    - married: ['Wife', 'Husband']
    - not married: ['Own-child', 'Not-in-family', 'Other-relative', 'Unmarried']
- either known or unknown

In [8]:
def attribute_changes_adult(dataframe, attribute, index, row):
    """Apply one generalisation step to `attribute` of an adult row.

    Forwards all arguments to the handler that belongs to `attribute` and
    returns that handler's status. Returns None when `attribute` is not one
    of 'age', 'native-country', 'sex' or 'relationship'.
    """
    if attribute == 'age':
        return age_changes_3(dataframe, attribute, index, row)
    if attribute == 'native-country':
        return nat_changes(dataframe, attribute, index, row)
    if attribute == 'sex':
        return sex_changes(dataframe, attribute, index, row)
    if attribute == 'relationship':
        return rel_changes(dataframe, attribute, index, row)
    return None

def age_changes_3(dataframe, attribute, index, row):
    """Generalise the age of one adult row by one level.

    Reads the current value from `row[attribute]` and writes the next, coarser
    value to `dataframe.at[index, attribute]`:
    exact age -> '[0-30)' / '[30-60)' / '[60-100)' -> '?'.
    Strings that are neither a group label nor '?', and numbers that match no
    group (e.g. NaN), are overwritten with 'Error'.

    Returns 'again' (nothing is written) when the value is already '?',
    otherwise 'changed'.
    """
    current = row[attribute]
    if isinstance(current, str):
        # Already generalised values are strings. They have to be handled
        # before the numeric comparisons, which raise TypeError for str.
        if current == '?':
            return 'again'
        replacement = '?' if current in ('[0-30)', '[30-60)', '[60-100)') else 'Error'
    elif current < 30:
        replacement = '[0-30)'
    elif current < 60:
        replacement = '[30-60)'
    elif current >= 60:
        replacement = '[60-100)'
    else:
        replacement = 'Error'

    # The labels are strings; make the column able to hold them explicitly
    # instead of relying on an implicit upcast of a numeric column.
    if dataframe[attribute].dtype != object:
        dataframe[attribute] = dataframe[attribute].astype(object)
    dataframe.at[index, attribute] = replacement
    return 'changed'

def nat_changes(dataframe, attribute, index, row):
    """Generalise the native country of one adult row by one level.

    Reads the current value from `row[attribute]` and writes the next, coarser
    value to `dataframe.at[index, attribute]`:
    country -> continent -> '?'.
    Any value that is none of these is overwritten with 'Error'.

    Returns 'again' (nothing is written) when the value is already '?',
    otherwise 'changed'.
    """
    current = row[attribute]
    if current == '?':
        return 'again'

    north_america = ('United-States', 'Puerto-Rico', 'Canada', 'Outlying-US(Guam-USVI-etc)',
                     'Cuba', 'Honduras', 'Jamaica', 'Mexico', 'Dominican-Republic', 'Haiti',
                     'Guatemala', 'Nicaragua', 'El-Salvador', 'Trinadad&Tobago')
    south_america = ('Ecuador', 'Columbia', 'Peru')
    europe = ('England', 'Germany', 'Greece', 'Italy', 'Poland', 'Portugal', 'Ireland',
              'France', 'Scotland', 'Hungary', 'Yugoslavia', 'Holand-Netherlands')
    # 'Hong' is the spelling in the list of dataset values in the notes for
    # this dataset; 'Hong Kong' is kept so both spellings are accepted.
    # NOTE(review): 'South' is also listed among the dataset values but is not
    # assigned to any continent, so such rows end up as 'Error' - confirm the
    # intended mapping.
    asia = ('Cambodia', 'India', 'Japan', 'China', 'Iran', 'Philippines', 'Vietnam',
            'Laos', 'Taiwan', 'Thailand', 'Hong', 'Hong Kong')

    if current in north_america:
        replacement = 'North-America'
    elif current in south_america:
        replacement = 'South-America'
    elif current in europe:
        replacement = 'Europe'
    elif current in asia:
        replacement = 'Asia'
    elif current in ('North-America', 'South-America', 'Europe', 'Asia'):
        replacement = '?'
    else:
        replacement = 'Error'

    dataframe.at[index, attribute] = replacement
    # Every branch that writes a value reports 'changed'; the continent -> '?'
    # step used to fall through and return None.
    return 'changed'


def rel_changes(dataframe, attribute, index, row):
    """Generalise the relationship of one adult row by one level.

    Reads the current value from `row[attribute]` and writes the next, coarser
    value to `dataframe.at[index, attribute]`:
    detailed relationship -> 'married' / 'not married' -> '?'.
    Any value that is none of these is overwritten with 'Error'.

    Returns 'again' (nothing is written) when the value is already '?',
    otherwise 'changed'.
    """
    current = row[attribute]
    if current == '?':
        return 'again'

    if current in ('Wife', 'Husband'):
        replacement = 'married'
    elif current in ('Own-child', 'Not-in-family', 'Other-relative', 'Unmarried'):
        replacement = 'not married'
    elif current in ('married', 'not married'):
        replacement = '?'
    else:
        replacement = 'Error'

    dataframe.at[index, attribute] = replacement
    return 'changed'

Changing values in dataframe rows with a given probability and a given number of changes per row

In [6]:
def change_row_values(dataframe, probability, max_change):
    """Randomly generalise attribute values of `dataframe` in place.

    Each row is selected with the given `probability`. For a selected row, up
    to `max_change` generalisation steps are applied, each to a randomly
    chosen entry of the global `attributes` list, using the handler that
    belongs to the global `simu_dataset`. The row's 'amount_changed' cell is
    increased by one for every step.

    A handler that returns 'again' causes another attribute to be drawn. If
    every attribute has returned 'again', nothing is left to generalise in
    that row and it is left alone instead of being retried forever.

    Raises ValueError when `simu_dataset` is not a known dataset name.

    NOTE(review): `row` is the object yielded by iterrows(), so the handlers
    are given the values from when iteration reached the row, not values
    written to `dataframe` by an earlier step for the same row - confirm
    whether later steps should re-read the row.
    """
    if simu_dataset == 'diabetes_130':
        change_attribute = attribute_changes_diabetes130
    elif simu_dataset == 'diabetes_012':
        change_attribute = attribute_changes_diabetes012
    elif simu_dataset == 'adult':
        change_attribute = attribute_changes_adult
    else:
        # Without a handler the retry loop below could never finish.
        raise ValueError('Unknown simu_dataset: ' + repr(simu_dataset))

    attribute_count = len(set(attributes))

    for index, row in dataframe.iterrows():
        # probability to change the row
        if rd.random() > probability:
            continue
        for _ in range(max_change):
            exhausted = set()  # attributes that reported 'again' in this step
            ret = 'again'
            while ret == 'again' and len(exhausted) < attribute_count:
                attribute = attributes[rd.randint(0, len(attributes) - 1)]
                ret = change_attribute(dataframe, attribute, index, row)
                if ret == 'again':
                    exhausted.add(attribute)
            if ret == 'again':
                # Every attribute is already fully generalised.
                break
            dataframe.at[index, 'amount_changed'] += 1

`change_values_strictness` takes a dataframe and a strictness level and changes the dataframe accordingly: the higher the strictness, the higher the probability that a row is changed and the more changes are applied per row.

In [10]:
def change_values_strictness(dataframe, strictness):
    """Change `dataframe` in place according to a named strictness level.

    Looks up the row-change probability and the number of changes per row for
    `strictness` ('low', 'medium', 'high' or 'maximum') and hands them to
    change_row_values. Any other `strictness` leaves the dataframe untouched.
    """
    # strictness -> (probability that a row is changed, changes per row)
    levels = {
        'low': (0.25, 1),
        'medium': (0.5, 2),
        'high': (0.75, 3),
        'maximum': (1, 4),
    }
    if strictness in levels:
        probability, max_change = levels[strictness]
        change_row_values(dataframe, probability, max_change)

mixed simulation with 3 different levels, representing the 3 Westin classifications of population privacy (from Privacy practices of Internet users: Self-reports versus observed behavior)

- Fundamentalists (high strictness): 34%
- Pragmatist (medium strictness): 43%
- Unconcerned (low strictness): 23%

In [11]:
def change_values_strictness_westin(dataframe, percentage, strictness, start):
    """Apply `strictness` to one consecutive share of the rows of `dataframe`.

    The share starts at row position `start` and covers `percentage` percent
    of all rows (rounded down) plus one row. The rows are changed on a copy
    and then written back into `dataframe`, so the result does not depend on
    whether a positional slice shares memory with its parent frame.

    Returns the row position directly after the processed share, to be used
    as `start` of the next share.
    """
    stop = start + int((percentage * dataframe.shape[0]) / 100) + 1

    subset = dataframe.iloc[start:stop].copy()
    change_values_strictness(subset, strictness)

    for position, column in enumerate(dataframe.columns):
        changed = subset.iloc[:, position]
        if dataframe.iloc[:, position].dtype != changed.dtype:
            # A column changed its dtype while being processed, e.g. a
            # numeric age column that now holds group labels.
            dataframe[column] = dataframe[column].astype(changed.dtype)
        dataframe.iloc[start:stop, position] = changed.to_numpy()

    return stop


In [12]:
def change_realistic_values(dataframe, percentages, strictnesses):
    """Change consecutive shares of `dataframe` with different strictnesses.

    `percentages` and `strictnesses` are walked in parallel; each pair is
    handed to change_values_strictness_westin, and the position it returns is
    used as the start of the next share. The first share starts at row 0.
    """
    next_start = 0
    for share, level in zip(percentages, strictnesses):
        next_start = change_values_strictness_westin(dataframe, share, level, next_start)

In [13]:
def check_for_errors(dataframe):
    """Return how many cells of `dataframe` hold the marker string 'Error'."""
    return sum(
        int((dataframe[column] == 'Error').sum())
        for column in dataframe
    )

# check_for_errors(test_)

Creating changed dataframes for all data and exporting the results into csv-files.

In [14]:
def export(dataframe, modus):
    """Write `dataframe` to the simulations folder of the selected dataset.

    The number of 'Error' cells is printed first. The file
    '<simulations folder><modus>.csv' is only written (without the index)
    when that number is 0. For an unknown global `simu_dataset` the message
    'Error export' is printed instead.
    """
    errors = check_for_errors(dataframe)
    print('There are', errors ,'errors in the dataframe')
    if errors != 0:
        # Do not export a dataframe that contains invalid values.
        return

    folders = {
        'diabetes_130': './diabetes_130/simulations/',
        'diabetes_012': './diabetes_012/simulations/',
        'adult': './adult/simulations/',
    }
    if simu_dataset in folders:
        dataframe.to_csv(folders[simu_dataset]+modus+'.csv', index=False)
    else:
        print('Error export')


In [15]:
def export_changed_(strictness):
    """Simulate `strictness` on a copy of the global `df` and export it.

    The exported file is named after `strictness`.
    """
    simulated = df.copy()
    change_values_strictness(simulated, strictness)
    export(simulated, strictness)

def export_changed_strictnesses(dataframes, strictnesses):
    """Change each dataframe with its own strictness and export them together.

    `dataframes` and `strictnesses` are matched by position. The changed
    frames are concatenated in the given order and exported under the name
    'karampela'.

    The frames passed in are not modified: they may be filtered selections of
    a larger frame, and changing such a selection in place is unreliable in
    pandas.
    """
    changed = [dataframe.copy() for dataframe in dataframes]
    for dataframe, strictness in zip(changed, strictnesses):
        change_values_strictness(dataframe, strictness)
    result = pd.concat(changed)
    export(result, 'karampela')

def export_westin_change(percentages, strictnesses):
    """Simulate mixed strictness levels on a copy of the global `df`.

    The copy is split into consecutive shares given by `percentages`, each
    changed with the matching entry of `strictnesses`, and exported under the
    name 'westin'.
    """
    simulated = df.copy()
    change_realistic_values(simulated, percentages, strictnesses)
    export(simulated, 'westin')
    

Simulating the extreme cases in which the whole dataset is changed with a single strictness level (low, medium, high and maximum).

In [None]:
# One export per strictness level, each starting from a fresh copy of df.
for strictness_level in ('low', 'medium', 'high', 'maximum'):
    export_changed_(strictness_level)

Simulating a realistic case of participants having different strictnesses. 

In [None]:
# Strictness levels and the share of rows (in percent) changed with each of
# them; both lists are matched by position.
strictnesses = ['high', 'medium', 'low']
percentages = [26, 64, 10]
# Further distributions; they are not used by the call below.
percentages2 = [50, 50, 0]
percentages3 = [0, 50, 50]

export_westin_change(percentages, strictnesses)

practical distribution where participants over 45 are more likely to be stricter with their data sharing

1. analyse each row, if it has fitting attributes
    - label the row with the strictness
2. iterate and change the rows according to the label

In [6]:
data = df.copy()
# Strictness label per row. Every row starts with the placeholder 0, which is
# replaced below by one of: low, medium, high, maximum. The column is created
# with object dtype so that it can hold the string labels; the scalar is
# assigned directly instead of filling an empty Series through a chained
# fillna(inplace=True), which does not update `data` under copy-on-write.
data['strictness'] = pd.Series(0, index=data.index, dtype=object)

# age
if simu_dataset == 'diabetes_012':
    high = data.loc[data['Age'].isin(['45-49','50-54','55-59','60-64','65-69','70-74','75-79','80+'])]
    medium = data.loc[data['Age'].isin(['30-34','35-39','40-44'])]
    low = data.loc[data['Age'].isin(['18-24','25-29'])]
elif simu_dataset == 'diabetes_130':
    high = data.loc[data['age'].isin(['[50-60)','[60-70)','[70-80)','[80-90)','[90-100)'])]
    medium = data.loc[data['age'].isin(['[30-40)','[40-50)'])]
    low = data.loc[data['age'].isin(['[0-10)','[10-20)','[20-30)'])]
elif simu_dataset == 'adult':
    # Thresholds instead of isin(range(...)): ages below 18 or from 100
    # upwards would otherwise keep the placeholder and receive no label.
    high = data.loc[data['age'] >= 45]
    medium = data.loc[(data['age'] >= 30) & (data['age'] < 45)]
    low = data.loc[data['age'] < 30]

# Label all rows of each age group at once.
data.loc[high.index, 'strictness'] = 'high'
data.loc[medium.index, 'strictness'] = 'medium'
data.loc[low.index, 'strictness'] = 'low'

# + education in the dataset where education is given by participants
def rank_higher(value, index):
    """Raise the strictness label of row `index` in the global `data` by one level.

    `value` is the row's current label. 'low' becomes 'medium', 'medium'
    becomes 'high', and 'high' becomes 'maximum'. 'maximum' is already the
    top level and stays 'maximum' (it used to drop to 'low'). Any other value,
    e.g. the placeholder of an unlabelled row, becomes 'low'.
    """
    promoted = {
        'low': 'medium',
        'medium': 'high',
        'high': 'maximum',
        'maximum': 'maximum',
    }
    data.at[index, 'strictness'] = promoted.get(value, 'low')
def rank_lower(value, index):
    """Lower the strictness label of row `index` in the global `data` by one level.

    `value` is the row's current label. 'maximum' becomes 'high' (it used to
    drop to 'low'), 'high' becomes 'medium', and 'medium' becomes 'low'. 'low'
    and any other value, e.g. the placeholder of an unlabelled row, become
    'low'.
    """
    demoted = {
        'maximum': 'high',
        'high': 'medium',
        'medium': 'low',
    }
    data.at[index, 'strictness'] = demoted.get(value, 'low')
# "Lower levels of education are associated with lesser willingness to share data" Karampela et al. (2019)[24]
# Only applied to the datasets that contain an education column.
if simu_dataset != 'diabetes_130':
    if simu_dataset == 'diabetes_012':
        education = data['Education']
        high = data.loc[education.isin(['no school/only kindergarten','elementary','some high school'])]
        medium = data.loc[education.isin(['high school graduate'])]
        low = data.loc[education.isin(['some college/technical school', 'college graduate'])]
    elif simu_dataset == 'adult':
        education = data['education']
        high = data.loc[education.isin(['Preschool','1st-4th','5th-6th','7th-8th','9th','10th','11th','12th'])]
        medium = data.loc[education.isin(['HS-grad'])]
        low = data.loc[education.isin(['Some-college','Assoc-acdm','Assoc-voc','Bachelors','Masters','Doctorate'])]

    # Rows in `high` move one strictness level up, rows in `low` one level
    # down; rows in `medium` keep the label they already have.
    for index in high.index:
        rank_higher(data.at[index, 'strictness'], index)
    for index in low.index:
        rank_lower(data.at[index, 'strictness'], index)



In [None]:
# Split the labelled rows by strictness level, report the share of each
# level, and export the simulation.
levels = ['low', 'medium', 'high', 'maximum']
frames = [data.loc[data['strictness'] == level] for level in levels]
low, medium, high, maximum = frames

print(simu_dataset)
for level, frame in zip(levels, frames):
    print(level + ': ', frame.shape[0]*100/data.shape[0], '%')

export_changed_strictnesses(frames, levels)