importing libraries

In [None]:
import pandas as pd
import itertools

Reading Data

In [None]:
data = pd.read_csv('/content/insurance.csv')

Data

In [None]:
data.head()

Unnamed: 0,age,sex,bmi,children,smoker,region,charges
0,19,female,27.9,0,yes,southwest,16884.924
1,18,male,33.77,1,no,southeast,1725.5523
2,28,male,33.0,3,no,southeast,4449.462
3,33,male,22.705,0,no,northwest,21984.47061
4,32,male,28.88,0,no,northwest,3866.8552


In [None]:
data.dtypes

age           int64
sex          object
bmi         float64
children      int64
smoker       object
region       object
charges     float64
dtype: object

Defining Q

In [None]:
quasi_identifiers = ['age', 'sex', 'bmi', 'charges']

Frequency Set


In [None]:
# frequencySet = lambda table, columns: table.groupby(columns).size()

def k_Anonymous(table, q, k):
  for freq in table.groupby(q).size():
    if freq < k:
      return False
  return True

Check if the table is k anonymized

In [None]:
k_Anonymous(data, quasi_identifiers, 1)

True

Defining Generalization Hirarchy

SEX -> p

age :

10 <= age < 30

30 <= age < 50

50 <= age < 70

70 <= age < inf


bmi :

xx.xxx -> xx.xx*

xx.xx* -> xx.x**

xx.x** -> xx.***


Charges:

xxxx.xxxxx -> xxxx.xxx**

xxxx.xxx** -> xxxx.x****

xxxx.x**** -> xxx*.*****



In [None]:
generalizations = {'sex' : 1, 'age': 1, 'bmi' : 3, 'charges' : 3}

Defining generalization functions

In [None]:
# sex
def sex_zero_gen(column):
  return column

def sex_first_gen(column):
  return 'p'

# age
def age_zero_gen(column):
  return column

def age_first_gen(column):
  if column < 10:
    return '-10'
  elif column >= 10 and column < 30:
    return '10-30'
  elif column >= 30 and column < 50:
    return '30-50'
  elif column >= 50 and column < 70:
    return '50-70'
  else:
    return '70+'

# bmi
def bmi_zero_gen(column):
  return column

def bmi_first_gen(column):

  return str(column)[:-1] + '*'

def bmi_second_gen(column):
  return str(column)[:-2] + '**'

def bmi_third_gen(column):
  return '*'

# Charges
def charges_zero_gen(column):
  return column

def charges_first_gen(column):
  return str(column)[:-2] + '**'

def charges_second_gen(column):
  return str(column)[:-4] + '****'

def charges_third_gen(column):
  return '*'

Generalization

In [None]:
def get_generalization_function(column, level = 0):
  if column == 'age':
    if level == 0:
      return age_zero_gen
    elif level == 1:
      return age_first_gen

  elif column == 'sex':
    if level == 0:
      return sex_zero_gen
    elif level == 1:
      return sex_first_gen
  elif column == 'bmi':
    if level == 0:
      return bmi_zero_gen
    elif level == 1:
      return bmi_first_gen
    elif level == 2:
      return bmi_second_gen
    elif level == 3:
      return bmi_third_gen
  elif column == 'charges':
    if level == 0:
      return charges_zero_gen
    elif level == 1:
      return charges_first_gen
    elif level == 2:
      return charges_second_gen
    elif level == 3:
      return charges_third_gen


Hirarchy

In [None]:
def generate_combinations(dictionary):
    keys = list(dictionary.keys())

    def helper(index, current_combination):
        if index == len(keys):
            return [current_combination]

        key = keys[index]
        value = dictionary[key]
        combinations = []

        for i in range(value + 1):
            new_combination = current_combination.copy()
            new_combination[key] = i
            combinations.extend(helper(index + 1, new_combination))

        return combinations

    return helper(0, {})


In [None]:
generalizations

{'sex': 1, 'age': 1, 'bmi': 3, 'charges': 3}

In [None]:
len(generate_combinations(generalizations))

64

In [None]:
generate_combinations(generalizations)

[{'sex': 0, 'age': 0, 'bmi': 0, 'charges': 0},
 {'sex': 0, 'age': 0, 'bmi': 0, 'charges': 1},
 {'sex': 0, 'age': 0, 'bmi': 0, 'charges': 2},
 {'sex': 0, 'age': 0, 'bmi': 0, 'charges': 3},
 {'sex': 0, 'age': 0, 'bmi': 1, 'charges': 0},
 {'sex': 0, 'age': 0, 'bmi': 1, 'charges': 1},
 {'sex': 0, 'age': 0, 'bmi': 1, 'charges': 2},
 {'sex': 0, 'age': 0, 'bmi': 1, 'charges': 3},
 {'sex': 0, 'age': 0, 'bmi': 2, 'charges': 0},
 {'sex': 0, 'age': 0, 'bmi': 2, 'charges': 1},
 {'sex': 0, 'age': 0, 'bmi': 2, 'charges': 2},
 {'sex': 0, 'age': 0, 'bmi': 2, 'charges': 3},
 {'sex': 0, 'age': 0, 'bmi': 3, 'charges': 0},
 {'sex': 0, 'age': 0, 'bmi': 3, 'charges': 1},
 {'sex': 0, 'age': 0, 'bmi': 3, 'charges': 2},
 {'sex': 0, 'age': 0, 'bmi': 3, 'charges': 3},
 {'sex': 0, 'age': 1, 'bmi': 0, 'charges': 0},
 {'sex': 0, 'age': 1, 'bmi': 0, 'charges': 1},
 {'sex': 0, 'age': 1, 'bmi': 0, 'charges': 2},
 {'sex': 0, 'age': 1, 'bmi': 0, 'charges': 3},
 {'sex': 0, 'age': 1, 'bmi': 1, 'charges': 0},
 {'sex': 0, '

Remove direct generalizations

In [None]:
def is_direct_generalization(ref, gen):
  one_diff = False
  for key in ref:
    if ref[key] - gen[key] not in [0,-1]:
      return False
    elif ref[key] - gen[key] in [-1]:
      if one_diff:
        return False
      else:
        one_diff = True
  return True


def remove_direct_generalizations(ref, generalizations):
  filtered = []
  removed = []

  for comb in generalizations:
    if is_direct_generalization(ref, comb):
      removed.append(comb)
    else:
      filtered.append(comb)

  return filtered, removed


In [None]:
is_direct_generalization({'sex': 0, 'age': 0, 'bmi': 0, 'charges': 0}, {'sex': 0, 'age': 0, 'bmi': 0, 'charges': 2})

False

In [None]:
hirarchy = generate_combinations(generalizations)
filterd, removed = remove_direct_generalizations({'sex': 0, 'age': 0, 'bmi': 0, 'charges': 0}, hirarchy)
print(removed)
print("---------------------")
print(filterd)

[{'sex': 0, 'age': 0, 'bmi': 0, 'charges': 0}]
---------------------
[{'sex': 0, 'age': 0, 'bmi': 0, 'charges': 1}, {'sex': 0, 'age': 0, 'bmi': 0, 'charges': 2}, {'sex': 0, 'age': 0, 'bmi': 0, 'charges': 3}, {'sex': 0, 'age': 0, 'bmi': 1, 'charges': 0}, {'sex': 0, 'age': 0, 'bmi': 1, 'charges': 1}, {'sex': 0, 'age': 0, 'bmi': 1, 'charges': 2}, {'sex': 0, 'age': 0, 'bmi': 1, 'charges': 3}, {'sex': 0, 'age': 0, 'bmi': 2, 'charges': 0}, {'sex': 0, 'age': 0, 'bmi': 2, 'charges': 1}, {'sex': 0, 'age': 0, 'bmi': 2, 'charges': 2}, {'sex': 0, 'age': 0, 'bmi': 2, 'charges': 3}, {'sex': 0, 'age': 0, 'bmi': 3, 'charges': 0}, {'sex': 0, 'age': 0, 'bmi': 3, 'charges': 1}, {'sex': 0, 'age': 0, 'bmi': 3, 'charges': 2}, {'sex': 0, 'age': 0, 'bmi': 3, 'charges': 3}, {'sex': 0, 'age': 1, 'bmi': 0, 'charges': 0}, {'sex': 0, 'age': 1, 'bmi': 0, 'charges': 1}, {'sex': 0, 'age': 1, 'bmi': 0, 'charges': 2}, {'sex': 0, 'age': 1, 'bmi': 0, 'charges': 3}, {'sex': 0, 'age': 1, 'bmi': 1, 'charges': 0}, {'sex': 0,

Generalize Table

In [None]:
def generalize(node, data):
  result = data.copy()
  for column in node:
    function = get_generalization_function(column, node[column])
    result[column] = result[column].apply(function)

  return result

In [None]:
df = generalize({'sex': 1, 'age': 1, 'bmi': 1, 'charges': 1}, data)

In [None]:
df

Unnamed: 0,age,sex,bmi,children,smoker,region,charges
0,10-30,p,27.*,0,yes,southwest,16884.9**
1,10-30,p,33.7*,1,no,southeast,1725.55**
2,10-30,p,33.*,3,no,southeast,4449.4**
3,30-50,p,22.70*,0,no,northwest,21984.470**
4,30-50,p,28.8*,0,no,northwest,3866.85**
...,...,...,...,...,...,...,...
1333,50-70,p,30.9*,3,no,northwest,10600.54**
1334,10-30,p,31.9*,0,no,northeast,2205.98**
1335,10-30,p,36.8*,0,no,southeast,1629.83**
1336,10-30,p,25.*,0,no,southwest,2007.9**


Incognito


In [None]:
def incognito(quasi_identifiers,generalizations, data, k):
  hirarchy = generate_combinations(generalizations)
  c = hirarchy.copy()
  s = []
  while len(c) != 0:
    node = c.pop()
    df = generalize(node, data)
    if k_Anonymous(df, quasi_identifiers, k):
      s.append(node)
      c, filterd = remove_direct_generalizations(node, c)
      print(filterd)
      s = s + filterd


  return s, c

In [None]:
s, c = incognito(quasi_identifiers, generalizations, data, 3)

[]
[]
[]
[]


In [None]:
s

[{'sex': 1, 'age': 1, 'bmi': 3, 'charges': 3},
 {'sex': 1, 'age': 0, 'bmi': 3, 'charges': 3},
 {'sex': 0, 'age': 1, 'bmi': 3, 'charges': 3},
 {'sex': 0, 'age': 0, 'bmi': 3, 'charges': 3}]

In [None]:
c

[]