# 1. Faker

In [1]:
import pandas as pd
from faker import Faker

# Create a Faker object
fake = Faker()

# Create a sample dataframe
df = pd.DataFrame({
    'Name': ['John', 'Jane', 'Bob', 'Alice'],
    'Age': [25, 30, 35, 40],
    'Email': ['john@example.com', 'jane@example.com', 'bob@example.com', 'alice@example.com'],
    'Phone': ['555-1234', '555-5678', '555-9012', '555-3456']
})
df.head()

Unnamed: 0,Name,Age,Email,Phone
0,John,25,john@example.com,555-1234
1,Jane,30,jane@example.com,555-5678
2,Bob,35,bob@example.com,555-9012
3,Alice,40,alice@example.com,555-3456


In [2]:
def anonymize_df(df):
    # Ciclo para pasar por cada columna 
    for col in df.columns:
        # Verificar si es string 
        if df[col].dtype == 'object':
            # Reemplazar con fake data
            df[col] = df[col].apply(lambda x: fake.name() if '@' not in x else fake.email())
        # Verificar si es entero
        elif df[col].dtype in ['int64','int32']:
            # Reemplazar con enteros random
            df[col] = df[col].apply(lambda x: fake.random_int(min=18, max=80))
        # Verificar si es flotante
        elif df[col].dtype in ['float64','float32']:
            # Reemplzar con flotantes random
            df[col] = df[col].apply(lambda x: fake.random.uniform(0, 1))
        # Verificar si es booleano
        elif df[col].dtype == 'bool':
            # Reemplazar con random booleanos
            df[col] = df[col].apply(lambda x: fake.boolean())
    return df

# Anonimizar el dataset
df_anon = anonymize_df(df)
df_anon

Unnamed: 0,Name,Age,Email,Phone
0,Timothy Adams,33,samantha15@example.com,April Wood
1,Nicholas Lopez,57,smithpaul@example.net,Devon Harvey
2,Veronica Martinez,21,millerdonald@example.org,Natalie Strickland
3,James White,25,leah33@example.com,Alexander Wright


# 2. anonympy

In [None]:
!pip install anonympy

In [None]:
!pip install cape-privacy==0.3.0 --no-deps

In [1]:
import pandas as pd

url = r'https://raw.githubusercontent.com/ArtLabss/open-data-anonimizer/0287f675a535101f145cb975baf361a96ff71ed3/examples/files/new.csv'
df = pd.read_csv(url, parse_dates=['birthdate'])
df.head()

Unnamed: 0,first_name,address,city,postal,phone,email,web,salary,birthdate,age
0,Aleshia,14 Taylor St,St. Stephens Ward,CT2 7PP,01835-703597,atomkiewicz@hotmail.com,http://www.alandrosenburgcpapc.co.uk,46391,2000-12-23 15:09:18.117475200,21
1,Evan,5 Binney St,Abbey Ward,HP11 2AX,01937-864715,evan.zigomalas@gmail.com,http://www.capgeminiamerica.co.uk,30798,2004-04-22 04:09:51.325948800,17
2,France,8 Moor Place,East Southbourne and Tuckton W,BH6 3BE,01347-368222,france.andrade@hotmail.com,http://www.elliottjohnwesq.co.uk,32384,2002-01-21 18:56:29.090025600,19
3,Ulysses,505 Exeter Rd,Hawerby cum Beesby,DN36 5RP,01912-771311,ulysses@hotmail.com,http://www.mcmahanbenl.co.uk,39298,2000-11-24 21:59:48.621840000,21
4,Tyisha,5396 Forth Street,Greets Green and Lyng Ward,B70 9DT,01547-429341,tyisha.veness@hotmail.com,http://www.champagneroom.co.uk,41630,1998-06-23 05:19:37.687008000,23


In [2]:
from anonympy.pandas import dfAnonymizer 
anonym = dfAnonymizer(df)

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
# chequear tipos datos
print(anonym.numeric_columns) 
print(anonym.categorical_columns) 
print(anonym.datetime_columns) 

['salary', 'age']
['first_name', 'address', 'city', 'postal', 'phone', 'email', 'web']
['birthdate']


In [4]:
#Metodos disponibles
from anonympy.pandas.utils_pandas import available_methods
print(available_methods())

`numeric`:
        * Perturbation - "numeric_noise"
        * Binning - "numeric_binning"
        * PCA Masking - "numeric_masking"
        * Rounding - "numeric_rounding"

`categorical`:
        * Synthetic Data - "categorical_fake"
        * Synthetic Data Auto - "categorical_fake_auto"
        * Resampling from same Distribution - "categorical_resampling"
        * Tokenazation - "categorical_tokenization"
        * Email Masking - "categorical_email_masking"

`datetime`:
        * Synthetic Date - "datetime_fake"
        * Perturbation - "datetime_noise"

`general`:
        * Drop Column - "column_suppression"
        
None


In [5]:
anonym.numeric_noise('age')   
anonym.numeric_rounding('salary')  
anonym.categorical_email_masking('email') 

O en una sola linea
```python
anonym.anonymize({'age':'numeric_noise',                      
                    'salary':'numeric_rounding',                      
                    'email':'categorical_email_masking'})
```

In [6]:
anonym.info()

+------------+--------+-------------+---------------------------+
|   Column   | Status |    Type     |          Method           |
| first_name | 0      | categorical |                           |
+------------+--------+-------------+---------------------------+
| address    | 0      | categorical |                           |
+------------+--------+-------------+---------------------------+
| city       | 0      | categorical |                           |
+------------+--------+-------------+---------------------------+
| postal     | 0      | categorical |                           |
+------------+--------+-------------+---------------------------+
| phone      | 0      | categorical |                           |
+------------+--------+-------------+---------------------------+
| email      | 1      | categorical | Partial Masking           |
+------------+--------+-------------+---------------------------+
| web        | 0      | categorical |                           |
+---------

In [9]:
from anonympy.pandas.utils_pandas import fake_methods

print(fake_methods('f')) # agrs: None / 'all' / any letter  

 factories, file_extension, file_name,file_path,firefox, first_name,first_name_female, first_name_male,first_name_nonbinary,fixed_width, format,free_email, free_email_domain, future_date, future_datetime
None


In [7]:
anonym.categorical_fake('first_name') 

In [8]:
anonym.categorical_fake_auto() # esto cambiara `address` y `city` 
anonym.categorical_fake({'web': 'url', 'phone': 'phone_number'}) 

`first_name` column already anonymized!
`email` column already anonymized!


In [9]:
anonym.datetime_noise('birthdate')

In [10]:
anonym

+---------------------------------------+
|      Total number of columns: 10      |
| Anonymized Column -> Method:          |
| - age -> Numeric Perturbation         |
| - salary -> Generalization - Rounding |
| - email -> Partial Masking            |
| - first_name -> Synthetic Data        |
| - address -> Synthetic Data           |
| - city -> Synthetic Data              |
| - web -> Synthetic Data               |
| - phone -> Synthetic Data             |
| - birthdate -> Datetime Perturbation  |
+---------------------------------------+
| Unanonymized Columns:                 |
| - postal                              |
+---------------------------------------+

In [11]:
anonym._df

Unnamed: 0,first_name,address,city,postal,phone,email,web,salary,birthdate,age
0,Thomas,"745 Gross Road Apt. 296\nWest Matthew, AR 54492",Scottview,CT2 7PP,650.029.5700x8031,a*****z@hotmail.com,https://willis.com/,50000,2001-07-25 15:09:18.117475200,13
1,Julie,73973 Justin Ridge Apt. 479\nWest Jenniferbury...,Castillochester,HP11 2AX,001-849-473-1406x412,e*****s@gmail.com,http://powers.biz/,30000,2004-05-20 04:09:51.325948800,20
2,Brandon,"7742 Amanda Square\nEast Erika, AZ 99118",West Austin,BH6 3BE,324.937.2814,f*****e@hotmail.com,https://www.reid.com/,30000,2001-10-22 18:56:29.090025600,15
3,John,"31502 Raymond Coves Apt. 393\nSouth Danielton,...",Davischester,DN36 5RP,330-771-8935x84031,u*****s@hotmail.com,http://hernandez.org/,40000,2001-08-21 21:59:48.621840000,27
4,Anna,53741 Thomas Junction Suite 458\nWest Eugenesi...,Michaeltown,B70 9DT,719.039.9052x393,t*****s@hotmail.com,https://www.armstrong.com/,40000,1997-08-26 05:19:37.687008000,21
...,...,...,...,...,...,...,...,...,...,...
495,Amy,"7955 Miller Alley Suite 458\nNorth Michael, PR...",South Benjaminberg,SW1W 8JY,+1-529-225-6545x7178,a*****y@veit.co.uk,https://www.ramos.com/,40000,1996-01-07 16:40:58.379318400,26
496,Carol,271 Justin Squares Suite 456\nNorth Jessicafur...,South Brittanybury,IV2 6WT,005-145-4844,r*****i@euresti.co.uk,http://west.info/,40000,1999-10-08 11:23:56.188204800,22
497,Rebecca,"601 Theresa Springs Suite 233\nWest Lisafurt, ...",Laurieland,S75 5EJ,(442)645-0341x316,c*****g@brenning.co.uk,http://carroll.com/,40000,1997-04-14 22:03:29.331331200,30
498,Katherine,"83052 Coleman Grove\nPeckside, AS 61863",Miketown,DH8 5LP,(938)389-7081x6244,c*****y@gmail.com,https://cruz-henderson.com/,30000,1995-08-14 21:48:38.237414400,24


# 3. Faker-Otros idiomas

In [12]:
import csv
#from faker import Faker
from faker import Factory
from collections import defaultdict

import pandas as pd

In [13]:
from faker.config import AVAILABLE_LOCALES
print(AVAILABLE_LOCALES)

['ar_AA', 'ar_AE', 'ar_BH', 'ar_EG', 'ar_JO', 'ar_PS', 'ar_SA', 'az_AZ', 'bg_BG', 'bn_BD', 'bs_BA', 'cs_CZ', 'da_DK', 'de', 'de_AT', 'de_CH', 'de_DE', 'dk_DK', 'el_CY', 'el_GR', 'en', 'en_AU', 'en_CA', 'en_GB', 'en_IE', 'en_IN', 'en_NZ', 'en_PH', 'en_TH', 'en_US', 'es', 'es_AR', 'es_CA', 'es_CL', 'es_CO', 'es_ES', 'es_MX', 'et_EE', 'fa_IR', 'fi_FI', 'fil_PH', 'fr_BE', 'fr_CA', 'fr_CH', 'fr_FR', 'fr_QC', 'ga_IE', 'he_IL', 'hi_IN', 'hr_HR', 'hu_HU', 'hy_AM', 'id_ID', 'it_CH', 'it_IT', 'ja_JP', 'ka_GE', 'ko_KR', 'la', 'lb_LU', 'lt_LT', 'lv_LV', 'mt_MT', 'ne_NP', 'nl_BE', 'nl_NL', 'no_NO', 'or_IN', 'pl_PL', 'pt_BR', 'pt_PT', 'ro_RO', 'ru_RU', 'sk_SK', 'sl_SI', 'sq_AL', 'sv_SE', 'ta_IN', 'th', 'th_TH', 'tl_PH', 'tr_TR', 'tw_GH', 'uk_UA', 'vi_VN', 'zh_CN', 'zh_TW']


In [14]:
#Local - Ingles
from faker import Faker
fake = Faker(['en'])
for _ in range(10):
    print(fake.name())

Elizabeth Davis
Alexandra Black
Brandon Schmidt
Robert Sweeney
Kaylee Acosta
Madeline Stephens
Gabrielle Mitchell
Tony Singleton
George Watson
Pamela Bennett DDS


In [15]:
#Local - India | Language - Tamil
from faker import Faker
fake = Faker(['ta_IN'])
for _ in range(10):
    print(fake.name())

இந்திரகுமார்
மணவழகன்
வர்ணவதி ஒளியன்
ராகவன்
அகானா நடேஷ்
மகிணன்
அக்ஷா
தணிகைத்தம்பி
ஹேமா
வல்லரசு


# 4. Pseudonymization

La Pseudonymization es el proceso de reemplazar datos identificables con seudónimos o alias para proteger la privacidad de las personas y al mismo tiempo mantener la usabilidad de los datos.

En este ejemplo, la función pseudonimizar() toma los datos como entrada y utiliza el algoritmo hash SHA-256 del módulo hashlib para seudonimizar los datos. El parámetro de datos se convierte a su representación de cadena mediante .encode(). Luego, la función hashlib.sha256() genera un objeto hash SHA-256. Finalmente, el método hexdigest() devuelve la representación hexadecimal del hash, que sirve como valor seudonimizado.

In [16]:
import hashlib

def pseudonymize(data):
    hash_object = hashlib.sha256(data.encode())
    pseudonym = hash_object.hexdigest()
    return pseudonym

In [18]:
name = "John Doe"
pseudonymized_name = pseudonymize(name)
print(pseudonymized_name)

6cea57c2fb6cbc2a40411135005760f241fffc3e5e67ab99882726431037f908


In [19]:
import base64
from hashlib import blake2b

import pandas as pd
import requests

from faker import Faker

def encode(key, clear):
    enc = []
    for i in range(len(clear)):
        key_c = key[i % len(key)]
        #print(key_c)
        enc_c = (ord(clear[i]) + ord(key_c)) % 256
        #print(enc_c)
        enc.append(enc_c)
    return base64.urlsafe_b64encode(bytes(enc))

def decode(key, enc):
    dec = []
    enc = base64.urlsafe_b64decode(enc)
    for i in range(len(enc)):
        key_c = key[i % len(key)]
        dec_c = chr((256 + enc[i] - ord(key_c)) % 256)
        dec.append(dec_c)
    return "".join(dec)

In [20]:
df = pd.read_csv('iot_example.csv')
df

Unnamed: 0,timestamp,username,temperature,heartrate,build,latest,note
0,2017-01-01T12:00:23,michaelsmith,12,67,4e6a7805-8faa-2768-6ef6-eb3198b483ac,0,interval
1,2017-01-01T12:01:09,kharrison,6,78,7256b7b0-e502-f576-62ec-ed73533c9c84,0,wake
2,2017-01-01T12:01:34,smithadam,5,89,9226c94b-bb4b-a6c8-8e02-cb42b53e9c90,0,
3,2017-01-01T12:02:09,eddierodriguez,28,76,2599ac79-e5e0-5117-b8e1-57e5ced036f7,0,update
4,2017-01-01T12:02:36,kenneth94,29,62,122f1c6a-403c-2221-6ed1-b5caa08f11e0,0,user
...,...,...,...,...,...,...,...
146392,2017-02-28T23:58:06,mcontreras,15,63,69e61a15-d2d0-47a7-1a27-e07b3eeeba10,0,
146393,2017-02-28T23:58:43,joelrusso,27,89,4af5c12e-df08-40f4-4c70-b9361044cc4b,0,
146394,2017-02-28T23:59:23,lellis,13,84,dac87426-e147-9c39-6e4c-790bb11f8fc9,0,update
146395,2017-02-28T23:59:48,grayjasmin,17,64,4911a589-3a15-4bbf-1de1-e5a69ab739da,1,update


In [25]:
username = df.iloc[0,1]
username

'michaelsmith'

In [22]:
encode('supa_secret', username)

b'4N7TycDY0dbfzujb'

In [23]:
decode('supa_secret', b'4N7TycDY0dbfzujb')

'michaelsmith'

Otros conceptos: 

- Pseudoanonimizacion multivariada
- K-Anonymity
- Differential Privacy