# 1. Faker

In [1]:
!pip install -q faker

You should consider upgrading via the 'c:\users\david\appdata\local\programs\python\python39\python.exe -m pip install --upgrade pip' command.


In [2]:
import pandas as pd
from faker import Faker

# Create a Faker object
fake = Faker()

# Create a sample dataframe
df = pd.DataFrame({
    'Name': ['John', 'Jane', 'Bob', 'Alice'],
    'Age': [25, 30, 35, 40],
    'Email': ['john@example.com', 'jane@example.com', 'bob@example.com', 'alice@example.com'],
    'Phone': ['555-1234', '555-5678', '555-9012', '555-3456']
})
df.head()

Unnamed: 0,Name,Age,Email,Phone
0,John,25,john@example.com,555-1234
1,Jane,30,jane@example.com,555-5678
2,Bob,35,bob@example.com,555-9012
3,Alice,40,alice@example.com,555-3456


In [3]:
def anonymize_df(df, generator=None):
    """Return an anonymized copy of ``df`` with supported columns replaced by fake data.

    Columns are handled according to their dtype:

    * object / string -> a fake e-mail when the value is a string containing
      ``'@'``, otherwise a fake name (this also applies to missing or
      non-string entries, and to string columns such as phone numbers).
    * integer         -> a random integer between 18 and 80.
    * float           -> a random float drawn with ``uniform(0, 1)``.
    * bool            -> a random boolean.

    Columns of any other dtype (e.g. datetimes) are returned unchanged.

    Parameters
    ----------
    df : pandas.DataFrame
        Data to anonymize. It is never modified.
    generator : optional
        Object exposing ``name()``, ``email()``, ``random_int(min=, max=)``,
        ``boolean()`` and ``random.uniform(a, b)``. Defaults to the
        notebook-level ``fake`` instance.

    Returns
    -------
    pandas.DataFrame
        A new dataframe with the same shape, index and column names as ``df``.
    """
    source = fake if generator is None else generator
    types = pd.api.types

    def fake_text(value):
        # Only real strings can be searched for '@'; anything else (None, NaN,
        # arbitrary objects) is replaced by a name so nothing original leaks.
        if isinstance(value, str) and '@' in value:
            return source.email()
        return source.name()

    # Work on a copy so the caller's dataframe keeps its original values.
    result = df.copy()

    # Walk over every column and pick the replacement strategy from its dtype.
    for col in result.columns:
        dtype = result[col].dtype
        # Text columns (classic object dtype or the dedicated string dtype)
        if types.is_object_dtype(dtype) or types.is_string_dtype(dtype):
            result[col] = result[col].apply(fake_text)
        # Integer columns of any width
        elif types.is_integer_dtype(dtype):
            result[col] = result[col].apply(lambda _: source.random_int(min=18, max=80))
        # Floating-point columns of any width
        elif types.is_float_dtype(dtype):
            result[col] = result[col].apply(lambda _: source.random.uniform(0, 1))
        # Boolean columns
        elif types.is_bool_dtype(dtype):
            result[col] = result[col].apply(lambda _: source.boolean())
    return result

# Anonymize the dataset
df_anon = anonymize_df(df)
df_anon

Unnamed: 0,Name,Age,Email,Phone
0,Calvin Martinez,60,matthew94@example.org,Dana Sanders
1,Logan Hernandez,75,angelcox@example.com,Sherry Dalton
2,Theresa Richards,25,kathryn54@example.org,Patricia Romero
3,Michael Santiago,37,susan09@example.org,Xavier Burnett


# 2. anonympy

In [None]:
!pip install -q anonympy

In [7]:
!pip install cape-privacy==0.3.0 --no-deps

Collecting cape-privacy==0.3.0
  Downloading cape_privacy-0.3.0-py3-none-any.whl (48 kB)
Installing collected packages: cape-privacy
Successfully installed cape-privacy-0.3.0


You should consider upgrading via the 'c:\users\david\appdata\local\programs\python\python39\python.exe -m pip install --upgrade pip' command.


In [5]:
import pandas as pd

url = r'https://raw.githubusercontent.com/ArtLabss/open-data-anonimizer/0287f675a535101f145cb975baf361a96ff71ed3/examples/files/new.csv'
df = pd.read_csv(url, parse_dates=['birthdate'])
df.head()

Unnamed: 0,first_name,address,city,postal,phone,email,web,salary,birthdate,age
0,Aleshia,14 Taylor St,St. Stephens Ward,CT2 7PP,01835-703597,atomkiewicz@hotmail.com,http://www.alandrosenburgcpapc.co.uk,46391,2000-12-23 15:09:18.117475200,21
1,Evan,5 Binney St,Abbey Ward,HP11 2AX,01937-864715,evan.zigomalas@gmail.com,http://www.capgeminiamerica.co.uk,30798,2004-04-22 04:09:51.325948800,17
2,France,8 Moor Place,East Southbourne and Tuckton W,BH6 3BE,01347-368222,france.andrade@hotmail.com,http://www.elliottjohnwesq.co.uk,32384,2002-01-21 18:56:29.090025600,19
3,Ulysses,505 Exeter Rd,Hawerby cum Beesby,DN36 5RP,01912-771311,ulysses@hotmail.com,http://www.mcmahanbenl.co.uk,39298,2000-11-24 21:59:48.621840000,21
4,Tyisha,5396 Forth Street,Greets Green and Lyng Ward,B70 9DT,01547-429341,tyisha.veness@hotmail.com,http://www.champagneroom.co.uk,41630,1998-06-23 05:19:37.687008000,23


In [10]:
import warnings
warnings.filterwarnings('ignore')
from anonympy.pandas import dfAnonymizer 
anonym = dfAnonymizer(df)

In [11]:
# Check which columns anonympy classified as numeric, categorical and datetime
print(anonym.numeric_columns) 
print(anonym.categorical_columns) 
print(anonym.datetime_columns) 

['salary', 'age']
['first_name', 'address', 'city', 'postal', 'phone', 'email', 'web']
['birthdate']


In [12]:
# Available anonymization methods, grouped by column type
from anonympy.pandas.utils_pandas import available_methods
# available_methods() prints the listing itself and returns None, so it is
# called directly; wrapping it in print() echoed a stray "None" afterwards.
available_methods()

`numeric`:
        * Perturbation - "numeric_noise"
        * Binning - "numeric_binning"
        * PCA Masking - "numeric_masking"
        * Rounding - "numeric_rounding"

`categorical`:
        * Synthetic Data - "categorical_fake"
        * Synthetic Data Auto - "categorical_fake_auto"
        * Resampling from same Distribution - "categorical_resampling"
        * Tokenazation - "categorical_tokenization"
        * Email Masking - "categorical_email_masking"

`datetime`:
        * Synthetic Date - "datetime_fake"
        * Perturbation - "datetime_noise"

`general`:
        * Drop Column - "column_suppression"
        
None


In [13]:
anonym.numeric_noise('age')   
anonym.numeric_rounding('salary')  
anonym.categorical_email_masking('email') 

O en una sola línea:
```python
anonym.anonymize({'age':'numeric_noise',                      
                    'salary':'numeric_rounding',                      
                    'email':'categorical_email_masking'})
```

In [14]:
anonym.info()

+------------+--------+-------------+---------------------------+
|   Column   | Status |    Type     |          Method           |
| first_name | 0      | categorical |                           |
+------------+--------+-------------+---------------------------+
| address    | 0      | categorical |                           |
+------------+--------+-------------+---------------------------+
| city       | 0      | categorical |                           |
+------------+--------+-------------+---------------------------+
| postal     | 0      | categorical |                           |
+------------+--------+-------------+---------------------------+
| phone      | 0      | categorical |                           |
+------------+--------+-------------+---------------------------+
| email      | 1      | categorical | Partial Masking           |
+------------+--------+-------------+---------------------------+
| web        | 0      | categorical |                           |
+---------

In [15]:
from anonympy.pandas.utils_pandas import fake_methods

# fake_methods() prints the matching method names itself and returns None, so it
# is called directly; wrapping it in print() echoed a stray "None" afterwards.
fake_methods('f')  # args: None / 'all' / any letter

 factories, file_extension, file_name,file_path,firefox, first_name,first_name_female, first_name_male,first_name_nonbinary,fixed_width, format,free_email, free_email_domain, future_date, future_datetime
None


In [16]:
anonym.categorical_fake('first_name') 

In [17]:
anonym.categorical_fake_auto() # this will change `address` and `city`
# Explicit column -> fake-method mapping for columns the automatic pass did not handle
anonym.categorical_fake({'web': 'url', 'phone': 'phone_number'}) 

`first_name` column already anonymized!
`email` column already anonymized!


In [18]:
anonym.datetime_noise('birthdate')

In [19]:
anonym

+---------------------------------------+
|      Total number of columns: 10      |
| Anonymized Column -> Method:          |
| - age -> Numeric Perturbation         |
| - salary -> Generalization - Rounding |
| - email -> Partial Masking            |
| - first_name -> Synthetic Data        |
| - address -> Synthetic Data           |
| - city -> Synthetic Data              |
| - web -> Synthetic Data               |
| - phone -> Synthetic Data             |
| - birthdate -> Datetime Perturbation  |
+---------------------------------------+
| Unanonymized Columns:                 |
| - postal                              |
+---------------------------------------+

In [20]:
# Retrieve the anonymized dataframe through the public accessor rather than
# reaching into the private `_df` attribute.
anonym.to_df()

Unnamed: 0,first_name,address,city,postal,phone,email,web,salary,birthdate,age
0,Brent,"039 George Bypass\nAngelaside, TX 32867",North Mikeburgh,CT2 7PP,+1-392-716-2409,a*****z@hotmail.com,http://www.baker.com/,50000,2000-12-21 15:09:18.117475200,14
1,Daniel,"946 Christopher Road Suite 311\nJeffreyside, V...",Reedville,HP11 2AX,219.327.4441x3896,e*****s@gmail.com,https://www.baldwin.org/,30000,2003-10-20 04:09:51.325948800,17
2,Kenneth,93913 Rebekah Stravenue Suite 272\nJacobsmouth...,Bradfordburgh,BH6 3BE,863-830-1408,f*****e@hotmail.com,http://www.phillips.info/,30000,2001-05-23 18:56:29.090025600,14
3,Amber,"016 Scott Heights\nRichardsonmouth, MD 30612",Wrightshire,DN36 5RP,963-910-9597,u*****s@hotmail.com,https://www.leonard-pitts.biz/,40000,2001-02-23 21:59:48.621840000,17
4,Madison,"1646 Rebecca Lodge Suite 039\nRamosmouth, AK 3...",North Natasha,B70 9DT,001-390-595-0222x55356,t*****s@hotmail.com,http://www.bauer.com/,40000,1997-08-22 05:19:37.687008000,21
...,...,...,...,...,...,...,...,...,...,...
495,Luis,"0269 Joshua Inlet\nBestton, HI 14235",Leebury,SW1W 8JY,+1-686-303-5088x2850,a*****y@veit.co.uk,http://www.morgan.com/,40000,1995-07-11 16:40:58.379318400,18
496,Donna,"824 Alisha Street\nSouth Alexandriaborough, GA...",Port Brenda,IV2 6WT,(581)674-4979x80160,r*****i@euresti.co.uk,https://www.davis.com/,40000,1999-05-11 11:23:56.188204800,14
497,David,USNV Woods\nFPO AE 48730,North Natasha,S75 5EJ,882.879.4744x5254,c*****g@brenning.co.uk,https://hernandez.org/,40000,1998-06-12 22:03:29.331331200,30
498,Andre,"5743 Paul Loop Suite 104\nLake Darlene, AR 71562",Rebeccatown,DH8 5LP,001-796-890-3220x923,c*****y@gmail.com,https://donaldson.info/,30000,1994-07-22 21:48:38.237414400,19


# 3. Faker-Otros idiomas

In [21]:
import csv
#from faker import Faker
from faker import Factory
from collections import defaultdict

import pandas as pd

In [22]:
from faker.config import AVAILABLE_LOCALES
print(AVAILABLE_LOCALES)

['ar_AA', 'ar_AE', 'ar_BH', 'ar_EG', 'ar_JO', 'ar_PS', 'ar_SA', 'az_AZ', 'bg_BG', 'bn_BD', 'bs_BA', 'cs_CZ', 'da_DK', 'de', 'de_AT', 'de_CH', 'de_DE', 'dk_DK', 'el_CY', 'el_GR', 'en', 'en_AU', 'en_BD', 'en_CA', 'en_GB', 'en_IE', 'en_IN', 'en_NZ', 'en_PH', 'en_TH', 'en_US', 'es', 'es_AR', 'es_CA', 'es_CL', 'es_CO', 'es_ES', 'es_MX', 'et_EE', 'fa_IR', 'fi_FI', 'fil_PH', 'fr_BE', 'fr_CA', 'fr_CH', 'fr_FR', 'fr_QC', 'ga_IE', 'he_IL', 'hi_IN', 'hr_HR', 'hu_HU', 'hy_AM', 'id_ID', 'it_CH', 'it_IT', 'ja_JP', 'ka_GE', 'ko_KR', 'la', 'lb_LU', 'lt_LT', 'lv_LV', 'mt_MT', 'ne_NP', 'nl_BE', 'nl_NL', 'no_NO', 'or_IN', 'pl_PL', 'pt_BR', 'pt_PT', 'ro_RO', 'ru_RU', 'sk_SK', 'sl_SI', 'sq_AL', 'sv_SE', 'ta_IN', 'th', 'th_TH', 'tl_PH', 'tr_TR', 'tw_GH', 'uk_UA', 'vi_VN', 'zh_CN', 'zh_TW', 'zu_ZA']


In [23]:
# Locale: English
from faker import Faker
fake = Faker(['en'])
# Print ten sample names generated for this locale
for _ in range(10):
    print(fake.name())

Derek Mitchell
Jeff Rodriguez
Aaron Berry
Jessica Matthews
Mark Decker
Heather Schmidt
Angela Jackson
Charles Vaughan
Tyler Davis
Edward Brown


In [24]:
# Locale: India | Language: Tamil
from faker import Faker
fake = Faker(['ta_IN'])
# Print ten sample names generated for this locale
for _ in range(10):
    print(fake.name())

கடற்கோமகள்
பச்சையம்மாள் இந்திரஜா
ஸன்யுக்தா
ஸத்வரி
தனுஷ்கா ஆகர்ணா,
கனிரா
எழிற்கதிர்
இந்து
உத்தியா
இதயா


# 4. Pseudonymization

La seudonimización (*pseudonymization*) es el proceso de reemplazar datos identificables con seudónimos o alias para proteger la privacidad de las personas y, al mismo tiempo, mantener la usabilidad de los datos.

En este ejemplo, la función `pseudonymize()` toma los datos como entrada y utiliza el algoritmo hash SHA-256 del módulo `hashlib` para seudonimizarlos. Primero, la cadena recibida en el parámetro `data` se convierte a bytes mediante `.encode()` (UTF-8 por defecto). Luego, la función `hashlib.sha256()` genera un objeto hash SHA-256 a partir de esos bytes. Finalmente, el método `hexdigest()` devuelve la representación hexadecimal del hash, que sirve como valor seudonimizado.

In [25]:
import hashlib

def pseudonymize(data):
    """Return the SHA-256 hex digest of ``data`` to use as its pseudonym.

    ``data`` is a string; it is turned into bytes with ``.encode()`` before
    hashing. The same input always yields the same 64-character hexadecimal
    string.
    """
    return hashlib.sha256(data.encode()).hexdigest()

In [26]:
name = "John Doe"
pseudonymized_name = pseudonymize(name)
print(pseudonymized_name)

6cea57c2fb6cbc2a40411135005760f241fffc3e5e67ab99882726431037f908


In [27]:
import base64
from hashlib import blake2b

import pandas as pd
import requests

from faker import Faker

def encode(key, clear):
    """Obfuscate ``clear`` with ``key`` and return URL-safe Base64 bytes.

    Each character's code point is added to the code point of the key
    character at the same position (the key repeats cyclically) and the sum
    is reduced modulo 256 to a single byte. Because of that reduction,
    characters with code points above 255 cannot be restored by ``decode``.
    """
    key_len = len(key)
    shifted = bytes(
        (ord(ch) + ord(key[pos % key_len])) % 256
        for pos, ch in enumerate(clear)
    )
    return base64.urlsafe_b64encode(shifted)

def decode(key, enc):
    """Reverse ``encode``: recover the clear text from a URL-safe Base64 token.

    The token is Base64-decoded, then the code point of the key character at
    the same position (the key repeats cyclically) is subtracted from each
    byte modulo 256, and the resulting values are joined into a string.
    """
    raw = base64.urlsafe_b64decode(enc)
    key_len = len(key)
    return "".join(
        chr((256 + byte - ord(key[pos % key_len])) % 256)
        for pos, byte in enumerate(raw)
    )

In [31]:
df = pd.read_csv('iot_example.txt')
df

Unnamed: 0,timestamp,username,temperature,heartrate,build,latest,note
0,2017-01-01T12:00:23,michaelsmith,12,67,4e6a7805-8faa-2768-6ef6-eb3198b483ac,0,interval
1,2017-01-01T12:01:09,kharrison,6,78,7256b7b0-e502-f576-62ec-ed73533c9c84,0,wake
2,2017-01-01T12:01:34,smithadam,5,89,9226c94b-bb4b-a6c8-8e02-cb42b53e9c90,0,
3,2017-01-01T12:02:09,eddierodriguez,28,76,2599ac79-e5e0-5117-b8e1-57e5ced036f7,0,update
4,2017-01-01T12:02:36,kenneth94,29,62,122f1c6a-403c-2221-6ed1-b5caa08f11e0,0,user
...,...,...,...,...,...,...,...
146392,2017-02-28T23:58:06,mcontreras,15,63,69e61a15-d2d0-47a7-1a27-e07b3eeeba10,0,
146393,2017-02-28T23:58:43,joelrusso,27,89,4af5c12e-df08-40f4-4c70-b9361044cc4b,0,
146394,2017-02-28T23:59:23,lellis,13,84,dac87426-e147-9c39-6e4c-790bb11f8fc9,0,update
146395,2017-02-28T23:59:48,grayjasmin,17,64,4911a589-3a15-4bbf-1de1-e5a69ab739da,1,update


In [32]:
username = df.iloc[0,1]
username

'michaelsmith'

In [33]:
encode('supa_secret', username)

b'4N7TycDY0dbfzujb'

In [34]:
decode('supa_secret', b'4N7TycDY0dbfzujb')

'michaelsmith'

Otros conceptos: 

- Seudonimización multivariada
- K-Anonymity
- Differential Privacy