In [29]:
import os
import sys
import pandas as pd
project_root = (os.path.dirname(os.path.abspath('.')))

# Add the project root to sys.path if it's not already there
if project_root not in sys.path:
    sys.path.append(project_root)
    
import src.data.anonymizer as anonymizer
from src.data.id_mapper import IDMapper
from src.data.file_encoding import detect_file_encoding, batch_detect_encodings


Le paramètre `salt` dans la fonction `create_anonymous_id` joue un rôle crucial dans la génération d'ID anonymes de manière sécurisée et déterministe. Voici une explication détaillée de son rôle :

1. **Sécurité accrue** : Le `salt` est une valeur aléatoire ajoutée à l'identifiant (`identifier`) avant de générer le hachage. Cela empêche les attaques par dictionnaire et les attaques par force brute, car même si deux identifiants sont identiques, leurs hachages seront différents si des `salt` différents sont utilisés.

2. **Déterminisme** : Si un `salt` spécifique est fourni, la fonction générera toujours le même ID anonyme pour un identifiant donné. Cela est utile pour garantir que les mêmes identifiants d'origine produisent les mêmes ID anonymes à chaque exécution, tant que le même `salt` est utilisé.

3. **Génération aléatoire** : Si aucun `salt` n'est fourni, la fonction en génère un aléatoirement en utilisant `os.urandom(32)`. Cela garantit que chaque appel à la fonction avec le même identifiant produira un ID anonyme différent, ce qui peut être utile pour des besoins de sécurité spécifiques où le déterminisme n'est pas nécessaire.

Example d'usage:

In [2]:
for i in range(3):
    hashed = anonymizer.create_anonymous_id(identifier='123123', salt=None)
    print(hashed)

# Whereas if we include a salt value, the hashed value will be different but consistent
print("\nWith salt:")
for i in range(3):
    hashed = anonymizer.create_anonymous_id(identifier='123123', salt='000')
    print(hashed)

48a61232d2ee
d8d235257a71
47157d22d90f

With salt:
e42e53f77567
e42e53f77567
e42e53f77567


### IDMapper class
It's easy to just create an unique identifer, and it will always return the same value once the salt is supplied (which it is).

In [24]:
mapper = IDMapper()
[mapper.create_anonymous_id(original_id="123123") for i in range(3)]

['8f17889ea21e', '8f17889ea21e', '8f17889ea21e']

# Build sample datasets

In [26]:
path_raw = os.path.join(project_root, 'data/raw')

## MjÁlvarez

In [27]:
# Print full paths of all files in the raw data directory
path = "".join([path_raw, '/MjÁlvarez'])
files = os.listdir(path)

# Read each file with pandas
df_list = []
for file in files:
    file_path = os.path.join(path, file)
    df = pd.read_csv(file_path)
    df_list.append(df)
    print(f"File: {os.path.basename(file_path)}")
    print(f"Shape: {df.shape}")

File: Ciencia_Politica_Trabajos_20182.csv
Shape: (42, 43)
File: Ciencia_Politica_Amistad_20182.csv
Shape: (42, 43)
File: Medicina_Amistad_20172.csv
Shape: (56, 57)
File: Medicina_Casa_20172.csv
Shape: (56, 57)
File: Economia_Trabajos_20172.csv
Shape: (76, 77)
File: Economia_Casa_20182.csv
Shape: (76, 77)
File: Ingenieria_Civil_Casa_20182.csv
Shape: (70, 71)
File: Economia_Amistad_20172.csv
Shape: (76, 77)
File: Ingenieria_Civil_Amistad_20182.csv
Shape: (70, 71)
File: Medicina_Trabajos_20182.csv
Shape: (56, 57)
File: Ciencia_Politica_Casa_20182.csv
Shape: (42, 43)
File: Ingenieria_Civil_Trabajos_20172.csv
Shape: (70, 71)
File: Ingenieria_Civil_Trabajos_20182.csv
Shape: (70, 71)
File: Economia_Casa_20172.csv
Shape: (76, 77)
File: Ingenieria_Civil_Casa_20172.csv
Shape: (70, 71)
File: Economia_Amistad_20182.csv
Shape: (76, 77)
File: Ingenieria_Civil_Amistad_20172.csv
Shape: (70, 71)
File: Medicina_Trabajos_20172.csv
Shape: (56, 57)
File: Medicina_Amistad_20182.csv
Shape: (56, 57)
File: Med

In [None]:
# Given that the dataframes in df_list are adjacency matrices, suggest way to anonimize the rows and columns of the adjacency matrices

In [79]:
df_list[0]

Unnamed: 0,CARNET,201712444,201714291,201712405,201713777,201715358,201715822,201717004,201718923,201719581,...,201713335,201715496,201718939,201633930,201714131,201621018,201713440,201716595,201632189,201719527
0,201712444,0.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,201714291,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,201712405,1.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,...,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,201713777,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,201715358,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,201715822,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6,201717004,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7,201718923,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
8,201719581,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9,201712045,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [76]:
len(df_list)

21

## TrustExperiment

In [38]:
# Revise the encoding of the files TrustExperiment

encodings = batch_detect_encodings('../data/raw/TrustExperiment', pattern='*.csv')
for filename, encoding in encodings.items():
    print(f"{filename}: {encoding}")

INFO:src.data.file_encoding:Successfully read MasterIDsFile.csv with latin-1 encoding


INFO:src.data.file_encoding:Successfully read Lunch.csv with utf-8 encoding
INFO:src.data.file_encoding:Successfully read Friends.csv with utf-8 encoding
INFO:src.data.file_encoding:Successfully read Confide.csv with utf-8 encoding
INFO:src.data.file_encoding:Successfully read Study.csv with utf-8 encoding
INFO:src.data.file_encoding:Successfully read MetBefore.csv with utf-8 encoding
INFO:src.data.file_encoding:Successfully read Saludo.csv with utf-8 encoding


MasterIDsFile.csv: latin-1
Lunch.csv: utf-8
Friends.csv: utf-8
Confide.csv: utf-8
Study.csv: utf-8
MetBefore.csv: utf-8
Saludo.csv: utf-8


In [53]:
path = "".join([path_raw, '/TrustExperiment'])
files = os.listdir(path)


# Read files with the encoding in the encodings dictionary
df_list = []
for file in files:
    file_path = os.path.join(path, file)
    df = pd.read_csv(file_path, encoding=encodings[file])
    df_list.append(df)
    print(f"File: {os.path.basename(file_path)}")
    print(f"Shape: {df.shape}")

File: MasterIDsFile.csv
Shape: (113, 5)
File: Lunch.csv
Shape: (285, 2)
File: Friends.csv
Shape: (744, 2)
File: Confide.csv
Shape: (331, 2)
File: Study.csv
Shape: (595, 2)
File: MetBefore.csv
Shape: (207, 2)
File: Saludo.csv
Shape: (1570, 2)


Now we've got to revise the current `MasterIDsFile.csv` and anonymize all the CSV for all the others.

In [58]:
df_list[0]

Unnamed: 0,studentID,email,nameInQualtrics,trustExpID,anonymousID
0,201711822,ed.aguirre@uniandes.edu.co,EDWIN DANIEL AGUIRRE,,105anon
1,201731369,n.al@uniandes.edu.co,NASIM AL ASHRAM,29.0,95anon
2,201711858,lm.alvarado@uniandes.edu.co,LINA MARIA ALVARADO,,23anon
3,201729052,mj.aragon@uniandes.edu.co,MAURICIO JOSE ARAGON,65.0,11anon
4,201729072,js.arcos@uniandes.edu.co,JUAN SEBASTIAN ARCOS,,20anon
...,...,...,...,...,...
108,201731811,s.zaldua@uniandes.edu.co,STEFAN ZALDUA,70.0,58anon
109,201714113,lm.orjuelac@uniandes.edu.co,LAURA MANUELA ORJUELA,75.0,76anon
110,201732746,t.ramirezv@uniandes.edu.co,TOMçS RAMêREZ,19.0,88anon
111,201732742,mateoslalom@gmail.com,MATEO PEA,1.0,70anon


In [None]:
len(df_list[0].anonymousID.unique())

113