In [None]:
import os
import sys
import pandas as pd
from pathlib import Path
# The project root is taken to be the parent of the current working directory
# (presumably the notebook is launched from a sub-folder of the project).
project_root = str(Path.cwd().parent)

# Register the project root on sys.path once so the `src` package can be imported.
already_registered = project_root in sys.path
if not already_registered:
    sys.path.append(project_root)
    
import src.data.anonymizer as anonymizer
from src.data.id_mapper import IDMapper
from src.data.file_encoding import detect_file_encoding, batch_detect_encodings


Le paramètre `salt` dans la fonction `create_anonymous_id` joue un rôle crucial dans la génération d'ID anonymes de manière sécurisée et déterministe. Voici une explication détaillée de son rôle :

1. **Sécurité accrue** : Le `salt` est une valeur aléatoire ajoutée à l'identifiant (`identifier`) avant de générer le hachage. Cela empêche les attaques par dictionnaire et les attaques par force brute, car même si deux identifiants sont identiques, leurs hachages seront différents si des `salt` différents sont utilisés.

2. **Déterminisme** : Si un `salt` spécifique est fourni, la fonction générera toujours le même ID anonyme pour un identifiant donné. Cela est utile pour garantir que les mêmes identifiants d'origine produisent les mêmes ID anonymes à chaque exécution, tant que le même `salt` est utilisé.

3. **Génération aléatoire** : Si aucun `salt` n'est fourni, la fonction en génère un aléatoirement en utilisant `os.urandom(32)`. Cela garantit que chaque appel à la fonction avec le même identifiant produira un ID anonyme différent, ce qui peut être utile pour des besoins de sécurité spécifiques où le déterminisme n'est pas nécessaire.

Exemple d'utilisation :

In [None]:
# No salt supplied: print the anonymous ID produced by three separate calls
# for the same identifier.
for _ in range(3):
    print(anonymizer.create_anonymous_id(identifier='123123', salt=None))

# With a fixed salt the value differs from the unsalted ones but is stable
# across calls.
print("\nWith salt:")
for _ in range(3):
    print(anonymizer.create_anonymous_id(identifier='123123', salt='000'))

### IDMapper class
It's easy to create a unique identifier: once the salt is supplied (which it is here), the mapper always returns the same value for a given ID.

In [None]:
# The salt key is read from <parent of cwd>/config/secure/salt.key.
saltpath = Path.cwd().parent.joinpath('config', 'secure', 'salt.key')
mapper = IDMapper(salt_path=saltpath)

# Ask for the anonymous ID of the same identifier three times; the resulting
# list is the cell's displayed output.
[mapper.create_anonymous_id(original_id="123123") for _ in range(3)]

# Build sample datasets

In [None]:
# Folder holding the raw input files, located under the project root.
raw_subdir = 'data/raw'
path_raw = os.path.join(project_root, raw_subdir)

## MjÁlvarez

In [None]:
# Load every file of the MjÁlvarez raw-data folder into a list of dataframes.
path = os.path.join(path_raw, 'MjÁlvarez')

# os.listdir returns entries in arbitrary order. Later cells pick frames by
# position (df_list[1], df_list[5], ...), so sort the listing to make those
# positions reproducible across machines and runs.
files = sorted(os.listdir(path))

# Read each file with pandas and report its name and shape.
df_list = []
for file in files:
    file_path = os.path.join(path, file)
    df = pd.read_csv(file_path)
    df_list.append(df)
    print(f"File: {file}")
    print(f"Shape: {df.shape}")

In [None]:
# Given that the dataframes in df_list are adjacency matrices, suggest a way to anonymize the rows and columns of the adjacency matrices

In [None]:
# Inspect the sixth dataframe loaded above (positional index 5 in df_list).
# NOTE(review): this frame is read from the raw folder before anonymization, so the
# rendered output presumably contains real student codes — clear it before sharing.
df_list[5]

In [None]:
# Number of dataframes currently held in df_list.
len(df_list)

### Anonymize and Sample

In [None]:
# Preview the first rows of the second dataframe (index 1) before anonymization.
df_list[1].head()

In [None]:
# Anonymize every adjacency matrix: both the row labels (the 'CARNET' column)
# and the column headers are replaced by the mapper's anonymous IDs.
df_list_anon = []

for df in df_list:
    # Collect every student code that appears in the matrix: all row labels
    # plus all headers after the first column. Everything is normalised to
    # str so that a numeric row value and its textual header share one entry.
    # (Previously only the first row label was registered, so any row ID that
    # was missing from the headers raised a KeyError when mapping the column.)
    row_ids = df['CARNET'].astype(str)
    column_ids = [str(col) for col in df.columns[1:]]
    student_ids = set(row_ids) | set(column_ids)

    # Build the original -> anonymous lookup. Iterating in sorted order keeps
    # the order in which IDs are registered with the mapper reproducible.
    id_mapping = {
        student_id: mapper.add_identifier(student_id, source='survey')
        for student_id in sorted(student_ids)
    }

    # Work on a copy so the raw frame in df_list is left untouched, then
    # rename the headers and translate the first column.
    df_anon = df.copy()
    df_anon.columns = ['CARNET'] + [id_mapping[col] for col in column_ids]
    df_anon['CARNET'] = row_ids.map(id_mapping)

    df_list_anon.append(df_anon)



In [None]:
# Preview the first rows of the second dataframe (index 1) after anonymization.
df_list_anon[1].head()

In [None]:
# Save the anonymized matrix to the current working directory.
# NOTE(review): `df_anon` is the variable left over from the anonymization loop,
# i.e. the last frame it processed — confirm that this is the file named below.
df_anon.to_csv("Ciencia_Politica_Amistad_20182_anonymized.csv", index=False)


### Produce Sample Data

In [None]:
# Write the first anonymized matrix into the test-data folder (row index omitted).
sample_target = "../tests/data/Ciencia_Politica_Trabajos_20182_sample_anon.csv"
df_list_anon[0].to_csv(sample_target, index=False)

# Save mapping
# output_dir = Path('../data/intermediate')
# mapper.save_mappings(output_dir)

## TrustExperiment

In [None]:
# Check the text encoding of each CSV file in the TrustExperiment folder and
# list the result per file.
encodings = batch_detect_encodings('../data/raw/TrustExperiment', pattern='*.csv')
for filename in encodings:
    print(f"{filename}: {encodings[filename]}")

In [None]:
# Load the TrustExperiment files, decoding each with its detected encoding.
path = os.path.join(path_raw, 'TrustExperiment')

# os.listdir order is arbitrary, and later cells index df_list by position,
# so sort the listing to keep those positions reproducible.
files = sorted(os.listdir(path))

df_list = []
for file in files:
    # Encodings were detected for '*.csv' only; any other directory entry
    # has no key in `encodings` and would raise a KeyError, so skip it.
    if file not in encodings:
        print(f"Skipping {file}: no detected encoding")
        continue
    file_path = os.path.join(path, file)
    df = pd.read_csv(file_path, encoding=encodings[file])
    df_list.append(df)
    print(f"File: {file}")
    print(f"Shape: {df.shape}")

Next, we need to review the current `MasterIDsFile.csv` and anonymize all the other CSV files.

In [None]:
# Inspect the first dataframe loaded from the TrustExperiment folder.
df_list[0]

In [None]:
# Count the distinct values in the anonymousID column (missing values included).
df_list[0]["anonymousID"].nunique(dropna=False)

### Produce sample data 

In [None]:
# Keep the first 20 rows of the fifth dataframe and write them to the
# test-data folder without the row index.
trust_sample = df_list[4].head(20)
trust_sample.to_csv('../tests/data/TestExperiment_sample.csv', index=False)

## Turnstile 

### Anonymize Mapper

In [None]:
from pathlib import Path
import pandas as pd
# from id_mapper import IDMapper
import glob
import logging

def _read_turnstile_csv(file_path: Path) -> pd.DataFrame:
    """Read one turnstile CSV, trying ',' and then ';' as the delimiter.

    A semicolon-separated file usually parses without error when read with
    ',' (it simply becomes one wide column), so waiting for a ParserError is
    not enough to detect the wrong delimiter. A parse is therefore accepted
    only if it exposes a 'carnet' column.

    Returns:
        The first parse that contains 'carnet'. If neither does, the first
        parse that succeeded, so the caller can report the missing column.

    Raises:
        pd.errors.ParserError: if the file cannot be parsed with either delimiter.
    """
    fallback = None
    last_error = None
    for delimiter in (',', ';'):
        try:
            candidate = pd.read_csv(file_path, delimiter=delimiter)
        except pd.errors.ParserError as exc:
            last_error = exc
            continue
        if 'carnet' in candidate.columns:
            return candidate
        if fallback is None:
            fallback = candidate
    if fallback is None:
        raise last_error
    return fallback


def anonymize_turnstile_files(input_dir: Path, output_dir: Path, salt_path: Path):
    """
    Anonymize all turnstile CSV files in the input directory

    Every file matching ``P2000*.csv`` in ``input_dir`` is read, its 'carnet'
    column is replaced by the identifiers returned by the IDMapper, and the
    result is written to ``output_dir`` as ``anon_<original name>``. The
    mapper's mappings are saved to ``output_dir / "mappings"`` at the end.

    Args:
        input_dir: Directory containing the original CSV files
        output_dir: Directory where anonymized files will be saved
        salt_path: Path to the salt file for consistent anonymization

    Raises:
        AssertionError: if a file has no 'carnet' column with either delimiter.
        pd.errors.ParserError: if a file cannot be parsed with ',' or ';'.
    """
    # Initialize the IDMapper
    mapper = IDMapper(salt_path=salt_path)

    # Create output directory if it doesn't exist
    output_dir.mkdir(parents=True, exist_ok=True)

    # Process each CSV file; sorted so the processing order (and hence the
    # order in which IDs reach the mapper) is reproducible.
    for csv_file in sorted(glob.glob(str(input_dir / "P2000*.csv"))):
        file_path = Path(csv_file)

        # Read the CSV file, auto-selecting between ',' and ';'
        print(f"Reading {file_path}")
        df = _read_turnstile_csv(file_path)

        # Anonymize student IDs (values are cast to str before mapping)
        assert 'carnet' in df.columns, "Column 'carnet' does not exist in the dataframe"
        df['carnet'] = df['carnet'].astype(str).apply(
            lambda x: mapper.add_identifier(x, source='turnstile')
        )

        # Save anonymized file
        output_file = output_dir / f"anon_{file_path.name}"
        df.to_csv(output_file, index=False)
        logging.info(f"Processed {file_path.name}")

    # Save the mapping files
    # NOTE(review): the mappings land inside the same folder as the anonymized
    # output — confirm that folder is not shared together with the data.
    mapper.save_mappings(output_dir / "mappings")



In [None]:
# Configuration
# All paths are anchored to the project root (the parent of the notebook's
# working directory). Bare relative paths such as "config/secure/salt.key"
# would resolve inside the notebook folder, i.e. to a different salt file than
# the one used when the mapper is built from
# Path().cwd().parent / 'config' / 'secure' / 'salt.key', and the output would
# be written under the notebook folder as well.
PROJECT_ROOT = Path.cwd().parent
INPUT_DIR = PROJECT_ROOT / "data" / "intermediate" / "daily"
OUTPUT_DIR = PROJECT_ROOT / "data" / "intermediate"
SALT_PATH = PROJECT_ROOT / "config" / "secure" / "salt.key"

# Set up logging: INFO and above, each record prefixed with timestamp and level
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s'
)

# Run anonymization

In [None]:

# Anonymize the turnstile files found in the test-data folder, using the
# output directory and salt file defined in the configuration cell.
anonymize_turnstile_files(
    input_dir=Path("../tests/data"),
    output_dir=OUTPUT_DIR,
    salt_path=SALT_PATH,
)