In [1]:
# Import necessary libraries
import pandas as pd
import hashlib
from dotenv import load_dotenv
import os

# Load environment variables from .env file
load_dotenv()

# Get SALT_KEY from environment
SALT_KEY = os.getenv('SALT_KEY')
if SALT_KEY is None:
    raise ValueError("SALT_KEY not found in .env file. Please ensure SALT_KEY is set in the root .env file.")

print(f"SALT_KEY loaded successfully (length: {len(SALT_KEY)})")


SALT_KEY loaded successfully (length: 9)


In [2]:
# Load the healthcare data
df = pd.read_csv('../data/healthcare_translated.csv')

print(f"Data loaded: {len(df)} rows, {len(df.columns)} columns")
print(f"\nFirst few rows of taj_identifier:")
print(df['taj_identifier'].head(10))
print(f"\nEmpty/null taj_identifier count: {df['taj_identifier'].isna().sum() + (df['taj_identifier'] == '').sum()}")


Data loaded: 26551 rows, 176 columns

First few rows of taj_identifier:
0    140 932 100
1    068 689 139
2    030 175 949
3    110 846 440
4    062 458 739
5    070 941 410
6    087 182 291
7    081 554 320
8    075 866 598
9    109 465 898
Name: taj_identifier, dtype: object

Empty/null taj_identifier count: 38


  df = pd.read_csv('../data/healthcare_translated.csv')


In [3]:
# Create pid column using SHA256(SALT_KEY + taj_identifier)
def hash_taj_identifier(taj_id):
    """
    Hash taj_identifier using SHA256(SALT_KEY + taj_identifier)
    If taj_identifier is empty/null, return empty string
    """
    if pd.isna(taj_id) or taj_id == '':
        return ''
    
    # Convert to string and concatenate with SALT_KEY
    combined = SALT_KEY + str(taj_id)
    # Create SHA256 hash
    hash_obj = hashlib.sha256(combined.encode('utf-8'))
    return hash_obj.hexdigest()

# Apply hashing function
df['pid'] = df['taj_identifier'].apply(hash_taj_identifier)

print(f"pid column created")
print(f"Non-empty pid count: {(df['pid'] != '').sum()}")
print(f"\nSample pids:")
print(df[['taj_identifier', 'pid']].head(10))


pid column created
Non-empty pid count: 26513

Sample pids:
  taj_identifier                                                pid
0    140 932 100  8c4a1e6788d9f0dbe446ae0878093a771c1f2d76035d61...
1    068 689 139  279159d05b4d9db0bc548ac4db6a934cfbcc6715646876...
2    030 175 949  7c2a4b0d57e732af7252a5304c1f12718e0d063a199f3b...
3    110 846 440  8cee43857a01ec5adfb64eba5bf1b57b19bbeb85090d14...
4    062 458 739  241502f62bc78e65a13679ea5e193c8b9de3b932105847...
5    070 941 410  7a3f70f8ffb56809060781f23bc88066ed415da2fe5215...
6    087 182 291  a1723bd81c55960ca798391cdd229589cbce3dbb5c1c42...
7    081 554 320  11e9c3b8eac26528da105777ec455fc0de475f976688f4...
8    075 866 598  cd13b283dd4befaad13c349123a0819c47e503d18b8ad2...
9    109 465 898  80d488452138b4c721caf321062f29c7347df70473555e...


In [4]:
# Create taj_present column (yes/no) based on whether taj_identifier was non-empty
def check_taj_present(taj_id):
    """
    Return 'yes' if taj_identifier was non-empty, 'no' otherwise
    """
    if pd.isna(taj_id) or taj_id == '':
        return 'no'
    return 'yes'

df['taj_present'] = df['taj_identifier'].apply(check_taj_present)

print(f"taj_present column created")
print(f"\ntaj_present distribution:")
print(df['taj_present'].value_counts())


taj_present column created

taj_present distribution:
taj_present
yes    26513
no        38
Name: count, dtype: int64


In [5]:
# Verify hash consistency: same taj_identifier should produce same pid
print("Verifying hash consistency...")

# Group by taj_identifier and check if all pids are the same
verification = df.groupby('taj_identifier')['pid'].agg(['nunique', 'count']).reset_index()
verification.columns = ['taj_identifier', 'unique_pids', 'row_count']

# Find inconsistencies (taj_identifiers with multiple different pids)
inconsistencies = verification[verification['unique_pids'] > 1]

if len(inconsistencies) > 0:
    print(f"\nERROR: Found {len(inconsistencies)} taj_identifier(s) with inconsistent hashes!")
    print(inconsistencies)
    raise ValueError("Hash verification failed: Same taj_identifier produces different pids")
else:
    print(f"\n✓ Hash verification passed!")
    print(f"  - All {len(verification)} unique taj_identifiers produce consistent hashes")
    
# Also verify that non-empty taj_identifiers produce non-empty pids
non_empty_taj = df[(df['taj_identifier'].notna()) & (df['taj_identifier'] != '')]
if (non_empty_taj['pid'] == '').any():
    print(f"\nWARNING: Found {((non_empty_taj['pid'] == '')).sum()} non-empty taj_identifiers with empty pids")
else:
    print(f"  - All non-empty taj_identifiers produce non-empty pids")


Verifying hash consistency...

✓ Hash verification passed!
  - All 6357 unique taj_identifiers produce consistent hashes
  - All non-empty taj_identifiers produce non-empty pids


In [6]:
# Drop taj_identifier column
df = df.drop(columns=['taj_identifier'])

print(f"taj_identifier column dropped")
print(f"\nRemaining columns: {len(df.columns)}")
print(f"\nFirst few rows:")
print(df[['pid', 'taj_present']].head(10))


taj_identifier column dropped

Remaining columns: 177

First few rows:
                                                 pid taj_present
0  8c4a1e6788d9f0dbe446ae0878093a771c1f2d76035d61...         yes
1  279159d05b4d9db0bc548ac4db6a934cfbcc6715646876...         yes
2  7c2a4b0d57e732af7252a5304c1f12718e0d063a199f3b...         yes
3  8cee43857a01ec5adfb64eba5bf1b57b19bbeb85090d14...         yes
4  241502f62bc78e65a13679ea5e193c8b9de3b932105847...         yes
5  7a3f70f8ffb56809060781f23bc88066ed415da2fe5215...         yes
6  a1723bd81c55960ca798391cdd229589cbce3dbb5c1c42...         yes
7  11e9c3b8eac26528da105777ec455fc0de475f976688f4...         yes
8  cd13b283dd4befaad13c349123a0819c47e503d18b8ad2...         yes
9  80d488452138b4c721caf321062f29c7347df70473555e...         yes


In [7]:
# Save processed dataset to data/patient_records.csv
output_path = '../data/patient_records.csv'
df.to_csv(output_path, index=False)

print(f"Dataset saved successfully to {output_path}")
print(f"\nFinal dataset shape: {df.shape}")
print(f"Columns: {list(df.columns)[:10]}...")  # Show first 10 columns


Dataset saved successfully to ../data/patient_records.csv

Final dataset shape: (26551, 177)
Columns: ['created', 'birth_name', 'mothers_name', 'patient_name', 'birth_date', 'birth_place', 'clinic_name', 'mep', 'settlement', 'mep_region']...
