In [1]:
import pandas as pd
import hashlib

In [2]:
# Read the CSV files
patients = pd.read_csv("patients_age_flags.csv", delimiter=";")
lab = pd.read_csv("cbc_chem.csv", delimiter=";")

In [3]:
# Merge the two files on subject_id and hadm_id
temp_df = pd.merge(lab, patients, on=["subject_id", "hadm_id"], how="left")

In [4]:
# Set the subset to use for deidentification
subset = ["subject_id", "hadm_id", "charttime"]

# Convert the relevant columns astype string 
temp_df[subset] = temp_df[subset].astype(str)

# Extract the relevant columns
subject_id = temp_df["subject_id"]
hadm_id = temp_df["hadm_id"]
charttime = temp_df["charttime"]

# Combine columns to create a hash value
combined_cols = subject_id + hadm_id + charttime

# Apply SHA-256 hash function to create the index
index = combined_cols.apply(lambda x: hashlib.sha256(x.encode()).hexdigest())

# Set the index of the DataFrame
temp_df.set_index(index, inplace=True)

In [5]:
# Drop unnecessary columns and sort by index
df = temp_df.drop(columns=["subject_id", "hadm_id", "specimen_id", "charttime"]).sort_index()

In [6]:
# Convert 'age' column to integer data type
df['age'] = df['age'].astype(int)

In [7]:
# Export the de-identified DataFrame to a new CSV file
df.to_csv("ckd_deidentified.csv")

In [8]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 1636315 entries, 0000003c42d42886f0512786c8cef89336c3dbc4a25e5dd9531479e383b8844b to fffff04f91d9bb69086b462eed519b73e03567d768e873746fe6c101e48f1fec
Data columns (total 26 columns):
 #   Column         Non-Null Count    Dtype  
---  ------         --------------    -----  
 0   hematocrit     1629960 non-null  float64
 1   hemoglobin     1593606 non-null  float64
 2   mch            1588370 non-null  float64
 3   mchc           1588542 non-null  float64
 4   mcv            1588470 non-null  float64
 5   platelet       1598386 non-null  float64
 6   rbc            1588481 non-null  float64
 7   rdw            1587352 non-null  float64
 8   wbc            1590141 non-null  float64
 9   albumin        257170 non-null   float64
 10  globulin       7655 non-null     float64
 11  total_protein  13247 non-null    float64
 12  aniongap       1580546 non-null  float64
 13  bicarbonate    1583570 non-null  float64
 14  bun            1598277 non-null

In [9]:
df

Unnamed: 0,hematocrit,hemoglobin,mch,mchc,mcv,platelet,rbc,rdw,wbc,albumin,...,chloride,creatinine,glucose,sodium,potassium,age,gender,ckd,hypertension,diabetes
0000003c42d42886f0512786c8cef89336c3dbc4a25e5dd9531479e383b8844b,25.3,8.5,28.6,33.4,86.0,275.0,2.96,13.4,3.0,,...,109.0,0.7,114.0,140.0,3.9,48,F,0,0,0
00000727efa061858c1e6611657af5b6defd0ab071091a096c393afe938b1489,28.6,10.0,29.8,34.9,85.0,102.0,3.36,13.8,1.9,,...,104.0,0.7,82.0,139.0,3.9,59,M,0,1,0
00002d1472769d21c156977f93e2e1db0557807f470e08a904060f0f32abf0c8,31.7,10.1,28.2,31.9,88.0,201.0,3.58,14.5,4.2,,...,104.0,0.6,97.0,139.0,3.6,83,F,0,1,0
00004622a4c6b7bb6b873558488dd2e828dc77a5d9587f6382aa48cb940f589f,37.8,12.4,31.1,32.9,95.0,347.0,4.00,13.8,11.2,3.9,...,101.0,0.9,149.0,139.0,4.1,51,F,0,0,0
00004741d15bcf0f0453f94cdd9eeb85c663584f26c5fe0705e96c8aa01eab66,23.1,7.8,28.7,33.7,85.0,76.0,2.71,16.4,7.8,,...,105.0,1.5,207.0,136.0,4.8,71,M,1,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
ffffbc4b10d69e134494f554c6d11aa5c7b3d62a3ed9f2779dd95f0a499c8dc9,23.7,7.8,33.3,32.9,101.0,169.0,2.34,14.4,23.3,,...,94.0,7.2,143.0,133.0,4.3,81,M,1,1,1
ffffcd111a56292182022764fb3047749d3f2aa3fd4801efbc5b24fbec3f580c,38.3,12.7,33.3,33.2,100.0,179.0,3.82,13.1,12.6,,...,99.0,0.7,121.0,136.0,3.0,82,M,0,1,0
ffffd3c4b674a4b05cd88da299e9955366949066dad07cc2655f3926f80cdd90,32.6,10.1,28.7,31.0,93.0,254.0,3.52,13.4,7.6,,...,99.0,0.6,97.0,137.0,3.8,74,F,0,1,0
ffffd3e90e8f8f17ca53a0fd689e5fa276d7b14f088b2b397b3ef36d118737ac,38.4,12.8,31.4,33.3,94.0,180.0,4.08,14.6,8.0,,...,104.0,1.1,92.0,142.0,4.0,74,M,0,1,1
