In [19]:
import pandas as pd
from datetime import datetime

In [20]:
# Read the three Synthea CSV exports, one DataFrame per file, in a single pass.
patients, conditions, medications = map(
    pd.read_csv,
    (
        '../data/synthea_sample_data_csv_latest/patients.csv',
        '../data/synthea_sample_data_csv_latest/conditions.csv',
        '../data/synthea_sample_data_csv_latest/medications.csv',
    ),
)


In [21]:
# Convert the 'BIRTHDATE' and 'DEATHDATE' columns to datetime values
# (both are parsed with the same YYYY-MM-DD format string).
for date_column in ('BIRTHDATE', 'DEATHDATE'):
    patients[date_column] = pd.to_datetime(patients[date_column], format='%Y-%m-%d')


In [23]:
# Keep only the rows whose DEATHDATE is missing, i.e. exclude every patient
# that has a recorded date of death.
preprocess_patients = patients.loc[patients['DEATHDATE'].isnull()]

In [24]:
# Narrow the frame to the identifier, birth date and gender columns.
preprocess_patients = preprocess_patients.loc[:, ['Id', 'BIRTHDATE', 'GENDER']]

In [25]:
# Age (in completed years) for a single birth date.
def calculate_age(birthdate, as_of=None):
    """Return the age in completed years for a single birth date.

    Parameters
    ----------
    birthdate : object exposing ``year``, ``month`` and ``day``
        The date of birth (for example a ``datetime`` or ``pandas.Timestamp``).
    as_of : object exposing ``year``, ``month`` and ``day``, optional
        The date on which the age is evaluated. When omitted, the current
        date from ``datetime.today()`` is used, which matches the behavior
        of calling the function with a single argument. Pass a fixed date to
        get results that do not change from one run to the next.

    Returns
    -------
    The difference in calendar years, reduced by one when the birthday has
    not yet been reached in the ``as_of`` year.
    """
    if as_of is None:
        as_of = datetime.today()
    # Tuples compare month first, then day. The result is True (counts as 1
    # in the subtraction) while the birthday is still ahead in the as_of year.
    birthday_pending = (as_of.month, as_of.day) < (birthdate.month, birthdate.day)
    return as_of.year - birthdate.year - birthday_pending

# Add an 'Age' column computed from each patient's BIRTHDATE.
# assign() returns a new DataFrame instead of writing the column into the
# existing one; preprocess_patients was obtained by filtering and selecting
# from another frame, and setting a column on such a frame in place is the
# pattern that makes pandas emit SettingWithCopyWarning.
preprocess_patients = preprocess_patients.assign(
    Age=preprocess_patients['BIRTHDATE'].apply(calculate_age)
)

In [26]:
# Show the frame as the cell output to inspect the newly added 'Age' column.
preprocess_patients

Unnamed: 0,Id,BIRTHDATE,GENDER,Age
0,e93300bf-3a53-55c0-bd38-2ede59462f21,2007-09-26,F,17
1,6b0b0021-df03-cbb6-305a-11ec40da7af4,1976-07-01,M,48
2,9eedeb5a-e86d-ddd6-f928-ae031b593968,1989-10-06,F,34
4,87f6c14f-38d1-966f-4109-ac29736ec22f,2005-06-16,M,19
5,c56ad513-c5a6-cb7f-8b55-0e140eb6b55d,1977-02-23,M,47
...,...,...,...,...
104,b1886192-fd4d-8077-bb70-b07f5818bd6b,1999-10-01,M,25
105,56c7fc10-632f-3760-e98f-4b270c87360e,2002-06-06,M,22
106,ddd42265-8985-ee08-a0d6-56da944048d1,1942-03-05,F,82
107,2a19ac8c-076b-857f-48f9-5776d28cd7f8,1956-07-13,F,68


In [29]:
# 'Age' has been derived from it, so the BIRTHDATE column is removed here.
preprocess_patients = preprocess_patients.drop(labels='BIRTHDATE', axis='columns')

In [30]:
# Show the frame again to confirm that 'BIRTHDATE' has been removed.
preprocess_patients

Unnamed: 0,Id,GENDER,Age
0,e93300bf-3a53-55c0-bd38-2ede59462f21,F,17
1,6b0b0021-df03-cbb6-305a-11ec40da7af4,M,48
2,9eedeb5a-e86d-ddd6-f928-ae031b593968,F,34
4,87f6c14f-38d1-966f-4109-ac29736ec22f,M,19
5,c56ad513-c5a6-cb7f-8b55-0e140eb6b55d,M,47
...,...,...,...
104,b1886192-fd4d-8077-bb70-b07f5818bd6b,M,25
105,56c7fc10-632f-3760-e98f-4b270c87360e,M,22
106,ddd42265-8985-ee08-a0d6-56da944048d1,F,82
107,2a19ac8c-076b-857f-48f9-5776d28cd7f8,F,68


In [31]:
# Write the preprocessed patients to CSV; index=False leaves the row index
# out of the file.
preprocess_patients.to_csv(
    path_or_buf='../data/preprocess_patients.csv',
    index=False,
)