In [22]:
import pandas as pd

# Load the raw EHR extract, parse its date columns, drop rows with an
# implausible date of birth, and derive the patient's age at STARTDATE.
# Result: `df`, with STARTDATE/DOB as datetimes plus a new float AGE column.

# Location of the raw extract, relative to the notebook's working directory.
RAW_CSV_PATH = 'ehr_data_raw.csv'
# Rows whose DOB is earlier than this date (or could not be parsed) are dropped.
MIN_PLAUSIBLE_DOB = '1901-01-01'
# Columns that must be present and are converted to datetime.
DATE_COLUMNS = ['STARTDATE', 'DOB']

# Load the CSV file into a pandas DataFrame.
# A missing file is re-raised with a helpful message rather than handled with
# exit(): exit() terminates the kernel/interpreter instead of simply failing
# this cell, and later cells would otherwise run against a missing `df`.
try:
    df = pd.read_csv(RAW_CSV_PATH)
except FileNotFoundError as err:
    raise FileNotFoundError(
        f"'{RAW_CSV_PATH}' not found. Please upload the file or provide the correct path."
    ) from err

# Display column names and data types.
# df.info() writes its report itself and returns None, so wrapping it in
# print() would only add a stray "None" line to the output.
df.info()
print(df.head())

# Fail early, with one clear message, if an expected date column is absent.
missing_cols = [col for col in DATE_COLUMNS if col not in df.columns]
if missing_cols:
    raise KeyError(f"Column(s) {missing_cols} not found in the DataFrame.")

# Convert 'STARTDATE' and 'DOB' columns to datetime objects.
# errors='coerce' turns values that cannot be parsed into NaT instead of raising.
for col in DATE_COLUMNS:
    df[col] = pd.to_datetime(df[col], errors='coerce')

# Drop implausible dates of birth. NaT never satisfies `>=`, so rows whose DOB
# failed to parse are dropped here as well. .copy() makes the filtered result
# an independent frame, so adding AGE below does not assign into a slice of
# the original (which would trigger SettingWithCopyWarning).
df = df[df['DOB'] >= MIN_PLAUSIBLE_DOB].copy()

# Age in years at STARTDATE (365.25 days per year); NaN where STARTDATE is NaT.
df['AGE'] = (df['STARTDATE'] - df['DOB']).dt.days / 365.25

print(df[['STARTDATE', 'DOB', 'AGE']].describe())
print(df[['STARTDATE', 'DOB']].head())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10481 entries, 0 to 10480
Data columns (total 8 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   DRUG         10481 non-null  object
 1   ROUTE        10481 non-null  object
 2   DOSE_VAL_RX  10481 non-null  object
 3   GENDER       10481 non-null  object
 4   DOB          10481 non-null  object
 5   STARTDATE    10441 non-null  object
 6   INTIME       10481 non-null  object
 7   DEATHTIME    2988 non-null   object
dtypes: object(8)
memory usage: 655.2+ KB
None
                     DRUG ROUTE DOSE_VAL_RX GENDER                      DOB  \
0           Metronidazole    PO         500      M  2047-04-04 00:00:00.000   
1  Vancomycin Oral Liquid    PO         250      M  2047-04-04 00:00:00.000   
2  Vancomycin Oral Liquid    PO         250      M  2047-04-04 00:00:00.000   
3           Metronidazole    PO         500      M  2047-04-04 00:00:00.000   
4            Levofloxacin    PO         

In [23]:

# Encode the cleaned EHR frame: binary DEATH target, numeric GENDER and dose,
# one-hot ROUTE/DRUG, and INTIME split into ICU_DATE / ICU_TIME.
#
# All work happens on a temporary frame and `df` is rebound only once, at the
# end. If anything fails (for example because this cell is run a second time,
# after ROUTE/DRUG were already one-hot encoded), `df` is left untouched
# instead of half-transformed with GENDER silently mapped to NaN.
GENDER_CODES = {'M': 0, 'F': 1}
CATEGORICAL_COLUMNS = ['ROUTE', 'DRUG']
REQUIRED_COLUMNS = ['DEATHTIME', 'GENDER', 'DOSE_VAL_RX', 'INTIME'] + CATEGORICAL_COLUMNS

# Validate the input up front so a missing column stops the cell with one
# clear error rather than a printed message followed by a partial result.
missing_columns = [col for col in REQUIRED_COLUMNS if col not in df.columns]
if missing_columns:
    raise KeyError(
        f"Column(s) {missing_columns} not found in the DataFrame; "
        "if this cell was already run, re-run the loading cell first."
    )

# assign() returns a new frame: existing columns keep their position and
# DEATH is appended at the end, matching the original column order.
encoded = df.assign(
    # 1 when a DEATHTIME is recorded, 0 otherwise.
    DEATH=df['DEATHTIME'].notna().astype(int),
    # Labels that are not keys of GENDER_CODES become NaN.
    GENDER=df['GENDER'].map(GENDER_CODES),
    # Convert DOSE_VAL_RX to numeric, coercing errors to NaN.
    DOSE_VAL_RX=pd.to_numeric(df['DOSE_VAL_RX'], errors='coerce'),
    # Unparseable timestamps become NaT.
    INTIME=pd.to_datetime(df['INTIME'], errors='coerce'),
)

# Apply one-hot encoding to categorical features; drop_first=True omits the
# first level of each feature.
encoded = pd.get_dummies(encoded, columns=CATEGORICAL_COLUMNS, drop_first=True)

# Split the ICU admission timestamp into separate date and time-of-day columns.
encoded = encoded.assign(
    ICU_DATE=encoded['INTIME'].dt.date,
    ICU_TIME=encoded['INTIME'].dt.time,
)

# Everything succeeded: publish the encoded frame under the name later cells use.
df = encoded

In [26]:
# Summary statistics of the encoded frame. Left as the cell's last expression
# so Jupyter renders it as a table rather than print()'s plain-text dump.
df.describe()

       DOSE_VAL_RX       GENDER                            DOB  \
count  9837.000000  9882.000000                           9882   
mean    380.703390     0.395568  2089-06-20 13:23:21.092896256   
min       1.000000     0.000000            2014-04-27 00:00:00   
25%     250.000000     0.000000            2062-05-25 00:00:00   
50%     500.000000     0.000000            2090-01-03 00:00:00   
75%     500.000000     1.000000            2112-08-15 18:00:00   
max    2000.000000     1.000000            2201-01-11 00:00:00   
std     221.058536     0.488997                            NaN   

                           STARTDATE                         INTIME  \
count                           9842                           9882   
mean   2152-08-18 00:18:08.559236096  2152-09-03 11:58:14.093200384   
min              2100-07-28 00:00:00            2100-07-24 18:51:53   
25%              2128-08-03 00:00:00  2128-08-15 09:53:29.249999872   
50%              2153-08-09 12:00:00  2153-08-11 1

In [27]:
# Preview the first rows of the encoded frame. Left as the cell's last
# expression so Jupyter renders it as a table rather than print()'s plain text.
df.head()

   DOSE_VAL_RX  GENDER        DOB  STARTDATE              INTIME  \
0        500.0       0 2047-04-04 2135-01-31 2135-01-30 20:53:34   
1        250.0       0 2047-04-04 2135-02-01 2135-01-30 20:53:34   
2        250.0       0 2047-04-04 2135-02-02 2135-01-30 20:53:34   
3        500.0       0 2047-04-04 2135-02-02 2135-01-30 20:53:34   
4        250.0       0 2090-08-31 2166-08-10 2166-08-10 00:29:36   

                 DEATHTIME        AGE  DEATH  ROUTE_BOTH EYES  ROUTE_G TUBE  \
0  2135-02-08 02:08:00.000  87.824778      1            False         False   
1  2135-02-08 02:08:00.000  87.827515      1            False         False   
2  2135-02-08 02:08:00.000  87.830253      1            False         False   
3  2135-02-08 02:08:00.000  87.830253      1            False         False   
4                      NaN  75.939767      0            False         False   

   ...  DRUG_Vancomycin 25mg/mL Ophth Soln  DRUG_Vancomycin Enema  \
0  ...                               False     