In [3]:
from pathlib import Path

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

In [4]:
# File locations used by the notebook: the raw input CSV and the two places
# the cleaned CSV is written to.
# NOTE(review): this is an absolute, machine-specific path; consider making the
# data root configurable so the notebook runs on other machines.
data_root = r"C:\Documents\GitHubRepos\hospital-emergency-room\data"

input_path = rf"{data_root}\raw\Hospital ER_Data.csv"
cleaned_path = rf"{data_root}\cleaned\Hospital_ER_Cleaned.csv"
load_path = rf"{data_root}\load\Hospital_ER_Cleaned.csv"

In [5]:
# Load the raw dataset into `df`.
# A missing file is reported and then re-raised: every later cell needs `df`,
# so carrying on would only defer the failure to a confusing NameError.
try:
    df = pd.read_csv(input_path)
except FileNotFoundError:
    print(f"File not found: {input_path}")
    raise
else:
    # Only reached when read_csv completed without raising.
    print("Data loaded successfully.")


Data loaded successfully.


In [6]:
# Report where the dataset has nulls. The boolean null mask is built once and
# both summaries below are derived from it.
null_mask = df.isnull()

# Labels of the columns that contain at least one null value
missing_columns = df.columns[null_mask.any()]
print("Columns with missing values:")
print(missing_columns)

# Share of null entries per column, expressed as a percentage
missing_percentage = null_mask.mean() * 100
print("\nPercentage of missing values:")
print(missing_percentage)


Columns with missing values:
Index(['Department Referral', 'Patient Satisfaction Score'], dtype='object')

Percentage of missing values:
Patient Id                     0.000000
Patient Admission Date         0.000000
Patient First Inital           0.000000
Patient Last Name              0.000000
Patient Gender                 0.000000
Patient Age                    0.000000
Patient Race                   0.000000
Department Referral           58.593750
Patient Admission Flag         0.000000
Patient Satisfaction Score    72.688802
Patient Waittime               0.000000
Patients CM                    0.000000
dtype: float64


In [7]:
# Impute the two columns that contain nulls.
# NOTE(review): the missing-value report printed earlier shows these columns are
# roughly 59% and 73% null, so most values after this cell are imputed rather
# than observed - confirm this is acceptable for downstream analysis.
# NOTE(review): pandas' default CSV parsing treats the literal text "None" as
# missing; if the raw file uses "None" to mean "no referral", filling with the
# mode mislabels those rows - TODO confirm against the raw data.
referral_col = 'Department Referral'
score_col = 'Patient Satisfaction Score'

# Categorical column: replace nulls with the most frequent referral
most_common_referral = df[referral_col].mode()[0]
df[referral_col] = df[referral_col].fillna(most_common_referral)

# Numeric column: replace nulls with the mean of the observed scores
mean_score = df[score_col].mean()
df[score_col] = df[score_col].fillna(mean_score)

# Re-count nulls per column to verify the imputation
missing_values_after_imputation = df.isnull().sum()
print("Missing values after imputation:")
print(missing_values_after_imputation)


Missing values after imputation:
Patient Id                    0
Patient Admission Date        0
Patient First Inital          0
Patient Last Name             0
Patient Gender                0
Patient Age                   0
Patient Race                  0
Department Referral           0
Patient Admission Flag        0
Patient Satisfaction Score    0
Patient Waittime              0
Patients CM                   0
dtype: int64


In [8]:
# Convert column dtypes: the admission date becomes a datetime (parsed
# day-first) and gender / race become pandas categoricals.
# A failure is reported and then re-raised so the notebook stops here instead
# of continuing - and later saving - data that was only partially converted.
try:
    # Parse 'Patient Admission Date' with the day before the month
    df['Patient Admission Date'] = pd.to_datetime(df['Patient Admission Date'], dayfirst=True)

    # Store 'Patient Gender' and 'Patient Race' as categorical columns
    df['Patient Gender'] = df['Patient Gender'].astype('category')
    df['Patient Race'] = df['Patient Race'].astype('category')

except Exception as e:
    # Surface the error message, then propagate the original exception
    print(f"Data type conversion failed: {e}")
    raise

else:
    # Only reached when every conversion above succeeded
    print("Data types converted successfully.")


Data types converted successfully.


In [9]:
# Derive the 'Age Group' column from 'Patient Age'.
# The bins are right-closed so each interval matches its label exactly:
#   [0, 18] -> '0-18', (18, 30] -> '19-30', (30, 45] -> '31-45',
#   (45, 60] -> '46-60', (60, inf) -> '60+'
# (with left-closed bins, ages 18, 30 and 45 landed in the next label up).
# Consequently '60+' means strictly older than 60; age 60 belongs to '46-60'.
try:
    # The open-ended top edge keeps ages of 100 and above in '60+' instead of NaN
    bins = [0, 18, 30, 45, 60, np.inf]
    labels = ['0-18', '19-30', '31-45', '46-60', '60+']
    df['Age Group'] = pd.cut(
        df['Patient Age'],
        bins=bins,
        labels=labels,
        right=True,
        include_lowest=True,  # keep age 0 inside the first bin
    )

except Exception as e:
    # Surface the error message, then propagate so later cells do not run
    # without the 'Age Group' column
    print(f"Failed to create Age Group column: {e}")
    raise

else:
    # Only reached when the column was created
    print("Age Group column created successfully.")


Age Group column created successfully.


In [10]:
# Write the cleaned frame to both output locations, overwriting any existing
# file. Parent directories are created first so a fresh checkout without the
# output folders does not fail on the write.
# NOTE: CSV does not store dtypes, so the datetime / category conversions must
# be re-applied by whatever reads these files back.
for output_path in (cleaned_path, load_path):
    Path(output_path).parent.mkdir(parents=True, exist_ok=True)
    df.to_csv(output_path, index=False)
    print(f"Cleaned data saved to: {output_path}")

# List the columns that were written (previously mislabelled as a save path)
print(f"Columns in cleaned data: {df.columns.tolist()}")


Cleaned data saved to: C:\Documents\GitHubRepos\hospital-emergency-room\data\cleaned\Hospital_ER_Cleaned.csv
Cleaned data saved to: C:\Documents\GitHubRepos\hospital-emergency-room\data\load\Hospital_ER_Cleaned.csv
Cleaned data saved to: Index(['Patient Id', 'Patient Admission Date', 'Patient First Inital',
       'Patient Last Name', 'Patient Gender', 'Patient Age', 'Patient Race',
       'Department Referral', 'Patient Admission Flag',
       'Patient Satisfaction Score', 'Patient Waittime', 'Patients CM',
       'Age Group'],
      dtype='object')
