# Hospital ER Data - Exploratory Data Analysis

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')
plt.style.use('seaborn-v0_8-whitegrid')
%matplotlib inline

In [None]:
# Load the raw ER visit export into the notebook-wide DataFrame
csv_path = 'Hospital ER_Data.csv'
df = pd.read_csv(csv_path)
n_rows_cols = df.shape
print(f'Dataset Shape: {n_rows_cols}')
# Last expression of the cell: preview the first rows
df.head()

## 1. Data Overview & Info

In [None]:
# Print the column names, non-null counts, dtypes and memory usage of df
df.info()

In [None]:
# Summary statistics for df (by default pandas describes the numeric columns)
df.describe()

In [None]:
# Missing values: per-column null count and share of rows (in percent),
# showing only the columns that actually contain nulls
missing = df.isna().sum()
missing_pct = missing.div(len(df)).mul(100).round(2)
missing_table = pd.DataFrame({'Missing': missing, 'Percentage': missing_pct})
missing_table[missing_table['Missing'] > 0]

In [None]:
# One line per column: its dtype and the number of distinct non-null values
for col, values in df.items():
    print(f'{col}: {values.dtype} | Unique: {values.nunique()}')

## 2. Data Preprocessing

In [None]:
# Parse the admission timestamp once (day-month-year hour:minute strings),
# store it back, then derive the calendar features from the parsed values
admission_ts = pd.to_datetime(df['Patient Admission Date'], format='%d-%m-%Y %H:%M')
df['Patient Admission Date'] = admission_ts
ts_parts = admission_ts.dt
df['Admission_Year'] = ts_parts.year
df['Admission_Month'] = ts_parts.month
df['Admission_Day'] = ts_parts.day
df['Admission_Hour'] = ts_parts.hour
df['Admission_DayOfWeek'] = ts_parts.day_name()
# Preview the frame with the new columns appended
df.head()

## 3. Univariate Analysis

In [None]:
# Gender distribution: absolute counts (bar, left) and shares (pie, right)
gender_counts = df['Patient Gender'].value_counts()
fig, axes = plt.subplots(1, 2, figsize=(12, 5))
bar_ax, pie_ax = axes

gender_counts.plot.bar(ax=bar_ax, color=['steelblue', 'coral', 'green'])
bar_ax.set(title='Gender Distribution', xlabel='Gender')
bar_ax.tick_params(axis='x', rotation=0)

gender_counts.plot.pie(ax=pie_ax, autopct='%1.1f%%')
pie_ax.set_title('Gender Distribution (%)')

plt.tight_layout()
plt.show()

In [None]:
# Age distribution: histogram with KDE (left) and box plot (right)
ages = df['Patient Age']
fig, axes = plt.subplots(1, 2, figsize=(14, 5))
hist_ax, box_ax = axes

sns.histplot(ages, bins=30, kde=True, ax=hist_ax, color='steelblue')
hist_ax.set(title='Age Distribution', xlabel='Age')

sns.boxplot(x=ages, ax=box_ax, color='coral')
box_ax.set_title('Age Box Plot')

plt.tight_layout()
plt.show()

In [None]:
# Race distribution: number of visits per recorded patient race
race_counts = df['Patient Race'].value_counts()
plt.figure(figsize=(12, 5))
ax = race_counts.plot.bar(color='teal')
ax.set(title='Patient Race Distribution', xlabel='Race', ylabel='Count')
# Slant the category labels so long names do not overlap
plt.setp(ax.get_xticklabels(), rotation=45, ha='right')
plt.tight_layout()
plt.show()

In [None]:
# Department Referral distribution: number of visits per referral value
referral_counts = df['Department Referral'].value_counts()
plt.figure(figsize=(12, 5))
ax = referral_counts.plot.bar(color='purple')
ax.set(title='Department Referral Distribution', xlabel='Department', ylabel='Count')
# Slant the category labels so long names do not overlap
plt.setp(ax.get_xticklabels(), rotation=45, ha='right')
plt.tight_layout()
plt.show()

In [None]:
# Wait time distribution: histogram with KDE (left), box plot (right),
# followed by the mean and median printed below the figure
wait_times = df['Patient Waittime']
fig, axes = plt.subplots(1, 2, figsize=(14, 5))
hist_ax, box_ax = axes

sns.histplot(wait_times, bins=30, kde=True, ax=hist_ax, color='orange')
hist_ax.set(title='Wait Time Distribution', xlabel='Wait Time (minutes)')

sns.boxplot(x=wait_times, ax=box_ax, color='lightgreen')
box_ax.set_title('Wait Time Box Plot')

plt.tight_layout()
plt.show()

mean_wait = wait_times.mean()
median_wait = wait_times.median()
print(f"Average Wait Time: {mean_wait:.2f} minutes")
print(f"Median Wait Time: {median_wait:.2f} minutes")

In [None]:
# Satisfaction Score distribution: visits per score, ordered by score value
score_counts = df['Patient Satisfaction Score'].value_counts().sort_index()
plt.figure(figsize=(10, 5))
ax = score_counts.plot.bar(color='green')
ax.set(
    title='Patient Satisfaction Score Distribution',
    xlabel='Satisfaction Score',
    ylabel='Count',
)
plt.tight_layout()
plt.show()

In [None]:
# Admission Flag distribution: number of visits per flag value
flag_counts = df['Patient Admission Flag'].value_counts()
plt.figure(figsize=(8, 5))
ax = flag_counts.plot.bar(color=['steelblue', 'coral'])
ax.set(title='Admission Flag Distribution', xlabel='Admitted', ylabel='Count')
# Keep the flag labels horizontal
plt.xticks(rotation=0)
plt.tight_layout()
plt.show()

## 4. Temporal Analysis

In [None]:
# Admissions by hour
# Bar positions are categorical, so an hour with no visits would simply be
# missing from the axis. Reindex over all 24 hours so empty hours are drawn
# as zero-height bars and the x axis always reads 0..23.
hourly_visits = (
    df['Admission_Hour']
    .value_counts()
    .reindex(range(24), fill_value=0)
)
plt.figure(figsize=(14, 5))
hourly_visits.plot(kind='bar', color='steelblue')
plt.title('ER Visits by Hour of Day')
plt.xlabel('Hour')
plt.ylabel('Number of Visits')
plt.tight_layout()
plt.show()

In [None]:
# Admissions by day of week
# value_counts() orders by frequency, so reindex into calendar order.
# fill_value=0 keeps a weekday with no visits as an explicit zero count
# instead of NaN (which would also turn the counts into floats).
day_order = ['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday']
weekday_visits = (
    df['Admission_DayOfWeek']
    .value_counts()
    .reindex(day_order, fill_value=0)
)
plt.figure(figsize=(12, 5))
weekday_visits.plot(kind='bar', color='coral')
plt.title('ER Visits by Day of Week')
plt.xlabel('Day')
plt.ylabel('Number of Visits')
plt.xticks(rotation=45)
plt.tight_layout()
plt.show()

In [None]:
# Admissions by month
# Bar positions are categorical, so a month with no visits would be dropped
# from the axis. Reindex over months 1..12 so empty months show as zero bars.
monthly_visits = (
    df['Admission_Month']
    .value_counts()
    .reindex(range(1, 13), fill_value=0)
)
plt.figure(figsize=(12, 5))
monthly_visits.plot(kind='bar', color='teal')
plt.title('ER Visits by Month')
plt.xlabel('Month')
plt.ylabel('Number of Visits')
plt.tight_layout()
plt.show()

## 5. Bivariate Analysis

In [None]:
# Wait time by Gender: one box per gender value
plt.figure(figsize=(10, 5))
ax = sns.boxplot(data=df, x='Patient Gender', y='Patient Waittime', palette='Set2')
ax.set_title('Wait Time by Gender')
plt.tight_layout()
plt.show()

In [None]:
# Wait time by Department: one box per referral value
plt.figure(figsize=(14, 6))
ax = sns.boxplot(data=df, x='Department Referral', y='Patient Waittime', palette='Set3')
ax.set_title('Wait Time by Department Referral')
# Slant the department labels so long names do not overlap
plt.setp(ax.get_xticklabels(), rotation=45, ha='right')
plt.tight_layout()
plt.show()

In [None]:
# Satisfaction by Department: only visits that have a recorded score
df_sat = df.dropna(subset=['Patient Satisfaction Score'])
plt.figure(figsize=(14, 6))
ax = sns.boxplot(
    data=df_sat,
    x='Department Referral',
    y='Patient Satisfaction Score',
    palette='coolwarm',
)
ax.set_title('Satisfaction Score by Department')
# Slant the department labels so long names do not overlap
plt.setp(ax.get_xticklabels(), rotation=45, ha='right')
plt.tight_layout()
plt.show()

In [None]:
# Age groups analysis
# Bucket ages into labelled bands and store them in df['Age_Group'].
# pd.cut uses right-closed intervals by default, so with a first edge of 0
# an age of exactly 0 would fall outside every bin and become NaN;
# include_lowest=True closes the first interval on the left. The last band
# is labelled '65+', so its upper edge is left open (np.inf) rather than
# capped at 100, which would silently drop older patients.
age_bin_edges = [0, 18, 35, 50, 65, np.inf]
age_bin_labels = ['0-18', '19-35', '36-50', '51-65', '65+']
df['Age_Group'] = pd.cut(
    df['Patient Age'],
    bins=age_bin_edges,
    labels=age_bin_labels,
    include_lowest=True,
)
plt.figure(figsize=(10, 5))
# sort_index() on the categorical index keeps the bands in age order
df['Age_Group'].value_counts().sort_index().plot(kind='bar', color='purple')
plt.title('Patients by Age Group')
plt.xlabel('Age Group')
plt.ylabel('Count')
plt.tight_layout()
plt.show()

In [None]:
# Wait time by Age Group: one box per band of the df['Age_Group'] column
plt.figure(figsize=(10, 5))
ax = sns.boxplot(data=df, x='Age_Group', y='Patient Waittime', palette='viridis')
ax.set_title('Wait Time by Age Group')
plt.tight_layout()
plt.show()

In [None]:
# Correlation: Wait time vs Satisfaction, using only visits with a score
df_corr = df.dropna(subset=['Patient Satisfaction Score'])
plt.figure(figsize=(10, 6))
ax = sns.scatterplot(
    data=df_corr,
    x='Patient Waittime',
    y='Patient Satisfaction Score',
    alpha=0.5,
)
ax.set_title('Wait Time vs Satisfaction Score')
plt.tight_layout()
plt.show()

# Correlation coefficient between the two plotted columns
wait_col = df_corr['Patient Waittime']
score_col = df_corr['Patient Satisfaction Score']
print(f"Correlation: {wait_col.corr(score_col):.3f}")

## 6. Correlation Heatmap

In [None]:
# Correlation matrix for numeric columns, drawn as an annotated heatmap
numeric_cols = df.select_dtypes(include='number').columns
corr_matrix = df[numeric_cols].corr()
plt.figure(figsize=(10, 8))
sns.heatmap(
    corr_matrix,
    annot=True,
    cmap='coolwarm',
    center=0,  # centre the diverging colormap on zero correlation
    fmt='.2f',
)
plt.title('Correlation Heatmap')
plt.tight_layout()
plt.show()

## 7. Key Insights Summary

In [None]:
# Headline figures for the dataset, printed as a numbered list
admission_dates = df['Patient Admission Date']
wait_times = df['Patient Waittime']
satisfaction = df['Patient Satisfaction Score']
score_is_missing = satisfaction.isna()

print('=== KEY INSIGHTS ===')
print(f"\n1. Total Records: {len(df)}")
print(f"2. Date Range: {admission_dates.min()} to {admission_dates.max()}")
print(f"3. Average Wait Time: {wait_times.mean():.2f} minutes")
print(f"4. Average Satisfaction Score: {satisfaction.mean():.2f}")
# Share of rows whose admission flag compares equal to True
print(f"5. Admission Rate: {df['Patient Admission Flag'].eq(True).mean() * 100:.1f}%")
print(f"6. Most Common Department: {df['Department Referral'].mode()[0]}")
print(f"7. Gender Distribution: {df['Patient Gender'].value_counts().to_dict()}")
print(f"8. Missing Satisfaction Scores: {score_is_missing.sum()} ({score_is_missing.mean() * 100:.1f}%)")