## Data Cleaning

In [24]:
import pandas as pd
# Load the raw healthcare export from the working directory.
data = pd.read_csv('healthcare_dataset_raw.csv')
# NOTE(review): read_csv already returns a DataFrame, so this re-wrap is redundant;
# `df` is the frame every later cell reads and modifies.
df = pd.DataFrame(data)

# Preview the first five rows (names arrive with random capitalization).
df.head()

Unnamed: 0,Name,Age,Gender,Blood Type,Medical Condition,Date of Admission,Doctor,Hospital,Insurance Provider,Billing Amount,Room Number,Admission Type,Discharge Date,Medication,Test Results
0,Bobby JacksOn,30,Male,B-,Cancer,2024-01-31,Matthew Smith,Sons and Miller,Blue Cross,18856.281306,328,Urgent,2024-02-02,Paracetamol,Normal
1,LesLie TErRy,62,Male,A+,Obesity,2019-08-20,Samantha Davies,Kim Inc,Medicare,33643.327287,265,Emergency,2019-08-26,Ibuprofen,Inconclusive
2,DaNnY sMitH,76,Female,A-,Obesity,2022-09-22,Tiffany Mitchell,Cook PLC,Aetna,27955.096079,205,Emergency,2022-10-07,Aspirin,Normal
3,andrEw waTtS,28,Female,O+,Diabetes,2020-11-18,Kevin Wells,"Hernandez Rogers and Vang,",Medicare,37909.78241,450,Elective,2020-12-18,Ibuprofen,Abnormal
4,adrIENNE bEll,43,Female,AB+,Cancer,2022-09-19,Kathleen Hanna,White-White,Aetna,14238.317814,458,Urgent,2022-10-09,Penicillin,Abnormal


In [25]:
# Check the data types of the columns.
# Both date columns load as `object` (plain strings), so they need converting
# before any date arithmetic.
df.dtypes

Name                   object
Age                     int64
Gender                 object
Blood Type             object
Medical Condition      object
Date of Admission      object
Doctor                 object
Hospital               object
Insurance Provider     object
Billing Amount        float64
Room Number             int64
Admission Type         object
Discharge Date         object
Medication             object
Test Results           object
dtype: object

In [26]:
# Checking which distinct values the 'Gender' column holds and how often each occurs
df['Gender'].value_counts()

Gender
Male      27774
Female    27726
Name: count, dtype: int64

In [27]:
# Checking which distinct values the 'Medication' column holds and how often each occurs
df['Medication'].value_counts()

Medication
Lipitor        11140
Ibuprofen      11127
Aspirin        11094
Paracetamol    11071
Penicillin     11068
Name: count, dtype: int64

In [28]:
# Checking which distinct values the 'Test Results' column holds and how often each occurs
df['Test Results'].value_counts()

Test Results
Abnormal        18627
Normal          18517
Inconclusive    18356
Name: count, dtype: int64

In [29]:
# Round the 'Billing Amount' column to 2 decimal places
# (the raw values carry six decimals, e.g. 18856.281306).
df['Billing Amount'] = df['Billing Amount'].round(2)

In [30]:
# Preview the first rows to confirm 'Billing Amount' is now rounded.
df.head()

Unnamed: 0,Name,Age,Gender,Blood Type,Medical Condition,Date of Admission,Doctor,Hospital,Insurance Provider,Billing Amount,Room Number,Admission Type,Discharge Date,Medication,Test Results
0,Bobby JacksOn,30,Male,B-,Cancer,2024-01-31,Matthew Smith,Sons and Miller,Blue Cross,18856.28,328,Urgent,2024-02-02,Paracetamol,Normal
1,LesLie TErRy,62,Male,A+,Obesity,2019-08-20,Samantha Davies,Kim Inc,Medicare,33643.33,265,Emergency,2019-08-26,Ibuprofen,Inconclusive
2,DaNnY sMitH,76,Female,A-,Obesity,2022-09-22,Tiffany Mitchell,Cook PLC,Aetna,27955.1,205,Emergency,2022-10-07,Aspirin,Normal
3,andrEw waTtS,28,Female,O+,Diabetes,2020-11-18,Kevin Wells,"Hernandez Rogers and Vang,",Medicare,37909.78,450,Elective,2020-12-18,Ibuprofen,Abnormal
4,adrIENNE bEll,43,Female,AB+,Cancer,2022-09-19,Kathleen Hanna,White-White,Aetna,14238.32,458,Urgent,2022-10-09,Penicillin,Abnormal


In [31]:
# Normalizing the Name column: the raw names use random capitalization
# (e.g. "LesLie TErRy"), so title-case every word ("Leslie Terry").
df['Name'] = df['Name'].str.title()

In [32]:
# Fixing some naming issues left over after title-casing:
#   * the professional suffixes "Md" and "Dds" are dropped,
#   * "Jr." loses its trailing period,
#   * the roman numeral "Iii" gets its capitals back.
# Each rule is an explicit regular expression (regex=True) so the behaviour does not
# depend on the installed pandas version's default for `regex`.
# \b keeps a rule from firing inside a longer word (e.g. a surname that merely
# starts with "Md"), and \s* in the "Jr." rule swallows the space that already
# precedes the suffix, so the replacement ' Jr' no longer leaves a double space.
name_fixes = [
    (r'\s+(?:Md|Dds)\b', ''),
    (r'\s*Jr\.', ' Jr'),
    (r'\s+Iii\b', ' III'),
]

cleaned_names = df['Name']
for pattern, replacement in name_fixes:
    cleaned_names = cleaned_names.str.replace(pattern, replacement, regex=True)
df['Name'] = cleaned_names

In [33]:
# Preview the first rows to confirm the names are now consistently capitalized.
df.head()

Unnamed: 0,Name,Age,Gender,Blood Type,Medical Condition,Date of Admission,Doctor,Hospital,Insurance Provider,Billing Amount,Room Number,Admission Type,Discharge Date,Medication,Test Results
0,Bobby Jackson,30,Male,B-,Cancer,2024-01-31,Matthew Smith,Sons and Miller,Blue Cross,18856.28,328,Urgent,2024-02-02,Paracetamol,Normal
1,Leslie Terry,62,Male,A+,Obesity,2019-08-20,Samantha Davies,Kim Inc,Medicare,33643.33,265,Emergency,2019-08-26,Ibuprofen,Inconclusive
2,Danny Smith,76,Female,A-,Obesity,2022-09-22,Tiffany Mitchell,Cook PLC,Aetna,27955.1,205,Emergency,2022-10-07,Aspirin,Normal
3,Andrew Watts,28,Female,O+,Diabetes,2020-11-18,Kevin Wells,"Hernandez Rogers and Vang,",Medicare,37909.78,450,Elective,2020-12-18,Ibuprofen,Abnormal
4,Adrienne Bell,43,Female,AB+,Cancer,2022-09-19,Kathleen Hanna,White-White,Aetna,14238.32,458,Urgent,2022-10-09,Penicillin,Abnormal


In [34]:
# The room number is not used anywhere downstream, so remove that column.
df = df.drop('Room Number', axis='columns')

In [35]:
# Converting the 'Date of Admission' column from strings to datetime
# so it can be used in date arithmetic.
df['Date of Admission'] = pd.to_datetime(df['Date of Admission'])

In [36]:
# Converting the 'Discharge Date' column from strings to datetime
# so it can be used in date arithmetic.
df['Discharge Date'] = pd.to_datetime(df['Discharge Date'])

In [37]:
# Length of stay = discharge date minus admission date.
# The subtraction yields a timedelta per row; `.dt.days` keeps only the
# whole-day count so the column ends up as a plain number.
stay_duration = df['Discharge Date'] - df['Date of Admission']
df['Length of Stay'] = stay_duration.dt.days

In [38]:
# Detecting readmissions: duplicated() marks the second and later rows that share
# a Name as True and leaves the first row for each name False.
# NOTE(review): patients are identified by name alone, so two different people with
# the same name would be treated as one returning patient — confirm this is acceptable.
df['Readmitted'] = df['Name'].duplicated()

In [39]:
# Count how many rows (hospital visits) each patient name has and store that
# total on every one of the patient's rows.
# NOTE(review): the value is the total number of visits, so a patient seen once
# gets 1 even though the column is called '# of Times Readmitted'.
visits_per_name = df['Name'].value_counts()
df['# of Times Readmitted'] = df['Name'].map(visits_per_name)

In [40]:
# Saving the cleaned data to a new csv file (index=False leaves the row index out).
# NOTE(review): the last cell of this notebook writes to the same path, so on a
# full run this intermediate snapshot is overwritten.
df.to_csv('healthcare_dataset_cleaned.csv', index=False)

In [41]:
import random

# Fixed seed so every `random`-based column below is reproducible when the
# notebook is re-run from a fresh kernel.
random.seed(42)

# Widen the set of medical conditions beyond the handful in the raw data.
# Each condition gets a hand-picked relative weight (one per entry, same order):
# common conditions first, followed by a long tail of rare ones.
conditions = [
    'Diabetes', 'Hypertension', 'Asthma', 'Cancer', 'HIV', 'Malaria', 'Tuberculosis', 'Pneumonia', 'Bronchitis', 'Arthritis',
    'Osteoporosis', 'Heart Disease', 'Stroke', 'Kidney Disease', 'Liver Disease', 'Obesity', 'Anemia', 'Epilepsy', 'Migraine',
    "Alzheimer's Disease", 'Parkinson\'s Disease', 'Multiple Sclerosis', 'Lupus', 'Sickle Cell Disease', 'Cystic Fibrosis',
    'Chronic Fatigue Syndrome', 'Fibromyalgia', 'Rheumatoid Arthritis', 'Chronic Obstructive Pulmonary Disease', 'Endometriosis',
    'Polycystic Ovary Syndrome', 'Hypothyroidism', 'Hyperthyroidism', 'Gout', 'Glaucoma', 'Cataracts', 'Macular Degeneration',
    'Retinal Detachment', 'Retinitis Pigmentosa', 'Color Blindness', 'Hearing Loss', 'Tinnitus', 'Vertigo', 'Meniere\'s Disease',
    'Motion Sickness', 'Insomnia', 'Sleep Apnea', 'Narcolepsy'
]

# The weights do not sum to 1; random.choices treats them as relative weights.
weights = [
    0.200, 0.150, 0.100, 0.050, 0.040, 0.030, 0.025, 0.020, 0.018, 0.017,
    0.016, 0.015, 0.014, 0.013, 0.012, 0.011, 0.010, 0.009, 0.008, 0.007,
    0.006, 0.005, 0.004, 0.003, 0.002, 0.001, 0.001, 0.001, 0.001, 0.001,
    0.001, 0.001, 0.001, 0.001, 0.001, 0.001, 0.001, 0.001, 0.001, 0.001,
    0.001, 0.001, 0.001, 0.001, 0.001, 0.001, 0.001, 0.001
]

# Draw one condition per row, independently of the patient name.
df['Medical Condition'] = random.choices(conditions, weights=weights, k=len(df))

In [42]:
# Replace the hospitals in the dataset with 10 fake hospital names.
# The weights are unequal so some hospitals receive more visits than others.
hospitals = [
    "Riverside General Hospital", "Summit Medical Center", "Grandview Health System",
    "Oakwood Community Hospital", "Evergreen Regional Medical Center", "Harmony Valley Hospital",
    "Crestview Medical Institute", "Pioneer Healthcare Center", "Northgate Memorial Hospital",
    "Sunridge Medical Plaza"
]
# One weight per hospital, in the same order as the list above.
weights = [0.172, 0.145, 0.133, 0.121, 0.110, 0.098, 0.087, 0.065, 0.037, 0.032]
# Draw one hospital per row.
df['Hospital'] = random.choices(hospitals, weights=weights, k=len(df))


In [43]:
# Make the admission types unevenly distributed instead of all having equal shares.

# Define admission types and their corresponding weights (same order in both lists)
admission_types = ['Emergency', 'Urgent', 'Elective', 'Newborn', 'Trauma']
weights = [0.421, 0.179, 0.222, 0.128, 0.050]

# Generate one admission type per row based on the specified weights
df['Admission Type'] = random.choices(admission_types, weights=weights, k=len(df))

In [44]:
# Discard the original doctor names; a later cell fills 'Doctor' with
# names drawn from a fixed list.
df = df.drop('Doctor', axis='columns')

In [45]:
# Randomizing the blood type column.
# The blood type is drawn once per distinct patient name, so every visit of the
# same patient carries the same blood type.
import numpy as np

# Fixed seed so the NumPy-based random columns are reproducible when the
# notebook is re-run from a fresh kernel.
np.random.seed(42)

# Blood types and the probability of each (same order; probabilities sum to 1).
blood_types = ['A+', 'A-', 'B+', 'B-', 'AB+', 'AB-', 'O+', 'O-']
blood_type_weights = [0.34, 0.06, 0.09, 0.02, 0.04, 0.01, 0.37, 0.07]

# Draw all blood types in a single vectorized call (one draw per unique name)
# instead of calling np.random.choice once per name in a Python loop.
unique_names = df['Name'].unique()
drawn_blood_types = np.random.choice(blood_types, size=len(unique_names), p=blood_type_weights)

# Dictionary storing the blood type chosen for each name
name_to_blood_type = dict(zip(unique_names, drawn_blood_types))

# Map the blood type to each record based on the name
df['Blood Type'] = df['Name'].map(name_to_blood_type)

In [46]:
# Gender categories and the probability of each (same order; probabilities sum to 1).
genders = ['Male', 'Female', 'Other', 'Non-Conforming']
gender_weights = [0.42, 0.39, 0.08, 0.11]

# One weighted draw per distinct patient name, so all visits of the same
# patient share a gender. The draws are paired with the names in order.
unique_names = df['Name'].unique()
drawn_genders = np.random.choice(genders, size=len(unique_names), p=gender_weights)
name_to_gender = dict(zip(unique_names, drawn_genders))

# Look up each record's gender through its patient name.
df['Gender'] = df['Name'].map(name_to_gender)

In [47]:
# Re-draw patient ages so most of the mass sits in the 40-70 range.
# Ages fall between 0 and 99 and are assigned per patient name, not per row.
import numpy as np

# Ten-year age buckets (inclusive bounds) and the probability of landing in each.
age_ranges = [(0, 9), (10, 19), (20, 29), (30, 39), (40, 49), (50, 59), (60, 69), (70, 79), (80, 89), (90, 99)]
age_weights = [0.10, 0.05, 0.05, 0.05, 0.10, 0.20, 0.20, 0.15, 0.08, 0.02]

# Age chosen for each distinct patient name.
name_to_age = {}

for patient_name in df['Name'].unique():
    # First pick a bucket by weight, then a uniform age inside that bucket.
    bucket_index = np.random.choice(len(age_ranges), p=age_weights)
    youngest, oldest = age_ranges[bucket_index]
    # randint excludes its upper bound, hence the +1 to keep `oldest` reachable.
    name_to_age[patient_name] = np.random.randint(youngest, oldest + 1)

# Give every record the age assigned to its patient name.
df['Age'] = df['Name'].map(name_to_age)

In [48]:
import numpy as np


def _scale_to_range(values, low=100, high=90000):
    """Linearly rescale `values` so its minimum maps to `low` and its maximum to `high`.

    An empty array is returned unchanged, and an array whose values are all equal
    is mapped to `low`, so tiny samples cannot trigger a division by zero.
    """
    if values.size == 0:
        return values
    span = values.max() - values.min()
    if span == 0:
        return np.full_like(values, low)
    return low + (values - values.min()) * (high - low) / span


# Split the rows into ~5% "outlier" draws and the remaining main draws.
# The main count is the remainder rather than int(len(df) * 0.95): truncating both
# shares separately can add up to fewer than len(df) rows, which would make the
# column assignment below fail with a length mismatch.
n_rows = len(df)
n_outliers = int(n_rows * 0.05)
n_main = n_rows - n_outliers

# Generate a log-normal distribution for the main billing amounts and
# scale the values to the desired range (100 to 90,000)
main_billing_amounts = _scale_to_range(np.random.lognormal(mean=10, sigma=1, size=n_main))

# Generate the outlier sample from a log-normal with a higher mean and scale it
# to the same range (100 to 90,000).
# NOTE(review): because both samples are min-max scaled onto the same interval,
# the "outliers" never exceed the main range — confirm that is intended.
outliers = _scale_to_range(np.random.lognormal(mean=12, sigma=1, size=n_outliers))

# Combine the main billing amounts and outliers
billing_amounts = np.concatenate([main_billing_amounts, outliers])

# Shuffle the billing amounts to mix the outliers with the main data
np.random.shuffle(billing_amounts)

# Assign the billing amounts, rounded to 2 decimal places, to the DataFrame
df['Billing Amount'] = billing_amounts.round(2)

In [49]:
# Add in a column for the doctor's name: one of ten fixed names per row,
# drawn with equal probability (no weights are passed).
doctors = ['Dr. John Doe', 'Dr. Jane Smith', 'Dr. Michael Johnson', 'Dr. Sarah Williams', 'Dr. Robert Brown', 'Dr. Lisa Jones', 'Dr. William Davis', 'Dr. Karen Miller', 'Dr. Richard Wilson', 'Dr. Nancy Moore']
df['Doctor'] = random.choices(doctors, k=len(df))

In [50]:
# Insurance companies and their corresponding weights (same order in both lists)
insurance_companies = ['Aetna', 'Anthem', 'Blue Cross Blue Shield', 'Cigna', 'Humana', 'UnitedHealthcare', 'Kaiser Permanente', 'Molina Healthcare', 'Centene', 'Oscar Health']
weights = [0.20, 0.18, 0.15, 0.12, 0.10, 0.08, 0.07, 0.05, 0.03, 0.02]

# Generate one insurance provider per row based on the specified weights
df['Insurance Provider'] = random.choices(insurance_companies, weights=weights, k=len(df))

In [51]:
# Race categories with their relative weights (same order in both lists).
# NOTE(review): the weights add up to 1.45, not 1; random.choices treats them as
# relative weights, so the effective share of 'White' is 0.61 / 1.45, about 42%.
races = ['Asian', 'Black', 'Hispanic', 'White', 'Other']
race_weights = [0.27, 0.21, 0.25, 0.61, 0.11]

# One weighted draw per distinct patient name, so all visits of the same
# patient share a race. The draws are paired with the names in order.
unique_names = df['Name'].unique()
drawn_races = random.choices(races, weights=race_weights, k=len(unique_names))
name_to_race = dict(zip(unique_names, drawn_races))

# Look up each record's race through its patient name.
df['Race'] = df['Name'].map(name_to_race)

In [52]:
# Changing the distribution of the test results so they are no longer roughly equal
# Define possible test results and their weights (same order in both lists)
test_results = ['Normal', 'Abnormal', 'Inconclusive']
weights = [0.70, 0.20, 0.10]

# Generate one test result per row based on the specified weights
df['Test Results'] = random.choices(test_results, weights=weights, k=len(df))

In [53]:
# Adding in a patient outcome column with two possible values
outcomes = ['Discharged', 'Transferred']
weights = [0.87, 0.13]

# Generate one patient outcome per row based on the specified weights
df['Patient Outcome'] = random.choices(outcomes, weights=weights, k=len(df))

In [None]:
# Adding doctor column
# NOTE(review): this cell has no execution count and repeats the earlier
# "doctor's name" cell verbatim; running it re-draws every 'Doctor' value.
# Consider deleting one of the two cells.
doctors = ['Dr. John Doe', 'Dr. Jane Smith', 'Dr. Michael Johnson', 'Dr. Sarah Williams', 'Dr. Robert Brown', 'Dr. Lisa Jones', 'Dr. William Davis', 'Dr. Karen Miller', 'Dr. Richard Wilson', 'Dr. Nancy Moore']
df['Doctor'] = random.choices(doctors, k=len(df))

In [56]:
# Changing the readmitted column to say "Readmitted" or "Not Readmitted".
# A patient counts as readmitted when their name appears on more than one row,
# and every row of such a patient gets the "Readmitted" label.
#
# The label is derived directly from 'Name' rather than from the current
# contents of 'Readmitted'. Mapping the existing column only works while it
# still holds booleans: re-running the cell would turn the strings into NaN
# and label everyone "Not Readmitted". duplicated(keep=False) flags all rows
# of a repeated name in one vectorized pass.
is_repeat_patient = df['Name'].duplicated(keep=False)
name_readmitted = is_repeat_patient.map({True: 'Readmitted', False: 'Not Readmitted'})

# Store the per-row label back on the DataFrame
df['Readmitted'] = name_readmitted

In [57]:
# Saving the updated dataset to a csv file (index=False leaves the row index out).
# This writes to the same path as the earlier save cell and replaces that file.
df.to_csv('healthcare_dataset_cleaned.csv', index=False)