# Exploring 55K Healthcare Data

In [1]:
import os
from pathlib import Path

import pandas as pd

In [2]:
# Remove pandas' display caps so frames and long cell values are not elided.
# Note: 'width' set to None asks pandas to auto-detect the terminal width; it
# does not mean "unlimited", so wide printed tables may still wrap.
for display_option in ('max_rows', 'max_columns', 'width', 'max_colwidth'):
    pd.set_option(f'display.{display_option}', None)

## Loading the dataset

In [3]:
# Locate the dataset. Set the HEALTHCARE_CSV environment variable to point at
# your own copy of healthcare_dataset.csv; when it is unset, the notebook
# falls back to the path used on the original author's machine.
DEFAULT_CSV_PATH = 'C:\\Users\\frank\\data_science_project_advance\\datasets\\healthcare_dataset.csv'
csv_path = Path(os.environ.get('HEALTHCARE_CSV', DEFAULT_CSV_PATH))

# Fail early with an actionable message instead of a bare path error.
if not csv_path.is_file():
    raise FileNotFoundError(
        f"Dataset not found at {csv_path!s}. "
        "Set the HEALTHCARE_CSV environment variable to the CSV's location."
    )

# Load the CSV file
df = pd.read_csv(csv_path)

## Getting Information about the DataSet

In [4]:
# Print a structural summary of the frame: number of entries, each column's
# non-null count and dtype, and the approximate memory footprint.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 55500 entries, 0 to 55499
Data columns (total 15 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   Name                55500 non-null  object 
 1   Age                 55500 non-null  int64  
 2   Gender              55500 non-null  object 
 3   Blood Type          55500 non-null  object 
 4   Medical Condition   55500 non-null  object 
 5   Date of Admission   55500 non-null  object 
 6   Doctor              55500 non-null  object 
 7   Hospital            55500 non-null  object 
 8   Insurance Provider  55500 non-null  object 
 9   Billing Amount      55500 non-null  float64
 10  Room Number         55500 non-null  int64  
 11  Admission Type      55500 non-null  object 
 12  Discharge Date      55500 non-null  object 
 13  Medication          55500 non-null  object 
 14  Test Results        55500 non-null  object 
dtypes: float64(1), int64(2), object(12)
memory usage: 6.4+ MB

### Printing the first 100 rows

In [5]:
# Number of leading rows to preview; shown in full because the row display
# limit was lifted in the options cell.
PREVIEW_ROWS = 100
df.head(PREVIEW_ROWS)

Unnamed: 0,Name,Age,Gender,Blood Type,Medical Condition,Date of Admission,Doctor,Hospital,Insurance Provider,Billing Amount,Room Number,Admission Type,Discharge Date,Medication,Test Results
0,Bobby JacksOn,30,Male,B-,Cancer,2024-01-31,Matthew Smith,Sons and Miller,Blue Cross,18856.281306,328,Urgent,2024-02-02,Paracetamol,Normal
1,LesLie TErRy,62,Male,A+,Obesity,2019-08-20,Samantha Davies,Kim Inc,Medicare,33643.327287,265,Emergency,2019-08-26,Ibuprofen,Inconclusive
2,DaNnY sMitH,76,Female,A-,Obesity,2022-09-22,Tiffany Mitchell,Cook PLC,Aetna,27955.096079,205,Emergency,2022-10-07,Aspirin,Normal
3,andrEw waTtS,28,Female,O+,Diabetes,2020-11-18,Kevin Wells,"Hernandez Rogers and Vang,",Medicare,37909.78241,450,Elective,2020-12-18,Ibuprofen,Abnormal
4,adrIENNE bEll,43,Female,AB+,Cancer,2022-09-19,Kathleen Hanna,White-White,Aetna,14238.317814,458,Urgent,2022-10-09,Penicillin,Abnormal
5,EMILY JOHNSOn,36,Male,A+,Asthma,2023-12-20,Taylor Newton,Nunez-Humphrey,UnitedHealthcare,48145.110951,389,Urgent,2023-12-24,Ibuprofen,Normal
6,edwArD EDWaRDs,21,Female,AB-,Diabetes,2020-11-03,Kelly Olson,Group Middleton,Medicare,19580.872345,389,Emergency,2020-11-15,Paracetamol,Inconclusive
7,CHrisTInA MARtinez,20,Female,A+,Cancer,2021-12-28,Suzanne Thomas,"Powell Robinson and Valdez,",Cigna,45820.462722,277,Emergency,2022-01-07,Paracetamol,Inconclusive
8,JASmINe aGuIlaR,82,Male,AB+,Asthma,2020-07-01,Daniel Ferguson,Sons Rich and,Cigna,50119.222792,316,Elective,2020-07-14,Aspirin,Abnormal
9,ChRISTopher BerG,58,Female,AB-,Cancer,2021-05-23,Heather Day,Padilla-Walker,UnitedHealthcare,19784.631062,249,Elective,2021-06-22,Paracetamol,Inconclusive


#### Checking for missing values in each column

In [6]:
# Tally the null entries in every column; all zeros means nothing is missing.
missing_per_column = df.isnull().sum()
print(missing_per_column)

Name                  0
Age                   0
Gender                0
Blood Type            0
Medical Condition     0
Date of Admission     0
Doctor                0
Hospital              0
Insurance Provider    0
Billing Amount        0
Room Number           0
Admission Type        0
Discharge Date        0
Medication            0
Test Results          0
dtype: int64


##### Unique Medical Conditions

In [7]:
# Distinct values of the 'Medical Condition' column, in order of first appearance.
unique_conditions = df['Medical Condition'].unique()
print(unique_conditions)

['Cancer' 'Obesity' 'Diabetes' 'Asthma' 'Hypertension' 'Arthritis']


###### Number of patients whose medical condition is Cancer

In [8]:
# Keep only the rows labelled 'Cancer' and count them.
cancer_rows = df[df['Medical Condition'] == 'Cancer']
cancer_count = len(cancer_rows)
print(f"Number of people with Cancer: {cancer_count}")

Number of people with Cancer: 9227


##### Gender classification of the patients

In [9]:
# Distinct values recorded in the 'Gender' column.
gender_values = df['Gender'].unique()
print(gender_values)

['Male' 'Female']


#### Table of all medical conditions and the number of patients with each

In [23]:
# Frequency of each medical condition, most common first.
conditions = df['Medical Condition']
condition_counts = conditions.value_counts()
print(condition_counts)


Medical Condition
Arthritis       9308
Diabetes        9304
Hypertension    9245
Obesity         9231
Cancer          9227
Asthma          9185
Name: count, dtype: int64


##### Average age of patients per medical condition

In [24]:
# Mean patient age within each medical condition.
age_by_condition = df.groupby('Medical Condition')['Age']
avg_age_condition = age_by_condition.mean()
print(avg_age_condition)


Medical Condition
Arthritis       51.565320
Asthma          51.575830
Cancer          51.558795
Diabetes        51.554170
Hypertension    51.741915
Obesity         51.240277
Name: Age, dtype: float64


#### Average billing amount per medical condition

In [25]:
# Mean billing amount per medical condition, ordered from highest to lowest.
mean_billing = df.groupby('Medical Condition')['Billing Amount'].mean()
avg_billing = mean_billing.sort_values(ascending=False)
print(avg_billing)


Medical Condition
Obesity         25805.971259
Diabetes        25638.405577
Asthma          25635.249359
Arthritis       25497.327056
Hypertension    25497.095761
Cancer          25161.792707
Name: Billing Amount, dtype: float64


##### Code to Group Patients by Age Bracket

In [26]:

# Age bracket edges and their display labels. Intervals are closed on the
# right, so for integer ages the edges below yield 0-18, 19-35, 36-50, ...
bins = [0, 18, 35, 50, 65, 80, 100]
labels = ['0-18', '19-35', '36-50', '51-65', '66-80', '81-100']

# Assign each patient to a bracket. include_lowest=True also closes the first
# interval on the left, so an age of exactly 0 falls in '0-18' instead of
# silently becoming NaN. Ages above 100 lie outside every bin and remain NaN.
df['Age Bracket'] = pd.cut(
    df['Age'], bins=bins, labels=labels, right=True, include_lowest=True
)

# Count patients per bracket; sort_index orders the result by bracket rather
# than by frequency.
age_group_counts = df['Age Bracket'].value_counts().sort_index()

# Display as a table
print(age_group_counts)


Age Bracket
0-18        888
19-35     13644
36-50     12301
51-65     12417
66-80     12215
81-100     4035
Name: count, dtype: int64


##### Average length of hospital stay per condition

In [27]:
# Parse both date columns so they can be subtracted from one another.
for date_col in ('Date of Admission', 'Discharge Date'):
    df[date_col] = pd.to_datetime(df[date_col])

# Stay length in whole days: discharge date minus admission date.
stay_duration = df['Discharge Date'] - df['Date of Admission']
df['Length of Stay'] = stay_duration.dt.days

# Mean stay length for each medical condition.
stay_by_condition = df.groupby('Medical Condition')['Length of Stay']
avg_stay = stay_by_condition.mean()
print(avg_stay)


Medical Condition
Arthritis       15.517404
Asthma          15.696570
Cancer          15.495827
Diabetes        15.422936
Hypertension    15.458626
Obesity         15.464305
Name: Length of Stay, dtype: float64


##### Count of Patients by Blood Type

In [28]:
# Frequency of each blood type, most common first.
blood_types = df['Blood Type']
blood_type_counts = blood_types.value_counts()

# Show the resulting table
print(blood_type_counts)


Blood Type
A-     6969
A+     6956
AB+    6947
AB-    6945
B+     6945
B-     6944
O+     6917
O-     6877
Name: count, dtype: int64


#### Average Billing Amount per Gender

In [30]:
# Average billing amount for each gender, rounded to two decimals.
gender_billing = df.groupby('Gender')['Billing Amount']
avg_billing_by_gender = gender_billing.mean().round(2)

# Show the resulting table
print(avg_billing_by_gender)


Gender
Female    25470.65
Male      25607.86
Name: Billing Amount, dtype: float64


##### Total Billing Amount per Gender

In [31]:
# Sum of all billing amounts for each gender, rounded to two decimals.
gender_billing = df.groupby('Gender')['Billing Amount']
total_billing_by_gender = gender_billing.sum().round(2)

print(total_billing_by_gender)


Gender
Female    7.061993e+08
Male      7.112327e+08
Name: Billing Amount, dtype: float64


In [32]:
# Descriptive statistics of billing amount per gender: count, mean, std, min,
# quartiles (the 50% column is the median) and max, rounded to two decimals.
gender_billing = df.groupby('Gender')['Billing Amount']
billing_stats_by_gender = gender_billing.describe().round(2)

print(billing_stats_by_gender)


          count      mean       std      min       25%       50%       75%  \
Gender                                                                       
Female  27726.0  25470.65  14204.00 -2008.49  13146.27  25425.25  37740.84   
Male    27774.0  25607.86  14218.81 -1660.01  13326.11  25681.04  37891.76   

             max  
Gender            
Female  52764.28  
Male    52271.66  
