# Patient Satisfaction

## Import Libraries and Datasets

In [1]:
# Import functions from the script
from scripts.load_data import load_hospital_data, load_state_region_data
from scripts.save_data import save_to_csv
from scripts.clean_data import clean_pivot_data, exclude_nulls
from scripts.merge_data import merge_state_region_data
from scripts.transform_data import update_measure_id, add_answer_column, pivot_data, reduce_columns, rename_features
from scripts.calculate_data import calculate_linear_scores

In [2]:
# Load the data
df = load_hospital_data("../data/raw/HCAHPS-Hospital.csv")

# Get the number of rows and columns
print('Dataset for 2023, number of rows and columns: ')
print(df.shape)
print('')

# Get the first 5 rows
print('First 5 rows: ')
print(df.head())
print('')

Dataset for 2023, number of rows and columns: 
(447516, 23)

First 5 rows: 
  Facility ID                    Facility Name                 Address  \
0      010001  SOUTHEAST HEALTH MEDICAL CENTER  1108 ROSS CLARK CIRCLE   
1      010001  SOUTHEAST HEALTH MEDICAL CENTER  1108 ROSS CLARK CIRCLE   
2      010001  SOUTHEAST HEALTH MEDICAL CENTER  1108 ROSS CLARK CIRCLE   
3      010001  SOUTHEAST HEALTH MEDICAL CENTER  1108 ROSS CLARK CIRCLE   
4      010001  SOUTHEAST HEALTH MEDICAL CENTER  1108 ROSS CLARK CIRCLE   

  City/Town State ZIP Code County/Parish Telephone Number  \
0    DOTHAN    AL    36301       HOUSTON   (334) 793-8701   
1    DOTHAN    AL    36301       HOUSTON   (334) 793-8701   
2    DOTHAN    AL    36301       HOUSTON   (334) 793-8701   
3    DOTHAN    AL    36301       HOUSTON   (334) 793-8701   
4    DOTHAN    AL    36301       HOUSTON   (334) 793-8701   

       HCAHPS Measure ID                                    HCAHPS Question  \
0           H_COMP_1_A_P  Patient

## Data Preparation

### Update Measure IDs

In [3]:
# Update Measure Id values
df = update_measure_id(df)

# Verify first 5 rows after renaming columns
print('First 5 rows after updating Measure Id: ')
print(df.head())
print('')

First 5 rows after updating Measure Id: 
  Facility ID                    Facility Name                 Address  \
0      010001  SOUTHEAST HEALTH MEDICAL CENTER  1108 ROSS CLARK CIRCLE   
1      010001  SOUTHEAST HEALTH MEDICAL CENTER  1108 ROSS CLARK CIRCLE   
2      010001  SOUTHEAST HEALTH MEDICAL CENTER  1108 ROSS CLARK CIRCLE   
3      010001  SOUTHEAST HEALTH MEDICAL CENTER  1108 ROSS CLARK CIRCLE   
4      010001  SOUTHEAST HEALTH MEDICAL CENTER  1108 ROSS CLARK CIRCLE   

  City/Town State ZIP Code County/Parish Telephone Number  \
0    DOTHAN    AL    36301       HOUSTON   (334) 793-8701   
1    DOTHAN    AL    36301       HOUSTON   (334) 793-8701   
2    DOTHAN    AL    36301       HOUSTON   (334) 793-8701   
3    DOTHAN    AL    36301       HOUSTON   (334) 793-8701   
4    DOTHAN    AL    36301       HOUSTON   (334) 793-8701   

         HCAHPS Measure ID                                    HCAHPS Question  \
0           NURSE_COMM_A_P  Patients who reported that their nurse

### Add "HCAHPS Answer" Column

In [4]:
# Create 'HCAHPS Answer' column
df = add_answer_column(df)

# Verify columns in df after creating 'HCAHPS Answer' column
print('Columns in df after creating "HCAHPS Answer" column: ')
print(df.columns)
print('')

Columns in df after creating "HCAHPS Answer" column: 
Index(['Facility ID', 'Facility Name', 'Address', 'City/Town', 'State',
       'ZIP Code', 'County/Parish', 'Telephone Number', 'HCAHPS Measure ID',
       'HCAHPS Question', 'HCAHPS Answer Description',
       'Patient Survey Star Rating', 'Patient Survey Star Rating Footnote',
       'HCAHPS Answer Percent', 'HCAHPS Answer Percent Footnote',
       'HCAHPS Linear Mean Value', 'Number of Completed Surveys',
       'Number of Completed Surveys Footnote', 'Survey Response Rate Percent',
       'Survey Response Rate Percent Footnote', 'Start Date', 'End Date',
       'Year', 'HCAHPS Answer'],
      dtype='object')



### Generate Pivot Table

In [5]:
# Pivot the data
pivot_df = pivot_data(df)

In [6]:
# Verify columns in pivot_df after pivoting the data
print('Columns in pivot_df after pivoting the data: ')
print(pivot_df.columns)
print('')

Columns in pivot_df after pivoting the data: 
Index(['Facility ID', 'Facility Name', 'Address', 'City/Town', 'State',
       'ZIP Code', 'County/Parish', 'Telephone Number',
       'Number of Completed Surveys', 'Survey Response Rate Percent',
       ...
       'H_HSP_RATING_7_8', 'H_HSP_RATING_9_10', 'H_HSP_RATING_LINEAR_SCORE',
       'H_HSP_RATING_STAR_RATING', 'H_RECMND_DN', 'H_RECMND_DY', 'H_RECMND_PY',
       'H_RECMND_LINEAR_SCORE', 'H_RECMND_STAR_RATING', 'H_STAR_RATING'],
      dtype='object', name='HCAHPS Measure ID', length=104)



### Convert to Numeric Values

In [7]:
# Clean the data
cleaned_pivot_df = clean_pivot_data(pivot_df)
print('Columns in pivot_df after cleaning the data: ')
print(cleaned_pivot_df.info(verbose=True))

Columns in pivot_df after cleaning the data: 
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4812 entries, 0 to 4811
Data columns (total 104 columns):
 #    Column                        Dtype 
---   ------                        ----- 
 0    Facility ID                   object
 1    Facility Name                 object
 2    Address                       object
 3    City/Town                     object
 4    State                         object
 5    ZIP Code                      object
 6    County/Parish                 object
 7    Telephone Number              object
 8    Number of Completed Surveys   Int64 
 9    Survey Response Rate Percent  Int64 
 10   Year                          Int64 
 11   NURSE_COMM_A_P                Int64 
 12   NURSE_COMM_SN_P               Int64 
 13   NURSE_COMM_U_P                Int64 
 14   NURSE_COMM_LINEAR_SCORE       Int64 
 15   NURSE_COMM_STAR_RATING        Int64 
 16   H_NURSE_RESPECT_A_P           Int64 
 17   H_NURSE_RESPECT_SN_P   

### Handle missing data

In [8]:
## Show Initial Information
print("Number of hospitals in dataset: ")
print(len(cleaned_pivot_df),"\n")

print("Null values per columns: ")
print(cleaned_pivot_df.isnull().sum()[cleaned_pivot_df.isnull().sum() > 0],"\n")

# Exclude null values
cleaned2_pivot_df = exclude_nulls(cleaned_pivot_df)

## Show Information after excluding nulls
print("Number of hospitals after excluding nulls: ")
print(len(cleaned2_pivot_df),"\n")

print("Null values per columns after excluding nulls: ")
print(cleaned2_pivot_df.isnull().sum()[cleaned2_pivot_df.isnull().sum() > 0],"\n")

Number of hospitals in dataset: 
4812 

Null values per columns: 
HCAHPS Measure ID
Number of Completed Surveys      716
Survey Response Rate Percent     716
NURSE_COMM_A_P                   716
NURSE_COMM_SN_P                  716
NURSE_COMM_U_P                   716
                                ... 
H_RECMND_DY                      716
H_RECMND_PY                      716
H_RECMND_LINEAR_SCORE           1554
H_RECMND_STAR_RATING            1554
H_STAR_RATING                   1554
Length: 95, dtype: int64 

Number of hospitals after excluding nulls: 
3751 

Null values per columns after excluding nulls: 
HCAHPS Measure ID
NURSE_COMM_LINEAR_SCORE        493
NURSE_COMM_STAR_RATING         493
DOCTOR_COMM_LINEAR_SCORE       493
DOCTOR_COMM_STAR_RATING        493
STAFF_RESPON_LINEAR_SCORE      493
STAFF_RESPON_STAR_RATING       493
MEDICINE_LINEAR_SCORE          493
MEDICINE_STAR_RATING           493
DISCHARGE_INFO_LINEAR_SCORE    493
DISCHARGE_INFO_STAR_RATING     493
CARE_TRANSIT_LI

In [9]:
# Calculate linear scores and compare
cleaned2_pivot_df = calculate_linear_scores(cleaned2_pivot_df)

print("Number of hospitals after Filling null values: ")
print(len(cleaned2_pivot_df),"\n")

print("Null values per columns after Filling null values: ")
print(cleaned2_pivot_df.isnull().sum()[cleaned2_pivot_df.isnull().sum() > 0],"\n")

Actual_Score         84.96321
Calculated_Score    84.953479
Difference          -0.009731
dtype: Float64
Number of hospitals after Filling null values: 
3751 

Null values per columns after Filling null values: 
HCAHPS Measure ID
NURSE_COMM_STAR_RATING        493
DOCTOR_COMM_STAR_RATING       493
STAFF_RESPON_STAR_RATING      493
MEDICINE_STAR_RATING          493
DISCHARGE_INFO_STAR_RATING    493
CARE_TRANSIT_STAR_RATING      493
H_CLEAN_STAR_RATING           493
H_QUIET_STAR_RATING           493
H_HSP_RATING_STAR_RATING      493
H_RECMND_STAR_RATING          493
H_STAR_RATING                 493
dtype: int64 



### Reduced pivot table

In [10]:
# Reduce columns
reduced_df = reduce_columns(cleaned2_pivot_df)

print("Null values per columns after droping non-necessary columns: ")
print(reduced_df.isnull().sum()[reduced_df.isnull().sum() > 0],"\n")

print("Reduced Dataframe Information: ")
print(reduced_df.info(),"\n")

Null values per columns after droping non-necessary columns: 
Series([], dtype: int64) 

Reduced Dataframe Information: 
<class 'pandas.core.frame.DataFrame'>
Index: 3751 entries, 0 to 4808
Data columns (total 17 columns):
 #   Column                        Non-Null Count  Dtype 
---  ------                        --------------  ----- 
 0   Facility ID                   3751 non-null   object
 1   Facility Name                 3751 non-null   object
 2   City/Town                     3751 non-null   object
 3   State                         3751 non-null   object
 4   Number of Completed Surveys   3751 non-null   Int64 
 5   Survey Response Rate Percent  3751 non-null   Int64 
 6   Year                          3751 non-null   Int64 
 7   NURSE_COMM                    3751 non-null   Int64 
 8   DOCTOR_COMM                   3751 non-null   Int64 
 9   STAFF_RESPON                  3751 non-null   Int64 
 10  MEDICINE                      3751 non-null   Int64 
 11  DISCHARGE_INFO    

### Add Region Information

In [11]:
# Load state region data
state_region_df = load_state_region_data("../data/raw/states.csv")

# Merge state region data
completed_df = merge_state_region_data(reduced_df, state_region_df)

### Rename final Categories

In [12]:
# Rename columns to more descriptive names
completed_df = rename_features(completed_df)

print('Columns in completed_df after renaming columns: ')
print(reduced_df.info())

Columns in completed_df after renaming columns: 
<class 'pandas.core.frame.DataFrame'>
Index: 3751 entries, 0 to 4808
Data columns (total 17 columns):
 #   Column                        Non-Null Count  Dtype 
---  ------                        --------------  ----- 
 0   Facility ID                   3751 non-null   object
 1   Facility Name                 3751 non-null   object
 2   City/Town                     3751 non-null   object
 3   State                         3751 non-null   object
 4   Number of Completed Surveys   3751 non-null   Int64 
 5   Survey Response Rate Percent  3751 non-null   Int64 
 6   Year                          3751 non-null   Int64 
 7   NURSE_COMM                    3751 non-null   Int64 
 8   DOCTOR_COMM                   3751 non-null   Int64 
 9   STAFF_RESPON                  3751 non-null   Int64 
 10  MEDICINE                      3751 non-null   Int64 
 11  DISCHARGE_INFO                3751 non-null   Int64 
 12  CARE_TRANSIT                  37

## Export Processed Data

In [15]:
# Save the final DataFrame to a CSV file
save_to_csv(completed_df, '../data/processed/completed_data.csv')