# Exploratory Data Analysis

In [1]:
# Import libraries
import pandas as pd

# Load data
completed_df = pd.read_csv('../data/processed/completed_data.csv')

## Basic Data information

In [2]:
print('Cleaned dataset for 2023, number of rows and columns: ')
print(completed_df.shape)
print('')

print(completed_df.info(verbose=True))

Cleaned dataset for 2023, number of rows and columns: 
(3751, 20)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3751 entries, 0 to 3750
Data columns (total 20 columns):
 #   Column                        Non-Null Count  Dtype 
---  ------                        --------------  ----- 
 0   Facility ID                   3751 non-null   object
 1   Facility Name                 3751 non-null   object
 2   City/Town                     3751 non-null   object
 3   State                         3751 non-null   object
 4   Number of Completed Surveys   3751 non-null   int64 
 5   Survey Response Rate Percent  3751 non-null   int64 
 6   Year                          3751 non-null   int64 
 7   Nurse Communication           3751 non-null   int64 
 8   Doctor Communication          3751 non-null   int64 
 9   Staff Responsiveness          3751 non-null   int64 
 10  Medicine Communication        3751 non-null   int64 
 11  Discharge Information         3751 non-null   int64 
 12  Care Tran

## Information by Region

In [3]:
# Get Statistics by Region
byRegion_df = completed_df.groupby('Region').agg({
    'Number of Completed Surveys': 'mean',
    'Survey Response Rate Percent': 'mean',
    'Facility ID': 'count',
    'Nurse Communication': 'mean',
    'Doctor Communication': 'mean',
    'Staff Responsiveness': 'mean',
    'Medicine Communication': 'mean',
    'Discharge Information': 'mean',
    'Care Transition': 'mean',
    'Cleanliness': 'mean',
    'Quietness': 'mean',
    'Rating Score': 'mean',
    'Recommendation': 'mean'
}).reset_index().rename(columns={
    'Number of Completed Surveys': 'Avg Completed Surveys',
    'Survey Response Rate Percent': 'Avg Response Rate',
    'Facility ID': 'Facilities Count'
}).round(2)

# Calculate total statistics
total_stats = byRegion_df.agg({
    'Avg Completed Surveys': 'mean',
    'Avg Response Rate': 'mean',
    'Facilities Count': 'sum',
    'Nurse Communication': 'mean',
    'Doctor Communication': 'mean',
    'Staff Responsiveness': 'mean',
    'Medicine Communication': 'mean',
    'Discharge Information': 'mean',
    'Care Transition': 'mean',
    'Cleanliness': 'mean',
    'Quietness': 'mean',
    'Rating Score': 'mean',
    'Recommendation': 'mean'
}).to_frame().transpose().round(2)

# Sort the DataFrame by 'Facilities Count' in descending order
byRegion_df = byRegion_df.sort_values(by='Rating Score', ascending=False)

# Add a 'Region' entry for the total statistics
total_stats['Region'] = 'Total'

# Concatenate region_stats and total_stats
byRegion_df = pd.concat([byRegion_df, total_stats], ignore_index=True)

# Set index
if 'Region' in byRegion_df.columns:
    byRegion_df.set_index('Region', inplace=True)

# Print the statistics by region and total
print('Statistics by Region and Total: ')
byRegion_df

Statistics by Region and Total: 


Unnamed: 0_level_0,Avg Completed Surveys,Avg Response Rate,Facilities Count,Nurse Communication,Doctor Communication,Staff Responsiveness,Medicine Communication,Discharge Information,Care Transition,Cleanliness,Quietness,Rating Score,Recommendation
Region,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
Midwest,525.99,28.1,1056.0,91.87,91.41,85.56,77.94,87.3,81.88,87.15,84.12,88.4,87.71
South,645.0,20.79,1399.0,90.64,90.77,83.11,76.13,85.03,80.69,85.62,84.24,87.16,86.16
West,601.71,21.32,766.0,89.9,89.66,82.87,75.96,85.48,79.88,86.07,79.47,86.78,86.02
Northeast,719.68,24.16,517.0,90.19,89.77,81.47,74.45,85.55,79.72,85.02,78.03,85.85,84.92
U.S. Territories,421.38,27.46,13.0,84.46,87.0,74.77,63.46,69.69,75.46,84.0,79.62,78.92,77.54
Total,582.75,24.37,3751.0,89.41,89.72,81.56,73.59,82.61,79.53,85.57,81.1,85.42,84.47


In [4]:
# Remove the 'Total' row
byRegion_df = byRegion_df[byRegion_df.index != 'Total']

# Create CSV file
byRegion_df.to_csv('../data/processed/region_statistics.csv')

## Information by Division

In [5]:
# Get Statistics by Division
byDivision_df = completed_df.groupby('Division').agg({
    'Number of Completed Surveys': 'mean',
    'Survey Response Rate Percent': 'mean',
    'Facility ID': 'count',
    'Nurse Communication': 'mean',
    'Doctor Communication': 'mean',
    'Staff Responsiveness': 'mean',
    'Medicine Communication': 'mean',
    'Discharge Information': 'mean',
    'Care Transition': 'mean',
    'Cleanliness': 'mean',
    'Quietness': 'mean',
    'Rating Score': 'mean',
    'Recommendation': 'mean'
}).reset_index().rename(columns={
    'Number of Completed Surveys': 'Avg Completed Surveys',
    'Survey Response Rate Percent': 'Avg Response Rate',
    'Facility ID': 'Facilities Count'
}).round(2)
# Sort by most facilities by state
byDivision_df = byDivision_df.sort_values(by='Facilities Count', ascending=False)

# Set index
if 'Division' in byDivision_df.columns:
    byDivision_df.set_index('Division', inplace=True)

# Print
print('Stats by Division: ')
byDivision_df

Stats by Division: 


Unnamed: 0_level_0,Avg Completed Surveys,Avg Response Rate,Facilities Count,Nurse Communication,Doctor Communication,Staff Responsiveness,Medicine Communication,Discharge Information,Care Transition,Cleanliness,Quietness,Rating Score,Recommendation
Division,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
East North Central,591.95,26.75,633,91.51,90.79,84.73,77.17,87.26,81.42,86.67,83.09,87.88,86.87
South Atlantic,763.31,20.46,609,89.88,89.84,81.37,74.31,84.49,79.76,84.44,81.96,86.15,85.0
West South Central,547.94,20.68,495,91.31,91.33,84.73,77.83,85.53,81.63,87.24,86.39,88.32,87.55
Pacific,688.35,20.29,449,89.29,89.05,81.57,75.13,84.96,79.2,85.8,77.05,86.14,85.27
West North Central,427.28,30.12,423,92.41,92.33,86.8,79.1,87.36,82.56,87.87,85.65,89.19,88.96
Middle Atlantic,767.35,23.97,353,89.75,89.34,80.84,73.62,84.67,79.2,84.64,78.02,85.4,84.14
Mountain,478.98,22.77,317,90.75,90.51,84.7,77.13,86.21,80.84,86.45,82.89,87.7,87.1
East South Central,563.61,21.66,295,91.09,91.73,83.97,77.04,85.33,81.02,85.32,85.35,87.29,86.21
New England,617.06,24.57,164,91.13,90.7,82.8,76.26,87.46,80.86,85.82,78.07,86.83,86.58
U.S. Territories,421.38,27.46,13,84.46,87.0,74.77,63.46,69.69,75.46,84.0,79.62,78.92,77.54


## Information by State

In [6]:
# Get Statistics by state
byState_df = completed_df.groupby('State').agg({
    'Region':'first',
    'State Name':'first',
    #'State':'first',
    'Number of Completed Surveys': 'mean',
    'Survey Response Rate Percent': 'mean',
    'Facility ID': 'count',
    'Nurse Communication': 'mean',
    'Doctor Communication': 'mean',
    'Staff Responsiveness': 'mean',
    'Medicine Communication': 'mean',
    'Discharge Information': 'mean',
    'Care Transition': 'mean',
    'Cleanliness': 'mean',
    'Quietness': 'mean',
    'Rating Score': 'mean',
    'Recommendation': 'mean'
}).reset_index().rename(columns={
    'Number of Completed Surveys': 'Avg Completed Surveys',
    'Survey Response Rate Percent': 'Avg Response Rate',
    'Facility ID': 'Facilities Count'
}).round(2)
# Sort by most facilities by state
byState_df = byState_df.sort_values(by='Rating Score', ascending=False)


# Set index
if 'State' in byState_df.columns:
    byState_df.set_index('State', inplace=True)


# Print
print('Stats by States: ')
byState_df.head(5)

Stats by States: 


Unnamed: 0_level_0,Region,State Name,Avg Completed Surveys,Avg Response Rate,Facilities Count,Nurse Communication,Doctor Communication,Staff Responsiveness,Medicine Communication,Discharge Information,Care Transition,Cleanliness,Quietness,Rating Score,Recommendation
State,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
NE,Midwest,Nebraska,415.31,32.75,51,93.08,92.98,87.67,80.29,88.12,83.55,89.0,87.1,90.04,90.41
SD,Midwest,South Dakota,307.48,33.69,29,92.97,93.07,89.24,82.14,87.45,83.59,88.59,86.83,90.0,90.76
IA,Midwest,Iowa,314.24,30.79,78,92.88,93.01,86.95,79.56,88.45,83.06,89.44,85.73,89.81,89.9
KS,Midwest,Kansas,457.48,31.04,67,92.15,92.21,87.24,79.39,86.36,82.48,88.93,86.57,89.67,89.13
WI,Midwest,Wisconsin,490.54,32.07,119,93.02,92.24,87.5,80.9,89.37,83.27,89.18,85.74,89.62,89.11


In [7]:
# Create CSV file
byState_df.to_csv('../data/processed/state_statistics.csv')