In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.neural_network import MLPRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn import metrics

In [2]:
data = pd.read_csv("Hospital_Inpatient_Discharges__SPARCS_De-Identified___2015.csv")

  data = pd.read_csv("Hospital_Inpatient_Discharges__SPARCS_De-Identified___2015.csv")


In [3]:
len(data)

2346760

In [4]:
data.dtypes

Health Service Area                     object
Hospital County                         object
Operating Certificate Number           float64
Facility Id                            float64
Facility Name                           object
Age Group                               object
Zip Code - 3 digits                     object
Gender                                  object
Race                                    object
Ethnicity                               object
Length of Stay                          object
Type of Admission                       object
Patient Disposition                     object
Discharge Year                           int64
CCS Diagnosis Code                       int64
CCS Diagnosis Description               object
CCS Procedure Code                       int64
CCS Procedure Description               object
APR DRG Code                             int64
APR DRG Description                     object
APR MDC Code                             int64
APR MDC Descr

In [5]:
# For simplicity, let's drop rows with missing values and drop non-essential columns
# Drop Unuseful Columns
unuseful_columns = ['Health Service Area', 'Hospital County', 'Operating Certificate Number', 
                    'Facility Name', 'Discharge Year', 'CCS Diagnosis Description', 
                    'CCS Procedure Description', 'APR DRG Description', 'APR MDC Description', 
                    'APR Severity of Illness Description', 'APR Medical Surgical Description', 
                    'Attending Provider License Number', 'Operating Provider License Number', 
                    'Other Provider License Number','Payment Typology 1','Payment Typology 2','Payment Typology 3',
                    'Total Costs']
data.drop(columns=unuseful_columns, inplace=True)

In [6]:
# For simplicity, let's drop rows with missing values and drop non-essential columns
# Drop Unuseful Columns
unuseful_columns = ['Zip Code - 3 digits']
data.drop(columns=unuseful_columns, inplace=True)

In [7]:
data.dtypes

Facility Id                       float64
Age Group                          object
Gender                             object
Race                               object
Ethnicity                          object
Length of Stay                     object
Type of Admission                  object
Patient Disposition                object
CCS Diagnosis Code                  int64
CCS Procedure Code                  int64
APR DRG Code                        int64
APR MDC Code                        int64
APR Severity of Illness Code        int64
APR Risk of Mortality              object
Birth Weight                        int64
Abortion Edit Indicator            object
Emergency Department Indicator     object
Total Charges                      object
dtype: object

In [8]:
data['Total Charges'] = data['Total Charges'].apply(lambda x: str(x).replace('$',''))
data['Total Charges'] = pd.to_numeric(data['Total Charges'])

In [9]:
data.isnull().sum()

Facility Id                       2911
Age Group                            0
Gender                               0
Race                                 0
Ethnicity                            0
Length of Stay                       0
Type of Admission                    0
Patient Disposition                  0
CCS Diagnosis Code                   0
CCS Procedure Code                   0
APR DRG Code                         0
APR MDC Code                         0
APR Severity of Illness Code         0
APR Risk of Mortality              112
Birth Weight                         0
Abortion Edit Indicator              0
Emergency Department Indicator       0
Total Charges                        0
dtype: int64

In [10]:
# Impute missing values in 'Facility Id' column based on the proportion of occurrence
facility_id_counts = data['Facility Id'].value_counts(normalize=True)
missing_indices = data['Facility Id'].isnull()
data.loc[missing_indices, 'Facility Id'] = np.random.choice(facility_id_counts.index, size=missing_indices.sum(), p=facility_id_counts.values)


In [11]:
# Impute missing values in 'APR Risk of Mortality' column based on the proportion of occurrence
facility_id_counts = data['APR Risk of Mortality'].value_counts(normalize=True)
missing_indices = data['APR Risk of Mortality'].isnull()
data.loc[missing_indices, 'APR Risk of Mortality'] = np.random.choice(facility_id_counts.index, size=missing_indices.sum(), p=facility_id_counts.values)


In [12]:
data.isnull().sum()

Facility Id                       0
Age Group                         0
Gender                            0
Race                              0
Ethnicity                         0
Length of Stay                    0
Type of Admission                 0
Patient Disposition               0
CCS Diagnosis Code                0
CCS Procedure Code                0
APR DRG Code                      0
APR MDC Code                      0
APR Severity of Illness Code      0
APR Risk of Mortality             0
Birth Weight                      0
Abortion Edit Indicator           0
Emergency Department Indicator    0
Total Charges                     0
dtype: int64

In [13]:
data.dtypes

Facility Id                       float64
Age Group                          object
Gender                             object
Race                               object
Ethnicity                          object
Length of Stay                     object
Type of Admission                  object
Patient Disposition                object
CCS Diagnosis Code                  int64
CCS Procedure Code                  int64
APR DRG Code                        int64
APR MDC Code                        int64
APR Severity of Illness Code        int64
APR Risk of Mortality              object
Birth Weight                        int64
Abortion Edit Indicator            object
Emergency Department Indicator     object
Total Charges                     float64
dtype: object

In [14]:
# Replace '120+' with a maximum value (e.g., 120)
data['Length of Stay'] = data['Length of Stay'].replace('120 +', '120')

# Convert 'Length of Stay' column to numeric type
data['Length of Stay'] = pd.to_numeric(data['Length of Stay'])


In [15]:
x= data.drop(columns=['Length of Stay','Race','Ethnicity','Patient Disposition','CCS Procedure Code','Birth Weight','Abortion Edit Indicator','Emergency Department Indicator'])
y = data['Length of Stay']

# Encode categorical variables if necessary
label_encoders = {}
for column in x.select_dtypes(include='object').columns:
    label_encoders[column] = LabelEncoder()
    x[column] = label_encoders[column].fit_transform(x[column])

# Split the data into training and testing sets
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=42)

In [16]:
data.dtypes

Facility Id                       float64
Age Group                          object
Gender                             object
Race                               object
Ethnicity                          object
Length of Stay                      int64
Type of Admission                  object
Patient Disposition                object
CCS Diagnosis Code                  int64
CCS Procedure Code                  int64
APR DRG Code                        int64
APR MDC Code                        int64
APR Severity of Illness Code        int64
APR Risk of Mortality              object
Birth Weight                        int64
Abortion Edit Indicator            object
Emergency Department Indicator     object
Total Charges                     float64
dtype: object

In [17]:
lr_model = LinearRegression()
lr_model.fit(x_train, y_train)
lr_pred_train = lr_model.predict(x_train)
lr_pred_test  = lr_model.predict(x_test)
lr_round_pred_train = np.ndarray.round(lr_pred_train)
lr_round_pred_test = np.ndarray.round(lr_pred_test)

In [18]:
# MAE
lr_mae_train = mean_absolute_error(y_train, lr_pred_train)
lr_mae_test = mean_absolute_error(y_test, lr_pred_test)
# MSE
lr_mse_train = mean_squared_error(y_train, lr_pred_train)
lr_mse_test = mean_squared_error(y_test, lr_pred_test)
# R square
lr_r2_train = r2_score(y_train, lr_pred_train)
lr_r2_test = r2_score(y_test, lr_pred_test)
# Accuracy
lr_accuracy_train= metrics.accuracy_score(y_train,lr_round_pred_train)
lr_accuracy_test= metrics.accuracy_score(y_test,lr_round_pred_test)

In [19]:
print("Train MAE:", lr_mae_train)
print("Test MAE:", lr_mae_test)
print("Train MSE:", lr_mse_train)
print("Test MSE:", lr_mse_test)
print("Train R-squared:", lr_r2_train)
print("Test R-squared:", lr_r2_test)
print("Train accuracy:", lr_accuracy_train*100)
print("Test accuracy:", lr_accuracy_test*100)

Train MAE: 2.5555343835604196
Test MAE: 2.558588331252548
Train MSE: 28.36962663103303
Test MSE: 28.821831666924062
Train R-squared: 0.559296216459956
Test R-squared: 0.5573833569332336
Train accuracy: 21.342776849784386
Test accuracy: 21.28658235183828


In [20]:

#R2Score
r2=r2_score(y_test,lr_pred_test)
print("R2Score : ",r2)

R2Score :  0.5573833569332336


In [38]:
rf_model = RandomForestRegressor(random_state=42)
rf_model.fit(x_train, y_train)
rf_pred_train = rf_model.predict(x_train)
rf_pred_test = rf_model.predict(x_test)

In [39]:
rf_round_pred_train = np.ndarray.round(rf_pred_train)
rf_round_pred_test = np.ndarray.round(rf_pred_test)

In [40]:
# MAE
rf_mae_train = mean_absolute_error(y_train, rf_pred_train)
rf_mae_test = mean_absolute_error(y_test, rf_pred_test)
# MSE
rf_mse_train = mean_squared_error(y_train, rf_pred_train)
rf_mse_test = mean_squared_error(y_test, rf_pred_test)
# R square
rf_r2_train = r2_score(y_train, rf_pred_train)
rf_r2_test = r2_score(y_test, rf_pred_test)
# Accuracy
rf_accuracy_train= metrics.accuracy_score(y_train,rf_round_pred_train)
rf_accuracy_test= metrics.accuracy_score(y_test,rf_round_pred_test)

In [41]:
print("Train MAE:", rf_mae_train)
print("Test MAE:", rf_mae_test)
print("Train MSE:", rf_mse_train)
print("Test MSE:", rf_mse_test)
print("Train R-squared:", rf_r2_train)
print("Test R-squared:", rf_r2_test)
print("Train accuracy:", rf_accuracy_train*100)
print("Test accuracy:", rf_accuracy_test*100)

Train MAE: 0.42712005658531144
Test MAE: 1.1520514342514805
Train MSE: 1.0840954081263576
Test MSE: 7.947786024955103
Train R-squared: 0.9831592796657727
Test R-squared: 0.8779459122920477
Train accuracy: 76.88184986960745
Test accuracy: 49.82635633810019


In [61]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
import numpy as np
log_reg = LogisticRegression(multi_class='ovr').fit(x_train, y_train)
y_train_pred = log_reg.predict(x_train)
y_pred = log_reg.predict(x_test)

test_acc = accuracy_score(y_test, y_pred)
train_acc = accuracy_score(y_train, y_train_pred)

print('Test accuracy:', test_acc)
print('Train accuracy:', train_acc)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Test accuracy: 0.24846383950638326
Train accuracy: 0.2492074178867886


In [21]:
import joblib
joblib.dump(lr_model,'lr.pkl')

['lr.pkl']

['rf.pkl']

In [49]:
unique_values = data['Age Group'].unique()

# Sort the unique values
sorted_unique_values = sorted(unique_values)

# Print the sorted unique values
print(sorted_unique_values)

['0 to 17', '18 to 29', '30 to 49', '50 to 69', '70 or Older']


In [50]:
unique_values = data['Gender'].unique()

# Sort the unique values
sorted_unique_values = sorted(unique_values)

# Print the sorted unique values
print(sorted_unique_values)

['F', 'M', 'U']


In [51]:
unique_values = data['Type of Admission'].unique()

# Sort the unique values
sorted_unique_values = sorted(unique_values)

# Print the sorted unique values
print(sorted_unique_values)

['Elective', 'Emergency', 'Newborn', 'Not Available', 'Trauma', 'Urgent']


In [55]:
illness_string_index={'Minor':1,'Moderate':2,'Major':3,'Extreme':4}
sorted(illness_string_index)

['Extreme', 'Major', 'Minor', 'Moderate']