# Importing Libraries

In [None]:
import os
from pathlib import Path

import numpy as np
import pandas as pd
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.model_selection import train_test_split


# LOADING AND CLEANING DATA

In [42]:
# Directory holding the three source workbooks. The default is the folder the
# notebook was originally written against; set the HOSPITAL_DATA_DIR
# environment variable to read the same files from another location without
# editing this cell.
DATA_DIR = Path(os.environ.get("HOSPITAL_DATA_DIR", r"C:\Users\srava\Downloads"))

# Load the datasets. The files are Excel workbooks (".xlsx"), even though
# their names also contain ".csv", hence read_excel rather than read_csv.
ahrq_data = pd.read_excel(DATA_DIR / "Agency for Healthcare Research and Quality.csv.xlsx")
medicare_spending = pd.read_excel(DATA_DIR / "Medicare Spending Per Patient.csv.xlsx")
hospital_data = pd.read_excel(DATA_DIR / "Hospital_Data.csv.xlsx")

# If the spending table calls its key "Provider ID", rename it to
# "Provider Number" so all three tables can be joined on one column name.
# The result is reassigned (no inplace=True) so re-running the cell never
# depends on an earlier in-place mutation.
medicare_spending = medicare_spending.rename(columns={"Provider ID": "Provider Number"})

# Inner joins: only providers that appear in all three tables are kept.
merged_data = ahrq_data.merge(hospital_data, on="Provider Number", how="inner")
merged_data = merged_data.merge(medicare_spending, on="Provider Number", how="inner")


# DATA PREPARATION

In [44]:
import pandas as pd
import numpy as np


# Rebuild the merged frame on the shared "Provider Number" key. The rename is
# applied to a copy inside the chain, so this cell no longer mutates
# medicare_spending and gives the same result however often it is re-run.
merged_data = (
    ahrq_data
    .merge(hospital_data, on="Provider Number", how="inner")
    .merge(
        medicare_spending.rename(columns={"Provider ID": "Provider Number"}),
        on="Provider Number",
        how="inner",
    )
)

# Column roles. The target and numeric features come first so that, after
# one-hot encoding, the final column order matches the previous version of
# this cell (untouched columns first, then the dummy columns).
target_col = "Rate - Serious Complications"
numeric_feature_cols = [
    "Spending per Hospital Patient with Medicare",
    "Rate - Accidental cuts and tears from medical treatment",
    "Rate - Serious blood clots after surgery",
]
categorical_cols = ["Hospital Type", "Hospital Ownership", "Emergency Services"]
numeric_cols = [target_col, *numeric_feature_cols]

# Select columns based on the target and features
data_for_model = merged_data[numeric_cols + categorical_cols]

# Treat the "Not Available" placeholder as missing. This is applied to every
# selected column, not only object-dtype ones.
data_for_model = data_for_model.mask(data_for_model == "Not Available")

# Columns that mixed numbers with "Not Available" are still object dtype at
# this point. Convert them to real numbers here; any other non-numeric value
# raises now, at the cleaning step, instead of surfacing later inside
# model.fit.
data_for_model[numeric_cols] = data_for_model[numeric_cols].apply(pd.to_numeric)

# Drop rows with any missing values
data_for_model = data_for_model.dropna()

# Convert categorical columns to numeric using one-hot encoding;
# drop_first=True drops one level per category as the baseline.
data_for_model = pd.get_dummies(data_for_model, columns=categorical_cols, drop_first=True)


# Set Up Features and Target Variable

In [53]:
# Split the prepared frame into the regression target and its predictors.
target_name = "Rate - Serious Complications"

# Target: the serious-complications rate, cast to float.
y = data_for_model[target_name].astype(float)

# Features: every remaining column of the prepared frame.
X = data_for_model.drop(target_name, axis="columns")


# Split Data and Train Model

In [56]:
# Hold out 30% of the rows for testing; the fixed random_state makes the
# partition reproducible between runs.
partitions = train_test_split(X, y, test_size=0.3, random_state=42)
X_train, X_test, y_train, y_test = partitions

# Fit an ordinary least-squares regression on the training partition.
# The bare fit(...) call stays last so the cell still displays the estimator.
model = LinearRegression()
model.fit(X=X_train, y=y_train)


# Evaluate Model Performance

In [60]:
# Score the fitted model on the held-out test partition.
y_pred = model.predict(X_test)

# MAE, RMSE (square root of the mean squared error) and R-squared.
mae = mean_absolute_error(y_test, y_pred)
rmse = np.sqrt(mean_squared_error(y_test, y_pred))
r2 = r2_score(y_test, y_pred)

# Report each metric on its own line, label first.
for label, value in (
    ("Mean Absolute Error:", mae),
    ("Root Mean Squared Error:", rmse),
    ("R-squared:", r2),
):
    print(label, value)


Mean Absolute Error: 0.021572483519017818
Root Mean Squared Error: 0.02927620277324723
R-squared: 0.9571515253532772
