In [1]:
import pandas as pd

# Path of the claims CSV, given relative to the current working directory.
file_path = './health_claims.csv'

# Read the whole file into a DataFrame using pandas' default parsing options.
data = pd.read_csv(filepath_or_buffer=file_path)

# Preview the first five records to get a feel for the columns and values.
data.head(n=5)

Unnamed: 0.1,Unnamed: 0,BeneID,ClaimID,ClaimStartDt,ClaimEndDt,Provider,InscClaimAmtReimbursed,AttendingPhysician,OperatingPhysician,OtherPhysician,...,ChronicCond_Diabetes,ChronicCond_IschemicHeart,ChronicCond_Osteoporasis,ChronicCond_rheumatoidarthritis,ChronicCond_stroke,IPAnnualReimbursementAmt,IPAnnualDeductibleAmt,OPAnnualReimbursementAmt,OPAnnualDeductibleAmt,PotentialFraud
0,275022,BENE78279,CLM704301,2009-11-28,2009-11-28,PRV55215,40,PHY386653,,,...,1,1,1,2,2,0,0,6530,360,Yes
1,84572,BENE23552,CLM352275,2009-05-11,2009-05-11,PRV51081,500,PHY325118,,,...,1,2,2,2,2,0,0,1700,1530,No
2,234874,BENE66787,CLM441102,2009-06-28,2009-06-28,PRV53733,60,PHY412216,,PHY412216,...,1,2,2,1,2,0,0,3160,1100,No
3,412183,BENE117455,CLM425806,2009-06-20,2009-06-23,PRV52090,400,PHY402729,PHY402729,PHY402729,...,2,1,1,2,2,0,0,400,0,No
4,471585,BENE134333,CLM743731,2009-12-24,2009-12-24,PRV57014,30,PHY325488,,,...,1,1,2,2,1,0,0,2920,90,No


In [2]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error

# Feature columns used for modelling, one per line and grouped by theme.
# The order here is the column order of the feature matrix built later on.
selected_columns = [
    # Columns named after beneficiary demographics
    'Gender',
    'Race',
    # Chronic-condition columns
    'ChronicCond_Alzheimer',
    'ChronicCond_Heartfailure',
    'ChronicCond_KidneyDisease',
    'ChronicCond_Cancer',
    'ChronicCond_ObstrPulmonary',
    'ChronicCond_Depression',
    'ChronicCond_Diabetes',
    'ChronicCond_IschemicHeart',
    'ChronicCond_Osteoporasis',
    'ChronicCond_rheumatoidarthritis',
    'ChronicCond_stroke',
    # Annual inpatient (IP) / outpatient (OP) reimbursement and deductible amounts
    'IPAnnualReimbursementAmt',
    'IPAnnualDeductibleAmt',
    'OPAnnualReimbursementAmt',
    'OPAnnualDeductibleAmt',
    # Fraud label column ('Yes' / 'No' in the previewed rows)
    'PotentialFraud',
]

# Keep only the features plus the reimbursed claim amount, appended as the last column.
data_relevant = data[[*selected_columns, 'InscClaimAmtReimbursed']]

In [3]:
# Sanity-check the column subset by showing its first five rows.
data_relevant.head(n=5)

Unnamed: 0,Gender,Race,ChronicCond_Alzheimer,ChronicCond_Heartfailure,ChronicCond_KidneyDisease,ChronicCond_Cancer,ChronicCond_ObstrPulmonary,ChronicCond_Depression,ChronicCond_Diabetes,ChronicCond_IschemicHeart,ChronicCond_Osteoporasis,ChronicCond_rheumatoidarthritis,ChronicCond_stroke,IPAnnualReimbursementAmt,IPAnnualDeductibleAmt,OPAnnualReimbursementAmt,OPAnnualDeductibleAmt,PotentialFraud,InscClaimAmtReimbursed
0,1,1,2,1,1,1,1,1,1,1,1,2,2,0,0,6530,360,Yes,40
1,2,1,2,1,2,2,2,2,1,2,2,2,2,0,0,1700,1530,No,500
2,1,1,2,2,1,2,2,2,1,2,2,1,2,0,0,3160,1100,No,60
3,1,1,2,2,2,2,2,2,2,1,1,2,2,0,0,400,0,No,400
4,2,1,1,1,1,1,1,1,1,1,2,2,1,0,0,2920,90,No,30


In [4]:
# Handle missing values.
# The per-column mode is used because the frame mixes numeric codes with the
# string label 'PotentialFraud'. For such mixed input SimpleImputer returns one
# generic object array, which would leave every column of the rebuilt frame
# with `object` dtype; each column is therefore cast back to the dtype it had
# in `data_relevant`, and the original row index is kept.
# NOTE(review): the imputer is fitted on all rows (before the train/test split)
# and also on the target column 'InscClaimAmtReimbursed' — confirm this is intended.
imputer = SimpleImputer(strategy='most_frequent')
data_imputed = pd.DataFrame(
    imputer.fit_transform(data_relevant),
    columns=data_relevant.columns,
    index=data_relevant.index,
).astype(data_relevant.dtypes.to_dict())

# Encode categorical variables.
# One LabelEncoder per column, so every fitted mapping (`classes_`) remains
# available for decoding or for encoding new records. A single shared encoder
# only remembers the last column it was fitted on.
label_encoders = {}
for column in ('Gender', 'Race', 'PotentialFraud'):
    label_encoders[column] = LabelEncoder()
    data_imputed[column] = label_encoders[column].fit_transform(data_imputed[column])

# Backward compatibility: as before, `label_encoder` ends up fitted on 'PotentialFraud'.
label_encoder = label_encoders['PotentialFraud']

In [5]:
# Separate the features from the regression target (the reimbursed claim amount).
X = data_imputed.drop(columns='InscClaimAmtReimbursed')
y = data_imputed.loc[:, 'InscClaimAmtReimbursed']

# Hold out 20% of the rows for testing; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Standardize the features: the scaler learns its statistics from the training
# rows only and then applies that same transformation to both partitions.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [6]:
# Build a predictive model using Random Forest Regressor
# (fixed seed so repeated runs produce the same forest).
model = RandomForestRegressor(random_state=42)
model.fit(X_train_scaled, y_train)

# Make predictions on the held-out test features
y_pred = model.predict(X_test_scaled)

# Evaluate the model's performance on the test split
mae = mean_absolute_error(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
# RMSE is the square root of the MSE computed above. The `squared=False`
# argument of mean_squared_error was deprecated in scikit-learn 1.4 and
# removed in 1.6, so deriving it here works on both old and new releases.
rmse = mse ** 0.5

# Displayed as the cell output: (MAE, MSE, RMSE)
mae, mse, rmse



(1221.8150396038006, 11758539.92271072, 3429.0727496964423)

In [7]:
from sklearn.ensemble import GradientBoostingRegressor

# Build a predictive model using Gradient Boosting Regressor
# (fixed seed so repeated runs produce the same ensemble).
gb_model = GradientBoostingRegressor(random_state=42)
gb_model.fit(X_train_scaled, y_train)

# Make predictions on the held-out test features
y_pred_gb = gb_model.predict(X_test_scaled)

# Evaluate the model's performance on the test split
mae_gb = mean_absolute_error(y_test, y_pred_gb)
mse_gb = mean_squared_error(y_test, y_pred_gb)
# RMSE is the square root of the MSE computed above. The `squared=False`
# argument of mean_squared_error was deprecated in scikit-learn 1.4 and
# removed in 1.6, so deriving it here works on both old and new releases.
rmse_gb = mse_gb ** 0.5

# Displayed as the cell output: (MAE, MSE, RMSE)
mae_gb, mse_gb, rmse_gb



(1183.2087145905487, 10786267.795808941, 3284.245392142454)

In [9]:
import joblib

# Write the fitted Gradient Boosting model to disk with joblib.
# joblib.dump returns the list of files it wrote, which becomes the cell output.
model_filename = '../Model_Deployment_And_API/gradient_boosting_model.joblib'
joblib.dump(value=gb_model, filename=model_filename)

['../Model_Deployment_And_API/gradient_boosting_model.joblib']

In [10]:
# Write the fitted StandardScaler to disk with joblib, in the same directory
# as the model file, so the identical feature scaling can be reloaded later.
scaler_filename = '../Model_Deployment_And_API/scaler.joblib'
joblib.dump(value=scaler, filename=scaler_filename)

['../Model_Deployment_And_API/scaler.joblib']