In [45]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler
from xgboost import XGBRegressor
from sklearn.metrics import mean_squared_error

In [46]:
# Load the data into a DataFrame
data = pd.read_csv('insurance.csv')

In [47]:
le = LabelEncoder()
data['sex'] = le.fit_transform(data['sex'])
data['smoker'] = le.fit_transform(data['smoker'])

In [48]:
# One-hot encode the 'region' column
data = pd.get_dummies(data, columns=['region'])

In [49]:
# Split the data into features and target variable
X = data[['age', 'sex', 'bmi', 'children', 'region_northeast', 'region_northwest', 'region_southeast', 'region_southwest', 'smoker']]
y = data['charges']

In [50]:
# Scale the numerical features
scaler = StandardScaler()
X[['age', 'bmi', 'children']] = scaler.fit_transform(X[['age', 'bmi', 'children']])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X[['age', 'bmi', 'children']] = scaler.fit_transform(X[['age', 'bmi', 'children']])


In [51]:
# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


In [52]:
# Train the XGBoost model
model = XGBRegressor()
model.fit(X_train, y_train)

XGBRegressor(base_score=None, booster=None, callbacks=None,
             colsample_bylevel=None, colsample_bynode=None,
             colsample_bytree=None, early_stopping_rounds=None,
             enable_categorical=False, eval_metric=None, feature_types=None,
             gamma=None, gpu_id=None, grow_policy=None, importance_type=None,
             interaction_constraints=None, learning_rate=None, max_bin=None,
             max_cat_threshold=None, max_cat_to_onehot=None,
             max_delta_step=None, max_depth=None, max_leaves=None,
             min_child_weight=None, missing=nan, monotone_constraints=None,
             n_estimators=100, n_jobs=None, num_parallel_tree=None,
             predictor=None, random_state=None, ...)

In [53]:
# Evaluate the model
y_pred = model.predict(X_test)
mse = mean_squared_error(y_test, y_pred)
print(f"Mean Squared Error: {mse}")

Mean Squared Error: 25454987.64114038


In [56]:
# Predict insurance charges for new data points
new_data = pd.DataFrame([[59, 1, 27.72, 3, 0, 0, 1, 0, 0]], columns=X.columns)
new_data[['age', 'bmi', 'children']] = scaler.transform(new_data[['age', 'bmi', 'children']])
prediction = model.predict(new_data)
print(f"Predicted Insurance Charges: {prediction}")

Predicted Insurance Charges: [12464.597]


In [55]:
import pickle

# Save the trained model to a file
model_filename = 'xgboost_model.pkl'
with open(model_filename, 'wb') as file:
    pickle.dump(model, file)

print(f"XGBoost model saved as '{model_filename}'.")


XGBoost model saved as 'xgboost_model.pkl'.
