Step 1: Data Loading

In [205]:
import pandas as pd
import numpy as np

data = pd.read_csv('Insurance.csv')

Step 2: Data Exploration

In [206]:
print(data.head())  # Display the first few rows of the DataFrame
print(data.info())  # Check for missing values and data types

   Gender  Age  Height  Weight   BMI      BMI Status  Body Fat  \
0    Male   42    1.90    97.3  27.0      Overweight      24.4   
1  Female   39    1.59   106.6  42.2  Morbidly obese      54.5   
2    Male   43    1.73    85.3  28.5      Overweight      32.0   
3  Female   40    1.70   104.2  37.1  Severely obese      48.3   
4    Male   42    1.72    91.0  30.8           Obese      30.3   

   Skeletal Muscle  Body Age  Visceral Fat Blood Pressure  Resting Metabolism  \
0             35.0        66             9         121/79                1944   
1             20.5        72            11         128/76                1753   
2             31.2        58            12         130/87                1788   
3             23.4        63            10         112/74                1798   
4             32.9        60            14         114/70                1886   

   Waist Circumference Charges (UGX)  
0                 38.0     2,045,511  
1                 40.0       172,555  

Step 3: Data Pre-Processing

In [207]:
# data.dropna(inplace=True)  # Drop rows with missing values
# Or
data.fillna(data.mean(), inplace=True)  # Impute missing values with the mean

  data.fillna(data.mean(), inplace=True)  # Impute missing values with the mean


In [208]:
data_encoded = pd.get_dummies(data, columns=['Gender', 'BMI Status'], drop_first=True)

In [209]:
X = data_encoded.drop('Charges (UGX)', axis=1)
y = data_encoded['Charges (UGX)']
data_encoded.columns

Index(['Age', 'Height', 'Weight', 'BMI', 'Body Fat', 'Skeletal Muscle',
       'Body Age', 'Visceral Fat', 'Blood Pressure', 'Resting Metabolism',
       'Waist Circumference', 'Charges (UGX)', 'Gender_Male',
       'BMI Status_Morbidly obese', 'BMI Status_Normal', 'BMI Status_Obese',
       'BMI Status_Overweight', 'BMI Status_Severely Obese',
       'BMI Status_Severely obese', 'BMI Status_Severely obse',
       'BMI Status_Severly Obese', 'BMI Status_Underweight',
       'BMI Status_overwight'],
      dtype='object')

In [210]:
# Convert 'Charges (UGX)' column to float
data_encoded['Charges (UGX)'] = data_encoded['Charges (UGX)'].str.replace(',', '').astype(float)

# Convert 'Blood Pressure' column to string type and handle non-string values
data_encoded['Blood Pressure'] = data_encoded['Blood Pressure'].astype(str)
data_encoded['Blood Pressure'] = data_encoded['Blood Pressure'].replace('nan', np.nan)

# Preprocess the blood pressure column
data_encoded['Systolic Pressure'] = data_encoded['Blood Pressure'].str.split('/').str[0].astype(float)
data_encoded['Diastolic Pressure'] = data_encoded['Blood Pressure'].str.split('/').str[1].astype(float)

# Drop the original 'Blood Pressure' column
data_encoded.drop('Blood Pressure', axis=1, inplace=True)

Step 4: Model Training

In [211]:
from sklearn.linear_model import LinearRegression

X = data_encoded.drop('Charges (UGX)', axis=1)
y = data_encoded['Charges (UGX)']
print("Training columns:", X.columns)

model = LinearRegression()
model.fit(X, y)


Training columns: Index(['Age', 'Height', 'Weight', 'BMI', 'Body Fat', 'Skeletal Muscle',
       'Body Age', 'Visceral Fat', 'Resting Metabolism', 'Waist Circumference',
       'Gender_Male', 'BMI Status_Morbidly obese', 'BMI Status_Normal',
       'BMI Status_Obese', 'BMI Status_Overweight',
       'BMI Status_Severely Obese', 'BMI Status_Severely obese',
       'BMI Status_Severely obse', 'BMI Status_Severly Obese',
       'BMI Status_Underweight', 'BMI Status_overwight', 'Systolic Pressure',
       'Diastolic Pressure'],
      dtype='object')


LinearRegression()

Step 5: Prediction

In [212]:
# Example input values for a new individual
new_data = pd.DataFrame({
    'Age': [42],
    'Height': [1.9],
    'Weight': [97.3],
    'BMI': [27],
    'Body Fat': [24.4],
    'Skeletal Muscle': [35],
    'Body Age': [66],
    'Visceral Fat': [9],
    'Resting Metabolism': [1944],
    'Waist Circumference': [38],
    'Gender_Male': [1],
    'BMI Status_Overweight': [1]
})

# Apply the same one-hot encoding for the new individual
new_data_encoded = pd.get_dummies(new_data)

# Make sure the new data columns match the training columns
missing_cols = set(X.columns) - set(new_data_encoded.columns)
for col in missing_cols:
    new_data_encoded[col] = 0
new_data_encoded = new_data_encoded[X.columns]

# Make the prediction using the trained model
prediction = model.predict(new_data_encoded)
print('Predicted Insurance Charge:', prediction)


Predicted Insurance Charge: [1281910.70312351]


In [213]:
import pickle

# Save the trained model to a file
with open('linear_regression_model.pkl', 'wb') as file:
    pickle.dump(model, file)

In [221]:
import pickle
import pandas as pd

# Load the saved model from file
with open('linear_regression_model.pkl', 'rb') as file:
    loaded_model = pickle.load(file)

# Define the new data
new_data = pd.DataFrame({
    'Age': [25],
    'Height': [1.9],
    'Weight': [87.3],
    'BMI': [27],
    'Body Fat': [24.4],
    'Skeletal Muscle': [35],
    'Body Age': [66],
    'Visceral Fat': [9],
    'Resting Metabolism': [1944],
    'Waist Circumference': [38],
    'Gender_Male': [0],
    'BMI Status_Overweight': [1]
})

# Apply the same preprocessing steps to the new data as done during training
new_data_encoded = pd.get_dummies(new_data)

# Ensure the new data columns match the training columns
missing_cols = set(X.columns) - set(new_data_encoded.columns)
for col in missing_cols:
    new_data_encoded[col] = 0
new_data_encoded = new_data_encoded[X.columns]

# Make the prediction using the loaded model
prediction = loaded_model.predict(new_data_encoded)
print('Predicted Insurance Charge:', prediction)


Predicted Insurance Charge: [1486504.24217834]
