In [2]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from lightgbm import LGBMRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score



In [3]:
# Load the insurance records from the CSV file into a DataFrame
dataset = pd.read_csv(filepath_or_buffer='insurance_pre.csv')


In [7]:
# Show the loaded DataFrame as this cell's output (bare last expression).
# NOTE(review): consider dataset.head() if only a preview of the rows is needed.
dataset


Unnamed: 0,age,sex,bmi,children,smoker,charges
0,19,female,27.900,0,yes,16884.92400
1,18,male,33.770,1,no,1725.55230
2,28,male,33.000,3,no,4449.46200
3,33,male,22.705,0,no,21984.47061
4,32,male,28.880,0,no,3866.85520
...,...,...,...,...,...,...
1333,50,male,30.970,3,no,10600.54830
1334,18,female,31.920,0,no,2205.98080
1335,18,female,36.850,0,no,1629.83350
1336,21,female,25.800,0,no,2007.94500


In [4]:
# Convert the categorical columns into indicator columns.
# drop_first=True drops the first level of each category, so each two-level
# column is represented by a single indicator (e.g. sex -> sex_male).
datasets = pd.get_dummies(data=dataset, drop_first=True)

In [8]:
# Show the one-hot encoded DataFrame as this cell's output (bare last expression).
datasets

Unnamed: 0,age,bmi,children,charges,sex_male,smoker_yes
0,19,27.900,0,16884.92400,False,True
1,18,33.770,1,1725.55230,True,False
2,28,33.000,3,4449.46200,True,False
3,33,22.705,0,21984.47061,True,False
4,32,28.880,0,3866.85520,True,False
...,...,...,...,...,...,...
1333,50,30.970,3,10600.54830,True,False
1334,18,31.920,0,2205.98080,False,False
1335,18,36.850,0,1629.83350,False,False
1336,21,25.800,0,2007.94500,False,False


In [9]:
# Feature matrix: the five predictor columns, in the order the model is trained on
indep = datasets.loc[:, ['age', 'sex_male', 'bmi', 'children', 'smoker_yes']]
# Target: kept as a single-column DataFrame (not a Series)
dep = datasets.loc[:, ['charges']]

In [10]:
# Hold out one third of the rows for testing; the fixed random_state makes the
# split repeatable across runs
X_train, X_test, y_train, y_test = train_test_split(
    indep,
    dep,
    test_size=1/3,
    random_state=0,
)

In [12]:
# Build an LGBMRegressor with its default settings and fit it on the training split.
# The target is a single-column DataFrame, so it is flattened to a 1-D array first.
regressor = LGBMRegressor().fit(X=X_train, y=y_train.values.ravel())



[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000152 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 312
[LightGBM] [Info] Number of data points in the train set: 892, number of used features: 5
[LightGBM] [Info] Start training from score 13138.323530


In [13]:
# Run the fitted model on the held-out test features
y_pred = regressor.predict(X=X_test)

# Compare predictions with the true charges and report R-squared to two decimals
r_score = r2_score(y_true=y_test, y_pred=y_pred)
print(f'R-squared score: {r_score:.2f}')

R-squared score: 0.87


In [14]:
# Inspect the fitted model: number of estimators and the feature importances
# (importances are listed in the same order as the training columns)
print(
    f'Number of estimators: {regressor.n_estimators_}',
    f'Feature importances: {regressor.feature_importances_}',
    sep='\n',
)


Number of estimators: 100
Feature importances: [ 923  149 1590  271   67]


In [15]:
# Taking user input for prediction
age = int(input("age: "))
sex = float(input("sex (female=0, male=1): "))
bmi = float(input("bmi: "))
children = int(input("children: "))
smoker = int(input("smoker (0 or 1): "))

age:  56
sex (female=0, male=1):  1
bmi:  11
children:  1
smoker (0 or 1):  0


In [16]:
# Single-row frame for the user's values; keys follow the training column order
input_data = pd.DataFrame({
    'age': [age],
    'sex_male': [sex],
    'bmi': [bmi],
    'children': [children],
    'smoker_yes': [smoker],
})

In [17]:
# Run the fitted model on the single user-supplied row.
# predict() returns an array, so the first (only) element is the estimate.
predicted_charge = regressor.predict(X=input_data)
print(f'Predicted charge: {predicted_charge[0]:.2f}')

Predicted charge: 8974.88
