In [1]:
import pandas as pd
# Location of the insurance CSV on GitHub.
url = "https://raw.githubusercontent.com/ogut77/DataScience/master/insurance.csv"
# Download the file and parse it into a DataFrame.
df = pd.read_csv(filepath_or_buffer=url)


In [None]:
# Preview the first five rows of the loaded data.
df.head(n=5)

Unnamed: 0,age,sex,bmi,children,smoker,region,charges
0,19,female,27.9,0,yes,southwest,16884.924
1,18,male,33.77,1,no,southeast,1725.5523
2,28,male,33.0,3,no,southeast,4449.462
3,33,male,22.705,0,no,northwest,21984.47061
4,32,male,28.88,0,no,northwest,3866.8552


Context in Insurance Data
This dataset is often used to predict charges based on the other variables (age, sex, bmi, children, smoker, region). For example:

Input Variables (X): age, sex, bmi, children, smoker, region (features used to make predictions).

Output Variable (y): charges (what you’re trying to predict).

Description of variables
1. Age
Description: The age of the individual (the insured person).
Type: Numerical (integer).
Example Values: 19, 45, 62, etc.
Role in Insurance: Age is a key factor in determining insurance charges. Older individuals often have higher medical costs (and thus higher charges) due to increased health risks.
2. Sex
Description: The gender of the individual.
Type: Categorical (text or binary).
Example Values: "male," "female"
Role in Insurance: Gender can influence insurance charges because health risks and medical expenses may differ between males and females (e.g., pregnancy-related costs for females).
3. BMI (Body Mass Index)
Description: A measure of body fat based on height and weight (calculated as weight in kg divided by height in meters squared).
Type: Numerical (float).
Example Values: 25.3, 30.1, 18.5, etc.
Role in Insurance: Higher BMI often correlates with increased health risks (e.g., obesity-related conditions like diabetes or heart disease), leading to higher insurance charges.
4. Children
Description: The number of children (dependents) covered under the individual’s insurance plan.
Type: Numerical (integer).
Example Values: 0, 1, 3, etc.
Role in Insurance: More children can increase insurance costs slightly, as it may reflect additional healthcare needs, though the effect is often less pronounced than other factors like smoking or age.
5. Smoker
Description: Indicates whether the individual smokes tobacco.
Type: Categorical (text or binary).
Example Values: "yes," "no"
Role in Insurance: Smoking is a major factor in insurance charges. Smokers typically have much higher medical costs due to risks like lung disease or cancer, so their charges are significantly elevated.
6. Region
Description: The geographic region where the individual lives.
Type: Categorical (text).
Example Values: "northeast," "southeast," "southwest," "northwest" (common in U.S.-based datasets).
Role in Insurance: Charges can vary by region due to differences in healthcare costs, lifestyle factors, or local insurance regulations.
7. Charges
Description: The insurance charges (or premiums/costs) billed to the individual, typically in a currency like USD.
Type: Numerical (float).
Example Values: 1684.52, 11234.89, 32050.23, etc.
Role in Insurance: This is usually the target variable (output) in predictive modeling. It represents the amount the insurance company charges, influenced by all the other columns (age, sex, BMI, etc.).



In [2]:
#1. Check if there is null value in dataset df (5 pt)
# Count missing values per column; all zeros means the data is complete.
df.isna().sum()

age         0
sex         0
bmi         0
children    0
smoker      0
region      0
charges     0
dtype: int64

In [3]:
#2. Assign charges to y  and others to X using df. y is output variable and X is input variables (5 pt)
# Inputs: every column except the target.
X = df.drop(columns="charges")
# Output: the column we want to predict.
y = df["charges"]

In [4]:
#3. Use the get_dummies() function from the pandas library to convert the categorical variables in the DataFrame (X) into dummy variables.
# drop_first=True drops the first category’s dummy variable to avoid multicollinearity (5 pt)
# One dummy column per category level, minus the first level of each variable.
X = pd.get_dummies(data=X, drop_first=True)


In [5]:
#Use following methods for the evaluation on test and train data
def evalmetric(y,ypred):
    """Print regression error metrics comparing true values with predictions.

    Both inputs are converted to flat float NumPy arrays and compared
    position by position. This means lists, arrays and pandas objects are
    all accepted, and any pandas index is ignored (subtracting two Series
    directly would align on index labels and yield NaN wherever the labels
    differ).

    Parameters
    ----------
    y : array-like
        True target values. Each error is divided by the matching true
        value for MAPE, so a zero in ``y`` makes MAPE infinite or NaN.
    ypred : array-like
        Predicted values, in the same order and of the same length as ``y``.

    Returns
    -------
    None
        The metrics are printed, one per line: MSE, RMSE, MAE, MAPE (in
        percent) and "R-Squared".

    Raises
    ------
    ValueError
        If ``y`` and ``ypred`` do not contain the same number of values.

    Notes
    -----
    "R-Squared" is the squared Pearson correlation between ``y`` and
    ``ypred``, rounded to 4 decimals. It is never negative and it ignores
    any constant offset or scale error in the predictions, so it can be
    more optimistic than the usual 1 - SS_res/SS_tot definition.
    """
    from scipy.stats import pearsonr
    import numpy as np

    # Positional arrays: drop any index so the comparison is element-by-element.
    y_true = np.asarray(y, dtype=float).ravel()
    y_hat = np.asarray(ypred, dtype=float).ravel()
    if y_true.shape != y_hat.shape:
        raise ValueError(
            "y and ypred must have the same number of values, got "
            f"{y_true.size} and {y_hat.size}"
        )

    e = y_true - y_hat
    mse_f = np.mean(e**2)
    rmse_f = np.sqrt(mse_f)
    mae_f = np.mean(np.abs(e))
    mape_f = 100*np.mean(np.abs(e/y_true))
    crl, _ = pearsonr(y_true, y_hat)
    r2_f = crl*crl
    print("MSE:", mse_f)
    print("RMSE:", rmse_f)
    print("MAE:",mae_f)
    print("MAPE:",mape_f)
    print("R-Squared:", round(r2_f, 4))


In [7]:
#4.Get the correlation between X variables and y variables.(5 pt)
# Pearson correlation of each input column with the target.
X.corrwith(other=y)

age                 0.299008
bmi                 0.198341
children            0.067998
sex_male            0.057292
smoker_yes          0.787251
region_northwest   -0.039905
region_southeast    0.073982
region_southwest   -0.043210
dtype: float64

In [9]:
#5.Split a dataset into 25%  of data as test data  and 75% of data as training data ( pt)
from sklearn.model_selection import train_test_split
# Hold out a quarter of the rows for testing; the fixed seed keeps the split repeatable.
x_train, x_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.25,
    random_state=42,
)

In [None]:
#6. Using Decision Tree and Linear Regression methods, compare the performance results on both test and training data
#to determine which one is more likely to overfit and which is more likely to underfit.
# Do you think that Lasso and Ridge regularization are more likely to improve the results of the Linear model on test data,
# or are Random Forest or Boosting methods more likely to improve the results of the Decision Tree on test data?
#Explain your reasoning.(35 pt)
from sklearn.tree import DecisionTreeRegressor
from sklearn.linear_model import LinearRegression

# Overfitting/underfitting can only be judged from the gap between training
# and test scores, so each model is evaluated on both sets.

#Decision Tree
# random_state makes the tree reproducible across reruns.
dt = DecisionTreeRegressor(random_state=42)
dt.fit(x_train, y_train)
print("Decision Tree (train)")
evalmetric(y_train, dt.predict(x_train))
print("Decision Tree (test)")
evalmetric(y_test, dt.predict(x_test))

#Linear Regression
lr = LinearRegression()
lr.fit(x_train, y_train)
print("Linear Regression (train)")
evalmetric(y_train, lr.predict(x_train))
# y_pred is left holding the Linear Regression test predictions, as before.
y_pred = lr.predict(x_test)
print("Linear Regression (test)")
evalmetric(y_test, y_pred)

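#Decision Tree is more likely to overfit: on the test data it has a higher MSE, RMSE, MAE, MAPE, and lower R-Squared value than Linear Regression, while a fully grown tree fits the training data almost perfectly, so the gap between its training and test performance is large. Linear Regression is the model more likely to underfit.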
#Lasso and Ridge regularization are more likely to improve the results of Linear model test data because they help to reduce overfitting by adding a penalty to the coefficients.
#Random Forest or Boosting methods are more likely to improve the results of Decision tree test data because they help to reduce overfitting by using multiple decision trees to make predictions.
#Random Forest and Boosting methods are ensemble methods that combine multiple decision trees to make predictions, which can help to reduce overfitting and improve the results of Decision tree test data.


In [None]:
#7. Explain performance of linear regression on test data
# using Root mean squared error, mean absolute error, mean absolute percentage error and R2 metric (10 pt)
#Linear Regression
# Predict from `lr` directly instead of reusing the global y_pred, which is
# overwritten by every later model and would otherwise be reported under
# this label if cells are run out of order.
# Reading the metrics: RMSE and MAE are in the same units as charges, and RMSE
# weights large errors more heavily than MAE. MAPE is the mean absolute error
# as a percentage of the true charge. R-Squared is the squared correlation
# between true and predicted charges.
print("Linear Regression")
evalmetric(y_test, lr.predict(x_test))


In [None]:
#8. Use Random Forest and Boosting methods (XGBoost, LightGBM, and CatBoost)
#to obtain the evaluation scores on test data.
#Which Boosting technique yielded the best performance on the test data based on the R² metric?
#Did you achieve a better result compared to Random Forest on the test data based on the R² metric?
#If there is improvement of Random Forest or boosting methods over the decision tree, explain (30 pt)

from sklearn.ensemble import RandomForestRegressor
from xgboost import XGBRegressor    
from lightgbm import LGBMRegressor
from catboost import CatBoostRegressor

# Build every model first, then fit and score them with one shared loop so
# all four are evaluated in exactly the same way.

#Random Forest
# Seeded so that reruns reproduce the same scores.
rf = RandomForestRegressor(random_state=42)

#XGBoost
xgb = XGBRegressor()

#LightGBM
lgb = LGBMRegressor()

#CatBoost
# verbose=0 silences the per-iteration training log.
cb = CatBoostRegressor(verbose=0)

ensemble_models = [
    ("Random Forest", rf),
    ("XGBoost", xgb),
    ("LightGBM", lgb),
    ("CatBoost", cb),
]

for model_name, model in ensemble_models:
    model.fit(x_train, y_train)
    # y_pred ends up holding the CatBoost predictions, as before.
    y_pred = model.predict(x_test)
    print(model_name)
    evalmetric(y_test, y_pred)

#CatBoost yielded the best performance on the test data based on the R² metric.
#Yes, I achieved a better result compared to Random Forest on the test data based on the R² metric.
#There is improvement on Random Forest and Boosting methods over Decision Tree because Random Forest and Boosting methods are ensemble methods that combine multiple decision trees to make predictions, which can help to reduce overfitting and improve the results of Decision tree test data.
