# Predicting the cost of health insurance for a person

We are required to:
- Understand a new dataset.
- Process it by applying exploratory data analysis (EDA).
- Model the data using linear regression.
- Analyze the results and optimize the model if possible.

In [1]:
import os

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn import metrics
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score
from sklearn.model_selection import train_test_split

In [2]:
# Load the data
# The CSV location can be overridden with the INSURANCE_DATA_PATH environment
# variable so the notebook can run on machines other than the author's.
# When the variable is unset, the original absolute path is used unchanged.
DEFAULT_DATA_PATH = 'C:/Users/Jorge Payà/Desktop/4Geeks/DSML Bootcamp/linear-regression-project/data/raw/data.csv'
path = os.environ.get('INSURANCE_DATA_PATH', DEFAULT_DATA_PATH)
total_data = pd.read_csv(path)
total_data.head()

Unnamed: 0,age,sex,bmi,children,smoker,region,charges
0,19,female,27.9,0,yes,southwest,16884.924
1,18,male,33.77,1,no,southeast,1725.5523
2,28,male,33.0,3,no,southeast,4449.462
3,33,male,22.705,0,no,northwest,21984.47061
4,32,male,28.88,0,no,northwest,3866.8552


<ul style="color: pink; font-family: Comic Sans MS">
    <li>age. Age of primary beneficiary (numeric)</li>
    <li>sex. Gender of the primary beneficiary (categorical)</li>
    <li>bmi. Body mass index (numeric)</li>
    <li>children. Number of children/dependents covered by health insurance (numeric)</li>
    <li>smoker. Whether the primary beneficiary smokes: yes or no (categorical)</li>
    <li>region. Beneficiary's residential area in the U.S.: northeast, southeast, southwest, northwest (categorical)</li>
    <li>charges. Health insurance premium (numeric)</li>
</ul>

In [9]:
# Perform the EDA
# Each entry pairs a report heading with the quantity printed under it:
#   1. the dtype of every column,
#   2. the number of missing values per column,
#   3. the number of fully duplicated rows,
#   and finally the duplicated rows themselves.
eda_report = (
    ('Data types:\n', total_data.dtypes),
    ('Missing values:\n', total_data.isnull().sum()),
    ('Duplicates:\n', total_data.duplicated().sum()),
    ('Duplicated values:\n', total_data[total_data.duplicated()]),
)
for position, (heading, value) in enumerate(eda_report):
    if position > 0:
        # A row of asterisks separates consecutive sections.
        print('*' * 50)
    print(heading, value)


Data types:
 age           int64
sex          object
bmi         float64
children      int64
smoker       object
region       object
charges     float64
dtype: object
**************************************************
Missing values:
 age         0
sex         0
bmi         0
children    0
smoker      0
region      0
charges     0
dtype: int64
**************************************************
Duplicates:
 1
**************************************************
Duplicated values:
      age   sex    bmi  children smoker     region    charges
581   19  male  30.59         0     no  northwest  1639.5631


In [10]:
# 4. Summary statistics (count, mean, std, min, quartiles, max) of the numeric columns
summary_stats = total_data.describe()
print('Describe data:\n', summary_stats)

Describe data:
                age          bmi     children       charges
count  1338.000000  1338.000000  1338.000000   1338.000000
mean     39.207025    30.663397     1.094918  13270.422265
std      14.049960     6.098187     1.205493  12110.011237
min      18.000000    15.960000     0.000000   1121.873900
25%      27.000000    26.296250     0.000000   4740.287150
50%      39.000000    30.400000     1.000000   9382.033000
75%      51.000000    34.693750     2.000000  16639.912515
max      64.000000    53.130000     5.000000  63770.428010


In [12]:
# # 5. Check for outliers
# # 5.1 Check for outliers in the numerical columns
# # 5.1.1 Create a list of numerical columns
# numerical_columns = total_data.select_dtypes(include=np.number).columns.tolist()
# print('Numerical columns:',numerical_columns)
# print('*'*50)

In [13]:
# Integer-encode each categorical column into a companion "<name>_n" column.
# pd.factorize numbers the categories in order of first appearance.
# NOTE(review): region has no natural order, yet a linear model will treat its
# integer codes as ordered magnitudes — consider one-hot encoding instead.
for column_name in ("sex", "smoker", "region"):
    codes, uniques = pd.factorize(total_data[column_name])
    total_data[column_name + "_n"] = codes
total_data.head()


Unnamed: 0,age,sex,bmi,children,smoker,region,charges,sex_n,smoker_n,region_n
0,19,female,27.9,0,yes,southwest,16884.924,0,0,0
1,18,male,33.77,1,no,southeast,1725.5523,1,1,1
2,28,male,33.0,3,no,southeast,4449.462,1,1,1
3,33,male,22.705,0,no,northwest,21984.47061,1,1,2
4,32,male,28.88,0,no,northwest,3866.8552,1,1,2


In [14]:
# Remove the original text columns for sex, smoker and region; the integer
# "_n" versions are the ones kept for modelling.
# errors="ignore" makes this cell safe to re-run: total_data is reassigned
# here, so on a second execution the three columns are already gone and a
# plain drop would raise KeyError.
total_data = total_data.drop(columns=["sex", "smoker", "region"], errors="ignore")
total_data.head()

Unnamed: 0,age,bmi,children,charges,sex_n,smoker_n,region_n
0,19,27.9,0,16884.924,0,0,0
1,18,33.77,1,1725.5523,1,1,1
2,28,33.0,3,4449.462,1,1,1
3,33,22.705,0,21984.47061,1,1,2
4,32,28.88,0,3866.8552,1,1,2


In [25]:
# Perform linear regression
# The scikit-learn names used below (train_test_split, LinearRegression,
# metrics, r2_score) are imported in the notebook's import cell.

# 1. Separate the predictors from the target column ("charges").
X = total_data.drop(columns=["charges"])
y = total_data["charges"]

# 2. Hold out 20% of the rows for evaluation; the fixed random_state keeps the
#    split reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

# 3. Create the model and fit it on the training rows only.
model = LinearRegression()
model.fit(X_train, y_train)

# 4. Predict charges for the held-out rows.
y_pred = model.predict(X_test)

# 5. Evaluate the predictions against the held-out targets.
r2 = r2_score(y_test, y_pred)                       # coefficient of determination
mae = metrics.mean_absolute_error(y_test, y_pred)   # mean absolute error
mse = metrics.mean_squared_error(y_test, y_pred)    # mean squared error
rmse = np.sqrt(mse)                                 # root of MSE, in the units of "charges"

print('R2 score:', r2)
print('Mean Absolute Error:', mae)
print('Mean Squared Error:', mse)
print('Root Mean Squared Error:', rmse)

# # 8. Plot the results
# # 8.1. Create a dataframe with the actual and predicted values
# results = pd.DataFrame({'Actual': y_test, 'Predicted': y_pred})
# print('Results:\n', results)

# # 8.2. Plot the actual and predicted values
# results.plot(kind='bar', figsize=(10, 8))
# plt.show()

# # 8.3. Plot the residuals
# residuals = y_test - y_pred
# sns.displot(residuals, kde=True)
# plt.show()

# # 8.4. Plot the actual vs predicted values
# plt.scatter(y_test, y_pred)
# plt.xlabel('Actual')
# plt.ylabel('Predicted')
# plt.show()



R2 score: 0.799874714544996
Mean Absolute Error: 3930.3332739011385
Mean Squared Error: 31845929.134159423
Root Mean Squared Error: 5643.2197488809015
