In [1]:
!curl -L -o insurance.zip https://www.kaggle.com/api/v1/datasets/download/mirichoi0218/insurance

  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
  0     0    0     0    0     0      0      0 --:--:-- --:--:-- --:--:--     0
100 16425  100 16425    0     0  23322      0 --:--:-- --:--:-- --:--:-- 23322


In [2]:
from zipfile import ZipFile

with ZipFile("insurance.zip","r") as f:
    f.extractall()

In [4]:
ls

insurance.csv  insurance.zip  [0m[01;34msample_data[0m/


In [5]:
import pandas as pd
df = pd.read_csv("insurance.csv")

In [6]:
df.head()

Unnamed: 0,age,sex,bmi,children,smoker,region,charges
0,19,female,27.9,0,yes,southwest,16884.924
1,18,male,33.77,1,no,southeast,1725.5523
2,28,male,33.0,3,no,southeast,4449.462
3,33,male,22.705,0,no,northwest,21984.47061
4,32,male,28.88,0,no,northwest,3866.8552


In [7]:
df.isnull().sum()

Unnamed: 0,0
age,0
sex,0
bmi,0
children,0
smoker,0
region,0
charges,0


In [8]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1338 entries, 0 to 1337
Data columns (total 7 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   age       1338 non-null   int64  
 1   sex       1338 non-null   object 
 2   bmi       1338 non-null   float64
 3   children  1338 non-null   int64  
 4   smoker    1338 non-null   object 
 5   region    1338 non-null   object 
 6   charges   1338 non-null   float64
dtypes: float64(2), int64(2), object(3)
memory usage: 73.3+ KB


In [11]:
df.corr(numeric_only=True)

Unnamed: 0,age,bmi,children,charges
age,1.0,0.109272,0.042469,0.299008
bmi,0.109272,1.0,0.012759,0.198341
children,0.042469,0.012759,1.0,0.067998
charges,0.299008,0.198341,0.067998,1.0


In [12]:
X = df.drop(['charges'],axis=1)
y = df['charges'].copy()

In [13]:
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder,StandardScaler
from sklearn.linear_model import LinearRegression

In [14]:
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.2)

In [15]:
import numpy as np

num_features = X_train.select_dtypes(include=[np.number]).columns
cat_features = X_train.select_dtypes(exclude=[np.number]).columns

In [20]:
num_pipeline = Pipeline ([
    ('impute',SimpleImputer(strategy='median')),
    ('scaler',StandardScaler())
])

cat_pipeline = Pipeline ([
    ('impute', SimpleImputer(strategy='most_frequent')),
    ('encoder',OneHotEncoder(sparse_output=False,handle_unknown='ignore'))
])

transformer = ColumnTransformer ([
    ('num',num_pipeline,num_features),
    ('cat',cat_pipeline,cat_features)
],remainder='passthrough')

estimator = LinearRegression()

pipeline = Pipeline ([
    ('transformer',transformer),
    ('estimator',estimator)
])

In [21]:
pipeline.fit(X_train,y_train)

In [27]:
from sklearn.metrics import mean_absolute_error,mean_squared_error,root_mean_squared_error,r2_score

In [24]:
preds = pipeline.predict(X_test)

In [39]:
n = df.value_counts().sum()
p = len(df.columns)

In [42]:
p


7

In [43]:
print('==== Scores ====')
print('Train Score: ',pipeline.score(X_train,y_train))
print('Test Score: ',pipeline.score(X_test,y_test))
print('MAE: ',mean_absolute_error(y_test,preds))
print("MSE: ",mean_squared_error(y_test,preds))
print("RMSE: ",root_mean_squared_error(y_test,preds))
print("R_squared",r2_score(y_test,preds))
print("R_adjust: ", 1 - ((1 - r2_score(y_test,preds)) * (n-1)) / (n - p -1))

==== Scores ====
Train Score:  0.7458519465555833
Test Score:  0.7673596886834448
MAE:  3639.378115577046
MSE:  31385284.868961472
RMSE:  5602.257122710584
R_squared 0.7673596886834448
R_adjust:  0.7661352659923051
