In [1]:
# Enable IPython's autoreload extension in mode 2: imported modules are
# reloaded before each cell executes, so edits to local .py files are picked
# up without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [2]:
import os

import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split

In [10]:
# Load the raw insurance dataset from disk.
raw_csv_path = 'data/raw/insurance.csv'
df = pd.read_csv(raw_csv_path)

In [11]:
# Preview the first five rows of the raw frame.
df.head(n=5)

Unnamed: 0,age,sex,bmi,children,smoker,region,charges
0,18,female,33.82,0,no,southeast,1630.6617
1,19,female,23.48,1,no,southeast,1836.8043
2,46,male,30.57,2,no,southeast,6632.3513
3,54,male,32.05,1,yes,southeast,31922.4295
4,21,male,21.345,4,no,northeast,1638.37255


In [12]:
# (rows, columns) of the raw frame.
df.shape

(50000, 7)

In [13]:
# Column dtypes and non-null counts; used to check for missing values.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 50000 entries, 0 to 49999
Data columns (total 7 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   age       50000 non-null  int64  
 1   sex       50000 non-null  object 
 2   bmi       50000 non-null  float64
 3   children  50000 non-null  int64  
 4   smoker    50000 non-null  object 
 5   region    50000 non-null  object 
 6   charges   50000 non-null  float64
dtypes: float64(2), int64(2), object(3)
memory usage: 2.7+ MB


In [14]:
# Summary statistics for every column; include='all' adds the non-numeric
# columns (unique / top / freq) alongside the numeric ones.
df.describe(include='all')

Unnamed: 0,age,sex,bmi,children,smoker,region,charges
count,50000.0,50000,50000.0,50000.0,50000,50000,50000.0
unique,,2,,,2,4,
top,,male,,,no,southeast,
freq,,25176,,,38976,14197,
mean,39.46312,,30.713734,1.11376,,,13343.216363
std,14.117142,,6.092727,1.212835,,,12131.222744
min,18.0,,17.291,0.0,,,1137.5359
25%,27.0,,26.6,0.0,,,4694.4318
50%,40.0,,30.3,1.0,,,9399.232775
75%,51.0,,34.57,2.0,,,17340.746925


In [15]:
# Work on an independent (deep) copy so the raw frame `df` stays untouched.
df_cleaned = df.copy(deep=True)

In [16]:
# Separate the target column ('charges') from the feature columns.
# Written so the cell can be re-run safely: the target is read from the
# untouched raw frame `df` (restricted to the rows still present in
# df_cleaned) and the column is removed by reassignment. An in-place pop()
# raises KeyError the second time the cell is executed.
target = df.loc[df_cleaned.index, 'charges'].copy()
df_cleaned = df_cleaned.drop(columns='charges', errors='ignore')

In [17]:
# Split the feature columns by dtype: the numeric columns and the remaining
# (non-numeric) columns are preprocessed separately further down.
num_cols = list(df_cleaned.select_dtypes('number').columns)
# Keep the DataFrame's own column order. A set difference yields an arbitrary
# order that can change between kernel sessions (string hashing is
# randomised), which would reorder the encoder's inputs and the resulting
# feature columns from one run to the next.
cat_cols = [col for col in df_cleaned.columns if col not in num_cols]

In [18]:
from sklearn.preprocessing import StandardScaler, OneHotEncoder

In [19]:
# One-hot encoder configured to return a dense array and to drop the first
# category of every encoded feature.
ohe = OneHotEncoder(drop='first', sparse_output=False)

In [20]:
# Fit the encoder on the categorical columns and encode them in one step.
categorical_frame = df_cleaned[cat_cols]
features = ohe.fit_transform(categorical_frame)

In [21]:
# Wrap the encoded array in a DataFrame labelled with the encoder's
# generated output column names.
encoded_names = ohe.get_feature_names_out()
features = pd.DataFrame(data=features, columns=encoded_names)

In [22]:
# Standardising scaler for the numeric columns (centering and unit-variance
# scaling, i.e. the library defaults, spelled out explicitly).
scaler = StandardScaler(with_mean=True, with_std=True)

In [23]:
# Fit the scaler on the training rows only, so that statistics of the
# validation/test rows do not leak into the preprocessing, then scale every
# row with those training-set parameters.
#
# The two calls below must mirror the train/val/test split performed later in
# this notebook (same test_size and random_state). With a fixed seed,
# train_test_split selects rows based only on the number of samples, so the
# positions computed here are the rows that end up in X_train.
row_positions = np.arange(len(df_cleaned))
data_positions = train_test_split(row_positions, test_size=0.2, random_state=8)[0]
train_positions = train_test_split(data_positions, test_size=0.2, random_state=8)[0]

scaler.fit(df_cleaned[num_cols].iloc[train_positions])
features[num_cols] = scaler.transform(df_cleaned[num_cols])

In [24]:
from joblib import dump

In [26]:
# Persist the fitted encoder and scaler to disk.
# joblib's dump() does not create missing folders, so make sure the target
# directory exists first (e.g. on a fresh checkout).
os.makedirs('models', exist_ok=True)
dump(ohe, 'models/ohe.joblib')
dump(scaler, 'models/scaler.joblib')

['models/scaler.joblib']

In [27]:
from sklearn.model_selection import train_test_split

In [28]:
# Two-stage split with identical settings: first hold out 20% of all rows as
# the test set, then hold out 20% of the remainder as the validation set.
split_options = dict(test_size=0.2, random_state=8)
X_data, X_test, y_data, y_test = train_test_split(features, target, **split_options)
X_train, X_val, y_train, y_val = train_test_split(X_data, y_data, **split_options)

In [29]:
# Sanity check: feature-matrix shapes for train / validation / test,
# one per line.
print(X_train.shape, X_val.shape, X_test.shape, sep='\n')

(32000, 8)
(8000, 8)
(10000, 8)


In [30]:
# Sanity check: target shapes for train / validation / test, one per line.
print(y_train.shape, y_val.shape, y_test.shape, sep='\n')

(32000,)
(8000,)
(10000,)


In [32]:
# Write every split to data/processed as CSV, without the row index.
# to_csv() does not create missing folders, so create the directory up front
# (e.g. on a fresh checkout).
processed_dir = 'data/processed'
os.makedirs(processed_dir, exist_ok=True)

splits_to_save = {
    'X_train': X_train,
    'X_val': X_val,
    'X_test': X_test,
    'y_train': y_train,
    'y_val': y_val,
    'y_test': y_test,
}
for split_name, split_data in splits_to_save.items():
    split_data.to_csv(f'{processed_dir}/{split_name}.csv', index=False)

In [33]:
# Constant baseline prediction: the mean of the training targets.
pred_value = y_train.mean()

In [34]:
# Baseline predictions: one row per training sample in a single column,
# every entry equal to pred_value.
n_train_rows = len(y_train)
y_base = np.full(shape=(n_train_rows, 1), fill_value=pred_value)

In [35]:
from sklearn.metrics import mean_squared_error as mse
from sklearn.metrics import mean_absolute_error as mae