In [3]:
# Mount Google Drive into the Colab runtime at /content/drive so that files
# stored there can be read by path (the dataset is loaded from this mount).
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [4]:
# imports
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.compose import make_column_selector
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder

In [5]:
# Load the insurance dataset from the mounted Drive into a DataFrame, then
# preview the first rows to check that the columns were parsed as expected.
path = '/content/drive/MyDrive/Coding Dojo/Raw Data/insurance.csv'
df = pd.read_csv(path)
df.head()

Unnamed: 0,age,sex,bmi,children,smoker,region,charges
0,19,female,27.9,0,yes,southwest,16884.924
1,18,male,33.77,1,no,southeast,1725.5523
2,28,male,33.0,3,no,southeast,4449.462
3,33,male,22.705,0,no,northwest,21984.47061
4,32,male,28.88,0,no,northwest,3866.8552


How well can a patient's insurance charges be predicted from their age, sex, BMI, number of children, smoking status, and region?

In [7]:
# Separate the predictors (X) from the prediction target (y):
# every column except 'charges' is a feature, and 'charges' is the target.
X = df.drop(columns = ['charges'])
y = df['charges']

In [8]:
# Hold out a test set before any preprocessing is fitted.
# No test_size is passed, so scikit-learn's default split proportion applies;
# the fixed random_state keeps the split reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state = 42,
)

In [9]:
# Build a dtype-based column selector: calling it on a DataFrame returns the
# names of that frame's object-dtype columns (here, the categorical features).
cat_selector = make_column_selector(dtype_include = 'object')

In [10]:
# Pull the categorical columns out of the training split and display them.
# The selector is called on X_train to get the matching column names.
cat_data = X_train.loc[:, cat_selector(X_train)]
cat_data

Unnamed: 0,sex,smoker,region
693,male,no,northwest
1297,female,no,southeast
634,male,no,southwest
1022,male,yes,southeast
178,female,no,southwest
...,...,...,...
1095,female,no,northeast
1130,female,no,southeast
1294,male,no,northeast
860,female,yes,southwest


The 'age' feature is numerical.

The 'sex' feature is nominal.

The 'bmi' feature is numerical.

The 'children' feature is numerical.

The 'smoker' feature is nominal.

The 'region' feature is nominal.

In [11]:
# One-hot encode the nominal features using OneHotEncoder.
# The keyword that requests a dense array was renamed from `sparse` to
# `sparse_output` in scikit-learn 1.2, and the old spelling was removed in 1.4.
# Try the current name first and fall back so the cell runs on either version.
try:
    ohe_encoder = OneHotEncoder(sparse_output = False, handle_unknown = 'ignore')
except TypeError:
    # Older scikit-learn (< 1.2) does not know `sparse_output`.
    ohe_encoder = OneHotEncoder(sparse = False, handle_unknown = 'ignore')

# Fit on the training categoricals only so the categories are learned from the
# training split; handle_unknown='ignore' keeps transform from raising on
# categories that were not seen during fit.
ohe_encoder.fit(cat_data)
nom_ohe = ohe_encoder.transform(cat_data)
nom_ohe

array([[0., 1., 1., ..., 1., 0., 0.],
       [1., 0., 1., ..., 0., 1., 0.],
       [0., 1., 1., ..., 0., 0., 1.],
       ...,
       [0., 1., 1., ..., 0., 0., 0.],
       [1., 0., 0., ..., 0., 0., 1.],
       [0., 1., 1., ..., 0., 0., 1.]])

In [12]:
# View the encoded array as a DataFrame with readable column labels.
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# get_feature_names_out() is its replacement and produces the same
# "<column>_<category>" labels.
pd.DataFrame(nom_ohe, columns = ohe_encoder.get_feature_names_out(cat_data.columns))

Unnamed: 0,sex_female,sex_male,smoker_no,smoker_yes,region_northeast,region_northwest,region_southeast,region_southwest
0,0.0,1.0,1.0,0.0,0.0,1.0,0.0,0.0
1,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0
2,0.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0
3,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0
4,1.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0
...,...,...,...,...,...,...,...,...
998,1.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0
999,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0
1000,0.0,1.0,1.0,0.0,1.0,0.0,0.0,0.0
1001,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0
