# Imports and configurations

In [1]:
import datetime
from math import sqrt

import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
import seaborn as sns

from sklearn import metrics
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.preprocessing import StandardScaler, MinMaxScaler

# Plotly Express (high-level, simple to use)
import plotly.express as px
# Plotly Graph Objects (low-level, highly customizable)
import plotly.graph_objects as go

from scipy import stats



In [2]:
# All three CSV files sit under the same bucket prefix; keeping the prefix in
# one constant means relocating the data is a one-line change.
DATA_BUCKET_URL = "https://storage.googleapis.com/biosense-ml-data"

# Judging by the file names: the raw data, a cleaned version, and a cleaned
# version with outliers removed.
data_path = f"{DATA_BUCKET_URL}/insurance.csv"
data_path_clean = f"{DATA_BUCKET_URL}/insurance_clean.csv"
data_path_clean_no_outliers = f"{DATA_BUCKET_URL}/insurance_clean_no_outliers.csv"

In [3]:
# Make DataFrame.plot / Series.plot render through Plotly.
pd.set_option("plotting.backend", "plotly")

In [21]:
# Print NumPy arrays with two decimals and without scientific notation.
np.set_printoptions(suppress=True, precision=2)

# Read the data

In [4]:
# Column name -> dtype mapping handed to pd.read_csv when loading the data.
column_definitions = dict(
    age=np.int8,
    sex='category',
    bmi=np.float32,
    children=np.int8,
    smoker=bool,
    region='category',
    charges=np.float32,
)

In [5]:
# Load the CSV at `data_path_clean`, applying the dtypes declared in
# `column_definitions` while parsing.
df = pd.read_csv(
    filepath_or_buffer=data_path_clean,
    dtype=column_definitions,
)

In [None]:
# Display the column labels of the loaded frame.
df.columns

Index(['age', 'sex', 'bmi', 'children', 'smoker', 'region', 'charges'], dtype='object')

# Train Linear Regression

In [35]:
# Target: the `charges` column.
y = df['charges']

# Predictors: pick the feature columns and one-hot encode `region` and `sex`
# in one expression; drop_first=True omits the first level of each encoded
# column.
X = pd.get_dummies(
    df[['age', 'sex', 'bmi', 'smoker', 'region', 'children']],
    columns=['region', 'sex'],
    drop_first=True,
)

In [36]:
# Hold out 30% of the rows, then halve that hold-out into validation and test
# sets. Both splits use the same fixed seed so they are reproducible.
X_train, X_test_val, y_train, y_test_val = train_test_split(
    X, y, test_size=0.3, random_state=47
)
X_val, X_test, y_val, y_test = train_test_split(
    X_test_val, y_test_val, test_size=0.5, random_state=47
)

# Fit a linear regression on the training portion only.
model = LinearRegression()
model.fit(X_train, y_train)

In [37]:
# Score (R^2 for a regressor) of the fitted model on the training split.
model.score(X_train, y_train)

0.7473715543746948

In [38]:
# Score (R^2 for a regressor) of the fitted model on the validation split.
model.score(X_val, y_val)

0.7314801216125488

In [41]:
# Fitted coefficients, one per feature column of the training data.
model.coef_

array([258.5, 317.1, 23600.0, 422.0, 591.4, -538.3, 7.1, -113.2],
      dtype=float32)

In [None]:
# Fitted intercept (bias) term of the linear model.
model.intercept_

-11790.111

In [42]:
# One-column table of fitted coefficients, indexed by feature name.
# The labels come from the model itself (`feature_names_in_`, recorded when it
# was fitted on a DataFrame) rather than from the global `X`, which later
# sections reassign; if cells run out of order, `X.columns` may no longer
# describe the features this model was trained on.
weights = pd.DataFrame(model.coef_, index=model.feature_names_in_, columns=['weight'])

In [43]:
# Show the coefficients from largest to smallest signed value (not by magnitude).
weights.sort_values(by='weight', ascending=False)

Unnamed: 0,weight
smoker,23599.974609
region_northwest,591.429871
children,422.01709
bmi,317.129578
age,258.514221
region_southwest,7.086417
sex_male,-113.182938
region_southeast,-538.333618


# Normalization

### min-max

In [None]:
# Target: the `charges` column.
y = df['charges']

# Predictors for the min-max experiment: age, bmi, smoker, plus one-hot
# encoded region (first level dropped).
X = pd.get_dummies(
    df[['age', 'bmi', 'smoker', 'region']],
    columns=['region'],
    drop_first=True,
)

In [None]:
# Hold out 30% of the rows, then halve that hold-out into validation and test
# sets, using the same fixed seed for both splits.
X_train, X_test_val, y_train, y_test_val = train_test_split(
    X, y, test_size=0.3, random_state=47
)
X_val, X_test, y_val, y_test = train_test_split(
    X_test_val, y_test_val, test_size=0.5, random_state=47
)

In [None]:
# Scaler that maps each feature onto [0, 1] (the default range, spelled out).
minmax_scaler = MinMaxScaler(feature_range=(0, 1))

In [None]:
# Fit the scaler on the training split only, so validation and test rows do
# not influence the scaling parameters.
minmax_scaler.fit(X_train)

In [None]:
# Preview the scaled training features; the result is displayed, not stored.
minmax_scaler.transform(X_train)

array([[0.5, 0.3, 1.0, 1.0, 0.0, 0.0],
       [0.6, 0.6, 1.0, 0.0, 1.0, 0.0],
       [0.8, 0.3, 0.0, 1.0, 0.0, 0.0],
       ...,
       [0.4, 0.3, 0.0, 0.0, 0.0, 1.0],
       [0.6, 0.6, 1.0, 1.0, 0.0, 0.0],
       [0.0, 0.4, 0.0, 0.0, 0.0, 0.0]], dtype=float32)

In [None]:
# Wrap the scaled array back into a DataFrame.
# The scaler returns a bare array, so both the column labels and the row index
# are re-attached from X_train. Without `index=`, the frame would get a fresh
# 0..n-1 index and silently misalign with y_train in any later pandas
# operation that aligns on index.
X_train_scaled = pd.DataFrame(
    minmax_scaler.transform(X_train),
    index=X_train.index,
    columns=X_train.columns,
)

In [None]:
# Fresh linear regression (intercept enabled, the default) trained on the
# min-max scaled training features.
model = LinearRegression(fit_intercept=True)
model.fit(X_train_scaled, y_train)

In [None]:
# Score (R^2 for a regressor) on the scaled training split.
model.score(X_train_scaled, y_train)

0.7455049753189087

In [None]:
# Scale the validation split with the scaler fitted on the training data.
# The row index and column labels are taken from X_val itself, so the scaled
# frame stays aligned with y_val and does not depend on the global `X`.
X_val_scaled = pd.DataFrame(
    minmax_scaler.transform(X_val),
    index=X_val.index,
    columns=X_val.columns,
)

In [None]:
# Score (R^2 for a regressor) on the scaled validation split.
model.score(X_val_scaled, y_val)

0.727748453617096

In [None]:
# Coefficients of the model trained on min-max scaled features.
model.coef_

array([11991.3, 11793.5, 23617.7, 631.9, -545.8, 12.3], dtype=float32)

In [None]:
# Intercept of the model trained on min-max scaled features.
model.intercept_

-1721.1562

### Z-score

**Interpretability and Coefficient Comparison:**


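- Unnormalized Features: With unnormalized features, you cannot directly compare the magnitudes of coefficients to judge feature importance. A larger coefficient does not necessarily mean that the feature is "more important" in an absolute sense; it may simply mean that the feature is measured on a smaller scale. In this notebook, for instance, the model trained on raw features gives `bmi` a larger weight (≈317) than `age` (≈259), yet after z-scoring the weight of `age` (≈3663) is almost twice that of `bmi` (≈1949).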


- Normalized Features: After normalization (standardization to zero mean and unit variance, or min-max scaling to the range 0–1), the coefficients become more directly comparable in terms of their relative impact on the target variable **within the context of the normalized features**. With standardized features, each coefficient is the change in the prediction for a one-standard-deviation change in that feature, so if $|w_1|$ is significantly larger than $|w_2|$, it suggests that Feature 1 has a relatively stronger influence after accounting for the features' typical variation.

In [6]:
# Target: the `charges` column.
y = df['charges']

# Predictors for the z-score experiment: age, bmi, smoker, plus one-hot
# encoded region (first level dropped).
X = pd.get_dummies(
    df[['age', 'bmi', 'smoker', 'region']],
    columns=['region'],
    drop_first=True,
)

In [7]:
# Hold out 30% of the rows, then halve that hold-out into validation and test
# sets, using the same fixed seed for both splits.
X_train, X_test_val, y_train, y_test_val = train_test_split(
    X, y, test_size=0.3, random_state=47
)
X_val, X_test, y_val, y_test = train_test_split(
    X_test_val, y_test_val, test_size=0.5, random_state=47
)

In [8]:
# Standardizing scaler: centre each feature and scale it to unit variance
# (both are the defaults, spelled out here).
z_scaler = StandardScaler(with_mean=True, with_std=True)

In [9]:
# Fit the scaler on the training split only, so validation and test rows do
# not influence the learned means and variances.
z_scaler.fit(X_train)

In [10]:
# Per-feature means the scaler learned from X_train.
z_scaler.mean_

array([39.36392743, 30.55909287,  0.20384205,  0.24226254,  0.26360726,
        0.25080043])

In [11]:
# Per-feature variances the scaler learned from X_train.
z_scaler.var_

array([1.97469478e+02, 3.77456021e+01, 1.62290468e-01, 1.83571402e-01,
       1.94118471e-01, 1.87899573e-01])

In [12]:
# Wrap the standardized array back into a DataFrame.
# The scaler returns a bare array, so the row index is re-attached from
# X_train along with the column labels. Without `index=`, the frame would get
# a fresh 0..n-1 index and silently misalign with y_train in any later pandas
# operation that aligns on index.
X_train_scaled = pd.DataFrame(
    z_scaler.transform(X_train),
    index=X_train.index,
    columns=X_train.columns,
)

In [13]:
# Fresh linear regression (intercept enabled, the default) trained on the
# standardized training features.
model = LinearRegression(fit_intercept=True)
model.fit(X_train_scaled, y_train)

In [14]:
# Score (R^2 for a regressor) on the standardized training split.
model.score(X_train_scaled, y_train)

0.7455049753189087

In [15]:
# Standardize the validation split with the scaler fitted on the training data.
# The row index is carried over from X_val so the scaled frame stays aligned
# with y_val.
X_val_scaled = pd.DataFrame(
    z_scaler.transform(X_val),
    index=X_val.index,
    columns=X_val.columns,
)

In [16]:
# Score (R^2 for a regressor) on the standardized validation split.
model.score(X_val_scaled, y_val)

0.727748453617096

In [30]:
# One-column table of the standardized model's coefficients, indexed by
# feature name.
# The labels come from the model itself (`feature_names_in_`, recorded when it
# was fitted on a DataFrame) rather than from the global `X`, so they stay
# correct even if `X` has been reassigned by a cell run out of order.
weights = pd.DataFrame(model.coef_, index=model.feature_names_in_, columns=['weight'])

In [34]:
# Show the coefficients from largest to smallest signed value (not by magnitude).
weights.sort_values(by='weight', ascending=False)

Unnamed: 0,weight
smoker,9514.46582
age,3663.193848
bmi,1949.332031
region_northwest,270.739594
region_southwest,5.330894
region_southeast,-240.464142


In [23]:
# Intercept of the model trained on standardized features.
model.intercept_

13306.697