# Imports and configurations

In [None]:
import datetime
from math import sqrt

import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
import seaborn as sns

from sklearn import metrics
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.preprocessing import StandardScaler, MinMaxScaler

# Plotly Express (high-level, simple to use)
import plotly.express as px
# Plotly Graph Objects (low-level, highly customizable)
import plotly.graph_objects as go

from scipy import stats



In [None]:
# Remote CSV locations for three variants of the insurance dataset; going by the
# variable names these are the raw file, a cleaned file, and a cleaned file with
# outliers removed. The data-loading cell below reads `data_path_clean`.
data_path = "https://storage.googleapis.com/biosense-ml-data/insurance.csv"
data_path_clean = "https://storage.googleapis.com/biosense-ml-data/insurance_clean.csv"
data_path_clean_no_outliers = "https://storage.googleapis.com/biosense-ml-data/insurance_clean_no_outliers.csv"

In [None]:
# Set Plotly as the pandas plotting backend, so that `DataFrame.plot` / `Series.plot`
# calls produce Plotly figures instead of matplotlib ones.

pd.options.plotting.backend = "plotly"

# Read the data

In [None]:
# Column name -> dtype mapping for the insurance table: small integer / float
# types for the numeric columns, `category` for `sex` and `region`, and `bool`
# for the smoker flag.
column_definitions = dict(
    age=np.int8,
    sex='category',
    bmi=np.float32,
    children=np.int8,
    smoker=bool,
    region='category',
    charges=np.float32,
)

In [None]:
# Load the cleaned dataset, casting each column to the dtype declared in `column_definitions`.
df = pd.read_csv(data_path_clean, dtype=column_definitions)

In [None]:
df

Unnamed: 0,age,sex,bmi,children,smoker,region,charges
0,19,female,27.900000,0,True,southwest,16884.923828
1,18,male,33.770000,1,False,southeast,1725.552246
2,28,male,33.000000,3,False,southeast,4449.461914
3,33,male,22.705000,0,False,northwest,21984.470703
4,32,male,28.879999,0,False,northwest,3866.855225
...,...,...,...,...,...,...,...
1334,50,male,30.969999,3,False,northwest,10600.547852
1335,18,female,31.920000,0,False,northeast,2205.980713
1336,18,female,36.849998,0,False,southeast,1629.833496
1337,21,female,25.799999,0,False,southwest,2007.944946


### ❓ Exercise: Can we improve the model by splitting one model into multiple models?


- Train 2 separate models - one for smokers, one for non-smokers
- Check performance of each model
- Create an inference function that gets the data, checks which model to run, and returns the relevant prediction
- What is the overall performance of the combined inference that uses both models?


### Data processing

In [None]:
# One-hot encode the categorical columns; `drop_first=True` drops the first level
# of each so the dummy columns are not perfectly collinear.
# `df` is reassigned here, so after the first run 'region' and 'sex' no longer
# exist. Encoding only the columns that are still present keeps this cell
# re-runnable instead of raising a KeyError on a second execution.
categorical_cols = [col for col in ('region', 'sex') if col in df.columns]
if categorical_cols:
    df = pd.get_dummies(df, columns=categorical_cols, drop_first=True)

In [None]:
# Feature column labels: every column of the encoded frame except the target, `charges`.
features = df.columns.drop('charges')
features

Index(['age', 'bmi', 'children', 'smoker', 'region_northwest',
       'region_southeast', 'region_southwest', 'sex_male'],
      dtype='object')

### Non-smokers

In [None]:
# Non-smoker subset: the rows whose `smoker` flag is False.
df_ns = df.loc[~df['smoker']]
# `charges` is the target; every other column is kept as a feature.
y_ns = df_ns['charges']
X_ns = df_ns.drop(columns=['charges'])

In [None]:
# Two-stage split with a fixed seed: 70 % of the non-smoker rows go to training,
# then the remaining 30 % is halved into validation and test sets.
# NOTE: `X_test_val` / `y_test_val` are scratch names that the smokers section
# reassigns, so run both statements of this cell together.
X_ns_train, X_test_val, y_ns_train, y_test_val = train_test_split(
    X_ns, y_ns, random_state=47, test_size=0.3
)
X_ns_val, X_ns_test, y_ns_val, y_ns_test = train_test_split(
    X_test_val, y_test_val, random_state=47, test_size=0.5
)

In [None]:
X_ns_train.shape, y_ns_val.shape

((745, 8), (160,))

In [None]:
# Linear regression fitted on the non-smoker training split only.
model_ns = LinearRegression()
model_ns.fit(X_ns_train, y_ns_train)

In [None]:
# R² (coefficient of determination) on the rows the non-smoker model was trained on.
model_ns.score(X_ns_train, y_ns_train)

0.48267555236816406

In [None]:
# R² of the non-smoker model on its held-out validation split.
model_ns.score(X_ns_val, y_ns_val)

0.295568585395813

### Smokers

In [None]:
# Smoker subset: the rows whose `smoker` flag is True.
df_s = df.loc[df['smoker']]
# `charges` is the target; every other column is kept as a feature.
y_s = df_s['charges']
X_s = df_s.drop(columns=['charges'])

In [None]:
# Two-stage split with a fixed seed: 70 % of the smoker rows go to training,
# then the remaining 30 % is halved into validation and test sets.
# NOTE: `X_test_val` / `y_test_val` are scratch names that the non-smokers
# section also assigns, so run both statements of this cell together.
X_s_train, X_test_val, y_s_train, y_test_val = train_test_split(
    X_s, y_s, random_state=47, test_size=0.3
)
X_s_val, X_s_test, y_s_val, y_s_test = train_test_split(
    X_test_val, y_test_val, random_state=47, test_size=0.5
)

In [None]:
X_s_train.shape, X_s_val.shape

((191, 8), (41, 8))

In [None]:
# Linear regression fitted on the smoker training split only.
model_s = LinearRegression()
model_s.fit(X_s_train, y_s_train)

In [None]:
# R² (coefficient of determination) on the rows the smoker model was trained on.
model_s.score(X_s_train, y_s_train)

0.7648347020149231

In [None]:
# R² of the smoker model on its held-out validation split.
model_s.score(X_s_val, y_s_val)

0.7420283555984497

### Combining two models for inference

In [None]:
def run_inference(X, model_ns, model_s):
    """Predict by routing each row to the model that matches its smoker flag.

    Rows with a truthy ``smoker`` value are scored by ``model_s`` and all other
    rows by ``model_ns``. Each model is called once on its whole group of rows
    (not once per row), and the rows are passed with every column of ``X`` and
    their original dtypes.

    Parameters
    ----------
    X : pandas.DataFrame
        Feature rows; must contain a ``smoker`` column.
    model_ns, model_s : objects exposing ``predict(X)``
        Fitted models for non-smokers and smokers respectively.

    Returns
    -------
    pandas.Series
        One float prediction per row of ``X``, in the same order and carrying
        ``X.index``. Empty when ``X`` has no rows.
    """
    is_smoker = X['smoker'].to_numpy(dtype=bool)
    preds = np.empty(len(X), dtype=float)
    for group, model in ((is_smoker, model_s), (~is_smoker, model_ns)):
        # Skip empty groups so a model is never asked to predict on zero rows.
        if group.any():
            # Positional boolean mask: works even if X.index has duplicates.
            preds[group] = np.asarray(model.predict(X.loc[group]), dtype=float).ravel()
    return pd.Series(preds, index=X.index)

In [None]:
# Stack the non-smoker and smoker training splits row-wise (non-smokers first),
# keeping features and targets in the same order, so the two-model inference can
# be scored on the pooled training rows.
X_train = pd.concat([X_ns_train, X_s_train], axis='rows')
y_train = pd.concat([y_ns_train, y_s_train], axis='rows')

In [None]:
# Stack the non-smoker and smoker validation splits row-wise (non-smokers first),
# keeping features and targets in the same order.
X_val = pd.concat([X_ns_val, X_s_val], axis='rows')
y_val = pd.concat([y_ns_val, y_s_val], axis='rows')

In [None]:
X_val.shape, y_val.shape

((201, 8), (201,))

In [None]:
# Predictions for the pooled validation rows, each row scored by the model matching its smoker flag.
preds_val = run_inference(X_val, model_ns, model_s)

In [None]:
preds_val

In [None]:
# R² of the combined two-model predictions over the pooled validation rows.
metrics.r2_score(y_val, preds_val)

0.8157261873089529

In [None]:
# Predictions for the pooled training rows, each row scored by the model matching its smoker flag.
preds_train = run_inference(X_train, model_ns, model_s)

In [None]:
# R² of the combined two-model predictions over the pooled training rows.
metrics.r2_score(y_train, preds_train)

0.8572138941276404

### ❓ Exercise - Can we improve the model by removing outliers?
- Use IQR method and remove data outliers (by charges)
- Train the same models again
- Did performance improve once the outliers were removed?