## Medical Cost Prediction


Given patient data, let's try to predict the charges a given will incur.

We will use a variety of linear regression models to make our predictions.

In [1]:
import pandas as pd
import numpy as np

from sklearn.preprocessing import StandardScaler,MinMaxScaler, RobustScaler
from sklearn.model_selection import train_test_split

from sklearn.linear_model import LinearRegression, Ridge, Lasso, ElasticNet, RidgeCV, LassoCV, ElasticNetCV

In [2]:
data = pd.read_csv("C:/Users/Ashraf/Documents/Extra-folder/insurance.csv")

In [3]:
data

Unnamed: 0,age,sex,bmi,children,smoker,region,charges
0,19,female,27.900,0,yes,southwest,16884.92400
1,18,male,33.770,1,no,southeast,1725.55230
2,28,male,33.000,3,no,southeast,4449.46200
3,33,male,22.705,0,no,northwest,21984.47061
4,32,male,28.880,0,no,northwest,3866.85520
...,...,...,...,...,...,...,...
1333,50,male,30.970,3,no,northwest,10600.54830
1334,18,female,31.920,0,no,northeast,2205.98080
1335,18,female,36.850,0,no,southeast,1629.83350
1336,21,female,25.800,0,no,southwest,2007.94500


In [4]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1338 entries, 0 to 1337
Data columns (total 7 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   age       1338 non-null   int64  
 1   sex       1338 non-null   object 
 2   bmi       1338 non-null   float64
 3   children  1338 non-null   int64  
 4   smoker    1338 non-null   object 
 5   region    1338 non-null   object 
 6   charges   1338 non-null   float64
dtypes: float64(2), int64(2), object(3)
memory usage: 73.3+ KB


## Preprocessing

In [5]:
print('Total missing values:', data.isna().sum().sum())

Total missing values: 0


In [6]:
data['children'] = data['children'].astype(str)

In [7]:
print('Total non-numeric columns:', len(data.select_dtypes('object').columns))

Total non-numeric columns: 4


In [8]:
{column: list(data[column].unique()) for column in data.select_dtypes('object').columns}

{'sex': ['female', 'male'],
 'children': ['0', '1', '3', '2', '5', '4'],
 'smoker': ['yes', 'no'],
 'region': ['southwest', 'southeast', 'northwest', 'northeast']}

In [9]:
def binary_encode(df, column, positive_value):
    df = df.copy()
    df[column] = df[column].apply(lambda x:1 if x == positive_value else 0)
    return df

def onehot_encode(df, column, prefix):
    df = df.copy()
    dummies = pd.get_dummies(df[column], prefix=prefix)
    df = pd.concat([df, dummies],axis=1)
    df = df.drop(column, axis=1)
    return df

In [10]:
def preprocess_inputs(df, scaler, train_size=0.7):
    
    df = df.copy()
    # Binary encode sex and smoker columns
    df = binary_encode(df, 'sex', 'male')
    df = binary_encode(df,'smoker', 'yes')
    
    # onehot_encode the children and region columns
    df = onehot_encode(df, 'children', 'ch')
    df = onehot_encode(df, 'region', 're')
    
    # Split df in X and y
    y = df['charges'].copy()
    X = df.drop('charges', axis=1).copy()
    
    # Scale X with the given scaler
    X = pd.DataFrame(scaler.fit_transform(X), columns=X.columns)
    
    # Split into train and test set
    X_train,X_test,y_train,y_test = train_test_split(X,y, train_size=train_size, random_state=42)
    return X_train,X_test,y_train,y_test

In [11]:
data

Unnamed: 0,age,sex,bmi,children,smoker,region,charges
0,19,female,27.900,0,yes,southwest,16884.92400
1,18,male,33.770,1,no,southeast,1725.55230
2,28,male,33.000,3,no,southeast,4449.46200
3,33,male,22.705,0,no,northwest,21984.47061
4,32,male,28.880,0,no,northwest,3866.85520
...,...,...,...,...,...,...,...
1333,50,male,30.970,3,no,northwest,10600.54830
1334,18,female,31.920,0,no,northeast,2205.98080
1335,18,female,36.850,0,no,southeast,1629.83350
1336,21,female,25.800,0,no,southwest,2007.94500


In [12]:
X_train,X_test,y_train,y_test = preprocess_inputs(data, StandardScaler(), train_size=0.7)

## Training

In [13]:
models = {
    '         OLS Model:': LinearRegression(),
    '          L2 Model:': Ridge(),
    '          L1 Model:': Lasso(),
    '  ElasticNet Model:': ElasticNet(),
    '        L2CV Model:': RidgeCV(),
    '        L1CV MOdel:': LassoCV(),
    'ElasticNetCV Model:': ElasticNetCV()
 
}

for model in models.values():
    model.fit(X_train, y_train)

In [14]:
print('Model R^2 Scores:\n-----------------')

for name, model in models.items():
    print(name, model.score(X_test,y_test))

Model R^2 Scores:
-----------------
         OLS Model: 0.7672035409971123
          L2 Model: 0.7675373226475438
          L1 Model: 0.7675489914078932
  ElasticNet Model: 0.68048432066751
        L2CV Model: 0.7675373226475903
        L1CV MOdel: 0.7673201533250288
ElasticNetCV Model: 0.14020178648776382
