## Test Dataset
Source: [Medical Cost Personal Datasets (Kaggle)](https://www.kaggle.com/mirichoi0218/insurance)

In [3]:
import pandas as pd
from sklearn import linear_model
from sklearn.model_selection import KFold, train_test_split
from sklearn.metrics import mean_squared_error, r2_score
from collections import defaultdict

In [2]:
# Load the insurance dataset; the relative path is resolved against the
# kernel's current working directory.
data = pd.read_csv('insurance.csv')

In [4]:
# Show the loaded frame as this cell's output.
data

Unnamed: 0,age,sex,bmi,children,smoker,region,charges
0,19,female,27.900,0,yes,southwest,16884.92400
1,18,male,33.770,1,no,southeast,1725.55230
2,28,male,33.000,3,no,southeast,4449.46200
3,33,male,22.705,0,no,northwest,21984.47061
4,32,male,28.880,0,no,northwest,3866.85520
...,...,...,...,...,...,...,...
1333,50,male,30.970,3,no,northwest,10600.54830
1334,18,female,31.920,0,no,northeast,2205.98080
1335,18,female,36.850,0,no,southeast,1629.83350
1336,21,female,25.800,0,no,southwest,2007.94500


In [5]:
# Hold out 30% of the rows with a fixed seed, then persist both partitions
# as CSV files (without the index column).
# The split is taken from `data` as it exists when this cell runs.
train, test = train_test_split(
    data,
    test_size=0.3,
    random_state=42,
)
train.to_csv(path_or_buf='train.csv', index=False)
test.to_csv(path_or_buf='test.csv', index=False)

In [36]:
# One-hot encode the categorical features. `children` is a count but is
# encoded as a category here. drop_first=True removes one level per feature.
# NOTE: this rebinds `data`, so the cell is not idempotent -- re-running it
# without reloading the CSV is expected to fail because the listed columns
# have already been replaced by their OHE_* dummies.
categorical_columns = ['sex', 'children', 'smoker', 'region']
data = pd.get_dummies(
    data,
    columns=categorical_columns,
    prefix='OHE',
    drop_first=True,
)

In [37]:
# Ordinary least-squares regressor with default settings; the same instance
# is re-fitted wherever `model.fit` is called later in the notebook.
model = linear_model.LinearRegression()

In [46]:
# Separate the regression target (`charges`) from the feature matrix.
y = data['charges']
X = data.drop(columns='charges')

In [48]:
# K-fold cross-validation of `model` on (X, y) with a shuffled, seeded split
# (KFold's default number of folds). Per-fold RMSE and R^2 are collected for
# both the training and the held-out part, keyed as
# 'train_rmse', 'train_r2', 'test_rmse', 'test_r2'.
kfold = KFold(shuffle=True, random_state=42)
cv_loss = defaultdict(list)
for i, (train_idx, test_idx) in enumerate(kfold.split(X, y)):
    print(f'Start {i} fold')

    # Positional indexing: KFold yields integer positions, not index labels.
    train_X, train_y = X.iloc[train_idx], y.iloc[train_idx]
    print(f'train shape: {train_X.shape}')
    assert train_X.shape[0] == train_y.shape[0], 'train features/target row mismatch'

    test_X, test_y = X.iloc[test_idx], y.iloc[test_idx]
    print(f'test shape: {test_X.shape}')
    assert test_X.shape[0] == test_y.shape[0], 'test features/target row mismatch'

    # Both parts must expose the same number of feature columns.
    assert train_X.shape[-1:] == test_X.shape[-1:], 'train/test feature count mismatch'

    print('Start model train')
    # fit() re-estimates the coefficients from scratch on every call, so
    # reusing the same estimator instance across folds is safe.
    model.fit(train_X, train_y)

    print('Start pred')
    train_pred = model.predict(train_X)
    test_pred = model.predict(test_X)

    # RMSE is computed as sqrt(MSE) explicitly: the `squared=False` keyword of
    # mean_squared_error was deprecated in scikit-learn 1.4 and removed in 1.6,
    # whereas this form works on every version.
    cv_loss['train_rmse'].append(mean_squared_error(train_y, train_pred) ** 0.5)
    cv_loss['train_r2'].append(r2_score(train_y, train_pred))
    cv_loss['test_rmse'].append(mean_squared_error(test_y, test_pred) ** 0.5)
    cv_loss['test_r2'].append(r2_score(test_y, test_pred))

Start 0 fold
train shape: (1070, 12)
test shape: (268, 12)
Start model train
Start pred
Start 1 fold
train shape: (1070, 12)
test shape: (268, 12)
Start model train
Start pred
Start 2 fold
train shape: (1070, 12)
test shape: (268, 12)
Start model train
Start pred
Start 3 fold
train shape: (1071, 12)
test shape: (267, 12)
Start model train
Start pred
Start 4 fold
train shape: (1071, 12)
test shape: (267, 12)
Start model train
Start pred


In [49]:
# Show the collected per-fold metrics (one list entry per fold for each key).
cv_loss

defaultdict(list,
            {'train_rmse': [6087.942792141382,
              6021.17512380288,
              6098.246513576797,
              5928.476910761713,
              5978.578164109053],
             'train_r2': [0.7432126559011109,
              0.7540962033047006,
              0.7382119209084921,
              0.7729361046256888,
              0.7518260924134872],
             'test_rmse': [5810.028373627177,
              6091.23862858396,
              5769.820788688061,
              6467.46110542072,
              6254.489814513056],
             'test_r2': [0.7825655038982294,
              0.7403997218318874,
              0.7965683351700736,
              0.6300612393231182,
              0.7502129923784764]})