In [26]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

In [27]:
df = pd.read_csv('insurance.csv')

In [28]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1338 entries, 0 to 1337
Data columns (total 7 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   age       1338 non-null   int64  
 1   sex       1338 non-null   object 
 2   bmi       1338 non-null   float64
 3   children  1338 non-null   int64  
 4   smoker    1338 non-null   object 
 5   region    1338 non-null   object 
 6   charges   1338 non-null   float64
dtypes: float64(2), int64(2), object(3)
memory usage: 73.3+ KB


In [29]:
df.describe()

Unnamed: 0,age,bmi,children,charges
count,1338.0,1338.0,1338.0,1338.0
mean,39.207025,30.663397,1.094918,13270.422265
std,14.04996,6.098187,1.205493,12110.011237
min,18.0,15.96,0.0,1121.8739
25%,27.0,26.29625,0.0,4740.28715
50%,39.0,30.4,1.0,9382.033
75%,51.0,34.69375,2.0,16639.912515
max,64.0,53.13,5.0,63770.42801


In [30]:
df.head()

Unnamed: 0,age,sex,bmi,children,smoker,region,charges
0,19,female,27.9,0,yes,southwest,16884.924
1,18,male,33.77,1,no,southeast,1725.5523
2,28,male,33.0,3,no,southeast,4449.462
3,33,male,22.705,0,no,northwest,21984.47061
4,32,male,28.88,0,no,northwest,3866.8552


# Encoding and feature engineering

In [31]:
df['gender_int'] = df['sex'].map({'male':1, 'female':0})

In [32]:
df['smoker_int'] = df['smoker'].map({'yes':1, 'no':0})

In [33]:
df['charges_log'] = np.log(df['charges'])

# Deleting redundant columns

In [34]:
df.pop('region')

0       southwest
1       southeast
2       southeast
3       northwest
4       northwest
          ...    
1333    northwest
1334    northeast
1335    southeast
1336    southwest
1337    northwest
Name: region, Length: 1338, dtype: object

In [35]:
df.pop('smoker')

0       yes
1        no
2        no
3        no
4        no
       ... 
1333     no
1334     no
1335     no
1336     no
1337    yes
Name: smoker, Length: 1338, dtype: object

In [36]:
df.pop('sex')

0       female
1         male
2         male
3         male
4         male
         ...  
1333      male
1334    female
1335    female
1336    female
1337    female
Name: sex, Length: 1338, dtype: object

In [37]:
df.describe()

Unnamed: 0,age,bmi,children,charges,gender_int,smoker_int,charges_log
count,1338.0,1338.0,1338.0,1338.0,1338.0,1338.0,1338.0
mean,39.207025,30.663397,1.094918,13270.422265,0.505232,0.204783,9.098659
std,14.04996,6.098187,1.205493,12110.011237,0.50016,0.403694,0.919527
min,18.0,15.96,0.0,1121.8739,0.0,0.0,7.022756
25%,27.0,26.29625,0.0,4740.28715,0.0,0.0,8.463853
50%,39.0,30.4,1.0,9382.033,1.0,0.0,9.146552
75%,51.0,34.69375,2.0,16639.912515,1.0,0.0,9.719558
max,64.0,53.13,5.0,63770.42801,1.0,1.0,11.063045


# Model Creation

In [38]:
from sklearn.model_selection import train_test_split


In [39]:
X = df.drop('charges', axis=1)
y = df['charges']

In [40]:
X_train, X_temp, y_train, y_temp = train_test_split(X, y,test_size=0.2, random_state=500)

In [41]:
X_val, X_test, y_val, y_test = train_test_split(X, y,test_size=0.5, random_state=42)

In [42]:
from sklearn.svm import SVR

In [43]:
model = SVR(kernel='linear', C=1000, epsilon=1)

In [44]:
model.fit(X_train, y_train)

In [45]:
predictions = model.predict(X_val)

In [46]:
from sklearn import metrics

In [47]:
from sklearn.metrics import mean_squared_error

In [48]:
print('MAE:', metrics.mean_absolute_error(y_val, predictions))
print('MSE:', metrics.mean_squared_error(y_val, predictions))
print('RMSE:', np.sqrt(metrics.mean_squared_error(y_val, predictions)))

MAE: 2567.17260805913
MSE: 18811761.32072543
RMSE: 4337.252738857333


In [49]:
tpred = model.predict(X_test)

In [50]:
print('MAE:', metrics.mean_absolute_error(y_test, tpred))
print('MSE:', metrics.mean_squared_error(y_test, tpred))
print('RMSE:', np.sqrt(metrics.mean_squared_error(y_test, tpred)))

MAE: 2701.472341656861
MSE: 19414059.47689354
RMSE: 4406.138839947459


# Cross validation

In [51]:
from sklearn.model_selection import cross_val_score

In [52]:
c_values = [0.1, 1, 10 , 100, 1000]


In [53]:
for c in c_values:
    score = cross_val_score(model, X, y, cv=5)
    print(f'C={c} , Mean_score={score.mean()}, Std_dev={score.std()}')

C=0.1 , Mean_score=0.868899362328438, Std_dev=0.005314145028643707
C=1 , Mean_score=0.868899362328438, Std_dev=0.005314145028643707
C=10 , Mean_score=0.868899362328438, Std_dev=0.005314145028643707
C=100 , Mean_score=0.868899362328438, Std_dev=0.005314145028643707
C=1000 , Mean_score=0.868899362328438, Std_dev=0.005314145028643707


In [54]:
e_values = [0.1,0.2,0.3,0.4,0.5,100]

In [55]:
for e in e_values:
    score = cross_val_score(model, X_train, y_train, cv=5)
    print(f'Epsilon={e} , Mean_score={score.mean()}, Std_dev={score.std()}')

Epsilon=0.1 , Mean_score=0.861624069454401, Std_dev=0.010409669570203428
Epsilon=0.2 , Mean_score=0.861624069454401, Std_dev=0.010409669570203428
Epsilon=0.3 , Mean_score=0.861624069454401, Std_dev=0.010409669570203428
Epsilon=0.4 , Mean_score=0.861624069454401, Std_dev=0.010409669570203428
Epsilon=0.5 , Mean_score=0.861624069454401, Std_dev=0.010409669570203428
Epsilon=100 , Mean_score=0.861624069454401, Std_dev=0.010409669570203428


# Cross-validation for RMSE

In [56]:
from sklearn.metrics import make_scorer, mean_squared_error

In [57]:
from sklearn.model_selection import cross_val_score,KFold

In [58]:
def rmse_score(y_true, y_pred):
    return np.sqrt(mean_squared_error(y_true, y_pred))

In [59]:
rmse_scorer = make_scorer(rmse_score, greater_is_better=False)

In [60]:
kf = KFold(n_splits=15, shuffle=True, random_state=42)

In [61]:
cv_scores = cross_val_score(model, X, y, cv=kf, scoring=rmse_scorer)

In [62]:
print(f"Cross-validated RMSE scores: {cv_scores}")
print(f"Mean RMSE: {-np.mean(cv_scores)}")

Cross-validated RMSE scores: [-3978.42317105 -4176.2400089  -4503.18714564 -4192.02927002
 -4095.45435654 -3947.70193482 -5136.16944959 -4678.87978779
 -4286.49315272 -4539.00269287 -4621.55540322 -3639.88432812
 -4968.65830956 -4598.27313884 -4237.56747603]
Mean RMSE: 4373.301308380353
