In [4]:
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.preprocessing import LabelEncoder
from sklearn.tree import DecisionTreeRegressor
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import SMOTE

from sklearn.metrics import confusion_matrix, classification_report

In [3]:
data = pd.read_csv('insurance.csv')
data.head()

Unnamed: 0,age,sex,bmi,children,smoker,region,charges
0,19,female,27.9,0,yes,southwest,16884.924
1,18,male,33.77,1,no,southeast,1725.5523
2,28,male,33.0,3,no,southeast,4449.462
3,33,male,22.705,0,no,northwest,21984.47061
4,32,male,28.88,0,no,northwest,3866.8552


In [5]:
data = data.drop_duplicates()
data.head()

Unnamed: 0,age,sex,bmi,children,smoker,region,charges
0,19,female,27.9,0,yes,southwest,16884.924
1,18,male,33.77,1,no,southeast,1725.5523
2,28,male,33.0,3,no,southeast,4449.462
3,33,male,22.705,0,no,northwest,21984.47061
4,32,male,28.88,0,no,northwest,3866.8552


In [6]:
data.tail(10)

Unnamed: 0,age,sex,bmi,children,smoker,region,charges
1328,23,female,24.225,2,no,northeast,22395.74424
1329,52,male,38.6,2,no,southwest,10325.206
1330,57,female,25.74,2,no,southeast,12629.1656
1331,23,female,33.4,0,no,southwest,10795.93733
1332,52,female,44.7,3,no,southwest,11411.685
1333,50,male,30.97,3,no,northwest,10600.5483
1334,18,female,31.92,0,no,northeast,2205.9808
1335,18,female,36.85,0,no,southeast,1629.8335
1336,21,female,25.8,0,no,southwest,2007.945
1337,61,female,29.07,0,yes,northwest,29141.3603


In [8]:
unique_names = data['region'].unique()
unique_names

array(['southwest', 'southeast', 'northwest', 'northeast'], dtype=object)

In [7]:

counts = data['region'].value_counts()
print(counts)


region
southeast    364
southwest    325
northwest    324
northeast    324
Name: count, dtype: int64


In [9]:
Label_encoder = LabelEncoder()
data['region'] = Label_encoder.fit_transform(data['region'])


In [11]:
Label_encoder_smoker = LabelEncoder()
data['smoker'] = Label_encoder_smoker.fit_transform(data['smoker'])

In [12]:
Label_encoder_sex = LabelEncoder()
data['sex'] = Label_encoder_sex.fit_transform(data['sex'])

In [14]:
data.head()

Unnamed: 0,age,sex,bmi,children,smoker,region,charges
0,19,0,27.9,0,1,3,16884.924
1,18,1,33.77,1,0,2,1725.5523
2,28,1,33.0,3,0,2,4449.462
3,33,1,22.705,0,0,1,21984.47061
4,32,1,28.88,0,0,1,3866.8552


In [25]:
x = data.drop('charges', axis=1)  
y = data['charges']              

print(x.shape)
print(y.shape)


(1337, 6)
(1337,)


In [26]:
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=42)

print(x_train.shape, x_test.shape)


(1069, 6) (268, 6)


In [27]:
model1 = DecisionTreeRegressor()
model1.fit(x_train, y_train)

In [28]:
print('Train score: ', model1.score(x_train, y_train))
print('Test score: ', model1.score(x_test, y_test))

Train score:  1.0
Test score:  0.7724343156709601


In [29]:
model1.get_depth()

18

In [30]:
model2 = DecisionTreeRegressor(max_depth=14)
model2.fit(x_train, y_train)
print('Train score: ', model2.score(x_train, y_train))
print('Test score: ', model2.score(x_test, y_test))

Train score:  0.9999740253294174
Test score:  0.7697807132436185


In [31]:
model3 = DecisionTreeRegressor(max_depth=10)
model3.fit(x_train, y_train)
print('Train score: ', model3.score(x_train, y_train))
print('Test score: ', model3.score(x_test, y_test))

Train score:  0.9815309738996293
Test score:  0.788870525326677


In [35]:
model4 = DecisionTreeRegressor(max_depth=6)
model4.fit(x_train, y_train)
print('Train score: ', model4.score(x_train, y_train))
print('Test score: ', model4.score(x_test, y_test))

Train score:  0.8869750357338171
Test score:  0.8690314340877405


In [36]:
# Predict on the test data
y_pred = model4.predict(x_test)
y_pred

array([ 9153.47064286,  3295.67270966, 11487.51314   , 46411.47284833,
        6073.21065164,  9457.16117758, 36110.266502  ,  2117.71088684,
        8577.90654523, 11487.51314   , 13952.62821895, 24456.62655333,
       46411.47284833, 13952.62821895,  8373.46941609,  8577.90654523,
        5109.97783217, 40414.74781154,  3295.67270966,  3295.67270966,
        2972.73851958, 22176.03309222,  9457.16117758, 19213.67534435,
       36678.52411429,  8373.46941609, 40414.74781154, 46033.3923    ,
       11487.51314   , 11487.51314   ,  3593.33582458,  9457.16117758,
        3295.67270966, 17346.68741926, 46598.74435667,  9457.16117758,
        3295.67270966,  8373.46941609, 24052.5176575 ,  8577.90654523,
        2972.73851958, 20277.64471   , 40414.74781154, 11487.51314   ,
        8577.90654523,  3593.33582458,  2972.73851958,  8577.90654523,
        5957.73170448,  8037.70220423,  5109.97783217,  6074.81395079,
       20277.64471   ,  2972.73851958,  8037.70220423,  8577.90654523,
      

In [37]:
pd.DataFrame({"True values":y_test, "Predicted values": y_pred})

Unnamed: 0,True values,Predicted values
900,8688.85885,9153.470643
1064,5708.86700,3295.672710
1256,11436.73815,11487.513140
298,38746.35510,46411.472848
237,4463.20510,6073.210652
...,...,...
534,13831.11520,13952.628219
542,13887.20400,13952.628219
760,3925.75820,8373.469416
1284,47403.88000,48814.069500
