In [1]:
# Random Forest Regression #2

import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

In [2]:
# Import dataset
dataset = pd.read_csv('health_insurance_cost.csv')
dataset.describe()
X = dataset.iloc[:, :-1].values
y = dataset.iloc[:, 6].values


In [3]:
# Preprocessing: 
# Handle Categorical variables using OneHotEncoder
# Preprocess Gender column
from sklearn.compose import ColumnTransformer, make_column_transformer
from sklearn.preprocessing import Normalizer, OneHotEncoder, StandardScaler
colT = ColumnTransformer(remainder='drop', transformers=
    [("dummy_gender", OneHotEncoder(categories='auto'), [1])])
genders = colT.fit_transform(X)
# Avoid Dummy variable trap
genders = genders[:, 1:]


In [4]:
# Preprocess Smoker column
colT = ColumnTransformer(remainder='drop', transformers=
    [("dummy_smoker", OneHotEncoder(categories='auto'), [4])])
smokers = colT.fit_transform(X)
# Avoid Dummy variable trap
smokers = smokers[:, 1:]


In [5]:
# Preprocess Region column
colT = ColumnTransformer(remainder='drop', transformers=
    [("dummy_region", OneHotEncoder(categories='auto'), [5])])
regions = colT.fit_transform(X)
# Avoid Dummy variable trap
regions = regions[:, 1:]


In [8]:
# Remove the original categorial columns
X = np.delete(X, [1,4,5], axis=1)


  


In [10]:

# Split dataset into Training and Test set
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 0)


In [11]:
# Feature Scaling
# Alternative: ColumnTransformer
from sklearn.preprocessing import StandardScaler
sc_X = StandardScaler()
sc_y = StandardScaler()
X_train = sc_X.fit_transform(X_train)
y_train = sc_y.fit_transform(np.array(y_train).reshape(-1, 1))
X_test = sc_X.fit_transform(X_test)
y_test_org = y_test
y_test = sc_y.fit_transform(np.array(y_test).reshape(-1, 1))




In [12]:
# Random Forest Regression for training set ( 10 estimators )
from sklearn.ensemble import RandomForestRegressor
regressor = RandomForestRegressor(n_estimators = 10, random_state = 0)
regressor.fit(X_train, y_train.ravel())

RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
           max_features='auto', max_leaf_nodes=None,
           min_impurity_decrease=0.0, min_impurity_split=None,
           min_samples_leaf=1, min_samples_split=2,
           min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=None,
           oob_score=False, random_state=0, verbose=0, warm_start=False)

In [13]:
# Predict Test set using Random Forest regressor
y_pred = regressor.predict(X_test)


In [14]:
# Print unscaled test and predicted values
y_pred_inv = sc_y.inverse_transform(y_pred)
print(pd.DataFrame(np.column_stack((y_test_org, y_pred_inv))).head(10))


             0             1
0   9724.53000  15770.969632
1   8547.69130  13177.565798
2  45702.02235  11312.555662
3  12950.07120  17579.501133
4   9644.25250  12273.025808
5   4500.33925  21944.482935
6   2198.18985   6626.955528
7  11436.73815  10829.611933
8   7537.16390   8702.964826
9   5425.02335  12371.289695


In [16]:
# Metrics
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error, explained_variance_score
print("Mean absolute error: %.2f" % mean_absolute_error(y_test, y_pred))
print("Mean squared error: %.2f" % mean_squared_error(y_test, y_pred))
print("Root Mean squared error: %.2f" % np.sqrt(mean_squared_error(y_test, y_pred)))
print('Variance score: %.2f' % explained_variance_score(y_test, y_pred))
# Coefficient of determination
print('R^2 Square value', r2_score(y_test, y_pred))


Mean absolute error: 0.77
Mean squared error: 1.11
Root Mean squared error: 1.05
Variance score: -0.10
R^2 Square value -0.1059124921087855
