#### A simple tutorial on how to use GridSearchCV with RandomForestRegressor.

In [1]:
import os

import pandas as pd
import seaborn as sns
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import r2_score
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

In [2]:
# Location of the insurance dataset. Set the INSURANCE_CSV environment variable
# to point at your own copy of insurance.csv; when it is unset, the original
# author's local path is used so existing runs keep working.
DATA_PATH = os.environ.get(
    'INSURANCE_CSV',
    'C://Users//mdbab//OneDrive//Desktop//archive//insurance.csv',
)
data = pd.read_csv(DATA_PATH)
data.head()

Unnamed: 0,age,sex,bmi,children,smoker,region,charges
0,19,female,27.9,0,yes,southwest,16884.924
1,18,male,33.77,1,no,southeast,1725.5523
2,28,male,33.0,3,no,southeast,4449.462
3,33,male,22.705,0,no,northwest,21984.47061
4,32,male,28.88,0,no,northwest,3866.8552


In [3]:
# Dimensions of the freshly loaded frame as (rows, columns).
data.shape

(1338, 7)

In [4]:
# Column dtypes and non-null counts; the object-typed columns are the ones
# that get integer-encoded further down.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1338 entries, 0 to 1337
Data columns (total 7 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   age       1338 non-null   int64  
 1   sex       1338 non-null   object 
 2   bmi       1338 non-null   float64
 3   children  1338 non-null   int64  
 4   smoker    1338 non-null   object 
 5   region    1338 non-null   object 
 6   charges   1338 non-null   float64
dtypes: float64(2), int64(2), object(3)
memory usage: 73.3+ KB


In [5]:
# Count missing values per column before any modelling.
data.isna().sum()

age         0
sex         0
bmi         0
children    0
smoker      0
region      0
charges     0
dtype: int64

In [6]:
# Number of rows that are exact copies of an earlier row.
data.duplicated().sum()

1

In [7]:
# Discard repeated rows, keeping the first occurrence of each.
data = data.drop_duplicates(keep='first')

In [8]:
# One LabelEncoder instance is reused for every categorical column below.
# Each fit_transform call re-fits it, so afterwards it only holds the mapping
# of the last column it was fitted on.
encoder = LabelEncoder()

In [9]:
# Replace the textual `sex` column with integer codes
# (the preview shows female -> 0, male -> 1).
data['sex'] = labels = encoder.fit_transform(data['sex'])
data.head()

Unnamed: 0,age,sex,bmi,children,smoker,region,charges
0,19,0,27.9,0,yes,southwest,16884.924
1,18,1,33.77,1,no,southeast,1725.5523
2,28,1,33.0,3,no,southeast,4449.462
3,33,1,22.705,0,no,northwest,21984.47061
4,32,1,28.88,0,no,northwest,3866.8552


In [10]:
# Replace the textual `region` column with integer codes
# (the preview shows northwest -> 1, southeast -> 2, southwest -> 3).
data['region'] = labels = encoder.fit_transform(data['region'])
data.head()

Unnamed: 0,age,sex,bmi,children,smoker,region,charges
0,19,0,27.9,0,yes,3,16884.924
1,18,1,33.77,1,no,2,1725.5523
2,28,1,33.0,3,no,2,4449.462
3,33,1,22.705,0,no,1,21984.47061
4,32,1,28.88,0,no,1,3866.8552


In [11]:
# Replace the textual `smoker` column with integer codes
# (the preview shows no -> 0, yes -> 1).
data['smoker'] = labels = encoder.fit_transform(data['smoker'])
data.head()

Unnamed: 0,age,sex,bmi,children,smoker,region,charges
0,19,0,27.9,0,1,3,16884.924
1,18,1,33.77,1,0,2,1725.5523
2,28,1,33.0,3,0,2,4449.462
3,33,1,22.705,0,0,1,21984.47061
4,32,1,28.88,0,0,1,3866.8552


In [12]:
# Separate the regression target (`charges`) from the feature columns.
Y = data['charges']
X = data.drop(columns=['charges'])

In [13]:
# Hold out 20% of the rows for testing; the fixed seed keeps the split repeatable.
X_train, X_test, Y_train, Y_test = train_test_split(
    X,
    Y,
    test_size=0.2,
    random_state=101,
)

In [14]:
# Shapes of the full, training, and test feature matrices.
print(*(features.shape for features in (X, X_train, X_test)))

(1337, 6) (1069, 6) (268, 6)


In [15]:
# Shapes of the full, training, and test target vectors.
print(*(target.shape for target in (Y, Y_train, Y_test)))

(1337,) (1069,) (268,)


In [16]:
# Base estimator handed to the grid search. The forest is seeded so that its
# bootstrap sampling and feature sub-sampling repeat across runs; without a
# seed the selected hyper-parameters and reported scores vary between
# executions. 101 is the same seed used for train_test_split.
rf = RandomForestRegressor(random_state=101)

In [17]:
# Hyper-parameter values to try; every combination is evaluated
# (3 * 4 * 3 * 3 = 108 candidates).
param_grid = {
    'n_estimators': [100, 200, 300],       # number of trees
    'max_depth': [None, 10, 20, 30],       # None leaves tree depth unlimited
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
}

In [18]:
# Exhaustive search over `param_grid`, ranking candidates by 5-fold
# cross-validated R^2.
grid_search = GridSearchCV(
    estimator=rf,
    param_grid=param_grid,
    scoring='r2',
    cv=5,
)

In [20]:
# Run the search on the training split only. With 108 parameter combinations
# and cv=5 this trains 540 forests, so expect this cell to be slow.
grid_search.fit(X_train, Y_train)

In [21]:
# Report the winning parameter combination and its score. The score is the
# cross-validated R^2 (scoring='r2', cv=5) measured on the training split,
# not a test-set score.
print(f"Best parameters: {grid_search.best_params_}")
print(f"Best score: {grid_search.best_score_}")

Best parameters: {'max_depth': 20, 'min_samples_leaf': 4, 'min_samples_split': 10, 'n_estimators': 100}
Best score: 0.8574446076948842


In [22]:
# Take the best estimator found by the search and predict on both splits.
best_rf = grid_search.best_estimator_
y_train_pred, y_test_pred = (
    best_rf.predict(features) for features in (X_train, X_test)
)

In [23]:
from sklearn.metrics import r2_score

In [27]:
# R^2 (coefficient of determination) on the training and held-out data.
# The variables are named "accuracy", but the metric is r2_score.
Train_accuracy = r2_score(y_true=Y_train, y_pred=y_train_pred)
Test_accuracy = r2_score(y_true=Y_test, y_pred=y_test_pred)

In [28]:
# The values printed here are R^2 scores from r2_score, despite the
# "ACCURACY" label. A train score well above the test score points to
# some overfitting.
print("TRAIN ACCURACY : ", Train_accuracy)
print("TEST ACCURACY : ", Test_accuracy)

TRAIN ACCURACY :  0.9145653547873749
TEST ACCURACY :  0.8534943309292949
