# Regression - Random Forest

In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

In [2]:
# Column labels supplied via `names=`, listed in the order they are assigned.
abalone_columns = [
    'Sex',
    'Length',
    'Diameter',
    'Height',
    'Whole weight',
    'Shucked weight',
    'Viscera weight',
    'Shell weight',
    'Rings',
]
df = pd.read_csv('reg_data/abalone.data', names=abalone_columns)

In [3]:
from sklearn.model_selection import train_test_split

In [4]:
# Swap the Sex letters for integer codes: M -> 1, F -> 2, I -> 3.
sex_codes = {'I': 3, 'M': 1, 'F': 2}
df['Sex'] = df['Sex'].replace(sex_codes)

# Every column except the last is a feature; the last column (Rings) is the target.
features, target = df.iloc[:, :-1], df.iloc[:, -1]
X = features.to_numpy()
y = target.to_numpy()

# Hold out 20% of the rows for testing; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=7
)

In [5]:
# Preview the first five rows now that Sex holds integer codes.
df.head(n=5)

Unnamed: 0,Sex,Length,Diameter,Height,Whole weight,Shucked weight,Viscera weight,Shell weight,Rings
0,1,0.455,0.365,0.095,0.514,0.2245,0.101,0.15,15
1,1,0.35,0.265,0.09,0.2255,0.0995,0.0485,0.07,7
2,2,0.53,0.42,0.135,0.677,0.2565,0.1415,0.21,9
3,1,0.44,0.365,0.125,0.516,0.2155,0.114,0.155,10
4,3,0.33,0.255,0.08,0.205,0.0895,0.0395,0.055,7


## Build Model

In [6]:
from sklearn.ensemble import RandomForestRegressor

In [7]:
# Random forest with unrestricted tree depth; the seed fixes the forest's randomness.
rfreg = RandomForestRegressor(
    random_state=0,
    max_depth=None,
)

In [8]:
# Train on the training split; the fitted estimator is the cell's displayed value.
rfreg.fit(X=X_train, y=y_train)

RandomForestRegressor(bootstrap=True, ccp_alpha=0.0, criterion='mse',
                      max_depth=None, max_features='auto', max_leaf_nodes=None,
                      max_samples=None, min_impurity_decrease=0.0,
                      min_impurity_split=None, min_samples_leaf=1,
                      min_samples_split=2, min_weight_fraction_leaf=0.0,
                      n_estimators=100, n_jobs=None, oob_score=False,
                      random_state=0, verbose=0, warm_start=False)

In [9]:
# Predict the target for the held-out test rows.
y_pred = rfreg.predict(X=X_test)

## Evaluate

In [10]:
from sklearn.metrics import mean_squared_error as mse
from sklearn.metrics import r2_score as r2

In [11]:
# Compute each metric once, then report; RMSE is the square root of the MSE.
mse_value = mse(y_test, y_pred)
rmse_value = np.sqrt(mse_value)
r2_value = r2(y_test, y_pred)

print('Score')
print('MSE :      ', mse_value)
print('RMSE :     ', rmse_value)
print('r2_score : ', r2_value)

Score
MSE :       5.029919019138756
RMSE :      2.2427480953372263
r2_score :  0.5517668075734965


## Model 2: Dummy (One-Hot) Columns for Sex

In [12]:
from sklearn.model_selection import train_test_split

In [13]:
# One-hot encode Sex. dtype=int keeps the indicators as 0/1 integers; newer
# pandas versions default to booleans, which would turn X into an object array.
dummy = pd.get_dummies(df['Sex'], dtype=int)

# Remove indicator columns left behind by a previous run of this cell so that
# re-running it does not append a second copy of them.
base = df.drop(columns=[col for col in dummy.columns if col in df.columns])

# Layout: Sex, indicator columns, remaining features, target (last column).
df = pd.concat([base.iloc[:, 0], dummy, base.iloc[:, 1:]], axis=1)

# Skip the raw Sex column (position 0) and the target (last column), so the
# model sees the indicator columns instead of the integer code.
X = df.iloc[:, 1:-1].values
y = df.iloc[:, -1].values

# Same split parameters as the first model so the two scores are comparable.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=7)

In [14]:
# Preview the first five rows with the Sex indicator columns inserted.
df.head(n=5)

Unnamed: 0,Sex,1,2,3,Length,Diameter,Height,Whole weight,Shucked weight,Viscera weight,Shell weight,Rings
0,1,1,0,0,0.455,0.365,0.095,0.514,0.2245,0.101,0.15,15
1,1,1,0,0,0.35,0.265,0.09,0.2255,0.0995,0.0485,0.07,7
2,2,0,1,0,0.53,0.42,0.135,0.677,0.2565,0.1415,0.21,9
3,1,1,0,0,0.44,0.365,0.125,0.516,0.2155,0.114,0.155,10
4,3,0,0,1,0.33,0.255,0.08,0.205,0.0895,0.0395,0.055,7


In [15]:
from sklearn.ensemble import RandomForestRegressor

In [16]:
# Fresh forest with the same settings as before; this rebinds `rfreg`.
rfreg = RandomForestRegressor(
    random_state=0,
    max_depth=None,
)

In [17]:
# Train on the current training split; the fitted estimator is the cell's displayed value.
rfreg.fit(X=X_train, y=y_train)

RandomForestRegressor(bootstrap=True, ccp_alpha=0.0, criterion='mse',
                      max_depth=None, max_features='auto', max_leaf_nodes=None,
                      max_samples=None, min_impurity_decrease=0.0,
                      min_impurity_split=None, min_samples_leaf=1,
                      min_samples_split=2, min_weight_fraction_leaf=0.0,
                      n_estimators=100, n_jobs=None, oob_score=False,
                      random_state=0, verbose=0, warm_start=False)

In [18]:
# Predict the target for the current held-out test rows.
y_pred = rfreg.predict(X=X_test)

## Evaluate

In [19]:
# Compute each metric once, then report; RMSE is the square root of the MSE.
mse_value = mse(y_test, y_pred)
rmse_value = np.sqrt(mse_value)
r2_value = r2(y_test, y_pred)

print('Score')
print('MSE :      ', mse_value)
print('RMSE :     ', rmse_value)
print('r2_score : ', r2_value)

Score
MSE :       5.044964952153111
RMSE :      2.2460999426011994
r2_score :  0.5504260133057561
