# Regression wrap-up

Models

- Linear Regression
- Random Forest
- SVM (not yet implemented)
- FFNN

Data Preprocessing for Categorical Data

- Replace with 0, 1, 2
- Replace with 1, 2, 3
- Dummy Coding (ref. https://stats.idre.ucla.edu/spss/faq/coding-systems-for-categorical-variables-in-regression-analysis-2/)

In [1]:
# import tools

import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error as mae
from sklearn.metrics import mean_squared_error as mse
from sklearn.metrics import r2_score
def rmse(y_true, y_pred):
    """Return the root mean squared error between ``y_true`` and ``y_pred``.

    Computed as the square root of ``mse(y_true, y_pred)``.
    """
    # A named function (instead of a lambda bound to a name) gets a proper
    # __name__ in tracebacks and can carry this docstring.
    return np.sqrt(mse(y_true, y_pred))

In [2]:
# import models

from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor

from tensorflow.keras import Sequential
from tensorflow.keras.layers import Dense, Flatten

In [15]:
# define functions

def model_evaluate(y_test, y_pred):
    """Print and return the MAE, MSE and RMSE of a set of predictions.

    Parameters
    ----------
    y_test
        Ground-truth target values.
    y_pred
        Predicted target values, passed to each metric alongside ``y_test``.

    Returns
    -------
    list
        ``[mae, mse, rmse]`` in that order.
    """
    # Labels are padded so the printed values line up in a column.
    labels = (
        'Mean Absolute Error:    ',
        'Mean Squared Error:     ',
        'Root Mean Squared Error:',
    )
    metric_fns = (mae, mse, rmse)

    eval_res = [metric(y_test, y_pred) for metric in metric_fns]

    for label, value in zip(labels, eval_res):
        print(label, value)

    return eval_res

def model_fit_evaluate_ml(X_train, y_train, X_test, y_test, model):
    """Fit ``model`` on the training split and evaluate it on the test split.

    Parameters
    ----------
    X_train, y_train
        Features and targets passed to ``model.fit``.
    X_test, y_test
        Features passed to ``model.predict`` and the matching true targets.
    model
        Any object exposing ``fit(X, y)`` and ``predict(X)``. If it has a
        ``name`` attribute, that is printed as the report header; otherwise
        the class name is used.

    Returns
    -------
    tuple
        ``(model, eval_res)`` where ``model`` is the same (now fitted) object
        and ``eval_res`` is whatever ``model_evaluate`` returns.
    """
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)

    # Estimators do not necessarily carry a `name` attribute (the notebook
    # attaches one by hand), so fall back to the class name instead of
    # raising AttributeError after the model has already been fitted.
    print(getattr(model, 'name', type(model).__name__))
    eval_res = model_evaluate(y_test, y_pred)
    print()

    return model, eval_res

In [4]:
# Preprocessing 01: encode the categorical 'Sex' column as integer codes 0, 1, 2
column_names = ['Sex', 'Length', 'Diameter', 'Height', 'Whole weight',
                'Shucked weight', 'Viscera weight', 'Shell weight', 'Rings']
sex_codes = {'I': 0, 'M': 1, 'F': 2}

df = pd.read_csv('reg_data/abalone.data', names=column_names)

# Fail fast on categories the mapping does not cover; `.map` would otherwise
# turn them into NaN silently.
unknown_sex = set(df['Sex'].unique()) - set(sex_codes)
assert not unknown_sex, f'unexpected Sex categories: {unknown_sex}'

# An explicit dict map produces an integer column directly, instead of relying
# on the implicit object -> int downcast of Series.replace (deprecated in
# pandas 2.2).
df['Sex'] = df['Sex'].map(sex_codes)

# Every column except the last is a feature; the last column ('Rings') is the target.
X = df.iloc[:, :-1]
y = df.iloc[:, -1]

# Fixed random_state keeps the 80/20 split reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

X_train.shape, X_test.shape, y_train.shape, y_test.shape

((3341, 8), (836, 8), (3341,), (836,))

In [22]:
# Models: build each estimator and tag it with a display label.
# The label lives on an ad-hoc `name` attribute, which
# model_fit_evaluate_ml prints as the header of each report.

linreg = LinearRegression()
linreg.name = 'Linear Regression'

rfreg = RandomForestRegressor(random_state=0, n_estimators=50)
rfreg.name = 'Random Forest'

In [23]:
# Fit each model on the training split and report its test-set metrics.
linreg, linreg_res_01 = model_fit_evaluate_ml(
    X_train, y_train, X_test, y_test, model=linreg,
)
rfreg, rfreg_res_01 = model_fit_evaluate_ml(
    X_train, y_train, X_test, y_test, model=rfreg,
)

Linear Regression
Mean Absolute Error:     1.5948681995252754
Mean Squared Error:      5.036852349442371
Root Mean Squared Error: 2.2442932850771466

Random Forest
Mean Absolute Error:     1.5492822966507176
Mean Squared Error:      4.682775119617225
Root Mean Squared Error: 2.1639720699716123

