## L1 Loss function

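L1 Loss function은 실제 값과 예측 값의 차이의 절대값을 최소화합니다. 각 관측치별 실제값과 예측값의 차이의 절대값을 모두 더한 값이 L1 Loss function의 결과값입니다. 여기서 $y_i$는 실제 값, $\hat{y}_i$는 예측 값입니다.

$$L_1 = \sum_{i=1}^{n} \left| y_i - \hat{y}_i \right|$$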

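## L2 Loss function
L2 Loss function은 실제 값과 예측 값의 차이의 제곱을 최소화합니다. 각 관측치별 실제값($y_i$)과 예측값($\hat{y}_i$)의 차이를 제곱하여 모두 더한 값이 L2 Loss function의 결과값입니다.

$$L_2 = \sum_{i=1}^{n} \left( y_i - \hat{y}_i \right)^2$$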

위의 정의에서 알 수 있듯이 L2는 오차를 제곱하기 때문에, 극단치(outlier)처럼 오차가 큰 관측치일수록 L2의 결과값이 L1 대비 훨씬 크게 나타납니다. 따라서 L2는 L1보다 outlier에 더 크게 반응하고, 이를 최소화하려는 과정에서 모델이 outlier의 영향을 더 많이 받게 됩니다.

과연 그러한지 샘플 데이터를 통해 확인해보겠습니다. 
<br />*샘플 데이터는 Kaggle의 Boston Housing Dataset을 활용하였습니다.

In [2]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import GradientBoostingRegressor
from statsmodels.tools.eval_measures import rmse
import matplotlib.pylab as plt

In [3]:
# Load the housing dataset (originally obtained from Kaggle) from a CSV file
# located relative to the notebook's working directory
data = pd.read_csv('../sample_data/housingdata.csv')

In [4]:
# Show summary statistics (count, mean, std, min, quartiles, max) per column
# to get a first impression of the data
data.describe()

Unnamed: 0,crim,zn,indus,chas,nox,rm,age,dis,rad,tax,ptratio,b,lstat,medv
count,506.0,506.0,506.0,506.0,506.0,506.0,506.0,506.0,506.0,506.0,506.0,506.0,506.0,506.0
mean,3.613524,11.363636,11.136779,0.06917,0.554695,6.284634,68.574901,3.795043,9.549407,408.237154,18.455534,356.674032,12.653063,22.532806
std,8.601545,23.322453,6.860353,0.253994,0.115878,0.702617,28.148861,2.10571,8.707259,168.537116,2.164946,91.294864,7.141062,9.197104
min,0.00632,0.0,0.46,0.0,0.385,3.561,2.9,1.1296,1.0,187.0,12.6,0.32,1.73,5.0
25%,0.082045,0.0,5.19,0.0,0.449,5.8855,45.025,2.100175,4.0,279.0,17.4,375.3775,6.95,17.025
50%,0.25651,0.0,9.69,0.0,0.538,6.2085,77.5,3.20745,5.0,330.0,19.05,391.44,11.36,21.2
75%,3.677082,12.5,18.1,0.0,0.624,6.6235,94.075,5.188425,24.0,666.0,20.2,396.225,16.955,25.0
max,88.9762,100.0,27.74,1.0,0.871,8.78,100.0,12.1265,24.0,711.0,22.0,396.9,37.97,50.0


In [6]:
# Histogram of the target column 'medv' (our y): 50 bins over [0, 50],
# i.e. one bin per unit of 'medv'
fig = plt.figure(figsize=(13, 7))
plt.hist(data['medv'], range=(0, 50), bins=50)

(array([  0.,   0.,   0.,   0.,   0.,   3.,   1.,   7.,  10.,   3.,  10.,
          9.,   9.,  24.,  18.,  16.,  16.,  22.,  25.,  37.,  36.,  31.,
         35.,  37.,  25.,  11.,   8.,   9.,  10.,  10.,   7.,   8.,   7.,
          9.,   5.,   4.,   6.,   5.,   1.,   1.,   0.,   2.,   2.,   3.,
          2.,   1.,   2.,   0.,   3.,  16.]),
 array([  0.,   1.,   2.,   3.,   4.,   5.,   6.,   7.,   8.,   9.,  10.,
         11.,  12.,  13.,  14.,  15.,  16.,  17.,  18.,  19.,  20.,  21.,
         22.,  23.,  24.,  25.,  26.,  27.,  28.,  29.,  30.,  31.,  32.,
         33.,  34.,  35.,  36.,  37.,  38.,  39.,  40.,  41.,  42.,  43.,
         44.,  45.,  46.,  47.,  48.,  49.,  50.]),
 <a list of 50 Patch objects>)

In [30]:
# Feature matrix: every column except the target 'medv'
x = data.drop('medv', axis=1)

# Target vector (the dependent variable 'medv')
y = data['medv']

# Hold out 30% of the rows as a test set. The split is pinned to a fixed
# seed so that the RMSE values computed from it can be reproduced; with
# random_state=None every run of this cell drew a different split.
train_X, test_X, train_y, test_y = train_test_split(
    x, y, test_size=0.30, random_state=9876
)

In [31]:
# Fit a GradientBoostingRegressor that minimises the L1
# (Least Absolute Deviation) loss on the training data.
# NOTE(review): newer scikit-learn releases appear to name this loss
# 'absolute_error' instead of 'lad' -- confirm against the installed version.
np.random.seed(9876)  # seed NumPy's global RNG before fitting

mod = GradientBoostingRegressor(loss='lad')
fit = mod.fit(X=train_X, y=train_y)
predict = fit.predict(test_X)

# Root Mean Squared Error of the predictions on the held-out test set
rmse(predict, test_y)

4.8180905184817737

In [32]:
# Same model, but minimising the L2 (Least Squares) loss.
# NOTE(review): newer scikit-learn releases appear to name this loss
# 'squared_error' instead of 'ls' -- confirm against the installed version.
mod = GradientBoostingRegressor(loss='ls')
fit = mod.fit(X=train_X, y=train_y)
predict = fit.predict(test_X)

# Root Mean Squared Error of the predictions on the held-out test set
rmse(predict, test_y)

3.6939988405861159

### Outlier가 심하지 않은 데이터셋에서의 L1 loss function과 L2 loss function의 RMSE는 각각 4.8181, 3.6940으로 나타납니다. 
### 우리의 예상대로라면 outlier가 더 심한 데이터셋에서는 L2로 학습한 모델의 RMSE가 L1으로 학습한 모델보다 크게 나올 것입니다. 

In [33]:
# Next, build a distribution with more outliers. Start from the per-feature
# minimum and maximum (target 'medv' excluded), each kept as a one-row
# DataFrame taken from the summary statistics.
feature_summary = data.describe()
min_y = feature_summary.loc[['min'], :].drop('medv', axis=1)
max_y = feature_summary.loc[['max'], :].drop('medv', axis=1)

In [34]:
np.random.seed(1234)

# Draw 5 random values as a (5, 1) column, one per synthetic row
rands = np.random.rand(5, 1)

# 'min' and 'max' rows as (1, n_features) NumPy arrays
min_array = np.array(min_y)
max_array = np.array(max_y)

# Per-feature range (max - min)
_range = max_array - min_array

# Generate 5 samples: row i sits at the same relative position rands[i]
# inside every feature's [min, max] interval (broadcast over the columns)
outliers_X = (rands * _range) + min_array

# 'chas', 'rad' and 'tax' are treated as integer-valued features, so round
# them to whole numbers. Their positions are looked up by column name rather
# than hard-coded, and no integer cast is applied because the array keeps its
# float dtype anyway.
int_cols = [min_y.columns.get_loc(col) for col in ('chas', 'rad', 'tax')]
outliers_X[:, int_cols] = np.round(outliers_X[:, int_cols])
outliers_X

array([[  1.70457825e+01,   1.91519450e+01,   5.68465061e+00,
          0.00000000e+00,   4.78078453e-01,   4.56054001e+00,
          2.14965386e+01,   3.23572024e+00,   5.00000000e+00,
          2.87000000e+02,   1.44002828e+01,   7.62727836e+01,
          8.67066488e+00],
       [  5.53552627e+01,   6.22108771e+01,   1.74311273e+01,
          1.00000000e+00,   6.87344863e-01,   6.80778568e+00,
          6.33067617e+01,   7.97086794e+00,   1.50000000e+01,
          5.13000000e+02,   1.84478224e+01,   2.47035896e+02,
          2.42752219e+01],
       [  3.89509044e+01,   4.37727739e+01,   1.24012127e+01,
          0.00000000e+00,   5.97735681e-01,   5.84550107e+00,
          4.54033635e+01,   5.94324817e+00,   1.10000000e+01,
          4.16000000e+02,   1.67146407e+01,   1.73914067e+02,
          1.75932533e+01],
       [  6.98795789e+01,   7.85358584e+01,   2.18845822e+01,
          1.00000000e+00,   7.66684272e-01,   7.65978645e+00,
          7.91583185e+01,   9.76610981e+00,   1.900

In [35]:
# Hard-coded target ('medv') values for the five synthetic rows:
# two zeros followed by three very large values
medv_outliers = np.array((0, 0, 800, 700, 600))
medv_outliers

array([  0,   0, 800, 700, 600])

In [36]:
# Append the synthetic outlier rows to the training set. The existing
# containers are converted to NumPy arrays first, so both results are arrays.
# NOTE: train_X / train_y are rebound here, so re-running this cell appends
# the outliers again.
train_X = np.concatenate((np.asanyarray(train_X), outliers_X), axis=0)
train_y = np.concatenate((np.asanyarray(train_y), medv_outliers), axis=0)

# Histogram of the augmented training target: 50 bins over [0, 850]
fig = plt.figure(figsize=(13, 7))
plt.hist(train_y, range=(0, 850), bins=50)

(array([  91.,  224.,   41.,    0.,    0.,    0.,    0.,    0.,    0.,
           0.,    0.,    0.,    0.,    0.,    0.,    0.,    0.,    0.,
           0.,    0.,    0.,    0.,    0.,    0.,    0.,    0.,    0.,
           0.,    0.,    0.,    0.,    0.,    0.,    0.,    0.,    1.,
           0.,    0.,    0.,    0.,    0.,    1.,    0.,    0.,    0.,
           0.,    0.,    1.,    0.,    0.]),
 array([   0.,   17.,   34.,   51.,   68.,   85.,  102.,  119.,  136.,
         153.,  170.,  187.,  204.,  221.,  238.,  255.,  272.,  289.,
         306.,  323.,  340.,  357.,  374.,  391.,  408.,  425.,  442.,
         459.,  476.,  493.,  510.,  527.,  544.,  561.,  578.,  595.,
         612.,  629.,  646.,  663.,  680.,  697.,  714.,  731.,  748.,
         765.,  782.,  799.,  816.,  833.,  850.]),
 <a list of 50 Patch objects>)

In [37]:
# Refit the L1-loss GradientBoostingRegressor on the current train_X/train_y.
# NOTE(review): newer scikit-learn releases appear to name this loss
# 'absolute_error' instead of 'lad' -- confirm against the installed version.
np.random.seed(9876)  # seed NumPy's global RNG before fitting

mod = GradientBoostingRegressor(loss='lad')
fit = mod.fit(X=train_X, y=train_y)
predict = fit.predict(test_X)

# Root Mean Squared Error of the predictions on test_X / test_y
rmse(predict, test_y)

4.9276800806620722

In [38]:
# Refit the L2-loss GradientBoostingRegressor on the current train_X/train_y.
# NOTE(review): newer scikit-learn releases appear to name this loss
# 'squared_error' instead of 'ls' -- confirm against the installed version.
mod = GradientBoostingRegressor(loss='ls')
fit = mod.fit(X=train_X, y=train_y)
predict = fit.predict(test_X)

# Root Mean Squared Error of the predictions on test_X / test_y
rmse(predict, test_y)

95.081063278596588

In [39]:
# Create a new, empty 13 x 7 figure; the histogram call below is left disabled
fig = plt.figure(figsize=(13,7))
#plt.hist(data['medv'], bins=50, range = (0, 50))

In [40]:
# Evaluate the figure object so the notebook shows its representation
fig

<matplotlib.figure.Figure at 0x11ac17d30>

In [41]:
# Re-plot the histogram of the original 'medv' column (50 bins over [0, 50])
plt.hist(data['medv'], bins=50, range = (0, 50))

(array([  0.,   0.,   0.,   0.,   0.,   3.,   1.,   7.,  10.,   3.,  10.,
          9.,   9.,  24.,  18.,  16.,  16.,  22.,  25.,  37.,  36.,  31.,
         35.,  37.,  25.,  11.,   8.,   9.,  10.,  10.,   7.,   8.,   7.,
          9.,   5.,   4.,   6.,   5.,   1.,   1.,   0.,   2.,   2.,   3.,
          2.,   1.,   2.,   0.,   3.,  16.]),
 array([  0.,   1.,   2.,   3.,   4.,   5.,   6.,   7.,   8.,   9.,  10.,
         11.,  12.,  13.,  14.,  15.,  16.,  17.,  18.,  19.,  20.,  21.,
         22.,  23.,  24.,  25.,  26.,  27.,  28.,  29.,  30.,  31.,  32.,
         33.,  34.,  35.,  36.,  37.,  38.,  39.,  40.,  41.,  42.,  43.,
         44.,  45.,  46.,  47.,  48.,  49.,  50.]),
 <a list of 50 Patch objects>)