In [43]:
!pip install xgboost



In [39]:
!brew install libomp

Updating Homebrew...
[34m==>[0m [1mAuto-updated Homebrew![0m
Updated 2 taps (homebrew/core and homebrew/services).
[34m==>[0m [1mNew Formulae[0m
argocd-autopilot    erlang@23           maturin             qthreads
at-spi2-atk         gcc@10              moar                range2cidr
at-spi2-core        gitbackup           mongocli            rmw
atuin               gitwatch            mongosh             scotch
autoconf@2.69       gpg-tui             neovim-remote       search-that-hash
avahi               gradle@6            nomino              simde
cadical             grepip              openexr@2           slides
caire               himalaya            opensearch          sqlbench
cidr2range          ipinfo-cli          osinfo-db           sqlx-cli
clazy               julia               osinfo-db-tools     storj-uplink
code-minimap        libmobi             parquet-cli         tbb@2020
ddcctl              lsix                php-cs-fixer@2      tmuxp
ehco                

In [42]:
!pip install lightgbm

Collecting lightgbm
  Downloading lightgbm-3.2.1-py3-none-macosx_10_14_x86_64.macosx_10_15_x86_64.macosx_11_0_x86_64.whl (1.2 MB)
[K     |████████████████████████████████| 1.2 MB 1.6 MB/s eta 0:00:01     |█████████████████               | 645 kB 1.1 MB/s eta 0:00:01
Installing collected packages: lightgbm
Successfully installed lightgbm-3.2.1


In [6]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from scipy.stats import skew
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression , Ridge , Lasso, ElasticNet
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.tree import DecisionTreeRegressor
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

def get_best_params(model, params):
    grid_model = GridSearchCV(model, param_grid=params, scoring='neg_mean_squared_error', cv=5)
    grid_model.fit(X_features, y_target)
    rmse = np.sqrt(-1* grid_model.best_score_)
    print('{0} 5 CV best average RMSE value: {1}, best alpha:{2}'.format(model.__class__.__name__,np.round(rmse, 4), grid_model.best_params_))
    return grid_model.best_estimator_

def get_model_predict(model, X_train, X_test, y_train, y_test):
    model.fit(X_train, y_train)
    pred = model.predict(X_test)
    # scale back by expm1() because the predicted result is predicted by log-translated data
    y_test = np.expm1(y_test)
    pred = np.expm1(pred)
    print('\n###',model.__class__.__name__,'###')
    evaluate_regr(y_test, pred)

# calculate MAE, MSE, RMSE, RMSLE
def evaluate_regr(y,pred):
    mae_val = mean_absolute_error(y,pred)
    mse_val = mean_squared_error(y,pred)
    rmse_val = rmse(y,pred)
    rmsle_val = rmsle(y,pred)
    r2_val = r2_score(y, pred)
    print('MAE: {0:.3F}, MSE: {2:.3F}, RMSE: {1:.3F}, RMSLE: {0:.3F}, R2: {3:.3F}'.format(mae_val, mse_val, rmse_val, rmsle_val, r2_val))

# calculate RMSLE using log1p(), not log() because of the NaN issue 
def rmsle(y, pred):
    log_y = np.log1p(y)
    log_pred = np.log1p(pred)
    squared_error = (log_y - log_pred) ** 2
    rmsle = np.sqrt(np.mean(squared_error))
    return rmsle

# calculate RMSE using mean_square_error() of Scikit-learn 
def rmse(y,pred):
    return np.sqrt(mean_squared_error(y,pred))

def print_coefficient(models):
    for model in models:
        print('\n###',model.__class__.__name__,'###')
        coeff = pd.Series(data=np.round(model.coef_, 3), index=X_features_ohe.columns )
        print(coeff.sort_values(ascending=False))

# visualize coefficients of features for linear regression models
def visualize_coefficient(models):
    fig, axs = plt.subplots(figsize=(24,10),nrows=1, ncols=4)
    fig.tight_layout()
    for i_num, model in enumerate(models):
        coef_high, coef_low = get_top_bottom_coef(model)
        coef_concat = pd.concat( [coef_high , coef_low] )
        axs[i_num].set_title(model.__class__.__name__+' Coeffiecents', size=25)
        axs[i_num].tick_params(axis="y",direction="in", pad=-120)
        for label in (axs[i_num].get_xticklabels() + axs[i_num].get_yticklabels()):
            label.set_fontsize(22)
        sns.barplot(x=coef_concat.values, y=coef_concat.index , ax=axs[i_num])

# extract top 3 of features, bottom 3 efficient for linear regression models
def get_top_bottom_coef(model):
    # create Series objects based on coef_ property
    coef = pd.Series(model.coef_, index=X_features_ohe.columns)
    coef_high = coef.sort_values(ascending=False).head(3)
    coef_low = coef.sort_values(ascending=False).tail(3)
    return coef_high, coef_low

# visualize coefficients of features for regression tree models 
def visualize_ftr_importances(models):
    fig, axs = plt.subplots(figsize=(24,10),nrows=1, ncols=5)
    fig.tight_layout()
    for i_num, model in enumerate(models):
        ftr_top6 = get_top_features(model)
        axs[i_num].set_title(model.__class__.__name__+' Feature Importances', size=17)
        for label in (axs[i_num].get_xticklabels() + axs[i_num].get_yticklabels()):
            label.set_fontsize(22)
        sns.barplot(x=ftr_top6.values, y=ftr_top6.index , ax=axs[i_num])

# extract top 3 of features, bottom 3 efficient for regression tree models
def get_top_features(model):
    ftr_importances_values = model.feature_importances_
    ftr_importances = pd.Series(ftr_importances_values, index=X_features_ohe.columns)
    ftr_top6 = ftr_importances.sort_values(ascending=False)[:6]
    return ftr_top6

# file load
# delete 'Index' because it is provided when converted to a data frame, and delete 'Standard_Weight' because it is determined by the hegith
file_df = pd.read_csv('./train_data.csv')
target_name = 'Body_Fat_Rate'
no_need_features = ['Index', 'Standard_Weight']
category_features = ['Sex']

# arrange X and y
file_df.drop(no_need_features, axis=1, inplace=True)
y_target = file_df[target_name]
X_features = file_df.drop([target_name],axis=1,inplace=False)

# visualize data to find outliers
# for feature in X_features.drop(category_features, axis=1, inplace=False):
#     plt.scatter(x = file_df[feature], y = y_target)
#     plt.ylabel(target_name, fontsize=15)
#     plt.xlabel(feature, fontsize=15)
#     plt.show()

# remove outlier
outlier_name = 'Height'
cond1 = file_df[outlier_name] < 60
cond2 = file_df[target_name] < 30
outlier_index = X_features[cond1 & cond2].index
# print('Outlier index :', outlier_index.values)
# print('X_feature shape before Outlier is removed:', X_features.shape)
X_features.drop(outlier_index , axis=0, inplace=True)
y_target.drop(outlier_index, axis=0, inplace=True)
# print('X_feature shape after Outlier is removed:', X_features.shape)

# figure out the extent of distortion in features --> if the degree of distortion is high(>1 or <-1), log transformation is performed.
# 'Height' needs the log transformation
features_index = file_df.drop(category_features, axis=1, inplace=False).dtypes.index
skew_features = file_df[features_index].apply(lambda x : skew(x))
# print(skew_features.sort_values(ascending=False))
skew_features_change = skew_features[skew_features < -1]
file_df[skew_features_change.index] = np.log1p(file_df[skew_features_change.index])

# change the category feature to One-Hot Encoding --> 'Sex'
X_features_ohe = pd.get_dummies(X_features, columns=category_features)
# print(X_features_ohe)

# the log transformation is applied on the target column to form a normal distribution
y_target_log = np.log1p(y_target)
# print(y_target)
# print(y_target_log)

# split train/test data based on feature dataset with One-Hot encoding
X_train, X_test, y_train, y_test = train_test_split(X_features_ohe, y_target_log, test_size=0.2, random_state=0)

# define the model
lr_reg = LinearRegression()
ridge_reg = Ridge(alpha=8)
lasso_reg = Lasso(alpha=0.05)
en_reg = ElasticNet(alpha=0.07)
dt_reg = DecisionTreeRegressor(max_depth=7)
rf_reg = RandomForestRegressor(max_depth=14, min_samples_leaf=2, min_samples_split=2, n_estimators=700, n_jobs=-1)
gbm_reg = GradientBoostingRegressor(n_estimators=500, learning_rate=0.02, subsample=0.05)
xgb_reg = XGBRegressor(n_estimators=120, eta=0.1, min_child_weight=3, max_depth=3)
lgbm_reg = LGBMRegressor(n_estimators=1000, learning_rate=0.03, max_depth=3, min_child_samples=10, num_leaves=3)

# find best parameters
# ridge_params = { 'alpha':[0.01, 0.05, 0.09, 0.1, 0.11, 0.12, 0.5, 1, 3, 5, 8, 10, 12, 15, 20, 30, 40, 50]}
# lasso_params = { 'alpha':[0.01, 0.05, 0.09, 0.1, 0.11, 0.12, 0.5, 1, 3, 5, 8, 10, 12, 15, 20, 30, 40, 50]}
# en_params = { 'alpha':[0.07, 0.1, 0.5, 1, 3]}
# dt_params = {'max_depth':[1,3,5,7,9]}
# rf_params = {'n_estimators':[700], 'max_depth' : [14], 'min_samples_leaf' : [2], 'min_samples_split' : [2]}
# gbm_params = {'learning_rate': [0.02], 'n_estimators':[500], 'subsample': [0.05]}
# xgb_params = {'colsample_bytree': [1], 'eta': [0.1], 'max_depth': [3], 'min_child_weight': [3], 'n_estimators':[120]}
# lgbm_params = {'learning_rate': [0.03], 'max_depth': [3], 'min_child_samples': [10], 'n_estimators':[1000], 'num_leaves': [3]}
# best_rige = get_best_params(ridge_reg, ridge_params)
# best_lasso = get_best_params(lasso_reg, lasso_params)
# best_en = get_best_params(en_reg, en_params)
# best_dt = get_best_params(dt_reg, dt_params)
# best_rf = get_best_params(rf_reg, rf_params)
# best_gbm = get_best_params(gbm_reg, gbm_params)
# best_xgb = get_best_params(xgb_reg, xgb_params)
# best_lgbm = get_best_params(lgbm_reg, lgbm_params)

# linear regression models
models_linear = [lr_reg, ridge_reg, lasso_reg, en_reg]
for model in models_linear:
    get_model_predict(model,X_train, X_test, y_train, y_test)

# visualize coefficients of linear regression models
# print_coefficient(models_linear)
# visualize_coefficient(models_linear)

# regression tree models
models_tree = [dt_reg, rf_reg, gbm_reg, xgb_reg, lgbm_reg]
for model in models_tree:
    get_model_predict(model,X_train, X_test, y_train, y_test)

# visualize coefficients of regression tree models
# visualize_ftr_importances(models_tree)


### LinearRegression ###
MAE: 2.902, MSE: 3.731, RMSE: 13.920, RMSLE: 2.902, R2: 0.158

### Ridge ###
MAE: 2.889, MSE: 3.711, RMSE: 13.771, RMSLE: 2.889, R2: 0.159

### Lasso ###
MAE: 3.818, MSE: 4.857, RMSE: 23.587, RMSLE: 3.818, R2: 0.212

### ElasticNet ###
MAE: 3.754, MSE: 4.786, RMSE: 22.904, RMSLE: 3.754, R2: 0.209

### DecisionTreeRegressor ###
MAE: 2.350, MSE: 3.346, RMSE: 11.195, RMSLE: 2.350, R2: 0.141

### RandomForestRegressor ###
MAE: 2.044, MSE: 2.876, RMSE: 8.273, RMSLE: 2.044, R2: 0.117

### GradientBoostingRegressor ###
MAE: 2.345, MSE: 3.108, RMSE: 9.662, RMSLE: 2.345, R2: 0.136

### XGBRegressor ###
MAE: 2.329, MSE: 3.095, RMSE: 9.581, RMSLE: 2.329, R2: 0.130

### LGBMRegressor ###
MAE: 2.578, MSE: 3.388, RMSE: 11.477, RMSLE: 2.578, R2: 0.141
