In [2]:
import pandas as pd
import numpy as np
from scipy.stats import skew
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
# from sklearn.linear_model import LinearRegression, Ridge, Lasso
# from sklearn.ensemble import GradientBoostingRegressor
# from xgboost import XGBRegressor
# from lightgbm import LGBMRegressor

# train data file load
# delete 'Index' because it is provided when converted to a data frame, and delete 'Standard_Weight' because it is determined by the hegith
file_df = pd.read_csv('./train_data.csv')
target_name = 'Body_Fat_Rate'
no_need_features = ['Index', 'Standard_Weight']
category_features = ['Sex']

# arrange X and y
file_df.drop(no_need_features, axis=1, inplace=True)
y_target = file_df[target_name]
X_features = file_df.drop([target_name],axis=1,inplace=False)

# visualize data to find outliers
outlier_name = 'Height'
cond1 = file_df[outlier_name] < 60
cond2 = file_df[target_name] < 30
outlier_index = X_features[cond1 & cond2].index
X_features.drop(outlier_index , axis=0, inplace=True)
y_target.drop(outlier_index, axis=0, inplace=True)

# figure out the extent of distortion in features --> if the degree of distortion is high(>1 or <-1), log transformation is performed.
# 'Height' needs the log transformation
features_index = file_df.drop(category_features, axis=1, inplace=False).dtypes.index
skew_features = file_df[features_index].apply(lambda x : skew(x))
# print(skew_features.sort_values(ascending=False))
skew_features_change = skew_features[skew_features < -1]
file_df[skew_features_change.index] = np.log1p(file_df[skew_features_change.index])

# change the category feature to One-Hot Encoding --> 'Sex'
X_features_ohe = pd.get_dummies(X_features, columns=category_features)

# the log transformation is applied on the target column to form a normal distribution
y_target_log = np.log1p(y_target)

# split train/test data based on feature dataset with One-Hot encoding
X_train, X_test, y_train, y_test = train_test_split(X_features_ohe, y_target_log, test_size=0.2, random_state=0)

# single model
model = rf_reg = RandomForestRegressor(max_depth=14, min_samples_leaf=2, min_samples_split=2, n_estimators=700, n_jobs=-1)
model.fit(X_train, y_train)

# mixed model
# model1 = LinearRegression()
# model2 = Ridge(alpha=1)
# model1.fit(X_train, y_train)
# model2.fit(X_train, y_train)

# test data file load
# delete 'Index' because it is provided when converted to a data frame, and delete 'Standard_Weight' because it is determined by the hegith
test_df = pd.read_csv('./test_data.csv')
# print(test_df)

# arrange X and y
test_df.drop(no_need_features, axis=1, inplace=True)
y_test = test_df[target_name]
X_test = test_df.drop([target_name],axis=1,inplace=False)

# change the category feature to One-Hot Encoding --> 'Sex'
X_test_ohe = pd.get_dummies(X_test, columns=category_features)

# single model
predict_value = model.predict(X_test_ohe)
predict_final = np.expm1(predict_value)
predict_final
print("**Single Model**")
for x in predict_final:
    print(round(x,1))
print("")

# mixed model
# pred1 = model1.predict(X_test_ohe)
# pred2 = model2.predict(X_test_ohe)
# pred = 0.8 * pred1 + 0.2 * pred2
# np.expm1(pred)
# print("**Mixed Model**")
# for x in np.expm1(pred):
#     print(round(x,1))

**Single Model**
25.6
24.4
15.8
19.3
35.2
18.1
23.5
11.6
24.8
13.8
32.7
19.2
13.7
19.6
10.8
33.6
18.6
15.1
12.0
10.2
28.5
29.8
31.0
19.3
25.2
12.9
16.8
25.9
26.1
18.6
23.0
16.5
13.8
25.7
14.6
32.4
25.4
18.5
19.9
25.4
16.3
29.5
19.2
33.2
26.4
21.4
14.9
18.9
28.2
31.0
22.7
25.6
15.0
26.5
24.9
15.5
27.0
24.7
24.7
32.7
25.9
26.3
31.5
22.2
24.7
25.8
29.1
17.6
26.7
25.1
36.8
31.6
22.7
26.9
29.8
24.8
17.8
28.5
26.5
29.8
28.5
27.6
26.0
25.9
27.8
25.7
26.3
20.8
27.7
33.4
22.3
29.2
26.1
29.6
26.8
31.3
26.7
22.7
31.3
31.5
26.0
23.4
14.6
27.5
34.5
28.3
27.4
25.4
24.0
21.3
30.2
23.0
20.9
28.7
32.9
20.5
22.9
29.9
23.0
22.0
29.0
42.4
23.5
35.3
21.8
35.9
23.7
28.6
30.4
18.8
35.4
35.1
22.2
39.7
40.9
25.1
23.2
40.1
25.6
25.1
34.3
23.7
34.2
35.1
32.9
21.1
35.3
25.4
30.2
26.2
31.7
38.5
23.6
37.4
39.3
24.7
13.0
14.4
17.4
29.7
16.4
14.6
24.3
25.3
16.5
35.4
8.8
26.4
23.5

