In [12]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor

# ---------------------------------------------------------------------------
# Training pipeline: load -> clean -> encode -> split -> fit
# ---------------------------------------------------------------------------

# Column roles used throughout the script.
target_name = 'Body_Fat_Rate'
# 'Index' duplicates the row index a DataFrame already provides, and
# 'Standard_Weight' is dropped because it is determined by the height.
no_need_features = ['Index', 'Standard_Weight']
category_features = ['Sex']

# Load the training data and discard the columns that are not needed.
file_df = pd.read_csv('./train_data.csv')
file_df = file_df.drop(columns=no_need_features)

# Separate the target (y) from the features (X).
y_target = file_df[target_name]
X_features = file_df.drop(columns=[target_name])

# Remove outliers: rows whose Height is below 60 while the target is below 30.
# (Thresholds presumably come from a visual inspection of the data.)
outlier_name = 'Height'
cond1 = file_df[outlier_name] < 60
cond2 = file_df[target_name] < 30
outlier_index = X_features.loc[cond1 & cond2].index
X_features = X_features.drop(index=outlier_index)
y_target = y_target.drop(index=outlier_index)

# One-hot encode the categorical feature(s) --> 'Sex'.
X_features_ohe = pd.get_dummies(X_features, columns=category_features)

# Train on log1p(target) so the target is closer to a normal distribution;
# predictions must be mapped back with expm1.
y_target_log = np.log1p(y_target)

# Hold out 20% of the encoded data; the split itself is seeded.
X_train, X_test, y_train, y_test = train_test_split(
    X_features_ohe,
    y_target_log,
    test_size=0.2,
    random_state=0,
)

# Single model: random forest, fitted on the training split only.
rf_reg = RandomForestRegressor(
    n_estimators=700,
    max_depth=14,
    min_samples_split=2,
    min_samples_leaf=2,
    n_jobs=-1,
)
model = rf_reg
model.fit(X_train, y_train)

# Mixed model (disabled alternative: blend of two linear models)
# model1 = LinearRegression()
# model2 = Ridge(alpha=1)
# model1.fit(X_train, y_train)
# model2.fit(X_train, y_train)

# ---------------------------------------------------------------------------
# Prediction on the separate test file
# ---------------------------------------------------------------------------

# Load the test data.
# 'Index' is dropped because a DataFrame already provides a row index, and
# 'Standard_Weight' is dropped because it is determined by the height.
test_df = pd.read_csv('./test_data.csv')
# print(test_df)

# Arrange X and y.
# NOTE: this rebinds X_test / y_test, replacing the hold-out split produced by
# train_test_split above; y_test here is on the original (non-log) scale.
test_df.drop(no_need_features, axis=1, inplace=True)
y_test = test_df[target_name]
X_test = test_df.drop([target_name], axis=1, inplace=False)

# One-hot encode the categorical feature(s) --> 'Sex'.
X_test_ohe = pd.get_dummies(X_test, columns=category_features)

# get_dummies only creates columns for categories present in *this* frame, so
# the test columns can differ from the ones the model was fitted on (missing
# category, different order). Align to the training layout: absent dummy
# columns are filled with 0, and columns unseen during training are dropped.
X_test_ohe = X_test_ohe.reindex(columns=X_train.columns, fill_value=0)

# Single model: predict on the log1p scale, then invert with expm1.
predict_value = model.predict(X_test_ohe)
predict_final = np.expm1(predict_value)
print("**Single Model**")
for x in predict_final:
    print(round(x, 1))
print("")

# Mixed model (disabled alternative: weighted blend of two linear models)
# pred1 = model1.predict(X_test_ohe)
# pred2 = model2.predict(X_test_ohe)
# pred = 0.8 * pred1 + 0.2 * pred2
# np.expm1(pred)
# print("**Mixed Model**")
# for x in np.expm1(pred):
#     print(round(x,1))

**Single Model**
25.5
24.8
16.3
19.3
35.1
18.2
23.5
11.4
24.9
13.7
32.7
19.2
13.8
19.7
10.9
33.5
18.5
15.1
11.8
10.2
28.4
29.9
30.9
19.2
25.3
12.9
16.8
25.9
26.1
18.4
22.8
16.5
13.5
25.7
14.4
32.4
25.5
18.3
19.9
25.4
16.6
29.4
19.2
33.1
26.4
21.3
14.9
18.9
28.3
31.1
22.8
25.8
15.3
26.5
24.9
15.7
27.0
24.6
24.7
32.8
26.1
25.9
31.6
22.4
24.5
26.2
29.1
17.9
26.9
25.1
36.5
31.3
22.7
27.0
30.0
24.8
18.0
28.5
26.5
29.7
28.2
27.7
25.8
25.9
28.0
25.8
26.3
20.8
27.7
33.7
22.5
29.0
26.5
29.5
26.6
31.5
26.7
22.9
31.2
32.1
26.3
23.4
14.5
27.6
34.0
28.4
26.9
25.6
23.9
21.4
30.0
23.0
20.8
28.5
32.9
20.6
23.1
29.3
23.2
22.1
29.0
42.7
23.2
35.3
22.0
35.7
23.4
28.2
30.1
18.9
35.5
34.8
22.6
39.9
40.9
25.1
23.1
40.2
25.7
25.3
34.0
23.7
33.9
35.2
32.9
21.0
35.3
25.5
29.7
25.9
31.6
38.6
23.3
37.4
39.8
24.7
13.2
14.4
17.5
29.8
16.7
14.8
24.3
25.4
16.6
35.4
8.8
26.5
23.5

