referensi https://www.analyticsvidhya.com/blog/2021/04/forward-feature-selection-and-its-implementation/

In [1]:
# #instal library mlxtend
# %pip install mlxtend

In [2]:
# Import Library
import pandas as pd
from mlxtend.feature_selection import SequentialFeatureSelector as sfs
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split, KFold, cross_validate
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
import time

In [3]:
#import dataset
df = pd.read_csv('/home/jupyter-17523142/Rama/Dataset/dataset_rama.csv')
X = df.drop(['No', 'Mango Cultivars', 'Vit C (mg/100g)', 'TA (mg/100g)', 'SSC (oBrix)', 'label'], axis = 1)
y_vitc = df.loc[:, 'Vit C (mg/100g)']

# separate dataset into train and test
x_train, x_test, y_train, y_test = train_test_split(
    X, y_vitc, test_size = 0.3, random_state=0)

## Seleksi Fitur Forward Selection dengan Linear Regression

In [4]:
# calling the Linear Regression model
lr_model =  LinearRegression()
sfs = sfs(lr_model, k_features=80, forward=True, verbose=2, scoring='neg_root_mean_squared_error', n_jobs=-1)

In [5]:
%%time 

sfs = sfs.fit(x_train, y_train)

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 80 concurrent workers.
[Parallel(n_jobs=-1)]: Done   2 tasks      | elapsed:    5.2s
[Parallel(n_jobs=-1)]: Done 205 tasks      | elapsed:    6.8s
[Parallel(n_jobs=-1)]: Done 488 tasks      | elapsed:    7.5s
[Parallel(n_jobs=-1)]: Done 853 tasks      | elapsed:    8.5s
[Parallel(n_jobs=-1)]: Done 1298 tasks      | elapsed:    9.5s
[Parallel(n_jobs=-1)]: Done 1557 out of 1557 | elapsed:    9.8s finished

[2022-07-13 06:47:09] Features: 1/80 -- score: -13.787313513773904[Parallel(n_jobs=-1)]: Using backend LokyBackend with 80 concurrent workers.
[Parallel(n_jobs=-1)]: Done   2 tasks      | elapsed:    0.3s
[Parallel(n_jobs=-1)]: Done 250 tasks      | elapsed:    1.0s
[Parallel(n_jobs=-1)]: Done 816 tasks      | elapsed:    1.9s
[Parallel(n_jobs=-1)]: Done 1397 out of 1556 | elapsed:    2.8s remaining:    0.3s
[Parallel(n_jobs=-1)]: Done 1556 out of 1556 | elapsed:    2.8s finished

[2022-07-13 06:47:12] Features: 2/80 -- score: -12.3

CPU times: user 13min 45s, sys: 28.9 s, total: 14min 14s
Wall time: 14min 3s


[Parallel(n_jobs=-1)]: Done 1478 out of 1478 | elapsed:   20.3s finished

[2022-07-13 07:01:03] Features: 80/80 -- score: -19.371340515482583

In [6]:
feat_names = list(sfs.k_feature_names_)
print("Fitur yang diambil adalah: ")
print(feat_names)
len(feat_names)

Fitur yang diambil adalah: 
['1009.6', '1041.7', '1043.4', '1045.9', '1047.1', '1047.6', '1048.4', '1050.5', '1051', '1052.2', '1052.7', '1054', '1057.4', '1060', '1062.6', '1065.7', '1070.5', '1081.7', '1084.8', '1085.3', '1085.7', '1086.6', '1087.1', '1087.6', '1088.9', '1089.4', '1089.8', '1091.2', '1094.4', '1094.9', '1096.3', '1098.6', '1100.5', '1101.4', '1104.2', '1115.2', '1149.3', '1390.2', '1391', '1397.7', '1406', '1411.4', '1414.5', '1422.2', '1424.6', '1425.4', '1427.7', '1436.4', '1439.6', '1446', '1446.8', '1450.9', '1452.5', '1454.1', '1464', '1466.5', '1485', '1485.8', '1490.1', '1494.4', '1518.9', '1521.6', '1527.8', '1528.7', '1536', '1536.9', '1540.5', '1555.3', '1559.1', '1560', '1654.6', '1659.9', '1660.9', '1664.1', '1666.3', '1669.5', '1674.9', '1680.3', '1681.4', '1732']


80

# Prediksi Linear Regression

In [7]:
#cross validation 10-fold
cv = KFold(n_splits=10, random_state=1, shuffle=True)

In [8]:
%%time

#tentukan metode scoring yang digunakan
metrics = {'rmse': 'neg_root_mean_squared_error',
               'mse': 'neg_mean_squared_error',
               'mae': 'neg_mean_absolute_error',
               'r2': 'r2'} 

n_feat = [80]

for nfeat in n_feat:
        print("==================================================")
        
        #ambil n fitur input hasil seleksi fitur SFS
        x_train_selected = x_train[feat_names].iloc[:,0:nfeat]
        x_test_selected = x_test[feat_names].iloc[:,0:nfeat]

        #Create a Linear Regression
        lr_model = LinearRegression()
        
        #Train the model using the training sets
        lr_model.fit(x_train_selected, y_train)
        y_pred_lr = lr_model.predict(x_test_selected)

        #hitung score model dari data train
        scores = cross_validate(lr_model, x_train_selected, y_train, scoring=metrics, cv=cv, return_train_score=True)

        print("MSE model Linear Regression data Train dengan " + str(nfeat) + " fitur: " 
              + str(abs(round(scores['train_mse'].mean(), 2))))
        print("RMSE model Linear Regression data Train dengan " + str(nfeat) + " fitur:"
              + str(abs(round(scores['train_rmse'].mean(), 2))))
        print("MAE model Linear Regression data Train dengan " + str(nfeat) + " fitur:"
              + str(abs(round(scores['train_mae'].mean(), 2))))
        print("R2 model Linear Regression data Train dengan " + str(nfeat) + " fitur:"
              + str(abs((round(scores['train_r2'].mean(), 2)))))    
        print("----------------------------")        
        print("MSE model Linear Regression data Test dengan " + str(nfeat) + " fitur:" 
          + str(abs(round(mean_squared_error(y_test, y_pred_lr), 2))))
        print("RMSE model Linear Regression data Test dengan " + str(nfeat) + " fitur:" 
          + str(abs(round(mean_squared_error(y_test, y_pred_lr, squared = False), 2))))
        print("MAE model Linear Regression data Test dengan " + str(nfeat) + " fitur:" 
          + str(abs(round(mean_absolute_error(y_test, y_pred_lr), 2))))
        print("R2 model Linear Regression data Test dengan " + str(nfeat) + " fitur:" 
          + str(abs(round(r2_score(y_test, y_pred_lr), 2))))
        print(" ")
        print("==================================================")

MSE model Linear Regression data Train dengan 80 fitur: 39.38
RMSE model Linear Regression data Train dengan 80 fitur:6.26
MAE model Linear Regression data Train dengan 80 fitur:4.64
R2 model Linear Regression data Train dengan 80 fitur:0.79
----------------------------
MSE model Linear Regression data Test dengan 80 fitur:457.57
RMSE model Linear Regression data Test dengan 80 fitur:21.39
MAE model Linear Regression data Test dengan 80 fitur:15.88
R2 model Linear Regression data Test dengan 80 fitur:1.65
 
CPU times: user 7.56 s, sys: 290 ms, total: 7.85 s
Wall time: 450 ms


# Prediksi Random Forest Regressor

## 100 Trees

In [9]:
%%time

#tentukan metode scoring yang digunakan
metrics = {'rmse': 'neg_root_mean_squared_error',
               'mse': 'neg_mean_squared_error',
               'mae': 'neg_mean_absolute_error',
               'r2': 'r2'} 

n_feat = [80]
n_trees = [100]

for nfeat in n_feat:
    for ntrees in n_trees: 
        print("==================================================")
        
        #ambil n fitur input hasil seleksi fitur SFS
        x_train_selected = x_train[feat_names].iloc[:,0:nfeat]
        x_test_selected = x_test[feat_names].iloc[:,0:nfeat]

        #Create a Random Forest Regression
        rfg_model = RandomForestRegressor(n_estimators=ntrees, random_state=100)
        
        #Train the model using the training sets
        rfg_model.fit(x_train_selected, y_train)
        y_pred_rfg = rfg_model.predict(x_test_selected)

        #hitung score model dari data train
        scores = cross_validate(rfg_model, x_train_selected, y_train, scoring=metrics, cv=cv, return_train_score=True)

        print("MSE model Random Forest Regression data Train dengan " + str(nfeat) + " fitur: " 
              + str(abs(round(scores['train_mse'].mean(), 2))))
        print("RMSE model Random Forest Regression data Train dengan " + str(nfeat) + " fitur:"
              + str(abs(round(scores['train_rmse'].mean(), 2))))
        print("MAE model Random Forest Regression data Train dengan " + str(nfeat) + " fitur:"
              + str(abs(round(scores['train_mae'].mean(), 2))))
        print("R2 model Random Forest Regression data Train dengan " + str(nfeat) + " fitur:"
              + str(abs((round(scores['train_r2'].mean(), 2)))))    
        print("----------------------------")
        print("MSE model Random Forest Regression data Test dengan " + str(nfeat) + " fitur:" 
          + str(abs(round(mean_squared_error(y_test, y_pred_rfg), 2))))
        print("RMSE model Random Forest Regression data Test dengan " + str(nfeat) + " fitur:" 
          + str(abs(round(mean_squared_error(y_test, y_pred_rfg, squared = False), 2))))
        print("MAE model Random Forest Regression data Test dengan " + str(nfeat) + " fitur:" 
          + str(abs(round(mean_absolute_error(y_test, y_pred_rfg), 2))))
        print("R2 model Random Forest Regression data Test dengan " + str(nfeat) + " fitur:" 
          + str(abs(round(r2_score(y_test, y_pred_rfg), 2))))
        print(" ")
        print("==================================================")

MSE model Linear Regression data Train dengan 80 fitur: 38.53
RMSE model Linear Regression data Train dengan 80 fitur:6.2
MAE model Linear Regression data Train dengan 80 fitur:4.72
R2 model Linear Regression data Train dengan 80 fitur:0.8
----------------------------
MSE model Linear Regression data Test dengan 80 fitur:185.62
RMSE model Linear Regression data Test dengan 80 fitur:13.62
MAE model Linear Regression data Test dengan 80 fitur:10.75
R2 model Linear Regression data Test dengan 80 fitur:0.08
 
CPU times: user 5.93 s, sys: 95.3 ms, total: 6.02 s
Wall time: 3.43 s


## 150 Trees

In [10]:
%%time

#tentukan metode scoring yang digunakan
metrics = {'rmse': 'neg_root_mean_squared_error',
               'mse': 'neg_mean_squared_error',
               'mae': 'neg_mean_absolute_error',
               'r2': 'r2'} 

n_feat = [80]
n_trees = [150]

for nfeat in n_feat:
    for ntrees in n_trees: 
        print("==================================================")
        
        #ambil n fitur input hasil seleksi fitur SFS
        x_train_selected = x_train[feat_names].iloc[:,0:nfeat]
        x_test_selected = x_test[feat_names].iloc[:,0:nfeat]

        #Create a Random Forest Regression
        rfg_model = RandomForestRegressor(n_estimators=ntrees, random_state=100)
        
        #Train the model using the training sets
        rfg_model.fit(x_train_selected, y_train)
        y_pred_rfg = rfg_model.predict(x_test_selected)

        #hitung score model dari data train
        scores = cross_validate(rfg_model, x_train_selected, y_train, scoring=metrics, cv=cv, return_train_score=True)

        print("MSE model Random Forest Regression data Train dengan " + str(nfeat) + " fitur: " 
              + str(abs(round(scores['train_mse'].mean(), 2))))
        print("RMSE model Random Forest Regression data Train dengan " + str(nfeat) + " fitur:"
              + str(abs(round(scores['train_rmse'].mean(), 2))))
        print("MAE model Random Forest Regression data Train dengan " + str(nfeat) + " fitur:"
              + str(abs(round(scores['train_mae'].mean(), 2))))
        print("R2 model Random Forest Regression data Train dengan " + str(nfeat) + " fitur:"
              + str(abs((round(scores['train_r2'].mean(), 2)))))    
        print("----------------------------")
        print("MSE model Random Forest Regression data Test dengan " + str(nfeat) + " fitur:" 
          + str(abs(round(mean_squared_error(y_test, y_pred_rfg), 2))))
        print("RMSE model Random Forest Regression data Test dengan " + str(nfeat) + " fitur:" 
          + str(abs(round(mean_squared_error(y_test, y_pred_rfg, squared = False), 2))))
        print("MAE model Random Forest Regression data Test dengan " + str(nfeat) + " fitur:" 
          + str(abs(round(mean_absolute_error(y_test, y_pred_rfg), 2))))
        print("R2 model Random Forest Regression data Test dengan " + str(nfeat) + " fitur:" 
          + str(abs(round(r2_score(y_test, y_pred_rfg), 2))))
        print(" ")
        print("==================================================")

MSE model Linear Regression data Train dengan 80 fitur: 38.43
RMSE model Linear Regression data Train dengan 80 fitur:6.19
MAE model Linear Regression data Train dengan 80 fitur:4.73
R2 model Linear Regression data Train dengan 80 fitur:0.8
----------------------------
MSE model Linear Regression data Test dengan 80 fitur:185.61
RMSE model Linear Regression data Test dengan 80 fitur:13.62
MAE model Linear Regression data Test dengan 80 fitur:10.79
R2 model Linear Regression data Test dengan 80 fitur:0.08
 
CPU times: user 4.84 s, sys: 34 ms, total: 4.88 s
Wall time: 4.87 s


## 200 Trees

In [11]:
%%time

#tentukan metode scoring yang digunakan
metrics = {'rmse': 'neg_root_mean_squared_error',
               'mse': 'neg_mean_squared_error',
               'mae': 'neg_mean_absolute_error',
               'r2': 'r2'} 

n_feat = [80]
n_trees = [200]

for nfeat in n_feat:
    for ntrees in n_trees: 
        print("==================================================")
        
        #ambil n fitur input hasil seleksi fitur SFS
        x_train_selected = x_train[feat_names].iloc[:,0:nfeat]
        x_test_selected = x_test[feat_names].iloc[:,0:nfeat]

        #Create a Random Forest Regression
        rfg_model = RandomForestRegressor(n_estimators=ntrees, random_state=100)
        
        #Train the model using the training sets
        rfg_model.fit(x_train_selected, y_train)
        y_pred_rfg = rfg_model.predict(x_test_selected)

        #hitung score model dari data train
        scores = cross_validate(rfg_model, x_train_selected, y_train, scoring=metrics, cv=cv, return_train_score=True)

        print("MSE model Random Forest Regression data Train dengan " + str(nfeat) + " fitur: " 
              + str(abs(round(scores['train_mse'].mean(), 2))))
        print("RMSE model Random Forest Regression data Train dengan " + str(nfeat) + " fitur:"
              + str(abs(round(scores['train_rmse'].mean(), 2))))
        print("MAE model Random Forest Regression data Train dengan " + str(nfeat) + " fitur:"
              + str(abs(round(scores['train_mae'].mean(), 2))))
        print("R2 model Random Forest Regression data Train dengan " + str(nfeat) + " fitur:"
              + str(abs((round(scores['train_r2'].mean(), 2)))))    
        print("----------------------------")
        print("MSE model Random Forest Regression data Test dengan " + str(nfeat) + " fitur:" 
          + str(abs(round(mean_squared_error(y_test, y_pred_rfg), 2))))
        print("RMSE model Random Forest Regression data Test dengan " + str(nfeat) + " fitur:" 
          + str(abs(round(mean_squared_error(y_test, y_pred_rfg, squared = False), 2))))
        print("MAE model Random Forest Regression data Test dengan " + str(nfeat) + " fitur:" 
          + str(abs(round(mean_absolute_error(y_test, y_pred_rfg), 2))))
        print("R2 model Random Forest Regression data Test dengan " + str(nfeat) + " fitur:" 
          + str(abs(round(r2_score(y_test, y_pred_rfg), 2))))
        print(" ")
        print("==================================================")

MSE model Linear Regression data Train dengan 80 fitur: 38.39
RMSE model Linear Regression data Train dengan 80 fitur:6.19
MAE model Linear Regression data Train dengan 80 fitur:4.73
R2 model Linear Regression data Train dengan 80 fitur:0.8
----------------------------
MSE model Linear Regression data Test dengan 80 fitur:184.11
RMSE model Linear Regression data Test dengan 80 fitur:13.57
MAE model Linear Regression data Test dengan 80 fitur:10.8
R2 model Linear Regression data Test dengan 80 fitur:0.07
 
CPU times: user 6.27 s, sys: 50 ms, total: 6.32 s
Wall time: 6.32 s
