Referensi: https://www.analyticsvidhya.com/blog/2021/04/forward-feature-selection-and-its-implementation/

In [1]:
# Install the mlxtend library (uncomment the next line to run it).
# %pip install mlxtend

In [2]:
# Import Library
import time

import pandas as pd
from mlxtend.feature_selection import SequentialFeatureSelector
from mlxtend.feature_selection import SequentialFeatureSelector as sfs
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.model_selection import train_test_split, KFold, cross_validate

In [3]:
# Read the dataset from CSV.
df = pd.read_csv('/home/jupyter-17523142/Rama/Dataset/dataset_rama.csv')

# Predictors are every column except the ones dropped here;
# the regression target is the 'Vit C (mg/100g)' column.
X = df.drop(
    columns=['No', 'Mango Cultivars', 'Vit C (mg/100g)', 'TA (mg/100g)', 'SSC (oBrix)', 'label']
)
y_vitc = df['Vit C (mg/100g)']

# Hold out 30% of the rows as a test split; the fixed seed keeps the split repeatable.
x_train, x_test, y_train, y_test = train_test_split(X, y_vitc, random_state=0, test_size=0.3)

## Seleksi Fitur Forward Selection dengan Linear Regression

In [4]:
# Base estimator whose cross-validated score drives the forward selection.
lr_model = LinearRegression()

# Build the selector from the class's full name instead of the `sfs` alias.
# The previous `sfs = sfs(...)` replaced the imported class with its own
# instance, so running this cell a second time failed with
# "'SequentialFeatureSelector' object is not callable".
# The name `sfs` is still bound to the selector instance because later cells
# call `sfs.fit(...)` and read `sfs.k_feature_names_`.
#
# k_features is 40 to agree with the saved run of this notebook
# ("Features: 40/40", 40 selected names) and with the `n_feat = [40]` used by
# the evaluation cells; the earlier value of 20 contradicted both.
sfs = SequentialFeatureSelector(
    lr_model,
    k_features=40,
    forward=True,
    verbose=2,
    scoring='neg_root_mean_squared_error',
    n_jobs=-1,
)

In [5]:
%%time 

sfs = sfs.fit(x_train, y_train)

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 80 concurrent workers.
[Parallel(n_jobs=-1)]: Done   2 tasks      | elapsed:    6.2s
[Parallel(n_jobs=-1)]: Done 205 tasks      | elapsed:    8.4s
[Parallel(n_jobs=-1)]: Done 488 tasks      | elapsed:    9.1s
[Parallel(n_jobs=-1)]: Done 853 tasks      | elapsed:   10.1s
[Parallel(n_jobs=-1)]: Done 1298 tasks      | elapsed:   11.3s
[Parallel(n_jobs=-1)]: Done 1557 out of 1557 | elapsed:   11.6s finished

[2022-07-13 06:23:32] Features: 1/40 -- score: -13.787313513773904[Parallel(n_jobs=-1)]: Using backend LokyBackend with 80 concurrent workers.
[Parallel(n_jobs=-1)]: Done   2 tasks      | elapsed:    0.4s
[Parallel(n_jobs=-1)]: Done 250 tasks      | elapsed:    1.0s
[Parallel(n_jobs=-1)]: Done 816 tasks      | elapsed:    1.9s
[Parallel(n_jobs=-1)]: Done 1556 out of 1556 | elapsed:    2.9s finished

[2022-07-13 06:23:35] Features: 2/40 -- score: -12.378759342766646[Parallel(n_jobs=-1)]: Using backend LokyBackend with 80 concurrent w

CPU times: user 3min 38s, sys: 11.2 s, total: 3min 49s
Wall time: 3min 54s


[Parallel(n_jobs=-1)]: Done 1359 out of 1518 | elapsed:    7.0s remaining:    0.8s
[Parallel(n_jobs=-1)]: Done 1518 out of 1518 | elapsed:    7.1s finished

[2022-07-13 06:27:15] Features: 40/40 -- score: -9.13691357528299

In [6]:
# Column names kept by the fitted selector, materialised as a plain list.
feat_names = [*sfs.k_feature_names_]

# Header line followed by the selected names on the next line.
print("Fitur yang diambil adalah: ", feat_names, sep="\n")

# Last expression of the cell: number of selected features.
len(feat_names)

Fitur yang diambil adalah: 
['1043.4', '1045.9', '1047.1', '1050.5', '1051', '1052.2', '1052.7', '1054', '1060', '1070.5', '1084.8', '1085.3', '1085.7', '1087.1', '1087.6', '1089.4', '1089.8', '1094.4', '1094.9', '1096.3', '1101.4', '1104.2', '1391', '1422.2', '1424.6', '1425.4', '1427.7', '1439.6', '1446', '1446.8', '1452.5', '1454.1', '1464', '1540.5', '1555.3', '1654.6', '1660.9', '1666.3', '1669.5', '1732']


40

# Prediksi Linear Regression

In [7]:
#cross validation 10-fold
# Shared 10-fold splitter for every cross_validate call below; rows are
# shuffled with a fixed seed so the folds are the same on every run.
cv = KFold(
    n_splits=10,
    shuffle=True,
    random_state=1,
)

In [8]:
%%time

#tentukan metode scoring yang digunakan
metrics = {'rmse': 'neg_root_mean_squared_error',
               'mse': 'neg_mean_squared_error',
               'mae': 'neg_mean_absolute_error',
               'r2': 'r2'} 

n_feat = [40]

for nfeat in n_feat:
        print("==================================================")
        
        #ambil n fitur input hasil seleksi fitur SFS
        x_train_selected = x_train[feat_names].iloc[:,0:nfeat]
        x_test_selected = x_test[feat_names].iloc[:,0:nfeat]

        #Create a Linear Regression
        lr_model = LinearRegression()
        
        #Train the model using the training sets
        lr_model.fit(x_train_selected, y_train)
        y_pred_lr = lr_model.predict(x_test_selected)

        #hitung score model dari data train
        scores = cross_validate(lr_model, x_train_selected, y_train, scoring=metrics, cv=cv, return_train_score=True)

        print("MSE model Linear Regression data Train dengan " + str(nfeat) + " fitur: " 
              + str(abs(round(scores['train_mse'].mean(), 2))))
        print("RMSE model Linear Regression data Train dengan " + str(nfeat) + " fitur:"
              + str(abs(round(scores['train_rmse'].mean(), 2))))
        print("MAE model Linear Regression data Train dengan " + str(nfeat) + " fitur:"
              + str(abs(round(scores['train_mae'].mean(), 2))))
        print("R2 model Linear Regression data Train dengan " + str(nfeat) + " fitur:"
              + str(abs((round(scores['train_r2'].mean(), 2)))))    
        print("----------------------------")        
        print("MSE model Linear Regression data Test dengan " + str(nfeat) + " fitur:" 
          + str(abs(round(mean_squared_error(y_test, y_pred_lr), 2))))
        print("RMSE model Linear Regression data Test dengan " + str(nfeat) + " fitur:" 
          + str(abs(round(mean_squared_error(y_test, y_pred_lr, squared = False), 2))))
        print("MAE model Linear Regression data Test dengan " + str(nfeat) + " fitur:" 
          + str(abs(round(mean_absolute_error(y_test, y_pred_lr), 2))))
        print("R2 model Linear Regression data Test dengan " + str(nfeat) + " fitur:" 
          + str(abs(round(r2_score(y_test, y_pred_lr), 2))))
        print(" ")
        print("==================================================")

MSE model Linear Regression data Train dengan 40 fitur: 52.9
RMSE model Linear Regression data Train dengan 40 fitur:7.27
MAE model Linear Regression data Train dengan 40 fitur:5.63
R2 model Linear Regression data Train dengan 40 fitur:0.72
----------------------------
MSE model Linear Regression data Test dengan 40 fitur:394.58
RMSE model Linear Regression data Test dengan 40 fitur:19.86
MAE model Linear Regression data Test dengan 40 fitur:14.7
R2 model Linear Regression data Test dengan 40 fitur:1.29
 
CPU times: user 6.32 s, sys: 181 ms, total: 6.5 s
Wall time: 469 ms


# Prediksi Random Forest Regressor

## 100 Trees

In [9]:
%%time

#tentukan metode scoring yang digunakan
metrics = {'rmse': 'neg_root_mean_squared_error',
               'mse': 'neg_mean_squared_error',
               'mae': 'neg_mean_absolute_error',
               'r2': 'r2'} 

n_feat = [40]
n_trees = [100]

for nfeat in n_feat:
    for ntrees in n_trees: 
        print("==================================================")
        
        #ambil n fitur input hasil seleksi fitur SFS
        x_train_selected = x_train[feat_names].iloc[:,0:nfeat]
        x_test_selected = x_test[feat_names].iloc[:,0:nfeat]

        #Create a Random Forest Regression
        rfg_model = RandomForestRegressor(n_estimators=ntrees, random_state=100)
        
        #Train the model using the training sets
        rfg_model.fit(x_train_selected, y_train)
        y_pred_rfg = rfg_model.predict(x_test_selected)

        #hitung score model dari data train
        scores = cross_validate(rfg_model, x_train_selected, y_train, scoring=metrics, cv=cv, return_train_score=True)

        print("MSE model Random Forest Regression data Train dengan " + str(nfeat) + " fitur: " 
              + str(abs(round(scores['train_mse'].mean(), 2))))
        print("RMSE model Random Forest Regression data Train dengan " + str(nfeat) + " fitur:"
              + str(abs(round(scores['train_rmse'].mean(), 2))))
        print("MAE model Random Forest Regression data Train dengan " + str(nfeat) + " fitur:"
              + str(abs(round(scores['train_mae'].mean(), 2))))
        print("R2 model Random Forest Regression data Train dengan " + str(nfeat) + " fitur:"
              + str(abs((round(scores['train_r2'].mean(), 2)))))    
        print("----------------------------")
        print("MSE model Random Forest Regression data Test dengan " + str(nfeat) + " fitur:" 
          + str(abs(round(mean_squared_error(y_test, y_pred_rfg), 2))))
        print("RMSE model Random Forest Regression data Test dengan " + str(nfeat) + " fitur:" 
          + str(abs(round(mean_squared_error(y_test, y_pred_rfg, squared = False), 2))))
        print("MAE model Random Forest Regression data Test dengan " + str(nfeat) + " fitur:" 
          + str(abs(round(mean_absolute_error(y_test, y_pred_rfg), 2))))
        print("R2 model Random Forest Regression data Test dengan " + str(nfeat) + " fitur:" 
          + str(abs(round(r2_score(y_test, y_pred_rfg), 2))))
        print(" ")
        print("==================================================")

MSE model Linear Regression data Train dengan 40 fitur: 38.16
RMSE model Linear Regression data Train dengan 40 fitur:6.17
MAE model Linear Regression data Train dengan 40 fitur:4.72
R2 model Linear Regression data Train dengan 40 fitur:0.8
----------------------------
MSE model Linear Regression data Test dengan 40 fitur:185.67
RMSE model Linear Regression data Test dengan 40 fitur:13.63
MAE model Linear Regression data Test dengan 40 fitur:10.68
R2 model Linear Regression data Test dengan 40 fitur:0.08
 
CPU times: user 5.47 s, sys: 115 ms, total: 5.58 s
Wall time: 2.45 s


## 150 Trees

In [10]:
%%time

#tentukan metode scoring yang digunakan
metrics = {'rmse': 'neg_root_mean_squared_error',
               'mse': 'neg_mean_squared_error',
               'mae': 'neg_mean_absolute_error',
               'r2': 'r2'} 

n_feat = [40]
n_trees = [150]

for nfeat in n_feat:
    for ntrees in n_trees: 
        print("==================================================")
        
        #ambil n fitur input hasil seleksi fitur SFS
        x_train_selected = x_train[feat_names].iloc[:,0:nfeat]
        x_test_selected = x_test[feat_names].iloc[:,0:nfeat]

        #Create a Random Forest Regression
        rfg_model = RandomForestRegressor(n_estimators=ntrees, random_state=100)
        
        #Train the model using the training sets
        rfg_model.fit(x_train_selected, y_train)
        y_pred_rfg = rfg_model.predict(x_test_selected)

        #hitung score model dari data train
        scores = cross_validate(rfg_model, x_train_selected, y_train, scoring=metrics, cv=cv, return_train_score=True)

        print("MSE model Random Forest Regression data Train dengan " + str(nfeat) + " fitur: " 
              + str(abs(round(scores['train_mse'].mean(), 2))))
        print("RMSE model Random Forest Regression data Train dengan " + str(nfeat) + " fitur:"
              + str(abs(round(scores['train_rmse'].mean(), 2))))
        print("MAE model Random Forest Regression data Train dengan " + str(nfeat) + " fitur:"
              + str(abs(round(scores['train_mae'].mean(), 2))))
        print("R2 model Random Forest Regression data Train dengan " + str(nfeat) + " fitur:"
              + str(abs((round(scores['train_r2'].mean(), 2)))))    
        print("----------------------------")
        print("MSE model Random Forest Regression data Test dengan " + str(nfeat) + " fitur:" 
          + str(abs(round(mean_squared_error(y_test, y_pred_rfg), 2))))
        print("RMSE model Random Forest Regression data Test dengan " + str(nfeat) + " fitur:" 
          + str(abs(round(mean_squared_error(y_test, y_pred_rfg, squared = False), 2))))
        print("MAE model Random Forest Regression data Test dengan " + str(nfeat) + " fitur:" 
          + str(abs(round(mean_absolute_error(y_test, y_pred_rfg), 2))))
        print("R2 model Random Forest Regression data Test dengan " + str(nfeat) + " fitur:" 
          + str(abs(round(r2_score(y_test, y_pred_rfg), 2))))
        print(" ")
        print("==================================================")

MSE model Linear Regression data Train dengan 40 fitur: 37.97
RMSE model Linear Regression data Train dengan 40 fitur:6.16
MAE model Linear Regression data Train dengan 40 fitur:4.71
R2 model Linear Regression data Train dengan 40 fitur:0.8
----------------------------
MSE model Linear Regression data Test dengan 40 fitur:187.78
RMSE model Linear Regression data Test dengan 40 fitur:13.7
MAE model Linear Regression data Test dengan 40 fitur:10.77
R2 model Linear Regression data Test dengan 40 fitur:0.09
 
CPU times: user 3.59 s, sys: 19 ms, total: 3.61 s
Wall time: 3.61 s


## 200 Trees

In [11]:
%%time

#tentukan metode scoring yang digunakan
metrics = {'rmse': 'neg_root_mean_squared_error',
               'mse': 'neg_mean_squared_error',
               'mae': 'neg_mean_absolute_error',
               'r2': 'r2'} 

n_feat = [40]
n_trees = [200]

for nfeat in n_feat:
    for ntrees in n_trees: 
        print("==================================================")
        
        #ambil n fitur input hasil seleksi fitur SFS
        x_train_selected = x_train[feat_names].iloc[:,0:nfeat]
        x_test_selected = x_test[feat_names].iloc[:,0:nfeat]

        #Create a Random Forest Regression
        rfg_model = RandomForestRegressor(n_estimators=ntrees, random_state=100)
        
        #Train the model using the training sets
        rfg_model.fit(x_train_selected, y_train)
        y_pred_rfg = rfg_model.predict(x_test_selected)

        #hitung score model dari data train
        scores = cross_validate(rfg_model, x_train_selected, y_train, scoring=metrics, cv=cv, return_train_score=True)

        print("MSE model Random Forest Regression data Train dengan " + str(nfeat) + " fitur: " 
              + str(abs(round(scores['train_mse'].mean(), 2))))
        print("RMSE model Random Forest Regression data Train dengan " + str(nfeat) + " fitur:"
              + str(abs(round(scores['train_rmse'].mean(), 2))))
        print("MAE model Random Forest Regression data Train dengan " + str(nfeat) + " fitur:"
              + str(abs(round(scores['train_mae'].mean(), 2))))
        print("R2 model Random Forest Regression data Train dengan " + str(nfeat) + " fitur:"
              + str(abs((round(scores['train_r2'].mean(), 2)))))    
        print("----------------------------")
        print("MSE model Random Forest Regression data Test dengan " + str(nfeat) + " fitur:" 
          + str(abs(round(mean_squared_error(y_test, y_pred_rfg), 2))))
        print("RMSE model Random Forest Regression data Test dengan " + str(nfeat) + " fitur:" 
          + str(abs(round(mean_squared_error(y_test, y_pred_rfg, squared = False), 2))))
        print("MAE model Random Forest Regression data Test dengan " + str(nfeat) + " fitur:" 
          + str(abs(round(mean_absolute_error(y_test, y_pred_rfg), 2))))
        print("R2 model Random Forest Regression data Test dengan " + str(nfeat) + " fitur:" 
          + str(abs(round(r2_score(y_test, y_pred_rfg), 2))))
        print(" ")
        print("==================================================")

MSE model Linear Regression data Train dengan 40 fitur: 37.91
RMSE model Linear Regression data Train dengan 40 fitur:6.15
MAE model Linear Regression data Train dengan 40 fitur:4.71
R2 model Linear Regression data Train dengan 40 fitur:0.8
----------------------------
MSE model Linear Regression data Test dengan 40 fitur:186.1
RMSE model Linear Regression data Test dengan 40 fitur:13.64
MAE model Linear Regression data Test dengan 40 fitur:10.82
R2 model Linear Regression data Test dengan 40 fitur:0.08
 
CPU times: user 4.86 s, sys: 17.5 ms, total: 4.88 s
Wall time: 4.87 s
