Referensi: https://www.analyticsvidhya.com/blog/2021/04/forward-feature-selection-and-its-implementation/

In [5]:
# # Install the mlxtend library (uncomment the next line to run it)
# %pip install mlxtend

In [6]:
# Import Library
import time

import pandas as pd
from mlxtend.feature_selection import SequentialFeatureSelector
from mlxtend.feature_selection import SequentialFeatureSelector as sfs
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.model_selection import train_test_split, KFold, cross_validate

In [7]:
# Load the dataset from disk
df = pd.read_csv('/home/jupyter-17523142/Rama/Dataset/dataset_rama.csv')

# Predictors: every column except the six listed here
non_feature_cols = ['No', 'Mango Cultivars', 'Vit C (mg/100g)', 'TA (mg/100g)', 'SSC (oBrix)', 'label']
X = df.drop(columns=non_feature_cols)

# Regression target
y_vitc = df['Vit C (mg/100g)']

# Split into train and test parts: 30% of the rows go to the test part, with a fixed seed
x_train, x_test, y_train, y_test = train_test_split(X, y_vitc, random_state=0, test_size=0.3)

## Seleksi Fitur Forward Selection dengan Linear Regression

In [8]:
# Build the forward sequential feature selector around a Linear Regression model.
# The selector class is referenced by its un-aliased name so that binding the
# instance to `sfs` does not depend on `sfs` still being the class: the cell
# can be re-run without restarting the kernel.
lr_model = LinearRegression()
sfs = SequentialFeatureSelector(
    lr_model,
    k_features=20,
    forward=True,
    verbose=2,
    scoring='neg_root_mean_squared_error',
    n_jobs=-1,
)

In [5]:
%%time 

# Run the feature selection on the training split; the value returned by
# fit() is bound back to `sfs`, which later cells read the selected features from.
sfs = sfs.fit(x_train, y_train)

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 80 concurrent workers.
[Parallel(n_jobs=-1)]: Done   2 tasks      | elapsed:    5.1s
[Parallel(n_jobs=-1)]: Done 205 tasks      | elapsed:    6.8s
[Parallel(n_jobs=-1)]: Done 488 tasks      | elapsed:    7.4s
[Parallel(n_jobs=-1)]: Done 853 tasks      | elapsed:    8.3s
[Parallel(n_jobs=-1)]: Done 1298 tasks      | elapsed:    9.4s
[Parallel(n_jobs=-1)]: Done 1557 out of 1557 | elapsed:    9.7s finished

[2022-07-13 06:19:19] Features: 1/20 -- score: -13.787313513773904[Parallel(n_jobs=-1)]: Using backend LokyBackend with 80 concurrent workers.
[Parallel(n_jobs=-1)]: Done   2 tasks      | elapsed:    0.3s
[Parallel(n_jobs=-1)]: Done 250 tasks      | elapsed:    1.0s
[Parallel(n_jobs=-1)]: Done 816 tasks      | elapsed:    1.9s
[Parallel(n_jobs=-1)]: Done 1556 out of 1556 | elapsed:    2.7s finished

[2022-07-13 06:19:22] Features: 2/20 -- score: -12.378759342766646[Parallel(n_jobs=-1)]: Using backend LokyBackend with 80 concurrent w

CPU times: user 1min 14s, sys: 4.91 s, total: 1min 19s
Wall time: 1min 21s


[Parallel(n_jobs=-1)]: Done 1538 out of 1538 | elapsed:    4.6s finished

[2022-07-13 06:20:31] Features: 20/20 -- score: -10.325347366652931

In [6]:
# Names of the features picked by the selector, as a plain list
feat_names = [*sfs.k_feature_names_]

# Header line followed by the list itself, each on its own line
print("Fitur yang diambil adalah: ", feat_names, sep="\n")

# Number of selected features (shown as the cell result)
len(feat_names)

Fitur yang diambil adalah: 
['1043.4', '1045.9', '1047.1', '1051', '1052.2', '1052.7', '1054', '1070.5', '1085.3', '1085.7', '1087.1', '1087.6', '1089.8', '1094.4', '1096.3', '1422.2', '1427.7', '1446.8', '1654.6', '1732']


20

# Linear Regression

In [7]:
# 10-fold cross-validation splitter, shuffled with a fixed seed
cv = KFold(
    n_splits=10,
    shuffle=True,
    random_state=1,
)

In [8]:
%%time

#tentukan metode scoring yang digunakan
metrics = {'rmse': 'neg_root_mean_squared_error',
               'mse': 'neg_mean_squared_error',
               'mae': 'neg_mean_absolute_error',
               'r2': 'r2'} 

n_feat = [20]

for nfeat in n_feat:
        print("==================================================")
        
        #ambil n fitur input hasil seleksi fitur SFS
        x_train_selected = x_train[feat_names].iloc[:,0:nfeat]
        x_test_selected = x_test[feat_names].iloc[:,0:nfeat]

        #Create a Linear Regression
        lr_model = LinearRegression()
        
        #Train the model using the training sets
        lr_model.fit(x_train_selected, y_train)
        y_pred_lr = lr_model.predict(x_test_selected)

        #hitung score model dari data train
        scores = cross_validate(lr_model, x_train_selected, y_train, scoring=metrics, cv=cv, return_train_score=True)

        print("MSE model Linear Regression data Train dengan " + str(nfeat) + " fitur: " 
              + str(abs(round(scores['train_mse'].mean(), 2))))
        print("RMSE model Linear Regression data Train dengan " + str(nfeat) + " fitur:"
              + str(abs(round(scores['train_rmse'].mean(), 2))))
        print("MAE model Linear Regression data Train dengan " + str(nfeat) + " fitur:"
              + str(abs(round(scores['train_mae'].mean(), 2))))
        print("R2 model Linear Regression data Train dengan " + str(nfeat) + " fitur:"
              + str(abs((round(scores['train_r2'].mean(), 2)))))    
        print("----------------------------")        
        print("MSE model Linear Regression data Test dengan " + str(nfeat) + " fitur:" 
          + str(abs(round(mean_squared_error(y_test, y_pred_lr), 2))))
        print("RMSE model Linear Regression data Test dengan " + str(nfeat) + " fitur:" 
          + str(abs(round(mean_squared_error(y_test, y_pred_lr, squared = False), 2))))
        print("MAE model Linear Regression data Test dengan " + str(nfeat) + " fitur:" 
          + str(abs(round(mean_absolute_error(y_test, y_pred_lr), 2))))
        print("R2 model Linear Regression data Test dengan " + str(nfeat) + " fitur:" 
          + str(abs(round(r2_score(y_test, y_pred_lr), 2))))
        print(" ")
        print("==================================================")

MSE model Linear Regression data Train dengan 20 fitur: 84.69
RMSE model Linear Regression data Train dengan 20 fitur:9.2
MAE model Linear Regression data Train dengan 20 fitur:7.18
R2 model Linear Regression data Train dengan 20 fitur:0.56
----------------------------
MSE model Linear Regression data Test dengan 20 fitur:164.64
RMSE model Linear Regression data Test dengan 20 fitur:12.83
MAE model Linear Regression data Test dengan 20 fitur:10.89
R2 model Linear Regression data Test dengan 20 fitur:0.05
 
CPU times: user 5.69 s, sys: 137 ms, total: 5.83 s
Wall time: 255 ms


# Prediksi Random Forest Regression

## 100 Trees

In [12]:
%%time

#tentukan metode scoring yang digunakan
metrics = {'rmse': 'neg_root_mean_squared_error',
               'mse': 'neg_mean_squared_error',
               'mae': 'neg_mean_absolute_error',
               'r2': 'r2'} 

n_feat = [20]
n_trees = [100]

for nfeat in n_feat:
    for ntrees in n_trees: 
        print("==================================================")
        
        #ambil n fitur input hasil seleksi fitur SFS
        x_train_selected = x_train[feat_names].iloc[:,0:nfeat]
        x_test_selected = x_test[feat_names].iloc[:,0:nfeat]

        #Create a Random Forest Regression
        rfg_model = RandomForestRegressor(n_estimators=ntrees, random_state=100)
        
        #Train the model using the training sets
        rfg_model.fit(x_train_selected, y_train)
        y_pred_rfg = rfg_model.predict(x_test_selected)

        #hitung score model dari data train
        scores = cross_validate(rfg_model, x_train_selected, y_train, scoring=metrics, cv=cv, return_train_score=True)

        print("MSE model Random Forest Regression data Train dengan " + str(nfeat) + " fitur: " 
              + str(abs(round(scores['train_mse'].mean(), 2))))
        print("RMSE model Random Forest Regression data Train dengan " + str(nfeat) + " fitur:"
              + str(abs(round(scores['train_rmse'].mean(), 2))))
        print("MAE model Random Forest Regression data Train dengan " + str(nfeat) + " fitur:"
              + str(abs(round(scores['train_mae'].mean(), 2))))
        print("R2 model Random Forest Regression data Train dengan " + str(nfeat) + " fitur:"
              + str(abs((round(scores['train_r2'].mean(), 2)))))    
        print("----------------------------")
        print("MSE model Random Forest Regression data Test dengan " + str(nfeat) + " fitur:" 
          + str(abs(round(mean_squared_error(y_test, y_pred_rfg), 2))))
        print("RMSE model Random Forest Regression data Test dengan " + str(nfeat) + " fitur:" 
          + str(abs(round(mean_squared_error(y_test, y_pred_rfg, squared = False), 2))))
        print("MAE model Random Forest Regression data Test dengan " + str(nfeat) + " fitur:" 
          + str(abs(round(mean_absolute_error(y_test, y_pred_rfg), 2))))
        print("R2 model Random Forest Regression data Test dengan " + str(nfeat) + " fitur:" 
          + str(abs(round(r2_score(y_test, y_pred_rfg), 2))))
        print(" ")
        print("==================================================")

MSE model Linear Regression data Train dengan 20 fitur: 38.03
RMSE model Linear Regression data Train dengan 20 fitur:6.16
MAE model Linear Regression data Train dengan 20 fitur:4.7
R2 model Linear Regression data Train dengan 20 fitur:0.8
----------------------------
MSE model Linear Regression data Test dengan 20 fitur:187.95
RMSE model Linear Regression data Test dengan 20 fitur:13.71
MAE model Linear Regression data Test dengan 20 fitur:10.91
R2 model Linear Regression data Test dengan 20 fitur:0.09
 
CPU times: user 1.84 s, sys: 9.81 ms, total: 1.85 s
Wall time: 1.85 s


## 150 Trees

In [13]:
%%time

#tentukan metode scoring yang digunakan
metrics = {'rmse': 'neg_root_mean_squared_error',
               'mse': 'neg_mean_squared_error',
               'mae': 'neg_mean_absolute_error',
               'r2': 'r2'} 

n_feat = [20]
n_trees = [150]

for nfeat in n_feat:
    for ntrees in n_trees: 
        print("==================================================")
        
        #ambil n fitur input hasil seleksi fitur SFS
        x_train_selected = x_train[feat_names].iloc[:,0:nfeat]
        x_test_selected = x_test[feat_names].iloc[:,0:nfeat]

        #Create a Random Forest Regression
        rfg_model = RandomForestRegressor(n_estimators=ntrees, random_state=100)
        
        #Train the model using the training sets
        rfg_model.fit(x_train_selected, y_train)
        y_pred_rfg = rfg_model.predict(x_test_selected)

        #hitung score model dari data train
        scores = cross_validate(rfg_model, x_train_selected, y_train, scoring=metrics, cv=cv, return_train_score=True)

        print("MSE model Random Forest Regression data Train dengan " + str(nfeat) + " fitur: " 
              + str(abs(round(scores['train_mse'].mean(), 2))))
        print("RMSE model Random Forest Regression data Train dengan " + str(nfeat) + " fitur:"
              + str(abs(round(scores['train_rmse'].mean(), 2))))
        print("MAE model Random Forest Regression data Train dengan " + str(nfeat) + " fitur:"
              + str(abs(round(scores['train_mae'].mean(), 2))))
        print("R2 model Random Forest Regression data Train dengan " + str(nfeat) + " fitur:"
              + str(abs((round(scores['train_r2'].mean(), 2)))))    
        print("----------------------------")
        print("MSE model Random Forest Regression data Test dengan " + str(nfeat) + " fitur:" 
          + str(abs(round(mean_squared_error(y_test, y_pred_rfg), 2))))
        print("RMSE model Random Forest Regression data Test dengan " + str(nfeat) + " fitur:" 
          + str(abs(round(mean_squared_error(y_test, y_pred_rfg, squared = False), 2))))
        print("MAE model Random Forest Regression data Test dengan " + str(nfeat) + " fitur:" 
          + str(abs(round(mean_absolute_error(y_test, y_pred_rfg), 2))))
        print("R2 model Random Forest Regression data Test dengan " + str(nfeat) + " fitur:" 
          + str(abs(round(r2_score(y_test, y_pred_rfg), 2))))
        print(" ")
        print("==================================================")

MSE model Linear Regression data Train dengan 20 fitur: 37.91
RMSE model Linear Regression data Train dengan 20 fitur:6.15
MAE model Linear Regression data Train dengan 20 fitur:4.71
R2 model Linear Regression data Train dengan 20 fitur:0.8
----------------------------
MSE model Linear Regression data Test dengan 20 fitur:185.37
RMSE model Linear Regression data Test dengan 20 fitur:13.62
MAE model Linear Regression data Test dengan 20 fitur:10.81
R2 model Linear Regression data Test dengan 20 fitur:0.07
 
CPU times: user 2.71 s, sys: 4.94 ms, total: 2.72 s
Wall time: 2.71 s


## 200 Trees

In [14]:
%%time

#tentukan metode scoring yang digunakan
metrics = {'rmse': 'neg_root_mean_squared_error',
               'mse': 'neg_mean_squared_error',
               'mae': 'neg_mean_absolute_error',
               'r2': 'r2'} 

n_feat = [20]
n_trees = [200]

for nfeat in n_feat:
    for ntrees in n_trees: 
        print("==================================================")
        
        #ambil n fitur input hasil seleksi fitur SFS
        x_train_selected = x_train[feat_names].iloc[:,0:nfeat]
        x_test_selected = x_test[feat_names].iloc[:,0:nfeat]

        #Create a Random Forest Regression
        rfg_model = RandomForestRegressor(n_estimators=ntrees, random_state=100)
        
        #Train the model using the training sets
        rfg_model.fit(x_train_selected, y_train)
        y_pred_rfg = rfg_model.predict(x_test_selected)

        #hitung score model dari data train
        scores = cross_validate(rfg_model, x_train_selected, y_train, scoring=metrics, cv=cv, return_train_score=True)

        print("MSE model Random Forest Regression data Train dengan " + str(nfeat) + " fitur: " 
              + str(abs(round(scores['train_mse'].mean(), 2))))
        print("RMSE model Random Forest Regression data Train dengan " + str(nfeat) + " fitur:"
              + str(abs(round(scores['train_rmse'].mean(), 2))))
        print("MAE model Random Forest Regression data Train dengan " + str(nfeat) + " fitur:"
              + str(abs(round(scores['train_mae'].mean(), 2))))
        print("R2 model Random Forest Regression data Train dengan " + str(nfeat) + " fitur:"
              + str(abs((round(scores['train_r2'].mean(), 2)))))    
        print("----------------------------")
        print("MSE model Random Forest Regression data Test dengan " + str(nfeat) + " fitur:" 
          + str(abs(round(mean_squared_error(y_test, y_pred_rfg), 2))))
        print("RMSE model Random Forest Regression data Test dengan " + str(nfeat) + " fitur:" 
          + str(abs(round(mean_squared_error(y_test, y_pred_rfg, squared = False), 2))))
        print("MAE model Random Forest Regression data Test dengan " + str(nfeat) + " fitur:" 
          + str(abs(round(mean_absolute_error(y_test, y_pred_rfg), 2))))
        print("R2 model Linear Regression data Test dengan " + str(nfeat) + " fitur:" 
          + str(abs(round(r2_score(y_test, y_pred_rfg), 2))))
        print(" ")
        print("==================================================")

MSE model Linear Regression data Train dengan 20 fitur: 37.79
RMSE model Linear Regression data Train dengan 20 fitur:6.14
MAE model Linear Regression data Train dengan 20 fitur:4.7
R2 model Linear Regression data Train dengan 20 fitur:0.8
----------------------------
MSE model Linear Regression data Test dengan 20 fitur:184.91
RMSE model Linear Regression data Test dengan 20 fitur:13.6
MAE model Linear Regression data Test dengan 20 fitur:10.83
R2 model Linear Regression data Test dengan 20 fitur:0.07
 
CPU times: user 3.57 s, sys: 44.1 ms, total: 3.61 s
Wall time: 3.61 s
