referensi https://machinelearningmastery.com/rfe-feature-selection-in-python/

The Recursive Feature Elimination (RFE) method works by recursively removing attributes and building a model on those attributes that remain. 

It uses accuracy metric to rank the feature according to their importance. 

The RFE method takes the model to be used and the number of required features as input. It then gives the ranking of all the variables, 1 being most important. 

It also gives its support, True being relevant feature and False being irrelevant feature.

In [1]:
#libraries buat seleksi fitur
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, KFold, cross_val_score, cross_validate, GridSearchCV
from sklearn.linear_model import LinearRegression
from sklearn.feature_selection import RFE
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.ensemble import RandomForestRegressor

In [2]:
#import dataset
df = pd.read_csv('/home/jupyter-17523142/Rama/Dataset/dataset_rama.csv')

#ambil x dan y
x = pd.DataFrame(df.drop(labels=['No', 'Mango Cultivars', 'Vit C (mg/100g)', 'TA (mg/100g)', 'SSC (oBrix)', 'label'], axis=1))
y = pd.DataFrame(df['Vit C (mg/100g)'])

# separate dataset into train and test
x_train, x_test, y_train, y_test = train_test_split(
    df.drop(labels=['No', 'Mango Cultivars', 'Vit C (mg/100g)', 'TA (mg/100g)', 'SSC (oBrix)', 'label'], axis=1),
    df['Vit C (mg/100g)'],
    test_size=0.3,
    random_state=0)

x_train.head()

Unnamed: 0,999.9,1000.3,1000.7,1001.1,1001.4,1001.8,1002.2,1002.6,1003,1003.4,...,2478.7,2481.1,2483.5,2485.8,2488.2,2490.6,2493,2495.4,2497.8,2500.2
16,0.471459,0.471074,0.470934,0.470379,0.47026,0.46988,0.469497,0.469435,0.469454,0.468998,...,1.413537,1.41574,1.417568,1.419698,1.421711,1.42307,1.424394,1.426121,1.427552,1.428625
51,0.433239,0.432622,0.432626,0.432379,0.43162,0.43071,0.430836,0.430847,0.430188,0.42947,...,1.601232,1.602877,1.604524,1.605982,1.606778,1.607837,1.608756,1.609967,1.6109,1.611099
183,0.545045,0.544204,0.543792,0.543596,0.543338,0.542534,0.541493,0.541139,0.541308,0.540831,...,1.524657,1.525973,1.527454,1.529518,1.530097,1.530315,1.530254,1.531191,1.532366,1.533183
145,0.545846,0.544815,0.544524,0.544631,0.544169,0.543143,0.542535,0.54208,0.541842,0.541258,...,1.421962,1.422955,1.423717,1.424639,1.42508,1.425797,1.426503,1.427164,1.427838,1.428271
40,0.381048,0.380483,0.380541,0.380151,0.379599,0.379189,0.379009,0.378722,0.378309,0.377719,...,1.571125,1.572674,1.574303,1.576075,1.577273,1.57798,1.578561,1.579334,1.580042,1.581424


# Seleksi Fitur RFE

In [3]:
%%time 

cols = list(x_train.columns)
model = LinearRegression()

#Initializing RFE model
rfe = RFE(estimator=model, n_features_to_select=60)

#Transforming data using RFE
X_rfe = rfe.fit_transform(x_train, y_train)

#Fitting the data to model
model.fit(X_rfe,y_train)              
temp = pd.Series(rfe.support_,index = cols)
selected_features_rfe = temp[temp==True].index
print(selected_features_rfe)

Index(['1393.2', '1393.9', '1400.7', '1431.7', '1435.6', '1436.4', '1447.6',
       '1449.3', '1504.8', '1509.2', '1535.1', '1539.6', '1562.8', '1565.7',
       '1567.6', '1572.3', '1580', '1581.9', '1586.7', '1590.6', '1592.6',
       '1597.5', '1742.4', '1775.8', '1786.9', '1791.8', '1798', '1810.6',
       '1811.8', '1834.9', '1838.8', '1841.4', '1848', '1857.3', '1863.9',
       '1868', '1881.5', '1895.3', '1905', '1907.8', '1936.3', '1995.9',
       '2070.9', '2089.2', '2092.6', '2116.5', '2121.7', '2123.5', '2148.1',
       '2155.2', '2157', '2173.3', '2195.4', '2199.1', '2302.6', '2304.7',
       '2325.3', '2333.7', '2352.8', '2354.9'],
      dtype='object')
CPU times: user 7min 37s, sys: 10.9 s, total: 7min 48s
Wall time: 19.6 s


# Prediksi Linear Regression

In [4]:
#cross validation 10-fold
cv = KFold(n_splits=10, random_state=1, shuffle=True)

In [5]:
%%time

#tentukan metode scoring yang digunakan
metrics = {'rmse': 'neg_root_mean_squared_error',
               'mse': 'neg_mean_squared_error',
               'mae': 'neg_mean_absolute_error',
               'r2': 'r2'} 

n_feat = [60]

for nfeat in n_feat:
        print("==================================================")
        
        #ambil n fitur input hasil seleksi fitur RFE
        x_train_selected = x_train[selected_features_rfe].iloc[:,0:nfeat]
        x_test_selected = x_test[selected_features_rfe].iloc[:,0:nfeat]

        #Create a Linear Regression
        lr_model = LinearRegression()
        
        #Train the model using the training sets
        lr_model.fit(x_train_selected, y_train)
        y_pred_lr = lr_model.predict(x_test_selected)

        #hitung score model dari data train
        scores = cross_validate(lr_model, x_train_selected, y_train, scoring=metrics, cv=cv, return_train_score=True)

        print("MSE model Linear Regression data Train dengan " + str(nfeat) + " fitur: " 
              + str(abs(round(scores['train_mse'].mean(), 2))))
        print("RMSE model Linear Regression data Train dengan " + str(nfeat) + " fitur:"
              + str(abs(round(scores['train_rmse'].mean(), 2))))
        print("MAE model Linear Regression data Train dengan " + str(nfeat) + " fitur:"
              + str(abs(round(scores['train_mae'].mean(), 2))))
        print("R2 model Linear Regression data Train dengan " + str(nfeat) + " fitur:"
              + str(abs((round(scores['train_r2'].mean(), 2)))))    
        print("----------------------------")
        print("MSE model Linear Regression data Test dengan " + str(nfeat) + " fitur:" 
          + str(abs(round(mean_squared_error(y_test, y_pred_lr), 2))))
        print("RMSE model Linear Regression data Test dengan " + str(nfeat) + " fitur:" 
          + str(abs(round(mean_squared_error(y_test, y_pred_lr, squared = False), 2))))
        print("MAE model Linear Regression data Test dengan " + str(nfeat) + " fitur:" 
          + str(abs(round(mean_absolute_error(y_test, y_pred_lr), 2))))
        print("R2 model Linear Regression data Test dengan " + str(nfeat) + " fitur:" 
          + str(abs(round(r2_score(y_test, y_pred_lr), 2))))
        print(" ")
        print("==================================================")

MSE model Linear Regression data Train dengan 60 fitur: 41.72
RMSE model Linear Regression data Train dengan 60 fitur:6.46
MAE model Linear Regression data Train dengan 60 fitur:5.04
R2 model Linear Regression data Train dengan 60 fitur:0.78
----------------------------
MSE model Linear Regression data Test dengan 60 fitur:4061.25
RMSE model Linear Regression data Test dengan 60 fitur:63.73
MAE model Linear Regression data Test dengan 60 fitur:26.18
R2 model Linear Regression data Test dengan 60 fitur:22.55
 
CPU times: user 6.31 s, sys: 200 ms, total: 6.51 s
Wall time: 286 ms


# Prediksi Random Forest Regressor

## 100 Trees

In [6]:
%%time

#tentukan metode scoring yang digunakan
metrics = {'rmse': 'neg_root_mean_squared_error',
               'mse': 'neg_mean_squared_error',
               'mae': 'neg_mean_absolute_error',
               'r2': 'r2'} 

n_feat = [60]
n_trees = [100]

for nfeat in n_feat:
    for ntrees in n_trees: 
        print("==================================================")
        
        #ambil n fitur input hasil seleksi fitur RFE
        x_train_selected = x_train[selected_features_rfe].iloc[:,0:nfeat]
        x_test_selected = x_test[selected_features_rfe].iloc[:,0:nfeat]

        #Create a Random Forest Regression
        rfg_model = RandomForestRegressor(n_estimators=100, random_state=100)
        
        #Train the model using the training sets
        rfg_model.fit(x_train_selected, y_train)
        y_pred_rfg = rfg_model.predict(x_test_selected)

        #hitung score model dari data train
        scores = cross_validate(rfg_model, x_train_selected, y_train, scoring=metrics, cv=cv, return_train_score=True)

        print("MSE model Random Forest Regression data Train dengan " + str(nfeat) + " fitur: " 
              + str(abs(round(scores['train_mse'].mean(), 2))))
        print("RMSE model Random Forest Regression data Train dengan " + str(nfeat) + " fitur:"
              + str(abs(round(scores['train_rmse'].mean(), 2))))
        print("MAE model Random Forest Regression data Train dengan " + str(nfeat) + " fitur:"
              + str(abs(round(scores['train_mae'].mean(), 2))))
        print("R2 model Random Forest Regression data Train dengan " + str(nfeat) + " fitur:"
              + str(abs((round(scores['train_r2'].mean(), 2)))))    
        print("----------------------------")
        print("MSE model Random Forest Regression data Test dengan " + str(nfeat) + " fitur:" 
          + str(abs(round(mean_squared_error(y_test, y_pred_rfg), 2))))
        print("RMSE model Random Forest Regression data Test dengan " + str(nfeat) + " fitur:" 
          + str(abs(round(mean_squared_error(y_test, y_pred_rfg, squared = False), 2))))
        print("MAE model Random Forest Regression data Test dengan " + str(nfeat) + " fitur:" 
          + str(abs(round(mean_absolute_error(y_test, y_pred_rfg), 2))))
        print("R2 model Random Forest Regression data Test dengan " + str(nfeat) + " fitur:" 
          + str(abs(round(r2_score(y_test, y_pred_rfg), 2))))
        print(" ")
        print("==================================================")

MSE model Linear Regression data Train dengan 60 fitur: 41.87
RMSE model Linear Regression data Train dengan 60 fitur:6.47
MAE model Linear Regression data Train dengan 60 fitur:4.94
R2 model Linear Regression data Train dengan 60 fitur:0.78
----------------------------
MSE model Linear Regression data Test dengan 60 fitur:195.49
RMSE model Linear Regression data Test dengan 60 fitur:13.98
MAE model Linear Regression data Test dengan 60 fitur:11.53
R2 model Linear Regression data Test dengan 60 fitur:0.13
 
CPU times: user 5.5 s, sys: 120 ms, total: 5.61 s
Wall time: 2.81 s
Parser   : 101 ms


## 150 Trees

In [7]:
%%time

#tentukan metode scoring yang digunakan
metrics = {'rmse': 'neg_root_mean_squared_error',
               'mse': 'neg_mean_squared_error',
               'mae': 'neg_mean_absolute_error',
               'r2': 'r2'} 

n_feat = [60]
n_trees = [150]

for nfeat in n_feat:
    for ntrees in n_trees: 
        print("==================================================")
        
        #ambil n fitur input hasil seleksi fitur RFE
        x_train_selected = x_train[selected_features_rfe].iloc[:,0:nfeat]
        x_test_selected = x_test[selected_features_rfe].iloc[:,0:nfeat]

        #Create a Random Forest Regression
        rfg_model = RandomForestRegressor(n_estimators=150, random_state=100)
        
        #Train the model using the training sets
        rfg_model.fit(x_train_selected, y_train)
        y_pred_rfg = rfg_model.predict(x_test_selected)

        #hitung score model dari data train
        scores = cross_validate(rfg_model, x_train_selected, y_train, scoring=metrics, cv=cv, return_train_score=True)

        print("MSE model Random Forest Regression data Train dengan " + str(nfeat) + " fitur: " 
              + str(abs(round(scores['train_mse'].mean(), 2))))
        print("RMSE model Random Forest Regression data Train dengan " + str(nfeat) + " fitur:"
              + str(abs(round(scores['train_rmse'].mean(), 2))))
        print("MAE model Random Forest Regression data Train dengan " + str(nfeat) + " fitur:"
              + str(abs(round(scores['train_mae'].mean(), 2))))
        print("R2 model Random Forest Regression data Train dengan " + str(nfeat) + " fitur:"
              + str(abs((round(scores['train_r2'].mean(), 2)))))    
        print("----------------------------")
        print("MSE model Random Forest Regression data Test dengan " + str(nfeat) + " fitur:" 
          + str(abs(round(mean_squared_error(y_test, y_pred_rfg), 2))))
        print("RMSE model Random Forest Regression data Test dengan " + str(nfeat) + " fitur:" 
          + str(abs(round(mean_squared_error(y_test, y_pred_rfg, squared = False), 2))))
        print("MAE model Random Forest Regression data Test dengan " + str(nfeat) + " fitur:" 
          + str(abs(round(mean_absolute_error(y_test, y_pred_rfg), 2))))
        print("R2 model Random Forest Regression data Test dengan " + str(nfeat) + " fitur:" 
          + str(abs(round(r2_score(y_test, y_pred_rfg), 2))))
        print(" ")
        print("==================================================")

MSE model Linear Regression data Train dengan 60 fitur: 41.58
RMSE model Linear Regression data Train dengan 60 fitur:6.44
MAE model Linear Regression data Train dengan 60 fitur:4.94
R2 model Linear Regression data Train dengan 60 fitur:0.78
----------------------------
MSE model Linear Regression data Test dengan 60 fitur:198.63
RMSE model Linear Regression data Test dengan 60 fitur:14.09
MAE model Linear Regression data Test dengan 60 fitur:11.61
R2 model Linear Regression data Test dengan 60 fitur:0.15
 
CPU times: user 3.81 s, sys: 30.4 ms, total: 3.84 s
Wall time: 3.83 s


## 200 Trees

In [8]:
%%time

#tentukan metode scoring yang digunakan
metrics = {'rmse': 'neg_root_mean_squared_error',
               'mse': 'neg_mean_squared_error',
               'mae': 'neg_mean_absolute_error',
               'r2': 'r2'} 

n_feat = [60]
n_trees = [200]

for nfeat in n_feat:
    for ntrees in n_trees: 
        print("==================================================")
        
        #ambil n fitur input hasil seleksi fitur RFE
        x_train_selected = x_train[selected_features_rfe].iloc[:,0:nfeat]
        x_test_selected = x_test[selected_features_rfe].iloc[:,0:nfeat]

        #Create a Random Forest Regression
        rfg_model = RandomForestRegressor(n_estimators=200, random_state=100)
        
        #Train the model using the training sets
        rfg_model.fit(x_train_selected, y_train)
        y_pred_rfg = rfg_model.predict(x_test_selected)

        #hitung score model dari data train
        scores = cross_validate(rfg_model, x_train_selected, y_train, scoring=metrics, cv=cv, return_train_score=True)

        print("MSE model Random Forest Regression data Train dengan " + str(nfeat) + " fitur: " 
              + str(abs(round(scores['train_mse'].mean(), 2))))
        print("RMSE model Random Forest Regression data Train dengan " + str(nfeat) + " fitur:"
              + str(abs(round(scores['train_rmse'].mean(), 2))))
        print("MAE model Random Forest Regression data Train dengan " + str(nfeat) + " fitur:"
              + str(abs(round(scores['train_mae'].mean(), 2))))
        print("R2 model Random Forest Regression data Train dengan " + str(nfeat) + " fitur:"
              + str(abs((round(scores['train_r2'].mean(), 2)))))    
        print("----------------------------")
        print("MSE model Random Forest Regression data Test dengan " + str(nfeat) + " fitur:" 
          + str(abs(round(mean_squared_error(y_test, y_pred_rfg), 2))))
        print("RMSE modelRandom Forest Regression data Test dengan " + str(nfeat) + " fitur:" 
          + str(abs(round(mean_squared_error(y_test, y_pred_rfg, squared = False), 2))))
        print("MAE model Random Forest Regression data Test dengan " + str(nfeat) + " fitur:" 
          + str(abs(round(mean_absolute_error(y_test, y_pred_rfg), 2))))
        print("R2 model Random Forest Regression data Test dengan " + str(nfeat) + " fitur:" 
          + str(abs(round(r2_score(y_test, y_pred_rfg), 2))))
        print(" ")
        print("==================================================")

MSE model Linear Regression data Train dengan 60 fitur: 41.4
RMSE model Linear Regression data Train dengan 60 fitur:6.43
MAE model Linear Regression data Train dengan 60 fitur:4.92
R2 model Linear Regression data Train dengan 60 fitur:0.78
----------------------------
MSE model Linear Regression data Test dengan 60 fitur:197.76
RMSE model Linear Regression data Test dengan 60 fitur:14.06
MAE model Linear Regression data Test dengan 60 fitur:11.57
R2 model Linear Regression data Test dengan 60 fitur:0.15
 
CPU times: user 5.11 s, sys: 26.2 ms, total: 5.13 s
Wall time: 5.13 s
