Reference: https://towardsdatascience.com/feature-selection-with-pandas-e3690ad8504b

The Recursive Feature Elimination (RFE) method works by recursively removing attributes and building a model on those attributes that remain. 

It uses an accuracy metric to rank the features according to their importance. 

The RFE method takes the model to be used and the number of required features as input. It then gives the ranking of all the variables, 1 being most important. 

It also gives the support mask for the features: True marks a relevant (selected) feature and False marks an irrelevant one.

In [1]:
#libraries buat seleksi fitur
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.feature_selection import RFE

#libraries buat klasifikasi
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score, cross_validate
from sklearn import metrics
from sklearn import svm
from sklearn.ensemble import RandomForestClassifier

In [2]:
# Load the dataset from CSV
df = pd.read_csv('nirsMangga.csv')

# Every column except these is used as a model input; 'label' is the prediction target
non_feature_cols = ['No', 'Mango Cultivars', 'Vit C (mg/100g)', 'TA (mg/100g)', 'SSC (oBrix)', 'label']
features = df.drop(columns=non_feature_cols)
target = df['label']

# Hold out 30% of the rows as the test set; the fixed seed keeps the split reproducible
x_train, x_test, y_train, y_test = train_test_split(
    features,
    target,
    test_size=0.3,
    random_state=0,
)

x_train.head()

Unnamed: 0,999.9,1000.3,1000.7,1001.1,1001.4,1001.8,1002.2,1002.6,1003,1003.4,...,2478.7,2481.1,2483.5,2485.8,2488.2,2490.6,2493,2495.4,2497.8,2500.2
16,0.471459,0.471074,0.470934,0.470379,0.47026,0.46988,0.469497,0.469435,0.469454,0.468998,...,1.413537,1.41574,1.417568,1.419698,1.421711,1.42307,1.424394,1.426121,1.427552,1.428625
51,0.433239,0.432622,0.432626,0.432379,0.43162,0.43071,0.430836,0.430847,0.430188,0.42947,...,1.601232,1.602877,1.604524,1.605982,1.606778,1.607837,1.608756,1.609967,1.6109,1.611099
183,0.545045,0.544204,0.543792,0.543596,0.543338,0.542534,0.541493,0.541139,0.541308,0.540831,...,1.524657,1.525973,1.527454,1.529518,1.530097,1.530315,1.530254,1.531191,1.532366,1.533183
145,0.545846,0.544815,0.544524,0.544631,0.544169,0.543143,0.542535,0.54208,0.541842,0.541258,...,1.421962,1.422955,1.423717,1.424639,1.42508,1.425797,1.426503,1.427164,1.427838,1.428271
40,0.381048,0.380483,0.380541,0.380151,0.379599,0.379189,0.379009,0.378722,0.378309,0.377719,...,1.571125,1.572674,1.574303,1.576075,1.577273,1.57798,1.578561,1.579334,1.580042,1.581424


# Seleksi Fitur RFE

In [3]:
%%time 

cols = list(x_train.columns)
model = RandomForestClassifier()

#Initializing RFE model
rfe = RFE(estimator=model, n_features_to_select=80)

#Transforming data using RFE
X_rfe = rfe.fit_transform(x_train, y_train)

#Fitting the data to model
model.fit(X_rfe,y_train)              
temp = pd.Series(rfe.support_,index = cols)
selected_features_rfe = temp[temp==True].index
print(selected_features_rfe)

Index(['1090.3', '1091.7', '1092.1', '1092.6', '1095.4', '1096.8', '1098.2',
       '1100', '1106.6', '1109', '1118.5', '1126.3', '1126.8', '1127.8',
       '1130.7', '1131.2', '1132.2', '1132.7', '1133.2', '1133.7', '1135.7',
       '1136.7', '1137.2', '1138.7', '1139.2', '1139.7', '1142.7', '1144.2',
       '1151.8', '1153.9', '1208.7', '1235.8', '1248.9', '1258', '1265.4',
       '1285.4', '1289.3', '1295.1', '1298.3', '1318.8', '1334.4', '1387.2',
       '1659.9', '1670.6', '1695.7', '1712.5', '1872', '1876.1', '1878.8',
       '1880.2', '1882.9', '1888.4', '1900.8', '1905', '1919.1', '1924.8',
       '1936.3', '1974.7', '1980.7', '1991.3', '2033.5', '2035.1', '2043.1',
       '2044.7', '2048', '2054.5', '2075.8', '2101.1', '2104.5', '2123.5',
       '2142.8', '2144.5', '2158.8', '2166', '2176.9', '2193.5', '2202.8',
       '2210.3', '2231.3', '2246.7'],
      dtype='object')
CPU times: user 3min 50s, sys: 1.52 s, total: 3min 51s
Wall time: 3min 51s


In [4]:
# Rank assigned by RFE to every input column (selected features get rank 1)
feature_ranking = rfe.ranking_
print(feature_ranking)

[1478 1476 1474 ... 1403 1407 1408]


# Klasifikasi Random Forest Classifier

In [5]:
# Shared 10-fold cross-validation splitter: rows are shuffled once with a fixed seed,
# so every classifier evaluated with `cv` sees the same folds.
cv = KFold(n_splits=10, shuffle=True, random_state=1)

## Testing Performa Model Random Forest Classifier

In [6]:
%%time 


#tentukan metode scoring yang digunakan
scoring_rfe = {'acc': 'accuracy',
               'prec_micro': 'precision_micro',
               'rec_micro': 'recall_micro'}

#tentukan total fitur dan trees yang digunakan dalam proses klasifikasi ini
# n_feat = [10, 20, 30]
n_feat = [80]
n_trees = [100]

for nfeat in n_feat:
    for ntrees in n_trees:        
        #ambil n fitur input hasil seleksi fitur RFE 
        x_train_selected = x_train[selected_features_rfe]
        x_test_selected = x_test[selected_features_rfe]

        #Create a Gaussian Classifier
        clf_rfe = RandomForestClassifier(n_estimators=ntrees)
        
        #Train the model using the training sets
        clf_rfe.fit(x_train_selected, y_train)
        y_pred_rfe=clf_rfe.predict(x_test_selected)
        
        #hitung score model dari data train
        scores_rfe = cross_validate(clf_rfe, x_train_selected, y_train, scoring=scoring_rfe, cv=cv, return_train_score=True)
        
        print("Akurasi model RFE data Train dengan " + str(nfeat) + " fitur dan " + str(ntrees) + " trees:" 
              + str(round(scores_rfe['train_acc'].mean(), 2)))
        print("Akurasi model RFE data Test dengan " + str(nfeat) + " fitur dan " + str(ntrees) + " trees:" 
              + str(round(metrics.accuracy_score(y_test, y_pred_rfe), 2)))
        print("Precision model RFE data Train dengan " + str(nfeat) + " fitur dan " + str(ntrees) + " trees:" 
              + str(round(scores_rfe['train_prec_micro'].mean(), 2)))
        print("Precision model RFE data Test dengan " + str(nfeat) + " fitur dan " + str(ntrees) + " trees:" 
              + str(round(metrics.precision_score(y_test, y_pred_rfe, average='micro'), 2)))
        print("Recall model RFE data Train dengan " + str(nfeat) + " fitur dan " + str(ntrees) + " trees:" 
              + str(round(scores_rfe['train_rec_micro'].mean(), 2)))
        print("Recall model RFE data Test dengan " + str(nfeat) + " fitur dan " + str(ntrees) + " trees:" 
              + str(round(metrics.recall_score(y_test, y_pred_rfe, average='micro'), 2)))
        print(" ")

Akurasi model RFE data Train dengan 80 fitur dan 100 trees:1.0
Akurasi model RFE data Test dengan 80 fitur dan 100 trees:0.93
Precision model RFE data Train dengan 80 fitur dan 100 trees:1.0
Precision model RFE data Test dengan 80 fitur dan 100 trees:0.93
Recall model RFE data Train dengan 80 fitur dan 100 trees:1.0
Recall model RFE data Test dengan 80 fitur dan 100 trees:0.93
 
CPU times: user 1.28 s, sys: 39.7 ms, total: 1.32 s
Wall time: 1.32 s


In [7]:
%%time 


#tentukan metode scoring yang digunakan
scoring_rfe = {'acc': 'accuracy',
               'prec_micro': 'precision_micro',
               'rec_micro': 'recall_micro'}

#tentukan total fitur dan trees yang digunakan dalam proses klasifikasi ini
# n_feat = [10, 20, 30]
n_feat = [80]
n_trees = [150]

for nfeat in n_feat:
    for ntrees in n_trees:        
        #ambil n fitur input hasil seleksi fitur RFE 
        x_train_selected = x_train[selected_features_rfe]
        x_test_selected = x_test[selected_features_rfe]

        #Create a Gaussian Classifier
        clf_rfe = RandomForestClassifier(n_estimators=ntrees)
        
        #Train the model using the training sets
        clf_rfe.fit(x_train_selected, y_train)
        y_pred_rfe=clf_rfe.predict(x_test_selected)
        
        #hitung score model dari data train
        scores_rfe = cross_validate(clf_rfe, x_train_selected, y_train, scoring=scoring_rfe, cv=cv, return_train_score=True)
        
        print("Akurasi model RFE data Train dengan " + str(nfeat) + " fitur dan " + str(ntrees) + " trees:" 
              + str(round(scores_rfe['train_acc'].mean(), 2)))
        print("Akurasi model RFE data Test dengan " + str(nfeat) + " fitur dan " + str(ntrees) + " trees:" 
              + str(round(metrics.accuracy_score(y_test, y_pred_rfe), 2)))
        print("Precision model RFE data Train dengan " + str(nfeat) + " fitur dan " + str(ntrees) + " trees:" 
              + str(round(scores_rfe['train_prec_micro'].mean(), 2)))
        print("Precision model RFE data Test dengan " + str(nfeat) + " fitur dan " + str(ntrees) + " trees:" 
              + str(round(metrics.precision_score(y_test, y_pred_rfe, average='micro'), 2)))
        print("Recall model RFE data Train dengan " + str(nfeat) + " fitur dan " + str(ntrees) + " trees:" 
              + str(round(scores_rfe['train_rec_micro'].mean(), 2)))
        print("Recall model RFE data Test dengan " + str(nfeat) + " fitur dan " + str(ntrees) + " trees:" 
              + str(round(metrics.recall_score(y_test, y_pred_rfe, average='micro'), 2)))
        print(" ")

Akurasi model RFE data Train dengan 80 fitur dan 150 trees:1.0
Akurasi model RFE data Test dengan 80 fitur dan 150 trees:0.93
Precision model RFE data Train dengan 80 fitur dan 150 trees:1.0
Precision model RFE data Test dengan 80 fitur dan 150 trees:0.93
Recall model RFE data Train dengan 80 fitur dan 150 trees:1.0
Recall model RFE data Test dengan 80 fitur dan 150 trees:0.93
 
CPU times: user 1.91 s, sys: 27.9 ms, total: 1.93 s
Wall time: 1.93 s


In [8]:
%%time 


#tentukan metode scoring yang digunakan
scoring_rfe = {'acc': 'accuracy',
               'prec_micro': 'precision_micro',
               'rec_micro': 'recall_micro'}

#tentukan total fitur dan trees yang digunakan dalam proses klasifikasi ini
# n_feat = [10, 20, 30]
n_feat = [80]
n_trees = [200]

for nfeat in n_feat:
    for ntrees in n_trees:        
        #ambil n fitur input hasil seleksi fitur RFE 
        x_train_selected = x_train[selected_features_rfe]
        x_test_selected = x_test[selected_features_rfe]

        #Create a Gaussian Classifier
        clf_rfe = RandomForestClassifier(n_estimators=ntrees)
        
        #Train the model using the training sets
        clf_rfe.fit(x_train_selected, y_train)
        y_pred_rfe=clf_rfe.predict(x_test_selected)
        
        #hitung score model dari data train
        scores_rfe = cross_validate(clf_rfe, x_train_selected, y_train, scoring=scoring_rfe, cv=cv, return_train_score=True)
        
        print("Akurasi model RFE data Train dengan " + str(nfeat) + " fitur dan " + str(ntrees) + " trees:" 
              + str(round(scores_rfe['train_acc'].mean(), 2)))
        print("Akurasi model RFE data Test dengan " + str(nfeat) + " fitur dan " + str(ntrees) + " trees:" 
              + str(round(metrics.accuracy_score(y_test, y_pred_rfe), 2)))
        print("Precision model RFE data Train dengan " + str(nfeat) + " fitur dan " + str(ntrees) + " trees:" 
              + str(round(scores_rfe['train_prec_micro'].mean(), 2)))
        print("Precision model RFE data Test dengan " + str(nfeat) + " fitur dan " + str(ntrees) + " trees:" 
              + str(round(metrics.precision_score(y_test, y_pred_rfe, average='micro'), 2)))
        print("Recall model RFE data Train dengan " + str(nfeat) + " fitur dan " + str(ntrees) + " trees:" 
              + str(round(scores_rfe['train_rec_micro'].mean(), 2)))
        print("Recall model RFE data Test dengan " + str(nfeat) + " fitur dan " + str(ntrees) + " trees:" 
              + str(round(metrics.recall_score(y_test, y_pred_rfe, average='micro'), 2)))
        print(" ")

Akurasi model RFE data Train dengan 80 fitur dan 200 trees:1.0
Akurasi model RFE data Test dengan 80 fitur dan 200 trees:0.93
Precision model RFE data Train dengan 80 fitur dan 200 trees:1.0
Precision model RFE data Test dengan 80 fitur dan 200 trees:0.93
Recall model RFE data Train dengan 80 fitur dan 200 trees:1.0
Recall model RFE data Test dengan 80 fitur dan 200 trees:0.93
 
CPU times: user 2.51 s, sys: 31.9 ms, total: 2.54 s
Wall time: 2.54 s


# Klasifikasi Support Vector Machine

Train set, test set, dan skema cross-validation (`cv`) yang dipakai di bagian ini sama dengan yang digunakan pada bagian Random Forest Classifier.

## Testing Performa Model Support Vector Machine

In [9]:
%%time 


#tentukan metode scoring yang digunakan
scoring_svm = {'acc': 'accuracy',
               'prec_micro': 'precision_micro',
               'rec_micro': 'recall_micro'}

#tentukan total fitur yang digunakan dalam proses klasifikasi SVM ini
# n_feat = [10, 20, 30]
n_feat = [80]
for nfeat in n_feat:     
    #ambil n fitur input hasil seleksi fitur RFE 
    x_train_selected = x_train[selected_features_rfe]
    x_test_selected = x_test[selected_features_rfe]

    #Create a Support Vector Classifier
    clf_svm = svm.SVC()

    #Train the model using the training sets
    clf_svm.fit(x_train_selected, y_train)
    y_pred_svm = clf_svm.predict(x_test_selected)

    #hitung score model dari data train
    scores_svm = cross_validate(clf_svm, x_train_selected, y_train, scoring=scoring_svm, cv=cv, return_train_score=True)

    print("akurasi model SVM data Train dengan " + str(nfeat) + " fitur: " 
          + str(round(scores_svm['train_acc'].mean(), 2)))
    print("akurasi model SVM data Test dengan " + str(nfeat) + " fitur: " 
          + str(round(metrics.accuracy_score(y_test, y_pred_svm), 2)))
    print("Precision model SVM data Train dengan " + str(nfeat) + " fitur:"
          + str(round(scores_svm['train_prec_micro'].mean(), 2)))
    print("Precision model SVM data Test dengan " + str(nfeat) + " fitur:"
          + str(round(metrics.precision_score(y_test, y_pred_svm, average='micro'), 2)))
    print("Recall model SVM data Train dengan " + str(nfeat) + " fitur:"
          + str(round(scores_svm['train_rec_micro'].mean(), 2)))
    print("Recall model SVM data Test dengan " + str(nfeat) + " fitur:"
          + str(round(metrics.recall_score(y_test, y_pred_svm, average='micro'), 2)))
    print(" ")

akurasi model SVM data Train dengan 80 fitur: 0.55
akurasi model SVM data Test dengan 80 fitur: 0.46
Precision model SVM data Train dengan 80 fitur:0.55
Precision model SVM data Test dengan 80 fitur:0.46
Recall model SVM data Train dengan 80 fitur:0.55
Recall model SVM data Test dengan 80 fitur:0.46
 
CPU times: user 95.2 ms, sys: 11.8 ms, total: 107 ms
Wall time: 104 ms
