<a href="https://colab.research.google.com/github/gani88/ML-DataScience/blob/main/Dicoding_ML_StudyCase_FeatureEngineering_1_3.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Feature Engineering Case Study

In [None]:
import numpy as np
import pandas as pd
from sklearn.datasets import load_iris
from sklearn.feature_selection import SelectKBest, chi2
from sklearn.feature_selection import RFE
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.datasets import load_wine

In [None]:
# Load the Wine dataset that ships with scikit-learn
data = load_wine()
X = data.data    # feature matrix
y = data.target  # class labels

In [None]:
# Build a DataFrame view of the features, with the label appended as the last column
df = pd.DataFrame(X, columns=data.feature_names).assign(target=y)
df

Unnamed: 0,alcohol,malic_acid,ash,alcalinity_of_ash,magnesium,total_phenols,flavanoids,nonflavanoid_phenols,proanthocyanins,color_intensity,hue,od280/od315_of_diluted_wines,proline,target
0,14.23,1.71,2.43,15.6,127.0,2.80,3.06,0.28,2.29,5.64,1.04,3.92,1065.0,0
1,13.20,1.78,2.14,11.2,100.0,2.65,2.76,0.26,1.28,4.38,1.05,3.40,1050.0,0
2,13.16,2.36,2.67,18.6,101.0,2.80,3.24,0.30,2.81,5.68,1.03,3.17,1185.0,0
3,14.37,1.95,2.50,16.8,113.0,3.85,3.49,0.24,2.18,7.80,0.86,3.45,1480.0,0
4,13.24,2.59,2.87,21.0,118.0,2.80,2.69,0.39,1.82,4.32,1.04,2.93,735.0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
173,13.71,5.65,2.45,20.5,95.0,1.68,0.61,0.52,1.06,7.70,0.64,1.74,740.0,2
174,13.40,3.91,2.48,23.0,102.0,1.80,0.75,0.43,1.41,7.30,0.70,1.56,750.0,2
175,13.27,4.28,2.26,20.0,120.0,1.59,0.69,0.43,1.35,10.20,0.59,1.56,835.0,2
176,13.17,2.59,2.37,20.0,120.0,1.65,0.68,0.53,1.46,9.30,0.60,1.62,840.0,2


In [None]:
# Hold out 20% of the samples for testing; the fixed seed keeps the split reproducible
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

## Filter Methods

Filter methods are techniques used to evaluate the relevance of features independently of the machine learning model being used. These methods rely on statistical measures to select features without involving the model, making them faster and more efficient, especially for large datasets.

For example:

- Correlation measures the strength of the relationship between each feature and the target label.
- Chi-square test evaluates the independence between categorical features and the target label.

Another approach is mutual information, which scores the dependency between features and the target. Additionally, filter methods can assess variance thresholds, selecting features with variance above a specified threshold and eliminating those with lower information content.

In [None]:
# Filter method: univariate feature selection with SelectKBest

# Score every feature against the label with the chi-squared statistic and
# keep the 2 best; the scores are learned from the training split only
filter_selector = SelectKBest(score_func=chi2, k=2)
filter_selector.fit(X_train, y_train)

# Take the same selected columns from both splits
X_train_filter = filter_selector.transform(X_train)
X_test_filter = filter_selector.transform(X_test)

print("Using Filter Methods : ", filter_selector.get_support(indices=True))

Using Filter Methods :  [ 9 12]


In [None]:
# Wrapper method: Recursive Feature Elimination (RFE)

# RFE is driven by a logistic regression estimator and stops when
# 2 features remain
model = LogisticRegression(solver='lbfgs', max_iter=5000)
rfe_selector = RFE(estimator=model, n_features_to_select=2)

# Fit the selector on the training split, then reuse it to reduce the test split
X_train_rfe = rfe_selector.fit_transform(X_train, y_train)
X_test_rfe = rfe_selector.transform(X_test)

print("Feature from this method : ", rfe_selector.get_support(indices=True))

Feature from this method :  [0 6]


In [None]:
# Embedded method: feature selection driven by a fitted model

# Fit a random forest on all features so it can report per-feature importances
rf_model = RandomForestClassifier(n_estimators=100, random_state=42)
rf_model.fit(X_train, y_train)

# Per-feature importance scores from the fitted forest
importances = rf_model.feature_importances_
# Feature indices ranked from most to least important. np.argsort sorts in
# ascending order, so the result is reversed with [::-1] (the previous [::1]
# slice was a no-op that left the ranking ascending).
indices = np.argsort(importances)[::-1]

# Minimum importance a feature needs in order to be kept
threshold = 0.05
important_features_indices = [
    i for i, importance in enumerate(importances) if importance >= threshold
]

# Keep only the selected columns, in their original column order, for both splits
X_important = X_train[:, important_features_indices]
X_test_important = X_test[:, important_features_indices]

# Print the selected features together with their importance score
print("The feature that important : ")
for i in important_features_indices:
    print(f"{data.feature_names[i]}: {importances[i]}")

print("\nTraining Data Dimension with important feature: ", X_important.shape)
print("\nTesting Data Dimension with important feature: ", X_test_important.shape)

The feature that important : 
alcohol: 0.11239773542143086
flavanoids: 0.20229341635663622
color_intensity: 0.1712021830864957
hue: 0.07089132259413944
od280/od315_of_diluted_wines: 0.1115643167260497
proline: 0.13904586955351153

Training Data Dimension with important feature:  (142, 6)

Testing Data Dimension with important feature:  (36, 6)


In [None]:
def evaluate_model(X_train, X_test, y_train, y_test, model):
    """Fit ``model`` on the training data and return its score on the test data.

    Args:
        X_train: Training features, passed to ``model.fit``.
        X_test: Test features, passed to ``model.score``.
        y_train: Training labels, passed to ``model.fit``.
        y_test: Test labels, passed to ``model.score``.
        model: Object exposing ``fit(X, y)`` and ``score(X, y)``. It is
            fitted in place, so the caller's instance is modified.

    Returns:
        Whatever ``model.score(X_test, y_test)`` returns.
    """
    model.fit(X_train, y_train)
    return model.score(X_test, y_test)

In [None]:
# Logistic Regression model for the Filter Methods feature set
logistic_model_filter = LogisticRegression(max_iter=200)
accuracy_filter = evaluate_model(X_train_filter, X_test_filter, y_train, y_test, logistic_model_filter)

# Logistic Regression model for the Wrapper Methods feature set
logistic_model_rfe = LogisticRegression(max_iter=200)
accuracy_rfe = evaluate_model(X_train_rfe, X_test_rfe, y_train, y_test, logistic_model_rfe)

# Random Forest model for the Embedded Methods feature set.
# evaluate_model() fits the estimator it is given, so a fresh forest with the
# same hyperparameters as rf_model is used here. Passing rf_model itself would
# refit it on the reduced columns and leave it out of sync with the
# `importances` that were read from it earlier.
rf_model_embedded = RandomForestClassifier(n_estimators=100, random_state=42)
accuracy_rf = evaluate_model(X_important, X_test_important, y_train, y_test, rf_model_embedded)

In [None]:
# Report the test-set accuracy of each feature-selection strategy, one per line
print(
    f"\nAkurasi Model dengan Filter Methods: {accuracy_filter:.2f}",
    f"Akurasi Model dengan Wrapper Methods: {accuracy_rfe:.2f}",
    f"Akurasi Model dengan Embedded Methods: {accuracy_rf:.2f}",
    sep="\n",
)


Akurasi Model dengan Filter Methods: 0.89
Akurasi Model dengan Wrapper Methods: 0.94
Akurasi Model dengan Embedded Methods: 1.00
