In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from xgboost import XGBRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.feature_selection import SelectKBest, mutual_info_regression
from mlxtend.feature_selection import SequentialFeatureSelector as SFS
import joblib
import os

# Fungsi untuk evaluasi model
def evaluate_model(y_true, y_pred, model_name):
    mse = mean_squared_error(y_true, y_pred)
    mae = mean_absolute_error(y_true, y_pred)
    rmse = np.sqrt(mse)
    r2 = r2_score(y_true, y_pred)
    print(f"\n{model_name} - Metrics")
    print(f"MSE: {mse:.2f}")
    print(f"MAE: {mae:.2f}")
    print(f"RMSE: {rmse:.2f}")
    print(f"R² Score: {r2:.4f}")
    return mse, mae, rmse, r2

# Fungsi untuk menghitung confidence scores
def get_confidence_scores(model, X_test, model_type):
    if model_type == "DecisionTree":
        n_bootstraps = 100
        predictions = []
        for _ in range(n_bootstraps):
            idx = np.random.choice(X_test.shape[0], size=X_test.shape[0], replace=True)
            X_boot = X_test[idx]
            predictions.append(model.predict(X_boot))
        predictions = np.array(predictions)
        mean_pred = predictions.mean(axis=0)
        std_pred = predictions.std(axis=0)
        confidence_scores = 1 / (1 + std_pred / (mean_pred + 1e-10))
    elif model_type == "RandomForest":
        tree_predictions = np.array([tree.predict(X_test) for tree in model.estimators_])
        mean_pred = tree_predictions.mean(axis=0)
        std_pred = tree_predictions.std(axis=0)
        confidence_scores = 1 / (1 + std_pred / (mean_pred + 1e-10))
    elif model_type == "XGBoost":
        n_samples = 100
        predictions = []
        for _ in range(n_samples):
            noise = np.random.normal(0, 0.01, X_test.shape)
            X_noisy = X_test + noise
            predictions.append(model.predict(X_noisy))
        predictions = np.array(predictions)
        mean_pred = predictions.mean(axis=0)
        std_pred = predictions.std(axis=0)
        confidence_scores = 1 / (1 + std_pred / (mean_pred + 1e-10))
    return confidence_scores

# Buat folder models jika belum ada
if not os.path.exists('models'):
    os.makedirs('models')

# 1. Muat dataset
data = pd.read_csv('Flight Price Prediction Dataset.csv')

# 2. Preprocessing
if 'Unnamed: 0' in data.columns:
    data = data.drop(columns=['Unnamed: 0'])

if data.isnull().sum().sum() > 0:
    print("Data memiliki nilai null, mengisi dengan modus untuk kategorikal dan rata-rata untuk numerik...")
    for col in data.columns:
        if data[col].dtype == 'object':
            data[col] = data[col].fillna(data[col].mode()[0])
        else:
            data[col] = data[col].fillna(data[col].mean())

X = data.drop(columns=['price'])
y = data['price']

# Simpan nilai unik untuk dropdown dashboard
unique_values = {}
categorical_columns = ['airline', 'flight', 'source_city', 'departure_time', 'stops', 
                      'arrival_time', 'destination_city', 'class']
for col in categorical_columns:
    unique_values[col] = list(X[col].unique())
joblib.dump(unique_values, 'models/unique_values.joblib')

airline_flight_map = X.groupby('airline')['flight'].unique().to_dict()
joblib.dump(airline_flight_map, 'models/airline_flight_map.joblib')

numeric_columns = ['duration', 'days_left']

# Buat preprocessor
preprocessor = ColumnTransformer(
    transformers=[
        ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_columns),
        ('num', StandardScaler(), numeric_columns)
    ])

joblib.dump(preprocessor, 'models/preprocessor.joblib')

# Pisahkan data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
joblib.dump((X_train, X_test, y_train, y_test), 'models/processed_data.joblib')

# Simpan scaler untuk numerik
scaler = preprocessor.named_transformers_['num']
joblib.dump(scaler, 'models/scaler.joblib')

# 3. Inisialisasi hasil dan model
results = []
model_dict = {}
confidence_scores = {}

# 4. Pelatihan model tanpa feature selection
print("\n=== Non-Feature Selection ===")

# Decision Tree
dt_pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('model', DecisionTreeRegressor(random_state=42))
])
dt_pipeline.fit(X_train, y_train)
y_pred_dt = dt_pipeline.predict(X_test)
mse_dt, mae_dt, rmse_dt, r2_dt = evaluate_model(y_test, y_pred_dt, "Decision Tree")
results.append({"Model": "Decision Tree", "Feature Selection": "None", "MSE": mse_dt, "MAE": mae_dt, "RMSE": rmse_dt, "R²": r2_dt})
model_dict["DecisionTree_None"] = dt_pipeline
confidence_scores["DecisionTree_None"] = get_confidence_scores(dt_pipeline.named_steps['model'], preprocessor.transform(X_test), "DecisionTree")
joblib.dump(dt_pipeline, 'models/DecisionTree_None.joblib')
pd.DataFrame({'confidence': confidence_scores["DecisionTree_None"]}).to_csv('models/DecisionTree_None_confidence.csv', index=False)

# Random Forest
rf_pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('model', RandomForestRegressor(n_estimators=100, random_state=42))
])
rf_pipeline.fit(X_train, y_train)
y_pred_rf = rf_pipeline.predict(X_test)
mse_rf, mae_rf, rmse_rf, r2_rf = evaluate_model(y_test, y_pred_rf, "Random Forest")
results.append({"Model": "Random Forest", "Feature Selection": "None", "MSE": mse_rf, "MAE": mae_rf, "RMSE": rmse_rf, "R²": r2_rf})
model_dict["RandomForest_None"] = rf_pipeline
confidence_scores["RandomForest_None"] = get_confidence_scores(rf_pipeline.named_steps['model'], preprocessor.transform(X_test), "RandomForest")
joblib.dump(rf_pipeline, 'models/RandomForest_None.joblib')
pd.DataFrame({'confidence': confidence_scores["RandomForest_None"]}).to_csv('models/RandomForest_None_confidence.csv', index=False)

# XGBoost
xgb_pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('model', XGBRegressor(random_state=42, objective='reg:squarederror'))
])
xgb_pipeline.fit(X_train, y_train)
y_pred_xgb = xgb_pipeline.predict(X_test)
mse_xgb, mae_xgb, rmse_xgb, r2_xgb = evaluate_model(y_test, y_pred_xgb, "XGBoost")
results.append({"Model": "XGBoost", "Feature Selection": "None", "MSE": mse_xgb, "MAE": mae_xgb, "RMSE": rmse_xgb, "R²": r2_xgb})
model_dict["XGBoost_None"] = xgb_pipeline
confidence_scores["XGBoost_None"] = get_confidence_scores(xgb_pipeline.named_steps['model'], preprocessor.transform(X_test), "XGBoost")
joblib.dump(xgb_pipeline, 'models/XGBoost_None.joblib')
pd.DataFrame({'confidence': confidence_scores["XGBoost_None"]}).to_csv('models/XGBoost_None_confidence.csv', index=False)

# 5. Feature Selection: Mutual Information
print("\n=== Feature Selection: Mutual Information ===")
# Buat pipeline dengan feature selection
mi_pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('selector', SelectKBest(score_func=mutual_info_regression, k=5))
])

# Fit pipeline untuk mendapatkan fitur terpilih
mi_pipeline.fit(X_train, y_train)
selected_features_idx = mi_pipeline.named_steps['selector'].get_support()
# Dapatkan nama fitur setelah preprocessing
feature_names = (mi_pipeline.named_steps['preprocessor']
                 .named_transformers_['cat']
                 .get_feature_names_out(categorical_columns)
                 .tolist() + numeric_columns)
selected_features_mi = [feature_names[i] for i, selected in enumerate(selected_features_idx) if selected]
print("Selected features (MI):", selected_features_mi)
joblib.dump(mi_pipeline.named_steps['selector'], 'models/selector_mi.joblib')
joblib.dump(selected_features_mi, 'models/selected_features_mi.joblib')

# Decision Tree (MI)
dt_mi_pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('selector', SelectKBest(score_func=mutual_info_regression, k=5)),
    ('model', DecisionTreeRegressor(random_state=42))
])
dt_mi_pipeline.fit(X_train, y_train)
y_pred_dt_mi = dt_mi_pipeline.predict(X_test)
mse_dt_mi, mae_dt_mi, rmse_dt_mi, r2_dt_mi = evaluate_model(y_test, y_pred_dt_mi, "Decision Tree (MI)")
results.append({"Model": "Decision Tree", "Feature Selection": "MI", "MSE": mse_dt_mi, "MAE": mae_dt_mi, "RMSE": rmse_dt_mi, "R²": r2_dt_mi})
model_dict["DecisionTree_MI"] = dt_mi_pipeline
confidence_scores["DecisionTree_MI"] = get_confidence_scores(dt_mi_pipeline.named_steps['model'], 
                                                            dt_mi_pipeline.named_steps['selector'].transform(preprocessor.transform(X_test)), 
                                                            "DecisionTree")
joblib.dump(dt_mi_pipeline, 'models/DecisionTree_MI.joblib')
pd.DataFrame({'confidence': confidence_scores["DecisionTree_MI"]}).to_csv('models/DecisionTree_MI_confidence.csv', index=False)

# Random Forest (MI)
rf_mi_pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('selector', SelectKBest(score_func=mutual_info_regression, k=5)),
    ('model', RandomForestRegressor(n_estimators=100, random_state=42))
])
rf_mi_pipeline.fit(X_train, y_train)
y_pred_rf_mi = rf_mi_pipeline.predict(X_test)
mse_rf_mi, mae_rf_mi, rmse_rf_mi, r2_rf_mi = evaluate_model(y_test, y_pred_rf_mi, "Random Forest (MI)")
results.append({"Model": "Random Forest", "Feature Selection": "MI", "MSE": mse_rf_mi, "MAE": mae_rf_mi, "RMSE": rmse_rf_mi, "R²": r2_rf_mi})
model_dict["RandomForest_MI"] = rf_mi_pipeline
confidence_scores["RandomForest_MI"] = get_confidence_scores(rf_mi_pipeline.named_steps['model'], 
                                                            rf_mi_pipeline.named_steps['selector'].transform(preprocessor.transform(X_test)), 
                                                            "RandomForest")
joblib.dump(rf_mi_pipeline, 'models/RandomForest_MI.joblib')
pd.DataFrame({'confidence': confidence_scores["RandomForest_MI"]}).to_csv('models/RandomForest_MI_confidence.csv', index=False)

# XGBoost (MI)
xgb_mi_pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('selector', SelectKBest(score_func=mutual_info_regression, k=5)),
    ('model', XGBRegressor(random_state=42, objective='reg:squarederror'))
])
xgb_mi_pipeline.fit(X_train, y_train)
y_pred_xgb_mi = xgb_mi_pipeline.predict(X_test)
mse_xgb_mi, mae_xgb_mi, rmse_xgb_mi, r2_xgb_mi = evaluate_model(y_test, y_pred_xgb_mi, "XGBoost (MI)")
results.append({"Model": "XGBoost", "Feature Selection": "MI", "MSE": mse_xgb_mi, "MAE": mae_xgb_mi, "RMSE": rmse_xgb_mi, "R²": r2_xgb_mi})
model_dict["XGBoost_MI"] = xgb_mi_pipeline
confidence_scores["XGBoost_MI"] = get_confidence_scores(xgb_mi_pipeline.named_steps['model'], 
                                                        xgb_mi_pipeline.named_steps['selector'].transform(preprocessor.transform(X_test)), 
                                                        "XGBoost")
joblib.dump(xgb_mi_pipeline, 'models/XGBoost_MI.joblib')
pd.DataFrame({'confidence': confidence_scores["XGBoost_MI"]}).to_csv('models/XGBoost_MI_confidence.csv', index=False)

# 6. Feature Selection: Sequential Forward Selection (SFS)
print("\n=== Feature Selection: SFS ===")
# SFS tidak bisa langsung digunakan dalam pipeline karena membutuhkan data yang sudah di-preprocess
# Preprocess data terlebih dahulu
X_train_preprocessed = preprocessor.fit_transform(X_train)
X_test_preprocessed = preprocessor.transform(X_test)
feature_names = (preprocessor.named_transformers_['cat'].get_feature_names_out(categorical_columns).tolist() + numeric_columns)

# Jalankan SFS
sfs = SFS(RandomForestRegressor(n_estimators=10, random_state=42), 
          k_features=5, forward=True, scoring='r2', cv=5, n_jobs=-1)
sfs.fit(X_train_preprocessed, y_train)
selected_features_idx = list(sfs.k_feature_idx_)
selected_features_sfs = [feature_names[i] for i in selected_features_idx]
print("Selected features (SFS):", selected_features_sfs)
joblib.dump(sfs, 'models/sfs.joblib')
joblib.dump(selected_features_sfs, 'models/selected_features_sfs.joblib')

# Buat pipeline dengan SFS (menggunakan selector manual)
class SFSSelector:
    def __init__(self, selected_indices):
        self.selected_indices = selected_indices
    def fit(self, X, y=None):
        return self
    def transform(self, X):
        return X[:, self.selected_indices]

# Decision Tree (SFS)
dt_sfs_pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('selector', SFSSelector(selected_features_idx)),
    ('model', DecisionTreeRegressor(random_state=42))
])
dt_sfs_pipeline.fit(X_train, y_train)
y_pred_dt_sfs = dt_sfs_pipeline.predict(X_test)
mse_dt_sfs, mae_dt_sfs, rmse_dt_sfs, r2_dt_sfs = evaluate_model(y_test, y_pred_dt_sfs, "Decision Tree (SFS)")
results.append({"Model": "Decision Tree", "Feature Selection": "SFS", "MSE": mse_dt_sfs, "MAE": mae_dt_sfs, "RMSE": rmse_dt_sfs, "R²": r2_dt_sfs})
model_dict["DecisionTree_SFS"] = dt_sfs_pipeline
confidence_scores["DecisionTree_SFS"] = get_confidence_scores(dt_sfs_pipeline.named_steps['model'], 
                                                             dt_sfs_pipeline.named_steps['selector'].transform(preprocessor.transform(X_test)), 
                                                             "DecisionTree")
joblib.dump(dt_sfs_pipeline, 'models/DecisionTree_SFS.joblib')
pd.DataFrame({'confidence': confidence_scores["DecisionTree_SFS"]}).to_csv('models/DecisionTree_SFS_confidence.csv', index=False)

# Random Forest (SFS)
rf_sfs_pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('selector', SFSSelector(selected_features_idx)),
    ('model', RandomForestRegressor(n_estimators=100, random_state=42))
])
rf_sfs_pipeline.fit(X_train, y_train)
y_pred_rf_sfs = rf_sfs_pipeline.predict(X_test)
mse_rf_sfs, mae_rf_sfs, rmse_rf_sfs, r2_rf_sfs = evaluate_model(y_test, y_pred_rf_sfs, "Random Forest (SFS)")
results.append({"Model": "Random Forest", "Feature Selection": "SFS", "MSE": mse_rf_sfs, "MAE": mae_rf_sfs, "RMSE": rmse_rf_sfs, "R²": r2_rf_sfs})
model_dict["RandomForest_SFS"] = rf_sfs_pipeline
confidence_scores["RandomForest_SFS"] = get_confidence_scores(rf_sfs_pipeline.named_steps['model'], 
                                                             rf_sfs_pipeline.named_steps['selector'].transform(preprocessor.transform(X_test)), 
                                                             "RandomForest")
joblib.dump(rf_sfs_pipeline, 'models/RandomForest_SFS.joblib')
pd.DataFrame({'confidence': confidence_scores["RandomForest_SFS"]}).to_csv('models/RandomForest_SFS_confidence.csv', index=False)

# XGBoost (SFS)
xgb_sfs_pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('selector', SFSSelector(selected_features_idx)),
    ('model', XGBRegressor(random_state=42, objective='reg:squarederror'))
])
xgb_sfs_pipeline.fit(X_train, y_train)
y_pred_xgb_sfs = xgb_sfs_pipeline.predict(X_test)
mse_xgb_sfs, mae_xgb_sfs, rmse_xgb_sfs, r2_xgb_sfs = evaluate_model(y_test, y_pred_xgb_sfs, "XGBoost (SFS)")
results.append({"Model": "XGBoost", "Feature Selection": "SFS", "MSE": mse_xgb_sfs, "MAE": mae_xgb_sfs, "RMSE": rmse_xgb_sfs, "R²": r2_xgb_sfs})
model_dict["XGBoost_SFS"] = xgb_sfs_pipeline
confidence_scores["XGBoost_SFS"] = get_confidence_scores(xgb_sfs_pipeline.named_steps['model'], 
                                                        xgb_sfs_pipeline.named_steps['selector'].transform(preprocessor.transform(X_test)), 
                                                        "XGBoost")
joblib.dump(xgb_sfs_pipeline, 'models/XGBoost_SFS.joblib')
pd.DataFrame({'confidence': confidence_scores["XGBoost_SFS"]}).to_csv('models/XGBoost_SFS_confidence.csv', index=False)

# 7. Simpan hasil evaluasi
results_df = pd.DataFrame(results)
results_df.to_csv('models/model_results.csv', index=False)
print("\nHasil evaluasi disimpan ke 'models/model_results.csv'")

# Cetak perbandingan
print("\n=== Comparison Table ===")
print("\nNon-Feature Selection:")
print(results_df[results_df['Feature Selection'] == 'None'][['Model', 'MSE', 'MAE', 'RMSE', 'R²']])
print("\nMutual Information:")
print(results_df[results_df['Feature Selection'] == 'MI'][['Model', 'MSE', 'MAE', 'RMSE', 'R²']])
print("\nSFS:")
print(results_df[results_df['Feature Selection'] == 'SFS'][['Model', 'MSE', 'MAE', 'RMSE', 'R²']])

# Simpan confidence scores
joblib.dump(confidence_scores, 'models/confidence_scores.joblib')
print("Confidence scores disimpan ke 'models/confidence_scores.joblib'")

In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from xgboost import XGBRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.feature_selection import SelectKBest, mutual_info_regression
from mlxtend.feature_selection import SequentialFeatureSelector as SFS
import joblib
import os

# Fungsi untuk evaluasi model
def evaluate_model(y_true, y_pred, model_name):
    mse = mean_squared_error(y_true, y_pred)
    mae = mean_absolute_error(y_true, y_pred)
    rmse = np.sqrt(mse)
    r2 = r2_score(y_true, y_pred)
    print(f"\n{model_name} - Metrics")
    print(f"MSE: {mse:.2f}")
    print(f"MAE: {mae:.2f}")
    print(f"RMSE: {rmse:.2f}")
    print(f"R² Score: {r2:.4f}")
    return mse, mae, rmse, r2

# Fungsi untuk menghitung confidence scores
def get_confidence_scores(model, X_test, model_type):
    if model_type == "DecisionTree":
        n_bootstraps = 100
        predictions = []
        for _ in range(n_bootstraps):
            idx = np.random.choice(X_test.shape[0], size=X_test.shape[0], replace=True)
            X_boot = X_test[idx]
            predictions.append(model.predict(X_boot))
        predictions = np.array(predictions)
        mean_pred = predictions.mean(axis=0)
        std_pred = predictions.std(axis=0)
        confidence_scores = 1 / (1 + std_pred / (mean_pred + 1e-10))
    elif model_type == "RandomForest":
        tree_predictions = np.array([tree.predict(X_test) for tree in model.estimators_])
        mean_pred = tree_predictions.mean(axis=0)
        std_pred = tree_predictions.std(axis=0)
        confidence_scores = 1 / (1 + std_pred / (mean_pred + 1e-10))
    elif model_type == "XGBoost":
        n_samples = 100
        predictions = []
        for _ in range(n_samples):
            noise = np.random.normal(0, 0.01, X_test.shape)
            X_noisy = X_test + noise
            predictions.append(model.predict(X_noisy))
        predictions = np.array(predictions)
        mean_pred = predictions.mean(axis=0)
        std_pred = predictions.std(axis=0)
        confidence_scores = 1 / (1 + std_pred / (mean_pred + 1e-10))
    return confidence_scores

# Buat folder models jika belum ada
if not os.path.exists('models'):
    os.makedirs('models')

# 1. Muat dataset
data = pd.read_csv('Flight Price Prediction Dataset.csv')

# 2. Preprocessing
if 'Unnamed: 0' in data.columns:
    data = data.drop(columns=['Unnamed: 0'])

if data.isnull().sum().sum() > 0:
    print("Data memiliki nilai null, mengisi dengan modus untuk kategorikal dan rata-rata untuk numerik...")
    for col in data.columns:
        if data[col].dtype == 'object':
            data[col] = data[col].fillna(data[col].mode()[0])
        else:
            data[col] = data[col].fillna(data[col].mean())

X = data.drop(columns=['price'])
y = data['price']

# Simpan nilai unik untuk dropdown dashboard
unique_values = {}
categorical_columns = ['airline', 'flight', 'source_city', 'departure_time', 'stops', 
                      'arrival_time', 'destination_city', 'class']
for col in categorical_columns:
    unique_values[col] = list(X[col].unique())
joblib.dump(unique_values, 'models/unique_values.joblib')

airline_flight_map = X.groupby('airline')['flight'].unique().to_dict()
joblib.dump(airline_flight_map, 'models/airline_flight_map.joblib')

numeric_columns = ['duration', 'days_left']

# Buat preprocessor
preprocessor = ColumnTransformer(
    transformers=[
        ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_columns),
        ('num', StandardScaler(), numeric_columns)
    ])

joblib.dump(preprocessor, 'models/preprocessor.joblib')

# Pisahkan data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
joblib.dump((X_train, X_test, y_train, y_test), 'models/processed_data.joblib')

# Simpan scaler untuk numerik
# Simpan scaler untuk numerik
preprocessor.fit(X_train)
scaler = preprocessor.named_transformers_['num']  # Corrected line
joblib.dump(scaler, 'models/scaler.joblib')


# 3. Inisialisasi hasil dan model
results = []
model_dict = {}
confidence_scores = {}

# 4. Pelatihan model tanpa feature selection
print("\n=== Non-Feature Selection ===")

# Decision Tree
dt_pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('model', DecisionTreeRegressor(random_state=42))
])
dt_pipeline.fit(X_train, y_train)
y_pred_dt = dt_pipeline.predict(X_test)
mse_dt, mae_dt, rmse_dt, r2_dt = evaluate_model(y_test, y_pred_dt, "Decision Tree")
results.append({"Model": "Decision Tree", "Feature Selection": "None", "MSE": mse_dt, "MAE": mae_dt, "RMSE": rmse_dt, "R²": r2_dt})
model_dict["DecisionTree_None"] = dt_pipeline
confidence_scores["DecisionTree_None"] = get_confidence_scores(dt_pipeline.named_steps['model'], preprocessor.transform(X_test), "DecisionTree")
joblib.dump(dt_pipeline, 'models/DecisionTree_None.joblib')
pd.DataFrame({'confidence': confidence_scores["DecisionTree_None"]}).to_csv('models/DecisionTree_None_confidence.csv', index=False)

# Random Forest
rf_pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('model', RandomForestRegressor(n_estimators=100, random_state=42))
])
rf_pipeline.fit(X_train, y_train)
y_pred_rf = rf_pipeline.predict(X_test)
mse_rf, mae_rf, rmse_rf, r2_rf = evaluate_model(y_test, y_pred_rf, "Random Forest")
results.append({"Model": "Random Forest", "Feature Selection": "None", "MSE": mse_rf, "MAE": mae_rf, "RMSE": rmse_rf, "R²": r2_rf})
model_dict["RandomForest_None"] = rf_pipeline
confidence_scores["RandomForest_None"] = get_confidence_scores(rf_pipeline.named_steps['model'], preprocessor.transform(X_test), "RandomForest")
joblib.dump(rf_pipeline, 'models/RandomForest_None.joblib')
pd.DataFrame({'confidence': confidence_scores["RandomForest_None"]}).to_csv('models/RandomForest_None_confidence.csv', index=False)

# XGBoost
xgb_pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('model', XGBRegressor(random_state=42, objective='reg:squarederror'))
])
xgb_pipeline.fit(X_train, y_train)
y_pred_xgb = xgb_pipeline.predict(X_test)
mse_xgb, mae_xgb, rmse_xgb, r2_xgb = evaluate_model(y_test, y_pred_xgb, "XGBoost")
results.append({"Model": "XGBoost", "Feature Selection": "None", "MSE": mse_xgb, "MAE": mae_xgb, "RMSE": rmse_xgb, "R²": r2_xgb})
model_dict["XGBoost_None"] = xgb_pipeline
confidence_scores["XGBoost_None"] = get_confidence_scores(xgb_pipeline.named_steps['model'], preprocessor.transform(X_test), "XGBoost")
joblib.dump(xgb_pipeline, 'models/XGBoost_None.joblib')
pd.DataFrame({'confidence': confidence_scores["XGBoost_None"]}).to_csv('models/XGBoost_None_confidence.csv', index=False)

# 5. Feature Selection: Mutual Information
print("\n=== Feature Selection: Mutual Information ===")
# Buat pipeline dengan feature selection
mi_pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('selector', SelectKBest(score_func=mutual_info_regression, k=5))
])

# Fit pipeline untuk mendapatkan fitur terpilih
mi_pipeline.fit(X_train, y_train)
selected_features_idx = mi_pipeline.named_steps['selector'].get_support()
# Dapatkan nama fitur setelah preprocessing
feature_names = (mi_pipeline.named_steps['preprocessor']
                 .named_transformers_['cat']
                 .get_feature_names_out(categorical_columns)
                 .tolist() + numeric_columns)
selected_features_mi = [feature_names[i] for i, selected in enumerate(selected_features_idx) if selected]
print("Selected features (MI):", selected_features_mi)
joblib.dump(mi_pipeline.named_steps['selector'], 'models/selector_mi.joblib')
joblib.dump(selected_features_mi, 'models/selected_features_mi.joblib')

# Decision Tree (MI)
dt_mi_pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('selector', SelectKBest(score_func=mutual_info_regression, k=5)),
    ('model', DecisionTreeRegressor(random_state=42))
])
dt_mi_pipeline.fit(X_train, y_train)
y_pred_dt_mi = dt_mi_pipeline.predict(X_test)
mse_dt_mi, mae_dt_mi, rmse_dt_mi, r2_dt_mi = evaluate_model(y_test, y_pred_dt_mi, "Decision Tree (MI)")
results.append({"Model": "Decision Tree", "Feature Selection": "MI", "MSE": mse_dt_mi, "MAE": mae_dt_mi, "RMSE": rmse_dt_mi, "R²": r2_dt_mi})
model_dict["DecisionTree_MI"] = dt_mi_pipeline
confidence_scores["DecisionTree_MI"] = get_confidence_scores(dt_mi_pipeline.named_steps['model'], 
                                                            dt_mi_pipeline.named_steps['selector'].transform(preprocessor.transform(X_test)), 
                                                            "DecisionTree")
joblib.dump(dt_mi_pipeline, 'models/DecisionTree_MI.joblib')
pd.DataFrame({'confidence': confidence_scores["DecisionTree_MI"]}).to_csv('models/DecisionTree_MI_confidence.csv', index=False)

# Random Forest (MI)
rf_mi_pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('selector', SelectKBest(score_func=mutual_info_regression, k=5)),
    ('model', RandomForestRegressor(n_estimators=100, random_state=42))
])
rf_mi_pipeline.fit(X_train, y_train)
y_pred_rf_mi = rf_mi_pipeline.predict(X_test)
mse_rf_mi, mae_rf_mi, rmse_rf_mi, r2_rf_mi = evaluate_model(y_test, y_pred_rf_mi, "Random Forest (MI)")
results.append({"Model": "Random Forest", "Feature Selection": "MI", "MSE": mse_rf_mi, "MAE": mae_rf_mi, "RMSE": rmse_rf_mi, "R²": r2_rf_mi})
model_dict["RandomForest_MI"] = rf_mi_pipeline
confidence_scores["RandomForest_MI"] = get_confidence_scores(rf_mi_pipeline.named_steps['model'], 
                                                            rf_mi_pipeline.named_steps['selector'].transform(preprocessor.transform(X_test)), 
                                                            "RandomForest")
joblib.dump(rf_mi_pipeline, 'models/RandomForest_MI.joblib')
pd.DataFrame({'confidence': confidence_scores["RandomForest_MI"]}).to_csv('models/RandomForest_MI_confidence.csv', index=False)

# XGBoost (MI)
xgb_mi_pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('selector', SelectKBest(score_func=mutual_info_regression, k=5)),
    ('model', XGBRegressor(random_state=42, objective='reg:squarederror'))
])
xgb_mi_pipeline.fit(X_train, y_train)
y_pred_xgb_mi = xgb_mi_pipeline.predict(X_test)
mse_xgb_mi, mae_xgb_mi, rmse_xgb_mi, r2_xgb_mi = evaluate_model(y_test, y_pred_xgb_mi, "XGBoost (MI)")
results.append({"Model": "XGBoost", "Feature Selection": "MI", "MSE": mse_xgb_mi, "MAE": mae_xgb_mi, "RMSE": rmse_xgb_mi, "R²": r2_xgb_mi})
model_dict["XGBoost_MI"] = xgb_mi_pipeline
confidence_scores["XGBoost_MI"] = get_confidence_scores(xgb_mi_pipeline.named_steps['model'], 
                                                        xgb_mi_pipeline.named_steps['selector'].transform(preprocessor.transform(X_test)), 
                                                        "XGBoost")
joblib.dump(xgb_mi_pipeline, 'models/XGBoost_MI.joblib')
pd.DataFrame({'confidence': confidence_scores["XGBoost_MI"]}).to_csv('models/XGBoost_MI_confidence.csv', index=False)




=== Non-Feature Selection ===

Decision Tree - Metrics
MSE: 8865106.64
MAE: 878.96
RMSE: 2977.43
R² Score: 0.9828


In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from xgboost import XGBRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.feature_selection import SelectKBest, mutual_info_regression
from mlxtend.feature_selection import SequentialFeatureSelector as SFS
import joblib
import os

# Fungsi untuk evaluasi model
def evaluate_model(y_true, y_pred, model_name):
    mse = mean_squared_error(y_true, y_pred)
    mae = mean_absolute_error(y_true, y_pred)
    rmse = np.sqrt(mse)
    r2 = r2_score(y_true, y_pred)
    print(f"\n{model_name} - Metrics")
    print(f"MSE: {mse:.2f}")
    print(f"MAE: {mae:.2f}")
    print(f"RMSE: {rmse:.2f}")
    print(f"R² Score: {r2:.4f}")
    return mse, mae, rmse, r2

# Fungsi untuk menghitung confidence scores
def get_confidence_scores(model, X_test, model_type):
    if model_type == "DecisionTree":
        n_bootstraps = 100
        predictions = []
        for _ in range(n_bootstraps):
            idx = np.random.choice(X_test.shape[0], size=X_test.shape[0], replace=True)
            X_boot = X_test[idx]
            predictions.append(model.predict(X_boot))
        predictions = np.array(predictions)
        mean_pred = predictions.mean(axis=0)
        std_pred = predictions.std(axis=0)
        confidence_scores = 1 / (1 + std_pred / (mean_pred + 1e-10))
    elif model_type == "RandomForest":
        tree_predictions = np.array([tree.predict(X_test) for tree in model.estimators_])
        mean_pred = tree_predictions.mean(axis=0)
        std_pred = tree_predictions.std(axis=0)
        confidence_scores = 1 / (1 + std_pred / (mean_pred + 1e-10))
    elif model_type == "XGBoost":
        n_samples = 100
        predictions = []
        for _ in range(n_samples):
            noise = np.random.normal(0, 0.01, X_test.shape)
            X_noisy = X_test + noise
            predictions.append(model.predict(X_noisy))
        predictions = np.array(predictions)
        mean_pred = predictions.mean(axis=0)
        std_pred = predictions.std(axis=0)
        confidence_scores = 1 / (1 + std_pred / (mean_pred + 1e-10))
    return confidence_scores

# Buat folder models jika belum ada
if not os.path.exists('models'):
    os.makedirs('models')

# 1. Muat dataset
data = pd.read_csv('Flight Price Prediction Dataset.csv')

# 2. Preprocessing
if 'Unnamed: 0' in data.columns:
    data = data.drop(columns=['Unnamed: 0'])

if data.isnull().sum().sum() > 0:
    print("Data memiliki nilai null, mengisi dengan modus untuk kategorikal dan rata-rata untuk numerik...")
    for col in data.columns:
        if data[col].dtype == 'object':
            data[col] = data[col].fillna(data[col].mode()[0])
        else:
            data[col] = data[col].fillna(data[col].mean())

X = data.drop(columns=['price'])
y = data['price']

# Simpan nilai unik untuk dropdown dashboard
unique_values = {}
categorical_columns = ['airline', 'flight', 'source_city', 'departure_time', 'stops', 
                      'arrival_time', 'destination_city', 'class']
for col in categorical_columns:
    unique_values[col] = list(X[col].unique())
joblib.dump(unique_values, 'models/unique_values.joblib')

airline_flight_map = X.groupby('airline')['flight'].unique().to_dict()
joblib.dump(airline_flight_map, 'models/airline_flight_map.joblib')

numeric_columns = ['duration', 'days_left']

# Buat preprocessor
preprocessor = ColumnTransformer(
    transformers=[
        ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_columns),
        ('num', StandardScaler(), numeric_columns)
    ])

joblib.dump(preprocessor, 'models/preprocessor.joblib')

# Pisahkan data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
joblib.dump((X_train, X_test, y_train, y_test), 'models/processed_data.joblib')

# Simpan scaler untuk numerik
# Simpan scaler untuk numerik
preprocessor.fit(X_train)
scaler = preprocessor.named_transformers_['num']  # Corrected line
joblib.dump(scaler, 'models/scaler.joblib')


# 3. Inisialisasi hasil dan model
results = []
model_dict = {}
confidence_scores = {}



In [2]:
# 4. Pelatihan model tanpa feature selection
print("\n=== Non-Feature Selection ===")

# Decision Tree
dt_pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('model', DecisionTreeRegressor(random_state=42))
])
dt_pipeline.fit(X_train, y_train)
y_pred_dt = dt_pipeline.predict(X_test)
mse_dt, mae_dt, rmse_dt, r2_dt = evaluate_model(y_test, y_pred_dt, "Decision Tree")
results.append({"Model": "Decision Tree", "Feature Selection": "None", "MSE": mse_dt, "MAE": mae_dt, "RMSE": rmse_dt, "R²": r2_dt})
model_dict["DecisionTree_None"] = dt_pipeline
confidence_scores["DecisionTree_None"] = get_confidence_scores(dt_pipeline.named_steps['model'], preprocessor.transform(X_test), "DecisionTree")
joblib.dump(dt_pipeline, 'models/DecisionTree_None.joblib')
pd.DataFrame({'confidence': confidence_scores["DecisionTree_None"]}).to_csv('models/DecisionTree_None_confidence.csv', index=False)




=== Non-Feature Selection ===

Decision Tree - Metrics
MSE: 8865106.64
MAE: 878.96
RMSE: 2977.43
R² Score: 0.9828


In [4]:
# Random Forest
rf_pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('model', RandomForestRegressor(n_estimators=100, random_state=42))
])
rf_pipeline.fit(X_train, y_train)
y_pred_rf = rf_pipeline.predict(X_test)
mse_rf, mae_rf, rmse_rf, r2_rf = evaluate_model(y_test, y_pred_rf, "Random Forest")
results.append({"Model": "Random Forest", "Feature Selection": "None", "MSE": mse_rf, "MAE": mae_rf, "RMSE": rmse_rf, "R²": r2_rf})
model_dict["RandomForest_None"] = rf_pipeline
confidence_scores["RandomForest_None"] = get_confidence_scores(rf_pipeline.named_steps['model'], preprocessor.transform(X_test), "RandomForest")
joblib.dump(rf_pipeline, 'models/RandomForest_None.joblib')
pd.DataFrame({'confidence': confidence_scores["RandomForest_None"]}).to_csv('models/RandomForest_None_confidence.csv', index=False)




Random Forest - Metrics
MSE: 5646696.32
MAE: 857.93
RMSE: 2376.28
R² Score: 0.9890


In [5]:
# XGBoost
xgb_pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('model', XGBRegressor(random_state=42, objective='reg:squarederror'))
])
xgb_pipeline.fit(X_train, y_train)
y_pred_xgb = xgb_pipeline.predict(X_test)
mse_xgb, mae_xgb, rmse_xgb, r2_xgb = evaluate_model(y_test, y_pred_xgb, "XGBoost")
results.append({"Model": "XGBoost", "Feature Selection": "None", "MSE": mse_xgb, "MAE": mae_xgb, "RMSE": rmse_xgb, "R²": r2_xgb})
model_dict["XGBoost_None"] = xgb_pipeline
confidence_scores["XGBoost_None"] = get_confidence_scores(xgb_pipeline.named_steps['model'], preprocessor.transform(X_test), "XGBoost")
joblib.dump(xgb_pipeline, 'models/XGBoost_None.joblib')
pd.DataFrame({'confidence': confidence_scores["XGBoost_None"]}).to_csv('models/XGBoost_None_confidence.csv', index=False)




XGBoost - Metrics
MSE: 11730853.49
MAE: 1962.96
RMSE: 3425.03
R² Score: 0.9772


In [6]:
# 5. Feature Selection: Mutual Information
print("\n=== Feature Selection: Mutual Information ===")
# Buat pipeline dengan feature selection
mi_pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('selector', SelectKBest(score_func=mutual_info_regression, k=5))
])

# Fit pipeline untuk mendapatkan fitur terpilih
mi_pipeline.fit(X_train, y_train)
selected_features_idx = mi_pipeline.named_steps['selector'].get_support()
# Dapatkan nama fitur setelah preprocessing
feature_names = (mi_pipeline.named_steps['preprocessor']
                 .named_transformers_['cat']
                 .get_feature_names_out(categorical_columns)
                 .tolist() + numeric_columns)
selected_features_mi = [feature_names[i] for i, selected in enumerate(selected_features_idx) if selected]
print("Selected features (MI):", selected_features_mi)
joblib.dump(mi_pipeline.named_steps['selector'], 'models/selector_mi.joblib')
joblib.dump(selected_features_mi, 'models/selected_features_mi.joblib')

# Decision Tree (MI)
dt_mi_pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('selector', SelectKBest(score_func=mutual_info_regression, k=5)),
    ('model', DecisionTreeRegressor(random_state=42))
])
dt_mi_pipeline.fit(X_train, y_train)
y_pred_dt_mi = dt_mi_pipeline.predict(X_test)
mse_dt_mi, mae_dt_mi, rmse_dt_mi, r2_dt_mi = evaluate_model(y_test, y_pred_dt_mi, "Decision Tree (MI)")
results.append({"Model": "Decision Tree", "Feature Selection": "MI", "MSE": mse_dt_mi, "MAE": mae_dt_mi, "RMSE": rmse_dt_mi, "R²": r2_dt_mi})
model_dict["DecisionTree_MI"] = dt_mi_pipeline
confidence_scores["DecisionTree_MI"] = get_confidence_scores(dt_mi_pipeline.named_steps['model'], 
                                                            dt_mi_pipeline.named_steps['selector'].transform(preprocessor.transform(X_test)), 
                                                            "DecisionTree")
joblib.dump(dt_mi_pipeline, 'models/DecisionTree_MI.joblib')
pd.DataFrame({'confidence': confidence_scores["DecisionTree_MI"]}).to_csv('models/DecisionTree_MI_confidence.csv', index=False)





=== Feature Selection: Mutual Information ===
Selected features (MI): ['airline_Vistara', 'class_Business', 'class_Economy', 'duration', 'days_left']

Decision Tree (MI) - Metrics
MSE: 38372030.86
MAE: 3591.68
RMSE: 6194.52
R² Score: 0.9256


In [7]:
# Random Forest (MI)
rf_mi_pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('selector', SelectKBest(score_func=mutual_info_regression, k=5)),
    ('model', RandomForestRegressor(n_estimators=100, random_state=42))
])
rf_mi_pipeline.fit(X_train, y_train)
y_pred_rf_mi = rf_mi_pipeline.predict(X_test)
mse_rf_mi, mae_rf_mi, rmse_rf_mi, r2_rf_mi = evaluate_model(y_test, y_pred_rf_mi, "Random Forest (MI)")
results.append({"Model": "Random Forest", "Feature Selection": "MI", "MSE": mse_rf_mi, "MAE": mae_rf_mi, "RMSE": rmse_rf_mi, "R²": r2_rf_mi})
model_dict["RandomForest_MI"] = rf_mi_pipeline
confidence_scores["RandomForest_MI"] = get_confidence_scores(rf_mi_pipeline.named_steps['model'], 
                                                            rf_mi_pipeline.named_steps['selector'].transform(preprocessor.transform(X_test)), 
                                                            "RandomForest")
joblib.dump(rf_mi_pipeline, 'models/RandomForest_MI.joblib')
pd.DataFrame({'confidence': confidence_scores["RandomForest_MI"]}).to_csv('models/RandomForest_MI_confidence.csv', index=False)




Random Forest (MI) - Metrics
MSE: 34637319.52
MAE: 3455.38
RMSE: 5885.35
R² Score: 0.9328


In [8]:
# XGBoost (MI)
xgb_mi_pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('selector', SelectKBest(score_func=mutual_info_regression, k=5)),
    ('model', XGBRegressor(random_state=42, objective='reg:squarederror'))
])
xgb_mi_pipeline.fit(X_train, y_train)
y_pred_xgb_mi = xgb_mi_pipeline.predict(X_test)
mse_xgb_mi, mae_xgb_mi, rmse_xgb_mi, r2_xgb_mi = evaluate_model(y_test, y_pred_xgb_mi, "XGBoost (MI)")
results.append({"Model": "XGBoost", "Feature Selection": "MI", "MSE": mse_xgb_mi, "MAE": mae_xgb_mi, "RMSE": rmse_xgb_mi, "R²": r2_xgb_mi})
model_dict["XGBoost_MI"] = xgb_mi_pipeline
confidence_scores["XGBoost_MI"] = get_confidence_scores(xgb_mi_pipeline.named_steps['model'], 
                                                        xgb_mi_pipeline.named_steps['selector'].transform(preprocessor.transform(X_test)), 
                                                        "XGBoost")
joblib.dump(xgb_mi_pipeline, 'models/XGBoost_MI.joblib')
pd.DataFrame({'confidence': confidence_scores["XGBoost_MI"]}).to_csv('models/XGBoost_MI_confidence.csv', index=False)



XGBoost (MI) - Metrics
MSE: 26524799.25
MAE: 3051.95
RMSE: 5150.22
R² Score: 0.9485


In [None]:
# 6. Feature Selection: Sequential Forward Selection (SFS)
print("\n=== Feature Selection: SFS ===")
# SFS tidak bisa langsung digunakan dalam pipeline karena membutuhkan data yang sudah di-preprocess
# Preprocess data terlebih dahulu
X_train_preprocessed = preprocessor.fit_transform(X_train)
X_test_preprocessed = preprocessor.transform(X_test)
feature_names = (preprocessor.named_transformers_['cat'].get_feature_names_out(categorical_columns).tolist() + numeric_columns)

# Jalankan SFS
sfs = SFS(RandomForestRegressor(n_estimators=10, random_state=42), 
          k_features=5, forward=True, scoring='r2', cv=5, n_jobs=-1)
sfs.fit(X_train_preprocessed, y_train)
selected_features_idx = list(sfs.k_feature_idx_)
selected_features_sfs = [feature_names[i] for i in selected_features_idx]
print("Selected features (SFS):", selected_features_sfs)




=== Feature Selection: SFS ===
Selected features (SFS): ['airline_Vistara', 'source_city_Mumbai', 'arrival_time_Night', 'class_Business', 'duration']


OSError: [Errno 22] Invalid argument: 'models/sfs.joblib'

In [24]:
import joblib

# Simpan SFS ke path penuh
joblib.dump(sfs, 'D:/Data Mining2/models/sfs.joblib')

# Simpan nama fitur terpilih ke path penuh
joblib.dump(selected_features_sfs, 'D:/Data Mining2/models/selected_features_sfs.joblib')


['D:/Data Mining2/models/selected_features_sfs.joblib']

In [28]:
# Buat pipeline dengan SFS (menggunakan selector manual)
class SFSSelector:
    def __init__(self, selected_indices):
        self.selected_indices = selected_indices
    def fit(self, X, y=None):
        return self
    def transform(self, X):
        return X[:, self.selected_indices]

# Decision Tree (SFS)
dt_sfs_pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('selector', SFSSelector(selected_features_idx)),
    ('model', DecisionTreeRegressor(random_state=42))
])
dt_sfs_pipeline.fit(X_train, y_train)
y_pred_dt_sfs = dt_sfs_pipeline.predict(X_test)
mse_dt_sfs, mae_dt_sfs, rmse_dt_sfs, r2_dt_sfs = evaluate_model(y_test, y_pred_dt_sfs, "Decision Tree (SFS)")
results.append({"Model": "Decision Tree", "Feature Selection": "SFS", "MSE": mse_dt_sfs, "MAE": mae_dt_sfs, "RMSE": rmse_dt_sfs, "R²": r2_dt_sfs})
model_dict["DecisionTree_SFS"] = dt_sfs_pipeline
confidence_scores["DecisionTree_SFS"] = get_confidence_scores(dt_sfs_pipeline.named_steps['model'], 
                                                             dt_sfs_pipeline.named_steps['selector'].transform(preprocessor.transform(X_test)), 
                                                             "DecisionTree")
joblib.dump(dt_sfs_pipeline, 'D:/Data Mining2/models/DecisionTree_SFS.joblib')
pd.DataFrame({'confidence': confidence_scores["DecisionTree_SFS"]}).to_csv('D:/Data Mining2/models/DecisionTree_SFS_confidence.csv', index=False)



Decision Tree (SFS) - Metrics
MSE: 25191417.40
MAE: 3257.76
RMSE: 5019.11
R² Score: 0.9511


In [29]:
# Random Forest (SFS)
rf_sfs_pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('selector', SFSSelector(selected_features_idx)),
    ('model', RandomForestRegressor(n_estimators=100, random_state=42))
])
rf_sfs_pipeline.fit(X_train, y_train)
y_pred_rf_sfs = rf_sfs_pipeline.predict(X_test)
mse_rf_sfs, mae_rf_sfs, rmse_rf_sfs, r2_rf_sfs = evaluate_model(y_test, y_pred_rf_sfs, "Random Forest (SFS)")
results.append({"Model": "Random Forest", "Feature Selection": "SFS", "MSE": mse_rf_sfs, "MAE": mae_rf_sfs, "RMSE": rmse_rf_sfs, "R²": r2_rf_sfs})
model_dict["RandomForest_SFS"] = rf_sfs_pipeline
confidence_scores["RandomForest_SFS"] = get_confidence_scores(rf_sfs_pipeline.named_steps['model'], 
                                                             rf_sfs_pipeline.named_steps['selector'].transform(preprocessor.transform(X_test)), 
                                                             "RandomForest")
joblib.dump(rf_sfs_pipeline, 'D:/Data Mining2/models/RandomForest_SFS.joblib')
pd.DataFrame({'confidence': confidence_scores["RandomForest_SFS"]}).to_csv('D:/Data Mining2/models/RandomForest_SFS_confidence.csv', index=False)



Random Forest (SFS) - Metrics
MSE: 25182625.91
MAE: 3259.33
RMSE: 5018.23
R² Score: 0.9511


In [30]:
# XGBoost (SFS)
xgb_sfs_pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('selector', SFSSelector(selected_features_idx)),
    ('model', XGBRegressor(random_state=42, objective='reg:squarederror'))
])
xgb_sfs_pipeline.fit(X_train, y_train)
y_pred_xgb_sfs = xgb_sfs_pipeline.predict(X_test)
mse_xgb_sfs, mae_xgb_sfs, rmse_xgb_sfs, r2_xgb_sfs = evaluate_model(y_test, y_pred_xgb_sfs, "XGBoost (SFS)")
results.append({"Model": "XGBoost", "Feature Selection": "SFS", "MSE": mse_xgb_sfs, "MAE": mae_xgb_sfs, "RMSE": rmse_xgb_sfs, "R²": r2_xgb_sfs})
model_dict["XGBoost_SFS"] = xgb_sfs_pipeline
confidence_scores["XGBoost_SFS"] = get_confidence_scores(xgb_sfs_pipeline.named_steps['model'], 
                                                        xgb_sfs_pipeline.named_steps['selector'].transform(preprocessor.transform(X_test)), 
                                                        "XGBoost")
joblib.dump(xgb_sfs_pipeline, 'D:/Data Mining2/models/XGBoost_SFS.joblib')
pd.DataFrame({'confidence': confidence_scores["XGBoost_SFS"]}).to_csv('D:/Data Mining2/models/XGBoost_SFS_confidence.csv', index=False)




XGBoost (SFS) - Metrics
MSE: 28421890.78
MAE: 3504.75
RMSE: 5331.22
R² Score: 0.9449


In [31]:
# 7. Simpan hasil evaluasi
results_df = pd.DataFrame(results)
results_df.to_csv('D:/Data Mining2/models/model_results.csv', index=False)
print("\nHasil evaluasi disimpan ke 'models/model_results.csv'")

# Cetak perbandingan
print("\n=== Comparison Table ===")
print("\nNon-Feature Selection:")
print(results_df[results_df['Feature Selection'] == 'None'][['Model', 'MSE', 'MAE', 'RMSE', 'R²']])
print("\nMutual Information:")
print(results_df[results_df['Feature Selection'] == 'MI'][['Model', 'MSE', 'MAE', 'RMSE', 'R²']])
print("\nSFS:")
print(results_df[results_df['Feature Selection'] == 'SFS'][['Model', 'MSE', 'MAE', 'RMSE', 'R²']])

# Simpan confidence scores
joblib.dump(confidence_scores, 'D:/Data Mining2/models/confidence_scores.joblib')
print("Confidence scores disimpan ke 'models/confidence_scores.joblib'")


Hasil evaluasi disimpan ke 'models/model_results.csv'

=== Comparison Table ===

Non-Feature Selection:
           Model           MSE          MAE         RMSE        R²
0  Decision Tree  8.865107e+06   878.958971  2977.432894  0.982802
1  Random Forest  5.646696e+06   857.926063  2376.277828  0.989046
2        XGBoost  1.173085e+07  1962.960779  3425.033357  0.977243

Mutual Information:
           Model           MSE          MAE         RMSE        R²
3  Decision Tree  3.837203e+07  3591.682250  6194.516192  0.925561
4  Random Forest  3.463732e+07  3455.383906  5885.347867  0.932806
5        XGBoost  2.652480e+07  3051.949122  5150.223223  0.948544

SFS:
            Model           MSE          MAE         RMSE        R²
6   Decision Tree  2.519142e+07  3257.760353  5019.105239  0.951130
7   Decision Tree  2.519142e+07  3257.760353  5019.105239  0.951130
8   Decision Tree  2.519142e+07  3257.760353  5019.105239  0.951130
9   Decision Tree  2.519142e+07  3257.760353  5019.105239  0

In [33]:
joblib.dump(SFSSelector(selected_features_idx), 'D:/Data Mining2/models/selector_sfs.joblib')


['D:/Data Mining2/models/selector_sfs.joblib']