In [None]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import MinMaxScaler, OneHotEncoder
from sklearn.model_selection import StratifiedKFold, train_test_split
from imblearn.over_sampling import SMOTE
import tensorflow as tf
from tensorflow.keras import Input, Model
from tensorflow.keras.layers import Dense, Dropout, BatchNormalization
from tensorflow.keras.callbacks import EarlyStopping, ReduceLROnPlateau, ModelCheckpoint
from sklearn.metrics import classification_report, accuracy_score
import joblib
import warnings
import shap
import matplotlib.pyplot as plt

# Mengabaikan warning yang tidak perlu
warnings.filterwarnings('ignore')

In [None]:


# ---------------------------------------
# LOAD DAN PERSIAPAN DATA
# ---------------------------------------
# Kaggle dataset location; point this at wherever the CSV lives in your environment.
df = pd.read_csv(filepath_or_buffer="/kaggle/input/dataset-pkm/dataset_Media_Bioflok.csv")

# Peek at the first rows as a quick sanity check of what was loaded.
print("Contoh Data Awal:")
print(df.head())

# The 'Timestamp' column is discarded whenever the dataset provides one.
if 'Timestamp' in df:
    df = df.drop(columns='Timestamp')
    print("Kolom 'Timestamp' telah dihapus.")

# ---------------------------------------
# FEATURE ENGINEERING
# ---------------------------------------
def combine_dimensions(row):
    """Reduce a pond's shape-specific measurements to a single size value.

    Args:
        row: Mapping-like record (e.g. a DataFrame row) exposing
            'Bentuk_Kolam' plus 'Diameter (m)' for round ponds or
            'Panjang (m)' / 'Lebar (m)' for rectangular ones.

    Returns:
        The diameter when the shape is 'Bulat', the mean of length and width
        when it is 'Kotak', and 0 for any other shape value.
    """
    pond_shape = row['Bentuk_Kolam']

    if pond_shape == 'Kotak':
        side_sum = row['Panjang (m)'] + row['Lebar (m)']
        return side_sum / 2

    return row['Diameter (m)'] if pond_shape == 'Bulat' else 0

# Collapse the shape-specific size columns into a single 'Dimensi (m)' feature.
df['Dimensi (m)'] = df.apply(combine_dimensions, axis=1)

# The raw size columns are redundant once the combined dimension exists.
df = df.drop(['Diameter (m)', 'Panjang (m)', 'Lebar (m)'], axis=1)

# Impute missing pond heights with the column mean.
df['Tinggi (m)'] = df['Tinggi (m)'].fillna(df['Tinggi (m)'].mean())

# One-hot encode the categorical columns; drop='first' removes one level per
# feature to avoid the dummy-variable trap.
categorical_features = ['Bentuk_Kolam', 'Material_Kolam']
try:
    # scikit-learn >= 1.2 names the dense-output switch `sparse_output`; the old
    # `sparse` keyword was removed in 1.4 and raises TypeError there.
    encoder = OneHotEncoder(sparse_output=False, drop='first')
except TypeError:
    # Older scikit-learn releases only accept the original `sparse` keyword.
    encoder = OneHotEncoder(sparse=False, drop='first')
encoded_categorical = encoder.fit_transform(df[categorical_features])
encoded_feature_names = encoder.get_feature_names_out(categorical_features)

# Persist the fitted encoder next to the scaler: new data must be encoded with
# the same category-to-column mapping before it can be scaled and scored.
joblib.dump(encoder, 'encoder_bioflok.pkl')
print("Encoder disimpan sebagai 'encoder_bioflok.pkl'.")

# Give both frames a fresh 0..n-1 index so the column-wise concat aligns row by row.
df = df.reset_index(drop=True)
df_encoded = pd.DataFrame(encoded_categorical, columns=encoded_feature_names)

# Swap the raw categorical columns for their encoded counterparts.
df = pd.concat([df.drop(categorical_features, axis=1), df_encoded], axis=1)

# Feature columns (numeric inputs followed by the one-hot columns) and target.
features = ['Tinggi (m)', 'Volume_Air (L)', 'Garam_Krosok (kg)',
            'Molase (ml)', 'Probiotik (ml)', 'Kapur_Dolomit (ml)',
            'Dimensi (m)'] + list(encoded_feature_names)

target = 'Label'

X = df[features].values

# Map the textual labels to 0/1 and fail loudly on anything unexpected: an
# unmapped label would become NaN, silently turn `y` into a float array and
# break the stratified split / np.bincount calls further down.
label_codes = df[target].map({'Benar': 1, 'Salah': 0})
unmapped = label_codes.isna()
if unmapped.any():
    unknown_labels = sorted(df.loc[unmapped, target].astype(str).unique())
    raise ValueError(f"Label tidak dikenal pada kolom '{target}': {unknown_labels}")
y = label_codes.astype('int64').values

# Min-max scale every feature to [0, 1] and persist the fitted scaler.
# NOTE(review): the scaler is fit on the full dataset, so the min/max of rows
# that are later held out also shape the scaling used during evaluation.
scaler = MinMaxScaler()
X_scaled = scaler.fit_transform(X)
joblib.dump(scaler, 'scaler_bioflok.pkl')
print("Scaler disimpan sebagai 'scaler_bioflok.pkl'.")

# ---------------------------------------
# FUNGSI PEMBANGUNAN MODEL
# ---------------------------------------
def build_model(input_dim, learning_rate=1e-3, l2_reg=1e-5, dropout_rate=0.2):
    """Create and compile the feed-forward binary classifier.

    The network stacks three hidden stages of 128, 64 and 32 units. Each stage
    is Dense (ReLU, L2 kernel regularisation) -> BatchNormalization -> Dropout,
    and the stack ends in a single sigmoid unit.

    Args:
        input_dim: Number of input features.
        learning_rate: Learning rate handed to the Adam optimizer.
        l2_reg: L2 penalty factor applied to each hidden Dense kernel.
        dropout_rate: Dropout rate used after every hidden stage.

    Returns:
        A Keras ``Model`` compiled with Adam, binary cross-entropy loss and
        the accuracy metric.
    """
    net_input = Input(shape=(input_dim,))

    hidden = net_input
    for units in (128, 64, 32):
        hidden = Dense(
            units,
            activation='relu',
            kernel_regularizer=tf.keras.regularizers.l2(l2_reg),
        )(hidden)
        hidden = BatchNormalization()(hidden)
        hidden = Dropout(dropout_rate)(hidden)

    probability = Dense(1, activation='sigmoid')(hidden)

    network = Model(inputs=net_input, outputs=probability)
    adam = tf.keras.optimizers.Adam(learning_rate=learning_rate)
    network.compile(optimizer=adam, loss='binary_crossentropy', metrics=['accuracy'])
    return network

# ---------------------------------------
# CROSS-VALIDATION DENGAN STRATIFIEDKFold
# ---------------------------------------
# 5-fold stratified cross-validation; per-fold accuracy and classification
# reports are collected for the summary printed after the loop.
kf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
cv_accuracies = []
cv_reports = []

epochs = 100
batch_size = 32

for fold, (train_index, test_index) in enumerate(kf.split(X_scaled, y), start=1):
    print(f"\n----- Fold {fold} -----")
    X_train_fold, X_test_fold = X_scaled[train_index], X_scaled[test_index]
    y_train_fold, y_test_fold = y[train_index], y[test_index]

    # Carve out the validation set BEFORE oversampling. Keras' validation_split
    # takes the last rows of the arrays without shuffling, and SMOTE places its
    # synthetic rows after the original ones, so the previous setup monitored
    # val_loss almost exclusively on synthetic minority samples.
    X_fit_fold, X_val_fold, y_fit_fold, y_val_fold = train_test_split(
        X_train_fold, y_train_fold,
        test_size=0.2, random_state=42, stratify=y_train_fold)

    # Balance the classes with SMOTE on the fitting portion only; validation
    # and test rows stay real and untouched.
    sm = SMOTE(random_state=42)
    X_train_fold_res, y_train_fold_res = sm.fit_resample(X_fit_fold, y_fit_fold)
    print(f"Fold {fold}: Sebelum SMOTE: {np.bincount(y_fit_fold)}, Setelah SMOTE: {np.bincount(y_train_fold_res)}")

    # Fresh, untrained model for every fold.
    model = build_model(input_dim=X_train_fold_res.shape[1], learning_rate=1e-3, l2_reg=1e-5, dropout_rate=0.2)

    # All callbacks watch val_loss; the best epoch is checkpointed to a
    # per-fold .keras file.
    early_stop = EarlyStopping(monitor='val_loss', patience=10, restore_best_weights=True, verbose=1)
    reduce_lr = ReduceLROnPlateau(monitor='val_loss', factor=0.5, patience=5, verbose=1)
    checkpoint = ModelCheckpoint(
        filepath=f'best_model_fold_{fold}.keras',
        monitor='val_loss',
        verbose=1,
        save_best_only=True,
        save_weights_only=False,
        mode='min'
    )

    callbacks = [early_stop, reduce_lr, checkpoint]

    # Train on the resampled data, validate on the real held-out rows.
    history = model.fit(
        X_train_fold_res, y_train_fold_res,
        validation_data=(X_val_fold, y_val_fold),
        epochs=epochs,
        batch_size=batch_size,
        callbacks=callbacks,
        verbose=1
    )

    # Score the checkpointed best-epoch model on this fold's test rows.
    best_model_fold = tf.keras.models.load_model(f'best_model_fold_{fold}.keras')
    y_pred_prob = best_model_fold.predict(X_test_fold)
    y_pred_fold = (y_pred_prob.flatten() > 0.5).astype(int)

    acc = accuracy_score(y_test_fold, y_pred_fold)
    cv_accuracies.append(acc)

    rep = classification_report(y_test_fold, y_pred_fold, output_dict=True)
    cv_reports.append(rep)

    print(f"Fold {fold} - Accuracy: {acc:.4f}")

# Mean and spread of the per-fold accuracies.
mean_acc = np.mean(cv_accuracies)
std_acc = np.std(cv_accuracies)
print(f"\nCross-Validation Accuracy: {mean_acc:.4f} ± {std_acc:.4f}")

# ---------------------------------------
# MELATIH MODEL FINAL DAN ANALISIS FITUR DENGAN SHAP (OPSIONAL)
# ---------------------------------------
# Hold out a stratified 20% test set for the final evaluation.
X_train_full, X_test_final, y_train_full, y_test_final = train_test_split(
    X_scaled, y, test_size=0.2, random_state=42, stratify=y)

# Split off the validation rows BEFORE oversampling. Keras' validation_split
# takes the last rows of the arrays without shuffling, and SMOTE places its
# synthetic rows after the original ones, so validating on a slice of the
# resampled array would monitor val_loss mostly on synthetic minority samples.
X_fit_full, X_val_full, y_fit_full, y_val_full = train_test_split(
    X_train_full, y_train_full,
    test_size=0.2, random_state=42, stratify=y_train_full)

# Balance the classes with SMOTE on the fitting portion only.
sm = SMOTE(random_state=42)
X_train_full_res, y_train_full_res = sm.fit_resample(X_fit_full, y_fit_full)

# Build the final model; callbacks watch val_loss and checkpoint the best epoch.
final_model = build_model(input_dim=X_train_full_res.shape[1], learning_rate=1e-3, l2_reg=1e-5, dropout_rate=0.2)
early_stop_final = EarlyStopping(monitor='val_loss', patience=10, restore_best_weights=True, verbose=1)
reduce_lr_final = ReduceLROnPlateau(monitor='val_loss', factor=0.5, patience=5, verbose=1)
checkpoint_final = ModelCheckpoint(
    filepath='best_final_model.keras',
    monitor='val_loss',
    verbose=1,
    save_best_only=True,
    save_weights_only=False,
    mode='min'
)

callbacks_final = [early_stop_final, reduce_lr_final, checkpoint_final]

# Train on the resampled data, validate on the real held-out rows.
final_history = final_model.fit(
    X_train_full_res, y_train_full_res,
    validation_data=(X_val_full, y_val_full),
    epochs=100,
    batch_size=32,
    callbacks=callbacks_final,
    verbose=1
)

# Reload the checkpointed best-epoch model for evaluation.
best_final_model = tf.keras.models.load_model('best_final_model.keras')

# Threshold the predicted probabilities at 0.5 and report on the test set.
y_prob_final = best_final_model.predict(X_test_final)
y_pred_final = (y_prob_final.flatten() > 0.5).astype(int)

print("\nFinal Model Classification Report:")
print(classification_report(y_test_final, y_pred_final))

# SHAP analysis of the checkpointed final model. `shap` is already imported in
# the notebook's import cell (install it with `pip install shap` if missing).

# Use a small slice of the training data as background and explain the first
# 50 test rows to keep the computation cheap.
shap_background = X_train_full_res[:100]
shap_samples = X_test_final[:50]

explainer = shap.DeepExplainer(best_final_model, shap_background)
shap_values = explainer.shap_values(shap_samples)

# Normalise the explainer output to a 2-D (samples, features) array. Depending
# on the SHAP release, a single-output model may come back as a one-element
# list or as an array with a trailing output axis of size 1 — either form makes
# summary_plot treat it as multi-output instead of drawing the per-feature plot.
if isinstance(shap_values, list):
    shap_values = shap_values[0]
shap_values = np.asarray(shap_values)
if shap_values.ndim == 3 and shap_values.shape[-1] == 1:
    shap_values = shap_values[..., 0]

# Summary plot of feature contributions.
shap.summary_plot(shap_values, shap_samples, feature_names=features)
plt.show()

# ---------------------------------------
# SIMPAN MODEL DAN SCALER
# ---------------------------------------
# Save the checkpointed best-validation model, i.e. the exact model that was
# evaluated and explained above. `final_model` may instead hold last-epoch
# weights when early stopping never triggered (whether the best weights are
# restored in that case depends on the Keras version).
best_final_model.save('final_model_bioflok_adv.keras')
print("Model akhir disimpan sebagai 'final_model_bioflok_adv.keras'.")
