In [None]:
!pip install ucimlrepo pandas scikit-learn imbalanced-learn matplotlib seaborn

import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score, precision_recall_curve, auc
from imblearn.over_sampling import SMOTE
from imblearn.combine import SMOTETomek, SMOTEENN
import xgboost as xgb
from collections import Counter
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
# Silence library warnings to keep the cell output readable.
# NOTE: this also hides deprecation notices, so disable it when upgrading
# dependencies or debugging unexpected behaviour.
import warnings
warnings.filterwarnings('ignore')

Collecting ucimlrepo
  Downloading ucimlrepo-0.0.7-py3-none-any.whl.metadata (5.5 kB)
Downloading ucimlrepo-0.0.7-py3-none-any.whl (8.0 kB)
Installing collected packages: ucimlrepo
Successfully installed ucimlrepo-0.0.7


In [None]:
from ucimlrepo import fetch_ucirepo

print("Mengambil Dataset AI4I 2020 Predictive Maintenance.")

# Download the dataset registered under id 601 in the UCI ML repository.
ai4i_2020_predictive_maintenance_dataset = fetch_ucirepo(id=601)

X = ai4i_2020_predictive_maintenance_dataset.data.features
y_raw = ai4i_2020_predictive_maintenance_dataset.data.targets

# Select the label column by name instead of relying on a KeyError plus
# column position. 'Target' is the name this notebook originally looked for;
# 'Machine failure' is presumably the name used by the UCI variables table --
# TODO confirm against the `variables` table printed in the next cell.
target_candidates = ('Target', 'Machine failure')
target_col = next((name for name in target_candidates if name in y_raw.columns), None)

if target_col is None:
    # Last resort: keep the original behaviour of using the first targets column.
    print("\nPeringatan: Kolom 'Target' tidak ditemukan. Mencoba menggunakan kolom pertama dari targets sebagai default.")
    y = y_raw.iloc[:, 0]
else:
    y = y_raw[target_col]

# Always report which column ended up as the label so the choice is visible.
print(f"Menggunakan kolom '{y.name}' sebagai target.")

In [None]:
variables_df = ai4i_2020_predictive_maintenance_dataset.variables

print("Metadata Dataset:")
print(ai4i_2020_predictive_maintenance_dataset.metadata)

print("\nInformasi Variabel Dataset (ai4i_2020_predictive_maintenance_dataset.variables):")
print(variables_df)

# 'Product ID' is excluded from the predictors; the guard keeps this cell
# safe to re-run after the column has already been dropped.
if 'Product ID' in X.columns:
    X = X.drop('Product ID', axis=1)

# Variable types that should go through StandardScaler. 'Integer' is included
# alongside 'Continuous': any numeric feature left out of this list falls into
# the ColumnTransformer's remainder='passthrough' branch and reaches the
# SMOTE-family resamplers unscaled, where it would dominate the
# nearest-neighbour distances.
# NOTE(review): presumably some measured features in this dataset are typed
# 'Integer' in the variables table -- confirm from the table printed above.
numeric_variable_types = ('Continuous', 'Integer')

# Walk the variables table in its own order and keep only names that are
# actual feature columns of X (IDs and targets are not part of X).
numerical_cols = [
    name
    for name, var_type in zip(variables_df['name'], variables_df['type'])
    if var_type in numeric_variable_types and name != 'Product ID' and name in X.columns
]

categorical_cols = [
    name
    for name, var_type in zip(variables_df['name'], variables_df['type'])
    if var_type == 'Categorical' and name in X.columns
]

print("Kolom Numerik yang Diidentifikasi Secara Otomatis:")
print(numerical_cols)
print("Kolom Kategorikal yang Diidentifikasi Secara Otomatis:")
print(categorical_cols)

In [None]:
# --- Shape and schema overview -------------------------------------------
print("Eksplorasi Data Awal:")
print("Bentuk X:", X.shape)
print("Bentuk y:", y.shape)
print("\nInfo X:")
X.info()

# --- Class balance of the label -------------------------------------------
print("\nDistribusi kelas di variabel target 'Target':")
target_counts = Counter(y)
total_samples = len(y)  # same value as summing the per-class counts
for label, count in target_counts.items():
    percentage = (count / total_samples) * 100
    print(f"Kelas {label}: {count} sampel ({percentage:.2f}%)")

# Flag the data as imbalanced when class 1 exists and has fewer than one
# fifth of the samples of class 0.
minority_is_rare = 1 in target_counts and target_counts[1] < target_counts[0] / 5
if minority_is_rare:
    imbalance_note = "\nDataset kemungkinan mengalami **ketidakseimbangan (imbalance)** yang signifikan. Penanganan imbalance (seperti SMOTE) kemungkinan besar diperlukan."
else:
    imbalance_note = "\nDataset terlihat cukup seimbang atau ketidakseimbangan tidak terlalu parah."
print(imbalance_note)

# --- Bar chart of the class counts ----------------------------------------
fig, ax = plt.subplots(figsize=(6, 4))
sns.countplot(x=y, ax=ax)
ax.set_title('Distribusi Kelas Target (Kegagalan Mesin)')
ax.set_xlabel('Kegagalan (0: Tidak, 1: Ya)')
ax.set_ylabel('Jumlah Sampel')
plt.show()

# Data Processing

In [None]:
# Standardise the numeric columns.
numeric_step = ('num', StandardScaler(), numerical_cols)

# One-hot encode the categorical columns as a dense array; categories not seen
# during fit are ignored at transform time rather than raising.
categorical_step = (
    'cat',
    OneHotEncoder(handle_unknown='ignore', sparse_output=False),
    categorical_cols,
)

# Columns listed in neither step are passed through unchanged.
preprocessor = ColumnTransformer(
    transformers=[numeric_step, categorical_step],
    remainder='passthrough',
)

In [None]:
# Hold out 20% of the rows for testing; stratify on y so both splits keep the
# same class proportions, and fix the seed so the split is reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)

train_class_counts = Counter(y_train)
print(f"\nJumlah sampel training sebelum resampling: {train_class_counts}")

In [None]:
print("\nMelakukan Preprocessing Data.")

# Fit the transformer on the training split only, then apply the fitted
# transformer to the test split so no test statistics leak into training.
X_train_processed = preprocessor.fit_transform(X_train)
X_test_processed = preprocessor.transform(X_test)

# Ask the fitted ColumnTransformer for its output column names instead of
# rebuilding them by hand. This keeps names and column order in sync with the
# transformer and gives every column a uniform "<step>__<column>" prefix
# (evaluate_model strips everything up to the last "__" before looking up a
# display name).
try:
    processed_feature_names = list(preprocessor.get_feature_names_out())
except (AttributeError, ValueError):
    # A step that cannot report names: fall through to the generic names below.
    processed_feature_names = []

if len(processed_feature_names) != X_train_processed.shape[1]:
    print("\nPeringatan: Gagal mendapatkan semua nama fitur yang diproses secara otomatis. Menggunakan nama generik.")
    processed_feature_names = [f"feature_{i}" for i in range(X_train_processed.shape[1])]

# Keep the original row index so the processed frames stay aligned with
# y_train / y_test if they are ever joined or compared by label.
X_train_processed_df = pd.DataFrame(X_train_processed, columns=processed_feature_names, index=X_train.index)
X_test_processed_df = pd.DataFrame(X_test_processed, columns=processed_feature_names, index=X_test.index)

# Model Evaluation (`evaluate_model`)

In [None]:
def _feature_importance_series(model, feature_names):
    """Return the model's per-feature importances, largest first.

    Uses ``feature_importances_`` when the fitted model exposes it, otherwise
    the absolute value of ``coef_`` (only the first row when ``coef_`` has more
    than one dimension). Returns ``None`` when the model has neither attribute.
    """
    if hasattr(model, 'feature_importances_'):
        raw_scores = model.feature_importances_
    elif hasattr(model, 'coef_'):
        coefficients = np.asarray(model.coef_)
        raw_scores = np.abs(coefficients[0] if coefficients.ndim > 1 else coefficients)
    else:
        return None
    return pd.Series(raw_scores, index=feature_names).sort_values(ascending=False)


def evaluate_model(model, X_test, y_test, model_name, feature_names, feature_descriptions_map=None):
    """
    Predict with a fitted classifier, report its metrics and show feature importance.

    Prints the classification report, confusion matrix, ROC AUC and
    precision-recall AUC, draws the confusion-matrix heatmap and, when the
    model exposes importances, prints and plots them.

    Args:
        model: Fitted estimator providing ``predict`` and ``predict_proba``.
            Column 1 of ``predict_proba`` is used as the positive-class score.
        X_test: Test features passed unchanged to the model.
        y_test: True labels for ``X_test``.
        model_name: Label used in printed headings and plot titles.
        feature_names: Names used to index the importance values; must match
            the number of importance values the model reports.
        feature_descriptions_map: Optional mapping from a feature name (with
            everything up to the last ``"__"`` removed) to a display name.
            Unmapped features fall back to the stripped name.

    Returns:
        dict with keys ``model_name``, ``roc_auc``, ``pr_auc``,
        ``confusion_matrix`` and ``feature_importances`` (a sorted Series, or
        ``None`` when the model has neither ``feature_importances_`` nor
        ``coef_``). Existing callers that ignore the return value are unaffected.
    """
    descriptions = feature_descriptions_map or {}

    print(f"\nEvaluasi Model {model_name}:")
    y_pred = model.predict(X_test)
    y_prob = model.predict_proba(X_test)[:, 1]

    print("\nLaporan Klasifikasi:")
    print(classification_report(y_test, y_pred))

    print("\nConfusion Matrix:")
    cm = confusion_matrix(y_test, y_pred)
    print(cm)

    plt.figure(figsize=(8, 6))
    sns.heatmap(cm, annot=True, fmt='d', cmap='Blues',
                xticklabels=['Prediksi Non-Gagal', 'Prediksi Gagal'],
                yticklabels=['Aktual Non-Gagal', 'Aktual Gagal'])
    plt.title(f'Confusion Matrix ({model_name})')
    plt.ylabel('Aktual')
    plt.xlabel('Prediksi')
    plt.show()

    roc_auc = roc_auc_score(y_test, y_prob)
    print(f"\nROC AUC Score: {roc_auc:.4f}")

    # Area under the precision-recall curve, integrated over recall.
    precision, recall, _ = precision_recall_curve(y_test, y_prob)
    pr_auc = auc(recall, precision)
    print(f"Precision-Recall AUC: {pr_auc:.4f}")

    metrics = {
        'model_name': model_name,
        'roc_auc': roc_auc,
        'pr_auc': pr_auc,
        'confusion_matrix': cm,
        'feature_importances': None,
    }

    print(f"\nAnalisis Feature Importance ({model_name}):")
    feature_importances = _feature_importance_series(model, feature_names)
    if feature_importances is None:
        print("Model tidak memiliki atribut feature_importances_ atau coef_.")
        return metrics
    metrics['feature_importances'] = feature_importances

    for feature, importance in feature_importances.items():
        # Drop the "<transformer>__" prefix before looking up the display name;
        # str() keeps this working if the names are not strings.
        original_name = str(feature).split('__')[-1]
        display_name = descriptions.get(original_name, original_name)
        print(f"{feature} ({display_name}): {importance:.6f}")

    # Let the figure grow with the number of features so labels stay legible.
    plt.figure(figsize=(10, max(6, len(feature_importances) // 2)))
    sns.barplot(x=feature_importances.values, y=feature_importances.index, palette='viridis')
    plt.title(f'Feature Importance dari {model_name}')
    plt.xlabel('Kepentingan')
    plt.ylabel('Fitur')
    plt.show()

    return metrics

# Display labels for the model inputs, keyed by feature name. evaluate_model
# looks names up here after removing everything up to the last "__".
feature_descriptions = {
    'Air temperature': 'Suhu Udara',
    'Process temperature': 'Suhu Proses',
    'Rotational speed': 'Kecepatan Rotasi',
    'Torque': 'Torsi',
    'Tool wear': 'Keausan Alat',
}

# Entries for the 'Type_<code>' columns (presumably the one-hot columns of the
# categorical 'Type' feature -- verify against the encoder output).
feature_descriptions.update(
    (f'Type_{code}', f'Tipe Produk ({label})')
    for code, label in (('L', 'Rendah'), ('M', 'Sedang'), ('H', 'Tinggi'))
)


In [None]:
# Resampling strategies to compare. Each one is fitted on the training split
# only; the test split is never resampled, so the reported metrics reflect the
# original class distribution.
# NOTE(review): these SMOTE variants synthesise samples by interpolating
# between neighbours, so one-hot encoded columns can take fractional values in
# the resampled training set -- consider imblearn's SMOTENC if that matters.
resampling_methods = {
    "SMOTE": SMOTE(random_state=42),
    "SMOTETomek": SMOTETomek(random_state=42),
    "SMOTEENN": SMOTEENN(random_state=42)
}

for method_name, resampler_instance in resampling_methods.items():
    print(f"\n\n{'='*60}")
    print(f"      Melatih Model dengan Resampling: {method_name}")
    print(f"{'='*60}")

    print(f"\nMelakukan Resampling dengan {method_name}:")
    X_train_resampled, y_train_resampled = resampler_instance.fit_resample(X_train_processed_df, y_train)
    print(f"Jumlah sampel training setelah {method_name}: {Counter(y_train_resampled)}")

    # Fresh estimators per resampling method so no fitted state carries over.
    print(f"\nMelatih Model Random Forest untuk {method_name}:")
    rf_model = RandomForestClassifier(random_state=42)
    rf_model.fit(X_train_resampled, y_train_resampled)
    evaluate_model(rf_model, X_test_processed_df, y_test, f"Random Forest ({method_name})", processed_feature_names, feature_descriptions)

    print(f"\nMelatih Model XGBoost untuk {method_name}:")
    # `use_label_encoder` is deprecated in XGBoost and no longer a recognised
    # parameter in current releases, so it is not passed here.
    xgb_model = xgb.XGBClassifier(objective='binary:logistic', eval_metric='logloss', random_state=42)
    xgb_model.fit(X_train_resampled, y_train_resampled)
    evaluate_model(xgb_model, X_test_processed_df, y_test, f"XGBoost ({method_name})", processed_feature_names, feature_descriptions)

print("\n\nProses pelatihan dan evaluasi untuk semua metode resampling selesai!")