<a href="https://colab.research.google.com/github/fathanzys/Data/blob/main/0102522024_Data.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

#**1. Import Library**

In [191]:
# 1. Import Library
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans, AgglomerativeClustering
from sklearn.mixture import GaussianMixture

import warnings
warnings.filterwarnings("ignore")

#**2. Preprocessing**

In [205]:
from imblearn.over_sampling import SMOTE
smote = SMOTE(random_state=42)
X_resampled, y_resampled = smote.fit_resample(X_train, y_train)

In [193]:
# 2.1 Load Dataset
# File CSV dari NHANES 2013–2014 (pastikan sudah digabungkan sebelumnya secara manual atau via kode)
df_demo = pd.read_csv("demographic.csv")
df_diet = pd.read_csv("diet.csv")
df_exam = pd.read_csv("examination.csv")
df_quest = pd.read_csv("questionnaire.csv")

In [194]:
# 2.2 Seleksi Kolom Fitur

# Kolom demografi
df_demo_sel = df_demo[['SEQN', 'RIAGENDR', 'RIDAGEYR', 'INDFMPIR']]  # Gender, Usia, Income Ratio

# Kolom diet (nutrisi harian)
df_diet_sel = df_diet[['SEQN', 'DR1TKCAL', 'DR1TPROT', 'DR1TCARB', 'DR1TTFAT']]  # Kalori, Protein, Karbo, Lemak

# Kolom pemeriksaan fisik
df_exam_sel = df_exam[['SEQN', 'BMXBMI']]  # Body Mass Index

# Kolom kuisioner (aktivitas fisik)
df_quest_sel = df_quest[['SEQN', 'PAQ605', 'PAQ620']]  # Frekuensi aktivitas sedang & berat per minggu

In [195]:
# 2.3 Gabung dan Cek Missing Values

# Ganti nama kolom dengan versi panjang agar lebih deskriptif
df_demo_sel = df_demo_sel.rename(columns={
    'SEQN': 'Respondent_ID',
    'RIAGENDR': 'Gender',
    'RIDAGEYR': 'Age',
    'INDFMPIR': 'Income_to_Poverty_Ratio'
})

df_diet_sel = df_diet_sel.rename(columns={
    'SEQN': 'Respondent_ID',
    'DR1TKCAL': 'Daily_Energy_Intake_kcal',
    'DR1TPROT': 'Daily_Protein_grams',
    'DR1TCARB': 'Daily_Carbohydrate_grams',
    'DR1TTFAT': 'Daily_Total_Fat_grams'
})

df_exam_sel = df_exam_sel.rename(columns={
    'SEQN': 'Respondent_ID',
    'BMXBMI': 'Body_Mass_Index'
})

df_quest_sel = df_quest_sel.rename(columns={
    'SEQN': 'Respondent_ID',
    'PAQ605': 'Moderate_Activity_Frequency',
    'PAQ620': 'Vigorous_Activity_Frequency'
})

# Gabungkan semua data berdasarkan Respondent_ID
df_merged = df_demo_sel.merge(df_diet_sel, on='Respondent_ID') \
                       .merge(df_exam_sel, on='Respondent_ID') \
                       .merge(df_quest_sel, on='Respondent_ID')

# Cek jumlah missing values per kolom
missing_values = df_merged.isnull().sum()
print("Jumlah missing values per kolom:")
print(missing_values)

# Drop baris yang memiliki nilai null
df_cleaned = df_merged.dropna()
print(f"Jumlah data setelah drop NA: {len(df_cleaned)} observasi")

Jumlah missing values per kolom:
Respondent_ID                     0
Gender                            0
Age                               0
Income_to_Poverty_Ratio         730
Daily_Energy_Intake_kcal       1282
Daily_Protein_grams            1282
Daily_Carbohydrate_grams       1282
Daily_Total_Fat_grams          1282
Body_Mass_Index                 758
Moderate_Activity_Frequency    2863
Vigorous_Activity_Frequency    2863
dtype: int64
Jumlah data setelah drop NA: 5820 observasi


In [207]:
# Membuat fitur tambahan gaya hidup
df['Rasio_Karbo_Protein'] = df['Karbohidrat_Harian'] / (df['Protein_Harian'] + 1e-5)
df['Rasio_Lemak_Protein'] = df['Lemak_Harian'] / (df['Protein_Harian'] + 1e-5)
df['Kalori_per_Berat'] = df['Kalori_Harian'] / (df['Berat_Badan_kg'] + 1e-5)

# Interaksi tekanan darah dan BMI
df['Sistolik_per_BMI'] = df['Tekanan_Sistolik'] / (df['Indeks_Massa_Tubuh'] + 1e-5)
df['Diastolik_per_BMI'] = df['Tekanan_Diastolik'] / (df['Indeks_Massa_Tubuh'] + 1e-5)

KeyError: 'Karbohidrat_Harian'

#**3. Transformation**

In [206]:
df['Kalori_BMI'] = df['Kalori_Harian'] / df['BMI']
df['Protein_Glucose'] = df['Asupan_Protein'] * df['Glukosa_Puasa']

KeyError: 'Kalori_Harian'

In [196]:
from sklearn.preprocessing import LabelEncoder, StandardScaler

# Salin data yang sudah dibersihkan
df_transformed = df_cleaned.copy()

# =========================
# 1. Encoding Variabel Kategorikal
# =========================

# Gender: 1 = Male, 2 = Female → ubah ke string lalu encode
df_transformed['Gender'] = df_transformed['Gender'].map({1: 'Male', 2: 'Female'})

# Label encoding untuk kolom Gender
label_encoder = LabelEncoder()
df_transformed['Gender_Encoded'] = label_encoder.fit_transform(df_transformed['Gender'])

# =========================
# 2. Standarisasi Fitur Numerik
# =========================

# Daftar fitur numerik untuk distandarkan
numeric_features = [
    'Age',
    'Income_to_Poverty_Ratio',
    'Daily_Energy_Intake_kcal',
    'Daily_Protein_grams',
    'Daily_Carbohydrate_grams',
    'Daily_Total_Fat_grams',
    'Moderate_Activity_Frequency',
    'Vigorous_Activity_Frequency'
]

# Inisialisasi StandardScaler
scaler = StandardScaler()
df_transformed[numeric_features] = scaler.fit_transform(df_transformed[numeric_features])

In [197]:
# Buat kolom baru bernama BMI_Category berdasarkan nilai BMI
def classify_bmi(bmi_value):
    if bmi_value < 18.5:
        return 'Underweight'
    elif 18.5 <= bmi_value < 25:
        return 'Normal'
    elif 25 <= bmi_value < 30:
        return 'Overweight'
    else:
        return 'Obese'

# Terapkan fungsi ke kolom Indeks Massa Tubuh
df_transformed['BMI_Category'] = df_transformed['Body_Mass_Index'].apply(classify_bmi)

# Cek distribusi label
print(df_transformed['BMI_Category'].value_counts())

BMI_Category
Obese          1967
Normal         1925
Overweight     1696
Underweight     232
Name: count, dtype: int64


#**4. Data Mining**

In [198]:
# Pisahkan fitur dan target
X = df_transformed.drop(columns=['Body_Mass_Index', 'BMI_Category'])
y = df_transformed['BMI_Category']

In [200]:
from sklearn.preprocessing import OneHotEncoder

# Pisahkan kolom numerik dan kategorikal
categorical_cols = X.select_dtypes(include='object').columns
numerical_cols = X.select_dtypes(exclude='object').columns

# Terapkan One-Hot Encoding ke kolom kategorikal
# Removed sparse=False as it's no longer supported
encoder = OneHotEncoder(handle_unknown='ignore')
X_encoded = encoder.fit_transform(X[categorical_cols])

# Gabungkan kembali dengan kolom numerik
# Convert sparse matrix to dense array for concatenation if needed, or handle sparse matrices downstream
X_final = np.concatenate([X[numerical_cols].values, X_encoded.toarray()], axis=1)

In [201]:
from sklearn.model_selection import train_test_split

# Bagi data menjadi data latih dan data uji
X_train, X_test, y_train, y_test = train_test_split(X_final, y, test_size=0.2, random_state=42, stratify=y)

In [202]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC

# Decision Tree
dt_model = DecisionTreeClassifier(max_depth=5, random_state=42)
dt_model.fit(X_train, y_train)

# K-Nearest Neighbors
knn_model = KNeighborsClassifier(n_neighbors=5)
knn_model.fit(X_train, y_train)

# Support Vector Machine
svm_model = SVC(kernel='rbf', class_weight='balanced', probability=True, random_state=42)
svm_model.fit(X_train, y_train)

In [None]:
# Menambahkan class_weight='balanced'
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC

dt_model = DecisionTreeClassifier(class_weight='balanced', random_state=42)
svm_model = SVC(class_weight='balanced', probability=True, random_state=42)

In [208]:
from sklearn.model_selection import GridSearchCV
from sklearn.neighbors import KNeighborsClassifier

param_grid_knn = {'n_neighbors': list(range(3, 11))}
grid_knn = GridSearchCV(KNeighborsClassifier(), param_grid_knn, cv=5)
grid_knn.fit(X_resampled, y_resampled)
print("Best k:", grid_knn.best_params_)

Best k: {'n_neighbors': 3}


#**5. Interpretation & Evaluation**

In [203]:
from sklearn.metrics import classification_report, accuracy_score, roc_auc_score, confusion_matrix
from sklearn.preprocessing import label_binarize

def evaluate_model(model, X_test, y_test, model_name):
    y_pred = model.predict(X_test)
    acc = accuracy_score(y_test, y_pred)
    print(f"🔍 Akurasi Model {model_name}: {acc:.4f}")
    print(f"\n📄 Classification Report - {model_name}:\n{classification_report(y_test, y_pred)}")

    # AUC hanya dihitung jika lebih dari dua kelas
    if len(np.unique(y_test)) > 2:
        y_test_bin = label_binarize(y_test, classes=np.unique(y_test))
        if hasattr(model, "predict_proba"):
            y_proba = model.predict_proba(X_test)
            auc = roc_auc_score(y_test_bin, y_proba, average="macro", multi_class="ovr")
            print(f"📈 AUC Score ({model_name}): {auc:.4f}")
        else:
            print(f"⚠️ Model {model_name} tidak mendukung probabilitas, AUC tidak dihitung.")
    print("-" * 60)

In [204]:
# Evaluasi ketiga model
evaluate_model(dt_model, X_test, y_test, "Decision Tree")
evaluate_model(knn_model, X_test, y_test, "K-Nearest Neighbors")
evaluate_model(svm_model, X_test, y_test, "Support Vector Machine")

🔍 Akurasi Model Decision Tree: 0.4321

📄 Classification Report - Decision Tree:
              precision    recall  f1-score   support

      Normal       0.47      0.55      0.51       385
       Obese       0.42      0.55      0.48       393
  Overweight       0.40      0.20      0.26       339
 Underweight       0.23      0.15      0.18        47

    accuracy                           0.43      1164
   macro avg       0.38      0.36      0.36      1164
weighted avg       0.42      0.43      0.41      1164

📈 AUC Score (Decision Tree): 0.6592
------------------------------------------------------------
🔍 Akurasi Model K-Nearest Neighbors: 0.3308

📄 Classification Report - K-Nearest Neighbors:
              precision    recall  f1-score   support

      Normal       0.32      0.45      0.38       385
       Obese       0.34      0.37      0.36       393
  Overweight       0.34      0.19      0.25       339
 Underweight       0.00      0.00      0.00        47

    accuracy            

#**6. Visualisasi**

In [188]:
# ==============================================================================
# 6. Visualisasi
# ==============================================================================
# Menggunakan PCA untuk mereduksi dimensi data menjadi 2 komponen
pca = PCA(n_components=2)
df_pca = pca.fit_transform(df_scaled)
df_pca = pd.DataFrame(df_pca, columns=['PC1', 'PC2'])
df_pca['Cluster'] = clusters

# Membuat scatter plot dari hasil PCA
plt.figure(figsize=(12, 8))
sns.scatterplot(x='PC1', y='PC2', hue='Cluster', data=df_pca, palette='viridis', s=70, alpha=0.8, legend='full')

plt.title('Visualisasi Cluster Status Gizi dengan PCA')
plt.xlabel('Principal Component 1 (Komponen Utama 1)')
plt.ylabel('Principal Component 2 (Komponen Utama 2)')
plt.legend(title='Cluster')
plt.savefig('cluster_visualization.png')
plt.close()

print("Plot visualisasi cluster disimpan sebagai 'cluster_visualization.png'")
print("✅ Visualisasi selesai.")

Plot visualisasi cluster disimpan sebagai 'cluster_visualization.png'
✅ Visualisasi selesai.
