In [3]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier, plot_tree
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

# ==========================================
# 1. LOAD AND EXPLORE THE DATA (EDA)
# ==========================================
# Use the Titanic dataset provided by seaborn.
df = sns.load_dataset('titanic')

print("--- 5 Data Teratas ---")
print(df.head())

print("\n--- Info Dataset ---")
# DataFrame.info() writes its summary to stdout itself and returns None, so
# wrapping it in print() would emit a stray "None" line after the summary.
df.info()

# Visualize the distribution of the target (survived vs. not survived).
# NOTE(review): recent seaborn releases warn when `palette` is given without
# `hue`; confirm the installed version before changing this call.
plt.figure(figsize=(6, 4))
sns.countplot(x='survived', data=df, palette='viridis')
plt.title('Distribusi Penumpang Selamat (1) dan Tidak Selamat (0)')
plt.show()

# ==========================================
# 2. DATA PREPROCESSING
# ==========================================
# Select the features considered relevant for the prediction.
features = ['pclass', 'sex', 'age', 'sibsp', 'parch', 'fare', 'embarked']
X = df[features].copy()
y = df['survived']

# Fix the 80% train / 20% test partition BEFORE computing any imputation
# statistic. Splitting row positions with the same test_size and random_state
# selects the same rows as splitting X and y directly, because the shuffle
# depends only on the number of samples.
row_positions = np.arange(len(X))
train_pos, test_pos = train_test_split(
    row_positions, test_size=0.2, random_state=42
)

# Handle missing values. The fill values are derived from the training rows
# only, so no information from the test rows leaks into the training features.
age_fill = X['age'].iloc[train_pos].median()
embarked_fill = X['embarked'].iloc[train_pos].mode()[0]
X['age'] = X['age'].fillna(age_fill)
X['embarked'] = X['embarked'].fillna(embarked_fill)

# Encode categorical columns as indicator columns. Encoding the full frame
# once keeps the train and test column sets identical.
X = pd.get_dummies(X, columns=['sex', 'embarked'], drop_first=True)

# ==========================================
# 3. SPLIT DATA (TRAINING & TESTING)
# ==========================================
# Materialize the partition chosen above: 80% train, 20% test.
X_train, X_test = X.iloc[train_pos], X.iloc[test_pos]
y_train, y_test = y.iloc[train_pos], y.iloc[test_pos]

# ==========================================
# 4. BUILD THE DECISION TREE MODEL
# ==========================================
# The depth is capped at 3 so the tree stays small, which limits overfitting.
tree_params = {'criterion': 'gini', 'max_depth': 3, 'random_state': 42}
model = DecisionTreeClassifier(**tree_params)
model.fit(X_train, y_train)

# ==========================================
# 5. MODEL EVALUATION
# ==========================================
# Predict on the held-out test features.
y_pred = model.predict(X_test)

print("\n--- Laporan Klasifikasi ---")
print(classification_report(y_test, y_pred))

# Confusion matrix drawn as an annotated heatmap; the y axis is labelled as
# the actual class and the x axis as the predicted class.
cm = confusion_matrix(y_test, y_pred)
cm_fig, cm_ax = plt.subplots(figsize=(6, 5))
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', ax=cm_ax)
cm_ax.set_xlabel('Prediksi')
cm_ax.set_ylabel('Aktual')
cm_ax.set_title('Confusion Matrix')
plt.show()

# ==========================================
# 6. DECISION TREE VISUALIZATION
# ==========================================
# Draw the fitted tree. The column labels are converted to a plain list
# because some scikit-learn releases validate `feature_names` as a list and
# reject a pandas Index with InvalidParameterError.
plt.figure(figsize=(20, 10))
plot_tree(model,
          feature_names=list(X.columns),
          class_names=['Die', 'Live'],
          filled=True,
          rounded=True,
          fontsize=12)
plt.title("Visualisasi Struktur Pohon Keputusan Titanic")
plt.show()

# Show the feature importances of the fitted tree, largest first.
raw_importance = pd.Series(data=model.feature_importances_, index=X.columns)
importance = raw_importance.sort_values(ascending=False)
print("\n--- Pentingnya Fitur (Feature Importance) ---")
print(importance)

ModuleNotFoundError: No module named 'pandas'