In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import math, kagglehub, shutil, os
import hdbscan
import plotly.express as px
from scipy.stats import pearsonr
from sklearn.preprocessing import StandardScaler # Wajib karena pakai K-Means
from sklearn.preprocessing import MinMaxScaler # Wajib karena pakai K-Means
from sklearn.cluster import KMeans # K-Means
from sklearn.metrics import silhouette_score # Silhoutte score
from sklearn.decomposition import PCA # Untuk reduksi dimensis
from mpl_toolkits.mplot3d import Axes3D
from IPython.display import HTML, display
import matplotlib.pyplot as plt

# Unduh dan Baca Dataset

In [None]:
# Download the latest version of the dataset via kagglehub; `path` holds the
# location returned by dataset_download and is reused by the copy step below.
# NOTE(review): the handle is spelled "global-daset" — confirm this matches the
# dataset slug before "correcting" it.
path = kagglehub.dataset_download("zubairdhuddi/global-daset")

print("Path to dataset files:", path)

In [None]:
# Destination folder in Colab
dst_path = "/content/dataset"
os.makedirs(dst_path, exist_ok=True)

# Copy the files from the download location into the working folder.
# dirs_exist_ok=True is required because dst_path was just created above.
shutil.copytree(path, dst_path, dirs_exist_ok=True)

print("Dataset telah dipindahkan ke:", dst_path)

In [None]:
# Load the CSV from the folder the copy step populated (dst_path) rather than
# from a path relative to the current working directory, so the cell works no
# matter where the kernel was started.
df = pd.read_csv(os.path.join(dst_path, 'global_disaster_response_2018_2024 (1).csv'))

# Cek Informasi dan Statistik Deskriptif

In [None]:
# Report the number of rows and list the column names
print(f"Jumlah Baris: {len(df)}")
print(f"Fitur: {df.columns.tolist()}")

In [None]:
# Show the first 5 rows
df.head()

In [None]:
# Basic dataset information (dtypes, non-null counts, memory usage)
df.info()

In [None]:
# Descriptive statistics of the numeric columns
df.describe()

# Cek Missing Value, Duplikasi dan Outlier

In [None]:
# Per-column missing-value summary: absolute count ('jumlah') and share of all
# rows ('persen'), ordered so the most incomplete columns come first.
missing = df.isna().sum().to_frame(name='jumlah')
missing['persen'] = missing['jumlah'] / len(df)
missing = missing.sort_values(by='persen', ascending=False)
missing.head(20)

In [None]:
# Count rows that are exact duplicates of an earlier row, then report whether
# any were found.
duplicate_rows = df.duplicated().sum()
print(f"Jumlah data duplikat: {duplicate_rows}")

print(
    "Ada data duplikat dalam dataset."
    if duplicate_rows > 0
    else "Tidak ada data duplikat dalam dataset."
)

In [None]:
# Outlier check on the numeric features: these are the columns scanned by the
# IQR loop further down in this cell.
numeric_cols = ['severity_index','casualties','economic_loss_usd',
                'response_time_hours','aid_amount_usd',
                'response_efficiency_score','recovery_days',
                'latitude', 'longitude']
# IQR-based outlier detection
def detect_outliers(df, column):
    """Return the rows whose ``column`` value falls outside the 1.5*IQR fences.

    Parameters
    ----------
    df : DataFrame holding the data to scan.
    column : name of the column to evaluate.

    Returns
    -------
    tuple
        ``(outliers, lower, upper)`` where ``outliers`` is the subset of ``df``
        strictly below ``lower`` or strictly above ``upper``. Rows whose value
        is NaN are never flagged, because both comparisons are False for NaN.
    """
    values = df[column]
    q1 = values.quantile(0.25)
    q3 = values.quantile(0.75)
    spread = 1.5 * (q3 - q1)
    lower, upper = q1 - spread, q3 + spread
    is_outlier = values.lt(lower) | values.gt(upper)
    return df[is_outlier], lower, upper

# Print the number of IQR outliers found for each numeric feature.
# The fences (lower/upper) are returned as well but only the count is reported.
for col in numeric_cols:
    outliers, lower, upper = detect_outliers(df, col)
    print(f"{col}: \n jumlah outlier = {len(outliers)}")

# Konversi dan Validasi Data Tanggal

In [None]:
# Parse the date column; errors='coerce' turns unparseable entries into NaT so
# they can be counted here and filtered out later.
df['date'] = pd.to_datetime(df['date'], errors='coerce', dayfirst=False)
print("Missing dates:", df['date'].isna().sum())

# Ekstraksi Fitur Waktu dan Perhitungan Vulnerability Index

In [None]:
# Feature engineering: calendar parts taken from the parsed event date
date_parts = df['date'].dt
df = df.assign(
    year=date_parts.year,
    month=date_parts.month,
    quarter=date_parts.quarter,
    dayofweek=date_parts.day_name(),
)

# Coerce the two score columns to numeric; unparseable entries become NaN
cols_to_numeric = ['response_efficiency_score', 'severity_index']
df[cols_to_numeric] = df[cols_to_numeric].apply(pd.to_numeric, errors='coerce')

# Vulnerability index = severity * casualties / response efficiency.
# Missing severity or casualties count as 0 in the numerator.
sev = df['severity_index'].fillna(0)
cas = df['casualties'].fillna(0)
resp = df['response_efficiency_score']
# An efficiency score of exactly 0 is swapped for the column median before
# dividing; NaN scores are left alone and therefore yield a NaN index.
resp_safe = resp.where(resp != 0, resp.median())
# Any remaining +/-inf result is converted to NaN.
df['vulnerability_index'] = ((sev * cas) / resp_safe).replace(
    [np.inf, -np.inf], np.nan
)

# Preview the engineered columns
preview_cols = ['date', 'year', 'month', 'disaster_type', 'severity_index',
                'casualties', 'response_efficiency_score', 'vulnerability_index']
df[preview_cols].head()


# Pembersihan dan Penanganan Outlier

In [None]:
# Coerce every numeric column to a numeric dtype; unparseable entries become NaN
numeric_features = {'severity_index','casualties','economic_loss_usd','response_time_hours','aid_amount_usd','response_efficiency_score','recovery_days','latitude','longitude'}
numeric_list = list(numeric_features)
df[numeric_list] = df[numeric_list].apply(pd.to_numeric, errors='coerce')

# Blank out illogical (negative) values in columns that cannot be negative
invalid_negative = ['casualties','economic_loss_usd','aid_amount_usd','recovery_days']
non_negative_block = df[invalid_negative]
df[invalid_negative] = non_negative_block.where(non_negative_block >= 0)

# Cap extreme outliers at the 99.5th percentile of each column
outlier_cols = ['economic_loss_usd','aid_amount_usd','casualties']
present_outlier_cols = [name for name in outlier_cols if name in df.columns]
for col in present_outlier_cols:
    batas_atas = df[col].quantile(0.995)
    df[col] = df[col].clip(upper=batas_atas)

# Drop rows that lack the critical fields (date or country)
has_key_fields = df['date'].notna() & df['country'].notna()
df = df.loc[has_key_fields].reset_index(drop=True)
print(f"After cleaning shape: {df.shape}")


# EDA - Time Series

In [None]:
# Number of disasters per year (groupby sorts the years ascending)
jml = df.groupby('year').size().reset_index(name='total')
fig1 = px.bar(jml, x='year', y='total', title='Jumlah Bencana per Tahun')
fig1.show()

# Average severity per year, computed with a single groupby instead of
# filtering the frame once per year and appending to a list.
avg = (
    df.groupby('year', as_index=False)['severity_index']
      .mean()
      .rename(columns={'severity_index': 'avg_sev'})
)
fig2 = px.line(avg, x='year', y='avg_sev', title='Rata-rata Severity per Tahun', markers=True)
fig2.show()

In [None]:
# Count disasters per country and keep the 15 most affected
negara = df['country'].value_counts()
negara_15 = negara.head(15)
df_negara = negara_15.rename_axis('country').reset_index(name='count')
fig = px.bar(
    df_negara,
    x='count',
    y='country',
    orientation='h',
    title='Top 15 Negara dengan Bencana Terbanyak',
)
# Order the bars by their total so the largest count ends up on top
fig.update_layout(yaxis={'categoryorder':'total ascending'})
fig.show()

# Count events per disaster type and show the split as a pie chart
jenis = df['disaster_type'].value_counts()
df_jenis = jenis.rename_axis('disaster_type').reset_index(name='count')
fig = px.pie(
    df_jenis,
    values='count',
    names='disaster_type',
    title='Distribusi Jenis Bencana',
)
fig.show()

# Visualisasi Geografis

In [None]:
# severity_index drives the marker size, and plotly rejects NaN sizes, so rows
# without a severity value are removed before sampling. When the column has no
# NaN this is a no-op and the sample is the same as before.
geo_source = df.dropna(subset=['severity_index'])

# Plot at most 1000 points to keep the interactive map responsive; the fixed
# seed makes the sample reproducible.
if len(geo_source) > 1000:
    d = geo_source.sample(1000, random_state=42)
else:
    d = geo_source.copy()

peta = px.scatter_geo(d, lat='latitude', lon='longitude', color='disaster_type',
                      size='severity_index', hover_name='country',
                      title='Sebaran Lokasi Bencana Dunia')
peta.update_geos(projection_type='natural earth')
peta.show()


# Visualisasi Distribusi Fitur Numerik


In [None]:
# Columns whose distributions are plotted, one histogram per subplot
numeric_features = [
    'severity_index', 'casualties', 'economic_loss_usd',
    'response_time_hours', 'aid_amount_usd', 'response_efficiency_score',
    'recovery_days', 'latitude', 'longitude',
]
cols = 3
rows = math.ceil(len(numeric_features) / cols)  # enough rows to fit every feature
fig = plt.figure(figsize=(18, 5 * rows))

for i, col in enumerate(numeric_features, 1):
    ax = fig.add_subplot(rows, cols, i)
    sns.histplot(df[col], bins=20, kde=True, color='skyblue',
                 edgecolor='black', ax=ax)
    ax.set_title(f'Distribusi {col}', fontsize=12, fontweight='bold')
    ax.set_xlabel(col)
    ax.set_ylabel('Frekuensi')

fig.tight_layout()
plt.show()

# Matriks Korelasi Antar Variabel

In [None]:
# Pairwise correlation between the impact / response measures
corr_cols = [
    'severity_index', 'casualties', 'economic_loss_usd', 'aid_amount_usd',
    'response_time_hours', 'response_efficiency_score', 'recovery_days',
]
corr_matrix = df[corr_cols].corr()

# Diverging palette centred on 0 so positive and negative correlations are
# visually symmetric
fig, ax = plt.subplots(figsize=(12, 8))
sns.heatmap(corr_matrix, annot=True, fmt='.2f', cmap='coolwarm', center=0, ax=ax)
ax.set_title('Correlation Matrix')
plt.show()

# K-Means

In [None]:
RANDOM_STATE = 42
MAX_CLUSTER_ROWS = 15000  # upper bound on the rows fed to K-Means
fkmeans = ['severity_index','casualties','economic_loss_usd','response_time_hours','recovery_days','response_efficiency_score']

# Complete cases only: K-Means cannot handle NaN
df_clean = df[fkmeans].dropna().copy()

# The sample size must be bounded by the rows that survive dropna(), not by the
# full frame; otherwise sample() raises ValueError as soon as any row was
# dropped and fewer than MAX_CLUSTER_ROWS rows remain.
cl_df = df_clean.sample(n=min(MAX_CLUSTER_ROWS, len(df_clean)), random_state=RANDOM_STATE)

# Standardise the features so no single scale dominates the distance metric
scaler = StandardScaler()
Xc = scaler.fit_transform(cl_df)
Xc_df = pd.DataFrame(Xc, columns=fkmeans)
for col in fkmeans:
    print(f"{col}: Mean={Xc_df[col].mean():.4f}, Std={Xc_df[col].std():.4f}")

In [None]:
# Elbow search: fit K-Means for each candidate K and record its inertia
K = range(2,10)
inertias = []
for k in K:
    km = KMeans(n_clusters=k, random_state=RANDOM_STATE, n_init=20).fit(Xc)
    inertias.append(km.inertia_)
    print(f"K={k}, Inertia={km.inertia_:.2f}")

In [None]:
# Elbow-method chart: inertia against the number of clusters
fig, ax = plt.subplots(figsize=(10, 6))
ax.plot(K, inertias, marker='o', linewidth=2, markersize=10, color='darkblue')
ax.set_title('Elbow Method: Menentukan K Optimal', fontsize=14, fontweight='bold')
ax.set_xlabel('Jumlah Cluster (K)', fontsize=12)
ax.set_ylabel('Inertia', fontsize=12)
ax.set_xticks(K)
ax.grid(alpha=0.3)

# Mark the K that was picked as the elbow (hard-coded to 4)
ax.axvline(x=4, color='red', linestyle='--', linewidth=2, label='Elbow (K=4)')
ax.legend()

fig.savefig('03_elbow_method.png', dpi=100, bbox_inches='tight')
plt.show()

In [None]:
# Fit K-Means with K=4, seeded with the notebook-wide RANDOM_STATE so this fit
# stays in sync with the elbow search instead of repeating a literal 42.
N_CLUSTERS = 4
kmeans = KMeans(n_clusters=N_CLUSTERS, random_state=RANDOM_STATE, n_init=20)
labels = kmeans.fit_predict(Xc)
cl_df['Cluster'] = labels

# Write the labels back to the full frame; rows that were not part of the
# clustering sample keep NaN.
df['Cluster'] = np.nan
df.loc[cl_df.index, 'Cluster'] = cl_df['Cluster']

# Per-cluster feature means, transposed so each cluster is a column
profile = cl_df.groupby('Cluster').mean().T
display(profile)

In [None]:
# Project the scaled features onto two principal components to visualise the
# cluster assignment in 2-D
pca = PCA(n_components=2, random_state=RANDOM_STATE)
pc = pca.fit_transform(Xc)

fig, ax = plt.subplots(figsize=(10, 7))
sns.scatterplot(x=pc[:, 0], y=pc[:, 1], hue=labels, palette='tab10',
                alpha=0.6, s=40, ax=ax)
ax.set_title('PCA projection of clusters')
plt.show()

In [None]:
# Mean of every clustering feature per cluster (rows still NaN in 'Cluster'
# are ignored by groupby)
cluster_summary = df.groupby('Cluster')[fkmeans].mean().round(2)
print("Karakteristik Setiap Cluster (Mean):")
print(cluster_summary)

# Heatmap of the cluster means, features on the rows and clusters on the columns.
# NOTE(review): the colour scale is shared across all features, so features
# with presumably much larger magnitudes may dominate it — confirm intent.
cluster_summary_T = cluster_summary.T
fig, ax = plt.subplots(figsize=(12, 6))
sns.heatmap(
    cluster_summary_T,
    annot=True,
    fmt='.2f',
    cmap='YlOrRd',
    cbar_kws={"shrink": 0.8},
    linewidths=1,
    ax=ax,
)
ax.set_title('Heatmap: Karakteristik Cluster (Mean Values)', fontsize=14, fontweight='bold')
ax.set_xlabel('Cluster')
ax.set_ylabel('Fitur')
fig.tight_layout()
fig.savefig('05_cluster_heatmap.png', dpi=100, bbox_inches='tight')
plt.show()