# In [None]:
# Import library
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split, learning_curve
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.datasets import fetch_openml
import warnings
warnings.filterwarnings('ignore')

# Load the Boston Housing dataset from OpenML as a pandas DataFrame.
boston = fetch_openml(name='Boston', version=1, as_frame=True)
df = boston.frame.copy()  # copy so later edits never touch the fetched Bunch

# Make sure the target column is really called 'MEDV'. The previous
# rename({'MEDV': 'MEDV'}) was a no-op; here a differently-cased spelling
# (e.g. 'medv') is normalised, and a missing target fails early with a clear
# message instead of a bare KeyError further down.
if 'MEDV' not in df.columns:
    target_aliases = [col for col in df.columns if str(col).upper() == 'MEDV']
    if not target_aliases:
        raise KeyError("Target column 'MEDV' not found in the Boston dataset frame")
    df = df.rename(columns={target_aliases[0]: 'MEDV'})

# A bare `df.head()` is discarded unless it is the last expression of a
# notebook cell, so show the preview explicitly.
display(df.head())

# EDA: basic shape/dtype information followed by descriptive statistics.
n_rows_cols = df.shape
print('Jumlah baris, kolom:', n_rows_cols)

column_types = df.dtypes
print('\nTipe kolom:\n', column_types)

descriptive_stats = df.describe()
display(descriptive_stats)

# Pairwise correlation matrix of all columns.
corr = df.corr()

# Rank every column by the absolute value of its correlation with MEDV,
# strongest first (MEDV itself is part of this ranking).
medv_abs_corr = corr['MEDV'].abs()
corr_target = medv_abs_corr.sort_values(ascending=False)

strongest_corr = corr_target.head(10)
print('\nFitur paling berkorelasi dengan MEDV:\n', strongest_corr)

# Correlation heatmap drawn with plain matplotlib (imshow + colorbar).
heat_fig, heat_ax = plt.subplots(figsize=(10, 8))
heat_ax.set_title('Heatmap Korelasi Fitur (matplotlib)')
heat_img = heat_ax.imshow(corr, interpolation='nearest')
heat_fig.colorbar(heat_img, ax=heat_ax)

# One tick per column, labelled with the column name on both axes.
ticks = np.arange(len(corr.columns))
heat_ax.set_xticks(ticks)
heat_ax.set_xticklabels(corr.columns, rotation=90)
heat_ax.set_yticks(ticks)
heat_ax.set_yticklabels(corr.columns)

heat_fig.tight_layout()
plt.show()

# Scatter plots of MEDV against the 3 features most correlated with it.
# MEDV is removed by label rather than by assuming it sits at position 0 of
# the ranking: a tie at |r| == 1 or a NaN self-correlation would otherwise
# let the positional slice include MEDV itself or skip a real feature.
top_feats = corr_target.drop('MEDV').index[:3]
for feat in top_feats:
    plt.figure(figsize=(6,4))
    plt.scatter(df[feat], df['MEDV'], alpha=0.6)
    plt.xlabel(feat)
    plt.ylabel('MEDV')
    plt.title(f'MEDV vs {feat}')
    plt.tight_layout()
    plt.show()

# Split features and target
# MEDV is the regression target; every other column is a predictor.
y = df['MEDV']
X = df.drop('MEDV', axis=1)

# 80/20 train-test split with a fixed seed.
split_parts = train_test_split(X, y, test_size=0.2, random_state=42)
X_train, X_test, y_train, y_test = split_parts

# Standardise the features: the scaler is fitted on the training portion
# only and then applied to both portions.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Registry of the regressors to compare, keyed by a display name.
# Insertion order determines the order of the later training and plots.
models = {}
models['LinearRegression'] = LinearRegression()
models['Ridge'] = Ridge(alpha=1.0, random_state=42)
models['Lasso'] = Lasso(alpha=0.1, random_state=42, max_iter=10000)
models['RandomForest'] = RandomForestRegressor(n_estimators=100, random_state=42)

# Fit every model on the scaled training data, predict the test set, and
# keep the fitted estimator, its predictions and the error metrics.
results = {}
for name in models:
    model = models[name]
    y_pred = model.fit(X_train_scaled, y_train).predict(X_test_scaled)

    mse = mean_squared_error(y_test, y_pred)
    scores = {
        'MSE': mse,
        'RMSE': np.sqrt(mse),  # RMSE is derived from the MSE computed above
        'MAE': mean_absolute_error(y_test, y_pred),
        'R2': r2_score(y_test, y_pred),
    }
    results[name] = {'model': model, 'y_pred': y_pred, **scores}
    
# Metric summary table: one row per model, one column per metric,
# displayed sorted by RMSE (ascending).
metric_names = ['MSE', 'RMSE', 'MAE', 'R2']
metric_rows = [[results[key][metric] for metric in metric_names] for key in results]
summary = pd.DataFrame(metric_rows, index=list(results), columns=metric_names)
display(summary.sort_values('RMSE'))

# Actual vs predicted scatter for every model, with a reference line
# spanning the observed range of the test target.
for name, fitted in results.items():
    y_pred = fitted['y_pred']
    lo, hi = y_test.min(), y_test.max()

    avp_fig, avp_ax = plt.subplots(figsize=(6, 6))
    avp_ax.scatter(y_test, y_pred, alpha=0.6)
    avp_ax.plot([lo, hi], [lo, hi])  # points on this line are perfect predictions
    avp_ax.set_xlabel('Actual MEDV')
    avp_ax.set_ylabel('Predicted MEDV')
    avp_ax.set_title(f'Actual vs Predicted - {name}')
    avp_fig.tight_layout()
    plt.show()

# Residual plot (actual minus predicted, against predicted) for every model.
for name, fitted in results.items():
    y_pred = fitted['y_pred']
    residuals = y_test - y_pred

    res_fig, res_ax = plt.subplots(figsize=(6, 4))
    res_ax.scatter(y_pred, residuals, alpha=0.6)
    res_ax.axhline(0, linestyle='--')  # zero-error reference
    res_ax.set_xlabel('Predicted MEDV')
    res_ax.set_ylabel('Residuals')
    res_ax.set_title(f'Residual Plot - {name}')
    res_fig.tight_layout()
    plt.show()

# Learning curve for the chosen model (e.g. RandomForest) to inspect its learning behaviour
# Also imported at the top of the file; kept here so this section can be
# run on its own.
from sklearn.model_selection import learning_curve

# A fresh, unfitted forest: learning_curve fits its own clones per fold/size.
model = RandomForestRegressor(n_estimators=100, random_state=42)

# `random_state` is intentionally not passed to learning_curve: it is only
# consulted when shuffle=True, so with the default shuffle=False the former
# random_state=42 was silently ignored. Reproducibility comes from the
# estimator's own seed and the deterministic 5-fold split.
train_sizes, train_scores, valid_scores = learning_curve(
    model,
    X_train_scaled,
    y_train,
    cv=5,
    scoring='neg_mean_squared_error',
    train_sizes=np.linspace(0.1, 1.0, 5),
)

# Convert the scores (negated MSE) to RMSE: average over the CV folds
# (axis 1), flip the sign back, then take the square root.
train_rmse = np.sqrt(-train_scores.mean(axis=1))
valid_rmse = np.sqrt(-valid_scores.mean(axis=1))

# Plot training and validation RMSE against the number of training samples.
lc_fig, lc_ax = plt.subplots(figsize=(8, 5))
for curve, curve_label in ((train_rmse, 'Training RMSE'), (valid_rmse, 'Validation RMSE')):
    lc_ax.plot(train_sizes, curve, marker='o', label=curve_label)
lc_ax.set_xlabel('Jumlah Data Pelatihan')
lc_ax.set_ylabel('RMSE')
lc_ax.set_title('Learning Curve - Random Forest')
lc_ax.legend()
lc_fig.tight_layout()
plt.show()

# Feature importances of the Random Forest fitted in the comparison loop,
# labelled with the feature names and sorted from most to least important.
rf = results['RandomForest']['model']
importances = rf.feature_importances_
unsorted_imp = pd.Series(data=importances, index=X.columns)
feat_imp = unsorted_imp.sort_values(ascending=False)

leading_imp = feat_imp.head(10)
display(leading_imp)

# Bar chart of the ten largest feature importances.
top_imp = feat_imp.head(10)  # computed once and reused for bars and labels
bar_positions = range(len(top_imp))

imp_fig, imp_ax = plt.subplots(figsize=(8, 4))
imp_ax.bar(bar_positions, top_imp.values)
imp_ax.set_xticks(bar_positions)
imp_ax.set_xticklabels(top_imp.index, rotation=45)
imp_ax.set_title('Top 10 Feature Importances - Random Forest')
imp_fig.tight_layout()
plt.show()

# Short conclusion: show the metric table again, then the interpretation notes.
print('Ringkasan metrik:')
display(summary.sort_values('RMSE'))

print('\nInterpretasi singkat:')
interpretation_notes = (
    '- Perbandingan metrik membantu memilih model terbaik berdasarkan RMSE atau R2.',
    '- Random Forest biasanya memberikan performa baik pada dataset ini, namun Linear/Ridge/Lasso berguna untuk interpretabilitas.',
    '- Untuk deployment, pertimbangkan regularisasi dan validasi silang lebih lanjut serta tuning hyperparameter.',
)
for note in interpretation_notes:
    print(note)