In [2]:
import pandas as pd
import numpy as np
import warnings
warnings.filterwarnings('ignore')

from sklearn.linear_model import Ridge, Lasso, ElasticNet
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.model_selection import cross_val_score, KFold
from sklearn.metrics import mean_squared_error
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor

print("Loading data...")
# NOTE(review): the file names suggest both CSVs were already cleaned/encoded
# by an earlier step — confirm before reusing this cell on raw data.
df = pd.read_csv('cleanedtrain.csv')
dft = pd.read_csv('cleanedtest.csv')

# Separate the target from the features. 'Id' is kept aside for the test set
# (it is needed for the submission file) and is not used as a feature.
y = df['SalePrice']
X = df.drop(columns=['SalePrice', 'Id'])
test_ids = dft['Id']
X_test = dft.drop(columns='Id')

print(f"Training data shape: {X.shape}")
print(f"Test data shape: {X_test.shape}")

# Cross-validation splitter shared by every model: 5 shuffled folds with a
# fixed seed so that all models are scored on the same splits.
kfolds = KFold(shuffle=True, random_state=42, n_splits=5)

def rmse_cv(model, X, y):
    """Return the cross-validated RMSE of ``model``, one value per fold.

    The folds come from the module-level ``kfolds`` splitter, so every model
    scored through this function is evaluated on the same splits.

    Args:
        model: Estimator passed straight to ``cross_val_score``.
        X: Feature matrix.
        y: Target values.

    Returns:
        Array of RMSE values, one entry per fold of ``kfolds``.
    """
    neg_mse_per_fold = cross_val_score(
        model, X, y, cv=kfolds, scoring="neg_mean_squared_error"
    )
    # The scorer yields negated MSE, so flip the sign before the square root.
    return np.sqrt(-neg_mse_per_fold)

# Registries keyed by model name: the fitted estimator and its per-fold CV RMSE
models = {}
scores = {}

print("\n" + "="*60)
print("Training dan evaluasi model...")
print("="*60)


def _fit_score_register(name, estimator):
    """Fit ``estimator`` on the full training data, cross-validate it and record it.

    This replaces the fit / score / store / print sequence that used to be
    repeated verbatim for every model.

    Args:
        name: Key under which the estimator and its scores are stored in the
            module-level ``models`` and ``scores`` dictionaries.
        estimator: Unfitted regressor; it is fitted in place on ``X``, ``y``.

    Returns:
        The per-fold RMSE array produced by ``rmse_cv``.
    """
    # Fit on all training rows: this fitted instance is the one used later
    # for test-set predictions.
    estimator.fit(X, y)
    fold_rmse = rmse_cv(estimator, X, y)
    models[name] = estimator
    scores[name] = fold_rmse
    print(f"   RMSE: {fold_rmse.mean():.6f} (+/- {fold_rmse.std():.6f})")
    return fold_rmse


# 1. Ridge Regression
print("\n1. Ridge Regression...")
ridge = Ridge(alpha=10.0, random_state=42)
ridge_score = _fit_score_register('Ridge', ridge)

# 2. Lasso Regression
print("\n2. Lasso Regression...")
lasso = Lasso(alpha=0.0005, random_state=42, max_iter=10000)
lasso_score = _fit_score_register('Lasso', lasso)

# 3. ElasticNet
print("\n3. ElasticNet...")
elastic = ElasticNet(alpha=0.0005, l1_ratio=0.9, random_state=42, max_iter=10000)
elastic_score = _fit_score_register('ElasticNet', elastic)

# 4. Gradient Boosting
print("\n4. Gradient Boosting...")
gbr = GradientBoostingRegressor(n_estimators=500, learning_rate=0.05,
                                max_depth=4, max_features='sqrt',
                                min_samples_leaf=15, min_samples_split=10,
                                loss='huber', random_state=42)
gbr_score = _fit_score_register('GradientBoosting', gbr)

# 5. XGBoost
print("\n5. XGBoost...")
xgb = XGBRegressor(n_estimators=500, learning_rate=0.05, max_depth=3,
                   min_child_weight=1, subsample=0.8, colsample_bytree=0.8,
                   reg_alpha=0.1, reg_lambda=1, random_state=42)
xgb_score = _fit_score_register('XGBoost', xgb)

# 6. LightGBM
print("\n6. LightGBM...")
lgbm = LGBMRegressor(n_estimators=500, learning_rate=0.05, max_depth=3,
                     num_leaves=31, min_child_samples=20, subsample=0.8,
                     colsample_bytree=0.8, reg_alpha=0.1, reg_lambda=1,
                     random_state=42, verbose=-1)
lgbm_score = _fit_score_register('LightGBM', lgbm)

# 7. Random Forest
print("\n7. Random Forest...")
rf = RandomForestRegressor(n_estimators=300, max_depth=15, min_samples_split=10,
                           min_samples_leaf=4, max_features='sqrt',
                           random_state=42, n_jobs=-1)
rf_score = _fit_score_register('RandomForest', rf)

# Score summary, listed from lowest to highest mean CV RMSE
print("\n" + "="*60)
print("RANGKUMAN SKOR MODEL")
print("="*60)
ranked_scores = sorted(scores.items(), key=lambda item: item[1].mean())
for name, score in ranked_scores:
    print(f"{name:20s}: {score.mean():.6f} (+/- {score.std():.6f})")

# Find the best model: the one with the smallest mean CV RMSE
best_model_name = min(scores, key=lambda label: scores[label].mean())
best_score = scores[best_model_name].mean()
best_model = models[best_model_name]
print(f"\n MODEL TERBAIK: {best_model_name} dengan RMSE: {best_score:.6f}")

# Ensemble: weighted average of the top models
print("\n" + "="*60)
print("MEMBUAT ENSEMBLE MODEL")
print("="*60)

# Keep the five models with the lowest mean CV RMSE
top_models = sorted(scores.items(), key=lambda item: item[1].mean())[:5]
print("\nTop 5 Model untuk Ensemble:")
for rank, (name, score) in enumerate(top_models, start=1):
    print(f"{rank}. {name}: {score.mean():.6f}")

# Predict with the ensemble
print("\nMembuat prediksi ensemble...")

# Each weight is inversely proportional to the model's mean RMSE (the smaller
# the RMSE, the larger the weight); the weights are then normalised to sum to 1.
inverse_rmse = np.array([1.0 / fold_rmse.mean() for _name, fold_rmse in top_models])
weights = inverse_rmse / inverse_rmse.sum()

# Accumulate the weighted predictions of the already-fitted estimators
ensemble_predictions = np.zeros(len(X_test))
print("\nWeight untuk setiap model:")
for (name, score), weight in zip(top_models, weights):
    print(f"  {name}: {weight:.4f}")
    ensemble_predictions += weight * models[name].predict(X_test)

# Build the submission from the ensemble predictions
print("\n" + "="*60)
print("MEMBUAT FILE SUBMISSION")
print("="*60)

# Convert back from the log transform.
# NOTE(review): this assumes 'SalePrice' in the training CSV was stored as
# log1p(price) — confirm against the cleaning step.
final_predictions = np.expm1(ensemble_predictions)

submission = pd.DataFrame({'Id': test_ids, 'SalePrice': final_predictions})
submission.to_csv('submission.csv', index=False)

# Quick sanity summary of what was written
price_low = final_predictions.min()
price_high = final_predictions.max()
price_median = np.median(final_predictions)
print("\n File 'submission.csv' berhasil dibuat!")
print(f"   Total prediksi: {len(submission)}")
print(f"   Range harga: ${price_low:,.2f} - ${price_high:,.2f}")
print(f"   Median harga: ${price_median:,.2f}")

separator = "="*60
print("\n" + separator)
print("SELESAI!")
print(separator)
print(f"Model terbaik: {best_model_name} (RMSE: {best_score:.6f})")
print("File submission siap diupload ke Kaggle!")
print(separator)

Loading data...
Training data shape: (1460, 212)
Test data shape: (1459, 212)

Training dan evaluasi model...

1. Ridge Regression...
   RMSE: 0.131096 (+/- 0.019637)

2. Lasso Regression...
   RMSE: 0.130628 (+/- 0.018590)

3. ElasticNet...
   RMSE: 0.130715 (+/- 0.018339)

4. Gradient Boosting...
   RMSE: 0.124430 (+/- 0.019022)

5. XGBoost...
   RMSE: 0.129254 (+/- 0.019351)

6. LightGBM...
   RMSE: 0.130002 (+/- 0.017541)

7. Random Forest...
   RMSE: 0.152528 (+/- 0.015796)

RANGKUMAN SKOR MODEL
GradientBoosting    : 0.124430 (+/- 0.019022)
XGBoost             : 0.129254 (+/- 0.019351)
LightGBM            : 0.130002 (+/- 0.017541)
Lasso               : 0.130628 (+/- 0.018590)
ElasticNet          : 0.130715 (+/- 0.018339)
Ridge               : 0.131096 (+/- 0.019637)
RandomForest        : 0.152528 (+/- 0.015796)

 MODEL TERBAIK: GradientBoosting dengan RMSE: 0.124430

MEMBUAT ENSEMBLE MODEL

Top 5 Model untuk Ensemble:
1. GradientBoosting: 0.124430
2. XGBoost: 0.129254
3. LightGBM: