In [78]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/aiffel-ds-3-house-prices/sample_submission.csv
/kaggle/input/aiffel-ds-3-house-prices/train.csv
/kaggle/input/aiffel-ds-3-house-prices/test.csv


In [79]:
import pandas as pd
import numpy as np
from sklearn.linear_model import Ridge
from sklearn.ensemble import StackingRegressor
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor
from catboost import CatBoostRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import PolynomialFeatures

train = pd.read_csv('/kaggle/input/aiffel-ds-3-house-prices/train.csv')
test = pd.read_csv("/kaggle/input/aiffel-ds-3-house-prices/test.csv")


# Step 1: Handle outliers in train dataset
def remove_outliers(df, columns):
    for col in columns:
        Q1 = df[col].quantile(0.25)
        Q3 = df[col].quantile(0.75)
        IQR = Q3 - Q1
        lower_bound = Q1 - 1.5 * IQR
        upper_bound = Q3 + 1.5 * IQR
        df[col] = np.clip(df[col], lower_bound, upper_bound)  # Winsorization
    return df

# Step 2: Data Preprocessing
def preprocess_data(train, test):
    # Log transformation of target
    train['SalePrice'] = np.log1p(train['SalePrice'])

    # Log transform skewed numeric features
    skewed_features = ['GrLivArea', '1stFlrSF', 'TotalBsmtSF', 'LotArea']
    for feature in skewed_features:
        train[feature] = np.log1p(train[feature])
        test[feature] = np.log1p(test[feature])
    
    # Add DatasetType for train/test separation
    train['DatasetType'] = 'train'
    test['DatasetType'] = 'test'
    combined = pd.concat([train, test], ignore_index=True)
    
    # Handle missing values
    numeric_cols = combined.select_dtypes(include=['float64', 'int64']).columns
    imputer = SimpleImputer(strategy='median')
    combined[numeric_cols] = imputer.fit_transform(combined[numeric_cols])
    
    # One-hot encoding for categorical columns
    categorical_cols = combined.select_dtypes(include=['object']).columns
    combined = pd.get_dummies(combined, columns=categorical_cols, drop_first=True)
    
    return combined

# Step 3: Feature Engineering
def feature_engineering(combined):
    # Fill missing values for required columns
    combined['YearBuilt'] = combined['YearBuilt'].fillna(0)
    combined['YearRemodAdd'] = combined['YearRemodAdd'].fillna(0)
    combined['GarageArea'] = combined['GarageArea'].fillna(0)
    combined['GrLivArea'] = combined['GrLivArea'].fillna(0)

    # Create key features
    combined['HouseAge'] = 2023 - combined['YearBuilt']  # Age of the house
    combined['RemodAddAge'] = 2023 - combined['YearRemodAdd']  # Age since last remodel
    combined['GrLivGarageRatio'] = combined['GrLivArea'] / (combined['GarageArea'] + 1)  # Avoid division by zero

    # Return the modified DataFrame
    return combined


# Step 4: Split Combined Data into Train and Test
def split_data(combined, train):
    train = combined[combined['DatasetType_train'] == 1].drop(columns=['DatasetType_train'])
    test = combined[combined['DatasetType_train'] == 0].drop(columns=['DatasetType_train', 'SalePrice'])
    X = train.drop(columns=['SalePrice'])
    y = train['SalePrice']
    return train_test_split(X, y, test_size=0.2, random_state=42), test

# Step 5: Train and Evaluate Models
def train_and_evaluate(X_train, X_val, y_train, y_val):
    # Ensure data is numpy array
    X_train = X_train.values if isinstance(X_train, pd.DataFrame) else X_train
    X_val = X_val.values if isinstance(X_val, pd.DataFrame) else X_val
    y_train = y_train.values if isinstance(y_train, pd.Series) else y_train
    y_val = y_val.values if isinstance(y_val, pd.Series) else y_val

    # Base models
    base_models = {
        'Ridge': Ridge(),
        'XGBoost': XGBRegressor(random_state=42, n_estimators=200, max_depth=5, learning_rate=0.1),
        'LightGBM': LGBMRegressor(random_state=42, n_estimators=200, max_depth=5, learning_rate=0.1),
        'CatBoost': CatBoostRegressor(random_state=42, iterations=200, depth=5, learning_rate=0.1, verbose=0)
    }

    # Train and evaluate base models
    best_base_model = None
    best_rmse = float('inf')
    for name, model in base_models.items():
        model.fit(X_train, y_train)
        preds = model.predict(X_val)
        rmse = mean_squared_error(y_val, preds, squared=False)
        print(f"{name} RMSE: {rmse}")
        if rmse < best_rmse:
            best_rmse = rmse
            best_base_model = model

    # Stacking model
    stacking_model = StackingRegressor(
        estimators=[
            ('Ridge', Ridge()),
            ('XGBoost', XGBRegressor(random_state=42, n_estimators=200, max_depth=5, learning_rate=0.1)),
            ('LightGBM', LGBMRegressor(random_state=42, n_estimators=200, max_depth=5, learning_rate=0.1)),
            ('CatBoost', CatBoostRegressor(random_state=42, iterations=200, depth=5, learning_rate=0.1, verbose=0))
        ],
        final_estimator=Ridge()
    )
    stacking_model.fit(X_train, y_train)
    stacking_preds = stacking_model.predict(X_val)
    stacking_rmse = mean_squared_error(y_val, stacking_preds, squared=False)
    print(f"Stacking Model RMSE: {stacking_rmse}")

    # Select best model (stacking or best base model)
    final_model = stacking_model if stacking_rmse < best_rmse else best_base_model
    print(f"Selected Model: {'Stacking' if final_model == stacking_model else 'Base Model'}")
    
    return final_model

# Step 6: Generate Submission
def generate_submission(model, test, filename='submission.csv'):
    test_preds = np.expm1(model.predict(test))  # Reverse log transformation
    submission = pd.DataFrame({'Id': test['Id'].astype(int), 'SalePrice': test_preds})
    submission.to_csv('submission9.csv', index=False)
 

# Main Execution
# train, test should be preloaded DataFrames
combined = preprocess_data(train, test)
combined = feature_engineering(combined)
(X_train, X_val, y_train, y_val), test = split_data(combined, train)
final_model = train_and_evaluate(X_train, X_val, y_train, y_val)
generate_submission(final_model, test)


Ridge RMSE: 0.13323861460209757
XGBoost RMSE: 0.13070491266597342
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000814 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 3566
[LightGBM] [Info] Number of data points in the train set: 934, number of used features: 152
[LightGBM] [Info] Start training from score 12.007004
LightGBM RMSE: 0.12401603807610416
CatBoost RMSE: 0.11954355274544899
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000781 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 3566
[LightGBM] [Info] Number of data points in the train set: 934, number of used features: 152
[LightGBM] [Info] Start training from score 12.007004
[LightGBM] [Info] Auto-choosing row-wise multi-threa