# In [None]:
import pandas as pd
import numpy as np
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder

# Read the test set from disk; later steps engineer features on it and
# take the submission 'Id' column from it
test_df = pd.read_csv(filepath_or_buffer='data/test.csv')

# Function to create new features (same as used for training)
def create_new_features(X):
    """Return a copy of ``X`` with the engineered feature columns appended.

    The input frame is not modified. The following columns are added:
    ``TotalBath``, ``TotalPorchSF``, ``LivAreaPerRoom``, ``HouseAge``,
    ``YearsSinceRemodel``, ``GarageAge``, ``IsRemodeled``,
    ``QualCondProduct``, ``ExterQual_num``, ``KitchenQual_num``,
    ``AvgQuality``, ``HasPool``, ``HasGarage``, ``HasBasement``,
    ``HasFireplace``, ``HasWoodDeck``, ``HasFence``, ``GarageRatio``,
    ``LivAreaRatio``, ``IsCornerLot``, ``SaleSeason``,
    ``HouseSizeCategory`` and one ``<name>_Missing`` indicator for each of
    ``LotFrontage``, ``MasVnrArea`` and ``GarageYrBlt``.

    Missing inputs are not imputed here: arithmetic features built from a
    NaN operand are NaN, and the ``Has*`` comparisons evaluate to 0 for NaN.

    Args:
        X: DataFrame containing the raw columns referenced below.

    Returns:
        A new DataFrame with the original columns plus the engineered ones.

    Raises:
        KeyError: if any of the referenced source columns is absent.
    """
    X_new = X.copy()

    # Total bathrooms (half baths count as 0.5)
    X_new['TotalBath'] = (
        X_new['FullBath']
        + X_new['HalfBath'] * 0.5
        + X_new['BsmtFullBath']
        + X_new['BsmtHalfBath'] * 0.5
    )

    # Total porch area
    X_new['TotalPorchSF'] = (
        X_new['OpenPorchSF']
        + X_new['EnclosedPorch']
        + X_new['3SsnPorch']
        + X_new['ScreenPorch']
    )

    # Living area per room
    X_new['LivAreaPerRoom'] = X_new['GrLivArea'] / X_new['TotRmsAbvGrd']

    # Ages measured at time of sale
    X_new['HouseAge'] = X_new['YrSold'] - X_new['YearBuilt']
    X_new['YearsSinceRemodel'] = X_new['YrSold'] - X_new['YearRemodAdd']
    X_new['GarageAge'] = X_new['YrSold'] - X_new['GarageYrBlt']

    # Was house remodeled?
    X_new['IsRemodeled'] = (X_new['YearRemodAdd'] != X_new['YearBuilt']).astype(int)

    # Overall quality * condition interaction
    X_new['QualCondProduct'] = X_new['OverallQual'] * X_new['OverallCond']

    # Quality scores (convert categorical to numerical); labels that are not
    # in the map, including NaN, become NaN
    quality_map = {'Ex': 5, 'Gd': 4, 'TA': 3, 'Fa': 2, 'Po': 1}
    X_new['ExterQual_num'] = X_new['ExterQual'].map(quality_map)
    X_new['KitchenQual_num'] = X_new['KitchenQual'].map(quality_map)

    # Average quality score
    X_new['AvgQuality'] = (
        X_new['OverallQual'] + X_new['ExterQual_num'] + X_new['KitchenQual_num']
    ) / 3

    # Has specific features
    X_new['HasPool'] = (X_new['PoolArea'] > 0).astype(int)
    X_new['HasGarage'] = (X_new['GarageArea'] > 0).astype(int)
    X_new['HasBasement'] = (X_new['TotalBsmtSF'] > 0).astype(int)
    X_new['HasFireplace'] = (X_new['Fireplaces'] > 0).astype(int)
    X_new['HasWoodDeck'] = (X_new['WoodDeckSF'] > 0).astype(int)

    # "No fence" can arrive either as the literal string 'NA' or as NaN
    # (pandas.read_csv turns the 'NA' token into NaN by default). A bare
    # `!= 'NA'` test is True for NaN, which would flag every house as fenced.
    fence = X_new['Fence']
    X_new['HasFence'] = (fence.notna() & (fence != 'NA')).astype(int)

    # Ratios relative to lot size
    X_new['GarageRatio'] = X_new['GarageArea'] / X_new['LotArea']
    X_new['LivAreaRatio'] = X_new['GrLivArea'] / X_new['LotArea']

    # Is corner lot
    X_new['IsCornerLot'] = (X_new['LotConfig'] == 'Corner').astype(int)

    # Sale season; any month outside 3-11 (including NaN) falls back to 'Winter'
    season_by_month = {
        3: 'Spring', 4: 'Spring', 5: 'Spring',
        6: 'Summer', 7: 'Summer', 8: 'Summer',
        9: 'Fall', 10: 'Fall', 11: 'Fall',
    }
    X_new['SaleSeason'] = X_new['MoSold'].apply(
        lambda month: season_by_month.get(month, 'Winter')
    )

    # House size categories (right-closed bins on above-ground living area)
    X_new['HouseSizeCategory'] = pd.cut(
        X_new['GrLivArea'],
        bins=[0, 1200, 1800, 2500, float('inf')],
        labels=['Small', 'Medium', 'Large', 'XLarge'],
    )

    # Missing value indicators
    missing_features = ['LotFrontage', 'MasVnrArea', 'GarageYrBlt']
    for feature in missing_features:
        X_new[f'{feature}_Missing'] = X_new[feature].isnull().astype(int)

    return X_new

# Function to encode categorical features (same as used in training)
def encode_categorical_features(X):
    """
    One-hot encode the object/category columns of X and pass the rest through.

    Numeric columns are median-imputed (each with its own median) on a copy of
    X before the transformer is fitted, so X itself is left untouched.

    Returns a tuple ``(encoded_df, preprocessor)`` where ``encoded_df`` is a
    DataFrame labelled with the transformer's output feature names and
    ``preprocessor`` is the fitted ColumnTransformer.
    """
    X_clean = X.copy()

    # Columns handed to the one-hot encoder
    categorical_cols = list(X.select_dtypes(include=['object', 'category']).columns)

    # Per-column medians of the numeric columns, used to fill their gaps
    numeric_medians = X_clean.select_dtypes(include=[np.number]).median()
    for col, median_value in numeric_medians.items():
        X_clean[col] = X_clean[col].fillna(median_value)

    # Categories not seen during fit are ignored at transform time
    one_hot = OneHotEncoder(handle_unknown='ignore', sparse_output=False)
    preprocessor = ColumnTransformer(
        transformers=[('cat', one_hot, categorical_cols)],
        remainder='passthrough',  # keep other (non-categorical) columns
    )

    encoded = preprocessor.fit_transform(X_clean)
    encoded_df = pd.DataFrame(encoded, columns=preprocessor.get_feature_names_out())
    return encoded_df, preprocessor

# Load the labelled training set and split the target from the predictors
df = pd.read_csv(filepath_or_buffer='data/train.csv')
y = df['SalePrice']
X = df.drop(columns=['SalePrice'])
# NOTE(review): X keeps every non-target column from the CSV, presumably
# including any row-identifier column -- confirm that is intended.

# Engineer features on the training predictors, then encode them; the fitted
# preprocessor is kept so the same transformation can be applied to test data
X_new_features = create_new_features(X)
final_df, preprocessor = encode_categorical_features(X_new_features)

# Hyperparameters passed to the gradient-boosting regressor
best_params = dict(
    n_estimators=221,
    learning_rate=0.13275517953802823,
    max_depth=3,
    min_samples_split=12,
    min_samples_leaf=3,
    subsample=0.7652505358562287,
    max_features=None,
    random_state=42,
)

# Fit the regressor on the encoded training frame
print("Training the GradientBoostingRegressor with best parameters...")
model = GradientBoostingRegressor(**best_params)
model.fit(final_df, y)

# Prepare test data
print("Preparing test data...")
# Apply same feature engineering to test data
X_test_features = create_new_features(test_df)

# Handle missing values in test data the same way as training: numeric gaps
# are filled with the TRAINING medians (the values encode_categorical_features
# used on the training frame), not medians recomputed from the test set.
# Re-deriving them from the test set would make train and test preprocessing
# differ and make predictions depend on the composition of the test batch.
train_medians = X_new_features.select_dtypes(include=[np.number]).median()
X_test_clean = X_test_features.copy()
numeric_cols = X_test_clean.select_dtypes(include=[np.number]).columns.tolist()
for col in numeric_cols:
    if col in train_medians.index and pd.notna(train_medians[col]):
        fill_value = train_medians[col]
    else:
        # No usable training median for this column; fall back to the test
        # median rather than leaving NaNs for the model
        fill_value = X_test_clean[col].median()
    X_test_clean[col] = X_test_clean[col].fillna(fill_value)

# Apply the preprocessor fitted on the training data (transform only, no refit)
test_transformed = preprocessor.transform(X_test_clean)
test_feature_names = preprocessor.get_feature_names_out()
test_final_df = pd.DataFrame(test_transformed, columns=test_feature_names)

# Score the prepared test frame with the fitted model
print("Making predictions...")
test_predictions = model.predict(test_final_df)

# Pair each test 'Id' with its predicted price and write the result to CSV
submission = pd.DataFrame({'Id': test_df['Id']}).assign(SalePrice=test_predictions)
submission.to_csv('submission.csv', index=False)

# Summary of what was written
print(f"Submission saved to 'submission.csv'")
print(f"Number of predictions: {len(test_predictions)}")
print(f"Prediction range: ${test_predictions.min():,.2f} - ${test_predictions.max():,.2f}")
print(f"Mean prediction: ${test_predictions.mean():,.2f}")

# Preview of the first ten rows
print("\nFirst 10 predictions:")
print(submission.iloc[:10])