In [4]:
import pandas as pd
import numpy as np
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import RobustScaler, OneHotEncoder, PowerTransformer
from sklearn.pipeline import Pipeline
from sklearn.model_selection import KFold, cross_val_score
from sklearn.ensemble import RandomForestRegressor
from sklearn.feature_selection import SelectFromModel
import xgboost as xgb
import pickle

# Data preprocessing function remains the same
def preprocess_data(df):
    """Handle missing values and add new features."""
    df = df.copy()
    
    # Drop columns with high NaN ratio
    df.drop(columns=["PoolQC", "MiscFeature", "Alley", "Fence", "FireplaceQu"], inplace=True, errors="ignore")
    
    # Fill known missing values
    df["LotFrontage"] = df["LotFrontage"].fillna(df["LotFrontage"].median())
    df["GarageYrBlt"] = df["GarageYrBlt"].fillna(df["YearBuilt"])
    
    # Categorical columns
    cat_cols = df.select_dtypes(include="object").columns
    for col in cat_cols:
        df[col] = df[col].fillna("None")
    
    # Feature engineering
    df["TotalSF"] = df["TotalBsmtSF"] + df["1stFlrSF"] + df["2ndFlrSF"]
    df["TotalBath"] = df["FullBath"] + 0.5 * df["HalfBath"] + df["BsmtFullBath"] + 0.5 * df["BsmtHalfBath"]
    df["Age"] = df["YrSold"] - df["YearBuilt"]
    df["Remodeled"] = (df["YearBuilt"] != df["YearRemodAdd"]).astype(int)
    df["Qual*Area"] = df["GrLivArea"] * df["OverallQual"]
    
    # Log-transform skewed numeric features
    numeric_feats = df.select_dtypes(include=np.number).columns
    skewed_feats = df[numeric_feats].apply(lambda x: x.skew()).sort_values(ascending=False)
    high_skew = skewed_feats[skewed_feats > 0.75].index
    df[high_skew] = np.log1p(df[high_skew])
    
    return df

# Load and preprocess data
train_df = pd.read_csv("/Users/dubeaditya/Desktop/BF House Price Predicition/Data/train.csv")
test_df = pd.read_csv("/Users/dubeaditya/Desktop/BF House Price Predicition/Data/test.csv")

train_processed = preprocess_data(train_df)
test_processed = preprocess_data(test_df)

# Prepare feature lists
numeric_features = train_processed.select_dtypes(include=np.number).columns.tolist()
categorical_features = train_processed.select_dtypes(include="object").columns.tolist()

if "SalePrice" in numeric_features:
    numeric_features.remove("SalePrice")

# Create preprocessor
preprocessor = ColumnTransformer(
    transformers=[
        ('num', Pipeline(steps=[
            ('imputer', SimpleImputer(strategy='median')),
            ('scaler', RobustScaler()),
            ('transformer', PowerTransformer(method='yeo-johnson'))
        ]), numeric_features),
        ('cat', Pipeline(steps=[
            ('imputer', SimpleImputer(strategy='most_frequent')),
            ('encoder', OneHotEncoder(handle_unknown='ignore', sparse_output=False))
        ]), categorical_features)
    ])

# Prepare initial data
X = train_processed.drop(columns=["SalePrice"])
y = np.log1p(train_df["SalePrice"])
X_test = test_processed

# Fit preprocessor separately
print("Fitting preprocessor...")
preprocessor.fit(X)
X_transformed = preprocessor.transform(X)
X_test_transformed = preprocessor.transform(X_test)

# Get feature names after preprocessing
preprocessed_feature_names = (
    numeric_features + 
    preprocessor.named_transformers_['cat']
        .named_steps['encoder']
        .get_feature_names_out(categorical_features).tolist()
)

# Fit Random Forest for feature selection
print("Fitting Random Forest for feature selection...")
rf_model = RandomForestRegressor(n_estimators=100, random_state=42)
rf_model.fit(X_transformed, y)

# Select top 30 features
feature_selector = SelectFromModel(rf_model, max_features=30, threshold=-np.inf, prefit=True)
X_selected = feature_selector.transform(X_transformed)
X_test_selected = feature_selector.transform(X_test_transformed)

# Get selected feature names
selected_mask = feature_selector.get_support()
selected_features = [preprocessed_feature_names[i] for i in range(len(preprocessed_feature_names)) if selected_mask[i]]

print("Top 30 features selected:")
feature_importance_df = pd.DataFrame({
    'feature': preprocessed_feature_names,
    'importance': rf_model.feature_importances_
})
print(feature_importance_df.nlargest(30, 'importance'))

# Create XGBoost pipeline (no preprocessor since data is already transformed)
xgb_model = Pipeline(steps=[
    ('regressor', xgb.XGBRegressor(
        n_estimators=1000,
        learning_rate=0.05,
        max_depth=4,
        subsample=0.8,
        colsample_bytree=0.8,
        random_state=42
    ))
])

# Cross-validation with selected features
kf = KFold(n_splits=5, shuffle=True, random_state=42)
xgb_scores = -cross_val_score(xgb_model, X_selected, y, cv=kf, scoring="neg_mean_squared_error")

print(f"XGBoost CV Score with top 30 features: {np.mean(xgb_scores):.5f}")

# Train model on full training data with selected features
xgb_model.fit(X_selected, y)

# Make predictions
xgb_pred = np.expm1(xgb_model.predict(X_test_selected))

# Export results
output = pd.DataFrame({"Id": test_df["Id"], "SalePrice": xgb_pred})
output.to_csv("xgboost_predictions_rf_selection.csv", index=False)

# Save the trained model, preprocessor, and selected features
def save_model_and_features(model, preprocessor, selected_features):
    """Save trained XGBoost model, preprocessor, and selected features to pickle files"""
    with open('xgboost_model_rf_selection.pkl', 'wb') as f:
        pickle.dump(model, f)
    with open('preprocessor.pkl', 'wb') as f:
        pickle.dump(preprocessor, f)
    with open('selected_features.pkl', 'wb') as f:
        pickle.dump(selected_features, f)

# Save the model and features
save_model_and_features(xgb_model, preprocessor, selected_features)

Fitting preprocessor...
Fitting Random Forest for feature selection...
Top 30 features selected:
              feature  importance
41          Qual*Area    0.581971
37            TotalSF    0.118526
4         OverallQual    0.049219
12        TotalBsmtSF    0.018893
26         GarageCars    0.017128
9          BsmtFinSF1    0.016276
39                Age    0.012849
7        YearRemodAdd    0.010143
3             LotArea    0.009905
6           YearBuilt    0.008749
27         GarageArea    0.008651
38          TotalBath    0.008248
5         OverallCond    0.008085
46        MSZoning_RM    0.007540
226      CentralAir_N    0.006521
13           1stFlrSF    0.005922
16          GrLivArea    0.005103
11          BsmtUnfSF    0.005055
2         LotFrontage    0.004873
25        GarageYrBlt    0.004620
227      CentralAir_Y    0.004166
0                  Id    0.003744
14           2ndFlrSF    0.003674
237    KitchenQual_TA    0.002963
35             MoSold    0.002762
29        OpenPorch