In [None]:
# House Price Prediction Notebook

# =============================================================================
# Step 1: Import necessary libraries
# =============================================================================
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from src.data_preprocessing import load_data, preprocess_data
from src.model import train_model, evaluate_model
from src.utils import plot_feature_importance
from src.hyperparameter_tuning import tune_hyperparameters

# =============================================================================
# Step 2: Load the Data
# =============================================================================
# Read the training set from its CSV path via the project's loader helper.
train_df = load_data("data/train.csv")
# Debug aid: show every column name present in the loaded frame.
column_names = train_df.columns.tolist()
print("DataFrame columns:", column_names)

# =============================================================================
# Step 3: Data Preprocessing
# =============================================================================
# Preprocess the dataset. Per the pipeline description, preprocess_data handles
# missing values, encodes categorical variables, creates derived features, and
# scales numerical features (its implementation lives in src.data_preprocessing
# and is not visible here). It returns the feature matrix X and the target y.
X, y = preprocess_data(train_df)

# Later steps read X.columns, so X must end up as a labelled DataFrame.
# The raw column names are only correct labels when preprocessing kept the
# number of columns unchanged. Encoding / derived features normally change it,
# and forcing the raw names would then either raise (array input) or silently
# drop the engineered columns (DataFrame input, where `columns=` re-selects).
raw_feature_names = train_df.drop(columns='SalePrice').columns
if not isinstance(X, pd.DataFrame):
    # Array-like output: wrap it first, then decide which labels are valid.
    X = pd.DataFrame(X)
    if X.shape[1] == len(raw_feature_names):
        X.columns = raw_feature_names
    else:
        print(
            "Preprocessing changed the number of features "
            f"(got {X.shape[1]}, raw data has {len(raw_feature_names)}); "
            "using generic feature names."
        )
        X.columns = [f"feature_{i}" for i in range(X.shape[1])]
# else: X is already a DataFrame, so keep the column names preprocessing chose.

# =============================================================================
# Step 3.1: Exploratory Data Analysis (EDA)
# =============================================================================
# Generate a correlation heatmap to understand relationships between features and the target.
plt.figure(figsize=(12, 10))
# Correlation is only defined for numeric columns. Since pandas 2.0,
# DataFrame.corr() no longer drops non-numeric columns implicitly and raises on
# string/categorical data, so restrict the frame explicitly before correlating.
numeric_df = train_df.select_dtypes(include="number")
corr = numeric_df.corr()
sns.heatmap(corr, annot=True, fmt=".2f", cmap="coolwarm")
plt.title("Correlation Heatmap of the Dataset")
plt.show()

# Optionally, display distributions for key features
important_features = ["SalePrice", "GrLivArea", "TotRmsAbvGrd", "YearBuilt"]
# One histogram per listed column, 30 bins each, drawn on a shared figure.
train_df[important_features].hist(bins=30, figsize=(12, 8))
plt.suptitle("Distributions of Important Features")
plt.show()

# =============================================================================
# Step 4: Data Partitioning
# =============================================================================
# Hold out 20% of the samples for testing and keep the remaining 80% for
# training; the fixed random_state makes the shuffle reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# =============================================================================
# Step 5: Hyperparameter Tuning
# =============================================================================
# Delegate the search to the project's tuning module, giving it only the
# training split. Whatever it returns is used as the model in later steps.
best_model = tune_hyperparameters(
    X_train,
    y_train,
)

# =============================================================================
# Step 6: Model Evaluation
# =============================================================================
# Score the tuned model on the held-out split. evaluate_model is expected to
# return a name -> value mapping (MSE, MAE, RMSE and R² per the project notes).
metrics = evaluate_model(
    best_model,
    X_test,
    y_test,
)

# Report each metric on its own line under a short heading.
print("Evaluation Metrics:")
for metric, value in metrics.items():
    print(f"{metric}: {value}")

# =============================================================================
# Step 7: Visualize Feature Importance
# =============================================================================
# Plot how much each feature contributes according to the tuned model.
# The labels come from the columns of the preprocessed feature DataFrame X.
plot_feature_importance(
    best_model,
    X.columns,
)