# In [None]:
# House Price Prediction Notebook

# =============================================================================
# Step 1: Import necessary libraries
# =============================================================================
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from src.data_preprocessing import load_data, preprocess_data
from src.model import train_model, evaluate_model

# =============================================================================
# Step 2: Load the data
# =============================================================================
# Read the training CSV through the project's loader helper.
train_df = load_data("data/train.csv")

# =============================================================================
# Step 3: Preprocess the data
# =============================================================================
# preprocess_data (project helper) yields a pair, which is unpacked into the
# feature set X and the target y used by every later step.
preprocessed = preprocess_data(train_df)
X, y = preprocessed

# =============================================================================
# Step 3.1: Exploratory Data Analysis (EDA)
# =============================================================================
# Correlation heatmap of the loaded data, to see how the columns relate to
# each other and to the target.
#
# Only numeric/bool columns are correlated. Since pandas 2.0, DataFrame.corr()
# defaults to numeric_only=False and raises if any column cannot be converted
# to float, so calling it on a frame with text columns fails outright.
# select_dtypes is used (rather than numeric_only=True) because it behaves the
# same on every pandas version.
# NOTE(review): assumes train_df is a pandas DataFrame that may contain
# non-numeric columns -- confirm against load_data.
numeric_df = train_df.select_dtypes(include=["number", "bool"])
corr = numeric_df.corr()

plt.figure(figsize=(12, 10))
sns.heatmap(corr, annot=True, fmt=".2f", cmap="coolwarm")
plt.title("Correlation Heatmap of the Dataset")
plt.show()

# Optional: histograms (30 bins each) for a handful of key columns.
important_features = ["SalePrice", "GrLivArea", "TotRmsAbvGrd", "YearBuilt"]
train_df[important_features].hist(bins=30, figsize=(12, 8))
plt.suptitle("Distributions of Important Features")
plt.show()

# =============================================================================
# Step 4: Split the data into training and testing sets
# =============================================================================
# Hold out 20% of the rows for testing; the fixed seed keeps the split
# reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=42,
    test_size=0.2,
)

# =============================================================================
# Step 5: Train the model
# =============================================================================
# Fit the project's model on the training portion only.
model = train_model(X_train, y_train)

# =============================================================================
# Step 6: Evaluate the model with multiple metrics
# =============================================================================
# evaluate_model scores the fitted model on the held-out portion; its result is
# iterated as a mapping of metric name -> value and printed one per line.
metrics = evaluate_model(model, X_test, y_test)
print("Evaluation Metrics:")
for metric_name, metric_value in metrics.items():
    print(f"{metric_name}: {metric_value}")

# =============================================================================
# Step 7: Visualize Feature Importance
# =============================================================================
from src.utils import plot_feature_importance

# Label the importances with the columns the model was actually fit on.
# The model was trained on X_train (derived from preprocess_data's output), not
# on the raw CSV, so raw column names can be misaligned with -- or differ in
# number from -- the model's features if preprocessing adds, drops or reorders
# columns.
feature_names = getattr(X_train, "columns", None)
if feature_names is None:
    # X_train carries no column labels (e.g. a plain array): fall back to the
    # raw dataset's columns, excluding the target column 'SalePrice'.
    feature_names = train_df.drop(columns=["SalePrice"]).columns
plot_feature_importance(model, feature_names)