In [65]:
"""
Housing Price Prediction - Model Training

This script trains a Linear Regression model on the preprocessed housing dataset 
to predict sale prices.
"""

'\nHousing Price Prediction - Model Training\n\nThis script trains a Linear Regression model on the preprocessed housing dataset \nto predict sale prices.\n'

In [66]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os
import joblib
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error
from sklearn.model_selection import cross_val_score


In [67]:
# Global plotting defaults for every figure created below: whitegrid style
# sheet, seaborn "deep" colour palette, 12x8 default figure size, 12pt font.
plt.style.use('seaborn-v0_8-whitegrid')
sns.set_palette("deep")
plt.rcParams.update({
    'figure.figsize': (12, 8),
    'font.size': 12,
})

In [68]:
def load_preprocessed_data(data_dir='../processed_data', version=1):
    """
    Load preprocessed training and testing data.

    Feature matrices are read from ``X_train_{version}.csv`` and
    ``X_test_{version}.csv``. If the CSV load fails, the loader falls back to
    ``X_train_{version}.npy`` / ``X_test_{version}.npy``. Targets are always
    taken from the first column of ``y_train_{version}.csv`` /
    ``y_test_{version}.csv``.

    Args:
        data_dir: Directory containing the preprocessed data files
        version: Version number of the preprocessed data

    Returns:
        X_train, X_test, y_train, y_test: Loaded data splits. The feature
        splits are DataFrames when loaded from CSV and numpy arrays when
        loaded from .npy; the targets are Series. When neither format can be
        loaded, both errors are printed and (None, None, None, None) is
        returned instead of raising.
    """
    def _data_path(name, extension):
        # Build "<data_dir>/<name>_<version>.<extension>".
        return os.path.join(data_dir, f'{name}_{version}.{extension}')

    def _load_target(name):
        # Only the first column of the target CSV is used.
        return pd.read_csv(_data_path(name, 'csv')).iloc[:, 0]

    try:
        # Try to load as CSV first (keeps the feature names)
        X_train = pd.read_csv(_data_path('X_train', 'csv'))
        X_test = pd.read_csv(_data_path('X_test', 'csv'))
        y_train = _load_target('y_train')
        y_test = _load_target('y_test')
    except Exception as csv_error:
        # `except Exception` rather than a bare `except`: KeyboardInterrupt and
        # SystemExit must propagate instead of triggering the fallback.
        try:
            # If CSV fails, try to load the features as numpy arrays
            X_train = np.load(_data_path('X_train', 'npy'))
            X_test = np.load(_data_path('X_test', 'npy'))
            y_train = _load_target('y_train')
            y_test = _load_target('y_test')
        except Exception as npy_error:
            # Report both failures; the CSV error is usually the root cause.
            print(f"Error loading data: {npy_error}")
            print(f"(CSV load failed first: {csv_error})")
            print("Check file paths and formats.")
            return None, None, None, None

    print(f"X_train shape: {X_train.shape}")
    print(f"X_test shape: {X_test.shape}")
    print(f"y_train shape: {y_train.shape}")
    print(f"y_test shape: {y_test.shape}")

    return X_train, X_test, y_train, y_test

In [69]:
print("=" * 50)
print("HOUSING PRICE PREDICTION - MODEL TRAINING")
print("=" * 50)

# Load data
print("\n1. LOADING PREPROCESSED DATA")
print("-" * 30)

X_train, X_test, y_train, y_test = load_preprocessed_data(data_dir='../processed_data', version=4)

# load_preprocessed_data reports failure by returning None for every split
# rather than raising. Stop here so the following cells do not fail later
# with an unrelated error on a None input.
if X_train is None:
    raise RuntimeError(
        "Preprocessed data (version 4) could not be loaded from "
        "'../processed_data'; see the messages printed above."
    )

HOUSING PRICE PREDICTION - MODEL TRAINING

1. LOADING PREPROCESSED DATA
------------------------------
X_train shape: (1168, 24)
X_test shape: (292, 24)
y_train shape: (1168,)
y_test shape: (292,)


In [70]:
def explore_features(X_train, y_train, save_plots=False, output_dir='../plots', version=1):
    """
    Explore preprocessed features and their correlation with the target.

    Prints the feature names and each feature's correlation with the target,
    then draws those correlations as a bar chart. Does nothing beyond printing
    a notice when X_train is not a DataFrame, because feature names are
    required.

    Args:
        X_train: Training features
        y_train: Training target values; its ``name`` is used as the target
            column label, falling back to 'SalePrice' when it has none
        save_plots: Whether to save plots instead of displaying them
        output_dir: Directory to save plots (if save_plots is True)
        version: Version number used in the saved plot's file name
            (``feature_correlations_{version}.png``)
    """
    if not isinstance(X_train, pd.DataFrame):
        print("X_train is a numpy array, feature names not available.")
        return

    # Display feature names
    print("Features after preprocessing:")
    print(X_train.columns.tolist())

    # Use the target's own name instead of assuming it is called 'SalePrice'.
    declared_name = getattr(y_train, 'name', None)
    target_name = 'SalePrice' if declared_name is None else declared_name

    # Pair features and target by position: drop both indexes so a
    # non-default index on either side cannot misalign rows in the concat.
    features = X_train.reset_index(drop=True)
    target = pd.Series(np.ravel(y_train), name=target_name)
    correlation_data = pd.concat([features, target], axis=1)
    correlations = (
        correlation_data.corr()[target_name]
        .sort_values(ascending=False)
        .drop(target_name)
    )

    # Display correlation with target
    print(f"\nFeature Correlations with {target_name}:")
    print(correlations)

    # Plot feature correlations on an explicitly created figure/axes
    fig, ax = plt.subplots(figsize=(10, 6))
    correlations.plot(kind='bar', ax=ax)
    ax.set_title('Feature Correlation with Sale Price', fontsize=14)
    ax.set_xlabel('Features', fontsize=12)
    ax.set_ylabel('Correlation Coefficient', fontsize=12)
    fig.tight_layout()

    if save_plots:
        # Create output directory if saving plots
        os.makedirs(output_dir, exist_ok=True)
        fig.savefig(os.path.join(output_dir, f'feature_correlations_{version}.png'))
        plt.close(fig)
    else:
        plt.show()

In [71]:
# Step 2: print the preprocessed feature names and their correlation with the
# target, saving the correlation bar chart under ../plots.
print("\n2. EXPLORING PREPROCESSED FEATURES", "-" * 30, sep="\n")
explore_features(
    X_train,
    y_train,
    save_plots=True,
    output_dir='../plots',
    version=4,
)


2. EXPLORING PREPROCESSED FEATURES
------------------------------
Features after preprocessing:
['LotArea', 'GrLivArea', 'OverallQuality', 'OverallCondition', 'TotalBsmtSF', 'GarageCars', 'HouseAge', 'TotalBath', 'HouseStyle_1.5Unf', 'HouseStyle_1Fam', 'HouseStyle_1Story', 'HouseStyle_2.5Fin', 'HouseStyle_2.5Unf', 'HouseStyle_2Story', 'HouseStyle_SFoyer', 'HouseStyle_SLvl', 'GarageType_BuiltIn', 'GarageType_Detchd', 'GarageType_Missing', 'GarageType_Other', 'SaleType_New', 'SaleType_Other', 'SaleType_WD', 'SaleCondition_Other']

Feature Correlations with SalePrice:
OverallQuality         0.796219
TotalBsmtSF            0.645338
GarageCars             0.635389
TotalBath              0.596834
GrLivArea              0.372400
SaleType_New           0.354580
LotArea                0.277866
GarageType_BuiltIn     0.241354
HouseStyle_2Story      0.224236
SaleCondition_Other    0.160139
HouseStyle_2.5Fin      0.055057
SaleType_Other        -0.007496
HouseStyle_1Fam       -0.007618
HouseStyle_

In [72]:
def train_linear_regression(X_train, y_train, save_plots=False, output_dir='../plots', version=1):
    """
    Train a linear regression model with cross-validation.

    Runs 5-fold cross-validation (scored by negative MSE, reported as RMSE),
    then fits the model on the full training set and reports its
    coefficients. When X_train is a DataFrame the coefficients are also drawn
    as a bar chart labelled with the feature names.

    Args:
        X_train: Training features
        y_train: Training target values
        save_plots: Whether to save plots instead of displaying them
        output_dir: Directory to save plots (if save_plots is True)
        version: Version number used in the saved plot's file name
            (``model_coefficients_{version}.png``)

    Returns:
        Trained linear regression model
    """
    # Initialize the model
    lr_model = LinearRegression()

    # Perform cross-validation to estimate model performance.
    # The scorer returns negative MSE, so negate before taking the root.
    cv_scores = cross_val_score(lr_model, X_train, y_train, cv=5, scoring='neg_mean_squared_error')
    rmse_scores = np.sqrt(-cv_scores)

    # Display cross-validation results
    print("Cross-Validation Results:")
    print(f"RMSE scores: {rmse_scores}")
    print(f"Mean RMSE: {rmse_scores.mean():.2f}")
    print(f"Standard deviation: {rmse_scores.std():.2f}")

    # Train the model on the full training set
    lr_model.fit(X_train, y_train)

    # Create output directory if saving plots
    if save_plots:
        os.makedirs(output_dir, exist_ok=True)

    # Display model coefficients
    if isinstance(X_train, pd.DataFrame):
        coefficients = pd.DataFrame({'Feature': X_train.columns, 'Coefficient': lr_model.coef_})
        coefficients = coefficients.sort_values('Coefficient', ascending=False)
        print("\nModel Coefficients:")
        print(coefficients)

        # Plot coefficients. DataFrame.plot opens a new figure unless it is
        # handed an Axes, which previously left the figure created here empty
        # and never closed. Passing ax=ax draws on the 10x6 figure itself.
        fig, ax = plt.subplots(figsize=(10, 6))
        coefficients.plot(x='Feature', y='Coefficient', kind='bar', ax=ax)
        ax.set_title('Linear Regression Coefficients', fontsize=14)
        ax.set_xlabel('Features', fontsize=12)
        ax.set_ylabel('Coefficient Value', fontsize=12)
        fig.tight_layout()

        if save_plots:
            fig.savefig(os.path.join(output_dir, f'model_coefficients_{version}.png'))
            plt.close(fig)
        else:
            plt.show()
    else:
        # No feature names available: print the raw coefficient array.
        print("\nModel Coefficients:")
        print(lr_model.coef_)

    return lr_model

In [73]:
# Step 3: cross-validate and fit the linear regression; the fitted model is
# kept in lr_model for the evaluation, importance and saving steps.
print("\n3. TRAINING LINEAR REGRESSION MODEL", "-" * 30, sep="\n")
lr_model = train_linear_regression(
    X_train,
    y_train,
    save_plots=True,
    output_dir='../plots',
    version=4,
)


3. TRAINING LINEAR REGRESSION MODEL
------------------------------
Cross-Validation Results:
RMSE scores: [35177.97559271 46053.89106003 40082.8238683  31772.28861744
 39786.14646867]
Mean RMSE: 38574.63
Standard deviation: 4847.49

Model Coefficients:
                Feature    Coefficient
0               LotArea  202820.440741
2        OverallQuality  195005.528815
4           TotalBsmtSF  170962.792745
1             GrLivArea  125615.426160
5            GarageCars   88771.631193
3      OverallCondition   47555.649615
7             TotalBath   39581.129041
20         SaleType_New   36313.390805
11    HouseStyle_2.5Fin   33410.462011
21       SaleType_Other   26355.745968
18   GarageType_Missing   26226.403156
16   GarageType_BuiltIn   21902.016305
22          SaleType_WD   12632.695810
13    HouseStyle_2Story    3690.163465
19     GarageType_Other   -4211.288146
17    GarageType_Detchd   -4914.359538
15      HouseStyle_SLvl   -5241.964395
23  SaleCondition_Other   -6120.443331
14   

<Figure size 1000x600 with 0 Axes>

In [74]:
def evaluate_model(model, X_test, y_test, save_plots=False, output_dir='../plots', version=1):
    """
    Score a fitted model on held-out data and draw diagnostic plots.

    Prints MSE, RMSE, MAE and R² for the test set, then produces three
    figures: predicted vs. actual values, residuals vs. predictions, and the
    residual distribution.

    Args:
        model: Trained model exposing ``predict``
        X_test: Test features
        y_test: Test target values
        save_plots: Whether to save plots instead of displaying them
        output_dir: Directory to save plots (if save_plots is True)
        version: Version number appended to each saved plot's file name

    Returns:
        Dictionary with the keys 'mse', 'rmse', 'mae' and 'r2'
    """
    def _emit_current_figure(filename):
        # Save the active figure into output_dir and close it, or show it.
        if save_plots:
            plt.savefig(os.path.join(output_dir, filename))
            plt.close()
        else:
            plt.show()

    predictions = model.predict(X_test)

    # Error metrics on the test set
    mse = mean_squared_error(y_test, predictions)
    rmse = np.sqrt(mse)
    mae = mean_absolute_error(y_test, predictions)
    r2 = r2_score(y_test, predictions)

    print("Model Performance on Test Set:")
    print(f"Mean Squared Error (MSE): {mse:.2f}")
    print(f"Root Mean Squared Error (RMSE): {rmse:.2f}")
    print(f"Mean Absolute Error (MAE): {mae:.2f}")
    print(f"R² Score: {r2:.4f}")

    # The target directory must exist before the first figure is written
    if save_plots:
        os.makedirs(output_dir, exist_ok=True)

    # Figure 1: predictions against the truth, with the y = x reference line
    lowest, highest = y_test.min(), y_test.max()
    plt.figure(figsize=(10, 6))
    plt.scatter(y_test, predictions, alpha=0.6)
    plt.plot([lowest, highest], [lowest, highest], 'r--')
    plt.xlabel('Actual Sale Price', fontsize=12)
    plt.ylabel('Predicted Sale Price', fontsize=12)
    plt.title('Predicted vs Actual Sale Prices', fontsize=14)
    plt.tight_layout()
    _emit_current_figure(f'predicted_vs_actual_{version}.png')

    # Figure 2: residuals (actual minus predicted) against the predictions
    errors = y_test - predictions
    plt.figure(figsize=(10, 6))
    plt.scatter(predictions, errors, alpha=0.6)
    plt.axhline(y=0, color='r', linestyle='--')
    plt.xlabel('Predicted Sale Price', fontsize=12)
    plt.ylabel('Residuals', fontsize=12)
    plt.title('Residual Plot', fontsize=14)
    plt.tight_layout()
    _emit_current_figure(f'residual_plot_{version}.png')

    # Figure 3: histogram of the residuals with a KDE overlay
    plt.figure(figsize=(10, 6))
    sns.histplot(errors, kde=True)
    plt.xlabel('Residuals', fontsize=12)
    plt.ylabel('Frequency', fontsize=12)
    plt.title('Distribution of Residuals', fontsize=14)
    plt.tight_layout()
    _emit_current_figure(f'residual_distribution_{version}.png')

    return {'mse': mse, 'rmse': rmse, 'mae': mae, 'r2': r2}

In [75]:
# Step 4: score the fitted model on the held-out split; the metrics dict is
# kept in evaluation_metrics for the final summary.
print("\n4. EVALUATING MODEL ON TEST SET", "-" * 30, sep="\n")
evaluation_metrics = evaluate_model(
    lr_model,
    X_test,
    y_test,
    save_plots=True,
    output_dir='../plots',
    version=4,
)


4. EVALUATING MODEL ON TEST SET
------------------------------
Model Performance on Test Set:
Mean Squared Error (MSE): 2178626095.30
Root Mean Squared Error (RMSE): 46675.75
Mean Absolute Error (MAE): 25431.97
R² Score: 0.6505


In [76]:
def analyze_feature_importance(model, X_train, save_plots=False, output_dir='../plots', version=1):
    """
    Analyze feature importance based on model coefficients.

    Importance is each coefficient's absolute value expressed as a percentage
    of the sum of all absolute coefficients. Does nothing beyond printing a
    notice when X_train is not a DataFrame, because feature names are
    required.

    NOTE(review): coefficient magnitudes are only comparable across features
    that share a scale — confirm the preprocessing step scales the features
    before reading these percentages as importance.

    Args:
        model: Trained model exposing ``coef_``
        X_train: Training features
        save_plots: Whether to save plots instead of displaying them
        output_dir: Directory to save plots (if save_plots is True)
        version: Version number used in the saved plot's file name
            (``feature_importance_{version}.png``)
    """
    if not isinstance(X_train, pd.DataFrame):
        print("X_train is a numpy array, feature importance analysis requires feature names.")
        return

    # Calculate absolute contribution of each feature. ravel() flattens a
    # (1, n_features) coefficient array so it can become a DataFrame column.
    importance = np.abs(np.ravel(model.coef_))

    # Normalize to get relative importance. If every coefficient is zero the
    # division would be 0/0 and fill the table with NaN; report 0% instead.
    total = importance.sum()
    if total == 0:
        importance = np.zeros_like(importance, dtype=float)
    else:
        importance = 100.0 * (importance / total)

    # Create DataFrame for visualization
    importance_df = pd.DataFrame({
        'Feature': X_train.columns,
        'Importance': importance
    })
    importance_df = importance_df.sort_values('Importance', ascending=False)

    # Display feature importance
    print("Feature Importance:")
    print(importance_df)

    # Create output directory if saving plots
    if save_plots:
        os.makedirs(output_dir, exist_ok=True)

    # Plot feature importance
    plt.figure(figsize=(10, 6))
    sns.barplot(x='Importance', y='Feature', data=importance_df)
    plt.title('Feature Importance (Absolute Coefficient Value)', fontsize=14)
    plt.xlabel('Importance (%)', fontsize=12)
    plt.tight_layout()

    if save_plots:
        plt.savefig(os.path.join(output_dir, f'feature_importance_{version}.png'))
        plt.close()
    else:
        plt.show()


In [77]:
# Step 5: rank features by the relative size of their fitted coefficients and
# save the resulting bar chart under ../plots.
print("\n5. ANALYZING FEATURE IMPORTANCE", "-" * 30, sep="\n")
analyze_feature_importance(
    lr_model,
    X_train,
    save_plots=True,
    output_dir='../plots',
    version=4,
)


5. ANALYZING FEATURE IMPORTANCE
------------------------------
Feature Importance:
                Feature  Importance
0               LotArea   17.973551
2        OverallQuality   17.281009
4           TotalBsmtSF   15.150389
1             GrLivArea   11.131794
5            GarageCars    7.866768
3      OverallCondition    4.214289
7             TotalBath    3.507602
20         SaleType_New    3.218022
11    HouseStyle_2.5Fin    2.960770
21       SaleType_Other    2.335595
18   GarageType_Missing    2.324133
8     HouseStyle_1.5Unf    2.039530
16   GarageType_BuiltIn    1.940914
12    HouseStyle_2.5Unf    1.532447
9       HouseStyle_1Fam    1.349524
22          SaleType_WD    1.119485
6              HouseAge    0.694162
10    HouseStyle_1Story    0.654971
14    HouseStyle_SFoyer    0.562422
23  SaleCondition_Other    0.542382
15      HouseStyle_SLvl    0.464533
17    GarageType_Detchd    0.435501
19     GarageType_Other    0.373196
13    HouseStyle_2Story    0.327015


In [78]:
def save_model(model, version=1, models_dir='../models'):
    """
    Persist a trained model to disk with joblib.

    The file is written to
    ``<models_dir>/linear_regression_model_v<version>.pkl``; the directory is
    created first when it does not exist yet.

    Args:
        model: Model to save
        version: Version number to use in the saved model filename
        models_dir: Directory to save the model
    """
    # Work out the destination before touching the filesystem
    model_filename = f'linear_regression_model_v{version}.pkl'
    model_path = os.path.join(models_dir, model_filename)

    # Make sure the target directory exists, then serialize the model
    os.makedirs(models_dir, exist_ok=True)
    joblib.dump(model, model_path)

    print(f"Model saved to {model_path}")

In [79]:
# Step 6: write the fitted model to ../models as version 4.
print("\n6. SAVING MODEL", "-" * 30, sep="\n")
save_model(
    lr_model,
    version=4,
    models_dir='../models',
)


6. SAVING MODEL
------------------------------
Model saved to ../models\linear_regression_model_v4.pkl


In [80]:
# Step 7: closing summary, quoting the test-set R² from evaluation_metrics.
print(
    "\n7. SUMMARY",
    "-" * 30,
    f"Model training complete with R² score of {evaluation_metrics['r2']:.4f} on test set.",
    "The model has been saved and can be used for predictions.",
    sep="\n",
)


7. SUMMARY
------------------------------
Model training complete with R² score of 0.6505 on test set.
The model has been saved and can be used for predictions.
