In [1]:
import os
from pathlib import Path

import joblib
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.model_selection import train_test_split

In [4]:
# Load the preprocessed data and the persisted preprocessing pipeline.
# The project root defaults to the original development location but can be
# redirected with the HOUSE_PRICE_PROJECT_DIR environment variable, so the
# notebook is not tied to one machine's absolute paths.
PROJECT_DIR = Path(
    os.environ.get(
        "HOUSE_PRICE_PROJECT_DIR",
        "C:\\Users\\User\\PycharmProjects\\house-price-prediction-fdm",
    )
)
PROCESSED_DATA_PATH = PROJECT_DIR / "house_prices_processed.csv"
PREPROCESSOR_PATH = PROJECT_DIR / "models" / "preprocessor.pkl"

df_processed = pd.read_csv(PROCESSED_DATA_PATH)
print("Processed data shape:", df_processed.shape)

# NOTE: joblib.load unpickles the file, which can execute arbitrary code;
# only load artifacts that were produced by this project.
preprocessor = joblib.load(PREPROCESSOR_PATH)
print("Preprocessor loaded successfully")

Processed data shape: (177086, 14)
Preprocessor loaded successfully


In [5]:
# Split the processed frame into the target vector (the price column)
# and the feature matrix (every remaining column).
y = df_processed["Price (in rupees)"]
X = df_processed.drop(columns=["Price (in rupees)"])

print("Features shape:", X.shape)
print("Target shape:", y.shape)

Features shape: (177086, 13)
Target shape: (177086,)


In [6]:
# Hold out a fixed fraction of the rows for testing. The fixed seed makes the
# shuffled split reproducible across runs.
TEST_SIZE = 0.2
RANDOM_STATE = 42

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=TEST_SIZE, random_state=RANDOM_STATE, shuffle=True
)

# Sanity check: the two split sizes must add up to the full dataset.
assert len(X_train) + len(X_test) == len(X), "train/test split sizes do not add up"

print("Training set - Features:", X_train.shape, "Target:", y_train.shape)
print("Testing set - Features:", X_test.shape, "Target:", y_test.shape)

Training set - Features: (141668, 13) Target: (141668,)
Testing set - Features: (35418, 13) Target: (35418,)


In [7]:
# Run both splits through the preprocessor.
# NOTE(review): fit_transform re-fits the preprocessor that was loaded from
# disk using only X_train; the test split is then transformed with that
# re-fitted state. Confirm that re-fitting (rather than reusing the persisted
# fit) is intended, since the file on disk is not updated here.
print("Transforming training features...")
X_train_transformed = preprocessor.fit_transform(
    X_train
)

print("Transforming testing features...")
X_test_transformed = preprocessor.transform(
    X_test
)

# Report the resulting matrix shapes for both splits.
print(
    "Transformed training features shape:",
    X_train_transformed.shape,
)
print(
    "Transformed testing features shape:",
    X_test_transformed.shape,
)

Transforming training features...
Transforming testing features...
Transformed training features shape: (141668, 125)
Transformed testing features shape: (35418, 125)


In [8]:
# Build a LinearRegression estimator and fit it on the transformed
# training features; fit() returns the fitted estimator itself.
print("Training Linear Regression model...")
linear_model = LinearRegression().fit(X_train_transformed, y_train)

print("Linear Regression model training completed!")

Training Linear Regression model...
Linear Regression model training completed!


In [9]:
# Predict with the fitted model for the training split first, then the
# testing split, keeping one prediction array per split.
y_train_pred, y_test_pred = (
    linear_model.predict(features)
    for features in (X_train_transformed, X_test_transformed)
)

print("Predictions completed!")

Predictions completed!


In [10]:
def evaluate_model(y_true, y_pred, dataset_name):
    """Compute and print regression metrics for one dataset split.

    Args:
        y_true: Ground-truth target values.
        y_pred: Predicted target values to compare against ``y_true``.
        dataset_name: Label used in the printed report header.

    Returns:
        Tuple ``(mae, mse, rmse, r2)`` in that order.
    """
    mae = mean_absolute_error(y_true, y_pred)
    mse = mean_squared_error(y_true, y_pred)
    rmse = np.sqrt(mse)  # RMSE is derived from the MSE computed above
    r2 = r2_score(y_true, y_pred)

    # Assemble the whole report first, then emit it with a single print.
    report_lines = [
        f"\n{dataset_name} Set Performance:",
        f"Mean Absolute Error (MAE): {mae:.2f}",
        f"Mean Squared Error (MSE): {mse:.2f}",
        f"Root Mean Squared Error (RMSE): {rmse:.2f}",
        f"R² Score: {r2:.4f}",
    ]
    print("\n".join(report_lines))

    return mae, mse, rmse, r2

# Score the model on the data it was fitted on.
train_mae, train_mse, train_rmse, train_r2 = evaluate_model(
    y_true=y_train,
    y_pred=y_train_pred,
    dataset_name="Training",
)

# Score the model on the held-out split.
test_mae, test_mse, test_rmse, test_r2 = evaluate_model(
    y_true=y_test,
    y_pred=y_test_pred,
    dataset_name="Testing",
)


Training Set Performance:
Mean Absolute Error (MAE): 1574.86
Mean Squared Error (MSE): 4353764.80
Root Mean Squared Error (RMSE): 2086.57
R² Score: 0.4385

Testing Set Performance:
Mean Absolute Error (MAE): 1575.99
Mean Squared Error (MSE): 4337338.40
Root Mean Squared Error (RMSE): 2082.63
R² Score: 0.4415
