In [1]:
#import libraries
import joblib
import numpy as np
import pandas as pd
from sklearn.base import clone
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.tree import DecisionTreeRegressor

In [2]:
# Load the preprocessed dataset and separate the features from the target.
# The target column name lives in a single constant so the feature frame and the
# target vector are always derived from the same column name.
TARGET_COLUMN = "Price (in rupees)"

df_preprocessed = pd.read_csv("../house_prices_processed.csv")

# X: every column except the target; y: the target column only.
X = df_preprocessed.drop(columns=[TARGET_COLUMN])
y = df_preprocessed[TARGET_COLUMN]

In [3]:
# Restore the preprocessor object that was persisted with joblib
preprocessor = joblib.load(filename='../models/preprocessor.pkl')


In [4]:
# Hold out 20% of the rows for testing; the fixed seed keeps the split repeatable
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Report the shape of each feature split
print(
    f"Training set: {X_train.shape}",
    f"Testing set: {X_test.shape}",
    sep="\n",
)

Training set: (141668, 13)
Testing set: (35418, 13)


In [5]:
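# (Disabled) Manual transformation of the features with the preprocessor.
# Kept for reference only: the pipelines built below include the preprocessor as
# their first step, so fit/predict are called on the untransformed X_train / X_test.
#X_train_transformed = preprocessor.transform(X_train)
#X_test_transformed = preprocessor.transform(X_test)

#print(f"Transformed training features: {X_train_transformed.shape}")
#print(f"Transformed testing features: {X_test_transformed.shape}")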

In [6]:
# Configure the Decision Tree regressor (this cell only constructs it; no fitting here)
dt_model = DecisionTreeRegressor(
    max_depth=10,          # cap the tree depth to limit overfitting
    min_samples_leaf=10,
    min_samples_split=20,
    random_state=42,
)

In [7]:
# Build the Decision Tree pipeline: preprocessing first, then the regressor.
# Pipeline stores the step objects it is given without copying them, so passing
# `preprocessor` directly would let any other pipeline built from the same object
# share it and refit it. clone() gives this pipeline its own unfitted copy with
# identical parameters; it is fitted when the pipeline is fitted.
dt_pipeline = Pipeline([
    ("preprocessor", clone(preprocessor)),
    ("model", dt_model)
])

In [8]:
# Fit the whole Decision Tree pipeline (preprocessor + model) on the training split
dt_pipeline.fit(X=X_train, y=y_train)
print("Decision Tree model training completed!")

Decision Tree model training completed!


In [9]:
# Predict with the fitted Decision Tree pipeline: training split first, then test split
y_pred_dt_train, y_pred_dt = (
    dt_pipeline.predict(features) for features in (X_train, X_test)
)

In [10]:
# Score the Decision Tree on the training split (MAE, RMSE, R²)
dt_mae_train = mean_absolute_error(y_true=y_train, y_pred=y_pred_dt_train)
dt_rmse_train = np.sqrt(mean_squared_error(y_true=y_train, y_pred=y_pred_dt_train))
dt_r2_train = r2_score(y_true=y_train, y_pred=y_pred_dt_train)

print(
    "Decision Tree Train Results:",
    f"Train MAE: {dt_mae_train:.2f}",
    f"Train RMSE: {dt_rmse_train:.2f}",
    f"Train R² Score: {dt_r2_train:.4f}",
    sep="\n",
)

# Same three metrics on the held-out test split
dt_mae = mean_absolute_error(y_true=y_test, y_pred=y_pred_dt)
dt_rmse = np.sqrt(mean_squared_error(y_true=y_test, y_pred=y_pred_dt))
dt_r2 = r2_score(y_true=y_test, y_pred=y_pred_dt)

print(
    "\nDecision Tree Test Results:",
    f"Test MAE: {dt_mae:.2f}",
    f"Test RMSE: {dt_rmse:.2f}",
    f"Test R² Score: {dt_r2:.4f}",
    sep="\n",
)


Decision Tree Train Results:
Train MAE: 953.01
Train RMSE: 1511.02
Train R² Score: 0.7056

Decision Tree Test Results:
Test MAE: 967.59
Test RMSE: 1545.06
Test R² Score: 0.6926


In [11]:
# (Disabled) Feature importance analysis for the Decision Tree
# NOTE(review): the code below assumes the preprocessor exposes transformers named
# 'num' and 'cat', with a 'onehot' step inside 'cat' — confirm before re-enabling.
# Get feature names after preprocessing
#feature_names = []
#for name, transformer, features in preprocessor.transformers_:
    #if name == 'num':
        #feature_names.extend(features)
    #elif name == 'cat':
        # Get one-hot encoded feature names
        #cat_features = preprocessor.named_transformers_['cat'].named_steps['onehot'].get_feature_names_out(features)
        #feature_names.extend(cat_features)

# Decision Tree Feature Importance
#dt_importance_df = pd.DataFrame({
    #'feature': feature_names,
    #'importance': dt_model.feature_importances_
#}).sort_values('importance', ascending=False)

#print("Top 15 Most Important Features (Decision Tree):")
#print(dt_importance_df.head(15))

In [12]:
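# (Disabled) Plot Decision Tree feature importance
# NOTE(review): `plt` and `sns` are not imported in the visible import cell, and
# `dt_importance_df` is only created by the disabled feature-importance cell;
# both must be restored before re-enabling this plot.
#plt.figure(figsize=(12, 8))
#sns.barplot(data=dt_importance_df.head(15), y='feature', x='importance', palette='viridis')
#plt.title('Top 15 Feature Importances - Decision Tree', fontsize=16, fontweight='bold')
#plt.xlabel('Importance Score', fontsize=12)
#plt.ylabel('Features', fontsize=12)
#plt.tight_layout()
#plt.show()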

In [13]:
# Persist the fitted Decision Tree pipeline with joblib
joblib.dump(value=dt_pipeline, filename='../models/decision_tree_pipeline.pkl')
print("Decision Tree pipeline saved as 'decision_tree_pipeline.pkl'")

Decision Tree pipeline saved as 'decision_tree_pipeline.pkl'


In [14]:
# Configure the Random Forest regressor (this cell only constructs it; no fitting here)
rf_model = RandomForestRegressor(
    n_estimators=100,
    max_depth=15,
    min_samples_leaf=5,
    min_samples_split=15,
    n_jobs=-1,             # use all available cores
    random_state=42,
)


In [15]:
# Build the Random Forest pipeline: preprocessing first, then the regressor.
# Pipeline stores the step objects it is given without copying them, so passing
# `preprocessor` directly would let any other pipeline built from the same object
# share it and refit it. clone() gives this pipeline its own unfitted copy with
# identical parameters; it is fitted when the pipeline is fitted.
rf_pipeline = Pipeline([
    ("preprocessor", clone(preprocessor)),
    ("model", rf_model)
])

In [16]:
# Fit the whole Random Forest pipeline (preprocessor + model) on the training split
rf_pipeline.fit(X=X_train, y=y_train)
print("Random Forest model training completed!")

Random Forest model training completed!


In [17]:
# Predict with the fitted Random Forest pipeline: training split first, then test split
y_pred_rf_train, y_pred_rf = (
    rf_pipeline.predict(features) for features in (X_train, X_test)
)

In [18]:
# Score the Random Forest on the training split (MAE, RMSE, R²)
rf_mae_train = mean_absolute_error(y_true=y_train, y_pred=y_pred_rf_train)
rf_rmse_train = np.sqrt(mean_squared_error(y_true=y_train, y_pred=y_pred_rf_train))
rf_r2_train = r2_score(y_true=y_train, y_pred=y_pred_rf_train)

print(
    "Random Forest Train Results:",
    f"Train MAE: {rf_mae_train:.2f}",
    f"Train RMSE: {rf_rmse_train:.2f}",
    f"Train R² Score: {rf_r2_train:.4f}",
    sep="\n",
)

# Same three metrics on the held-out test split
rf_mae = mean_absolute_error(y_true=y_test, y_pred=y_pred_rf)
rf_rmse = np.sqrt(mean_squared_error(y_true=y_test, y_pred=y_pred_rf))
rf_r2 = r2_score(y_true=y_test, y_pred=y_pred_rf)

print(
    "\nRandom Forest Test Results:",
    f"Test MAE: {rf_mae:.2f}",
    f"Test RMSE: {rf_rmse:.2f}",
    f"Test R² Score: {rf_r2:.4f}",
    sep="\n",
)


Random Forest Train Results:
Train MAE: 749.56
Train RMSE: 1266.57
Train R² Score: 0.7931

Random Forest Test Results:
Test MAE: 773.49
Test RMSE: 1316.56
Test R² Score: 0.7768


In [19]:
# (Disabled) Feature importance analysis for the Random Forest
# NOTE(review): the code below assumes the preprocessor exposes transformers named
# 'num' and 'cat', with a 'onehot' step inside 'cat' — confirm before re-enabling.
# Get feature names after preprocessing (if not already defined)
#feature_names = []
#for name, transformer, features in preprocessor.transformers_:
    #if name == 'num':
        #feature_names.extend(features)
    #elif name == 'cat':
        # Get one-hot encoded feature names
        #cat_features = preprocessor.named_transformers_['cat'].named_steps['onehot'].get_feature_names_out(features)
        #feature_names.extend(cat_features)

# Random Forest Feature Importance
#rf_importance_df = pd.DataFrame({
    #'feature': feature_names,
    #'importance': rf_model.feature_importances_
#}).sort_values('importance', ascending=False)

#print("Top 15 Most Important Features (Random Forest):")
#print(rf_importance_df.head(15))

In [20]:
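# (Disabled) Plot Random Forest feature importance
# NOTE(review): `plt` and `sns` are not imported in the visible import cell, and
# `rf_importance_df` is only created by the disabled feature-importance cell;
# both must be restored before re-enabling this plot.
#plt.figure(figsize=(12, 8))
#sns.barplot(data=rf_importance_df.head(15), y='feature', x='importance', palette='plasma')
#plt.title('Top 15 Feature Importances - Random Forest', fontsize=16, fontweight='bold')
#plt.xlabel('Importance Score', fontsize=12)
#plt.ylabel('Features', fontsize=12)
#plt.tight_layout()
#plt.show()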

In [21]:
# Persist the fitted Random Forest pipeline with joblib
joblib.dump(value=rf_pipeline, filename='../models/random_forest_pipeline.pkl')
print("Random Forest pipeline saved as 'random_forest_pipeline.pkl'")

Random Forest pipeline saved as 'random_forest_pipeline.pkl'
