### Problem Statement:
For a utility company looking to optimize energy consumption and reduce costs, predicting energy usage accurately is crucial. The client wishes to develop a machine learning model that can forecast energy usage based on various parameters such as temperature, humidity, wind speed, and time of day. By accurately predicting energy demand, the client can optimize resource allocation, schedule maintenance, and improve overall operational efficiency.

### 1. Linear Regression

In [18]:
# Import necessary libraries
import numpy as np  # For numerical operations
import pandas as pd  # For data manipulation
from sklearn.model_selection import train_test_split  # To split the dataset
from sklearn.linear_model import LinearRegression  # Initiating Linear regression model
from sklearn.metrics import mean_squared_error, r2_score,mean_absolute_error  # For model evaluation



# Load the energy dataset
from yellowbrick.datasets import load_energy  # Import the dataset loader
X, y = load_energy()  # Load features (X) and target variable (y)

# Hold out a validation set. No test_size is passed, so train_test_split's
# default hold-out fraction applies (0.25 per the scikit-learn docs);
# random_state=0 makes the split reproducible.
X_train, X_val, y_train, y_val = train_test_split(X, y, random_state=0)

# Fit an ordinary least-squares baseline on the training portion
lr = LinearRegression().fit(X_train, y_train)

# R² on the data the model was fitted on vs. the held-out data
print("Training score : {:.8f}".format(lr.score(X_train, y_train)))  # Training R²
print("Validation score : {:.8f}".format(lr.score(X_val, y_val)))  # Validation R²

# Predict once and reuse the same predictions for every error metric
y_pred = lr.predict(X_val)

mse = mean_squared_error(y_val, y_pred)  # Mean squared error
lr_mae = mean_absolute_error(y_val, y_pred)  # Mean absolute error

# Print the evaluation results; the MAE line names the model so it stays
# identifiable next to the other models' MAE lines
print(f"Mean Squared Error (MSE): {mse}")
print(f"Linear Regression MAE: {lr_mae:.2f}")


Training score : 0.91728565
Validation score : 0.91124809
Mean Squared Error (MSE): 9.586373592719587
Linear Regression MAE: 2.18


### 2. Random Forest Regression




In [49]:
from sklearn.ensemble import RandomForestRegressor  # Random forest model for regression


# Split the dataset into training (80%) and testing (20%) sets.
# Features and target of the hold-out share the same `_test` suffix so the
# pairing is obvious (the summary cell at the end of this notebook uses the
# same names).
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

# Random Forest with 100 trees and a fixed random state for reproducibility.
# n_estimators is spelled out so the forest size does not depend on the
# library default.
random_forest_model = RandomForestRegressor(n_estimators=100, random_state=0)
random_forest_model.fit(X_train, y_train)  # Train the model on the training data

# Predict once on the test features and reuse the result for both error metrics
y_pred = random_forest_model.predict(X_test)

mse = mean_squared_error(y_test, y_pred)  # Mean squared error
rf_mae = mean_absolute_error(y_test, y_pred)  # Mean absolute error

# R² on the training and test data, computed once and reused in the report below
training_score = random_forest_model.score(X_train, y_train)
test_score = random_forest_model.score(X_test, y_test)

# Output the results
print("Training score: {:.8f}".format(training_score))  # Print training R² score
print("Test score: {:.8f}".format(test_score))  # Print test R² score
print(f"Mean Squared Error (MSE): {mse}")  # Print the mean squared error
print(f" MAE: {rf_mae:.2f}")

# Per-feature importances of the fitted forest, listed in column order of X
rf_feature_importances = random_forest_model.feature_importances_
print("\nRandom Forest Feature Importances:")
for name, importance in zip(X.columns, rf_feature_importances):
    print(f"{name}: {importance:.4f}")

Training score: 0.99969889
Test score: 0.99698276
Mean Squared Error (MSE): 0.3331825161688284
 MAE: 0.38

Random Forest Feature Importances:
relative compactness: 0.4027
surface area: 0.1753
wall area: 0.0505
roof area: 0.1613
overall height: 0.1187
orientation: 0.0007
glazing area: 0.0794
glazing area distribution: 0.0114


### 3. Decision Trees

In [45]:
# Import the Decision Tree Regressor model from scikit-learn
from sklearn.tree import DecisionTreeRegressor
# Import utilities for splitting datasets and conducting cross-validation
from sklearn.model_selection import train_test_split, cross_validate

# NOTE(review): this unseeded estimator is never fitted or read in this cell;
# it is kept only so the name stays defined -- confirm nothing else needs it.
decision_tree_regressor = DecisionTreeRegressor(max_depth=5)

# Split the dataset into training and validation sets
X_train, X_val, y_train, y_val = train_test_split(X, y, random_state=0)

# The model actually evaluated below: depth-limited tree with a fixed seed
tree = DecisionTreeRegressor(max_depth=5, random_state=0)
tree.fit(X_train, y_train)

# R² on both splits, computed once and reused for the printout and the table.
# The printed label says "Accuracy", but for a regressor .score() is R².
tree_training_accuracy = tree.score(X_train, y_train)
tree_validation_accuracy = tree.score(X_val, y_val)
print("Accuracy on training set: {:.8f}".format(tree_training_accuracy))
print("Accuracy on validation set: {:.8f}".format(tree_validation_accuracy))

# 5-fold cross-validation on the training split, scored with negative MSE
scores = cross_validate(tree, X_train, y_train, cv=5, scoring='neg_mean_squared_error', return_train_score=True)

# The scorer returns negated MSE, so flip the sign back before printing.
# The labels read "..._score" but the values are mean squared errors
# (lower is better).
for result_key, label in (('train_score', 'train_score'), ('test_score', 'validation_score')):
    print('{} = {:.3f}'.format(label, -scores[result_key].mean()))

# Predict once on the validation features; both MAE and MSE reuse this
y_pred_dt = tree.predict(X_val)
dt_mae = mean_absolute_error(y_val, y_pred_dt)  # Mean absolute error

# One-row summary table of the hold-out R² values
results_df = pd.DataFrame({
    'Training accuracy': [tree_training_accuracy],
    'Validation accuracy': [tree_validation_accuracy]
}, index=['DT'])
print(results_df)

# Second round of 5-fold cross-validation, this time scored with R²
cv_results = cross_validate(tree, X_train, y_train, cv=5, scoring='r2', return_train_score=True)
train_r2 = cv_results['train_score']
val_r2 = cv_results['test_score']

# Average the per-fold R² values for the training and validation folds
average_train_r2 = np.mean(train_r2)
average_val_r2 = np.mean(val_r2)
print(f"Average Training R^2 for DT: {average_train_r2}")
print(f"Average Validation R^2 for DT: {average_val_r2}")
print(f"MAE: {dt_mae:.2f}")  # Mean absolute error on the validation split

mse_dt = mean_squared_error(y_val, y_pred_dt)

print(f"Mean Squared Error (MSE) for Decision Tree: {mse_dt:.8f}")

Accuracy on training set: 0.99049843
Accuracy on validation set: 0.98917684
train_score = 0.986
validation_score = 1.119
    Training accuracy  Validation accuracy
DT           0.990498             0.989177
Average Training R^2 for DT: 0.9900547524289627
Average Validation R^2 for DT: 0.9884104707034591
MAE: 0.75
Mean Squared Error (MSE) for Decision Tree: 1.16904366


### 4. Gradient Boosting Regressor

In [43]:
# Import the Gradient Boosting Regressor model from scikit-learn
from sklearn.ensemble import GradientBoostingRegressor
# Import utilities for cross-validation
from sklearn.model_selection import cross_validate, cross_val_score

# NOTE(review): this estimator has max_depth=5 but NO random_state, and it is
# never fitted or read in this cell; it is kept only so the name stays
# defined -- confirm nothing else needs it.
gradient_boosting_regressor = GradientBoostingRegressor(max_depth=5)

# Split the dataset into training and validation sets
X_train, X_val, y_train, y_val = train_test_split(X, y, random_state=0)

# The model actually evaluated below: max depth 5 with a fixed seed
gbrt = GradientBoostingRegressor(max_depth=5, random_state=0)
gbrt.fit(X_train, y_train)

# R² on the training and validation sets (the printed label says "Accuracy",
# but for a regressor .score() is R²)
print("Accuracy on training set: {:.8f}".format(gbrt.score(X_train, y_train)))
print("Accuracy on validation set: {:.8f}".format(gbrt.score(X_val, y_val)))

# 5-fold cross-validation scored with negative MSE, keeping training scores too.
# (An earlier cross_val_score call whose result was immediately overwritten
# by this one has been dropped -- it only cost five extra fits.)
scores = cross_validate(gbrt, X_train, y_train, cv=5, scoring='neg_mean_squared_error', return_train_score=True)

# The scorer returns negated MSE, so flip the sign back before printing.
# The labels read "..._score" but the values are mean squared errors
# (lower is better).
for result_key, label in (('train_score', 'train_score'), ('test_score', 'validation_score')):
    print('{} = {:.3f}'.format(label, -scores[result_key].mean()))

# Predict once on the validation features; both MSE and MAE reuse this
y_pred = gbrt.predict(X_val)
mse = mean_squared_error(y_val, y_pred)

print(f"Mean Squared Error (MSE): {mse:.8f}")

# Second round of 5-fold cross-validation, this time scored with R²
cv_results = cross_validate(gbrt, X_train, y_train, cv=5, scoring='r2', return_train_score=True)
train_r2 = cv_results['train_score']  # R² scores on the training folds
val_r2 = cv_results['test_score']  # R² scores on the validation folds

# Average the per-fold R² values
average_train_r2 = np.mean(train_r2)
average_val_r2 = np.mean(val_r2)

gbrt_mae = mean_absolute_error(y_val, y_pred)  # Mean absolute error

print(f"Average Training R^2 for GB: {average_train_r2}")
print(f"Average Validation R^2 for GB: {average_val_r2}")

print(f"MAE: {gbrt_mae:.2f}")

# Per-feature importances of the fitted model, listed in column order of X
gbrt_feature_importances = gbrt.feature_importances_
print("\nGradient Boosting Feature Importances:")
for name, importance in zip(X.columns, gbrt_feature_importances):
    print(f"{name}: {importance:.4f}")

Accuracy on training set: 0.99963075
Accuracy on validation set: 0.99804294
train_score = 0.035
validation_score = 0.122
Mean Squared Error (MSE): 0.21138802
Average Training R^2 for GB: 0.9996466995264843
Average Validation R^2 for GB: 0.9987519990572371
MAE: 0.30

Gradient Boosting Feature Importances:
relative compactness: 0.2911
surface area: 0.4919
wall area: 0.0299
roof area: 0.0808
overall height: 0.0151
orientation: 0.0004
glazing area: 0.0809
glazing area distribution: 0.0099


In [None]:
import numpy as np  # Used for numerical operations
import pandas as pd  # Used for data manipulation and analysis
from sklearn.model_selection import train_test_split  # Facilitates splitting datasets into training and testing sets
from sklearn.linear_model import LinearRegression  # Regression model for predicting continuous values
from sklearn.ensemble import RandomForestRegressor  # Ensemble model for robust regression predictions
from sklearn.tree import DecisionTreeRegressor  # Model for regression using decision tree logic
from sklearn.ensemble import GradientBoostingRegressor  # Ensemble model for improved predictive performance via boosting
from sklearn.metrics import mean_squared_error, r2_score  # Metrics for evaluating regression model performance
from sklearn.model_selection import cross_validate, cross_val_score  # For assessing model performance using cross-validation
from yellowbrick.datasets import load_energy  # Utility for loading example datasets

# Side-by-side comparison of the four regressors on a single 80/20 split.
X, y = load_energy()
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

# The four estimators under comparison; all stochastic ones are seeded.
lr = LinearRegression()
random_forest_model = RandomForestRegressor(n_estimators=100, random_state=0)
tree = DecisionTreeRegressor(max_depth=5, random_state=0)
gbrt = GradientBoostingRegressor(max_depth=5, random_state=0)

# (estimator, training-line template, test-line template), in report order.
# Each model keeps its own number formatting.
report_plan = (
    (lr,
     "Linear Regression Training R^2: {:.4f}",
     "Linear Regression Test R^2: {:.4f}"),
    (random_forest_model,
     "Random Forest Training R^2: {:.4f}",
     "Random Forest Test R^2: {:.4f}"),
    (tree,
     "Decision Tree Training R^2: {:.8f}",
     "Decision Tree Test R^2: {:.8f}"),
    (gbrt,
     "Gradient Boosting Training R^2: {:.6f}",
     "Gradient Boosting Test R^2: {:.3f}"),
)

# Fit each model on the training split, then report R² on both splits.
for estimator, train_template, test_template in report_plan:
    estimator.fit(X_train, y_train)
    print(train_template.format(estimator.score(X_train, y_train)))
    print(test_template.format(estimator.score(X_test, y_test)))

# Stability check for the decision tree: 5-fold cross-validation on the
# training split, scored with R² and keeping the training-fold scores too.
cv_results = cross_validate(tree, X_train, y_train, cv=5, scoring='r2', return_train_score=True)
mean_cv_train_r2 = np.mean(cv_results['train_score'])
mean_cv_test_r2 = np.mean(cv_results['test_score'])
print(f"Average CV Training R^2 for Decision Tree: {mean_cv_train_r2}")
print(f"Average CV Test R^2 for Decision Tree: {mean_cv_test_r2}")


Linear Regression Training R^2: 0.9177
Linear Regression Test R^2: 0.9083
Random Forest Training R^2: 0.9997
Random Forest Test R^2: 0.9970
Decision Tree Training R^2: 0.99023480
Decision Tree Test R^2: 0.99001928
Gradient Boosting Training R^2: 0.999553
Gradient Boosting Test R^2: 0.998
Average CV Training R^2 for Decision Tree: 0.9898954524705035
Average CV Test R^2 for Decision Tree: 0.9876708795716823
