In [None]:
# Mount Google Drive inside the Colab runtime; the CSV paths used below live under /content/drive.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
import numpy as np
import pandas as pd

# Directory on the mounted Google Drive that holds the four CSV splits.
# (The space after "SuperUROP" is part of the folder name used by the original paths.)
DATA_DIR = '/content/drive/MyDrive/SuperUROP /Data Analysis'

# Define the file paths: one training and one testing CSV per site (Caltech, JPL)
file_path1 = f'{DATA_DIR}/caltech_training_data.csv'
file_path2 = f'{DATA_DIR}/caltech_testing_data.csv'
file_path3 = f'{DATA_DIR}/JPL_training_data.csv'
file_path4 = f'{DATA_DIR}/JPL_testing_data.csv'

# Use pandas to read the CSV files and then convert them to NumPy arrays.
# DataFrame.to_numpy() is the accessor pandas recommends over the legacy `.values`.
caltech_train = pd.read_csv(file_path1).to_numpy()
caltech_test = pd.read_csv(file_path2).to_numpy()

JPL_train = pd.read_csv(file_path3).to_numpy()
JPL_test = pd.read_csv(file_path4).to_numpy()

#Data Processing

In [None]:
# Drop the row-number column (column 0) from every split; the remaining columns shift left by one.
caltech_train, caltech_test = caltech_train[:, 1:], caltech_test[:, 1:]
JPL_train, JPL_test = JPL_train[:, 1:], JPL_test[:, 1:]

In [None]:
#Remove departure time (2nd column)
# Convert arrival date to hour and find day of the week
from datetime import datetime

def convert_time_and_day(data_array):
    """
    Re-encode the arrival timestamp of every row and drop the second column.

    Column 0 of each row is parsed with the format '%Y-%m-%d %H:%M:%S' and
    replaced by the string "<fractional hour> <weekday name>", e.g.
    "8.5 Monday" (seconds are ignored). Column 1 is discarded; columns 2 and
    onward are carried over in their original order.

    Returns a NumPy array built from the transformed rows. Because the first
    entry of each row is a string, NumPy will typically coerce the whole
    array to a string dtype.
    """
    def _reencode(record):
        stamp = datetime.strptime(record[0], '%Y-%m-%d %H:%M:%S')
        fractional_hour = stamp.hour + (stamp.minute / 60.0)
        weekday_name = stamp.strftime('%A')
        # Time and weekday travel together in one field, separated by a space.
        combined = str(fractional_hour) + " " + weekday_name
        # record[1] (the second column) is intentionally skipped.
        return [combined, *record[2:]]

    return np.array([_reencode(record) for record in data_array])

# Run the arrival-time re-encoding over all four splits (train/test for each site).
caltech_train, caltech_test = (
    convert_time_and_day(split) for split in (caltech_train, caltech_test)
)
JPL_train, JPL_test = (
    convert_time_and_day(split) for split in (JPL_train, JPL_test)
)

In [None]:
def day_to_number(day):
    """
    Map an English weekday name to its position in the week (Monday=1 ... Sunday=7).

    Raises KeyError when `day` is not one of the seven capitalised weekday names.
    """
    ordered_names = (
        'Monday', 'Tuesday', 'Wednesday', 'Thursday',
        'Friday', 'Saturday', 'Sunday',
    )
    lookup = {name: position for position, name in enumerate(ordered_names, start=1)}
    return lookup[day]

def separate_time_and_day(data_array):
    """
    Split the combined "<hour> <weekday>" field in column 0 into two columns.

    For each row, column 0 is split on whitespace into a time (converted to
    float) and a weekday name (converted to 1..7 via day_to_number). The
    output row is [time, day_number] followed by the row's remaining columns.

    Returns a NumPy array built from the transformed rows.
    """
    def _expand(record):
        hour_text, weekday_name = record[0].split()
        return [float(hour_text), day_to_number(weekday_name), *record[1:]]

    return np.array([_expand(record) for record in data_array])

# Split the combined time/weekday field into two columns for all four splits.
caltech_train, caltech_test = (
    separate_time_and_day(split) for split in (caltech_train, caltech_test)
)
JPL_train, JPL_test = (
    separate_time_and_day(split) for split in (JPL_train, JPL_test)
)

In [None]:
def _restrict_to_shared_users(train, test, user_col=3):
    """
    Keep only the rows of `train` and `test` whose user ID occurs in both arrays.

    Parameters:
    - train, test: 2-D NumPy arrays with the user ID stored in column `user_col`
    - user_col: index of the user-ID column (default 3)

    Returns:
    - (train_filtered, test_filtered): row subsets of the inputs, original row order kept
    """
    # Filtering test by train users and then train by the surviving test users
    # is the same as filtering both by the intersection of their user IDs.
    shared_users = np.intersect1d(train[:, user_col], test[:, user_col])
    train_filtered = train[np.isin(train[:, user_col], shared_users)]
    test_filtered = test[np.isin(test[:, user_col], shared_users)]
    return train_filtered, test_filtered


# Make training and testing sets contain the same user IDs (per site)
caltech_train, caltech_test = _restrict_to_shared_users(caltech_train, caltech_test)
JPL_train, JPL_test = _restrict_to_shared_users(JPL_train, JPL_test)

In [None]:
# Cast every split to a float-typed NumPy array.
caltech_train, caltech_test = caltech_train.astype('float'), caltech_test.astype('float')
JPL_train, JPL_test = JPL_train.astype('float'), JPL_test.astype('float')

#MLR Energy Prediction

In [None]:
import numpy as np
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error

# Per-user multiple linear regression (MLR) model, scored with SMAPE

def mlr_model(train_data, test_data, user_id):
    """
    Fit a linear regression for one user's energy and score it with SMAPE.

    Rows are selected where column 3 equals `user_id`. Columns 0, 1 and 4
    (arrival time, day of the week and estimated duration) are the
    predictors; column 2 (energy) is the target.

    Returns:
    - coefficients: `coef_` of the fitted LinearRegression
    - smape_val: mean of |y - y_hat| / |y + y_hat| over the user's test rows, times 100
    """
    feature_cols = [0, 1, 4]
    target_col = 2

    # Restrict both splits to the requested user
    train_rows = train_data[train_data[:, 3] == user_id]
    test_rows = test_data[test_data[:, 3] == user_id]

    # Fit on the user's training rows
    regressor = LinearRegression()
    regressor.fit(train_rows[:, feature_cols], train_rows[:, target_col])

    # Predict the user's test rows
    actual = test_rows[:, target_col]
    predicted = regressor.predict(test_rows[:, feature_cols])

    # Per-user SMAPE, in percent
    relative_errors = np.abs(actual - predicted) / (np.abs(actual + predicted))
    smape_val = (1 / len(actual)) * np.sum(relative_errors) * 100

    return regressor.coef_, smape_val


In [None]:
# Evaluate the per-user MLR model on every JPL user
user_ids_JPL = np.unique(np.concatenate((JPL_train[:, 3], JPL_test[:, 3])))
smape_list_JPL=[]

for user_id in user_ids_JPL:
    # mlr_model returns (coefficients, SMAPE); only the SMAPE is aggregated here
    model, smape = mlr_model(JPL_train, JPL_test, user_id)
    smape_list_JPL.append(smape)

#Calculate average SMAPE for JPL (unweighted mean over users)
no_JPL_users=len(user_ids_JPL)
JPL_smape=sum(smape_list_JPL)/no_JPL_users
# This cell scores the MLR model, so the printed label must say MLR
print(f"Average SMAPE for JPL dataset using MLR: {JPL_smape}")

#DKDE Energy Prediction

In [None]:
# Install the KDE-diffusion package; the next cell imports `kde2d` from `kde_diffusion`.
! pip install KDE-diffusion

In [None]:
from sklearn.preprocessing import StandardScaler
from kde_diffusion import kde2d
def dkde_model(train_data,test_data,user_id):
    """
    Predict one user's energy from stay duration with a 2-D diffusion KDE and score it with SMAPE.

    Rows are selected where column 3 equals `user_id`. A joint density over
    (column 4 = stay duration, column 2 = energy consumption) is estimated
    with kde2d on a 128-point grid from the user's training rows. For each
    test row, the x-grid point closest to its stay duration is located and
    the y-grid value with the highest density at that point is the prediction.

    Returns:
    - smape_val: mean of |y - y_hat| / |y + y_hat| over the user's test rows, times 100
    - density: density array returned by kde2d
    - grid: grid returned by kde2d (grid[0] is used as the x axis, grid[1] as the y axis)
    """
    # Restrict both splits to the requested user
    train_rows = train_data[train_data[:, 3] == user_id]
    test_rows = test_data[test_data[:, 3] == user_id]

    durations = test_rows[:, 4]
    y_test = test_rows[:, 2]

    # Joint density estimated from the user's training rows; the bandwidth is not used
    (density, grid, _bandwidth) = kde2d(train_rows[:, 4], train_rows[:, 2], n=128, limits=None)
    x_axis, y_axis = grid[0], grid[1]

    y_pred = []
    for duration in durations:
        # Nearest x-grid point to this stay duration
        nearest = np.argmin(np.abs(x_axis - duration))
        # y-grid value where the density slice at that x is largest
        y_pred.append(y_axis[np.argmax(density[nearest])])

    # Per-user SMAPE, in percent
    n = len(y_test)
    smape_val = (1 / n) * np.sum(np.abs(y_test - y_pred) / (np.abs(y_test + y_pred))) * 100

    return smape_val, density, grid



In [None]:
# Per-user DKDE evaluation on JPL: maps each user ID to that user's SMAPE
user_smape_JPL = dict()

for user_id in user_ids_JPL:
    # dkde_model returns (SMAPE, density, grid); only the SMAPE is stored
    smape, density, grid = dkde_model(JPL_train, JPL_test, user_id)
    user_smape_JPL.update({user_id: smape})

# Unweighted mean over users
per_user_smape = list(user_smape_JPL.values())
average_smape_JPL = np.mean(per_user_smape)

print(f"Average SMAPE for JPL dataset using DKDE: {average_smape_JPL:.2f}%")

#Decision Tree energy prediction

In [None]:
import numpy as np
from sklearn.model_selection import GridSearchCV
from sklearn.tree import DecisionTreeRegressor
from sklearn.preprocessing import StandardScaler
import matplotlib.pyplot as plt

def decision_tree_model(train_data, test_data, used_id, depth_range=(1, 21), min_samples_split_range=(2, 11), cv_folds=5):
    """
    Tune a per-user Decision Tree regressor with cross-validated grid search and score it with SMAPE.

    Parameters:
    - train_data: Training dataset with the format [sample, features]
    - test_data: Testing dataset with the same format as train_data
    - used_id: ID of the user to model; rows whose column 3 equals this value are selected.
      (The spelling of the parameter name is kept for backward compatibility.)
    - depth_range: Tuple (start, stop) passed to range() for the max_depth values to test (default is (1, 21))
    - min_samples_split_range: Tuple (start, stop) passed to range() for the min_samples_split values to test (default is (2, 11))
    - cv_folds: Number of cross-validation folds (default is 5)

    Returns:
    - best_params: Best hyperparameters found by the grid search
    - smape_val: mean of |y - y_hat| / |y + y_hat| over the user's test rows, times 100
    """
    # Bind the argument to a local name so the filter below uses the caller's
    # value and never depends on a module-level `user_id`.
    user_id = used_id

    # Filter training and testing data for the specific user
    user_train_data = train_data[train_data[:, 3] == user_id]
    user_test_data = test_data[test_data[:, 3] == user_id]

    # Extract columns for stay duration and energy consumption for both train and test sets
    X_train = user_train_data[:, 4].reshape(-1, 1)  # Stay Duration
    y_train = user_train_data[:, 2]  # Energy Consumption

    X_test = user_test_data[:, 4].reshape(-1, 1)
    y_test = user_test_data[:, 2]

    # Set up grid search with cross-validation for Decision Tree
    param_grid = {
        'max_depth': list(range(depth_range[0], depth_range[1])),
        'min_samples_split': list(range(min_samples_split_range[0], min_samples_split_range[1]))
    }
    dt = DecisionTreeRegressor()
    # Train scores are never read, so they are not requested (saves work per candidate)
    grid_search = GridSearchCV(dt, param_grid, cv=cv_folds, scoring='neg_mean_squared_error')
    grid_search.fit(X_train, y_train)

    # GridSearchCV refits the best configuration on the full training set
    best_dt = grid_search.best_estimator_

    # Predict on the test set
    y_pred = best_dt.predict(X_test)

    # Best hyperparameters
    best_params = grid_search.best_params_

    # Calculate user SMAPE (percent)
    n = len(y_test)
    smape_val = (1/ n) * np.sum(np.abs(y_test - y_pred) / (np.abs(y_test+y_pred)))*100

    return best_params,smape_val




In [None]:
# Per-user Decision Tree evaluation on JPL: maps each user ID to that user's SMAPE
user_smape_JPL = {}

# Loop through each user ID in the JPL dataset
# NOTE(review): the loop variable is named `user_id` on purpose -- verify that
# decision_tree_model does not read a module-level `user_id` before renaming it.
for user_id in user_ids_JPL:
    # decision_tree_model returns (best hyperparameters, SMAPE)
    best_params,smape = decision_tree_model(JPL_train, JPL_test, user_id)
    user_smape_JPL[user_id] = smape  # Store the SMAPE in the dictionary

# Unweighted mean of the per-user SMAPE values
average_smape_JPL = np.mean(list(user_smape_JPL.values()))

print(f"Average SMAPE for JPL dataset using Decision Tree: {average_smape_JPL:.2f}%")

In [None]:
from sklearn.svm import SVR
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import mean_squared_error

def svr_rbf_model(train_data, test_data, user_id,k=5):
    """
    Tune a per-user RBF-kernel SVR with cross-validated grid search and score it with SMAPE.

    Parameters:
    - train_data: Training dataset; column 3 holds the user ID, column 4 the
      stay duration (predictor) and column 2 the energy consumption (target)
    - test_data: Testing dataset with the same column layout as train_data
    - user_id: ID of the user to model
    - k: Number of cross-validation folds (default is 5)

    Returns:
    - best_params: Best (C, epsilon, gamma) combination found by the grid search
    - smape_val: mean of |y - y_hat| / |y + y_hat| over the user's test rows, times 100
    """
    # Filter training and testing data for the specific user
    user_train_data = train_data[train_data[:, 3] == user_id]
    user_test_data = test_data[test_data[:, 3] == user_id]

    # Extract columns for stay duration and energy consumption for both train and test sets
    X_train = user_train_data[:,4].reshape(-1, 1)  # Stay Duration
    y_train = user_train_data[:,2]  # Energy Consumption

    X_test = user_test_data[:,4].reshape(-1, 1)
    y_test = user_test_data[:,2]

    # Define the hyperparameters to be optimized
    param_grid = {
        'C': [0.1, 1, 10, 100],
        'epsilon': [0.001, 0.01, 0.1, 1],
        'gamma': ['scale', 'auto', 0.1, 1, 10]
    }

    # Initialize SVR with RBF kernel
    svr_rbf = SVR(kernel='rbf')

    # Initialize GridSearchCV with k-fold cross-validation (all CPU cores)
    grid_search = GridSearchCV(estimator=svr_rbf, param_grid=param_grid, cv=k, scoring='neg_mean_squared_error', n_jobs=-1)

    # Fit the model to the training data
    grid_search.fit(X_train, y_train)

    # Use the best estimator to predict on the test data
    test_predictions = grid_search.best_estimator_.predict(X_test)

    # Calculate user SMAPE (percent); this is the only test metric the function reports
    n = len(y_test)
    smape_val = (1/ n) * np.sum(np.abs(y_test - test_predictions) / (np.abs(y_test+test_predictions)))*100

    return grid_search.best_params_, smape_val




In [None]:
# Per-user SVR evaluation on JPL: maps each user ID to that user's SMAPE
user_smape_JPL = dict()

for user_id in user_ids_JPL:
    # svr_rbf_model returns (best hyperparameters, SMAPE); only the SMAPE is stored
    best_params, smape = svr_rbf_model(JPL_train, JPL_test, user_id)
    user_smape_JPL.update({user_id: smape})

# Unweighted mean over users
per_user_smape = list(user_smape_JPL.values())
average_smape_JPL = np.mean(per_user_smape)

print(f"Average SMAPE for JPL dataset using SVR: {average_smape_JPL:.2f}%")

#Ensemble ML energy prediction with different thresholds

In [None]:
# Per-user R_DE values used by the ensemble cells to choose a model per user.
# The table is written with string user IDs and re-keyed by integer ID in one pass.
R_DE = {
    int(user_key): ratio
    for user_key, ratio in {
        '169': 2.119068507176728,
        '171': 3.7115405017002776,
        '176': 2.9977726188370353,
        '220': 4.170165871125364,
        '322': 2.974319275358279,
        '334': 1.54796573662954,
        '335': 3.9915670302142447,
        '346': 4.687498008228514,
        '365': 4.400963644915041,
        '368': 3.4312414172978474,
        '372': 2.9575939245001077,
        '374': 3.839538311235034,
        '378': 3.1515844041039993,
        '382': 4.148262279562261,
        '404': 3.097855936467447,
        '405': 4.732202935285684,
        '406': 1.158211936837657,
        '409': 2.886901319079918,
        '410': 4.280915725084575,
        '416': 4.148262279562261,
        '436': 3.3808089672729613,
        '444': 3.0622185077071102,
        '458': 4.148262279562261,
        '467': 3.777008295296614,
        '474': 2.6165283654578775,
        '476': 3.7356597962280196,
        '481': 4.045488808054726,
        '483': 2.755038189921663,
        '507': 3.7804325624313817,
        '526': 2.4478241428939165,
        '531': 3.5032226226713927,
        '537': 3.1955976636365735,
        '551': 4.767567980735159,
        '553': 2.555583224030839,
        '576': 3.380624050295972,
        '577': 4.031441936975631,
        '581': 2.9304059447845647,
        '592': 3.056683078202939,
        '607': 4.625780698041485,
        '651': 3.492080210083478,
        '726': 3.2841952906487855,
        '742': 3.9023067062499264,
        '826': 3.375181253576597,
        '933': 3.9394875755880188,
    }.items()
}

In [None]:
import numpy as np

# Ensemble at threshold 3.5: each JPL user is routed to MLR or DKDE by the user's R_DE value.
# Relies on R_DE, user_ids_JPL, JPL_train, JPL_test, mlr_model and dkde_model from earlier cells.

user_smape_combined = dict()  # user ID -> SMAPE of whichever model was selected

for user_id in user_ids_JPL:
    # Users missing from R_DE default to 0 and therefore take the MLR branch
    below_threshold = R_DE.get(user_id, 0) < 3.5
    if below_threshold:
        # R_DE < 3.5 -> multiple linear regression
        model, smape = mlr_model(JPL_train, JPL_test, user_id)
    else:
        # R_DE >= 3.5 -> diffusion KDE
        smape, density, grid = dkde_model(JPL_train, JPL_test, user_id)

    user_smape_combined[user_id] = smape

# Unweighted mean of the per-user SMAPE values
per_user_smape = list(user_smape_combined.values())
average_smape_combined = np.mean(per_user_smape)

print(f"Average SMAPE for JPL dataset using combined approach with threshold of 3.5: {average_smape_combined:.2f}%")


In [None]:
import numpy as np

# Ensemble at threshold 4: each JPL user is routed to MLR or DKDE by the user's R_DE value.
# Relies on R_DE, user_ids_JPL, JPL_train, JPL_test, mlr_model and dkde_model from earlier cells.

user_smape_combined = dict()  # user ID -> SMAPE of whichever model was selected

for user_id in user_ids_JPL:
    # Users missing from R_DE default to 0 and therefore take the MLR branch
    below_threshold = R_DE.get(user_id, 0) < 4
    if below_threshold:
        # R_DE < 4 -> multiple linear regression
        model, smape = mlr_model(JPL_train, JPL_test, user_id)
    else:
        # R_DE >= 4 -> diffusion KDE
        smape, density, grid = dkde_model(JPL_train, JPL_test, user_id)

    user_smape_combined[user_id] = smape

# Unweighted mean of the per-user SMAPE values
per_user_smape = list(user_smape_combined.values())
average_smape_combined = np.mean(per_user_smape)

print(f"Average SMAPE for JPL dataset using combined approach with threshold of 4: {average_smape_combined:.2f}%")

#Ensemble ML energy prediction (threshold of 4.5)

In [None]:
import numpy as np

# Ensemble at threshold 4.5: Decision Tree for users below the threshold, DKDE otherwise.
# Assuming R_DE, user_ids_JPL, JPL_train, JPL_test, decision_tree_model, and dkde_model are already defined

user_smape_combined = {}  # Dictionary to store SMAPE values for all users

# Loop through each user ID in the JPL dataset
# NOTE(review): the loop variable is named `user_id` on purpose -- verify that
# decision_tree_model does not read a module-level `user_id` before renaming it.
for user_id in user_ids_JPL:
    # Users missing from R_DE default to 0 and therefore take the Decision Tree branch
    if R_DE.get(user_id, 0) < 4.5:
        # Use DT for users with R_DE < 4.5
        best_params,smape = decision_tree_model(JPL_train, JPL_test, user_id)
    else:
        # Use dkde_model for users with R_DE >= 4.5
        smape, density, grid = dkde_model(JPL_train, JPL_test, user_id)

    user_smape_combined[user_id] = smape  # Store the SMAPE in the dictionary

# Calculate the overall average SMAPE for the entire dataset (unweighted mean over users)
average_smape_combined = np.mean(list(user_smape_combined.values()))

print(f"Average SMAPE for JPL dataset using combined approach with threshold of 4.5: {average_smape_combined:.2f}%")

#Ensemble ML energy prediction (threshold of 3)

In [None]:
import numpy as np

# Ensemble at threshold 3: SVR for users below the threshold, Decision Tree otherwise.
# Assuming R_DE, user_ids_JPL, JPL_train, JPL_test, svr_rbf_model, and decision_tree_model are already defined

user_smape_combined = {}  # Dictionary to store SMAPE values for all users

# Loop through each user ID in the JPL dataset
# NOTE(review): the loop variable is named `user_id` on purpose -- verify that
# decision_tree_model does not read a module-level `user_id` before renaming it.
for user_id in user_ids_JPL:
    # Users missing from R_DE default to 0 and therefore take the SVR branch
    if R_DE.get(user_id, 0) < 3:
        # Use SVR for users with R_DE < 3
        best_params,smape = svr_rbf_model(JPL_train, JPL_test, user_id)
    else:
        # Use DT for users with R_DE >= 3
        best_params,smape = decision_tree_model(JPL_train, JPL_test, user_id)

    user_smape_combined[user_id] = smape  # Store the SMAPE in the dictionary

# Calculate the overall average SMAPE for the entire dataset (unweighted mean over users)
average_smape_combined = np.mean(list(user_smape_combined.values()))

print(f"Average SMAPE for JPL dataset using combined approach with threshold of 3: {average_smape_combined:.2f}%")