In [None]:
from google.colab import drive
# Mount Google Drive at /content/drive; the CSV paths used below live under this mount point.
drive.mount('/content/drive')

In [None]:
import numpy as np
import pandas as pd

# Define the file paths
# NOTE: the folder name 'SuperUROP ' really does end with a space before the slash.
file_path1 = '/content/drive/MyDrive/SuperUROP /Data Analysis/caltech_training_data.csv'
file_path2 = '/content/drive/MyDrive/SuperUROP /Data Analysis/caltech_testing_data.csv'
file_path3 = '/content/drive/MyDrive/SuperUROP /Data Analysis/JPL_training_data.csv'
file_path4  = '/content/drive/MyDrive/SuperUROP /Data Analysis/JPL_testing_data.csv'
# Use pandas to read the CSV files and then convert them to NumPy arrays
# NOTE(review): the following cell assigns caltech_train, caltech_test, JPL_train and
# JPL_test again from filtered DataFrames, so when both cells run these values are replaced.
caltech_train = pd.read_csv(file_path1).values
caltech_test = pd.read_csv(file_path2).values

JPL_train = pd.read_csv(file_path3).values
JPL_test=pd.read_csv(file_path4).values

In [None]:
import pandas as pd
import numpy as np

# Define the file paths
# NOTE: the folder name 'SuperUROP ' really does end with a space before the slash.
file_path1 = '/content/drive/MyDrive/SuperUROP /Data Analysis/caltech_training_data_full.csv'
file_path3 = '/content/drive/MyDrive/SuperUROP /Data Analysis/JPL_training_data.csv'
file_path4 = '/content/drive/MyDrive/SuperUROP /Data Analysis/JPL_testing_data.csv'

# Use pandas to read the CSV files
caltech_full_df = pd.read_csv(file_path1)

JPL_train_df = pd.read_csv(file_path3)
JPL_test_df = pd.read_csv(file_path4)

# Keep only Caltech sessions whose 'duration' is at most 24
# (unit is presumably hours -- TODO confirm against the CSV).
caltech_full_df = caltech_full_df[caltech_full_df['duration'] <= 24]

# Caltech split: train on March-May 2021, test on June 2021.
# read_csv is not asked to parse dates, so 'connectionTime' is compared as text.
# A bare date such as '2021-05-31' sorts BEFORE every timestamp of that day
# ('2021-05-31 08:00:00' > '2021-05-31'), so an inclusive `<= last_day` bound would
# silently drop the whole last day of the window. Half-open upper bounds
# (`< first day of the next month`) keep it, and behave identically if the column
# holds dates without a time part.
caltech_train_df = caltech_full_df[(caltech_full_df['connectionTime'] >= '2021-03-01') & (caltech_full_df['connectionTime'] < '2021-06-01')]
# Training rows are further restricted to rows with 'no_sessions' >= 50.
caltech_train_df= caltech_train_df[caltech_train_df['no_sessions'] >= 50]
caltech_test_df = caltech_full_df[(caltech_full_df['connectionTime'] >= '2021-06-01') & (caltech_full_df['connectionTime'] < '2021-07-01')]

# JPL training rows are restricted to rows with 'no_sessions' >= 30.
JPL_train_df=JPL_train_df[JPL_train_df['no_sessions'] >= 30]

# Convert to NumPy arrays; the remaining cells index these arrays by column position.
caltech_train = caltech_train_df.values
caltech_test = caltech_test_df.values
JPL_train = JPL_train_df.values
JPL_test = JPL_test_df.values


#Data Processing for MLR & DKDE

In [None]:
#Remove row number (in 1st column)
# Each array is replaced by a view without its column 0.
# NOTE: this cell is not idempotent -- running it again strips one more column.
caltech_train=caltech_train[:,1:]
caltech_test=caltech_test[:,1:]

JPL_train=JPL_train[:,1:]
JPL_test=JPL_test[:,1:]

In [None]:
#Remove departure time (2nd column)
# Convert arrival date to hour and find day of the week
from datetime import datetime

def convert_time_and_day(data_array):
    """
    Rewrite each row's timestamp as a "<decimal hour> <weekday name>" string.

    Column 0 must hold a 'YYYY-MM-DD HH:MM:SS' timestamp. It becomes, for
    example, "8.5 Monday" (hour plus minute/60; seconds are ignored).
    Column 1 is discarded and every later column is carried over as-is.

    :param data_array: iterable of rows (sequences) laid out as described above.
    :return: a new NumPy array with one fewer column than the input rows.
    """
    def _rewrite(row):
        # Parse once, then derive both the decimal hour and the weekday name.
        stamp = datetime.strptime(row[0], '%Y-%m-%d %H:%M:%S')
        decimal_hour = stamp.hour + (stamp.minute / 60.0)
        label = str(decimal_hour) + " " + stamp.strftime('%A')
        # row[2:] skips column 1, which is dropped on purpose.
        return [label, *row[2:]]

    return np.array([_rewrite(row) for row in data_array])

# Apply the timestamp conversion to all four arrays. Each name is rebound to the
# converted array, so this cell expects the un-converted layout and should run once.
caltech_train=convert_time_and_day(caltech_train)
caltech_test=convert_time_and_day(caltech_test)
JPL_train=convert_time_and_day(JPL_train)
JPL_test=convert_time_and_day(JPL_test)

In [None]:
def day_to_number(day):
    """Map an English weekday name to 1 (Monday) .. 7 (Sunday); any other value raises KeyError."""
    weekday_names = ('Monday', 'Tuesday', 'Wednesday', 'Thursday',
                     'Friday', 'Saturday', 'Sunday')
    lookup = {name: position for position, name in enumerate(weekday_names, start=1)}
    return lookup[day]

def separate_time_and_day(data_array):
    """
    Split the combined "<decimal hour> <weekday name>" column into two columns.

    Column 0 of every row is split on whitespace; the first part becomes a
    float hour and the second part is mapped through day_to_number (1..7).
    All other columns follow unchanged, so each row gains one column.

    :param data_array: iterable of rows whose first item is the combined string.
    :return: a new NumPy array laid out as [hour, day number, *remaining columns].
    """
    def _split(row):
        hour_text, weekday = row[0].split()
        return [float(hour_text), day_to_number(weekday), *row[1:]]

    return np.array([_split(row) for row in data_array])

# Split the combined time/day column in all four arrays. Each name is rebound to the
# widened array, so this cell expects the combined-column layout and should run once.
caltech_train=separate_time_and_day(caltech_train)
caltech_test=separate_time_and_day(caltech_test)
JPL_train=separate_time_and_day(JPL_train)
JPL_test=separate_time_and_day(JPL_test)

In [None]:
#Make training and testing set have the same user IDs
# Column 3 holds the user ID. For each site: first keep only test rows whose user
# appears in training, then keep only training rows whose user still appears in the
# filtered test set, so both arrays end up covering the same set of users.
users_from_training_caltech = set(caltech_train[:, 3])
mask_caltech = np.isin(caltech_test[:, 3], list(users_from_training_caltech))
caltech_test = caltech_test[mask_caltech]
users_from_testing_caltech = set(caltech_test[:, 3])
mask_caltech = np.isin(caltech_train[:, 3], list(users_from_testing_caltech))
caltech_train = caltech_train[mask_caltech]

# Same two-step filter for the JPL arrays.
users_from_training = set(JPL_train[:, 3])
mask = np.isin(JPL_test[:, 3], list(users_from_training))
JPL_test = JPL_test[mask]
users_from_testing = set(JPL_test[:, 3])
mask = np.isin(JPL_train[:, 3], list(users_from_testing))
JPL_train = JPL_train[mask]

In [None]:
# Cast every column to float so the arrays can be used numerically below.
# This raises ValueError if any remaining column holds non-numeric text.
caltech_train = np.array(caltech_train, dtype='float')
caltech_test = np.array(caltech_test, dtype='float')
JPL_train = np.array(JPL_train, dtype='float')
JPL_test = np.array(JPL_test, dtype='float')

#Data Processing for correlation

In [None]:
from collections import defaultdict

# Organize the arrival times by user ID for the entire dataset
# Maps user ID (column 3) -> list of arrival times (column 0, decimal hour of day).
user_times_cal = defaultdict(list)
user_times_JPL = defaultdict(list)

# Accumulate all arrival times for each user in caltech_train
for row in caltech_train:
    arrival_time = float(row[0])
    user_id = row[3]
    user_times_cal[user_id].append(arrival_time)

# Accumulate all arrival times for each user in JPL_train
for row in JPL_train:
    arrival_time = float(row[0])
    user_id = row[3]
    user_times_JPL[user_id].append(arrival_time)

# Define the functions for hourly and half-hourly representations
def create_hourly_representation(times):
    """
    Histogram arrival times into 24 one-hour slots.

    :param times: iterable of hour-of-day values (anything float() accepts).
    :return: list of 24 counts; slot h tallies the values whose integer part is h.
    """
    counts = [0 for _ in range(24)]
    for slot in (int(float(value)) for value in times):
        counts[slot] += 1
    return counts

def create_half_hourly_representation(times):
    """
    Histogram arrival times into 48 half-hour slots.

    :param times: iterable of hour-of-day values (anything float() accepts).
    :return: list of 48 counts; a value t is tallied in slot int(t * 2).
    """
    counts = [0 for _ in range(48)]
    for slot in (int(float(value) * 2) for value in times):
        counts[slot] += 1
    return counts

# Compute the hourly and half-hourly representations for each user
# Each dict maps user ID -> arrival-time histogram (24 or 48 counts).
hourly_representations_cal = {}
half_hourly_representations_cal = {}
hourly_representations_JPL = {}
half_hourly_representations_JPL = {}

for user_id, times in user_times_cal.items():
    hourly_representations_cal[user_id] = create_hourly_representation(times)
    half_hourly_representations_cal[user_id] = create_half_hourly_representation(times)

for user_id, times in user_times_JPL.items():
    hourly_representations_JPL[user_id] = create_hourly_representation(times)
    half_hourly_representations_JPL[user_id] = create_half_hourly_representation(times)



In [None]:
# Largest column-2 value in each training set, rounded to the nearest integer.
# These set the length of the column-2 histograms built in the next cell.
max_energy_caltech= int(np.round(caltech_train[:, 2].astype(float).max()))
max_energy_JPL= int(np.round(JPL_train[:, 2].astype(float).max()))

In [None]:
import numpy as np
from collections import defaultdict

# Organize the column-2 values by user ID for the entire dataset.
# NOTE(review): the variable names say "durations" while the values are read as
# `energy_consumption`; the modelling cells treat column 2 as energy -- confirm which is intended.
user_durations_cal = defaultdict(list)
user_durations_JPL = defaultdict(list)

# Accumulate all column-2 values for each user (column 3) in caltech_train
for row in caltech_train:
    energy_consumption = float(row[2])
    user_id = row[3]
    user_durations_cal[user_id].append(energy_consumption)

# Accumulate all column-2 values for each user (column 3) in JPL_train
for row in JPL_train:
    energy_consumption = float(row[2])
    user_id = row[3]
    user_durations_JPL[user_id].append(energy_consumption)

# Define functions for hourly and half-hourly representations
def create_hourly_representation_duration(durations, max_energy):
    """
    Histogram values into unit-width bins 0 .. max_energy.

    :param durations: iterable of non-negative values (anything float() accepts).
    :param max_energy: integer upper bound; the result has max_energy + 1 bins.
    :return: list of counts; a value v is tallied in bin int(v).
    """
    counts = [0 for _ in range(max_energy + 1)]
    for slot in (int(float(value)) for value in durations):
        counts[slot] += 1
    return counts

def create_half_hourly_representation_duration(durations, max_energy):
    """
    Histogram values into half-unit-width bins.

    :param durations: iterable of non-negative values (anything float() accepts).
    :param max_energy: integer upper bound; the result has (max_energy + 1) * 2 bins.
    :return: list of counts; a value v is tallied in bin int(v * 2).
    """
    counts = [0 for _ in range((max_energy + 1) * 2)]
    for slot in (int(float(value) * 2) for value in durations):
        counts[slot] += 1
    return counts

# Compute the unit-width and half-unit-width column-2 histograms for each user.
# Each dict maps user ID -> list of counts sized by the site's max_energy_* value.
hourly_representations_duration_cal = {}
half_hourly_representations_duration_cal = {}
hourly_representations_duration_JPL = {}
half_hourly_representations_duration_JPL = {}

for user_id, durations in user_durations_cal.items():
    hourly_representations_duration_cal[user_id] = create_hourly_representation_duration(durations, max_energy_caltech)
    half_hourly_representations_duration_cal[user_id] = create_half_hourly_representation_duration(durations, max_energy_caltech)

for user_id, durations in user_durations_JPL.items():
    hourly_representations_duration_JPL[user_id] = create_hourly_representation_duration(durations, max_energy_JPL)
    half_hourly_representations_duration_JPL[user_id] = create_half_hourly_representation_duration(durations, max_energy_JPL)


In [None]:
def concatenate_representations(dict1, dict2):
    """
    Join two {key: list} mappings key by key.

    For every key found in either mapping the result holds dict1's list followed
    by dict2's list; a key missing from one side contributes an empty list there.

    :param dict1: mapping whose lists form the first part of each result.
    :param dict2: mapping whose lists form the second part of each result.
    :return: new dict with one concatenated list per key.
    """
    return {
        key: dict1.get(key, []) + dict2.get(key, [])
        for key in set(dict1.keys()) | set(dict2.keys())
    }

# Build each user's feature vector: arrival-time histogram followed by column-2 histogram.
# These four dicts are the inputs to the similarity cells below.
concatenated_hourly_cal = concatenate_representations(hourly_representations_cal, hourly_representations_duration_cal)
concatenated_half_hourly_cal = concatenate_representations(half_hourly_representations_cal, half_hourly_representations_duration_cal)

concatenated_hourly_JPL = concatenate_representations(hourly_representations_JPL, hourly_representations_duration_JPL)
concatenated_half_hourly_JPL = concatenate_representations(half_hourly_representations_JPL, half_hourly_representations_duration_JPL)



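#Calculate correlations - choose one of the three
Run only one of the three cells below (cosine similarity, Pearson correlation, or Spearman correlation): each stores its results under the same variable names, so whichever cell runs last overwrites the others.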

##Calculate cosine similarity

In [None]:
from sklearn.metrics import pairwise_distances
import pandas as pd

def data_to_dataframe(data):
    """Build a DataFrame with one row per key of `data`: keys become the index, list items the columns."""
    frame = pd.DataFrame.from_dict(data, orient='index')
    return frame

def compute_cosine_similarity(df):
    """
    Pairwise cosine similarity between the rows of `df`.

    :param df: DataFrame with one row per user.
    :return: square DataFrame of similarities (1 - cosine distance) whose row and
             column labels are both df.index, so user IDs are preserved.
    """
    labels = df.index
    similarity = 1 - pairwise_distances(df, metric='cosine')
    return pd.DataFrame(similarity, index=labels, columns=labels)

# Compute cosine similarity for each concatenated dataset
# NOTE: the Pearson and Spearman cells assign the same similarities_* names,
# so the last similarity cell executed determines what later cells see.
df_hourly_cal = data_to_dataframe(concatenated_hourly_cal)
similarities_hourly_cal = compute_cosine_similarity(df_hourly_cal)

df_half_hourly_cal = data_to_dataframe(concatenated_half_hourly_cal)
similarities_half_hourly_cal = compute_cosine_similarity(df_half_hourly_cal)

df_hourly_JPL = data_to_dataframe(concatenated_hourly_JPL)
similarities_hourly_JPL = compute_cosine_similarity(df_hourly_JPL)

df_half_hourly_JPL = data_to_dataframe(concatenated_half_hourly_JPL)
similarities_half_hourly_JPL = compute_cosine_similarity(df_half_hourly_JPL)



##Calculate Pearson correlation

In [None]:
import numpy as np

def compute_pearson_correlation(v1, v2):
    """
    Pearson correlation of two equal-length vectors, or None when it is undefined.

    :return: the coefficient from np.corrcoef, or None if either vector has fewer
             than two items or the product of the standard deviations is zero
             (i.e. at least one vector is constant).
    """
    # Too short to correlate.
    if len(v1) <= 1 or len(v2) <= 1:
        return None
    # A constant vector makes the coefficient undefined.
    if np.std(v1) * np.std(v2) == 0:
        return None
    return np.corrcoef(v1, v2)[0, 1]

import pandas as pd

def compute_pearson_correlation_matrix_df(representations):
    """
    Build the user-by-user Pearson correlation matrix.

    :param representations: dict mapping user ID -> feature vector.
    :return: square DataFrame labelled by user ID on both axes. The diagonal is 1;
             pairs whose correlation is undefined (compute_pearson_correlation
             returned None) are stored as 0.
    """
    user_ids = list(representations.keys())
    n = len(user_ids)
    # Start from the identity: self-correlation is 1, everything else 0 until filled.
    correlation_matrix = np.eye(n)

    # Only the upper triangle is computed; each value is mirrored below the diagonal.
    for i in range(n):
        for j in range(i + 1, n):
            corr = compute_pearson_correlation(representations[user_ids[i]], representations[user_ids[j]])
            correlation_matrix[i, j] = 0 if corr is None else corr
            correlation_matrix[j, i] = correlation_matrix[i, j]

    return pd.DataFrame(correlation_matrix, index=user_ids, columns=user_ids)

# Pearson correlation matrices for each concatenated dataset.
# NOTE: the cosine and Spearman cells assign the same similarities_* names,
# so the last similarity cell executed determines what later cells see.
similarities_hourly_cal = compute_pearson_correlation_matrix_df(concatenated_hourly_cal)
similarities_half_hourly_cal = compute_pearson_correlation_matrix_df(concatenated_half_hourly_cal)

similarities_hourly_JPL = compute_pearson_correlation_matrix_df(concatenated_hourly_JPL)
similarities_half_hourly_JPL = compute_pearson_correlation_matrix_df(concatenated_half_hourly_JPL)




In [None]:
from scipy.stats import spearmanr
import pandas as pd

def data_to_dataframe(data):
    """Build a DataFrame with one row per key of `data`: keys become the index, list items the columns."""
    frame = pd.DataFrame.from_dict(data, orient='index')
    return frame

def calculate_spearman_correlation(df):
    """
    Pairwise Spearman rank correlation between the rows of `df`.

    :param df: DataFrame with one row per user.
    :return: square numeric DataFrame labelled by df.index on both axes, holding
             the spearmanr coefficient for every ordered pair of users
             (including each user with itself).
    """
    users = df.index
    # spearmanr is fed one Series per user, so put users on the column axis.
    per_user = df.T
    result = pd.DataFrame(index=users, columns=users)

    for row_user in per_user.columns:
        for col_user in per_user.columns:
            rho, _pvalue = spearmanr(per_user[row_user], per_user[col_user])
            result.loc[row_user, col_user] = rho

    # The frame was created without a dtype; coerce every column to numeric.
    return result.apply(pd.to_numeric)

# Compute Spearman correlation for each concatenated dataset
# NOTE: the cosine and Pearson cells assign the same similarities_* names,
# so the last similarity cell executed determines what later cells see.
df_hourly_cal = data_to_dataframe(concatenated_hourly_cal)
similarities_hourly_cal = calculate_spearman_correlation(df_hourly_cal)

df_half_hourly_cal = data_to_dataframe(concatenated_half_hourly_cal)
similarities_half_hourly_cal = calculate_spearman_correlation(df_half_hourly_cal)

df_hourly_JPL = data_to_dataframe(concatenated_hourly_JPL)
similarities_hourly_JPL = calculate_spearman_correlation(df_hourly_JPL)

df_half_hourly_JPL = data_to_dataframe(concatenated_half_hourly_JPL)
similarities_half_hourly_JPL = calculate_spearman_correlation(df_half_hourly_JPL)


#Calculate most correlated users (using raw values)
Run one of the two cells as the results have the same name - raw values perform much better

In [None]:
#Using threshold
def most_correlated_users(matrix, threshold=0.82):
    """
    List, for every user, the other users whose similarity is strong enough.

    A neighbour qualifies when the ABSOLUTE value of its similarity exceeds
    `threshold`, so strongly negative values qualify as well. Neighbours are
    ordered by signed similarity, highest first; the user itself is excluded.

    :param matrix: square similarity DataFrame labelled by user ID on both axes.
    :param threshold: minimum absolute similarity (exclusive) for a neighbour.
    :return: dict mapping each user ID to its ordered list of neighbour IDs
             (an empty list when nobody qualifies).
    """
    neighbours = {}

    for user_index in matrix.index:
        # The user's own row, minus the self-similarity entry.
        others = matrix.loc[user_index].drop(user_index)
        strong = others[others.abs() > threshold]
        neighbours[user_index] = strong.sort_values(ascending=False).index.tolist()

    return neighbours

# Neighbour lists for Caltech, built from whichever similarity cell ran last
# (cosine, Pearson or Spearman all write the same similarities_* names).
most_corr_users_hourly_cal = most_correlated_users(similarities_hourly_cal)
most_corr_users_half_hourly_cal = most_correlated_users(similarities_half_hourly_cal)

# Similarly for JPL data
most_corr_users_hourly_JPL = most_correlated_users(similarities_hourly_JPL)
most_corr_users_half_hourly_JPL = most_correlated_users(similarities_half_hourly_JPL)

#MLR Energy Prediction for all users (no correlation)

In [None]:
import numpy as np
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error

# Sample training data
import numpy as np
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error


def mlr_model(train_data, test_data, user_id):
    """
    Fit a multiple linear regression for one user and return the fitted model.

    Features are columns 0, 1 and 4 (arrival time, day of the week, estimated
    duration) and the target is column 2 (energy); rows are selected by the
    user ID in column 3.

    :param train_data: 2-D float array of training sessions.
    :param test_data: unused; kept so existing callers keep working. The earlier
                      version predicted on the user's test rows and computed a
                      SMAPE that was then discarded, which also made the function
                      fail for a user without any test rows.
    :param user_id: ID of the user to model.
    :return: fitted sklearn LinearRegression.
    :raises ValueError: (from scikit-learn) if the user has no training rows.
    """
    # Filter training data for the specific user
    user_train_data = train_data[train_data[:, 3] == user_id]

    # Independent variables are arrival time, day of the week and estimated duration
    X_train = user_train_data[:, [0, 1, 4]]

    # Dependent variable is the energy
    y_train = user_train_data[:, 2]

    return LinearRegression().fit(X_train, y_train)





In [None]:
# Fitted MLR model per JPL user, keyed by user ID.
MLR_user_models={}
# Every user ID (column 3) that appears in the JPL training or test array.
user_ids_JPL = np.unique(np.concatenate((JPL_train[:, 3], JPL_test[:, 3])))

# Loop through each user ID in the JPL dataset
for user_id in user_ids_JPL:
    model = mlr_model(JPL_train, JPL_test, user_id)
    MLR_user_models[user_id]=model


#Train DKDE models for all users

In [None]:
! pip install KDE-diffusion

In [None]:
import numpy as np
from kde_diffusion import kde2d
# Every user ID (column 3) that appears in the JPL training or test array.
user_ids_JPL = np.unique(np.concatenate((JPL_train[:, 3], JPL_test[:, 3])))

class KDEModel:
    """
    Point predictor backed by a gridded 2-D density.

    `grid[0]` holds the x grid points and `grid[1]` the y grid points;
    `density[i]` is read as the density over the y grid at the i-th x grid point.
    """

    def __init__(self, grid, density):
        self.grid = grid
        self.density = density

    def predict(self, X):
        """Return a list with one predicted y for each entry of X."""
        return [self._predict_y_given_x(value) for value in iter(X)]

    def _predict_y_given_x(self, new_x):
        """Snap new_x to the nearest x grid point and return the y with the highest density there."""
        x_axis = self.grid[0]
        nearest = np.argmin(np.abs(x_axis - new_x))
        y_axis = self.grid[1]
        return y_axis[np.argmax(self.density[nearest])]

def dkde_model(train_data, user_id):
    """
    Fit a 2-D diffusion KDE for one user and wrap it in a KDEModel.

    x is column 4 (stay duration) and y is column 2 (energy consumption) of the
    rows whose column 3 equals `user_id`. The density is estimated with kde2d
    on a 64-point grid with automatically chosen limits.

    :return: KDEModel holding the estimated grid and density.
    """
    rows = train_data[train_data[:, 3] == user_id]
    stay_duration = rows[:, 4]
    energy = rows[:, 2]

    # The bandwidth returned by kde2d is not needed afterwards.
    density, grid, _bandwidth = kde2d(stay_duration, energy, n=64, limits=None)
    return KDEModel(grid, density)

# Fitted KDE model per JPL user, keyed by user ID.
DKDE_user_models = {user_id: dkde_model(JPL_train, user_id) for user_id in user_ids_JPL}


#Train Decision Tree model for all users

In [None]:
import numpy as np
from sklearn.model_selection import GridSearchCV
from sklearn.tree import DecisionTreeRegressor
from sklearn.preprocessing import StandardScaler

def train_user_model(train_data, user_id,depth_range=(1, 21), min_samples_split_range=(2, 11), cv_folds=5):
    """
    Grid-search a DecisionTreeRegressor for one user and return the best tree.

    The single feature is column 4 and the target is column 2 of the rows whose
    column 3 equals `user_id` (other cells in this notebook describe column 4 as
    the stay duration and column 2 as the energy).

    NOTE: the SVR cell later defines a function with this same name, which
    replaces this one in the kernel once that cell has run.

    :param depth_range: (start, stop) passed to range() for max_depth.
    :param min_samples_split_range: (start, stop) passed to range() for min_samples_split.
    :param cv_folds: number of cross-validation folds.
    :return: the best estimator found by the search.
    """
    rows = train_data[train_data[:, 3] == user_id]
    features = rows[:, 4].reshape(-1, 1)
    target = rows[:, 2]

    # Candidate hyperparameters: every depth / split size in the half-open ranges.
    search_space = {
        'max_depth': list(range(depth_range[0], depth_range[1])),
        'min_samples_split': list(range(min_samples_split_range[0], min_samples_split_range[1])),
    }
    search = GridSearchCV(
        DecisionTreeRegressor(),
        search_space,
        cv=cv_folds,
        scoring='neg_mean_squared_error',
        return_train_score=True,
    )
    search.fit(features, target)

    return search.best_estimator_

# Best decision tree per JPL user, keyed by user ID (uses the train_user_model defined in this cell).
DT_user_models = {user_id: train_user_model(JPL_train, user_id) for user_id in user_ids_JPL}

#Train SVR models for all users

In [None]:
from sklearn.svm import SVR
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import mean_squared_error
def train_user_model(train_data, user_id, k=5):
    """
    Grid-search an RBF-kernel SVR for one user and return the best model.

    The single feature is column 4 and the target is column 2 of the rows whose
    column 3 equals `user_id` (other cells in this notebook describe column 4 as
    the stay duration and column 2 as the energy).

    NOTE: this definition replaces the decision-tree function of the same name
    from the previous cell.

    :param k: number of cross-validation folds.
    :return: the best estimator found by the search.
    """
    rows = train_data[train_data[:, 3] == user_id]
    features = rows[:, 4].reshape(-1, 1)
    target = rows[:, 2]

    # Hyperparameter candidates explored by the search.
    search_space = {
        'C': [0.1, 1, 10, 100],
        'epsilon': [0.001, 0.01, 0.1, 1],
        'gamma': ['scale', 'auto', 0.1, 1, 10],
    }

    # k-fold search over the RBF SVR, using all available cores.
    search = GridSearchCV(
        estimator=SVR(kernel='rbf'),
        param_grid=search_space,
        cv=k,
        scoring='neg_mean_squared_error',
        n_jobs=-1,
    )
    search.fit(features, target)

    return search.best_estimator_
# Best SVR per JPL user, keyed by user ID (uses the train_user_model defined just above).
svr_models_JPL = {user_id: train_user_model(JPL_train, user_id) for user_id in user_ids_JPL}

#Ensemble ML Algorithm Energy Prediction (with correlations)

In [None]:
# Hard-coded per-user ratio, keyed by user ID. The ensemble cell compares each
# value with 3.5 to choose between the MLR model (below 3.5) and the DKDE model.
# NOTE(review): how these values were computed is not shown in this notebook, and
# the ensemble code comments call the quantity "R_SD" -- TODO document its source.
R_DE={'169': 2.119068507176728,
 '171': 3.7115405017002776,
 '176': 2.9977726188370353,
 '220': 4.170165871125364,
 '322': 2.974319275358279,
 '334': 1.54796573662954,
 '335': 3.9915670302142447,
 '346': 4.687498008228514,
 '365': 4.400963644915041,
 '368': 3.4312414172978474,
 '372': 2.9575939245001077,
 '374': 3.839538311235034,
 '378': 3.1515844041039993,
 '382': 4.148262279562261,
 '404': 3.097855936467447,
 '405': 4.732202935285684,
 '406': 1.158211936837657,
 '409': 2.886901319079918,
 '410': 4.280915725084575,
 '416': 4.148262279562261,
 '436': 3.3808089672729613,
 '444': 3.0622185077071102,
 '458': 4.148262279562261,
 '467': 3.777008295296614,
 '474': 2.6165283654578775,
 '476': 3.7356597962280196,
 '481': 4.045488808054726,
 '483': 2.755038189921663,
 '507': 3.7804325624313817,
 '526': 2.4478241428939165,
 '531': 3.5032226226713927,
 '537': 3.1955976636365735,
 '551': 4.767567980735159,
 '553': 2.555583224030839,
 '576': 3.380624050295972,
 '577': 4.031441936975631,
 '581': 2.9304059447845647,
 '592': 3.056683078202939,
 '607': 4.625780698041485,
 '651': 3.492080210083478,
 '726': 3.2841952906487855,
 '742': 3.9023067062499264,
 '826': 3.375181253576597,
 '933': 3.9394875755880188}

# Re-key by integer user ID so lookups with numeric IDs succeed.
R_DE = {int(key): value for key, value in R_DE.items()}


In [None]:
def ensemble_algorithm_with_correlation(train_data, test_data, user_id, most_corr_users, mlr_user_model, DKDE_user_model,DT_user_model,svr_user_model,similarity_matrix, R_DE):
    """
    Score one user's similarity-weighted ensemble energy forecast with SMAPE.

    The user's own model predicts the energy (column 2) of the user's test rows.
    Each neighbour listed in `most_corr_users[user_id]` then predicts the same
    rows with its own model, weighted by its similarity to the user; the user's
    own forecast has weight 1 and the weighted sum is divided by the total weight.

    For every user (main or neighbour) the model family is chosen from `R_DE`:
    a value below 3.5 (or a missing entry, treated as 0) selects the MLR model,
    which sees columns 0, 1 and 4; otherwise the DKDE model, which sees column 4.

    :param train_data: unused; kept for interface compatibility.
    :param test_data: 2-D float array of test sessions (user ID in column 3).
    :param user_id: ID of the user being evaluated.
    :param most_corr_users: dict mapping user ID -> list of neighbour IDs.
    :param mlr_user_model: dict mapping user ID -> model exposing predict().
    :param DKDE_user_model: dict mapping user ID -> model exposing predict().
    :param DT_user_model: unused; kept for interface compatibility.
    :param svr_user_model: unused; kept for interface compatibility.
    :param similarity_matrix: DataFrame of similarities labelled by user ID.
    :param R_DE: dict mapping user ID -> ratio used for model selection.
    :return: SMAPE in percent, computed as mean(|y - y_hat| / |y + y_hat|) * 100.
    :raises ValueError: if the user has no test rows.
    :raises KeyError: if the model selected for `user_id` itself is missing.
    """
    ratio_threshold = 3.5

    user_test_data = test_data[test_data[:, 3] == user_id]
    if len(user_test_data) == 0:
        # Without test rows SMAPE is undefined; fail with a clear message instead of
        # a ZeroDivisionError (or a model-specific error) further down.
        raise ValueError(f"No test rows for user {user_id}; SMAPE is undefined.")

    X_test_mlr = user_test_data[:, [0, 1, 4]]
    X_test_dkde = user_test_data[:, 4]
    y_test = user_test_data[:, 2]

    def _predict_for(uid):
        """Forecast this user's test rows with uid's selected model; None if that model is missing."""
        if R_DE.get(uid, 0) < ratio_threshold:
            models, features = mlr_user_model, X_test_mlr
        else:
            models, features = DKDE_user_model, X_test_dkde
        if uid not in models:
            return None
        # Always a fresh float array: DKDE models return lists, and the in-place
        # arithmetic below must not touch anything a model handed back.
        return np.array(models[uid].predict(features), dtype=float)

    y_pred = _predict_for(user_id)
    if y_pred is None:
        raise KeyError(user_id)

    # Blend in the neighbours' forecasts, weighted by similarity.
    # NOTE(review): neighbours are selected on |similarity|, so a weight can be
    # negative and the total weight can shrink -- confirm this is intended.
    total_correlation = 1
    if user_id in most_corr_users:
        for other_user_id in most_corr_users[user_id]:
            other_user_pred = _predict_for(other_user_id)
            if other_user_pred is None:
                # Previously a neighbour present in only one model dict could be
                # routed to the other dict and raise KeyError; skip it instead.
                continue

            correlation = similarity_matrix.loc[user_id, other_user_id]
            y_pred += other_user_pred * correlation
            total_correlation += correlation

    if total_correlation != 0:
        y_pred /= total_correlation

    # Calculate user SMAPE
    n = len(y_test)
    smape_val = (1 / n) * np.sum(np.abs(y_test - y_pred) / (np.abs(y_test + y_pred))) * 100

    return smape_val


In [None]:
# Every user ID (column 3) that appears in the JPL training or test array.
user_ids_JPL = np.unique(np.concatenate((JPL_train[:, 3], JPL_test[:, 3])))

# One SMAPE value (in percent) per JPL user, in user_ids_JPL order.
smape_list_JPL=[]

# The half-hourly neighbour lists and similarity matrix are the ones used here.
for user_id in user_ids_JPL:
    smape = ensemble_algorithm_with_correlation(JPL_train, JPL_test, user_id, most_corr_users_half_hourly_JPL, MLR_user_models, DKDE_user_models,DT_user_models,svr_models_JPL,similarities_half_hourly_JPL,R_DE)
    smape_list_JPL.append(smape)

# Calculate the average SMAPE
# Unweighted mean over users; 0 is reported when there are no users at all.
average_smape = np.mean(smape_list_JPL) if smape_list_JPL else 0

print(f"Average SMAPE for the dataset: {average_smape:.2f}%")

In [None]:
def ensemble_algorithm_with_correlation(train_data, test_data, user_id, most_corr_users, mlr_user_model, DKDE_user_model,DT_user_model,svr_user_model,similarity_matrix, R_DE):
    """
    Score one user's similarity-weighted ensemble energy forecast with SMAPE.

    The user's own model predicts the energy (column 2) of the user's test rows.
    Each neighbour listed in `most_corr_users[user_id]` then predicts the same
    rows with its own model, weighted by its similarity to the user; the user's
    own forecast has weight 1 and the weighted sum is divided by the total weight.

    For every user (main or neighbour) the model family is chosen from `R_DE`:
    a value below 3.5 (or a missing entry, treated as 0) selects the MLR model,
    which sees columns 0, 1 and 4; otherwise the DKDE model, which sees column 4.

    :param train_data: unused; kept for interface compatibility.
    :param test_data: 2-D float array of test sessions (user ID in column 3).
    :param user_id: ID of the user being evaluated.
    :param most_corr_users: dict mapping user ID -> list of neighbour IDs.
    :param mlr_user_model: dict mapping user ID -> model exposing predict().
    :param DKDE_user_model: dict mapping user ID -> model exposing predict().
    :param DT_user_model: unused; kept for interface compatibility.
    :param svr_user_model: unused; kept for interface compatibility.
    :param similarity_matrix: DataFrame of similarities labelled by user ID.
    :param R_DE: dict mapping user ID -> ratio used for model selection.
    :return: SMAPE in percent, computed as mean(|y - y_hat| / |y + y_hat|) * 100.
    :raises ValueError: if the user has no test rows.
    :raises KeyError: if the model selected for `user_id` itself is missing.
    """
    ratio_threshold = 3.5

    user_test_data = test_data[test_data[:, 3] == user_id]
    if len(user_test_data) == 0:
        # Without test rows SMAPE is undefined; fail with a clear message instead of
        # a ZeroDivisionError (or a model-specific error) further down.
        raise ValueError(f"No test rows for user {user_id}; SMAPE is undefined.")

    X_test_mlr = user_test_data[:, [0, 1, 4]]
    X_test_dkde = user_test_data[:, 4]
    y_test = user_test_data[:, 2]

    def _predict_for(uid):
        """Forecast this user's test rows with uid's selected model; None if that model is missing."""
        if R_DE.get(uid, 0) < ratio_threshold:
            models, features = mlr_user_model, X_test_mlr
        else:
            models, features = DKDE_user_model, X_test_dkde
        if uid not in models:
            return None
        # Always a fresh float array: DKDE models return lists, and the in-place
        # arithmetic below must not touch anything a model handed back.
        return np.array(models[uid].predict(features), dtype=float)

    y_pred = _predict_for(user_id)
    if y_pred is None:
        raise KeyError(user_id)

    # Blend in the neighbours' forecasts, weighted by similarity.
    # NOTE(review): neighbours are selected on |similarity|, so a weight can be
    # negative and the total weight can shrink -- confirm this is intended.
    total_correlation = 1
    if user_id in most_corr_users:
        for other_user_id in most_corr_users[user_id]:
            other_user_pred = _predict_for(other_user_id)
            if other_user_pred is None:
                # Previously a neighbour present in only one model dict could be
                # routed to the other dict and raise KeyError; skip it instead.
                continue

            correlation = similarity_matrix.loc[user_id, other_user_id]
            y_pred += other_user_pred * correlation
            total_correlation += correlation

    if total_correlation != 0:
        y_pred /= total_correlation

    # Calculate user SMAPE
    n = len(y_test)
    smape_val = (1 / n) * np.sum(np.abs(y_test - y_pred) / (np.abs(y_test + y_pred))) * 100

    return smape_val
