In [None]:
# Mount Google Drive inside the Colab runtime so that files under
# /content/drive become readable from this notebook.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
import numpy as np
import pandas as pd

# Locations of the four CSV exports (training/testing split for each site)
file_path1 = '/content/drive/MyDrive/SuperUROP /Data Analysis/caltech_training_data.csv'
file_path2 = '/content/drive/MyDrive/SuperUROP /Data Analysis/caltech_testing_data.csv'
file_path3 = '/content/drive/MyDrive/SuperUROP /Data Analysis/JPL_training_data.csv'
file_path4 = '/content/drive/MyDrive/SuperUROP /Data Analysis/JPL_testing_data.csv'

# Read each CSV with pandas, in the same order as the paths above, and keep
# only the underlying NumPy array of every table
caltech_train, caltech_test, JPL_train, JPL_test = (
    pd.read_csv(csv_path).to_numpy()
    for csv_path in (file_path1, file_path2, file_path3, file_path4)
)

# Data Processing for Decision Tree

In [None]:
# Drop the leading row-number column from all four datasets.
# Note: re-running this cell removes one more column each time.
caltech_train, caltech_test, JPL_train, JPL_test = (
    dataset[:, 1:]
    for dataset in (caltech_train, caltech_test, JPL_train, JPL_test)
)

In [None]:
# Drop the departure time (2nd column)
# Turn the arrival timestamp into a fractional hour plus the weekday name
from datetime import datetime

def convert_time_and_day(data_array):
    """
    Replace the leading timestamp of every row with "<hour> <weekday>".

    The first column is parsed as 'YYYY-MM-DD HH:MM:SS'. It is replaced by a
    string made of the fractional hour (hour + minute / 60; seconds are
    ignored) followed by a space and the weekday name. The second column is
    dropped; all remaining columns are carried over in order.

    Returns a NumPy array built from the rewritten rows. Because the first
    entry of every row is a string, NumPy stores the other values as strings
    too.
    """
    def _rewrite(record):
        stamp = datetime.strptime(record[0], '%Y-%m-%d %H:%M:%S')
        fractional_hour = stamp.hour + (stamp.minute / 60.0)
        # Same text as str(fractional_hour) + " " + stamp.strftime('%A')
        label = f"{fractional_hour} {stamp:%A}"
        # record[1] (second column) is intentionally skipped
        return [label, *record[2:]]

    return np.array([_rewrite(record) for record in data_array])

# Apply the timestamp rewrite to every dataset, in the original order
caltech_train, caltech_test, JPL_train, JPL_test = (
    convert_time_and_day(dataset)
    for dataset in (caltech_train, caltech_test, JPL_train, JPL_test)
)

In [None]:
def day_to_number(day):
    """Map an English weekday name to its number (Monday -> 1, ..., Sunday -> 7).

    Raises KeyError for any other string (the match is case-sensitive).
    """
    ordered_names = (
        'Monday', 'Tuesday', 'Wednesday', 'Thursday',
        'Friday', 'Saturday', 'Sunday',
    )
    lookup = {name: position for position, name in enumerate(ordered_names, start=1)}
    return lookup[day]

def separate_time_and_day(data_array):
    """
    Split the leading "<hour> <weekday>" column into two columns.

    For every row, the first entry is split on whitespace into an hour string
    and a weekday name. The hour is converted to float and the weekday is
    mapped to 1-7 through day_to_number. Both values are placed in front of
    the remaining columns, which are carried over unchanged.

    Returns a NumPy array built from the widened rows.
    """
    def _widen(record):
        hour_text, weekday_name = record[0].split()
        return [float(hour_text), day_to_number(weekday_name), *record[1:]]

    return np.array([_widen(record) for record in data_array])

# Split the combined "<hour> <weekday>" column in every dataset
caltech_train, caltech_test, JPL_train, JPL_test = (
    separate_time_and_day(dataset)
    for dataset in (caltech_train, caltech_test, JPL_train, JPL_test)
)

In [None]:
#Make training and testing set have the same user IDs
# Column 3 is treated as the user ID. The filtering is done in two passes and
# the order matters: first the test set is restricted to users seen in
# training, then the training set is restricted to the users that survived in
# the (already filtered) test set. Afterwards both splits contain exactly the
# same set of user IDs.
users_from_training_caltech = set(caltech_train[:, 3])
mask_caltech = np.isin(caltech_test[:, 3], list(users_from_training_caltech))
caltech_test = caltech_test[mask_caltech]
users_from_testing_caltech = set(caltech_test[:, 3])
mask_caltech = np.isin(caltech_train[:, 3], list(users_from_testing_caltech))
caltech_train = caltech_train[mask_caltech]

# Same two-pass alignment for the JPL site
users_from_training = set(JPL_train[:, 3])
mask = np.isin(JPL_test[:, 3], list(users_from_training))
JPL_test = JPL_test[mask]
users_from_testing = set(JPL_test[:, 3])
mask = np.isin(JPL_train[:, 3], list(users_from_testing))
JPL_train = JPL_train[mask]

In [None]:
# Convert every dataset to a floating-point array (a new copy is made each time)
caltech_train = caltech_train.astype('float')
caltech_test = caltech_test.astype('float')
JPL_train = JPL_train.astype('float')
JPL_test = JPL_test.astype('float')

# Predict energy consumption using K-fold CV & Grid Search

In [None]:
import numpy as np
from sklearn.model_selection import GridSearchCV
from sklearn.tree import DecisionTreeRegressor
from sklearn.preprocessing import StandardScaler
import matplotlib.pyplot as plt

def perform_grid_search_decision_tree(train_data, test_data, used_id, depth_range=(1, 21), min_samples_split_range=(2, 11), cv_folds=5):
    """
    Tune a per-user Decision Tree regressor with K-fold grid search and score it on the test set.

    Rows belonging to the requested user are selected from both datasets. A
    DecisionTreeRegressor is fitted to predict column 2 (energy consumption)
    from column 4 (stay duration), with its hyperparameters chosen by
    GridSearchCV on the user's training rows. The refitted best model is then
    evaluated on the user's testing rows.

    Parameters:
    - train_data: 2-D numeric array [sample, features]; column 2 is the target,
      column 3 the user ID and column 4 the single input feature
    - test_data: Testing dataset with the same column layout as train_data
    - used_id: ID of the user to model (compared against column 3). The name is
      a historical misspelling of "user_id"; it is kept so keyword callers do
      not break
    - depth_range: (start, stop) pair; max_depth is searched over range(start, stop),
      i.e. stop is excluded (default is (1, 21))
    - min_samples_split_range: (start, stop) pair; min_samples_split is searched
      over range(start, stop), stop excluded (default is (2, 11))
    - cv_folds: Passed to GridSearchCV as `cv` (default is 5 folds)

    Returns:
    - best_params: dict with the best hyperparameters found
    - smape_val: error of the best model on the user's test rows, computed as
      100 * mean(|y - y_hat| / |y + y_hat|)

    Raises:
    - ValueError: if the user has no rows in the training or the testing data
    """
    # Use the argument itself. Previously the body referenced `user_id`, which
    # is not a parameter, so the function silently depended on a notebook
    # global of that name.
    user_id = used_id

    # Filter training and testing data for the specific user
    user_train_data = train_data[train_data[:, 3] == user_id]
    user_test_data = test_data[test_data[:, 3] == user_id]
    if len(user_train_data) == 0 or len(user_test_data) == 0:
        raise ValueError(
            f"User {user_id!r} needs at least one row in both the training and the testing data"
        )

    # Extract columns for stay duration and energy consumption for both train and test sets
    X_train = user_train_data[:, 4].reshape(-1, 1)  # Stay Duration
    y_train = user_train_data[:, 2]  # Energy Consumption

    X_test = user_test_data[:, 4].reshape(-1, 1)
    y_test = user_test_data[:, 2]

    # Set up grid search with cross-validation for Decision Tree
    param_grid = {
        'max_depth': list(range(depth_range[0], depth_range[1])),
        'min_samples_split': list(range(min_samples_split_range[0], min_samples_split_range[1]))
    }
    dt = DecisionTreeRegressor()
    # Training scores are not requested: they are never read here and only
    # add work to every (candidate, fold) pair.
    grid_search = GridSearchCV(dt, param_grid, cv=cv_folds, scoring='neg_mean_squared_error')
    grid_search.fit(X_train, y_train)

    # GridSearchCV refits the best candidate on all of the user's training rows
    best_dt = grid_search.best_estimator_

    # Predict on the test set
    y_pred = best_dt.predict(X_test)

    # Best hyperparameters
    best_params = grid_search.best_params_

    # Calculate user SMAPE. A sample where truth and prediction are both zero
    # is a perfect prediction; count it as 0 error instead of letting 0/0
    # produce NaN and poison the whole average.
    abs_error = np.abs(y_test - y_pred)
    denominator = np.abs(y_test + y_pred)
    both_zero = (abs_error == 0) & (denominator == 0)
    ratios = abs_error / np.where(both_zero, 1.0, denominator)
    n = len(y_test)
    smape_val = (1 / n) * np.sum(ratios) * 100

    return best_params, smape_val




In [None]:
# Test the function
# Run the per-user grid search for every user ID that occurs in either split
# (column 3), collecting one SMAPE value and one best-parameter dict per user
# for each site.
user_ids_caltech = np.unique(np.concatenate((caltech_train[:, 3], caltech_test[:, 3])))
user_ids_JPL = np.unique(np.concatenate((JPL_train[:, 3], JPL_test[:, 3])))

smape_list_caltech=[]
smape_list_JPL=[]
best_params_caltech=[]
best_params_JPL=[]
# NOTE(review): the loop variable is named `user_id` at module scope. Check how
# perform_grid_search_decision_tree consumes its user argument before renaming
# it or moving these loops into a function or comprehension.
for user_id in user_ids_caltech:
    best_params,smape = perform_grid_search_decision_tree(caltech_train, caltech_test, user_id)
    smape_list_caltech.append(smape)
    best_params_caltech.append(best_params)
for user_id in user_ids_JPL:
    best_params, smape = perform_grid_search_decision_tree(JPL_train, JPL_test, user_id)
    smape_list_JPL.append(smape)
    best_params_JPL.append(best_params)

In [None]:
# Average the per-user SMAPE values of each site (unweighted mean over users)
no_caltech_users = len(user_ids_caltech)
no_JPL_users = len(user_ids_JPL)

caltech_smape = sum(smape_list_caltech) / no_caltech_users
JPL_smape = sum(smape_list_JPL) / no_JPL_users

print(f"Caltech SMAPE: {caltech_smape}")
print(f"JPL SMAPE: {JPL_smape}")

## Predict energy consumption using Leave-One-Out CV & Grid Search

In [None]:
import numpy as np
from sklearn.model_selection import GridSearchCV,LeaveOneOut
from sklearn.tree import DecisionTreeRegressor
from sklearn.preprocessing import StandardScaler
import matplotlib.pyplot as plt

def perform_grid_search_decision_tree(train_data, test_data, used_id, depth_range=(1, 21), min_samples_split_range=(2, 11)):
    """
    Tune a per-user Decision Tree regressor with Leave-One-Out grid search and score it on the test set.

    Rows belonging to the requested user are selected from both datasets. A
    DecisionTreeRegressor is fitted to predict column 2 (energy consumption)
    from column 4 (stay duration), with its hyperparameters chosen by
    GridSearchCV using LeaveOneOut splits of the user's training rows. The
    refitted best model is then evaluated on the user's testing rows.

    Parameters:
    - train_data: 2-D numeric array [sample, features]; column 2 is the target,
      column 3 the user ID and column 4 the single input feature
    - test_data: Testing dataset with the same column layout as train_data
    - used_id: ID of the user to model (compared against column 3). The name is
      a historical misspelling of "user_id"; it is kept so keyword callers do
      not break
    - depth_range: (start, stop) pair; max_depth is searched over range(start, stop),
      i.e. stop is excluded (default is (1, 21))
    - min_samples_split_range: (start, stop) pair; min_samples_split is searched
      over range(start, stop), stop excluded (default is (2, 11))

    Returns:
    - best_params: dict with the best hyperparameters found
    - smape_val: error of the best model on the user's test rows, computed as
      100 * mean(|y - y_hat| / |y + y_hat|)

    Raises:
    - ValueError: if the user has no rows in the training or the testing data
    """
    # Use the argument itself. Previously the body referenced `user_id`, which
    # is not a parameter, so the function silently depended on a notebook
    # global of that name.
    user_id = used_id

    # Filter training and testing data for the specific user
    user_train_data = train_data[train_data[:, 3] == user_id]
    user_test_data = test_data[test_data[:, 3] == user_id]
    if len(user_train_data) == 0 or len(user_test_data) == 0:
        raise ValueError(
            f"User {user_id!r} needs at least one row in both the training and the testing data"
        )

    # Extract columns for stay duration and energy consumption for both train and test sets
    X_train = user_train_data[:, 4].reshape(-1, 1)  # Stay Duration
    y_train = user_train_data[:, 2]  # Energy Consumption

    X_test = user_test_data[:, 4].reshape(-1, 1)
    y_test = user_test_data[:, 2]

    # Set up grid search with cross-validation for Decision Tree
    param_grid = {
        'max_depth': list(range(depth_range[0], depth_range[1])),
        'min_samples_split': list(range(min_samples_split_range[0], min_samples_split_range[1]))
    }

    # Use LeaveOneOut for cross-validation: one fold per training row
    loo = LeaveOneOut()

    dt = DecisionTreeRegressor()
    # Training scores are not requested: they are never read here and only
    # add work to every (candidate, fold) pair, which is costly with one fold
    # per sample.
    grid_search = GridSearchCV(dt, param_grid, cv=loo, scoring='neg_mean_squared_error')
    grid_search.fit(X_train, y_train)

    # GridSearchCV refits the best candidate on all of the user's training rows
    best_dt = grid_search.best_estimator_

    # Predict on the test set
    y_pred = best_dt.predict(X_test)

    # Best hyperparameters
    best_params = grid_search.best_params_

    # Calculate user SMAPE. A sample where truth and prediction are both zero
    # is a perfect prediction; count it as 0 error instead of letting 0/0
    # produce NaN and poison the whole average.
    abs_error = np.abs(y_test - y_pred)
    denominator = np.abs(y_test + y_pred)
    both_zero = (abs_error == 0) & (denominator == 0)
    ratios = abs_error / np.where(both_zero, 1.0, denominator)
    n = len(y_test)
    smape_val = (1 / n) * np.sum(ratios) * 100

    return best_params, smape_val


In [None]:
# Test the function
# Run the per-user grid search for every user ID that occurs in either split
# (column 3), collecting one SMAPE value and one best-parameter dict per user
# for each site. The result lists are re-created here, so earlier contents
# under the same names are replaced.
user_ids_caltech = np.unique(np.concatenate((caltech_train[:, 3], caltech_test[:, 3])))
user_ids_JPL = np.unique(np.concatenate((JPL_train[:, 3], JPL_test[:, 3])))

smape_list_caltech=[]
smape_list_JPL=[]
best_params_caltech=[]
best_params_JPL=[]
# NOTE(review): the loop variable is named `user_id` at module scope. Check how
# perform_grid_search_decision_tree consumes its user argument before renaming
# it or moving these loops into a function or comprehension.
for user_id in user_ids_caltech:
    best_params,smape = perform_grid_search_decision_tree(caltech_train, caltech_test, user_id)
    smape_list_caltech.append(smape)
    best_params_caltech.append(best_params)
for user_id in user_ids_JPL:
    best_params, smape = perform_grid_search_decision_tree(JPL_train, JPL_test, user_id)
    smape_list_JPL.append(smape)
    best_params_JPL.append(best_params)

In [None]:
# Average the per-user SMAPE values of each site (unweighted mean over users)
no_caltech_users = len(user_ids_caltech)
no_JPL_users = len(user_ids_JPL)

caltech_smape = sum(smape_list_caltech) / no_caltech_users
JPL_smape = sum(smape_list_JPL) / no_JPL_users

print(f"Caltech SMAPE: {caltech_smape}")
print(f"JPL SMAPE: {JPL_smape}")

# Plot SMAPE vs R_DE for all users (JPL)

In [None]:
# Initialize dictionaries to store user-wise SMAPE values
# user_smape_JPL maps each JPL user ID (an element of user_ids_JPL) to that user's SMAPE.
user_smape_JPL = {}
user_ids_JPL = np.unique(np.concatenate((JPL_train[:, 3], JPL_test[:, 3])))

# Loop through each user ID in the JPL dataset
# NOTE(review): this repeats the per-user JPL grid search that the cell filling
# smape_list_JPL already runs; presumably the values match in the same user
# order, so reusing them would avoid the recomputation - confirm before changing.
# NOTE(review): keep the loop variable named `user_id` at module scope until
# the helper's handling of its user argument has been checked.
for user_id in user_ids_JPL:
    best_params, smape = perform_grid_search_decision_tree(JPL_train, JPL_test, user_id)
    user_smape_JPL[user_id] = smape  # Store the SMAPE in the dictionary

# Unweighted mean over users
average_smape_JPL = np.mean(list(user_smape_JPL.values()))

print(f"Average SMAPE for JPL dataset: {average_smape_JPL:.2f}%")

In [None]:
# Hard-coded per-user R_DE values, keyed by user ID written as a string.
# NOTE(review): how these values were computed is not shown in this notebook -
# presumably they were exported from a separate analysis; record the source.
R_DE={'169': 2.119068507176728,
 '171': 3.7115405017002776,
 '176': 2.9977726188370353,
 '220': 4.170165871125364,
 '322': 2.974319275358279,
 '334': 1.54796573662954,
 '335': 3.9915670302142447,
 '346': 4.687498008228514,
 '365': 4.400963644915041,
 '368': 3.4312414172978474,
 '372': 2.9575939245001077,
 '374': 3.839538311235034,
 '378': 3.1515844041039993,
 '382': 4.148262279562261,
 '404': 3.097855936467447,
 '405': 4.732202935285684,
 '406': 1.158211936837657,
 '409': 2.886901319079918,
 '410': 4.280915725084575,
 '416': 4.148262279562261,
 '436': 3.3808089672729613,
 '444': 3.0622185077071102,
 '458': 4.148262279562261,
 '467': 3.777008295296614,
 '474': 2.6165283654578775,
 '476': 3.7356597962280196,
 '481': 4.045488808054726,
 '483': 2.755038189921663,
 '507': 3.7804325624313817,
 '526': 2.4478241428939165,
 '531': 3.5032226226713927,
 '537': 3.1955976636365735,
 '551': 4.767567980735159,
 '553': 2.555583224030839,
 '576': 3.380624050295972,
 '577': 4.031441936975631,
 '581': 2.9304059447845647,
 '592': 3.056683078202939,
 '607': 4.625780698041485,
 '651': 3.492080210083478,
 '726': 3.2841952906487855,
 '742': 3.9023067062499264,
 '826': 3.375181253576597,
 '933': 3.9394875755880188}

In [None]:
import matplotlib.pyplot as plt

# R_DE is keyed by user-ID strings; re-key it by int so its keys compare equal
# to the numeric user IDs used as keys of user_smape_JPL
R_DE_int_keys = {int(k): v for k, v in R_DE.items()}

# Users that have both an R_DE value and a SMAPE value
common_keys = set(user_smape_JPL.keys()).intersection(R_DE_int_keys.keys())
# Both lists iterate the same unmodified set, so x_values[i] and y_values[i]
# always belong to the same user
x_values = [R_DE_int_keys[key] for key in common_keys]
y_values = [user_smape_JPL[key] for key in common_keys]

# Pearson correlation between R_DE and SMAPE
correlation_coefficient = np.corrcoef(x_values, y_values)[0, 1]
print(correlation_coefficient)

# Calculating the line of best fit
m, b = np.polyfit(x_values, y_values, 1)  # m is slope, b is y-intercept
# Generating y-values for the line of best fit
fit_line = [m*x + b for x in x_values]

# Plotting
plt.figure(figsize=(10, 6))
plt.scatter(x_values, y_values, color='blue')
plt.plot(x_values, fit_line, color='black')
plt.title("Comparison of JPL SMAPE and R_DE values for each user")
# The x-axis shows R_DE; it was previously mislabelled "R_SD"
plt.xlabel("R_DE")
plt.ylabel("JPL SMAPE")
plt.grid(True)
plt.show()



In [None]:
# Hard-coded per-user sparsity values, keyed by user ID written as a string.
# NOTE(review): how these values were computed is not shown in this notebook -
# presumably they were exported from a separate analysis; record the source.
sparsity={'169': 0.9779411764705882,
 '171': 0.9656862745098039,
 '176': 0.9754901960784313,
 '220': 0.9558823529411765,
 '322': 0.9705882352941176,
 '334': 0.9901960784313726,
 '335': 0.9583333333333334,
 '346': 0.9436274509803921,
 '365': 0.9509803921568627,
 '368': 0.9656862745098039,
 '372': 0.9754901960784313,
 '374': 0.9607843137254902,
 '378': 0.9730392156862745,
 '382': 0.9583333333333334,
 '404': 0.9754901960784313,
 '405': 0.9362745098039216,
 '406': 0.9877450980392157,
 '409': 0.9779411764705882,
 '410': 0.9485294117647058,
 '416': 0.9583333333333334,
 '436': 0.9705882352941176,
 '444': 0.9730392156862745,
 '458': 0.9583333333333334,
 '467': 0.9632352941176471,
 '474': 0.9779411764705882,
 '476': 0.9656862745098039,
 '481': 0.9607843137254902,
 '483': 0.9754901960784313,
 '507': 0.9632352941176471,
 '526': 0.9779411764705882,
 '531': 0.9656862745098039,
 '537': 0.9705882352941176,
 '551': 0.9411764705882353,
 '553': 0.9828431372549019,
 '576': 0.9705882352941176,
 '577': 0.9583333333333334,
 '581': 0.9779411764705882,
 '592': 0.9730392156862745,
 '607': 0.9436274509803921,
 '651': 0.9656862745098039,
 '726': 0.9705882352941176,
 '742': 0.9607843137254902,
 '826': 0.9730392156862745,
 '933': 0.9632352941176471}

# Hard-coded per-user entropy values, keyed by user ID written as a string.
# NOTE(review): how these values were computed is not shown in this notebook -
# presumably they were exported from a separate analysis; record the source.
entropy={'169': 2.0723243489301826,
 '171': 3.5841837197791895,
 '176': 2.924297799747892,
 '220': 3.9861879650463035,
 '322': 2.8868392966712704,
 '334': 1.5327896019567016,
 '335': 3.825251737288651,
 '346': 4.423251796980337,
 '365': 4.1852301329094015,
 '368': 3.313502741214098,
 '372': 2.8851038773309874,
 '374': 3.6889681813826796,
 '378': 3.0666152167384504,
 '382': 3.9754180179138334,
 '404': 3.021928094887363,
 '405': 4.430640983527282,
 '406': 1.1440181631019015,
 '409': 2.823219672335508,
 '410': 4.060574474528751,
 '416': 3.9754180179138334,
 '436': 3.2813734094119917,
 '444': 2.9796586949993205,
 '458': 3.9754180179138334,
 '467': 3.638147696204827,
 '474': 2.558810827984542,
 '476': 3.6074753914554893,
 '481': 3.8868421881310113,
 '483': 2.6875127440902498,
 '507': 3.641446071165522,
 '526': 2.393828022094786,
 '531': 3.383014003266002,
 '537': 3.101609497059027,
 '551': 4.487122805397797,
 '553': 2.511737433422467,
 '576': 3.2811939311696197,
 '577': 3.863465189601647,
 '581': 2.865764637179023,
 '592': 2.9742725050160947,
 '607': 4.365013648887185,
 '651': 3.3722539283649273,
 '726': 3.1876013115120565,
 '742': 3.7492750707107136,
 '826': 3.2841837197791888,
 '933': 3.794653473544342}


# --- Entropy vs. SMAPE ---
# Re-key the entropy table by integer user ID so it can be matched against user_smape_JPL
entropy_int_keys = dict((int(uid), score) for uid, score in entropy.items())

# Users that have both an entropy value and a SMAPE value
common_keys_ent = set(user_smape_JPL.keys()).intersection(entropy_int_keys.keys())
x_values_ent = [entropy_int_keys[uid] for uid in common_keys_ent]
y_values_ent = [user_smape_JPL[uid] for uid in common_keys_ent]

# Pearson correlation between the paired values
correlation_coefficient_ent = np.corrcoef(x_values_ent, y_values_ent)[0, 1]
print(correlation_coefficient_ent)
# Degree-1 least-squares fit: m is the slope, b the intercept
m, b = np.polyfit(x_values_ent, y_values_ent, 1)
# Fitted line evaluated at every observed entropy value
fit_line_ent = [m * x_val + b for x_val in x_values_ent]


# --- Sparsity vs. SMAPE ---
# Re-key the sparsity table by integer user ID in the same way
sparsity_int_keys = dict((int(uid), score) for uid, score in sparsity.items())

# Users that have both a sparsity value and a SMAPE value
common_keys_spars = set(user_smape_JPL.keys()).intersection(sparsity_int_keys.keys())
x_values_spars = [sparsity_int_keys[uid] for uid in common_keys_spars]
y_values_spars = [user_smape_JPL[uid] for uid in common_keys_spars]

# Pearson correlation between the paired values
correlation_coefficient_spars = np.corrcoef(x_values_spars, y_values_spars)[0, 1]
print(correlation_coefficient_spars)
# Degree-1 least-squares fit; m and b are overwritten with the sparsity fit
m, b = np.polyfit(x_values_spars, y_values_spars, 1)
# Fitted line evaluated at every observed sparsity value
fit_line_spars = [m * x_val + b for x_val in x_values_spars]


In [None]:
import matplotlib.pyplot as plt


def _draw_smape_panel(axis, xs, ys, trend, x_label, corr):
    """Scatter xs against ys on `axis`, overlay the fitted line and annotate the correlation."""
    axis.scatter(xs, ys, color='blue')
    axis.plot(xs, trend, color='black')
    axis.set_xlabel(x_label, fontsize=16)
    axis.set_ylabel("DT", fontsize=16)
    axis.grid(True)
    # Annotation is positioned in axes coordinates (fractions of the panel)
    axis.text(0.05, 0.82, f'Correlation: {corr:.3f}', transform=axis.transAxes, fontsize=14)


# One row, two panels: entropy on the left, sparsity on the right
fig, ax = plt.subplots(1, 2, figsize=(20, 6))

_draw_smape_panel(ax[0], x_values_ent, y_values_ent, fit_line_ent,
                  "Entropy_DE", correlation_coefficient_ent)
_draw_smape_panel(ax[1], x_values_spars, y_values_spars, fit_line_spars,
                  "Sparsity_DE", correlation_coefficient_spars)

plt.tight_layout()
plt.show()
