In [None]:
# Mount Google Drive at /content/drive; the CSV paths used below live under that mount point.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
import numpy as np
import pandas as pd

# Locations of the four CSV files on the mounted Drive.
# The folder name "SuperUROP " contains a trailing space; the paths are kept verbatim.
file_path1 = '/content/drive/MyDrive/SuperUROP /Data Analysis/caltech_training_data.csv'
file_path2 = '/content/drive/MyDrive/SuperUROP /Data Analysis/caltech_testing_data.csv'
file_path3 = '/content/drive/MyDrive/SuperUROP /Data Analysis/JPL_training_data.csv'
file_path4 = '/content/drive/MyDrive/SuperUROP /Data Analysis/JPL_testing_data.csv'

# Read each CSV with pandas and keep only the underlying NumPy array.
caltech_train, caltech_test, JPL_train, JPL_test = (
    pd.read_csv(csv_path).values
    for csv_path in (file_path1, file_path2, file_path3, file_path4)
)

#Data Processing for Random Forest



In [None]:
# Drop the leading row-number column from every dataset.
# Not idempotent: re-running this cell strips one more column each time.
caltech_train, caltech_test = caltech_train[:, 1:], caltech_test[:, 1:]
JPL_train, JPL_test = JPL_train[:, 1:], JPL_test[:, 1:]

In [None]:
#Remove departure time (2nd column)
# Convert arrival date to hour and find day of the week
from datetime import datetime

def convert_time_and_day(data_array):
    """
    Replace the arrival timestamp with a "<fractional hour> <weekday name>" string.

    Column 0 of each row is parsed with the format '%Y-%m-%d %H:%M:%S' and
    rewritten as, for example, "8.5 Monday" (hour + minute/60; seconds are
    ignored). Column 1 is dropped; all later columns are kept in order.

    Returns a NumPy array built from the transformed rows.
    """
    def _transform(row):
        stamp = datetime.strptime(row[0], '%Y-%m-%d %H:%M:%S')
        fractional_hour = stamp.hour + (stamp.minute / 60.0)
        label = str(fractional_hour) + " " + stamp.strftime('%A')
        # Rebuild the row without index 1 (the second column)
        return [label] + list(row[2:])

    return np.array([_transform(row) for row in data_array])

# Apply the timestamp conversion to all four datasets
caltech_train, caltech_test, JPL_train, JPL_test = map(
    convert_time_and_day,
    (caltech_train, caltech_test, JPL_train, JPL_test),
)

In [None]:
def day_to_number(day):
    """Map an English weekday name to 1 (Monday) through 7 (Sunday); unknown names raise KeyError."""
    weekday_names = ('Monday', 'Tuesday', 'Wednesday', 'Thursday',
                     'Friday', 'Saturday', 'Sunday')
    lookup = {name: position for position, name in enumerate(weekday_names, start=1)}
    return lookup[day]

def separate_time_and_day(data_array):
    """
    Split the combined "<time> <weekday>" first column into two columns.

    For every row, column 0 is split on whitespace into a float time and a
    weekday name; the name is converted with day_to_number (1-7). The new row
    is [time, day number] followed by the remaining original columns.

    Returns a NumPy array built from the new rows.
    """
    rows_out = []
    for record in data_array:
        hour_text, weekday = record[0].split()
        leading = [float(hour_text), day_to_number(weekday)]
        rows_out.append(leading + list(record[1:]))
    return np.array(rows_out)

# Split the time/weekday column in all four datasets
caltech_train, caltech_test, JPL_train, JPL_test = map(
    separate_time_and_day,
    (caltech_train, caltech_test, JPL_train, JPL_test),
)

In [None]:
#Make training and testing set have the same user IDs
# Column 3 is used as the user ID. The filtering runs in two passes and the order
# matters: first drop test rows whose user never appears in training, then drop
# training rows whose user is absent from the already-filtered test set. After
# both passes the training and test sets contain exactly the same users.
users_from_training_caltech = set(caltech_train[:, 3])
mask_caltech = np.isin(caltech_test[:, 3], list(users_from_training_caltech))
caltech_test = caltech_test[mask_caltech]
users_from_testing_caltech = set(caltech_test[:, 3])
mask_caltech = np.isin(caltech_train[:, 3], list(users_from_testing_caltech))
caltech_train = caltech_train[mask_caltech]

# Same two-pass alignment for the JPL data
users_from_training = set(JPL_train[:, 3])
mask = np.isin(JPL_test[:, 3], list(users_from_training))
JPL_test = JPL_test[mask]
users_from_testing = set(JPL_test[:, 3])
mask = np.isin(JPL_train[:, 3], list(users_from_testing))
JPL_train = JPL_train[mask]

In [None]:
# Convert all four datasets to float arrays (np.array builds a new array each time)
caltech_train, caltech_test, JPL_train, JPL_test = (
    np.array(dataset, dtype='float')
    for dataset in (caltech_train, caltech_test, JPL_train, JPL_test)
)

#Data processing for correlations (discretization)

In [None]:
from collections import defaultdict

# Group the training arrival times (column 0) by user ID (column 3), one dict per site
user_times_cal = defaultdict(list)
user_times_JPL = defaultdict(list)

# Caltech training rows
for row in caltech_train:
    user_times_cal[row[3]].append(float(row[0]))

# JPL training rows
for row in JPL_train:
    user_times_JPL[row[3]].append(float(row[0]))

# Define the functions for hourly and half-hourly representations
def create_hourly_representation(times):
    """Count arrivals per hour of day; returns a list of 24 counts where index h covers [h, h+1)."""
    counts = [0 for _ in range(24)]
    for hour_bin in (int(float(value)) for value in times):
        counts[hour_bin] += 1
    return counts

def create_half_hourly_representation(times):
    """Count arrivals per 30-minute slot; returns 48 counts, slot k covering [k/2, (k+1)/2) hours."""
    slots = [0 for _ in range(48)]
    for value in times:
        # Doubling the hour value turns half-hour steps into integer slot numbers
        slot = int(float(value) * 2)
        slots[slot] += 1
    return slots

# Per-user arrival-time histograms (hourly and half-hourly) for each site
hourly_representations_cal = {
    uid: create_hourly_representation(arrivals)
    for uid, arrivals in user_times_cal.items()
}
half_hourly_representations_cal = {
    uid: create_half_hourly_representation(arrivals)
    for uid, arrivals in user_times_cal.items()
}
hourly_representations_JPL = {
    uid: create_hourly_representation(arrivals)
    for uid, arrivals in user_times_JPL.items()
}
half_hourly_representations_JPL = {
    uid: create_half_hourly_representation(arrivals)
    for uid, arrivals in user_times_JPL.items()
}



In [None]:
# Longest training stay duration (column 4) per site, rounded to a whole number.
# NOTE(review): these values are not referenced by the code that follows in this notebook.
max_duration_caltech = int(np.round(np.max(caltech_train[:, 4].astype(float))))
max_duration_JPL = int(np.round(np.max(JPL_train[:, 4].astype(float))))

In [None]:
import numpy as np
from collections import defaultdict

# Group the training stay durations (column 4) by user ID (column 3), one dict per site
user_durations_cal = defaultdict(list)
user_durations_JPL = defaultdict(list)

# Caltech training rows
for row in caltech_train:
    user_durations_cal[row[3]].append(float(row[4]))

# JPL training rows
for row in JPL_train:
    user_durations_JPL[row[3]].append(float(row[4]))

# Define functions for hourly and half-hourly representations
def create_hourly_representation_duration(durations, num_bins=24):
    """
    Histogram of stay durations using bins one unit wide.

    :param durations: Iterable of durations (numbers or numeric strings); presumably hours.
    :param num_bins: Length of the returned vector. Defaults to 24, the size that was
                     previously hard-coded.
    :return: List of ``num_bins`` counts. Bin ``i`` counts durations in [i, i+1); the last
             bin also collects every duration beyond it, so long stays are counted
             instead of raising IndexError.
    :raises ValueError: If ``num_bins`` is smaller than 1.

    Negative durations are not validated.
    """
    if num_bins < 1:
        raise ValueError("num_bins must be at least 1")

    hourly_vector = [0] * num_bins
    last_bin = num_bins - 1
    for duration in durations:
        # Clamp into the overflow bin rather than indexing past the end of the list
        index = min(int(float(duration)), last_bin)
        hourly_vector[index] += 1
    return hourly_vector

def create_half_hourly_representation_duration(durations, num_bins=48):
    """
    Histogram of stay durations using bins half a unit wide.

    :param durations: Iterable of durations (numbers or numeric strings); presumably hours.
    :param num_bins: Length of the returned vector. Defaults to 48, the size that was
                     previously hard-coded.
    :return: List of ``num_bins`` counts. Bin ``k`` counts durations in [k/2, (k+1)/2); the
             last bin also collects every duration beyond it, so long stays are counted
             instead of raising IndexError.
    :raises ValueError: If ``num_bins`` is smaller than 1.

    Negative durations are not validated.
    """
    if num_bins < 1:
        raise ValueError("num_bins must be at least 1")

    half_hourly_vector = [0] * num_bins
    last_bin = num_bins - 1
    for duration in durations:
        # Doubling converts half-unit steps to integer bins; clamp into the overflow bin
        interval = min(int(float(duration) * 2), last_bin)
        half_hourly_vector[interval] += 1
    return half_hourly_vector

# Per-user stay-duration histograms (hourly and half-hourly) for each site
hourly_representations_duration_cal = {
    uid: create_hourly_representation_duration(stays)
    for uid, stays in user_durations_cal.items()
}
half_hourly_representations_duration_cal = {
    uid: create_half_hourly_representation_duration(stays)
    for uid, stays in user_durations_cal.items()
}
hourly_representations_duration_JPL = {
    uid: create_hourly_representation_duration(stays)
    for uid, stays in user_durations_JPL.items()
}
half_hourly_representations_duration_JPL = {
    uid: create_half_hourly_representation_duration(stays)
    for uid, stays in user_durations_JPL.items()
}


In [None]:
def concatenate_representations(dict1, dict2):
    """
    Join two per-key lists end to end.

    For every key present in either dictionary the result holds dict1's list
    followed by dict2's list. A key missing from one side contributes an empty
    list there, so its combined list is shorter than the others.
    """
    all_keys = set(dict1.keys()) | set(dict2.keys())  # union of keys from both dicts
    return {key: dict1.get(key, []) + dict2.get(key, []) for key in all_keys}

# Per user: arrival-time histogram followed by the stay-duration histogram
concatenated_hourly_cal, concatenated_half_hourly_cal = (
    concatenate_representations(hourly_representations_cal, hourly_representations_duration_cal),
    concatenate_representations(half_hourly_representations_cal, half_hourly_representations_duration_cal),
)

concatenated_hourly_JPL, concatenated_half_hourly_JPL = (
    concatenate_representations(hourly_representations_JPL, hourly_representations_duration_JPL),
    concatenate_representations(half_hourly_representations_JPL, half_hourly_representations_duration_JPL),
)



#Correlation calculation

##Cosine similarity calculation

In [None]:
from sklearn.metrics import pairwise_distances
import pandas as pd

def data_to_dataframe(data):
    """Build a DataFrame with one row per dictionary key (the keys become the index)."""
    frame = pd.DataFrame.from_dict(data, orient='index')
    return frame

def compute_cosine_similarity(df):
    """
    Pairwise cosine similarity between the rows of ``df``.

    Similarity is 1 minus the cosine distance returned by ``pairwise_distances``.
    The result is a square DataFrame whose rows and columns both carry
    ``df.index``, so the original row labels are preserved.
    """
    similarity = 1 - pairwise_distances(df, metric='cosine')
    labels = df.index
    return pd.DataFrame(similarity, index=labels, columns=labels)

# One feature row per user, for each site and bin width
df_hourly_cal = data_to_dataframe(concatenated_hourly_cal)
df_half_hourly_cal = data_to_dataframe(concatenated_half_hourly_cal)
df_hourly_JPL = data_to_dataframe(concatenated_hourly_JPL)
df_half_hourly_JPL = data_to_dataframe(concatenated_half_hourly_JPL)

# User-by-user cosine similarity for each feature table
similarities_hourly_cal = compute_cosine_similarity(df_hourly_cal)
similarities_half_hourly_cal = compute_cosine_similarity(df_half_hourly_cal)
similarities_hourly_JPL = compute_cosine_similarity(df_hourly_JPL)
similarities_half_hourly_JPL = compute_cosine_similarity(df_half_hourly_JPL)



##Pearson correlation calculation

In [None]:
import numpy as np

def compute_pearson_correlation(v1, v2):
    """
    Pearson correlation coefficient of two vectors, or None when it is undefined.

    None is returned if either vector has fewer than two elements or if the
    product of the two standard deviations is zero (a constant vector).
    """
    # Too short to correlate
    if len(v1) <= 1 or len(v2) <= 1:
        return None
    # A constant vector has zero variance, which makes the coefficient undefined
    if np.std(v1) * np.std(v2) == 0:
        return None
    return np.corrcoef(v1, v2)[0, 1]

import pandas as pd

def compute_pearson_correlation_matrix_df(representations):
    """
    Build the symmetric user-by-user Pearson correlation matrix.

    :param representations: Dict mapping user ID to that user's feature vector.
    :return: DataFrame indexed by user ID on both axes. The diagonal is 1 and any
             pair whose correlation is undefined (None from
             compute_pearson_correlation) is stored as 0.
    """
    user_ids = list(representations.keys())
    n = len(user_ids)

    # Start from the identity: self-correlation is 1, everything else 0 until filled in
    correlation_matrix = np.eye(n)

    # Only the upper triangle is computed; each value is mirrored below the diagonal
    for i in range(n):
        vector_i = representations[user_ids[i]]
        for j in range(i + 1, n):
            corr = compute_pearson_correlation(vector_i, representations[user_ids[j]])
            value = 0 if corr is None else corr
            correlation_matrix[i, j] = value
            correlation_matrix[j, i] = value

    return pd.DataFrame(correlation_matrix, index=user_ids, columns=user_ids)

# NOTE: these assignments reuse the names of the cosine-similarity results computed in the
# previous cell, so whichever of the two cells ran last decides what later cells see.
similarities_hourly_cal, similarities_half_hourly_cal = map(
    compute_pearson_correlation_matrix_df,
    (concatenated_hourly_cal, concatenated_half_hourly_cal),
)

similarities_hourly_JPL, similarities_half_hourly_JPL = map(
    compute_pearson_correlation_matrix_df,
    (concatenated_hourly_JPL, concatenated_half_hourly_JPL),
)




#Find most correlated users (using energy and raw values)
Raw threshold performs better (choose a threshold of 0.5/0.75 depending on the data)

In [None]:
#Using energy
def most_correlated_users(matrix, threshold=0.5):
    """
    For each user, pick the strongest partners whose squared similarities ("energy")
    together reach a given share of that user's total energy.

    :param matrix: Square similarity DataFrame indexed by user ID on both axes.
    :param threshold: Share of the total energy (0-1) the selected users must reach.
    :return: Dictionary with keys being user IDs and values being lists of the selected
             user IDs, strongest first. A user whose similarities to everyone else are
             all zero gets an empty list.

    NOTE: a later cell of this notebook defines a threshold-based function under the same
    name; whichever definition was executed last is the one in effect.
    """
    result = {}

    for user_index in matrix.index:
        # Leave out the user's own self-similarity entry
        user_similarities = matrix.loc[user_index].drop(user_index)

        # Square values to get energy, then sort by descending energy
        sorted_users = user_similarities.map(np.square).sort_values(ascending=False)
        total_energy = sorted_users.sum()

        selected_users = []
        # With zero total energy the ratio below would be 0/0 = nan; the comparison
        # would never succeed and every user would be selected. Select nobody instead.
        if total_energy > 0:
            cumulative_energy = 0
            for other_user, user_energy in sorted_users.items():
                cumulative_energy += user_energy
                selected_users.append(other_user)
                if cumulative_energy / total_energy >= threshold:
                    break

        result[user_index] = selected_users

    return result

# Energy-based partner selection for every similarity matrix.
# These four names are assigned again by the threshold-based cell that follows.
most_corr_users_hourly_cal, most_corr_users_half_hourly_cal = map(
    most_correlated_users,
    (similarities_hourly_cal, similarities_half_hourly_cal),
)

# Same for the JPL data
most_corr_users_hourly_JPL, most_corr_users_half_hourly_JPL = map(
    most_correlated_users,
    (similarities_hourly_JPL, similarities_half_hourly_JPL),
)


In [None]:
#Using threshold
def most_correlated_users(matrix, threshold=0.75):
    """
    For each user, list the other users whose correlation magnitude exceeds ``threshold``.

    :param matrix: Square similarity/correlation DataFrame indexed by user ID on both axes.
    :param threshold: Cut-off applied to the absolute value (default 0.75), so strongly
                      negative correlations qualify as well.
    :return: Dictionary with keys being user IDs and values being lists of the qualifying
             user IDs, ordered by signed correlation from highest to lowest.
    """
    result = {}

    for user_index in matrix.index:
        # Everyone except the user itself
        others = matrix.loc[user_index].drop(user_index)

        # Keep entries whose magnitude is strictly above the cut-off
        strong = others[others.abs() > threshold]

        # Highest signed correlation first
        result[user_index] = strong.sort_values(ascending=False).index.tolist()

    return result

# Threshold-based partner selection for every similarity matrix
most_corr_users_hourly_cal, most_corr_users_half_hourly_cal = map(
    most_correlated_users,
    (similarities_hourly_cal, similarities_half_hourly_cal),
)

# Same for the JPL data
most_corr_users_hourly_JPL, most_corr_users_half_hourly_JPL = map(
    most_correlated_users,
    (similarities_hourly_JPL, similarities_half_hourly_JPL),
)

#Random Forest Stay Duration (no correlations - individual user's data **only**)

In [None]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split, GridSearchCV, LeaveOneOut
from sklearn.metrics import mean_squared_error
import numpy as np

def random_forest_regression(train_data,test_data,user_id, n_splits=5):
    """
    Tune and evaluate a per-user random forest that predicts stay duration from arrival time.

    :param train_data: 2-D array; column 0 is the arrival time, column 3 the user ID and
                       column 4 the stay duration.
    :param test_data: Array with the same column layout, used for evaluation.
    :param user_id: Only rows whose column 3 equals this value are used.
    :param n_splits: Requested number of cross-validation folds. If the user has fewer
                     training rows than this (but at least 2), the number of rows is used
                     instead so the grid search does not fail.
    :return: Tuple ``(best_params, smape)``: the hyper-parameters chosen by the grid search
             and the user's error in percent, computed as mean(|y - y_hat| / |y + y_hat|) * 100.
    """
    # Filter training and testing data for the specific user
    user_train_data = train_data[train_data[:, 3] == user_id]
    user_test_data = test_data[test_data[:, 3] == user_id]

    # Single feature (arrival time) and target (stay duration) for both splits
    X_train = user_train_data[:, 0].reshape(-1, 1)
    y_train = user_train_data[:, 4]

    X_test = user_test_data[:, 0].reshape(-1, 1)
    y_test = user_test_data[:, 4]

    # Hyper-parameter grid explored by the search
    param_grid = {
        'n_estimators': [10, 20,50],
        'max_depth': [2,5,7,10,12],
        'min_samples_split':[2,3,5,7,10] ,
        'max_features': [1.0, 'sqrt']
    }

    # K-fold cannot use more folds than samples; shrink only in that previously failing case
    n_folds = n_splits
    if 2 <= len(X_train) < n_splits:
        n_folds = len(X_train)

    rf = RandomForestRegressor(random_state=42)
    grid_search = GridSearchCV(rf, param_grid, cv=n_folds, scoring='neg_mean_squared_error')
    grid_search.fit(X_train, y_train)

    # Predict with the refitted best model
    y_pred = grid_search.best_estimator_.predict(X_test)

    # Per-user SMAPE in percent
    n = len(y_test)
    smape_val = (1/ n) * np.sum(np.abs(y_test - y_pred) / (np.abs(y_test+y_pred)))*100

    return grid_search.best_params_,smape_val



In [None]:
# Every user ID that appears in either split, per site
user_ids_caltech = np.unique(np.concatenate((caltech_train[:, 3], caltech_test[:, 3])))
user_ids_JPL = np.unique(np.concatenate((JPL_train[:, 3], JPL_test[:, 3])))

# Per-user baseline results: SMAPE and the tuned hyper-parameters
smape_list_caltech, best_params_caltech = [], []
smape_list_JPL, best_params_JPL = [], []

for user_id in user_ids_caltech:
    best_params, smape = random_forest_regression(caltech_train, caltech_test, user_id)
    smape_list_caltech.append(smape)
    best_params_caltech.append(best_params)

for user_id in user_ids_JPL:
    best_params, smape = random_forest_regression(JPL_train, JPL_test, user_id)
    smape_list_JPL.append(smape)
    best_params_JPL.append(best_params)

In [None]:
# Average the per-user SMAPE values over the number of users at each site
no_caltech_users = len(user_ids_caltech)
no_JPL_users = len(user_ids_JPL)

caltech_smape = sum(smape_list_caltech) / no_caltech_users
JPL_smape = sum(smape_list_JPL) / no_JPL_users

print(f"Caltech SMAPE: {caltech_smape}")
print(f"JPL SMAPE: {JPL_smape}")

#Calculate Random Forest models for all users

In [None]:
from sklearn.linear_model import LinearRegression

def train_user_model(train_data, user_id,n_splits=5):
    """
    Fit a grid-searched random forest for one user (arrival time -> stay duration).

    :param train_data: 2-D array; column 0 is the arrival time, column 3 the user ID and
                       column 4 the stay duration.
    :param user_id: Only rows whose column 3 equals this value are used.
    :param n_splits: Requested number of cross-validation folds. If the user has fewer
                     training rows than this (but at least 2), the number of rows is used
                     instead so the grid search does not fail.
    :return: The best estimator found by the grid search.
    """
    user_train_data = train_data[train_data[:, 3] == user_id]
    X_train = user_train_data[:, 0].reshape(-1, 1)  # Arrival time
    y_train = user_train_data[:, 4]  # Stay duration

    # Hyper-parameter grid explored by the search
    param_grid = {
        'n_estimators': [10, 20,50],
        'max_depth': [2,5,7,10,12],
        'min_samples_split':[2,3,5,7,10] ,
        'max_features': [1.0, 'sqrt']
    }

    # K-fold cannot use more folds than samples; shrink only in that previously failing case
    n_folds = n_splits
    if 2 <= len(X_train) < n_splits:
        n_folds = len(X_train)

    rf = RandomForestRegressor(random_state=42)
    grid_search = GridSearchCV(rf, param_grid, cv=n_folds, scoring='neg_mean_squared_error')
    grid_search.fit(X_train, y_train)

    # No prediction happens here; only the refitted best model is handed back
    model = grid_search.best_estimator_
    return model

# One tuned model per user ID, for each site
all_user_models_caltech = dict(
    (uid, train_user_model(caltech_train, uid)) for uid in user_ids_caltech
)
all_user_models_JPL = dict(
    (uid, train_user_model(JPL_train, uid)) for uid in user_ids_JPL
)

#Random Forest Stay Duration including most correlated users

In [None]:
def train_and_test_user_model_with_correlation(train_data, test_data, user_id, most_corr_users, all_user_models, similarity_matrix,n_splits=5):
    """
    Train a user's random forest and blend its test predictions with those of the
    user's most correlated peers.

    The final prediction is a weighted average: the user's own model has weight 1 and
    each peer model is weighted by its entry in ``similarity_matrix``.

    :param train_data: 2-D array; column 0 is the arrival time, column 3 the user ID and
                       column 4 the stay duration.
    :param test_data: Array with the same column layout, used for evaluation.
    :param user_id: Only rows whose column 3 equals this value are used.
    :param most_corr_users: Mapping user ID -> list of correlated user IDs. A user without
                            an entry is evaluated on its own prediction only.
    :param all_user_models: Mapping user ID -> fitted model; peers without a model are skipped.
    :param similarity_matrix: DataFrame indexed by user ID on both axes, read with ``.loc``.
    :param n_splits: Requested number of cross-validation folds. If the user has fewer
                     training rows than this (but at least 2), the number of rows is used.
    :return: Tuple ``(user_model, smape, total_correlation)``: the user's own best estimator,
             the error in percent computed as mean(|y - y_hat| / |y + y_hat|) * 100, and
             the sum of weights used in the blend.
    """
    # Filter training and testing data for the specific user
    user_train_data = train_data[train_data[:, 3] == user_id]
    user_test_data = test_data[test_data[:, 3] == user_id]

    # Single feature (arrival time) and target (stay duration) for both splits
    X_train = user_train_data[:, 0].reshape(-1, 1)
    y_train = user_train_data[:, 4]

    X_test = user_test_data[:, 0].reshape(-1, 1)
    y_test = user_test_data[:, 4]

    # Hyper-parameter grid explored by the search
    param_grid = {
        'n_estimators': [10, 20,50],
        'max_depth': [2,5,7,10,12],
        'min_samples_split':[2,3,5,7,10] ,
        'max_features': [1.0, 'sqrt']
    }

    # K-fold cannot use more folds than samples; shrink only in that previously failing case
    n_folds = n_splits
    if 2 <= len(X_train) < n_splits:
        n_folds = len(X_train)

    rf = RandomForestRegressor(random_state=42)
    grid_search = GridSearchCV(rf, param_grid, cv=n_folds, scoring='neg_mean_squared_error')
    grid_search.fit(X_train, y_train)

    user_model = grid_search.best_estimator_
    # Predict on the test set with the user's own model
    y_pred = user_model.predict(X_test)

    # The user's own prediction enters the blend with weight 1. Both names are bound
    # unconditionally so a user missing from most_corr_users no longer raises
    # UnboundLocalError in the loop below.
    total_correlation = 1
    correlated_users = most_corr_users[user_id] if user_id in most_corr_users else []

    for other_user_id in correlated_users:
        if other_user_id in all_user_models:
            # NOTE(review): weights are signed; a peer selected for a strong negative
            # correlation contributes with a negative weight. Confirm this is intended.
            correlation = similarity_matrix.loc[user_id, other_user_id]
            other_user_model = all_user_models[other_user_id]

            # Predict using the other user's model
            other_user_pred = other_user_model.predict(X_test)

            # Weight the prediction by the correlation and add to the base prediction
            y_pred += other_user_pred * correlation
            total_correlation += correlation

    # Normalise by the sum of weights (skipped when the weights cancel to exactly zero)
    if total_correlation != 0:
        y_pred /= total_correlation

    # Per-user SMAPE in percent
    n = len(y_test)
    smape_val = (1/ n) * np.sum(np.abs(y_test - y_pred) / (np.abs(y_test+y_pred)))*100

    return user_model, smape_val,total_correlation





In [None]:
# Every user ID that appears in either split, per site
user_ids_caltech = np.unique(np.concatenate((caltech_train[:, 3], caltech_test[:, 3])))
user_ids_JPL = np.unique(np.concatenate((JPL_train[:, 3], JPL_test[:, 3])))

# Result lists are reset here, replacing any values left by earlier cells
smape_list_caltech=[]
smape_list_JPL=[]
sum_of_correlations_list=[]
# The Caltech evaluation below is disabled, so smape_list_caltech stays empty after this cell.
# for user_id in user_ids_caltech:
#     model, smape,sum_of_correlations = train_and_test_user_model_with_correlation(caltech_train, caltech_test, user_id, most_corr_users_half_hourly_cal,all_user_models_caltech,similarities_half_hourly_cal)
#     smape_list_caltech.append(smape)
#     sum_of_correlations_list.append(sum_of_correlations)
# JPL evaluation with the hourly similarity data.
# NOTE: unlike the disabled Caltech loop, this loop does not append sum_of_correlations
# to sum_of_correlations_list, so that list remains empty.
for user_id in user_ids_JPL:
    model, smape,sum_of_correlations = train_and_test_user_model_with_correlation(JPL_train, JPL_test, user_id, most_corr_users_hourly_JPL,all_user_models_JPL,similarities_hourly_JPL)
    smape_list_JPL.append(smape)

In [None]:
#Calculate average SMAPE for each location
# A site whose evaluation loop did not run has no scores. Report nan for it instead of
# the misleading 0.0 that sum([]) / number_of_users would print.
no_caltech_users=len(user_ids_caltech)
caltech_smape=sum(smape_list_caltech)/len(smape_list_caltech) if smape_list_caltech else float('nan')

no_JPL_users=len(user_ids_JPL)
JPL_smape=sum(smape_list_JPL)/len(smape_list_JPL) if smape_list_JPL else float('nan')

print(f"Caltech SMAPE: {caltech_smape}")
print(f"JPL SMAPE: {JPL_smape}")