In [None]:
# Mount Google Drive at /content/drive (Colab-only helper); the CSV paths used
# in the data-loading cell all live under this mount point.
from google.colab import drive
drive.mount('/content/drive')

# Data Processing

In [None]:
import numpy as np
import pandas as pd

# Locations of the Caltech and JPL training/testing CSV files on the mounted Drive.
# The path strings are kept exactly as written (including the space after "SuperUROP").
file_path1 = '/content/drive/MyDrive/SuperUROP /Data Analysis/caltech_training_data.csv'
file_path2 = '/content/drive/MyDrive/SuperUROP /Data Analysis/caltech_testing_data.csv'
file_path3 = '/content/drive/MyDrive/SuperUROP /Data Analysis/JPL_training_data.csv'
file_path4 = '/content/drive/MyDrive/SuperUROP /Data Analysis/JPL_testing_data.csv'

# Read each CSV with pandas, in the order listed, and keep only its NumPy array
caltech_train, caltech_test, JPL_train, JPL_test = (
    pd.read_csv(csv_path).values
    for csv_path in (file_path1, file_path2, file_path3, file_path4)
)

In [None]:
# Drop the first column of every dataset (it holds the row number).
# Note: each name is rebound to its own slice, so re-running this cell
# removes one more column each time.
caltech_train, caltech_test, JPL_train, JPL_test = (
    dataset[:, 1:]
    for dataset in (caltech_train, caltech_test, JPL_train, JPL_test)
)

In [None]:
#Remove departure time (2nd column)
# Convert arrival date to hour and find day of the week
from datetime import datetime

def convert_time_and_day(data_array):
    """
    Rewrite the first column as "<fractional hour> <weekday name>" and drop the second column.

    The first entry of every row is parsed as a 'YYYY-MM-DD HH:MM:SS' timestamp.
    Its hour and minute become a decimal hour (seconds are ignored), and the
    weekday name is appended after a single space. Entries from index 2 onwards
    are carried over unchanged; index 1 is discarded.

    Returns a NumPy array built from the rewritten rows.
    """
    def _rewrite(record):
        stamp = datetime.strptime(record[0], '%Y-%m-%d %H:%M:%S')
        fractional_hour = stamp.hour + (stamp.minute / 60.0)
        # e.g. "8.5 Monday"; split apart again by a later processing step
        label = f"{fractional_hour} {stamp:%A}"
        return [label, *record[2:]]

    return np.array([_rewrite(record) for record in data_array])

# Run the conversion over all four datasets, rebinding each name to its converted array
caltech_train, caltech_test, JPL_train, JPL_test = map(
    convert_time_and_day,
    (caltech_train, caltech_test, JPL_train, JPL_test),
)

In [None]:
def day_to_number(day):
    """Return the 1-based index of an English weekday name (Monday=1 ... Sunday=7).

    Raises KeyError when `day` is not one of the seven capitalised names.
    """
    ordered_days = ('Monday', 'Tuesday', 'Wednesday', 'Thursday',
                    'Friday', 'Saturday', 'Sunday')
    lookup = {name: position for position, name in enumerate(ordered_days, start=1)}
    return lookup[day]

def separate_time_and_day(data_array):
    """
    Split the combined "<time> <day>" first column into two leading columns.

    For every row the first entry is split on whitespace into a time and a
    weekday name. The time is converted with float() and the weekday is mapped
    to 1..7 through day_to_number(). The remaining entries follow unchanged.

    Returns a NumPy array built from the widened rows.
    """
    def _split_row(record):
        hour_text, weekday = record[0].split()
        return [float(hour_text), day_to_number(weekday), *record[1:]]

    return np.array([_split_row(record) for record in data_array])

# Split the time/day column for all four datasets, rebinding each name to the new array
caltech_train, caltech_test, JPL_train, JPL_test = (
    separate_time_and_day(dataset)
    for dataset in (caltech_train, caltech_test, JPL_train, JPL_test)
)

# Discretize the arrival time and stay duration into 48 thirty-minute intervals

In [None]:
# Keep only columns 0, 3 and 4 of the two training arrays.
# Note: re-running this cell on the already-reduced arrays fails, because
# only three columns remain afterwards.
caltech_train, JPL_train = (
    training_set[:, [0, 3, 4]] for training_set in (caltech_train, JPL_train)
)

In [None]:
# Calculate the maximum stay duration per site (used for the sparsity matrix dimension).
# NOTE(review): the maximum is taken on the column as it is at this point, i.e. before
# the half-hour binning applied in a later cell - confirm that is the intended dimension.
df_caltech = pd.DataFrame(caltech_train, columns=['Arrival Time', 'UserID', 'Stay Duration'])
df_JPL = pd.DataFrame(JPL_train, columns=['Arrival Time', 'UserID', 'Stay Duration'])

# Largest stay duration in each frame, rounded with np.round and stored as a plain int
max_duration_caltech, max_duration_JPL = (
    int(np.round(frame['Stay Duration'].astype(float).max()))
    for frame in (df_caltech, df_JPL)
)

In [None]:
import numpy as np

def _to_half_hour_bins(hours):
    """Return the half-hour bin index, floor(hours * 2), of every value as a string."""
    return np.floor(hours.astype(float) * 2).astype(int).astype(str)


# Discretize columns 0 and 2 of both training arrays into half-hour bins, in place.
# The former separate "convert to float" pass was removed: its result was re-parsed
# with astype(float) by the binning step anyway, so it had no effect on the bins.
# Note: re-running this cell bins the already-binned values again (doubling them).
for _training_set in (caltech_train, JPL_train):
    for _column in (0, 2):
        _training_set[:, _column] = _to_half_hour_bins(_training_set[:, _column])

# Calculate the joint probability distribution of arrival time and stay duration for each user

In [None]:
import numpy as np
import pandas as pd

def compute_user_joint_distributions(data_array):
    """
    Build one normalised 48 x 48 arrival-time / stay-duration table per user.

    `data_array` is read as three columns: 'Arrival Time', 'UserID' and
    'Stay Duration'. For every distinct UserID the (stay duration, arrival time)
    pairs are counted, laid out on a grid whose rows and columns are the string
    labels '0' ... '47', and divided by the grid total.

    Pairs whose labels are not among '0' ... '47' are dropped by the reindexing step.

    Returns a dict mapping each user id to its DataFrame
    (rows: stay duration, columns: arrival time).
    """
    sessions = pd.DataFrame(data_array, columns=['Arrival Time', 'UserID', 'Stay Duration'])

    # Labels of the full grid, as strings
    bin_labels = list(map(str, range(48)))

    def _normalised_counts(user_sessions):
        # Count occurrences of each (stay duration, arrival time) pair
        counts = user_sessions.pivot_table(
            index='Stay Duration',
            columns='Arrival Time',
            values='UserID',
            aggfunc='size',
        ).fillna(0)

        # Expand to the full grid so every user's table has the same shape
        counts = counts.reindex(index=bin_labels, columns=bin_labels, fill_value=0)

        return counts / counts.sum().sum()

    return {
        user_id: _normalised_counts(user_sessions)
        for user_id, user_sessions in sessions.groupby('UserID')
    }


# Per-user joint distributions for the Caltech and JPL training data
joint_dists_caltech, joint_dists_JPL = map(
    compute_user_joint_distributions,
    (caltech_train, JPL_train),
)

# Calculate the joint entropy of arrival time and stay duration for each user

In [None]:
import pandas as pd
import numpy as np

def joint_entropy(matrix):
    """Compute the joint entropy, in bits, of a 2-D table of weights.

    Parameters
    ----------
    matrix : pandas.DataFrame
        Table of counts or probabilities; it is renormalised to sum to 1.

    Returns
    -------
    float
        -sum(p * log2(p)) over the cells with p > 0; cells with p <= 0
        contribute 0. The result is NaN when the table sums to 0.
    """
    # Normalise, then switch to a NumPy array for the element-wise maths
    prob_matrix = (matrix / matrix.sum().sum()).to_numpy()

    # Pass an explicit zero-filled `out`: with `where=` alone, the cells that are
    # skipped would be left uninitialised. Here they are 0, i.e. 0 * log(0) := 0.
    log_prob_matrix = np.log2(
        prob_matrix,
        out=np.zeros_like(prob_matrix),
        where=(prob_matrix > 0),
    )

    return -np.sum(prob_matrix * log_prob_matrix)

# Joint entropy of every user's distribution, keyed by user id
entropies_caltech = {
    user: joint_entropy(joint_dists_caltech[user]) for user in joint_dists_caltech
}
entropies_JPL = {
    user: joint_entropy(joint_dists_JPL[user]) for user in joint_dists_JPL
}

In [None]:
# Display the per-user joint entropies computed for the JPL data
entropies_JPL

# Calculate the sparsity of the arrival time and stay duration dataset for each user
For sparsity, use a (max_duration + 1) x 48 matrix, to match the paper, instead of the 48 x 48 matrix used in the calculation of joint entropy. Both follow exactly the same implementation as the one used in the paper.

- 48 is for the 48 30-minute intervals in arrival time

In [None]:
import pandas as pd

def sparsity(matrix, max_value):
    """Return the share of empty cells, measured against a 48 x (max_value + 1) grid.

    `matrix` is used only to count its non-zero entries. The grid size in the
    denominator comes from `max_value`, not from the shape of `matrix`.
    """
    grid_cells = (max_value + 1) * 48  # +1 so that index 0 is counted as well
    occupied_cells = (matrix != 0).sum().sum()
    return (grid_cells - occupied_cells) / grid_cells

# Sparsity of every user's distribution, using the per-site maximum stay duration
sparsities_caltech = {
    user: sparsity(joint_dists_caltech[user], max_duration_caltech)
    for user in joint_dists_caltech
}
sparsities_JPL = {
    user: sparsity(joint_dists_JPL[user], max_duration_JPL)
    for user in joint_dists_JPL
}


In [None]:
# Display the per-user sparsity values computed for the JPL data
sparsities_JPL

# Calculate the ratio of entropy to sparsity for each user



In [None]:
# Entropy divided by sparsity for each user, keyed by user id
entropy_sparsity_caltech = {
    user: entropies_caltech[user] / sparsities_caltech[user]
    for user in joint_dists_caltech
}
entropy_sparsity_JPL = {
    user: entropies_JPL[user] / sparsities_JPL[user]
    for user in joint_dists_JPL
}

In [None]:
# Display the per-user entropy-to-sparsity ratios for the Caltech data
entropy_sparsity_caltech

In [None]:
# Display the per-user entropy-to-sparsity ratios for the JPL data
entropy_sparsity_JPL