In [None]:
# Mount Google Drive (Colab-only API) so the CSV files referenced under
# /content/drive in the next cell are reachable.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
import numpy as np
import pandas as pd

# Locations of the four pre-split CSV files on the mounted Drive
file_path1 = '/content/drive/MyDrive/SuperUROP /Data Analysis/caltech_training_data.csv'
file_path2 = '/content/drive/MyDrive/SuperUROP /Data Analysis/caltech_testing_data.csv'
file_path3 = '/content/drive/MyDrive/SuperUROP /Data Analysis/JPL_training_data.csv'
file_path4 = '/content/drive/MyDrive/SuperUROP /Data Analysis/JPL_testing_data.csv'

# Read every CSV with pandas and keep only the underlying NumPy value matrix
caltech_train, caltech_test, JPL_train, JPL_test = (
    pd.read_csv(csv_path).values
    for csv_path in (file_path1, file_path2, file_path3, file_path4)
)

#Data Processing

In [None]:
# Column 0 only holds the row number; slice it off every split
caltech_train, caltech_test, JPL_train, JPL_test = (
    table[:, 1:]
    for table in (caltech_train, caltech_test, JPL_train, JPL_test)
)

In [None]:
#Remove departure time (2nd column)
# Convert arrival date to hour and find day of the week
from datetime import datetime

def convert_time_and_day(data_array):
    """
    Collapse each row's arrival timestamp into one "<hour> <weekday>" label.

    Column 0 of every row must be a string in '%Y-%m-%d %H:%M:%S' format. It is
    replaced by the fractional hour of day (hour + minute / 60; seconds are
    ignored), a space, and the full weekday name. Column 1 is dropped and the
    remaining columns follow in their original order.

    Returns a new NumPy array built from the rewritten rows.
    """
    def _rewrite(record):
        stamp = datetime.strptime(record[0], '%Y-%m-%d %H:%M:%S')
        fractional_hour = stamp.hour + (stamp.minute / 60.0)
        label = f"{fractional_hour} {stamp.strftime('%A')}"
        # Skip index 1 while keeping everything from index 2 onwards
        return [label, *record[2:]]

    return np.array([_rewrite(record) for record in data_array])

# Run the timestamp conversion over all four splits
caltech_train, caltech_test, JPL_train, JPL_test = (
    convert_time_and_day(split)
    for split in (caltech_train, caltech_test, JPL_train, JPL_test)
)

In [None]:
def day_to_number(day):
    """Map a full English weekday name to 1 (Monday) through 7 (Sunday).

    Raises KeyError for any other value.
    """
    weekday_names = ('Monday', 'Tuesday', 'Wednesday', 'Thursday',
                     'Friday', 'Saturday', 'Sunday')
    lookup = {name: position for position, name in enumerate(weekday_names, start=1)}
    return lookup[day]

def separate_time_and_day(data_array):
    """
    Split the leading "<hour> <weekday>" label of every row into two columns.

    The first column of each row is split on whitespace into exactly two
    tokens: the hour (converted to float) and the weekday name (converted to
    1-7 via day_to_number). Those two values replace the original first column;
    the other columns follow in their original order.

    Returns a new NumPy array built from the expanded rows.
    """
    def _expand(record):
        hour_text, weekday = record[0].split()
        return [float(hour_text), day_to_number(weekday), *record[1:]]

    return np.array([_expand(record) for record in data_array])

# Expand the combined "<hour> <weekday>" column in all four splits
caltech_train, caltech_test, JPL_train, JPL_test = (
    separate_time_and_day(split)
    for split in (caltech_train, caltech_test, JPL_train, JPL_test)
)

In [None]:
# Make the training and testing sets cover the same user IDs (column 3).
# Step 1 drops test rows whose user never appears in training; step 2 then
# drops training rows whose user is absent from the already-filtered test set.
users_from_training_caltech = set(caltech_train[:, 3])
mask_caltech = np.isin(caltech_test[:, 3], list(users_from_training_caltech))
caltech_test = caltech_test[mask_caltech]
users_from_testing_caltech = set(caltech_test[:, 3])
mask_caltech = np.isin(caltech_train[:, 3], list(users_from_testing_caltech))
caltech_train = caltech_train[mask_caltech]

# Same two-step filter for the JPL data
users_from_training = set(JPL_train[:, 3])
mask = np.isin(JPL_test[:, 3], list(users_from_training))
JPL_test = JPL_test[mask]
users_from_testing = set(JPL_test[:, 3])
mask = np.isin(JPL_train[:, 3], list(users_from_testing))
JPL_train = JPL_train[mask]

In [None]:
# Cast all four splits to float arrays (every remaining column must parse as a number)
caltech_train, caltech_test, JPL_train, JPL_test = (
    np.array(split, dtype=float)
    for split in (caltech_train, caltech_test, JPL_train, JPL_test)
)

#MLR stay duration prediction

In [None]:
import numpy as np
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error

# Per-user multiple linear regression (MLR) model

def mlr_model(train_data, test_data, user_id):
    """
    Fit a per-user multiple linear regression and score it with SMAPE.

    Rows of `train_data` / `test_data` whose column 3 equals `user_id` are
    selected. Columns 0-1 (arrival time and day of the week) are the
    regressors and column 4 (the duration) is the target.

    Returns:
        (model, smape_val): the fitted LinearRegression and the user's SMAPE in
        percent, computed as mean(|y - y_hat| / (|y| + |y_hat|)) * 100.

    Raises:
        ValueError: if the user has no rows in the training or testing data.
    """
    # Filter training and testing data for the specific user
    user_train_data = train_data[train_data[:, 3] == user_id]
    user_test_data = test_data[test_data[:, 3] == user_id]
    if len(user_train_data) == 0 or len(user_test_data) == 0:
        raise ValueError(
            f"User {user_id} has no rows in the training and/or testing data"
        )

    # Independent variables are arrival time and day of the week
    X_train = user_train_data[:, :2]
    X_test = user_test_data[:, :2]

    # Dependent variable is the duration
    y_train = user_train_data[:, 4]
    y_test = user_test_data[:, 4]

    model = LinearRegression().fit(X_train, y_train)
    y_pred = model.predict(X_test)

    # SMAPE with |y| + |y_hat| in the denominator. This equals |y + y_hat| when
    # both share a sign, but stays bounded by 100% when the regression predicts
    # a value of opposite sign. Terms where both values are 0 contribute 0
    # instead of NaN.
    n = len(y_test)
    abs_error = np.abs(y_test - y_pred)
    scale = np.abs(y_test) + np.abs(y_pred)
    ratios = np.divide(
        abs_error, scale,
        out=np.zeros_like(abs_error, dtype=float),
        where=scale != 0,
    )
    smape_val = (1 / n) * np.sum(ratios) * 100

    return model, smape_val


In [None]:
# Score the per-user MLR on every JPL user that appears in either split
user_ids_JPL = np.unique(np.concatenate((JPL_train[:, 3], JPL_test[:, 3])))
smape_list_JPL = [
    mlr_model(JPL_train, JPL_test, user_id)[1] for user_id in user_ids_JPL
]

# Unweighted mean of the per-user SMAPE values
no_JPL_users = len(user_ids_JPL)
JPL_smape = sum(smape_list_JPL) / no_JPL_users
print(f"Average SMAPE for JPL dataset using MLR: {JPL_smape}")

#Mode stay duration prediction

In [None]:
import numpy as np
import pandas as pd

# Locations of the four pre-split CSV files on the mounted Drive
file_path1 = '/content/drive/MyDrive/SuperUROP /Data Analysis/caltech_training_data.csv'
file_path2 = '/content/drive/MyDrive/SuperUROP /Data Analysis/caltech_testing_data.csv'
file_path3 = '/content/drive/MyDrive/SuperUROP /Data Analysis/JPL_training_data.csv'
file_path4 = '/content/drive/MyDrive/SuperUROP /Data Analysis/JPL_testing_data.csv'

# Fresh, unprocessed copies of the data for the mode-based predictor
Caltech_data_training, Caltech_data_testing, JPL_training_data, JPL_testing_data = (
    pd.read_csv(csv_path).values
    for csv_path in (file_path1, file_path2, file_path3, file_path4)
)

In [None]:
# Discard the leading column (column 0) of every raw split
Caltech_data_training, Caltech_data_testing, JPL_training_data, JPL_testing_data = (
    table[:, 1:]
    for table in (Caltech_data_training, Caltech_data_testing,
                  JPL_training_data, JPL_testing_data)
)

In [None]:
def _by_user(record):
    """Sort key: the value in column 3 of a raw record (the user ID)."""
    return record[3]


# Order each split by user ID, then remove column 1 from the sorted rows
sorted_caltech_training = np.delete(sorted(Caltech_data_training, key=_by_user), [1], axis=1)
sorted_caltech_testing = np.delete(sorted(Caltech_data_testing, key=_by_user), [1], axis=1)

sorted_JPL_training = np.delete(sorted(JPL_training_data, key=_by_user), [1], axis=1)
sorted_JPL_testing = np.delete(sorted(JPL_testing_data, key=_by_user), [1], axis=1)


In [None]:
def convert_to_interval(datetime_str):
    """Return the "HH:MM" label of the 15-minute slot containing a timestamp.

    `datetime_str` must look like "<date> HH:MM:SS": its second
    whitespace-separated token is parsed as three ':'-separated integers.
    Minutes are floored to a multiple of 15 and the seconds are discarded.
    """
    clock_part = datetime_str.split()[1]
    hh, mm, _ss = (int(field) for field in clock_part.split(':'))

    # Floor minutes-since-midnight to the start of the enclosing 15-minute slot
    slot_start = ((hh * 60 + mm) // 15) * 15

    return "{:02d}:{:02d}".format(*divmod(slot_start, 60))

# Overwrite column 0 of every split, in place, with its 15-minute slot label
to_slot = np.vectorize(convert_to_interval)
for split in (sorted_caltech_training, sorted_caltech_testing,
              sorted_JPL_training, sorted_JPL_testing):
    split[:, 0] = to_slot(split[:, 0])

In [None]:
# Scale column 3 by 60 and round it to the nearest multiple of 10, in place.
# (Presumably hours -> minutes; confirm the unit of the source column.)
# Re-running this cell scales the column again, so run it exactly once.
for split in (sorted_caltech_training, sorted_caltech_testing,
              sorted_JPL_training, sorted_JPL_testing):
    scaled = split[:, 3].astype(float) * 60
    split[:, 3] = scaled
    split[:, 3] = np.round(split[:, 3].astype(float) / 10) * 10

In [None]:
# Make the training and testing sets cover the same user IDs (column 2 here).
# Step 1 drops test rows whose user never appears in training; step 2 then
# drops training rows whose user is absent from the already-filtered test set.
users_from_training_caltech = set(sorted_caltech_training[:, 2])
mask_caltech = np.isin(sorted_caltech_testing[:, 2], list(users_from_training_caltech))
sorted_caltech_testing = sorted_caltech_testing[mask_caltech]
users_from_testing_caltech = set(sorted_caltech_testing[:, 2])
mask_caltech = np.isin(sorted_caltech_training[:, 2], list(users_from_testing_caltech))
sorted_caltech_training = sorted_caltech_training[mask_caltech]

# Same two-step filter for the JPL data
users_from_training = set(sorted_JPL_training[:, 2])
mask = np.isin(sorted_JPL_testing[:, 2], list(users_from_training))
sorted_JPL_testing = sorted_JPL_testing[mask]

users_from_testing = set(sorted_JPL_testing[:, 2])
mask = np.isin(sorted_JPL_training[:, 2], list(users_from_testing))
sorted_JPL_training = sorted_JPL_training[mask]

In [None]:
def _slot_minutes(slot_labels):
    """Turn "HH:MM" labels into an array of minutes since midnight."""
    minutes = []
    for label in slot_labels:
        parts = label.split(':')
        minutes.append(int(parts[0]) * 60 + int(parts[1]))
    return np.array(minutes)


def _user_then_time_order(split, slot_minutes):
    """Row order sorted by integer user ID (column 2), ties broken by slot minutes."""
    return np.lexsort((slot_minutes, split[:, 2].astype(int)))


# Numeric version of the slot label, used only as a sort key
total_minutes_caltech_training = _slot_minutes(sorted_caltech_training[:, 0])
total_minutes_caltech_testing = _slot_minutes(sorted_caltech_testing[:, 0])

total_minutes_JPL_training = _slot_minutes(sorted_JPL_training[:, 0])
total_minutes_JPL_testing = _slot_minutes(sorted_JPL_testing[:, 0])

# Primary key: user ID; secondary key: time of day
indices_caltech_training = _user_then_time_order(sorted_caltech_training, total_minutes_caltech_training)
indices_caltech_testing = _user_then_time_order(sorted_caltech_testing, total_minutes_caltech_testing)

indices_JPL_training = _user_then_time_order(sorted_JPL_training, total_minutes_JPL_training)
indices_JPL_testing = _user_then_time_order(sorted_JPL_testing, total_minutes_JPL_testing)

# Reorder the rows with the computed permutations
sorted_data_caltech_training = sorted_caltech_training[indices_caltech_training]
sorted_data_caltech_testing = sorted_caltech_testing[indices_caltech_testing]

sorted_data_JPL_training = sorted_JPL_training[indices_JPL_training]
sorted_data_JPL_testing = sorted_JPL_testing[indices_JPL_testing]

In [None]:
#Mode calculation with linear interpolation
from collections import Counter
def _fill_missing_modes(modes, all_intervals):
    """Fill `modes` in place for every interval of `all_intervals` it lacks.

    A gap between two known values is filled by linear interpolation over the
    interval index. A gap with a known value on only one side copies that value.
    """
    # next_known[i] = index of the closest LATER interval that has an observed
    # mode. It is built from the observed modes only, before any gap is filled.
    next_known = [None] * len(all_intervals)
    upcoming = None
    for idx in range(len(all_intervals) - 1, -1, -1):
        next_known[idx] = upcoming
        if all_intervals[idx] in modes:
            upcoming = idx

    # prev_mode is the value of the closest EARLIER interval, observed or
    # already filled, so it is always exactly one step behind the current gap.
    prev_mode = None
    for idx, interval in enumerate(all_intervals):
        if interval in modes:
            prev_mode = modes[interval]
            continue

        next_idx = next_known[idx]
        if prev_mode is not None and next_idx is not None:
            next_mode = modes[all_intervals[next_idx]]
            # One step from prev, (next_idx - idx) steps from next
            modes[interval] = prev_mode + (next_mode - prev_mode) / ((next_idx - idx) + 1)
        elif prev_mode is not None:
            # Nothing known later: carry the previous value forward
            modes[interval] = prev_mode
        elif next_idx is not None:
            # Nothing known earlier: carry the next value backward
            modes[interval] = modes[all_intervals[next_idx]]
        else:
            continue
        prev_mode = modes[interval]


def compute_mode_per_user_per_interval(data):
    """Compute the mode per user per time interval with linear interpolation.

    Args:
        data: 2-D array whose rows hold the interval label in column 0, the
            user ID in column 2 and the value to summarise in column 3.

    Returns:
        dict mapping user ID -> {interval label -> mode}. For each interval a
        user has data for, the mode is the most frequent value (ties are
        averaged). Every other interval present anywhere in `data` is filled
        from the user's neighbouring intervals (see _fill_missing_modes).
    """
    # One pass over the rows: user -> interval -> values, in row order
    values_by_user = {}
    for row in data:
        values_by_user.setdefault(row[2], {}).setdefault(row[0], []).append(row[3])

    if not values_by_user:
        return {}

    # The set of intervals is the same for every user, so build it once
    all_intervals = sorted(set(data[:, 0]))

    mode_per_user_per_interval = {}
    for user, values_by_interval in values_by_user.items():
        modes = {}
        for interval in sorted(values_by_interval):
            counts = Counter(values_by_interval[interval])
            highest_freq = max(counts.values())
            common_vals = [val for val, freq in counts.items() if freq == highest_freq]

            # If there's a single mode, use it; else, average them
            modes[interval] = sum(common_vals) / len(common_vals)

        _fill_missing_modes(modes, all_intervals)
        mode_per_user_per_interval[user] = modes

    return mode_per_user_per_interval

# Build the per-user, per-interval mode tables from the TRAINING splits only;
# the testing splits are scored against these tables in the cells below.
mode_per_user_per_interval_caltech_interpolate = compute_mode_per_user_per_interval(sorted_data_caltech_training)
mode_per_user_per_interval_JPL_interpolate = compute_mode_per_user_per_interval(sorted_data_JPL_training)

In [None]:
# Display the full JPL mode table (user ID -> interval -> mode) for manual inspection
mode_per_user_per_interval_JPL_interpolate

In [None]:
import numpy as np

def calculate_smape_user_mode(y_true, y_pred):
    """Compute the SMAPE, in percent, between two equally long sequences.

    SMAPE = mean(|y_true - y_pred| / (|y_true| + |y_pred|)) * 100

    Pairs where both values are 0 are a perfect prediction and contribute 0;
    the previous form evaluated 0/0 for them and turned the result into NaN.
    For values of equal sign the denominator equals the former |y_true + y_pred|.

    Returns 0 for empty input.
    """
    y_true = np.asarray(y_true, dtype=float)
    y_pred = np.asarray(y_pred, dtype=float)

    n = len(y_true)
    if n == 0:
        return 0

    abs_error = np.abs(y_true - y_pred)
    scale = np.abs(y_true) + np.abs(y_pred)
    # scale == 0 implies abs_error == 0, so those terms stay at 0
    ratios = np.divide(abs_error, scale, out=np.zeros_like(abs_error), where=scale != 0)

    smape_val = (1 / n) * np.sum(ratios) * 100
    return smape_val

user_data = {}  # user ID -> {'y_true': [...], 'y_pred': [...]}
user_smape_mode = {}  # user ID -> SMAPE, filled in by the next cell

# Pair every JPL test session with the mode predicted for its user and slot.
# Sessions whose (user, slot) has no entry in the mode table are skipped.
for row in sorted_data_JPL_testing:
    user_id, time_slot = row[2], row[0]

    predictions = mode_per_user_per_interval_JPL_interpolate.get(user_id, {})
    if time_slot not in predictions:
        continue

    series = user_data.setdefault(user_id, {'y_true': [], 'y_pred': []})
    series['y_true'].append(row[3])
    series['y_pred'].append(predictions[time_slot])

In [None]:
# Per-user SMAPE of the mode predictor
user_smape_mode.update(
    (
        user_id,
        calculate_smape_user_mode(
            np.array(series['y_true'], dtype=float),
            np.array(series['y_pred'], dtype=float),
        ),
    )
    for user_id, series in user_data.items()
)

# Unweighted mean over users (0 when no user could be scored)
if user_smape_mode:
    average_smape = np.mean(list(user_smape_mode.values()))
else:
    average_smape = 0
print(f'Average SMAPE for JPL dataset using mode: {average_smape:.2f}%')

#Ensemble ML stay duration prediction (threshold of 4)

In [None]:
# Per-user R_SD value, keyed by user ID (written as strings here, re-keyed to
# int at the bottom of this cell). The ensemble cells below compare R_SD with a
# threshold to choose which model scores each user.
# NOTE(review): how R_SD is computed is not shown in this notebook; document
# its definition and source here.
R_SD={'169': 2.2551554266668488,
 '171': 3.7531759242694362,
 '176': 3.033506931058353,
 '220': 4.157056240805807,
 '322': 2.929926450352931,
 '334': 1.5403402896510696,
 '335': 4.182440356147159,
 '346': 4.63088280381524,
 '365': 4.612058546331858,
 '368': 3.8637314436567745,
 '372': 3.4779871035895753,
 '374': 3.7627475450103334,
 '378': 3.61759981491824,
 '382': 4.358454020839778,
 '404': 4.285879042160351,
 '405': 4.760574930778141,
 '406': 1.1510712960433436,
 '409': 3.3539069505133905,
 '410': 4.301005502712871,
 '416': 4.199233283013298,
 '436': 4.091891479434165,
 '444': 3.529125348909349,
 '458': 4.060001379997106,
 '467': 4.019928954004856,
 '474': 3.2450469879409285,
 '476': 4.393288833959378,
 '481': 4.429827240449028,
 '483': 3.079068807362642,
 '507': 4.336149365824373,
 '526': 2.4205249888839466,
 '531': 3.9517493146670986,
 '537': 3.1479021761196098,
 '551': 4.881572582872202,
 '553': 3.0305502893037675,
 '576': 3.628599820757222,
 '577': 4.319531561288392,
 '581': 3.4796093044547454,
 '592': 3.1009000797824084,
 '607': 4.831651402991986,
 '651': 3.6013651820057837,
 '726': 3.7379880870161575,
 '742': 4.150297519178871,
 '826': 3.710092899316145,
 '933': 4.41718644718856}

# Re-key with integer user IDs so lookups by numeric user ID succeed
R_SD = {int(key): value for key, value in R_SD.items()}

In [None]:
import numpy as np

# Requires R_SD, user_ids_JPL, JPL_train, JPL_test, mlr_model and
# user_smape_mode from the cells above.

# Users whose R_SD is ABOVE this value are scored with the MLR model; all other
# users fall back to the mode predictor.
# NOTE(review): this cell used to print "threshold of 4" while comparing
# against 3.7 (the old comment read "put 4 in the writing - with 4 get
# 10.71%"). The printed label is now derived from the constant so the number
# reported always matches the number used. Confirm which value is intended.
MLR_ABOVE_R_SD = 3.7

user_smape_combined = {}  # user ID -> SMAPE of whichever model was selected
users_without_mode_score = []  # users sent to the mode predictor that has no score for them

for user_id in user_ids_JPL:
    if R_SD.get(user_id, 0) > MLR_ABOVE_R_SD:
        model, smape = mlr_model(JPL_train, JPL_test, user_id)
    elif user_id in user_smape_mode:
        smape = user_smape_mode[user_id]
    else:
        # A missing mode score used to default to 0, i.e. a perfect SMAPE, which
        # lowers the average. Leave the user out of the average instead.
        users_without_mode_score.append(user_id)
        continue

    user_smape_combined[user_id] = smape

if users_without_mode_score:
    print(f"Excluded {len(users_without_mode_score)} user(s) with no mode-based SMAPE: {users_without_mode_score}")

# Unweighted mean over the users that could be scored
average_smape_combined = np.mean(list(user_smape_combined.values()))

print(f"Average SMAPE for JPL dataset using ensemble approach with threshold of {MLR_ABOVE_R_SD}: {average_smape_combined:.2f}%")


#Ensemble ML stay duration prediction (threshold of 4.5)

In [None]:
import numpy as np

# Requires R_SD, user_ids_JPL, JPL_train, JPL_test, mlr_model and
# user_smape_mode from the cells above.

# Users whose R_SD is BELOW this value are scored with the MLR model; all other
# users fall back to the mode predictor.
# NOTE(review): the preceding ensemble cell sends users ABOVE its threshold to
# MLR, the opposite direction of this one. Confirm which is intended.
MLR_BELOW_R_SD = 4.5

user_smape_combined = {}  # user ID -> SMAPE of whichever model was selected
users_without_mode_score = []  # users sent to the mode predictor that has no score for them

for user_id in user_ids_JPL:
    if R_SD.get(user_id, 0) < MLR_BELOW_R_SD:
        model, smape = mlr_model(JPL_train, JPL_test, user_id)
    elif user_id in user_smape_mode:
        smape = user_smape_mode[user_id]
    else:
        # A missing mode score used to default to 0, i.e. a perfect SMAPE, which
        # lowers the average. Leave the user out of the average instead.
        users_without_mode_score.append(user_id)
        continue

    user_smape_combined[user_id] = smape

if users_without_mode_score:
    print(f"Excluded {len(users_without_mode_score)} user(s) with no mode-based SMAPE: {users_without_mode_score}")

# Unweighted mean over the users that could be scored
average_smape_combined = np.mean(list(user_smape_combined.values()))

print(f"Average SMAPE for JPL dataset using ensemble approach with threshold of {MLR_BELOW_R_SD}: {average_smape_combined:.2f}%")


#Ensemble ML stay duration prediction (threshold of 3)

In [None]:
! pip install KDE-diffusion

In [None]:
from sklearn.preprocessing import StandardScaler
from kde_diffusion import kde2d

def dkde_model(train_data,test_data,user_id):
    """
    Predict one user's stay durations from a 2-D diffusion KDE and score them.

    Rows of `train_data` / `test_data` whose column 3 equals `user_id` are
    selected. A joint density over (column 0 = arrival time, column 4 = stay
    duration) is estimated from the training rows with kde2d on a 128 x 128
    grid. For each test arrival time, the prediction is the grid duration with
    the highest density at the nearest grid arrival time.

    Returns:
        (smape_val, density, grid): the user's SMAPE in percent, computed as
        mean(|y - y_hat| / (|y| + |y_hat|)) * 100, plus the density matrix and
        grid returned by kde2d.

    Raises:
        ValueError: if the user has no rows in the training or testing data.
    """
    # Filter training and testing data for the specific user
    user_train_data = train_data[train_data[:, 3] == user_id]
    user_test_data = test_data[test_data[:, 3] == user_id]
    if len(user_train_data) == 0 or len(user_test_data) == 0:
        raise ValueError(
            f"User {user_id} has no rows in the training and/or testing data"
        )

    # Extract columns for arrival time and stay duration for both train and test sets
    X_train = user_train_data[:, 0]  # Arrival time
    Y_train = user_train_data[:, 4]  # Stay duration

    X_test = user_test_data[:, 0]
    y_test = user_test_data[:, 4]

    # kde2d is imported at the top of this cell; the bandwidth it returns is unused
    (density, grid, _bandwidth) = kde2d(X_train, Y_train, n=128, limits=None)

    def predict_y_given_x(new_x, grid, density):
        # Nearest grid point along the arrival-time axis
        x_idx = np.argmin(np.abs(grid[0] - new_x))

        # Conditional slice of the density at that arrival time
        # NOTE(review): assumes density is indexed [x, y]; confirm against the kde2d docs
        y_values = grid[1]
        y_densities = density[x_idx]

        # Duration with the maximum density
        return y_values[np.argmax(y_densities)]

    # Predict y values for X_test using the density estimate
    y_pred = np.array(
        [predict_y_given_x(x_val, grid, density) for x_val in X_test],
        dtype=float,
    )

    # SMAPE with |y| + |y_hat| in the denominator. This equals |y + y_hat| when
    # both share a sign, but stays bounded by 100% if a grid value is negative.
    # Terms where both values are 0 contribute 0 instead of NaN.
    n = len(y_test)
    abs_error = np.abs(y_test - y_pred)
    scale = np.abs(y_test) + np.abs(y_pred)
    ratios = np.divide(
        abs_error, scale,
        out=np.zeros_like(abs_error, dtype=float),
        where=scale != 0,
    )
    smape_val = (1 / n) * np.sum(ratios) * 100

    return smape_val,density, grid



In [None]:
# Per-user SMAPE of the diffusion-KDE predictor on the JPL data
user_smape_JPL = {
    user_id: dkde_model(JPL_train, JPL_test, user_id)[0]
    for user_id in user_ids_JPL
}

# Unweighted mean over users
average_smape_JPL = np.mean(list(user_smape_JPL.values()))

print(f"Average SMAPE for JPL dataset using DKDE: {average_smape_JPL:.2f}%")

In [None]:
import numpy as np

# Requires R_SD, user_ids_JPL, JPL_train, JPL_test, dkde_model and
# user_smape_mode from the cells above.

# Users whose R_SD is BELOW this value are scored with the diffusion-KDE model;
# all other users fall back to the mode predictor.
DKDE_BELOW_R_SD = 3

user_smape_combined = {}  # user ID -> SMAPE of whichever model was selected
users_without_mode_score = []  # users sent to the mode predictor that has no score for them

for user_id in user_ids_JPL:
    if R_SD.get(user_id, 0) < DKDE_BELOW_R_SD:
        smape, density, grid = dkde_model(JPL_train, JPL_test, user_id)
    elif user_id in user_smape_mode:
        smape = user_smape_mode[user_id]
    else:
        # A missing mode score used to default to 0, i.e. a perfect SMAPE, which
        # lowers the average. Leave the user out of the average instead.
        users_without_mode_score.append(user_id)
        continue

    user_smape_combined[user_id] = smape

if users_without_mode_score:
    print(f"Excluded {len(users_without_mode_score)} user(s) with no mode-based SMAPE: {users_without_mode_score}")

# Unweighted mean over the users that could be scored
average_smape_combined = np.mean(list(user_smape_combined.values()))

print(f"Average SMAPE for JPL dataset using ensemble approach with threshold of {DKDE_BELOW_R_SD}: {average_smape_combined:.2f}%")
