In [None]:
# Mount Google Drive inside the Colab runtime at /content/drive.
# The CSV paths used later in this notebook all live under /content/drive/MyDrive.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
import numpy as np
import pandas as pd

# Locations of the four CSV files on the mounted Drive.
# NOTE: the folder name 'SuperUROP ' really does end with a space in these paths.
file_path1 = '/content/drive/MyDrive/SuperUROP /Data Analysis/caltech_training_data.csv'
file_path2 = '/content/drive/MyDrive/SuperUROP /Data Analysis/caltech_testing_data.csv'
file_path3 = '/content/drive/MyDrive/SuperUROP /Data Analysis/JPL_training_data.csv'
file_path4 = '/content/drive/MyDrive/SuperUROP /Data Analysis/JPL_testing_data.csv'

# Read every CSV with pandas and keep only the underlying NumPy array (.values).
# The files are read in the same order as the path variables above.
caltech_train, caltech_test, JPL_train, JPL_test = (
    pd.read_csv(csv_path).values
    for csv_path in (file_path1, file_path2, file_path3, file_path4)
)

# Data Processing for MLR (Multiple Linear Regression)

In [None]:
# Drop column 0 (the row number) from every split.
# Caution: this rebinds the same names, so running the cell twice strips a second column.
caltech_train, caltech_test, JPL_train, JPL_test = (
    split[:, 1:]
    for split in (caltech_train, caltech_test, JPL_train, JPL_test)
)

In [None]:
#Remove departure time (2nd column)
# Convert arrival date to hour and find day of the week
from datetime import datetime

def convert_time_and_day(data_array):
    """
    Replace each row's leading timestamp with a "<fractional hour> <weekday>" label.

    For every row:
      * column 0 is parsed as a '%Y-%m-%d %H:%M:%S' timestamp and turned into
        the string "<hour + minute/60> <weekday name>", e.g. "8.5 Monday"
        (seconds are ignored);
      * column 1 is dropped;
      * columns 2 and onward are carried over unchanged.

    Returns the rebuilt rows wrapped in a NumPy array.
    """
    def _relabel(row):
        stamp = datetime.strptime(row[0], '%Y-%m-%d %H:%M:%S')
        fractional_hour = stamp.hour + (stamp.minute / 60.0)
        # Time and weekday are packed into one string; a later step splits them apart.
        label = " ".join((str(fractional_hour), stamp.strftime('%A')))
        # Skip index 1 (the second column) when rebuilding the row.
        return [label, *row[2:]]

    return np.array([_relabel(row) for row in data_array])

# Apply the timestamp conversion to all four splits, in the same order as before.
caltech_train, caltech_test, JPL_train, JPL_test = map(
    convert_time_and_day,
    (caltech_train, caltech_test, JPL_train, JPL_test),
)

In [None]:
def day_to_number(day):
    """Return the 1-based index of an English weekday name (Monday=1 ... Sunday=7).

    Raises KeyError when ``day`` is not one of the seven capitalised names.
    """
    ordered_names = (
        'Monday', 'Tuesday', 'Wednesday', 'Thursday',
        'Friday', 'Saturday', 'Sunday',
    )
    lookup = {name: position for position, name in enumerate(ordered_names, start=1)}
    return lookup[day]

def separate_time_and_day(data_array):
    """
    Split the combined "<time> <weekday>" label in column 0 into two numeric columns.

    For every row, column 0 is split on whitespace into a time string and a
    weekday name. The time becomes a float and the weekday becomes its number
    from ``day_to_number`` (1-7). The remaining columns follow unchanged, so
    each output row is one column wider than its input row.

    Returns the rebuilt rows wrapped in a NumPy array.
    """
    def _split_label(record):
        hour_text, weekday = record[0].split()
        hour_value = float(hour_text)
        weekday_number = day_to_number(weekday)
        return [hour_value, weekday_number, *record[1:]]

    return np.array([_split_label(record) for record in data_array])

# Split the combined time/day column for all four splits, in the same order as before.
caltech_train, caltech_test, JPL_train, JPL_test = [
    separate_time_and_day(split)
    for split in (caltech_train, caltech_test, JPL_train, JPL_test)
]

In [None]:
# Make the training and testing sets of each site contain the same user IDs (column 3).
# This is done in two passes per site, and the order matters:
#   1. keep only testing rows whose user also appears in training;
#   2. keep only training rows whose user appears in the already-filtered testing set.
# np.isin is given list(...) of each set because it expects an array-like, not a set.

# --- Caltech ---
users_from_training_caltech = set(caltech_train[:, 3])
mask_caltech = np.isin(caltech_test[:, 3], list(users_from_training_caltech))
caltech_test = caltech_test[mask_caltech]
# Built from the filtered test array, so it only holds users present in both splits.
users_from_testing_caltech = set(caltech_test[:, 3])
mask_caltech = np.isin(caltech_train[:, 3], list(users_from_testing_caltech))
caltech_train = caltech_train[mask_caltech]

# --- JPL (same two passes) ---
users_from_training = set(JPL_train[:, 3])
mask = np.isin(JPL_test[:, 3], list(users_from_training))
JPL_test = JPL_test[mask]
users_from_testing = set(JPL_test[:, 3])
mask = np.isin(JPL_train[:, 3], list(users_from_testing))
JPL_train = JPL_train[mask]

In [None]:
# Rebuild every split as a float array; all remaining columns must be numeric
# (or numeric strings) for this conversion to succeed.
caltech_train, caltech_test, JPL_train, JPL_test = (
    np.array(split, dtype='float')
    for split in (caltech_train, caltech_test, JPL_train, JPL_test)
)

# MLR Duration Prediction

In [None]:
import numpy as np
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error

# Per-user model: fit on one user's training rows, score on that user's testing rows

def _smape_percent(y_true, y_pred):
    """Return mean(|y_true - y_pred| / (|y_true| + |y_pred|)) as a percentage (0-100)."""
    abs_error = np.abs(y_true - y_pred)
    # Use |a| + |b| rather than |a + b|: a negative prediction must not shrink
    # (or zero out) the denominator, which would push the score past 100% or to inf.
    scale = np.abs(y_true) + np.abs(y_pred)
    # When both values are exactly zero the prediction is perfect; count the
    # term as 0 instead of letting 0/0 turn the whole average into NaN.
    ratios = np.divide(
        abs_error,
        scale,
        out=np.zeros_like(scale, dtype=float),
        where=scale != 0,
    )
    return np.mean(ratios) * 100


def train_and_test_user_model(train_data, test_data, user_id):
    """
    Fit a linear regression for one user and score it with SMAPE.

    Rows are selected where column 3 equals ``user_id``. Columns 0 and 1 are
    the features (arrival time and day of the week, per the notebook's
    preprocessing) and column 4 is the target (duration).

    Parameters
    ----------
    train_data, test_data : 2-D NumPy arrays with at least five columns.
    user_id : value compared against column 3 to select the user's rows.

    Returns
    -------
    (model, smape_val) : the fitted ``LinearRegression`` and the user's SMAPE
        in percent, computed as mean(|y - y_hat| / (|y| + |y_hat|)) * 100.

    Raises
    ------
    ValueError
        If the user has no rows in ``train_data`` or in ``test_data``.
    """
    # Filter training and testing data for the specific user
    user_train_data = train_data[train_data[:, 3] == user_id]
    user_test_data = test_data[test_data[:, 3] == user_id]

    if len(user_train_data) == 0 or len(user_test_data) == 0:
        raise ValueError(
            f"User {user_id} needs at least one training row and one testing row "
            f"(found {len(user_train_data)} training, {len(user_test_data)} testing)."
        )

    # Independent variables: first two columns
    X_train = user_train_data[:, :2]
    X_test = user_test_data[:, :2]

    # Dependent variable: column 4
    y_train = user_train_data[:, 4]
    y_test = user_test_data[:, 4]

    # Train the model and predict on the user's test rows
    model = LinearRegression().fit(X_train, y_train)
    y_pred = model.predict(X_test)

    smape_val = _smape_percent(y_test, y_pred)

    return model, smape_val


In [None]:
# Every user ID found in either split of a site gets its own model.
user_ids_caltech = np.unique(np.concatenate((caltech_train[:, 3], caltech_test[:, 3])))
user_ids_JPL = np.unique(np.concatenate((JPL_train[:, 3], JPL_test[:, 3])))

# Fit one model per user and keep only its SMAPE (index 1 of the returned pair).
smape_list_caltech = [
    train_and_test_user_model(caltech_train, caltech_test, uid)[1]
    for uid in user_ids_caltech
]
smape_list_JPL = [
    train_and_test_user_model(JPL_train, JPL_test, uid)[1]
    for uid in user_ids_JPL
]



In [None]:
# Average the per-user SMAPE values for each site (every user weighted equally).
no_caltech_users, no_JPL_users = len(user_ids_caltech), len(user_ids_JPL)

caltech_smape = sum(smape_list_caltech) / no_caltech_users
JPL_smape = sum(smape_list_JPL) / no_JPL_users

print(f"Caltech SMAPE: {caltech_smape}")
print(f"JPL SMAPE: {JPL_smape}")

# Calculate SMAPE for different thresholds

In [None]:
# Map every JPL user ID to the SMAPE of that user's own model.
user_smape_JPL = {
    uid: train_and_test_user_model(JPL_train, JPL_test, uid)[1]
    for uid in user_ids_JPL
}

# Unweighted mean across users.
average_smape_JPL = np.mean(list(user_smape_JPL.values()))

print(f"Average SMAPE for JPL dataset: {average_smape_JPL:.2f}%")

In [None]:
# Hard-coded per-user values keyed by user ID (as strings). The cells below compare
# these values against thresholds and label them "Ratio" in their output.
# NOTE(review): this notebook does not show how R_SD was computed or what it
# measures - presumably produced by a separate analysis; confirm the source.
R_SD={'169': 2.2551554266668488,
 '171': 3.7531759242694362,
 '176': 3.033506931058353,
 '220': 4.157056240805807,
 '322': 2.929926450352931,
 '334': 1.5403402896510696,
 '335': 4.182440356147159,
 '346': 4.63088280381524,
 '365': 4.612058546331858,
 '368': 3.8637314436567745,
 '372': 3.4779871035895753,
 '374': 3.7627475450103334,
 '378': 3.61759981491824,
 '382': 4.358454020839778,
 '404': 4.285879042160351,
 '405': 4.760574930778141,
 '406': 1.1510712960433436,
 '409': 3.3539069505133905,
 '410': 4.301005502712871,
 '416': 4.199233283013298,
 '436': 4.091891479434165,
 '444': 3.529125348909349,
 '458': 4.060001379997106,
 '467': 4.019928954004856,
 '474': 3.2450469879409285,
 '476': 4.393288833959378,
 '481': 4.429827240449028,
 '483': 3.079068807362642,
 '507': 4.336149365824373,
 '526': 2.4205249888839466,
 '531': 3.9517493146670986,
 '537': 3.1479021761196098,
 '551': 4.881572582872202,
 '553': 3.0305502893037675,
 '576': 3.628599820757222,
 '577': 4.319531561288392,
 '581': 3.4796093044547454,
 '592': 3.1009000797824084,
 '607': 4.831651402991986,
 '651': 3.6013651820057837,
 '726': 3.7379880870161575,
 '742': 4.150297519178871,
 '826': 3.710092899316145,
 '933': 4.41718644718856}

# Re-key the dictionary with integer user IDs so that numeric lookups work
# (a float ID such as 169.0 compares and hashes equal to the int key 169).
R_SD = {int(key): value for key, value in R_SD.items()}

In [None]:
import numpy as np

# Per-user JPL SMAPE, split by whether the user's R_SD value is below 3.5.
user_smape_JPL_less_than_3_5 = {}
user_smape_JPL_greater_than_3_5 = {}

for user_id in user_ids_JPL:
    model, smape = train_and_test_user_model(JPL_train, JPL_test, user_id)

    # Users missing from R_SD fall back to 0 and therefore land in the "less than" group.
    if not R_SD.get(user_id, 0) < 3.5:
        user_smape_JPL_greater_than_3_5[user_id] = smape
        print(f'Above 3.5: User {user_id} SMAPE: {smape:.2f}%')
    else:
        user_smape_JPL_less_than_3_5[user_id] = smape

# Group averages; an empty group is reported as 0.
if user_smape_JPL_less_than_3_5:
    average_smape_JPL_less_than_3_5 = np.mean(list(user_smape_JPL_less_than_3_5.values()))
else:
    average_smape_JPL_less_than_3_5 = 0

if user_smape_JPL_greater_than_3_5:
    average_smape_JPL_greater_than_3_5 = np.mean(list(user_smape_JPL_greater_than_3_5.values()))
else:
    average_smape_JPL_greater_than_3_5 = 0

print(f"Average SMAPE for JPL dataset (Ratio < 3.5): {average_smape_JPL_less_than_3_5:.2f}%")
print(f"Average SMAPE for JPL dataset (Ratio > 3.5): {average_smape_JPL_greater_than_3_5:.2f}%")


In [None]:
import numpy as np

# Per-user JPL SMAPE, split by an R_SD threshold.
#
# NOTE(review): the variable names below say "4", but the comparison in this cell
# has always used 4.5. The numeric behaviour (4.5) is kept so results do not change;
# the printed labels now report the threshold actually applied. If 4 was the intended
# cut-off, change `ratio_threshold` - the comparison and every label follow it.
# The "_4" variable names are kept so any later cell that reads them still works.
ratio_threshold = 4.5

user_smape_JPL_less_than_4 = {}
user_smape_JPL_greater_than_4 = {}

# Loop through each user ID in the JPL dataset
for user_id in user_ids_JPL:
    model, smape = train_and_test_user_model(JPL_train, JPL_test, user_id)

    # Users missing from R_SD fall back to 0 and therefore land in the "less than" group.
    if R_SD.get(user_id, 0) < ratio_threshold:
        user_smape_JPL_less_than_4[user_id] = smape
    else:
        user_smape_JPL_greater_than_4[user_id] = smape
        print(f'At or above {ratio_threshold}: User {user_id} SMAPE: {smape:.2f}%')

# Average SMAPE for users with ratio < threshold (0 if the group is empty)
average_smape_JPL_less_than_4 = np.mean(list(user_smape_JPL_less_than_4.values())) if user_smape_JPL_less_than_4 else 0

# Average SMAPE for users with ratio >= threshold (0 if the group is empty)
average_smape_JPL_greater_than_4 = np.mean(list(user_smape_JPL_greater_than_4.values())) if user_smape_JPL_greater_than_4 else 0

print(f"Average SMAPE for JPL dataset (Ratio < {ratio_threshold}): {average_smape_JPL_less_than_4:.2f}%")
print(f"Average SMAPE for JPL dataset (Ratio >= {ratio_threshold}): {average_smape_JPL_greater_than_4:.2f}%")
