In [None]:
# Colab-only step: mount Google Drive at /content/drive so the CSV paths
# used by the loading cell (all under /content/drive/MyDrive/...) resolve.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
import numpy as np
import pandas as pd

# Locations of the four CSV exports on the mounted Drive
file_path1 = '/content/drive/MyDrive/SuperUROP /Data Analysis/caltech_training_data.csv'
file_path2 = '/content/drive/MyDrive/SuperUROP /Data Analysis/caltech_testing_data.csv'
file_path3 = '/content/drive/MyDrive/SuperUROP /Data Analysis/JPL_training_data.csv'
file_path4  = '/content/drive/MyDrive/SuperUROP /Data Analysis/JPL_testing_data.csv'


def _csv_to_array(csv_path):
    """Read one CSV with pandas and return its contents as a NumPy array."""
    return pd.read_csv(csv_path).values


# Load each split (train/test per site) as a plain NumPy array
caltech_train, caltech_test = _csv_to_array(file_path1), _csv_to_array(file_path2)
JPL_train, JPL_test = _csv_to_array(file_path3), _csv_to_array(file_path4)

# Data Processing for Decision Tree

In [None]:
# Strip the leading row-number column (column 0) from every split.
# Note: re-running this cell removes one more column each time.
caltech_train, caltech_test, JPL_train, JPL_test = (
    split[:, 1:] for split in (caltech_train, caltech_test, JPL_train, JPL_test)
)

In [None]:
#Remove departure time (2nd column)
# Convert arrival date to hour and find day of the week
from datetime import datetime

def convert_time_and_day(data_array):
    """
    Rewrite each row's timestamp as "<fractional hour> <weekday name>" and drop column 1.

    Column 0 must be a string in '%Y-%m-%d %H:%M:%S' format. The hour is
    expressed as hour + minute/60 (seconds are ignored) and the weekday name
    comes from strftime('%A'). The second column of every row is discarded;
    all remaining columns are kept in order after the new time/day string.

    Returns a NumPy array built from the transformed rows.
    """
    def _rebuild(record):
        # Parse the timestamp once and derive both pieces from it
        stamp = datetime.strptime(record[0], '%Y-%m-%d %H:%M:%S')
        fractional_hour = stamp.hour + (stamp.minute / 60.0)
        weekday_name = stamp.strftime('%A')
        # Skip index 1 (the second column) when reassembling the row
        return [str(fractional_hour) + " " + weekday_name] + list(record[2:])

    return np.array([_rebuild(record) for record in data_array])

# Apply the timestamp conversion to all four splits, in the same order as before
caltech_train, caltech_test, JPL_train, JPL_test = map(
    convert_time_and_day, (caltech_train, caltech_test, JPL_train, JPL_test)
)

In [None]:
def day_to_number(day):
    """
    Map an English weekday name to its position in the week (Monday=1 ... Sunday=7).

    Raises KeyError for any string that is not one of the seven capitalised names.
    """
    ordered_names = ('Monday', 'Tuesday', 'Wednesday', 'Thursday',
                     'Friday', 'Saturday', 'Sunday')
    lookup = {name: position for position, name in enumerate(ordered_names, start=1)}
    return lookup[day]

def separate_time_and_day(data_array):
    """
    Split the combined "<hour> <weekday>" string in column 0 into two columns.

    The hour part is parsed as a float and the weekday name is turned into a
    number via day_to_number (1-7). Each output row is
    [hour, day_number, *remaining original columns].

    Returns a NumPy array built from the transformed rows.
    """
    def _expand(record):
        hour_text, weekday_name = record[0].split()
        return [float(hour_text), day_to_number(weekday_name)] + list(record[1:])

    return np.array([_expand(record) for record in data_array])

# Split the time/day column for all four splits, in the same order as before
caltech_train, caltech_test, JPL_train, JPL_test = map(
    separate_time_and_day, (caltech_train, caltech_test, JPL_train, JPL_test)
)

In [None]:
#Make training and testing set have the same user IDs
def _keep_shared_users(train_rows, test_rows, user_col=3):
    """
    Restrict a train/test pair to the users that occur in both arrays.

    Test rows whose user ID (column `user_col`) never appears in the training
    rows are dropped first; training rows whose user no longer appears in the
    remaining test rows are dropped second.

    Returns the filtered (train_rows, test_rows) pair.
    """
    test_rows = test_rows[np.isin(test_rows[:, user_col], np.unique(train_rows[:, user_col]))]
    train_rows = train_rows[np.isin(train_rows[:, user_col], np.unique(test_rows[:, user_col]))]
    return train_rows, test_rows


# Same filter for both sites; user IDs live in column 3 at this stage
caltech_train, caltech_test = _keep_shared_users(caltech_train, caltech_test)
JPL_train, JPL_test = _keep_shared_users(JPL_train, JPL_test)

In [None]:
# Convert every split to a float array (values parse from their string form)
caltech_train, caltech_test, JPL_train, JPL_test = (
    np.array(split, dtype='float')
    for split in (caltech_train, caltech_test, JPL_train, JPL_test)
)

# Predict stay duration using K-fold CV & Grid Search

In [None]:
import numpy as np
from sklearn.model_selection import GridSearchCV
from sklearn.tree import DecisionTreeRegressor
from sklearn.preprocessing import StandardScaler
import matplotlib.pyplot as plt

def perform_grid_search_decision_tree(train_data, test_data, used_id, depth_range=(1, 21), min_samples_split_range=(2, 11), cv_folds=5):
    """
    Tune a Decision Tree regressor for one user with K-fold grid search and
    score it on that user's test rows.

    Parameters:
    - train_data: 2-D training array; column 0 (arrival time) is the only feature,
      column 3 holds the user ID and column 4 the stay duration (target)
    - test_data: Testing array with the same column layout as train_data
    - used_id: ID of the user whose rows are selected from both arrays
    - depth_range: (start, stop) pair; max_depth is searched over range(start, stop) (default is (1, 21))
    - min_samples_split_range: (start, stop) pair; min_samples_split is searched over range(start, stop) (default is (2, 11))
    - cv_folds: Number of cross-validation folds (default is 5)

    Returns:
    - best_params: Best hyperparameters found by the grid search
    - smape_val: SMAPE (in percent) of the best estimator on the user's test rows

    Raises:
    - ValueError: if the user has no rows in train_data or in test_data
    """

    # Filter training and testing data for the specific user.
    # Use the function's own argument: the previous body read a module-level
    # `user_id`, which only worked when the caller's loop variable had that name.
    user_train_data = train_data[train_data[:, 3] == used_id]
    user_test_data = test_data[test_data[:, 3] == used_id]

    if len(user_train_data) == 0 or len(user_test_data) == 0:
        raise ValueError(f"No training or testing rows found for user {used_id!r}")

    # Extract columns for arrival time and stay duration for both train and test sets
    X_train = user_train_data[:, 0].reshape(-1, 1)  # Arrival time
    y_train = user_train_data[:, 4]  # Stay duration

    X_test = user_test_data[:, 0].reshape(-1, 1)
    y_test = user_test_data[:, 4]

    # Set up grid search with cross-validation for Decision Tree
    param_grid = {
        'max_depth': list(range(depth_range[0], depth_range[1])),
        'min_samples_split': list(range(min_samples_split_range[0], min_samples_split_range[1]))
    }
    dt = DecisionTreeRegressor()
    grid_search = GridSearchCV(dt, param_grid, cv=cv_folds, scoring='neg_mean_squared_error', return_train_score=True)
    grid_search.fit(X_train, y_train)

    # Model refit with the best hyperparameter combination
    best_dt = grid_search.best_estimator_

    # Predict on the test set
    y_pred = best_dt.predict(X_test)

    # Best hyperparameters
    best_params = grid_search.best_params_

    # Calculate user SMAPE: 100/n * sum(|y - y_hat| / |y + y_hat|).
    # A term is nan/inf when y + y_hat == 0 for that sample.
    n = len(y_test)
    smape_val = (1/ n) * np.sum(np.abs(y_test - y_pred) / (np.abs(y_test+y_pred)))*100

    return best_params,smape_val




In [None]:
# Run the per-user grid search and collect one SMAPE and one best-parameter
# dict per user. User IDs are the unique values of column 3 across train+test.
user_ids_caltech = np.unique(np.concatenate((caltech_train[:, 3], caltech_test[:, 3])))
user_ids_JPL = np.unique(np.concatenate((JPL_train[:, 3], JPL_test[:, 3])))

smape_list_caltech=[]
smape_list_JPL=[]
best_params_caltech=[]
best_params_JPL=[]
# The Caltech loop below is disabled, so smape_list_caltech and
# best_params_caltech stay empty after this cell.
# for user_id in user_ids_caltech:
#     best_params,smape = perform_grid_search_decision_tree(caltech_train, caltech_test, user_id)
#     smape_list_caltech.append(smape)
#     best_params_caltech.append(best_params)
# NOTE(review): keep the loop variable named `user_id` unless you have verified
# that perform_grid_search_decision_tree uses its own parameter rather than a
# module-level `user_id` when filtering rows.
for user_id in user_ids_JPL:
    best_params, smape = perform_grid_search_decision_tree(JPL_train, JPL_test, user_id)
    smape_list_JPL.append(smape)
    best_params_JPL.append(best_params)

In [None]:
# Display the best hyperparameters per JPL user (one dict per user, in user_ids_JPL order)
best_params_JPL

In [None]:
#Calculate average SMAPE for each location
no_caltech_users=len(user_ids_caltech)
no_JPL_users=len(user_ids_JPL)

# Average over the scores that were actually collected. Dividing an empty
# list's sum by the user count would report 0.0 as if it were a real SMAPE
# (e.g. when a site's evaluation loop was not run); report nan instead.
caltech_smape = (
    sum(smape_list_caltech) / len(smape_list_caltech)
    if smape_list_caltech else float('nan')
)
JPL_smape = (
    sum(smape_list_JPL) / len(smape_list_JPL)
    if smape_list_JPL else float('nan')
)

print(f"Caltech SMAPE (K-fold CV): {caltech_smape}")
print(f"JPL SMAPE (K-fold CV): {JPL_smape}")

## Predict stay duration using Leave-One-Out CV & Grid Search

In [None]:
import numpy as np
from sklearn.model_selection import GridSearchCV,LeaveOneOut
from sklearn.tree import DecisionTreeRegressor
from sklearn.preprocessing import StandardScaler
import matplotlib.pyplot as plt

def perform_grid_search_decision_tree(train_data, test_data, used_id, depth_range=(1, 21), min_samples_split_range=(2, 11)):
    """
    Tune a Decision Tree regressor for one user with Leave-One-Out grid search
    and score it on that user's test rows.

    NOTE: this definition reuses the name of the earlier K-fold variant and
    replaces it once this cell has run.

    Parameters:
    - train_data: 2-D training array; column 0 (arrival time) is the only feature,
      column 3 holds the user ID and column 4 the stay duration (target)
    - test_data: Testing array with the same column layout as train_data
    - used_id: ID of the user whose rows are selected from both arrays
    - depth_range: (start, stop) pair; max_depth is searched over range(start, stop) (default is (1, 21))
    - min_samples_split_range: (start, stop) pair; min_samples_split is searched over range(start, stop) (default is (2, 11))

    Returns:
    - best_params: Best hyperparameters found by the grid search
    - smape_val: SMAPE (in percent) of the best estimator on the user's test rows

    Raises:
    - ValueError: if the user has no rows in train_data or in test_data
    """

    # Filter training and testing data for the specific user.
    # Use the function's own argument: the previous body read a module-level
    # `user_id`, which only worked when the caller's loop variable had that name.
    user_train_data = train_data[train_data[:, 3] == used_id]
    user_test_data = test_data[test_data[:, 3] == used_id]

    if len(user_train_data) == 0 or len(user_test_data) == 0:
        raise ValueError(f"No training or testing rows found for user {used_id!r}")

    # Extract columns for arrival time and stay duration for both train and test sets
    X_train = user_train_data[:, 0].reshape(-1, 1)  # Arrival time
    y_train = user_train_data[:, 4]  # Stay duration

    X_test = user_test_data[:, 0].reshape(-1, 1)
    y_test = user_test_data[:, 4]

    # Set up grid search with cross-validation for Decision Tree
    param_grid = {
        'max_depth': list(range(depth_range[0], depth_range[1])),
        'min_samples_split': list(range(min_samples_split_range[0], min_samples_split_range[1]))
    }

    # Use LeaveOneOut for cross-validation (one fold per training sample)
    loo = LeaveOneOut()

    dt = DecisionTreeRegressor()
    grid_search = GridSearchCV(dt, param_grid, cv=loo, scoring='neg_mean_squared_error', return_train_score=True)
    grid_search.fit(X_train, y_train)

    # Model refit with the best hyperparameter combination
    best_dt = grid_search.best_estimator_

    # Predict on the test set
    y_pred = best_dt.predict(X_test)

    # Best hyperparameters
    best_params = grid_search.best_params_

    # Calculate user SMAPE: 100/n * sum(|y - y_hat| / |y + y_hat|).
    # A term is nan/inf when y + y_hat == 0 for that sample.
    n = len(y_test)
    smape_val = (1/ n) * np.sum(np.abs(y_test - y_pred) / (np.abs(y_test+y_pred)))*100

    return best_params,smape_val




In [None]:
# Run the per-user grid search for both sites and collect one SMAPE and one
# best-parameter dict per user. The result lists are reset here, so values
# from any earlier evaluation cell are discarded.
user_ids_caltech = np.unique(np.concatenate((caltech_train[:, 3], caltech_test[:, 3])))
user_ids_JPL = np.unique(np.concatenate((JPL_train[:, 3], JPL_test[:, 3])))

smape_list_caltech=[]
smape_list_JPL=[]
best_params_caltech=[]
best_params_JPL=[]
# NOTE(review): keep the loop variables named `user_id` unless you have verified
# that perform_grid_search_decision_tree uses its own parameter rather than a
# module-level `user_id` when filtering rows.
for user_id in user_ids_caltech:
    best_params,smape = perform_grid_search_decision_tree(caltech_train, caltech_test, user_id)
    smape_list_caltech.append(smape)
    best_params_caltech.append(best_params)
for user_id in user_ids_JPL:
    best_params, smape = perform_grid_search_decision_tree(JPL_train, JPL_test, user_id)
    smape_list_JPL.append(smape)
    best_params_JPL.append(best_params)

In [None]:
# Mean of the per-user SMAPE values at each site
no_caltech_users, no_JPL_users = len(user_ids_caltech), len(user_ids_JPL)

caltech_total = sum(smape_list_caltech)
JPL_total = sum(smape_list_JPL)
caltech_smape = caltech_total / no_caltech_users
JPL_smape = JPL_total / no_JPL_users

print(f"Caltech SMAPE (Leave One Out CV): {caltech_smape}")
print(f"JPL SMAPE (Leave One Out CV): {JPL_smape}")

# Plot SMAPE vs R_SD for all users (JPL)

In [None]:
# Map each JPL user ID to that user's test-set SMAPE
user_smape_JPL = {}

# Loop through each user ID in the JPL dataset.
# NOTE(review): this appears to repeat the per-user search already run for JPL
# in the evaluation cell that fills smape_list_JPL; reusing those values would
# avoid a second full grid search — confirm before changing.
# NOTE(review): keep the loop variable named `user_id` unless you have verified
# that perform_grid_search_decision_tree uses its own parameter rather than a
# module-level `user_id` when filtering rows.
for user_id in user_ids_JPL:
    best_params,smape = perform_grid_search_decision_tree(JPL_train, JPL_test, user_id)
    user_smape_JPL[user_id] = smape  # Store the SMAPE in the dictionary

# Unweighted mean over users
average_smape_JPL = np.mean(list(user_smape_JPL.values()))

print(f"Average SMAPE for JPL dataset: {average_smape_JPL:.2f}%")

In [None]:
# Hard-coded per-user R_SD values, keyed by user ID as a string. They are
# matched against user_smape_JPL in the next cell.
# NOTE(review): how these numbers were produced is not shown in this notebook —
# record their provenance.
R_SD={'169': 2.2551554266668488,
 '171': 3.7531759242694362,
 '176': 3.033506931058353,
 '220': 4.157056240805807,
 '322': 2.929926450352931,
 '334': 1.5403402896510696,
 '335': 4.182440356147159,
 '346': 4.63088280381524,
 '365': 4.612058546331858,
 '368': 3.8637314436567745,
 '372': 3.4779871035895753,
 '374': 3.7627475450103334,
 '378': 3.61759981491824,
 '382': 4.358454020839778,
 '404': 4.285879042160351,
 '405': 4.760574930778141,
 '406': 1.1510712960433436,
 '409': 3.3539069505133905,
 '410': 4.301005502712871,
 '416': 4.199233283013298,
 '436': 4.091891479434165,
 '444': 3.529125348909349,
 '458': 4.060001379997106,
 '467': 4.019928954004856,
 '474': 3.2450469879409285,
 '476': 4.393288833959378,
 '481': 4.429827240449028,
 '483': 3.079068807362642,
 '507': 4.336149365824373,
 '526': 2.4205249888839466,
 '531': 3.9517493146670986,
 '537': 3.1479021761196098,
 '551': 4.881572582872202,
 '553': 3.0305502893037675,
 '576': 3.628599820757222,
 '577': 4.319531561288392,
 '581': 3.4796093044547454,
 '592': 3.1009000797824084,
 '607': 4.831651402991986,
 '651': 3.6013651820057837,
 '726': 3.7379880870161575,
 '742': 4.150297519178871,
 '826': 3.710092899316145,
 '933': 4.41718644718856}

In [None]:
import matplotlib.pyplot as plt

# Re-key R_SD by integer user ID so it can be matched against user_smape_JPL
# (whose numeric keys compare equal to these ints)
R_SD_int_keys = {int(user): value for user, value in R_SD.items()}

# Users present in both mappings, and their paired (R_SD, SMAPE) values
common_keys = set(user_smape_JPL).intersection(R_SD_int_keys.keys())
paired_values = [(R_SD_int_keys[user], user_smape_JPL[user]) for user in common_keys]
x_values = [metric for metric, _ in paired_values]
y_values = [score for _, score in paired_values]

# Pearson correlation between the two series
correlation_matrix = np.corrcoef(x_values, y_values)
correlation_coefficient = correlation_matrix[0, 1]
print(correlation_coefficient)

# Least-squares straight line: m is the slope, b the intercept
m, b = np.polyfit(x_values, y_values, 1)
fit_line = [m*point + b for point in x_values]

# Scatter of the per-user values with the fitted line on top
fig_rsd, ax_rsd = plt.subplots(figsize=(10, 6))
ax_rsd.scatter(x_values, y_values, color='blue')
ax_rsd.plot(x_values, fit_line, color='black')
ax_rsd.set_title("Comparison of JPL SMAPE and R_SD values for each user")
ax_rsd.set_xlabel("R_SD")
ax_rsd.set_ylabel("JPL SMAPE")
ax_rsd.grid(True)
plt.show()



In [None]:
# Hard-coded per-user entropy values, keyed by user ID as a string. They are
# correlated with user_smape_JPL in a later cell (plotted as "Entropy_SD").
# NOTE(review): how these numbers were produced is not shown in this notebook —
# record their provenance.
entropy={'169': 2.230282388872729,
 '171': 3.684183719779189,
 '176': 2.9926140680171254,
 '220': 4.060262039120378,
 '322': 2.8868392966712704,
 '334': 1.5327896019567016,
 '335': 4.085055102756477,
 '346': 4.494680368408909,
 '365': 4.476409765557392,
 '368': 3.77850207357611,
 '372': 3.422577995321604,
 '374': 3.6889681813826796,
 '378': 3.5555331514269954,
 '382': 4.251629167387822,
 '404': 4.180832987205441,
 '405': 4.6030559073332755,
 '406': 1.1440181631019015,
 '409': 3.3086949695628425,
 '410': 4.179776180945229,
 '416': 4.106603137064475,
 '436': 4.001629167387823,
 '444': 3.468576629687865,
 '458': 3.9754180179138334,
 '467': 3.9361804341297546,
 '474': 3.1973257087065035,
 '476': 4.280226253673659,
 '481': 4.315824333525708,
 '483': 3.030015015088482,
 '507': 4.229871195093383,
 '526': 2.3938280220947856,
 '531': 3.864578373902383,
 '537': 3.1016094970590276,
 '551': 4.720049960644813,
 '553': 2.9934111926211235,
 '576': 3.561897618169773,
 '577': 4.2136606896881865,
 '581': 3.4241743523004415,
 '592': 3.0552986080209026,
 '607': 4.671780584510634,
 '651': 3.5307501784370428,
 '726': 3.669275070710713,
 '742': 4.053660689688186,
 '826': 3.646439344671015,
 '933': 4.303508854797678}

# Hard-coded per-user sparsity values, keyed by user ID as a string. They are
# correlated with user_smape_JPL in a later cell (plotted as "Sparsity_SD").
# NOTE(review): provenance not shown in this notebook — record it.
sparsity={'169': 0.9889705882352942,
 '171': 0.9816176470588235,
 '176': 0.9865196078431373,
 '220': 0.9767156862745098,
 '322': 0.9852941176470589,
 '334': 0.9950980392156863,
 '335': 0.9767156862745098,
 '346': 0.9705882352941176,
 '365': 0.9705882352941176,
 '368': 0.9779411764705882,
 '372': 0.9840686274509803,
 '374': 0.9803921568627451,
 '378': 0.9828431372549019,
 '382': 0.9754901960784313,
 '404': 0.9754901960784313,
 '405': 0.9669117647058824,
 '406': 0.9938725490196079,
 '409': 0.9865196078431373,
 '410': 0.9718137254901961,
 '416': 0.9779411764705882,
 '436': 0.9779411764705882,
 '444': 0.9828431372549019,
 '458': 0.9791666666666666,
 '467': 0.9791666666666666,
 '474': 0.9852941176470589,
 '476': 0.9742647058823529,
 '481': 0.9742647058823529,
 '483': 0.9840686274509803,
 '507': 0.9754901960784313,
 '526': 0.9889705882352942,
 '531': 0.9779411764705882,
 '537': 0.9852941176470589,
 '551': 0.9669117647058824,
 '553': 0.9877450980392157,
 '576': 0.9816176470588235,
 '577': 0.9754901960784313,
 '581': 0.9840686274509803,
 '592': 0.9852941176470589,
 '607': 0.9669117647058824,
 '651': 0.9803921568627451,
 '726': 0.9816176470588235,
 '742': 0.9767156862745098,
 '826': 0.9828431372549019,
 '933': 0.9742647058823529}

In [None]:
import matplotlib.pyplot as plt

# Re-key the entropy values by integer user ID so they can be matched
# against user_smape_JPL
entropy_int_keys = {int(user): value for user, value in entropy.items()}

# Users present in both mappings, and their paired (entropy, SMAPE) values
common_keys_ent = set(user_smape_JPL).intersection(entropy_int_keys.keys())
paired_ent = [(entropy_int_keys[user], user_smape_JPL[user]) for user in common_keys_ent]
x_values_ent = [metric for metric, _ in paired_ent]
y_values_ent = [score for _, score in paired_ent]

# Pearson correlation between entropy and SMAPE
correlation_matrix_ent = np.corrcoef(x_values_ent, y_values_ent)
correlation_coefficient_ent = correlation_matrix_ent[0, 1]
print(correlation_coefficient_ent)

# Least-squares straight line: m is the slope, b the intercept
m, b = np.polyfit(x_values_ent, y_values_ent, 1)
fit_line_ent = [m*point + b for point in x_values_ent]




In [None]:
import matplotlib.pyplot as plt

# Re-key the sparsity values by integer user ID so they can be matched
# against user_smape_JPL
sparsity_int_keys = {int(user): value for user, value in sparsity.items()}

# Users present in both mappings, and their paired (sparsity, SMAPE) values
common_keys_spars = set(user_smape_JPL).intersection(sparsity_int_keys.keys())
paired_spars = [(sparsity_int_keys[user], user_smape_JPL[user]) for user in common_keys_spars]
x_values_spars = [metric for metric, _ in paired_spars]
y_values_spars = [score for _, score in paired_spars]

# Pearson correlation between sparsity and SMAPE
correlation_matrix_spars = np.corrcoef(x_values_spars, y_values_spars)
correlation_coefficient_spars = correlation_matrix_spars[0, 1]
print(correlation_coefficient_spars)

# Least-squares straight line: m is the slope, b the intercept
m, b = np.polyfit(x_values_spars, y_values_spars, 1)
fit_line_spars = [m*point + b for point in x_values_spars]


In [None]:
import matplotlib.pyplot as plt

# One row, two panels: SMAPE against entropy (left) and against sparsity (right)
fig, ax = plt.subplots(1, 2, figsize=(20, 6))

# (x values, y values, fitted line, x-axis label, correlation) for each panel
panel_specs = (
    (x_values_ent, y_values_ent, fit_line_ent, "Entropy_SD", correlation_coefficient_ent),
    (x_values_spars, y_values_spars, fit_line_spars, "Sparsity_SD", correlation_coefficient_spars),
)

for panel, (xs, ys, trend, x_label, corr) in zip(ax, panel_specs):
    panel.scatter(xs, ys, color='blue')
    panel.plot(xs, trend, color='black')
    panel.set_xlabel(x_label, fontsize=16)
    panel.set_ylabel("DT", fontsize=16)
    panel.grid(True)
    # Correlation annotation in axes-relative coordinates
    panel.text(0.05, 0.81, f'Correlation: {corr:.3f}', transform=panel.transAxes, fontsize=14)

plt.tight_layout()
plt.show()
