In [None]:
# Mount Google Drive at /content/drive so files stored there can be read (Colab only).
from google.colab import drive
drive.mount('/content/drive')

In [None]:
# Fetch the SAMoSSA source code into the current working directory.
!git clone https://github.com/AbdullahO/SAMoSSA.git

In [None]:
# Work from inside the cloned repository (presumably so `samossa` can be imported from it).
%cd /content/SAMoSSA

In [None]:
from samossa import SAMoSSA

# Data Processing
Filter the data to keep only the users with an appropriate number of charging sessions to satisfy the requirements: the number of columns of the page matrix must be an **integer** and equal to $\sqrt{N/T}$.

In [None]:
import numpy as np
import pandas as pd

# Locations of the JPL train/test CSV exports on the mounted Drive
file_path3 = '/content/drive/MyDrive/SuperUROP /Data Analysis/JPL_training_data.csv'
file_path4 = '/content/drive/MyDrive/SuperUROP /Data Analysis/JPL_testing_data.csv'

# Read each CSV with pandas and keep only the underlying NumPy array
JPL_train, JPL_test = (
    pd.read_csv(csv_path).values for csv_path in (file_path3, file_path4)
)

In [None]:
# Inspect the training array exactly as loaded from the CSV.
JPL_train

In [None]:
# Remove the row number stored in the 1st column by keeping the six data columns
# (arrival_time, departure_time, duration, user_id, energy, no_sessions).
# Slicing from the right instead of using [:, 1:] makes this cell idempotent:
# running it a second time is a no-op rather than silently stripping a real column.
JPL_train = JPL_train[:, -6:]
JPL_test = JPL_test[:, -6:]

In [None]:
# Inspect the training array after the row-number column was dropped.
JPL_train

In [None]:
import math

# Distinct user ids (column index 3)
unique_ids = np.unique(JPL_train[:, 3])

# For every user, read the last column of that user's first row
# (the per-user session count stored alongside each record)
users_charg_sessions = {
    uid: JPL_train[JPL_train[:, 3] == uid][0, -1]
    for uid in unique_ids
}

def find_largest_perfect_square(n):
    """Return the largest perfect square that is less than or equal to ``n``.

    Used to trim a number of charging sessions down to a count that fits a
    square matrix.

    Parameters
    ----------
    n : int or float
        Non-negative number of sessions. Non-integer input is truncated to
        its integer part before the square root is taken.

    Returns
    -------
    int
        ``k * k`` for the largest integer ``k`` with ``k * k <= n``.

    Raises
    ------
    ValueError
        If ``n`` is negative.
    """
    if n < 0:
        raise ValueError("n must be non-negative")
    # math.isqrt is exact for arbitrarily large integers, whereas
    # floor(sqrt(n)) can round *up* once n exceeds float precision and
    # produce a "perfect square" larger than n.
    root = math.isqrt(int(n))
    return root * root

# 'n' for each user. The perfect-square truncation below is currently disabled,
# so n_values is just another name for the raw per-user session counts.
#n_values = {user: find_largest_perfect_square(sessions) for user, sessions in users_charg_sessions.items()}
n_values=users_charg_sessions


In [None]:
# Users seen in each split (user id lives in column index 3)
users_from_training = set(JPL_train[:, 3])
users_from_testing = set(JPL_test[:, 3])

# Test rows are kept only when their user also appears in the training split ...
mask = np.isin(JPL_test[:, 3], list(users_from_training))
filtered_JPL_test = JPL_test[mask]

# ... and training rows only when their user also appears in the test split
mask = np.isin(JPL_train[:, 3], list(users_from_testing))
filtered_JPL_train = JPL_train[mask]

In [None]:
# Wrap both filtered arrays in DataFrames that share one column schema
df_train, df_test = (
    pd.DataFrame(
        rows,
        columns=['arrival_time', 'departure_time', 'duration', 'user_id', 'energy', 'no_sessions'],
    )
    for rows in (filtered_JPL_train, filtered_JPL_test)
)

# Order the rows by user_id first, then by arrival_time within each user
sorted_JPL_train = df_train.sort_values(['user_id', 'arrival_time'])
sorted_JPL_test = df_test.sort_values(['user_id', 'arrival_time'])

sorted_JPL_train

In [None]:
# Number of training rows per user; the smallest count is the common series length
user_counts_train = sorted_JPL_train['user_id'].value_counts()
min_sessions_train = int(user_counts_train.min())

In [None]:
# Keep only the last `min_sessions_train` rows of every user so that all training
# series end up with the same length. GroupBy.tail returns the selected rows in
# their existing (user_id, arrival_time) order with every column intact; unlike
# groupby(...).apply(lambda x: x.tail(n)) it does not depend on whether apply
# hands the grouping column to the function, so 'user_id' is always still present
# for the column selection below.
filtered_sessions = sorted_JPL_train.groupby('user_id').tail(min_sessions_train)

# Renumber rows from 0 and keep just the two columns used downstream
last_n_rows_per_user = filtered_sessions.reset_index(drop=True)
last_n_rows_per_user = last_n_rows_per_user[['arrival_time', 'user_id']]

In [None]:
# Renumber the test rows from 0 and keep just the two columns used downstream
sorted_JPL_test = sorted_JPL_test.reset_index(drop=True)[['arrival_time', 'user_id']]

In [None]:
def convert_to_hours(time_str):
    """Convert a ``'<date> HH:MM:SS'`` timestamp string to the hour of the day.

    Only the second whitespace-separated token (the clock time) is used; the
    date part is ignored.

    Parameters
    ----------
    time_str : str
        Timestamp whose second token has the form ``HH:MM:SS``. The seconds
        field may carry a fractional part (e.g. ``04.500000``).

    Returns
    -------
    float
        Hours since midnight as a decimal number, rounded to 2 decimal places.

    Raises
    ------
    IndexError
        If the string has no second token or fewer than three ``:`` fields.
    ValueError
        If a field is not numeric.
    """
    time_parts = time_str.split()[1].split(':')  # keep only the clock part
    # Seconds are parsed as float so timestamps with sub-second precision
    # do not crash; whole seconds give exactly the same result as before.
    hours = int(time_parts[0]) + int(time_parts[1]) / 60 + float(time_parts[2]) / 3600
    return round(hours, 2)


# Swap the timestamp column for its hour-of-day equivalent
last_n_rows_per_user = (
    last_n_rows_per_user
    .assign(arrival_time_hours=last_n_rows_per_user['arrival_time'].apply(convert_to_hours))
    .drop(columns=['arrival_time'])
)
last_n_rows_per_user


In [None]:
# Swap the timestamp column for its hour-of-day equivalent (test split)
sorted_JPL_test = (
    sorted_JPL_test
    .assign(arrival_time_hours=sorted_JPL_test['arrival_time'].apply(convert_to_hours))
    .drop(columns=['arrival_time'])
)
sorted_JPL_test

In [None]:
# Collect each user's arrival hours into one Python list (Series indexed by user_id).
arrival_time_per_user = last_n_rows_per_user.groupby('user_id')['arrival_time_hours'].apply(list)
arrival_time_per_user

In [None]:
# Same per-user grouping for the test split; list lengths may differ between users here.
arrival_time_per_user_test = sorted_JPL_test.groupby('user_id')['arrival_time_hours'].apply(list)
arrival_time_per_user_test

In [None]:
# Every training user must contribute a series of identical length
if len(set(map(len, arrival_time_per_user))) != 1:
    raise ValueError("All lists must be of the same length in train dataset")

# One NumPy vector per user, for both splits
array_list = list(map(np.array, arrival_time_per_user))
array_list_test = list(map(np.array, arrival_time_per_user_test))

# The longest test series sets the padded length
max_length = max(map(len, array_list_test))

# Right-pad the shorter test series with zeros up to max_length
padded_array_list_test = [
    np.pad(series, (0, max_length - len(series)), 'constant')
    for series in array_list_test
]

# Users become columns; the test rows are stacked underneath the training rows
matrix_train = np.column_stack(array_list)
matrix_test = np.column_stack(padded_array_list_test)
matrix = np.concatenate((matrix_train, matrix_test))
data = np.array(matrix)
print(data)

# Implement SAMoSSA

In [None]:
# T = number of training rows, N = number of columns (one per user) in `data`
T, N = len(matrix_train), data.shape[1]

In [None]:
# L is T / 1.01 truncated to an integer, i.e. just under the number of training rows.
# NOTE(review): the 1.01 divisor is a magic number, and L is presumably the
# page-matrix/window parameter SAMoSSA expects — confirm against the library.
L = int(T/1.01)
model = SAMoSSA(N, L, )

In [None]:
# Fit the model on the training rows only. The slice is taken from the front using
# the training length (the same quantity T is derived from) rather than
# `:-len(matrix_test)`, which would select nothing if the test block were empty.
model.fit(data[:len(matrix_train), :])

In [None]:
# Request len(matrix_test) values from the fitted model.
# NOTE(review): assumes predict's argument is the forecast horizon and that the
# result has one column per user — confirm against the SAMoSSA API.
predictions = model.predict(len(matrix_test))
predictions

In [None]:
def calculate_smape(true_values, predicted_values):
    """Calculate SMAPE (in percent) between true and predicted values.

    The score is ``100 * mean(|F - A| / (|A| + |F|))`` and therefore lies in
    ``[0, 100]``. Positions where the true value is exactly 0 are excluded
    from the mean.

    Parameters
    ----------
    true_values : array-like of float
        Observed values ``A``.
    predicted_values : array-like of float
        Forecast values ``F``; must have the same shape as ``true_values``.

    Returns
    -------
    float
        The SMAPE in percent, or ``np.nan`` when no non-zero true value remains.
    """
    true_values = np.asarray(true_values, dtype=float)
    predicted_values = np.asarray(predicted_values, dtype=float)

    mask = true_values != 0  # drop the positions where the true value is 0
    true_values = true_values[mask]
    predicted_values = predicted_values[mask]

    if true_values.size == 0:  # nothing left to average
        return np.nan

    # Sum of magnitudes, not magnitude of the sum: |A + F| collapses towards 0
    # when a forecast is negative, which yields inf or scores above 100.
    # Because every remaining |A| is non-zero, this denominator is never 0.
    denominator = np.abs(true_values) + np.abs(predicted_values)
    diff = np.abs(predicted_values - true_values) / denominator
    return 100 * np.mean(diff)

# Score every column of `predictions` against the matching column of `matrix_test`.
# Columns for which calculate_smape returns NaN (no usable true value) are skipped.
smape_list = []

for i in range(predictions.shape[1]):
    true_values = matrix_test[:, i]
    predicted_values = predictions[:, i]
    smape = calculate_smape(true_values, predicted_values)

    if not np.isnan(smape):
        smape_list.append(smape)
        print(f"SMAPE for column {i}: {smape}")

# Guard the average: if every column was skipped there is nothing to divide by,
# so report NaN instead of raising ZeroDivisionError.
smape_average = sum(smape_list) / len(smape_list) if smape_list else np.nan
print(f"Average SMAPE across all users: {smape_average}")
