In [None]:
from google.colab import drive
# Mount Google Drive at /content/drive so the CSV paths used below resolve (Colab only).
drive.mount('/content/drive')

In [None]:
import numpy as np
import pandas as pd

# Define the file paths
# NOTE(review): the folder name 'SuperUROP /' contains a trailing space before the slash;
# presumably this matches the real Drive folder name - confirm.

file_path3 = '/content/drive/MyDrive/SuperUROP /Data Analysis/JPL_training_data.csv'
file_path4  = '/content/drive/MyDrive/SuperUROP /Data Analysis/JPL_testing_data.csv'
# Use pandas to read the CSV files and then convert them to NumPy arrays
# (.values drops the column labels, so later cells address columns by position).
JPL_train = pd.read_csv(file_path3).values
JPL_test=pd.read_csv(file_path4).values

#Training phase
-Create blue matrix \\
-SVD decomposition -> find orange matrix (non-stationary part) \\
-Subtract orange from blue -> get green matrix -> stationary part/residual \\
-Calculate beta_hat (will be used in the testing phase for out-of-sample forecasting of the "SAMoSSA" part)

In [None]:
#Remove row number (in 1st column)
# NOTE: not idempotent - every re-run of this cell drops one more leading column.
JPL_train=JPL_train[:,1:]
JPL_test=JPL_test[:,1:]

In [None]:
import pandas as pd

def process_dataframe(df):
    """Reduce a raw session table to 'Date', 'Time' and 'UserID' columns.

    Parameters
    ----------
    df : array-like
        Anything ``pd.DataFrame`` accepts that ends up with integer column
        labels; column 0 must hold ``"<date> <time>"`` strings and column 3
        the user identifier.

    Returns
    -------
    pandas.DataFrame
        Columns ``['Date', 'Time', 'UserID']``; date and time are the two
        space-separated halves of column 0, still as strings.
    """
    frame = pd.DataFrame(df)

    # Keep the timestamp (position 0) and the user id (position 3).
    stamp_and_user = frame.iloc[:, [0, 3]]

    # Split "date time" on the space into two string columns.
    date_time = stamp_and_user[0].str.split(' ', expand=True)
    date_time.columns = ['Date', 'Time']

    # Re-attach the user id column (still labelled 3) and give it a readable name.
    user_column = stamp_and_user.iloc[:, 1]
    return pd.concat([date_time, user_column], axis=1).rename(columns={3: 'UserID'})

# Usage example
# NOTE: the raw arrays are overwritten by the processed frames, so this cell cannot be
# re-run on its own: the processed frame has only three columns and the positional
# selection of column 3 inside process_dataframe would then fail.
JPL_train = process_dataframe(JPL_train)
JPL_test = process_dataframe(JPL_test)

In [None]:
def convert_time_to_decimal(time_str):
    """Convert an ``"HH:MM:SS"`` string into fractional hours.

    The string must contain exactly three colon-separated integer fields;
    anything else raises ``ValueError``.
    """
    fields = [int(field) for field in time_str.split(':')]
    hh, mm, ss = fields
    return hh + mm / 60 + ss / 3600

# Applying the conversion to the 'Time' column
# NOTE: overwrites 'Time' in place (string -> float hours); a second run of this cell
# fails because floats have no .split().
JPL_train['Time'] = JPL_train['Time'].apply(convert_time_to_decimal)
JPL_test['Time'] = JPL_test['Time'].apply(convert_time_to_decimal)

In [None]:
import pandas as pd

# Parse 'Date' to datetimes and order every session by user, then day, then time of day.
JPL_train = pd.DataFrame(JPL_train, columns=['Date', 'Time', 'UserID'])
JPL_train['Date'] = pd.to_datetime(JPL_train['Date'])
sorted_JPL_train = JPL_train.sort_values(by=['UserID', 'Date', 'Time'])


# Same treatment for the test set.
JPL_test = pd.DataFrame(JPL_test, columns=['Date', 'Time', 'UserID'])
JPL_test['Date'] = pd.to_datetime(JPL_test['Date'])
sorted_JPL_test = JPL_test.sort_values(by=['UserID', 'Date', 'Time'])

In [None]:
#Keep only one session (earliest) per day for each user
# 'Date' was already parsed in the previous cell; this conversion is a no-op safeguard.
sorted_JPL_train['Date'] = pd.to_datetime(sorted_JPL_train['Date'])

# Group by 'UserID' and 'Date', then find the index label of the earliest 'Time' for each group
idx = sorted_JPL_train.groupby(['UserID', 'Date'])['Time'].idxmin()

# Keep only those rows; the result is ordered by the group keys (UserID, Date)
sorted_JPL_train = sorted_JPL_train.loc[idx]

In [None]:
#Keep only one session (earliest) per day for each user
# Same de-duplication as applied to the training set, here on the test set.
sorted_JPL_test['Date'] = pd.to_datetime(sorted_JPL_test['Date'])

# Group by 'UserID' and 'Date', then find the index label of the earliest 'Time' for each group
idx = sorted_JPL_test.groupby(['UserID', 'Date'])['Time'].idxmin()

# Keep only those rows; the result is ordered by the group keys (UserID, Date)
sorted_JPL_test = sorted_JPL_test.loc[idx]

In [None]:
# Find the UserIDs that occur in both the training and the test set
common_user_ids = set(sorted_JPL_train['UserID']).intersection(set(sorted_JPL_test['UserID']))
# Filter both datasets to include only those common UserIDs
sorted_JPL_train = sorted_JPL_train[sorted_JPL_train['UserID'].isin(common_user_ids)]
sorted_JPL_test = sorted_JPL_test[sorted_JPL_test['UserID'].isin(common_user_ids)]

In [None]:
# Display the filtered training sessions (one row per user per day).
sorted_JPL_train

In [None]:
# Number of training sessions per user, and the smallest such count.
# NOTE(review): this minimum is only displayed; nothing below enforces that every user
# has at least the 19 sessions the page matrix expects - confirm against the output.
user_counts_train = sorted_JPL_train['UserID'].value_counts()
min_sessions_train=min(user_counts_train)
min_sessions_train

In [None]:
# Selecting only the last 19 rows for each user ID (users with fewer rows keep all of them)

last_19_sessions = sorted_JPL_train.groupby('UserID').apply(lambda x: x.tail(19))

# Drop the (UserID, original index) MultiIndex produced by groupby.apply
last_19_sessions = last_19_sessions.reset_index(drop=True)
# Only the time of day and the user are needed from here on
last_19_sessions=last_19_sessions[['Time', 'UserID']]

In [None]:
# Display the per-user tail of the training sessions.
last_19_sessions

In [None]:
# One list of arrival times (decimal hours) per user, in the row order of last_19_sessions.
arrival_times_per_user = last_19_sessions.groupby('UserID')['Time'].apply(list)
arrival_times_per_user

In [None]:
def list_to_matrix_columnwise_corrected(lst, rows=19, columns=1):
    """Lay the values of ``lst`` out column by column in a ``rows x columns`` matrix.

    Value ``k`` goes to row ``k % rows`` of column ``k // rows``. Values that
    would land beyond the last column are ignored; cells that receive no value
    stay 0.0.

    Returns
    -------
    numpy.ndarray
        Float matrix of shape ``(rows, columns)``.
    """
    grid = np.zeros((rows, columns))

    for position, value in enumerate(lst):
        col, row = divmod(position, rows)
        if col >= columns:
            # Past the last column: nothing to store for this value.
            continue
        grid[row, col] = value

    return grid

# One 19x1 column per user (function defaults). A user with fewer than 19 values is
# zero-padded at the bottom; values beyond the 19th are dropped.
page_matrices = {user_id: list_to_matrix_columnwise_corrected(times) for user_id, times in arrival_times_per_user.items()}


In [None]:
# Per-user columns, in the key order of page_matrices.
matrices = list(page_matrices.values())

# Stacking the matrices horizontally: rows are time steps, one column per user
stacked_page_matrix = np.hstack(matrices)
stacked_page_matrix.shape ##blue matrix created

In [None]:
!pip install git+https://github.com/ShunChi100/RobustPCA


In [None]:
!pip install fbpca

In [None]:
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA

# Standardize the data (mean=0, variance=1)
scaler = StandardScaler()
data_standardized = scaler.fit_transform(stacked_page_matrix)  # no transpose is applied: each column (one user) is scaled over its rows (time steps)


In [None]:
from RobustPCA.rpca import RobustPCA

# Robust PCA on the standardized page matrix.
# Presumably splits the input into a low-rank part L and a sparse part S - see the
# RobustPCA package documentation for the exact model.
rpca = RobustPCA(max_iter=2100)

rpca.fit(data_standardized)
L = rpca.get_low_rank()
S = rpca.get_sparse()


In [None]:
# Undo the per-column standardization so the low-rank part is back in decimal hours.
L_inverse_scaled = scaler.inverse_transform(L)

##SVD decomposition
-Use a hard threshold of k=5 singular values (same as the one used in the paper)

In [None]:
from sklearn.decomposition import PCA
import numpy as np

# The matrix decomposed here is L, the low-rank part returned by Robust PCA
# (still in standardized units), not stacked_page_matrix itself.

# Step 1: PCA automatically centers the data, so you don't need to manually subtract the mean

# Step 2: Perform PCA with the desired number of components
num_principal_components = 5  # This is akin to your num_singular_values
pca = PCA(n_components=num_principal_components)

# Fit PCA to the data and transform the data onto the principal components
pca.fit(L)
transformed_data = pca.transform(L)

# Step 3: Reconstruct the data from the principal components
non_stationary_component = pca.inverse_transform(transformed_data)

# NOTE: this PCA-based 'non_stationary_component' is reassigned by the SVD cell further
# down, so the value computed here is not the one used for beta_hat or the residual.


In [None]:
# Fit a full PCA on the low-rank matrix L to see how much variance each additional
# component explains. NOTE: this rebinds `pca`, replacing the 5-component model above.
pca = PCA().fit(L)

#% matplotlib inline
import matplotlib.pyplot as plt
plt.rcParams["figure.figsize"] = (12,6)

# Cut-off drawn on the plot; the text label is derived from it so the two cannot disagree.
variance_threshold = 0.99

fig, ax = plt.subplots()
y = np.cumsum(pca.explained_variance_ratio_)
# One x position per fitted component, instead of assuming exactly 19 of them.
xi = np.arange(1, len(y) + 1, step=1)

plt.ylim(0.0,1.1)
plt.plot(xi, y, marker='o', linestyle='--', color='b')

plt.xlabel('Number of Components')
plt.xticks(np.arange(0, len(y) + 1, step=1))
# The plotted values are ratios in [0, 1], not percentages.
plt.ylabel('Cumulative explained variance (fraction)')
plt.title('The number of components needed to explain variance')

plt.axhline(y=variance_threshold, color='r', linestyle='-')
plt.text(0.5, 0.85, f'{variance_threshold:.0%} cut-off threshold', color = 'red', fontsize=16)

ax.grid(axis='x')
plt.show()

In [None]:
# Cumulative explained-variance ratios of the PCA currently bound to `pca`.
np.cumsum(pca.explained_variance_ratio_)

In [None]:
from scipy.linalg import svd

# The singular values are in the vector 's'
# U and VT are the left and right singular vectors, respectively
# (scipy's default full_matrices=True, so U and VT are square).
U, s, VT = svd(L_inverse_scaled)

In [None]:
# Shape check: (time steps, users).
L_inverse_scaled.shape

In [None]:
# Hard-threshold the SVD: keep only the largest `num_singular_values` singular values.
# `s` must still hold the singular values returned by svd(L_inverse_scaled); re-run the
# SVD cell first if `s` has been reassigned since.
num_singular_values = 5
rank_kept = min(num_singular_values, len(s))

# Rectangular "Sigma" with the kept values on the leading diagonal and zeros elsewhere.
# np.fill_diagonal is deliberately not used with the short vector s[:k]: it repeats an
# array-like value until the whole diagonal is filled, which would leave the matrix
# full-rank with recycled singular values instead of truncating it.
s_reduced = np.zeros(L_inverse_scaled.shape)
diag_positions = np.arange(rank_kept)
s_reduced[diag_positions, diag_positions] = s[:rank_kept]

non_stationary_component = U @ s_reduced @ VT ##orange matrix

In [None]:
# Shape check of the orange matrix; it should match the blue matrix.
non_stationary_component.shape

In [None]:
# Regress the last row of the blue (observed) matrix on the first L-1 rows of the
# orange (low-rank) matrix; beta_hat holds one coefficient per lagged row.
F_hat_mat = non_stationary_component[:-1, :] #L-1 rows of orange matrix (18 when L = 19)
y_vector = stacked_page_matrix[-1, :] #last row of blue matrix

import numpy as np

Y = y_vector.reshape(-1, 1)  # Reshape Y to be a column vector if it's a 1D array

# Solve for beta_hat using the least squares method.
# Only the solution is kept: unpacking all four lstsq outputs would rebind `s` (the SVD
# singular values the truncation cell reads), so re-running that cell afterwards would
# silently use the wrong values.
beta_hat = np.linalg.lstsq(F_hat_mat.T, Y, rcond=None)[0]

# beta_hat has shape (L-1, 1)
beta_hat

In [None]:
# Pad beta_hat from 18 to 19 rows by repeating its last coefficient, so that it can be
# multiplied with the 19-row orange matrix.
# NOTE(review): nothing visible here justifies duplicating the last coefficient - confirm
# this padding is intended.
array=beta_hat
desired_length = 19
last_element = array[-1, :]
while len(array) < desired_length:
    array = np.vstack([array, last_element])

In [None]:
# Green matrix: stationary residual = observed (blue) minus low-rank (orange), entry by entry.
# The earlier expression `array.T @ non_stationary_component` is a single 1 x n_users row,
# so it was broadcast and subtracted from every time step - a per-user constant that the
# per-user mean/std normalisation further down cancels out again.
assert stacked_page_matrix.shape == non_stationary_component.shape, "blue and orange matrices must have the same shape"
residual = stacked_page_matrix - non_stationary_component ##green matrix


##Train LSTM with residual (one LSTM model per user)

In [None]:
# Map every user to their column of the residual matrix. Column i belongs to the i-th key
# of arrival_times_per_user, the same order in which the page matrix was stacked.
userIDs = list(arrival_times_per_user.keys())
residuals_dict = {userID: residual[:, i] for i, userID in enumerate(userIDs)}


In [None]:
# Display the per-user residual series.
residuals_dict

In [None]:
# Per-user z-score normalisation of the training residuals. The mean and std are kept so
# that test residuals can be scaled identically and LSTM outputs mapped back.
normalized_residuals_dict = {}
means_dict = {}
stds_dict = {}

for key, values in residuals_dict.items():
    # Calculate mean and standard deviation
    mean = values.mean()
    std = values.std()

    # A constant series has std == 0; dividing by it would fill the series with NaN.
    # Fall back to a unit scale so such a series is only centred. The same value is stored,
    # which keeps later normalisation and de-normalisation consistent.
    if std == 0:
        std = 1.0

    # Normalize values
    normalized_values = (values - mean) / std

    # Store normalized values and statistics
    normalized_residuals_dict[key] = normalized_values
    means_dict[key] = mean
    stds_dict[key] = std

In [None]:
from keras.models import Sequential
from keras.layers import LSTM, Dense, Dropout
import numpy as np
from keras import backend as K

def smape_loss(y_true, y_pred):
    """Keras loss: 100 * mean(|y_pred - y_true| / (|y_true| + |y_pred|)) over the last axis.

    The denominator is floored at ``K.epsilon()`` to avoid division by zero.
    """
    abs_error = K.abs(y_pred - y_true)
    scale = K.maximum(K.abs(y_true) + K.abs(y_pred), K.epsilon())
    return 100 * K.mean(abs_error / scale, axis=-1)

def create_lstm_model(input_shape):
    """Build and compile the per-user residual forecaster.

    Architecture: LSTM(32, returning sequences) -> Dropout(0.2) -> LSTM(16)
    -> Dropout(0.2) -> Dense(1), compiled with Adam and ``smape_loss``.

    Parameters
    ----------
    input_shape : tuple
        ``(time steps, features)`` of one input sample.
    """
    model = Sequential()
    stack = [
        LSTM(units=32, return_sequences=True, input_shape=input_shape),  # first recurrent layer, 32 units
        Dropout(0.2),
        LSTM(units=16),  # second recurrent layer, 16 units
        Dropout(0.2),
        Dense(units=1),  # single-value forecast
    ]
    for layer in stack:
        model.add(layer)
    model.compile(optimizer='adam', loss=smape_loss)
    return model

def prepare_data(residuals, n_steps):
    """Turn a series into supervised pairs for one-step-ahead forecasting.

    Each input is a window of ``n_steps`` consecutive values and its target is
    the value immediately after that window.

    Returns
    -------
    (numpy.ndarray, numpy.ndarray)
        Windows and targets; both are empty arrays when the series is too
        short to yield a single pair.
    """
    n_obs = len(residuals)
    # Number of windows that still have a following value to predict.
    n_windows = min(n_obs, max(n_obs - n_steps, 0))

    windows = [residuals[start:start + n_steps] for start in range(n_windows)]
    targets = [residuals[start + n_steps] for start in range(n_windows)]
    return np.array(windows), np.array(targets)

# Train one LSTM per user on that user's normalized training residuals.
# NOTE(review): no random seed is set anywhere visible, so the fitted weights (and every
# metric below) presumably change from run to run - confirm.
# NOTE(review): the targets are z-scored and therefore centred on zero, where the
# |y_true| + |y_pred| denominator of smape_loss is small - confirm this loss is intended.
user_ids = normalized_residuals_dict.keys()
n_steps = 1  # Number of time steps for LSTM. Adjust as needed.

# user_id -> fitted Keras model; users without enough data get no entry
lstm_models = {}

for user_id in user_ids:
    residuals = normalized_residuals_dict[user_id]

    # Check if the user has sufficient data
    if len(residuals) > n_steps:
        X, y = prepare_data(residuals, n_steps)
        if X.size > 0 and y.size > 0:
            # Keras LSTM layers expect (samples, time steps, features)
            X = X.reshape((X.shape[0], X.shape[1], 1))
            lstm_model = create_lstm_model((X.shape[1], 1))
            lstm_model.fit(X, y, epochs=100, batch_size=1)
            lstm_models[user_id] = lstm_model
        else:
            print(f"Insufficient data for user {user_id}")
    else:
        print(f"Not enough data points for user {user_id} for n_steps = {n_steps}")


#Inference phase

##Estimate beta_hat (parameters of non-stationary component)

In [None]:
# Re-estimate beta_hat exactly as in the training phase: regress the last row of the blue
# (observed) matrix on the first L-1 rows of the orange (low-rank) matrix.
F_hat_mat = non_stationary_component[:-1, :] #L-1 rows of orange matrix (18 when L = 19)
y_vector = stacked_page_matrix[-1, :] #last row of blue matrix

import numpy as np

Y = y_vector.reshape(-1, 1)  # Reshape Y to be a column vector if it's a 1D array

# Solve for beta_hat using the least squares method.
# Only the solution is kept: unpacking all four lstsq outputs would rebind `s` (the SVD
# singular values) and `residuals`, both of which other cells use for different things.
beta_hat = np.linalg.lstsq(F_hat_mat.T, Y, rcond=None)[0]

# beta_hat has shape (L-1, 1)
beta_hat

In [None]:
# Display the filtered test sessions (one row per user per day).
sorted_JPL_test

##Prediction of non-stationary part (using SAMoSSA method)
-Use method outlined in part V of figure (to predict value at time interval t use previous L points)

In [None]:
import pandas as pd

# Keep the LAST 18 training sessions of every user (tail, not head): they form the
# initial look-back window for forecasting the first test value.
# The per-user pieces are collected first and concatenated once, instead of growing a
# DataFrame inside the loop (which re-copies every row on each iteration).
per_user_tails = [group.tail(18) for _, group in last_19_sessions.groupby('UserID')]

# pd.concat rejects an empty list, so fall back to an empty frame when there are no users.
last_18_sessions_per_user = (
    pd.concat(per_user_tails, ignore_index=True) if per_user_tails else pd.DataFrame()
)
last_18_sessions_per_user

In [None]:
import pandas as pd

# Initialize an empty dictionary to store time values for each user
user_times = {}

# Process the train dataset: the training tail comes first in each user's list
for index, row in last_18_sessions_per_user.iterrows():
    user_id = row['UserID']
    time = row['Time']
    if user_id not in user_times:
        user_times[user_id] = []
    user_times[user_id].append(time)

# Process the test dataset: test values are appended after the training tail
for index, row in sorted_JPL_test.iterrows():
    user_id = row['UserID']
    time = row['Time']
    if user_id not in user_times:
        user_times[user_id] = []
    user_times[user_id].append(time)
# Result: for each user, the last (up to) 18 training values followed by all of that
# user's test values, in row order.

In [None]:
import numpy as np

# One-step-ahead forecasts of the non-stationary part: each forecast is the dot product of
# the previous `window_size` observed values with beta_hat.
# The window length is read from beta_hat (one coefficient per lag) rather than hard-coded,
# so it stays consistent if the page-matrix height changes.
window_size = beta_hat.shape[0]

# Initialize a dictionary to store predictions for each user
user_predictions_non_stationary = {}

for user_id, times in user_times.items():
    # We can only make a prediction if there is at least one full window
    if len(times) >= window_size:
        predictions = []
        # Slide the window; the forecast made from times[i:i+window_size] targets
        # times[i+window_size].
        # NOTE(review): forecasts line up with the test values one-to-one only when the
        # user contributed a full window of training values - confirm for every user.
        for i in range(len(times) - window_size):
            window = np.array(times[i:i + window_size])
            prediction = np.dot(window, beta_hat).item()
            predictions.append(prediction)
        user_predictions_non_stationary[user_id] = predictions

# user_predictions_non_stationary now contains the predicted values for each user


In [None]:
# Group by 'UserID' and collect each user's observed test 'Time' values into a list
user_actual_time = sorted_JPL_test.groupby('UserID')['Time'].apply(list).to_dict()

In [None]:
# Display the observed test values per user.
user_actual_time

In [None]:
# Test residual per user: observed value minus non-stationary forecast, paired by position.
# zip() stops at the shorter list, so a length mismatch between the two is silently
# truncated rather than reported.
residual_test = {key: [a - b for a, b in zip(user_actual_time[key], user_predictions_non_stationary[key])]
                 for key in user_actual_time
                 if key in user_predictions_non_stationary}


In [None]:
# Display the per-user test residuals.
residual_test

In [None]:
# Test residuals scaled with the TRAINING mean/std of each user, so they are on the same
# scale the LSTMs were fitted on.
adjusted_residual_test = {}

for key, values in residual_test.items():
    # Retrieve the mean and standard deviation for the current key
    mean = means_dict[key]
    std = stds_dict[key]

    # Adjust the values by subtracting the mean and dividing by the standard deviation.
    # `values` is a plain Python list; the subtraction works only because `mean` is a NumPy
    # scalar, which turns the list into an array.
    adjusted_values = (values - mean) / std

    # Store the adjusted values
    adjusted_residual_test[key] = adjusted_values

##Make predictions using LSTM on stationary/residual part

In [None]:
import pandas as pd
import numpy as np

def prepare_lstm_input(residuals, n_steps):
    """Return every length-``n_steps`` sliding window of ``residuals`` as an array.

    Includes the final window that ends on the last element. The result is an
    empty array when the series is shorter than ``n_steps``.
    """
    n_windows = len(residuals) - n_steps + 1
    return np.array([residuals[start:start + n_steps] for start in range(n_windows)])

def reverse_normalize_data(normalized_data, mean, std):
    """Undo a z-score normalisation: scale by ``std``, then shift by ``mean``."""
    rescaled = normalized_data * std
    return rescaled + mean

# Hybrid forecast per user: non-stationary forecast plus the de-normalized LSTM forecast of
# the residual. Users without an LSTM, or with too few residuals for one window, fall back
# to the non-stationary forecast alone.
#
# NOTE(review): window k is built from the test residual at step k, which already contains
# the observed value at step k, and its output is added to the non-stationary forecast of
# that same step - whereas the LSTM was trained to predict the value AFTER its window.
# This looks like a one-step misalignment / look-ahead; confirm the intended alignment.

# One frame per user, concatenated once at the end (DataFrame.append no longer exists in
# pandas >= 2.0).
prediction_frames = []

# Iterate over each user and their normalized test residuals
for user_id, residuals in adjusted_residual_test.items():
    non_stationary_pred = user_predictions_non_stationary[user_id]

    # Fallback: a residual is not a forecast, so when no LSTM forecast can be made the
    # non-stationary forecast is used as is.
    combined_pred = non_stationary_pred

    if user_id in lstm_models and len(residuals) >= n_steps:
        mean = means_dict[user_id]
        std = stds_dict[user_id]

        # Prepare LSTM input from residuals: (samples, time steps, features)
        lstm_input = prepare_lstm_input(residuals, n_steps)
        lstm_input = lstm_input.reshape((-1, n_steps, 1))

        # Make prediction with LSTM
        lstm_pred_normalized = lstm_models[user_id].predict(lstm_input)

        # Reverse normalization on LSTM predictions to bring them back to original scale
        lstm_pred = reverse_normalize_data(lstm_pred_normalized, mean, std)

        # Element-wise sum with the last len(lstm_pred) non-stationary forecasts. The list
        # is converted explicitly so that `+` can never mean list concatenation.
        combined_pred = np.asarray(non_stationary_pred[-len(lstm_pred):]) + lstm_pred.flatten()

    # Collect this user's predictions
    prediction_frames.append(pd.DataFrame({
        'UserID': user_id,
        'Final_Prediction': combined_pred
    }))

# pd.concat rejects an empty list, so keep the empty two-column frame in that case.
final_predictions_df = (
    pd.concat(prediction_frames, ignore_index=True)
    if prediction_frames
    else pd.DataFrame(columns=['UserID', 'Final_Prediction'])
)

# final_predictions_df now contains combined predictions for each user


In [None]:
import pandas as pd
import numpy as np

def prepare_lstm_input(residuals, n_steps):
    """Collect all sliding windows of length ``n_steps`` over ``residuals``.

    The last window ends on the final element. A series shorter than
    ``n_steps`` yields an empty array.
    """
    last_start = len(residuals) - n_steps
    windows = []
    start = 0
    while start <= last_start:
        windows.append(residuals[start:start + n_steps])
        start += 1
    return np.array(windows)

# Variant of the hybrid forecast that feeds the RAW test residuals to the LSTMs and adds
# their outputs without de-normalizing. It overwrites final_predictions_df, so this is the
# version the SMAPE cells below evaluate.
#
# NOTE(review): the LSTMs were fitted on z-scored residuals, yet here they receive
# unscaled inputs and their outputs are used unscaled - confirm this is intended rather
# than the normalized variant in the previous cell.
# NOTE(review): window k is built from the test residual at step k and its output is added
# to the non-stationary forecast of that same step - confirm the intended alignment.

# One frame per user, concatenated once at the end (DataFrame.append no longer exists in
# pandas >= 2.0).
prediction_frames = []

# Iterate over each user and their residuals
for user_id, residuals in residual_test.items():
    non_stationary_pred = user_predictions_non_stationary[user_id]

    # Fallback: a residual is not a forecast, so when no LSTM forecast can be made the
    # non-stationary forecast is used as is.
    combined_pred = non_stationary_pred

    if user_id in lstm_models and len(residuals) >= n_steps:
        # Prepare LSTM input from residuals: (samples, time steps, features)
        lstm_input = prepare_lstm_input(residuals, n_steps)
        lstm_input = lstm_input.reshape((-1, n_steps, 1))

        # Make prediction with LSTM
        lstm_pred = lstm_models[user_id].predict(lstm_input)

        # Element-wise sum with the last len(lstm_pred) non-stationary forecasts. The list
        # is converted explicitly so that `+` can never mean list concatenation.
        combined_pred = np.asarray(non_stationary_pred[-len(lstm_pred):]) + lstm_pred.flatten()

    # Collect this user's predictions
    prediction_frames.append(pd.DataFrame({
        'UserID': user_id,
        'Final_Prediction': combined_pred
    }))

# pd.concat rejects an empty list, so keep the empty two-column frame in that case.
final_predictions_df = (
    pd.concat(prediction_frames, ignore_index=True)
    if prediction_frames
    else pd.DataFrame(columns=['UserID', 'Final_Prediction'])
)

# final_predictions_df now contains combined predictions for each user


#SMAPE of hybrid model (Rank Decomposition+LSTM hybrid)

In [None]:
# Work on a frame built from the de-duplicated test sessions.
df = pd.DataFrame(sorted_JPL_test)

# Convert DataFrame to dictionary with UserID as key and Time values as list
true_values = df.groupby('UserID')['Time'].apply(list).to_dict()

In [None]:
import numpy as np
import pandas as pd

def calculate_smape(actual, predicted):
    """Return 100 * mean(|actual - predicted| / (|actual| + |predicted|)).

    Positions where both values are zero contribute 0 instead of NaN. The
    inputs must support element-wise arithmetic and boolean-mask assignment
    (e.g. pandas Series or NumPy arrays).
    """
    scale = np.abs(actual) + np.abs(predicted)
    ratio = np.abs(actual - predicted) / scale
    # 0/0 happens only when both values are zero; count that as a perfect forecast.
    ratio[scale == 0] = 0.0
    return 100 * np.mean(ratio)

# Dictionary to store the SMAPE of the non-stationary forecast alone, per user
smape_values_non_stationary = {}

# Iterate over each user
for user in user_predictions_non_stationary:
    # Retrieve the predicted values for the user and convert to a Pandas Series if not already
    predicted = pd.Series(user_predictions_non_stationary[user])

    # Retrieve the true values for the user and convert to a Pandas Series.
    # Passing predicted.index requires both lists to have the same length; pandas raises
    # otherwise.
    actual = pd.Series(true_values[user],index=predicted.index)

    # Calculate SMAPE
    smape = calculate_smape(actual, predicted)

    # Store the SMAPE value
    smape_values_non_stationary[user] = smape

# smape_values_non_stationary now contains the SMAPE for each user


In [None]:
# Put predictions and observed test rows side by side, matched purely by row position.
# NOTE(review): this is correct only if final_predictions_df has exactly one row per test
# row, in the same (UserID, Date) order - confirm; nothing here checks it.
final_predictions_df = final_predictions_df.reset_index(drop=True)
sorted_JPL_test = sorted_JPL_test.reset_index(drop=True)
combined_df = pd.concat([final_predictions_df, sorted_JPL_test], axis=1)
# Both inputs carry a 'UserID' column; keep only the first occurrence of each column name.
combined_df = combined_df.loc[:, ~combined_df.columns.duplicated()]

In [None]:
import pandas as pd
# Rebinds `df` to the combined prediction / observation table.
df = pd.DataFrame(combined_df)

# Function to calculate SMAPE
def calculate_smape(df):
    """Per-user SMAPE between the 'Time' and 'Final_Prediction' columns of ``df``.

    Returns a Series indexed by 'UserID' holding
    100 * mean(|Time - Final_Prediction| / (|Time| + |Final_Prediction|)).
    """
    def _group_score(group):
        truth = group['Time']
        forecast = group['Final_Prediction']
        scale = abs(truth) + abs(forecast)
        return 100 * (abs(truth - forecast) / scale).mean()

    return df.groupby('UserID').apply(_group_score)

# Per-user SMAPE of the hybrid predictions stored in `df`.
smape_results_combined = calculate_smape(df)


In [None]:
# SMAPE1: non-stationary forecast alone; SMAPE2: hybrid (non-stationary + LSTM) forecast.
final_smape_results = pd.DataFrame({
    'SMAPE1': smape_values_non_stationary,
    'SMAPE2': smape_results_combined
})

# Select the better SMAPE for each UserID
# NOTE(review): the minimum is taken using errors measured on the test set, so
# 'Best_SMAPE' is an optimistic best-case figure, not the score of one fixed model.
final_smape_results['Best_SMAPE'] = final_smape_results.min(axis=1)

# Display the final SMAPE results
print(final_smape_results)

In [None]:
# Mean over users of the smaller of the two per-user SMAPEs. Because the better model is
# picked per user from test results, this is a best-case bound; it is labelled as such
# rather than as the hybrid model's score.
mean_best_smape = final_smape_results['Best_SMAPE'].mean()
print("Mean per-user best-of-two SMAPE, selected using test results (%):", mean_best_smape)

In [None]:
# Mean SMAPE of the hybrid model (SMAPE2) over all users.
# NOTE: this reuses the name `mean_best_smape`, although the value is no longer the
# per-user best.
mean_best_smape = final_smape_results['SMAPE2'].mean()
print("SMAPE for hybrid SAMoSSA and LSTM model (%):", mean_best_smape)