In [None]:
!pip install pytorch-lightning==2.0.2 pytorch_forecasting==1.0.0


In [None]:
import pandas as pd
import numpy as np
from pytorch_forecasting import TimeSeriesDataSet, TemporalFusionTransformer
from pytorch_forecasting.data import GroupNormalizer
from pytorch_forecasting.metrics import QuantileLoss
from torch.utils.data import DataLoader


In [None]:
# Mount Google Drive at /content/drive (Colab only); the CSV paths used by
# this notebook all live below that mount point.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
import numpy as np
import pandas as pd

# Both CSV exports sit in the same Drive folder. The folder string is
# reproduced exactly as the notebook had it, including the space after
# "SuperUROP".
data_dir = '/content/drive/MyDrive/SuperUROP /Data Analysis'
file_path1 = data_dir + '/JPL_training_data.csv'
file_path2 = data_dir + '/JPL_testing_data.csv'

# Read each file and keep only the underlying values as NumPy arrays; the
# column headers are discarded, so later cells address columns by position.
JPL_train, JPL_test = (pd.read_csv(path).values for path in (file_path1, file_path2))


In [None]:
# Drop the first column (the exported row number) from both arrays.
JPL_train, JPL_test = JPL_train[:, 1:], JPL_test[:, 1:]

In [None]:
import pandas as pd

def process_dataframe(df):
    """Reduce a raw session table to 'Date', 'Time' and 'UserID' columns.

    Args:
        df: 2-D array-like whose column 0 holds '<date> <time>' strings and
            whose column 3 holds the user identifier.

    Returns:
        A new DataFrame with three columns: 'Date' and 'Time' (the two halves
        of column 0, split on a single space, still as strings) and 'UserID'
        (column 3, unchanged).
    """
    frame = pd.DataFrame(df)

    # Only the timestamp (position 0) and the user id (position 3) are used.
    picked = frame.iloc[:, [0, 3]]

    # "<date> <time>" -> two string columns.
    date_time = picked[0].str.split(' ', expand=True)
    date_time.columns = ['Date', 'Time']

    # Put the user id next to the split timestamp; it still carries its
    # positional label 3 at this point, so rename it.
    combined = pd.concat([date_time, picked.iloc[:, 1]], axis=1)
    return combined.rename(columns={3: 'UserID'})

# Turn both raw arrays into Date / Time / UserID frames.
JPL_train, JPL_test = (process_dataframe(raw) for raw in (JPL_train, JPL_test))

In [None]:
def convert_time_to_decimal(time_str):
    """Convert a clock-time string to decimal hours.

    Accepts 'HH:MM:SS' (seconds may carry a fractional part, e.g.
    '08:30:00.5') and 'HH:MM' (seconds taken as 0).

    Args:
        time_str: Time of day as a colon-separated string.

    Returns:
        float: hours + minutes / 60 + seconds / 3600.

    Raises:
        ValueError: If the string does not have two or three colon-separated
            fields, or a field is not numeric.
    """
    parts = time_str.split(':')
    if not 2 <= len(parts) <= 3:
        raise ValueError(f"Expected 'HH:MM' or 'HH:MM:SS', got {time_str!r}")

    hours = int(parts[0])
    minutes = int(parts[1])
    # float() rather than int() so fractional seconds do not raise.
    seconds = float(parts[2]) if len(parts) == 3 else 0.0

    return hours + minutes / 60 + seconds / 3600

# Replace the clock-time strings in 'Time' with decimal hours, in both frames.
for sessions in (JPL_train, JPL_test):
    sessions['Time'] = sessions['Time'].map(convert_time_to_decimal)

In [None]:
import pandas as pd

# Fix the column order, parse 'Date' into datetimes, and produce a copy of each
# frame ordered by user, then day, then time of day.
session_columns = ['Date', 'Time', 'UserID']
sort_keys = ['UserID', 'Date', 'Time']

JPL_train = pd.DataFrame(JPL_train, columns=session_columns)
JPL_train = JPL_train.assign(Date=pd.to_datetime(JPL_train['Date']))
sorted_JPL_train = JPL_train.sort_values(by=sort_keys)

JPL_test = pd.DataFrame(JPL_test, columns=session_columns)
JPL_test = JPL_test.assign(Date=pd.to_datetime(JPL_test['Date']))
sorted_JPL_test = JPL_test.sort_values(by=sort_keys)

In [None]:
# Training set: keep a single session per user per day, namely the one with the
# smallest 'Time' value.
sorted_JPL_train['Date'] = pd.to_datetime(sorted_JPL_train['Date'])

# Row label of the earliest session inside every (UserID, Date) group ...
train_user_days = sorted_JPL_train.groupby(['UserID', 'Date'])
idx = train_user_days['Time'].idxmin()

# ... and the frame restricted to exactly those rows.
sorted_JPL_train = sorted_JPL_train.loc[idx]

In [None]:
# Show the training sessions after keeping the earliest session per user per day.
sorted_JPL_train

In [None]:
# Test set: keep a single session per user per day, namely the one with the
# smallest 'Time' value.
sorted_JPL_test['Date'] = pd.to_datetime(sorted_JPL_test['Date'])

# Row label of the earliest session inside every (UserID, Date) group ...
test_user_days = sorted_JPL_test.groupby(['UserID', 'Date'])
idx = test_user_days['Time'].idxmin()

# ... and the frame restricted to exactly those rows.
sorted_JPL_test = sorted_JPL_test.loc[idx]


In [None]:
# Show the test sessions after keeping the earliest session per user per day.
sorted_JPL_test

In [None]:
# Users that appear in both the training and the test sessions.
train_users = set(sorted_JPL_train['UserID'])
test_users = set(sorted_JPL_test['UserID'])
common_user_ids = train_users & test_users

# Restrict both frames to those shared users.
train_is_common = sorted_JPL_train['UserID'].isin(common_user_ids)
test_is_common = sorted_JPL_test['UserID'].isin(common_user_ids)
sorted_JPL_train = sorted_JPL_train[train_is_common]
sorted_JPL_test = sorted_JPL_test[test_is_common]


In [None]:
# Add the integer time index and the categorical user id used by the
# forecasting dataset.
#
# The frame at this point is a boolean-filtered slice of an earlier frame, so
# assigning columns on it in place triggers pandas' SettingWithCopyWarning.
# .assign() builds a fresh frame instead; keyword arguments are evaluated in
# order, so each lambda sees the columns produced by the ones before it.
sorted_JPL_train = sorted_JPL_train.assign(
    # 'Date' as datetime64 so date arithmetic works.
    Date=lambda d: pd.to_datetime(d['Date']),
    # Whole days elapsed since the earliest date in this frame.
    time_idx=lambda d: (d['Date'] - d['Date'].min()).dt.days,
    # String-valued category for the group / static categorical encoding.
    UserID=lambda d: d['UserID'].astype(str).astype('category'),
)

In [None]:
# Show the training frame with the added 'time_idx' column and categorical 'UserID'.
sorted_JPL_train

In [None]:
# Build a gap-free December 2018 calendar for every test user.
#
# Each user ends up with exactly one row per day of the range below. Days with
# no recorded session get Time = 0 and the user's own UserID; sessions dated
# outside the range are dropped by the reindex. The result has a continuous
# 0..N-1 row index across all users.

# Keep the session columns and add an empty 'time_idx' column (filled per user below).
df_full_continuous_index = sorted_JPL_test.reindex(columns=['Date', 'Time', 'UserID', 'time_idx'])
df_full_continuous_index['Date'] = pd.to_datetime(df_full_continuous_index['Date'])

# Every calendar day of December 2018.
date_range_continuous_index = pd.date_range(start='2018-12-01', end='2018-12-31')

# Users in order of first appearance.
user_ids_continuous_index = df_full_continuous_index['UserID'].unique()

# Collect one filled frame per user and concatenate once at the end.
# DataFrame.append was removed in pandas 2.0, and growing a frame inside a
# loop copies all previous rows on every iteration.
user_frames = []
for user_id in user_ids_continuous_index:
    user_rows = df_full_continuous_index[df_full_continuous_index['UserID'] == user_id]
    user_calendar = (
        user_rows
        .set_index('Date')
        .reindex(date_range_continuous_index)  # inserts all-NaN rows for missing days
        .rename_axis('Date')                   # so reset_index() names the column 'Date'
        .reset_index()
    )
    user_calendar['UserID'] = user_calendar['UserID'].fillna(user_id)
    user_calendar['Time'] = user_calendar['Time'].fillna(0)
    user_calendar['time_idx'] = user_calendar['Date'].dt.day
    user_frames.append(user_calendar)

if user_frames:
    filled_df_full_test = pd.concat(user_frames, ignore_index=True)
else:
    # No test users at all: keep the expected columns instead of failing in concat.
    filled_df_full_test = pd.DataFrame(columns=['Date', 'Time', 'UserID', 'time_idx'])




In [None]:
# Recompute 'time_idx' as whole days elapsed since the earliest date in the
# filled test frame, and store 'UserID' as a string-valued category.
# NOTE(review): the training frame measures time_idx from its own earliest
# date, so the two indices presumably do not share an origin — confirm this is
# intended, since time_idx is also fed to the model as a known real.
filled_df_full_test['Date'] = pd.to_datetime(filled_df_full_test['Date'])

test_start_date = filled_df_full_test['Date'].min()
filled_df_full_test['time_idx'] = (filled_df_full_test['Date'] - test_start_date).dt.days

filled_df_full_test['UserID'] = filled_df_full_test['UserID'].astype(str).astype('category')

In [None]:
# Show the gap-filled test frame (one row per user per day).
filled_df_full_test

In [None]:
# Number of rows per user in the training frame, and the largest such count.
# The count is taken with the builtin max() over a plain list so the result is
# a plain Python int.
user_counts_train = sorted_JPL_train['UserID'].value_counts()
max_sessions_train = max(user_counts_train.tolist())
max_sessions_train

In [None]:
# Number of rows per user in the (unfilled) test frame, and the largest such count.
user_counts_test = sorted_JPL_test['UserID'].value_counts()
max_sessions_test = max(user_counts_test.tolist())
max_sessions_test

In [None]:
# Smallest number of rows any single user has in the (unfilled) test frame.
user_counts_test = sorted_JPL_test['UserID'].value_counts()
min_sessions_test = min(user_counts_test.tolist())
min_sessions_test

In [None]:
from pytorch_forecasting import TimeSeriesDataSet

# Encoder window: the largest per-user row count found in the training frame.
max_encoder_length = max_sessions_train
# Decoder window: number of time steps the model is asked to forecast.
max_prediction_length = 30

# Normalise the target separately for each user, with a softplus transformation.
user_target_normalizer = GroupNormalizer(groups=["UserID"], transformation="softplus")

# One time series per user; the target is the decimal-hour 'Time' column and
# 'time_idx' (days since the first training date) is the time axis.
training = TimeSeriesDataSet(
    sorted_JPL_train,
    time_idx="time_idx",
    target="Time",
    group_ids=["UserID"],
    static_categoricals=["UserID"],
    time_varying_known_reals=["time_idx"],
    time_varying_unknown_reals=["Time"],
    min_encoder_length=1,
    max_encoder_length=max_encoder_length,
    min_prediction_length=1,
    max_prediction_length=max_prediction_length,
    target_normalizer=user_target_normalizer,
    # Users do not have a row for every day, so gaps in time_idx are allowed.
    allow_missing_timesteps=True,
    add_relative_time_idx=True,
    add_target_scales=True,
    add_encoder_length=True,
)


In [None]:
from torch.utils.data import DataLoader

# Mini-batch size; tune to the memory available on the runtime.
batch_size = 16

# Training-mode dataloader over the dataset built above, with one worker process.
train_dataloader = training.to_dataloader(
    train=True,
    batch_size=batch_size,
    num_workers=1,
)

In [None]:
import lightning.pytorch as pl

# Seed Python, NumPy and PyTorch so that repeated runs of the notebook train
# the same model.
pl.seed_everything(42, workers=True)

# 'auto' uses a GPU when the runtime has one and falls back to CPU otherwise;
# a hard-coded 'gpu' makes the cell fail on CPU-only runtimes.
trainer = pl.Trainer(
    max_epochs=50,
    accelerator='auto',
    devices=1,
    enable_model_summary=True,
    gradient_clip_val=0.1,
)

# Temporal Fusion Transformer configured from the training dataset. It is
# trained with quantile loss, so it emits one output per quantile.
tft = TemporalFusionTransformer.from_dataset(
    training,
    learning_rate=0.001,
    hidden_size=32,
    attention_head_size=4,
    hidden_continuous_size=32,
    dropout=0.1,
    output_size=7,  # there are 7 quantiles by default: [0.02, 0.1, 0.25, 0.5, 0.75, 0.9, 0.98]
    loss=QuantileLoss(),
)

# Fit on the training dataloader only (no validation dataloader is supplied).
trainer.fit(
    tft,
    train_dataloaders=train_dataloader,
)


In [None]:
# Show the gap-filled test frame that is passed to the model for prediction.
filled_df_full_test

In [None]:
# Forecast with the trained TFT on the gap-filled test frame and display the result.
# NOTE(review): given a DataFrame, predict() presumably rebuilds a dataset from
# the training parameters in predict mode, yielding one forecast window per
# user that covers the LAST max_prediction_length time steps — confirm against
# the pytorch-forecasting documentation before aligning with actual values.
predictions = tft.predict(filled_df_full_test)
predictions

In [None]:
import torch
import numpy as np

# Pair every prediction row with the user it was produced for.
#
# Zipping the rows with filled_df_full_test['UserID'].unique() assumes the
# model emits rows in order of first appearance in the frame, which the dataset
# does not promise. Asking predict() for its index returns, next to the output
# tensor, a frame holding the group id (UserID) of each output row, so the
# pairing no longer relies on ordering.
prediction_with_index = tft.predict(filled_df_full_test, return_index=True)

# Prediction tensor as a NumPy array (moved to CPU first in case it is on GPU).
numpy_data = prediction_with_index.output.cpu().numpy()

# UserID belonging to each row of numpy_data, as strings to match the
# string-valued 'UserID' category of the test frame.
user_ids = prediction_with_index.index['UserID'].astype(str).to_numpy()

# UserID -> predicted 'Time' values for that user's forecast window.
user_predicted_times = dict(zip(user_ids, numpy_data))


In [None]:
# Actual 'Time' values for the forecast window of every user.
df = pd.DataFrame(filled_df_full_test, columns=['Date', 'Time', 'UserID', 'time_idx'])

# The model forecasts the LAST max_prediction_length time steps of each series;
# the earlier steps are encoder input. Keep exactly that trailing window so the
# actual values line up with the predictions. Dropping the final day instead
# would shift every comparison by one day.
first_forecast_idx = df['time_idx'].max() - max_prediction_length + 1
filtered_df = df[df['time_idx'] >= first_forecast_idx].copy()  # copy: safe to modify below

# Make sure 'Date' is a datetime column.
filtered_df['Date'] = pd.to_datetime(filtered_df['Date'])

# UserID -> list of actual 'Time' values, in row order (chronological per user).
user_actual_times = filtered_df.groupby('UserID')['Time'].apply(list).to_dict()



In [None]:
import numpy as np

def calculate_smape_exclude_zeros(actual, forecast):
    """Symmetric MAPE (in percent) over the positions where `actual` is non-zero.

    Computed as 100 / n * sum(|forecast - actual| / (|actual| + |forecast|))
    over the n positions with a non-zero actual value. The denominator is not
    halved, so the result lies between 0 and 100.

    Args:
        actual: Sequence of observed values; zero entries are skipped.
        forecast: Sequence of predicted values, aligned with `actual`.

    Returns:
        float: The SMAPE in percent, or NaN when `actual` has no non-zero
        entry (previously this case raised ZeroDivisionError).
    """
    actual = np.array(actual)
    forecast = np.array(forecast)

    non_zero_indices = np.nonzero(actual)  # positions where actual != 0
    actual_non_zero = actual[non_zero_indices]
    forecast_non_zero = forecast[non_zero_indices]

    # Nothing to score: report "undefined" instead of dividing by zero.
    if len(actual_non_zero) == 0:
        return np.nan

    relative_errors = np.abs(forecast_non_zero - actual_non_zero) / (
        np.abs(actual_non_zero) + np.abs(forecast_non_zero)
    )
    return 100 / len(actual_non_zero) * np.sum(relative_errors)


# SMAPE for every user that has actual values, scored against that user's
# predicted values (zeros in the actual values are skipped by the metric).
smape_per_user = {
    key: calculate_smape_exclude_zeros(actual_times, user_predicted_times[key])
    for key, actual_times in user_actual_times.items()
}

# Overall SMAPE as the mean over users, ignoring users whose SMAPE is NaN.
# np.nanmean is what actually excludes NaN values; np.mean would return NaN
# as soon as a single user is NaN.
overall_smape = np.nanmean(list(smape_per_user.values()))

smape_per_user



In [None]:
# Mean SMAPE over all users; NaN entries (users without a defined SMAPE) are
# ignored rather than turning the whole average into NaN.
overall_smape = np.nanmean(list(smape_per_user.values()))
print("SMAPE for TFT model across all users (%):", overall_smape)