In [None]:
# Install the pinned PyTorch Lightning / PyTorch Forecasting builds used by this notebook.
!pip install pytorch-lightning==2.0.2 pytorch_forecasting==1.0.0
# NOTE(review): gluonts is installed without a version pin, so the version that
# gets resolved can differ between runs — consider pinning it as well.
!pip install gluonts


In [None]:
import pandas as pd
import numpy as np
from pytorch_forecasting import TimeSeriesDataSet, TemporalFusionTransformer
from pytorch_forecasting.data import GroupNormalizer
from pytorch_forecasting.metrics import QuantileLoss
from torch.utils.data import DataLoader
from gluonts.dataset.common import ListDataset

In [None]:
# Mount Google Drive at /content/drive so the CSV paths under
# /content/drive/MyDrive resolve. Requires the google.colab package.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
import numpy as np
import pandas as pd

# Locations of the JPL training / testing CSV exports on the mounted Drive.
file_path1 = '/content/drive/MyDrive/SuperUROP /Data Analysis/JPL_training_data.csv'
file_path2 = '/content/drive/MyDrive/SuperUROP /Data Analysis/JPL_testing_data.csv'

# Read each file and keep only the underlying array of values.
JPL_train, JPL_test = (
    pd.read_csv(csv_path).values for csv_path in (file_path1, file_path2)
)


In [None]:
# The first column holds the exported row number; slice it off both arrays.
JPL_train, JPL_test = JPL_train[:, 1:], JPL_test[:, 1:]

In [None]:
import pandas as pd

def process_dataframe(df):
    """Reduce raw session records to a Date / Time / UserID frame.

    Columns are selected by position, so the input may be a 2-D array (as
    produced by ``DataFrame.values``) or a DataFrame with arbitrary column
    labels.

    Parameters
    ----------
    df : array-like or pandas.DataFrame
        Raw records. Column 0 must hold "<date> <time>" strings separated by
        a single space; column 3 must hold the user identifier.

    Returns
    -------
    pandas.DataFrame
        Columns ``['Date', 'Time', 'UserID']`` on the same row index as the
        input. ``Date`` and ``Time`` are the two string halves of column 0.

    Raises
    ------
    IndexError
        If the input has fewer than four columns.
    ValueError
        If splitting column 0 on a space does not yield exactly two parts.
    """
    frame = pd.DataFrame(df)

    # Positional access keeps this independent of the column labels.
    timestamps = frame.iloc[:, 0]
    user_ids = frame.iloc[:, 3]

    # "<date> <time>" -> two string columns; assigning two names enforces
    # that the split produced exactly two parts.
    date_time = timestamps.str.split(' ', expand=True)
    date_time.columns = ['Date', 'Time']

    # Attach the user identifier under a descriptive name.
    return date_time.assign(UserID=user_ids)

# Reduce both raw arrays to Date / Time / UserID frames.
JPL_train, JPL_test = map(process_dataframe, (JPL_train, JPL_test))

In [None]:
def convert_time_to_decimal(time_str):
    """Convert a clock-time string to fractional hours.

    Accepts ``HH:MM:SS`` as well as ``HH:MM`` (seconds default to 0) and
    seconds carrying a fractional part (``HH:MM:SS.fff``).

    Parameters
    ----------
    time_str : str
        Colon-separated time of day.

    Returns
    -------
    float
        ``hours + minutes / 60 + seconds / 3600``.

    Raises
    ------
    ValueError
        If the string does not have two or three colon-separated fields, or
        a field is not numeric.
    """
    fields = time_str.strip().split(':')
    if len(fields) not in (2, 3):
        raise ValueError(f"Expected 'HH:MM' or 'HH:MM:SS', got {time_str!r}")

    hours = int(fields[0])
    minutes = int(fields[1])
    # float() accepts both whole and fractional seconds.
    seconds = float(fields[2]) if len(fields) == 3 else 0.0

    return hours + minutes / 60 + seconds / 3600

# Replace the 'Time' strings with fractional hours in both frames.
for frame in (JPL_train, JPL_test):
    frame['Time'] = frame['Time'].apply(convert_time_to_decimal)

In [None]:
# Re-importing pandas as the code execution state was reset
import pandas as pd

# Training split: fix the column set, parse the dates, then order the rows by
# user and chronologically within each user.
JPL_train = pd.DataFrame(JPL_train, columns=['Date', 'Time', 'UserID'])
JPL_train = JPL_train.assign(Date=pd.to_datetime(JPL_train['Date']))
sorted_JPL_train = JPL_train.sort_values(by=['UserID', 'Date', 'Time'])

# Testing split: identical treatment.
JPL_test = pd.DataFrame(JPL_test, columns=['Date', 'Time', 'UserID'])
JPL_test = JPL_test.assign(Date=pd.to_datetime(JPL_test['Date']))
sorted_JPL_test = JPL_test.sort_values(by=['UserID', 'Date', 'Time'])

In [None]:
# Keep a single session per user per day: the one with the smallest 'Time'.
sorted_JPL_train = sorted_JPL_train.assign(
    Date=pd.to_datetime(sorted_JPL_train['Date'])
)

# Row labels of the earliest session inside every (UserID, Date) group.
idx = sorted_JPL_train.groupby(['UserID', 'Date'])['Time'].idxmin()

# Select exactly those rows by label.
sorted_JPL_train = sorted_JPL_train.loc[idx]

In [None]:
# Inspect the training sessions after the one-session-per-user-per-day filter.
sorted_JPL_train

In [None]:
# Keep a single session per user per day: the one with the smallest 'Time'.
sorted_JPL_test = sorted_JPL_test.assign(
    Date=pd.to_datetime(sorted_JPL_test['Date'])
)

# Row labels of the earliest session inside every (UserID, Date) group.
idx = sorted_JPL_test.groupby(['UserID', 'Date'])['Time'].idxmin()

# Select exactly those rows by label.
sorted_JPL_test = sorted_JPL_test.loc[idx]


In [None]:
# Users that appear in both the training and the testing split.
common_user_ids = set(sorted_JPL_train['UserID']) & set(sorted_JPL_test['UserID'])

# Restrict each split to those shared users.
train_is_common = sorted_JPL_train['UserID'].isin(common_user_ids)
test_is_common = sorted_JPL_test['UserID'].isin(common_user_ids)
sorted_JPL_train = sorted_JPL_train[train_is_common]
sorted_JPL_test = sorted_JPL_test[test_is_common]


#Modify train dataset to be suitable for DeepAR

In [None]:
# Last day of the training window; every user's series is padded up to it.
end_date = pd.Timestamp('2018-11-30')

# One daily-resampled frame per user, collected for a single concat below.
extended_data = []

for user_id, group in sorted_JPL_train.groupby('UserID'):
    # Daily calendar from the user's first recorded session through end_date.
    user_date_range = pd.date_range(start=group['Date'].min(), end=end_date, freq='D')

    # Days without a session inherit the values of the most recent earlier
    # session. .ffill() replaces the deprecated fillna(method='ffill') form.
    group_extended = group.set_index('Date').reindex(user_date_range).ffill().reset_index()
    group_extended['UserID'] = user_id  # Add UserID back after reindexing

    extended_data.append(group_extended)

# Stack the per-user frames. The daily timestamps live in the 'index' column
# created by reset_index().
extended_df_train = pd.concat(extended_data)

# extended_df_train now holds each user's series extended and forward-filled
# up to November 30th.

In [None]:
from gluonts.dataset.pandas import PandasDataset

# Long-format frame -> GluonTS dataset: 'Time' is the target, series are keyed
# by 'UserID', and timestamps come from the 'index' column at daily frequency.
train_ds = PandasDataset.from_long_dataframe(
    extended_df_train,
    target='Time',
    item_id='UserID',
    timestamp='index',
    freq='D',
)

#Train model and make predictions

In [None]:
from gluonts.torch.model.deepar import DeepAREstimator

# DeepAR on daily data with a 31-step forecast horizon.
# accelerator='auto' lets Lightning use a GPU when one is available and fall
# back to CPU otherwise, instead of failing on machines without a GPU.
estimator = DeepAREstimator(
    freq='D',
    prediction_length=31,
    num_layers=7,
    dropout_rate=0.1,
    trainer_kwargs={'accelerator': 'auto', 'max_epochs': 50},
)

predictor = estimator.train(train_ds, num_workers=2)

In [None]:
# Materialise the forecasts produced for train_ds into a list.
pred = [forecast for forecast in predictor.predict(train_ds)]

In [None]:
# Build one tidy frame per forecast and concatenate once at the end. The list
# has its own name so all_preds is only ever a DataFrame.
forecast_frames = []
for item in pred:
    family = item.item_id  # identifier of the series this forecast belongs to
    # Reduce over axis 0 of the samples (presumably the sample-path axis —
    # confirm against the GluonTS forecast type in use).
    p = item.samples.mean(axis=0)
    p10 = np.percentile(item.samples, 10, axis=0)
    p90 = np.percentile(item.samples, 90, axis=0)
    dates = pd.date_range(start=item.start_date.to_timestamp(), periods=len(p), freq='D')
    # Keep the 10th / 90th percentiles next to the mean instead of discarding them.
    family_pred = pd.DataFrame(
        {'Date': dates, 'pred': p, 'p10': p10, 'p90': p90, 'UserID': family}
    )
    forecast_frames.append(family_pred)
all_preds = pd.concat(forecast_frames, ignore_index=True)


#Modify test set to match predictions

In [None]:
# Keep a single session per user per day: the one with the smallest 'Time'.
# (Same filter as already applied to sorted_JPL_test earlier in the notebook.)
sorted_JPL_test = sorted_JPL_test.assign(
    Date=pd.to_datetime(sorted_JPL_test['Date'])
)

# Row labels of the earliest session inside every (UserID, Date) group.
idx = sorted_JPL_test.groupby(['UserID', 'Date'])['Time'].idxmin()

# Select exactly those rows by label.
sorted_JPL_test = sorted_JPL_test.loc[idx]

In [None]:
user_df = pd.DataFrame(sorted_JPL_test)
user_df['Date'] = pd.to_datetime(user_df['Date'])

# Evaluation window: every user gets one row per day from 2018-12-01 through
# 2018-12-31.
start_date = pd.Timestamp('2018-12-01')
end_date = pd.Timestamp('2018-12-31')
# The calendar is the same for every user, so build it once outside the loop.
date_range = pd.date_range(start=start_date, end=end_date, freq='D')

filled_data = []
for user_id, group in user_df.groupby('UserID'):
    # Days without a session are filled with 0. NOTE(review): a genuine
    # session at exactly 00:00:00 also has Time == 0 and cannot be told apart
    # from a filled day.
    group_filled = group.set_index('Date').reindex(date_range, fill_value=0).reset_index()
    group_filled['UserID'] = user_id  # the fill also zeroed UserID; restore it
    filled_data.append(group_filled)

# reset_index() names the calendar column 'index'; expose it as 'Date'.
filled_df_test = pd.concat(filled_data).rename(columns={'index': 'Date'})

#Compare predictions with ground truth and calculate SMAPE

In [None]:
# Give both frames a fresh 0..n-1 index, discarding the old labels.
filled_df_test = filled_df_test.reset_index(drop=True)
all_preds = all_preds.reset_index(drop=True)


In [None]:
# The ground truth is attached by row position, which is only valid when both
# frames list the same (UserID, Date) pairs in the same order. Verify that
# first so a misalignment fails loudly instead of silently corrupting SMAPE.
if len(all_preds) != len(filled_df_test):
    raise ValueError(
        f"Prediction / ground-truth length mismatch: {len(all_preds)} vs {len(filled_df_test)}"
    )

if not np.array_equal(all_preds['Date'].to_numpy(), filled_df_test['Date'].to_numpy()):
    raise ValueError("Prediction and ground-truth dates are not aligned row by row")

pred_users = all_preds['UserID'].to_numpy()
true_users = filled_df_test['UserID'].to_numpy()
# Compare raw values first; fall back to their string form in case the two
# frames hold the same identifiers under different types.
users_aligned = np.array_equal(pred_users, true_users) or np.array_equal(
    pred_users.astype(str), true_users.astype(str)
)
if not users_aligned:
    raise ValueError("Prediction and ground-truth UserIDs are not aligned row by row")

# Positional copy: independent of either frame's index labels.
all_preds['Time'] = filled_df_test['Time'].to_numpy()

In [None]:
# Inspect the predictions together with the attached ground-truth 'Time' column.
all_preds

In [None]:
all_preds_df = pd.DataFrame(all_preds)

# Rows whose 'Time' is 0 are left out of the metric.
non_zero_time_df = all_preds_df[all_preds_df['Time'] != 0]

# Per-user SMAPE in percent: 100 * sum|pred - Time| / sum(|pred| + |Time|).
smape_values = []
for user_id, group in non_zero_time_df.groupby('UserID'):
    absolute_error = (group['pred'] - group['Time']).abs().sum()
    magnitude = (group['pred'].abs() + group['Time'].abs()).sum()
    smape = 100 * absolute_error / magnitude
    smape_values.append(smape)

smape_values

In [None]:
# Unweighted mean of the per-user SMAPE values.
average_smape = np.mean(smape_values)
print(
    "SMAPE for DeepAR model across all users (%):",
    average_smape,
)