In [None]:
# Mount Google Drive at /content/drive; the CSV paths used when loading the
# data live under this mount point.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
import numpy as np
import pandas as pd

# Absolute locations of the training and testing CSVs on the mounted Drive.
# NOTE(review): the folder name "SuperUROP " ends with a space - confirm this
# matches the actual Drive folder.
file_path1 = '/content/drive/MyDrive/SuperUROP /Data Analysis/JPL_training_data.csv'
file_path2 = '/content/drive/MyDrive/SuperUROP /Data Analysis/JPL_testing_data.csv'

# Read each CSV and keep only the underlying NumPy array; the column labels are
# discarded, so the following cells address columns by position.
JPL_train = pd.read_csv(file_path1).to_numpy()
JPL_test = pd.read_csv(file_path2).to_numpy()



In [None]:
#Remove row number (in 1st column)
# NOTE: this cell is not idempotent - every re-run slices off one more leading
# column from the arrays, so run it exactly once after loading.
JPL_train=JPL_train[:,1:]
JPL_test=JPL_test[:,1:]

In [None]:
import pandas as pd

def process_dataframe(df):
    """Extract the date, time and user id from a raw table.

    The first column (position 0) is expected to hold ``'<date> <time>'``
    strings and the fourth column (position 3) the user id. Columns are
    addressed purely by position, so both label-free arrays and DataFrames
    with named columns are accepted.

    Parameters
    ----------
    df : array-like or pandas.DataFrame
        Raw table with at least four columns.

    Returns
    -------
    pandas.DataFrame
        Columns ``'Date'``, ``'Time'`` and ``'UserID'``, on the same index as
        the input. ``'Date'`` and ``'Time'`` are the text before / after the
        first space; ``'Time'`` is missing when the timestamp has no space.
    """
    df = pd.DataFrame(df)
    timestamps = df.iloc[:, 0]
    user_ids = df.iloc[:, 3]

    # n=1 splits on the first space only, so at most two columns come back;
    # reindex guarantees exactly two even when no value contains a space.
    parts = timestamps.str.split(' ', n=1, expand=True).reindex(columns=[0, 1])
    parts.columns = ['Date', 'Time']

    # Name the id column explicitly instead of relying on its original label.
    return pd.concat([parts, user_ids.rename('UserID')], axis=1)

# Usage example
# Rebinds both names from the raw arrays to processed DataFrames with 'Date',
# 'Time' and 'UserID' columns. Re-running this cell on the already-processed
# frames fails, because they no longer have a column at position 3.
JPL_train = process_dataframe(JPL_train)
JPL_test = process_dataframe(JPL_test)

In [None]:
def convert_time_to_decimal(time_str):
    """Convert an ``'HH:MM[:SS[.fff]]'`` string to decimal hours.

    Parameters
    ----------
    time_str : str or missing value
        Time of day. ``None`` / NaN are treated as missing.

    Returns
    -------
    float
        ``hours + minutes / 60 + seconds / 3600``, or NaN when ``time_str`` is
        missing.

    Raises
    ------
    ValueError
        If the string does not have two or three ``':'``-separated fields, or
        a field is not numeric.
    """
    # Missing timestamps propagate as NaN so they can be dropped later,
    # instead of failing on `.split` of a non-string.
    if pd.isna(time_str):
        return float('nan')

    fields = time_str.split(':')
    if len(fields) not in (2, 3):
        raise ValueError(f"Expected 'HH:MM[:SS]' time string, got {time_str!r}")

    hours = int(fields[0])
    minutes = int(fields[1])
    # Seconds are optional and may carry a fractional part.
    seconds = float(fields[2]) if len(fields) == 3 else 0.0

    return hours + minutes / 60 + seconds / 3600

# Overwrite the 'Time' column of both splits with its decimal-hour equivalent.
for sessions in (JPL_train, JPL_test):
    sessions['Time'] = sessions['Time'].apply(convert_time_to_decimal)

In [None]:
# Re-importing pandas as the code execution state was reset
import pandas as pd

# Column layout shared by both splits, and the ordering used for the sorted
# copies: by user, then calendar day, then time of day.
session_columns = ['Date', 'Time', 'UserID']
sort_keys = ['UserID', 'Date', 'Time']

# Training split: fix the column order, parse 'Date' into datetimes, then sort.
JPL_train = pd.DataFrame(JPL_train, columns=session_columns).assign(
    Date=lambda frame: pd.to_datetime(frame['Date'])
)
sorted_JPL_train = JPL_train.sort_values(by=sort_keys)

# Testing split: identical treatment.
JPL_test = pd.DataFrame(JPL_test, columns=session_columns).assign(
    Date=lambda frame: pd.to_datetime(frame['Date'])
)
sorted_JPL_test = JPL_test.sort_values(by=sort_keys)

In [None]:
# Keep a single training session per user per day: the one with the smallest
# 'Time'. pd.to_datetime is re-applied to 'Date' before it is used as a
# grouping key.
sorted_JPL_train['Date'] = pd.to_datetime(sorted_JPL_train['Date'])

# For every (UserID, Date) group, idxmin yields the index label of the row
# holding the minimum 'Time'.
train_by_user_day = sorted_JPL_train.groupby(['UserID', 'Date'])
idx = train_by_user_day['Time'].idxmin()

# Selecting those labels leaves one row per group.
sorted_JPL_train = sorted_JPL_train.loc[idx]

In [None]:
# Display the training frame after keeping one row per (UserID, Date).
sorted_JPL_train

In [None]:
# Keep a single testing session per user per day: the one with the smallest
# 'Time'. pd.to_datetime is re-applied to 'Date' before it is used as a
# grouping key.
sorted_JPL_test['Date'] = pd.to_datetime(sorted_JPL_test['Date'])

# For every (UserID, Date) group, idxmin yields the index label of the row
# holding the minimum 'Time'.
test_by_user_day = sorted_JPL_test.groupby(['UserID', 'Date'])
idx = test_by_user_day['Time'].idxmin()

# Selecting those labels leaves one row per group.
sorted_JPL_test = sorted_JPL_test.loc[idx]



In [None]:
!pip install prophet

In [None]:
from prophet import Prophet

# Unique UserIDs
user_ids = sorted_JPL_train['UserID'].unique()

# Prophet.fit raises a ValueError when given fewer than two non-NaN rows, so
# users with less history than this are skipped instead of aborting the loop.
MIN_TRAINING_ROWS = 2

# Dictionary to store models for each user
models = {}

# Users that did not get a model; the prediction step only forecasts for
# users present in `models`, so these are simply left out there.
skipped_users = []

for user_id in user_ids:
    # Subset for each user, in the ds / y layout Prophet expects, without
    # rows that have a missing date or time.
    df_user = (
        sorted_JPL_train.loc[sorted_JPL_train['UserID'] == user_id, ['Date', 'Time']]
        .rename(columns={'Date': 'ds', 'Time': 'y'})
        .dropna()
    )

    if len(df_user) < MIN_TRAINING_ROWS:
        skipped_users.append(user_id)
        continue

    # Weekly seasonality only, with a tight seasonality prior.
    model = Prophet(seasonality_mode='additive', yearly_seasonality=False,
                weekly_seasonality=True, daily_seasonality=False, seasonality_prior_scale=0.01)
    model.fit(df_user)
    models[user_id] = model

if skipped_users:
    print(f"Skipped {len(skipped_users)} user(s) with fewer than "
          f"{MIN_TRAINING_ROWS} usable training rows")





In [None]:
# One merged forecast frame per user is collected here and concatenated at the end.
predictions = []

for user_id in sorted_JPL_test['UserID'].unique():
    # Users without a fitted model get no forecast.
    if user_id not in models:
        continue

    # Dates to forecast: this user's test days, in Prophet's `ds` column.
    user_test_data = sorted_JPL_test[sorted_JPL_test['UserID'] == user_id]
    future_dates = user_test_data[['Date']].rename(columns={'Date': 'ds'}).dropna()

    # Forecast and tag the rows with the user so they can be matched below.
    forecast = models[user_id].predict(future_dates)
    forecast['UserID'] = user_id

    # Keep the point forecast and its bounds, then attach the observed test
    # rows for the same user and day (left join: every forecast row is kept).
    forecast_reduced = forecast[['ds', 'yhat', 'yhat_lower', 'yhat_upper', 'UserID']]
    merged_forecast = forecast_reduced.merge(
        user_test_data,
        how='left',
        left_on=['UserID', 'ds'],
        right_on=['UserID', 'Date'],
    )

    predictions.append(merged_forecast)

# Stack the per-user frames into a single DataFrame of forecasts and true values.
all_predictions_with_true = pd.concat(predictions)

# Now, all_predictions_with_true DataFrame contains the forecasts, including lower and upper bounds, along with true values for each user


In [None]:
# Display the combined forecasts merged with the observed test rows.
all_predictions_with_true

In [None]:
import pandas as pd
# Bind the combined predictions to `df`, the frame passed to the SMAPE
# calculation below.
df = pd.DataFrame(all_predictions_with_true)

# Function to calculate SMAPE
def calculate_smape(df):
    """Return the per-user SMAPE (in percent) between 'Time' and 'yhat'.

    Each row contributes ``|Time - yhat| / (|Time| + |yhat|)``; the
    contributions are averaged within every 'UserID' and multiplied by 100.
    Note the denominator is the plain sum (no division by two), so the result
    lies between 0 and 100.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns 'UserID', 'Time' (observed) and 'yhat'
        (predicted).

    Returns
    -------
    pandas.Series
        SMAPE per user, indexed by 'UserID'. Rows where either value is
        missing are ignored; a user with no usable rows gets NaN.
    """
    def smape_terms(y_true, y_pred):
        denominator = y_true.abs() + y_pred.abs()
        terms = (y_true - y_pred).abs() / denominator
        # Both values zero means a perfect prediction: count it as zero error
        # rather than letting 0/0 become NaN and drop out of the mean.
        # A NaN denominator compares unequal to 0, so missing values stay NaN.
        return terms.where(denominator != 0, 0.0)

    per_row = smape_terms(df['Time'], df['yhat'])

    # Grouping the row-wise terms directly avoids applying a function to
    # whole sub-frames (including the grouping column).
    smape_values = 100 * per_row.groupby(df['UserID']).mean()
    return smape_values

# SMAPE per user, as a Series indexed by UserID.
smape_results = calculate_smape(df)
# Unweighted mean over users: each user counts equally, regardless of how
# many rows they contributed.
overall_mean_smape = smape_results.mean()
# Bare expression so the notebook displays the value.
overall_mean_smape