In [None]:
# Mount Google Drive inside the Colab runtime; the CSV paths used below live under /content/drive.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
import numpy as np
import pandas as pd

# Locations of the JPL training / testing CSV files on the mounted Drive.
file_path1 = '/content/drive/MyDrive/SuperUROP /Data Analysis/JPL_training_data.csv'
file_path2 = '/content/drive/MyDrive/SuperUROP /Data Analysis/JPL_testing_data.csv'

# Read each file and keep only the underlying NumPy array (column labels are discarded).
JPL_train, JPL_test = (
    pd.read_csv(csv_path).to_numpy() for csv_path in (file_path1, file_path2)
)


In [None]:
# Drop column 0 of both arrays (per the original note it holds the CSV row number).
# NOTE: each run slices off one more leading column, so reload the CSVs before
# re-running this cell.
JPL_train=JPL_train[:,1:]
JPL_test=JPL_test[:,1:]

In [None]:
import pandas as pd

def process_dataframe(df):
    """Build a ``Date`` / ``Time`` / ``UserID`` frame from raw session rows.

    ``df`` is wrapped in a DataFrame; positional column 0 must hold strings of
    the form ``"<date> <time>"`` (exactly one space) and positional column 3
    is carried over as the user identifier.

    Returns a new DataFrame with columns ``Date``, ``Time`` and ``UserID``.
    """
    frame = pd.DataFrame(df)

    # Only the timestamp column (0) and the user column (3) are needed.
    picked = frame.iloc[:, [0, 3]]

    # "<date> <time>" -> two separate string columns.
    date_and_time = picked[0].str.split(' ', expand=True)
    date_and_time.columns = ['Date', 'Time']

    # Put the user column next to the split timestamp and give it a readable name.
    combined = pd.concat([date_and_time, picked.iloc[:, 1]], axis=1)
    return combined.rename(columns={3: 'UserID'})

# Replace the raw arrays with their Date / Time / UserID frames.
# NOTE: the names are rebound to the processed result, so running this cell a
# second time feeds a 3-column frame back into process_dataframe, which selects
# positional column 3 and therefore fails. Reload the data before re-running.
JPL_train = process_dataframe(JPL_train)
JPL_test = process_dataframe(JPL_test)

In [None]:
def convert_time_to_decimal(time_str):
    """Convert a clock-time string to decimal hours.

    Accepted forms are ``HH:MM:SS`` (the seconds field may carry a fractional
    part, e.g. ``"12:30:36.5"``) and ``HH:MM`` (seconds are taken as 0).

    Parameters
    ----------
    time_str : str
        Colon-separated time of day.

    Returns
    -------
    float
        ``hours + minutes / 60 + seconds / 3600``.

    Raises
    ------
    ValueError
        If the string does not have two or three colon-separated fields, or a
        field is not numeric.
    """
    fields = time_str.split(':')

    # A missing seconds field is treated as zero seconds.
    if len(fields) == 2:
        fields.append('0')
    if len(fields) != 3:
        raise ValueError(f"expected 'HH:MM[:SS]', got {time_str!r}")

    hours = int(fields[0])
    minutes = int(fields[1])
    # float() rather than int() so sub-second timestamps do not raise.
    seconds = float(fields[2])

    return hours + minutes / 60 + seconds / 3600

# Replace the time-of-day strings in both frames with decimal hours.
JPL_train['Time'] = JPL_train['Time'].map(convert_time_to_decimal)
JPL_test['Time'] = JPL_test['Time'].map(convert_time_to_decimal)

In [None]:
# pandas is imported again so that this cell still runs after a kernel reset
import pandas as pd

# Restrict each split to the three expected columns, parse 'Date' into
# datetimes, and build a separate frame ordered by user, then day, then time.
JPL_train = pd.DataFrame(JPL_train, columns=['Date', 'Time', 'UserID']).assign(
    Date=lambda frame: pd.to_datetime(frame['Date'])
)
sorted_JPL_train = JPL_train.sort_values(by=['UserID', 'Date', 'Time'])


JPL_test = pd.DataFrame(JPL_test, columns=['Date', 'Time', 'UserID']).assign(
    Date=lambda frame: pd.to_datetime(frame['Date'])
)
sorted_JPL_test = JPL_test.sort_values(by=['UserID', 'Date', 'Time'])

In [None]:
# Training split: keep one session per user per day, namely the earliest one.
# 'Date' is converted with pd.to_datetime before grouping.
sorted_JPL_train = sorted_JPL_train.assign(
    Date=pd.to_datetime(sorted_JPL_train['Date'])
)

# Row label of the smallest 'Time' inside every (UserID, Date) group.
idx = sorted_JPL_train.groupby(['UserID', 'Date'])['Time'].idxmin()

# Keep exactly those rows.
sorted_JPL_train = sorted_JPL_train.loc[idx]

In [None]:
# Testing split: keep one session per user per day, namely the earliest one.
# 'Date' is converted with pd.to_datetime before grouping.
sorted_JPL_test = sorted_JPL_test.assign(
    Date=pd.to_datetime(sorted_JPL_test['Date'])
)

# Row label of the smallest 'Time' inside every (UserID, Date) group.
idx = sorted_JPL_test.groupby(['UserID', 'Date'])['Time'].idxmin()

# Keep exactly those rows.
sorted_JPL_test = sorted_JPL_test.loc[idx]

In [None]:
# Users that occur in both the training and the testing split.
common_user_ids = set(sorted_JPL_train['UserID']) & set(sorted_JPL_test['UserID'])

# Restrict each split to those shared users.
sorted_JPL_train = sorted_JPL_train.loc[sorted_JPL_train['UserID'].isin(common_user_ids)]
sorted_JPL_test = sorted_JPL_test.loc[sorted_JPL_test['UserID'].isin(common_user_ids)]

In [None]:
import pandas as pd

# Drop 'Date' and renumber every user's rows 0..n-1, keeping their current order.
#
# This used to be groupby('UserID').apply(lambda x: x.reset_index(drop=True)).
# Letting apply() operate on the grouping column is deprecated since pandas 2.2
# (see the include_groups parameter); once the column is no longer passed to the
# applied function, 'UserID' would disappear from the result and the modelling
# cells below would break. cumcount() yields the same per-user numbering without
# relying on that behaviour.
sorted_JPL_train = (
    sorted_JPL_train
    .drop(columns='Date')
    .dropna(subset=['UserID'])             # groupby skipped rows with a missing key
    .sort_values('UserID', kind='stable')  # groupby returned users in sorted order
)
sorted_JPL_train.index = sorted_JPL_train.groupby('UserID').cumcount().to_numpy()




In [None]:
# Drop 'Date' and renumber every user's rows 0..n-1, keeping their current order.
#
# This used to be groupby('UserID').apply(lambda x: x.reset_index(drop=True)).
# Letting apply() operate on the grouping column is deprecated since pandas 2.2
# (see the include_groups parameter); once the column is no longer passed to the
# applied function, 'UserID' would disappear from the result and the forecasting
# and SMAPE cells below would break. cumcount() yields the same per-user
# numbering without relying on that behaviour.
sorted_JPL_test = (
    sorted_JPL_test
    .drop(columns='Date')
    .dropna(subset=['UserID'])             # groupby skipped rows with a missing key
    .sort_values('UserID', kind='stable')  # groupby returned users in sorted order
)
sorted_JPL_test.index = sorted_JPL_test.groupby('UserID').cumcount().to_numpy()

In [None]:
# Display the per-user training frame ('Time' and 'UserID' columns remain after 'Date' was dropped).
sorted_JPL_train

#Create ARIMA model for each user

In [None]:
import pandas as pd
from statsmodels.tsa.stattools import adfuller

def find_optimal_d_for_user(df, user_id, max_d=2, significance_level=0.05):
    """Estimate the differencing order ``d`` for one user's 'Time' series.

    The series is first-differenced repeatedly until the Augmented
    Dickey-Fuller test (``adfuller``) reports a p-value below
    ``significance_level``, or until one of the stop conditions below is hit.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain 'UserID' and 'Time' columns.
    user_id : scalar
        Value compared against ``df['UserID']`` to select the user's rows.
    max_d : int, default 2
        Upper bound on the returned order. Without a bound, a series that
        never tests stationary is differenced until ``adfuller`` raises.
    significance_level : float, default 0.05
        p-value threshold for calling the series stationary.

    Returns
    -------
    int
        Number of differences applied (0 when the raw series already passes,
        is constant, or is too short to test).
    """
    # Only 'Time' is differenced; differencing the whole frame would also try
    # to subtract 'UserID' values, which fails for string identifiers.
    series = df.loc[df['UserID'] == user_id, 'Time'].astype(float).dropna()

    d = 0
    while d < max_d:
        # adfuller rejects constant input and needs several observations,
        # so stop instead of letting it raise.
        if len(series) < 4 or series.nunique() <= 1:
            break

        p_value = adfuller(series)[1]
        if p_value < significance_level:
            # Unit root rejected: the current series is treated as stationary.
            break

        series = series.diff().dropna()
        d += 1

    return d

# The differencing search runs on the per-user training frame.
df = sorted_JPL_train

# Distinct users, in order of first appearance.
unique_users = df['UserID'].unique()

# Differencing order selected for each user.
optimal_d_values = {
    user: find_optimal_d_for_user(df, user) for user in unique_users
}

# Show the selected order for every user.
print(optimal_d_values)


In [None]:
import pandas as pd
import numpy as np
from statsmodels.tsa.stattools import acf, pacf

def find_optimal_p_q_for_user(df, user_id, d_value, significance_level=0.05):
    """Pick ARIMA ``p`` and ``q`` for one user from the PACF and ACF.

    The user's 'Time' series is differenced ``d_value`` times, then the first
    lag whose partial autocorrelation (for ``p``) or autocorrelation (for
    ``q``) lies inside the two-sided confidence band is returned.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain 'UserID' and 'Time' columns.
    user_id : scalar
        Value compared against ``df['UserID']`` to select the user's rows.
    d_value : int
        Differencing order to apply before computing the correlations.
    significance_level : float, default 0.05
        Two-sided significance level of the band ``z / sqrt(N)``.

    Returns
    -------
    tuple of (int, int)
        ``(p, q)``. ``(0, 0)`` is returned when the differenced series is
        constant or too short to compute at least one lag. When every examined
        lag is outside the band, the largest examined lag is returned.
    """
    from statistics import NormalDist  # stdlib; imported here so the cell stays self-contained

    series = df.loc[df['UserID'] == user_id, 'Time'].astype(float)

    # d-th order differencing means applying the first difference d times.
    # Series.diff(d) would instead subtract the value d rows back.
    for _ in range(int(d_value)):
        series = series.diff().dropna()

    N = len(series)

    # At most 20 lags, and never more than the sample can support.
    max_lags = min(20, int(N / 2 - 1))
    if max_lags < 1 or series.nunique() <= 1:
        # Nothing to estimate from a constant or very short series.
        return 0, 0

    # Band half-width; the z quantile follows significance_level
    # (about 1.96 for the default 0.05).
    z_quantile = NormalDist().inv_cdf(1 - significance_level / 2)
    critical_value = z_quantile / np.sqrt(N)

    lag_acf = acf(series, nlags=max_lags)
    lag_pacf = pacf(series, nlags=max_lags, method='ols')

    # Element 0 of both arrays is lag 0 (correlation 1 by definition), so the
    # search effectively starts at lag 1.
    # NOTE(review): the value returned is the first lag INSIDE the band, not the
    # last lag outside it - confirm this is the intended order-selection rule.
    p = next((lag for lag, value in enumerate(lag_pacf) if abs(value) < critical_value), max_lags)
    q = next((lag for lag, value in enumerate(lag_acf) if abs(value) < critical_value), max_lags)

    return p, q

# The order search runs on the per-user training frame.
df = sorted_JPL_train

# (p, q) per user; each call uses that user's entry in `optimal_d_values`
# from the differencing step.
optimal_pq_values = {
    user: find_optimal_p_q_for_user(df, user, optimal_d_values[user])
    for user in df['UserID'].unique()
}

# Show the selected (p, q) pair for every user.
print(optimal_pq_values)


In [None]:
import pandas as pd
from statsmodels.tsa.arima.model import ARIMA

# Fit one ARIMA model per user on the training series, using the orders stored
# in `optimal_d_values` and `optimal_pq_values`.
#
# The loop reads `sorted_JPL_train` directly rather than the shared name `df`:
# `df` is rebound to the test frame in the SMAPE section further down, so
# re-running this cell afterwards would otherwise fit the models on test data.

# Fitted model per user, keyed by UserID.
arima_models = {}

for user in sorted_JPL_train['UserID'].unique():
    # This user's training rows.
    user_df = sorted_JPL_train[sorted_JPL_train['UserID'] == user]

    # Orders chosen earlier; d is capped at 1 before fitting.
    d = optimal_d_values[user]
    d = min(1, d)
    p, q = optimal_pq_values[user]

    # Build and fit the model on the user's 'Time' series.
    model = ARIMA(user_df['Time'], order=(p, d, q))
    fitted_model = model.fit()

    arima_models[user] = fitted_model

# arima_models now holds the fitted ARIMA model for each user


#ARIMA model predictions on test set

In [None]:
# Forecast each user's test period with the model fitted on their training data.

# Out-of-sample forecasts per user, keyed by UserID.
arima_predictions = {}

for user in sorted_JPL_test['UserID'].unique():
    # This user's test rows; one forecast step is produced per row.
    user_test_df = sorted_JPL_test.loc[sorted_JPL_test['UserID'] == user]
    user_predictions = arima_models[user].forecast(steps=len(user_test_df))

    # Spread between the largest and smallest forecast value, printed per user.
    diff = user_predictions.max() - user_predictions.min()
    print(diff)

    arima_predictions[user] = user_predictions

# arima_predictions now holds the out-of-sample forecasts for each user


#SMAPE on ARIMA model

In [None]:
# `df` is rebound here to the test split.
df = pd.DataFrame(sorted_JPL_test)

# Observed 'Time' values per user, as {UserID: [t0, t1, ...]}.
true_values = df.groupby('UserID')['Time'].agg(list).to_dict()

In [None]:
import numpy as np
import pandas as pd

def calculate_smape(actual, predicted):
    """Symmetric mean absolute percentage error, in percent.

    Computes ``100 * mean(|a - p| / (|a| + |p|))`` over paired observations.
    Pairs where both values are zero contribute 0 instead of dividing by zero.

    Parameters
    ----------
    actual, predicted : array-like of numbers
        Lists, NumPy arrays or pandas Series. Values are compared by position
        (a Series index is ignored), and shapes must be broadcast-compatible.

    Returns
    -------
    float
        SMAPE in the range 0-100; NaN for empty input or when any value is NaN.

    Raises
    ------
    ValueError
        If the two inputs cannot be broadcast against each other.
    """
    actual_arr = np.asarray(actual, dtype=float)
    predicted_arr = np.asarray(predicted, dtype=float)

    denominator = np.abs(actual_arr) + np.abs(predicted_arr)

    # Divide only where the denominator is non-zero; the remaining entries keep
    # the 0.0 they were initialised with, so no divide-by-zero warning is raised.
    ratio = np.divide(
        np.abs(actual_arr - predicted_arr),
        denominator,
        out=np.zeros_like(denominator),
        where=denominator != 0,
    )

    if ratio.size == 0:
        return np.nan
    return 100 * np.mean(ratio)

# SMAPE of the ARIMA forecast for each user, keyed by UserID.
smape_values_arima = {}

for user, forecast in arima_predictions.items():
    # Forecast as a Series (a no-op wrap when it already is one).
    predicted = pd.Series(forecast)

    # Observed values, given the forecast's index so the two series pair up
    # element by element.
    actual = pd.Series(true_values[user], index=predicted.index)

    smape = calculate_smape(actual, predicted)
    smape_values_arima[user] = smape

# smape_values_arima now holds the SMAPE for each user


In [None]:
# Average the per-user SMAPE scores into a single figure for the ARIMA models.
mean_smape_arima = np.mean([*smape_values_arima.values()])

print("SMAPE of ARIMA model (%):", mean_smape_arima)
