In [2]:
import pandas as pd

# Read the transaction log from disk into a DataFrame
df = pd.read_csv('transaction.csv')

# Schema overview: column dtypes, non-null counts and memory footprint
df.info()

# Plain-text preview of the leading five rows
print(df.head(5))

# Missing-value tally per column; as the last expression it becomes the cell output
df.isna().sum()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1048575 entries, 0 to 1048574
Data columns (total 4 columns):
 #   Column  Non-Null Count    Dtype 
---  ------  --------------    ----- 
 0   Dt      1048575 non-null  object
 1   _id     1048575 non-null  object
 2   EqN     1048575 non-null  int64 
 3   Sta     1048575 non-null  int64 
dtypes: int64(2), object(2)
memory usage: 32.0+ MB
                         Dt                       _id     EqN  Sta
0  2018-08-06T09:25:42.000Z  63b9174d1f70e5b6d84fab6b  141684  106
1  2018-08-06T12:53:59.000Z  63b9174d1f70e5b6d84fab6c  207232  106
2  2018-08-06T12:55:11.000Z  63b9174d1f70e5b6d84fab6d  141687  106
3  2018-08-06T12:55:12.000Z  63b9174d1f70e5b6d84fab6e  141687  106
4  2018-08-06T12:55:13.000Z  63b9174d1f70e5b6d84fab6f  141687  106


Dt     0
_id    0
EqN    0
Sta    0
dtype: int64

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from darts import TimeSeries
from darts.models import NBEATSModel
from darts.metrics import mae, rmse
from darts.dataprocessing.transformers import Scaler
from darts.utils.utils import model_selection
from darts.utils.timeseries_generation import datetime_attribute_timeseries
from darts.utils.statistics import check_seasonality

# Load the raw transaction log; each row is one event stamped with `Dt`
df = pd.read_csv('transaction.csv')  # Replace with your dataset path

# Parse timestamps as UTC, then drop the timezone so the index is naive UTC.
# The `Dt` strings end in "Z", so a plain to_datetime() produces a tz-aware
# column. NOTE(review): darts is understood not to support tz-aware indexes
# (recent versions strip the timezone with a warning) - normalising here makes
# that explicit instead of relying on library behaviour.
df['Dt'] = pd.to_datetime(df['Dt'], errors='coerce', utc=True).dt.tz_localize(None)
df = df.dropna(subset=['Dt'])  # discard rows whose timestamp failed to parse
df = df.sort_values(by='Dt').set_index('Dt')  # chronological DatetimeIndex for resampling

# Filtering based on user input ('all' disables the corresponding filter).
# Surrounding whitespace is stripped so that ' all ' is not mistaken for a number.
stn_no = input("Enter station number (or 'all' for all stations): ").strip()
eqn_no = input("Enter equipment number (or 'all' for all equipment): ").strip()
if stn_no.lower() != 'all':
    df = df[df['Sta'] == int(stn_no)]
if eqn_no.lower() != 'all':
    df = df[df['EqN'] == int(eqn_no)]

# Fail early with a readable message rather than deep inside resample/darts
if df.empty:
    raise ValueError(
        f"No transactions match station={stn_no!r}, equipment={eqn_no!r}; nothing to forecast."
    )

# Aggregate footfall at hourly level. resample().count() already emits a row
# (with count 0) for every hour between the first and last event, so the result
# is a gap-free hourly index without any further reindexing.
hourly_footfall = df.resample('H').count()

# Convert to TimeSeries object; the `_id` column holds the events-per-hour count
series = TimeSeries.from_dataframe(hourly_footfall, value_cols='_id')

# The last 48 hours are held out for testing, so the series must be longer than that
if len(series) <= 48:
    raise ValueError(
        f"Need more than 48 hourly points to hold out a 48-hour test set, got {len(series)}."
    )

# Splitting into train and test sets (test = final 48 hours)
train, test = series[:-48], series[-48:]

# Normalize data using Scaler; it is fitted on the training part only so that
# no information from the held-out hours leaks into the scaling
scaler = Scaler()
train_transformed = scaler.fit_transform(train)
test_transformed = scaler.transform(test)

# Expanding-window cross-validation for a forecasting model
def cross_validate_model(model, train_data, k=5):
    """Estimate out-of-sample RMSE with ``k`` expanding-window folds.

    ``train_data`` is cut into ``k + 1`` consecutive segments of equal length
    (a remainder at the very end is left unused). Fold ``i`` fits on the first
    ``i`` segments and is scored on segment ``i + 1``, so the validation data
    always lies after the data the model was fitted on.

    The folds are built here by plain integer slicing; no external fold helper
    is required.

    Args:
        model: Object exposing ``fit(series)`` and ``predict(n)``. If it also
            offers ``untrained_model()``, every fold is fitted on a fresh copy,
            so weights learned in one fold do not carry over into the next and
            the instance passed in is left unfitted. If it exposes
            ``min_train_series_length``, folds whose training part is shorter
            than that are skipped.
        train_data: Series supporting ``len()`` and integer slicing.
        k: Number of folds; must be at least 1.

    Returns:
        float: Mean of the per-fold RMSE values.

    Raises:
        ValueError: If ``k`` is smaller than 1, if ``train_data`` is too short
            to form ``k + 1`` non-empty segments, or if every fold had to be
            skipped because its training part was too short for the model.
    """
    if k < 1:
        raise ValueError(f"k must be at least 1, got {k}")

    n_obs = len(train_data)
    fold_len = n_obs // (k + 1)
    if fold_len < 1:
        raise ValueError(
            f"train_data has {n_obs} points, too few for {k} folds"
        )

    # Models without this attribute are assumed to accept any non-empty series
    min_train_len = getattr(model, "min_train_series_length", 1)

    errors = []
    for fold in range(1, k + 1):
        split = fold * fold_len
        if split < min_train_len:
            continue  # training part too short for this model

        train_cv = train_data[:split]
        val_cv = train_data[split:split + fold_len]

        # Fit a fresh copy where possible: calling fit() repeatedly on one
        # instance may continue training from the previous fold's weights.
        fold_model = model.untrained_model() if hasattr(model, "untrained_model") else model
        fold_model.fit(train_cv)
        forecast_cv = fold_model.predict(len(val_cv))
        errors.append(rmse(val_cv, forecast_cv))

    if not errors:
        raise ValueError(
            f"No fold has at least {min_train_len} training points; "
            "use fewer folds or more data"
        )
    return float(np.mean(errors))

# Initialize the N-BEATS model: 48 hours of history in, 24 hours out per chunk.
# random_state fixes the model's random number generation so that a rerun of
# the notebook yields comparable numbers.
model = NBEATSModel(input_chunk_length=48, output_chunk_length=24, n_epochs=200, random_state=42)

# Perform cross-validation on the (scaled) training data
cv_error = cross_validate_model(model, train_transformed, k=5)
print(f'Cross-validation RMSE: {cv_error}')

# Fit the final model on the entire training data
model.fit(train_transformed, verbose=True)

# Forecast as many steps as the held-out test set contains
forecast = model.predict(len(test_transformed))

# Inverse transform the forecast back to the original scale
forecast = scaler.inverse_transform(forecast)

# Evaluate the model on the test set (both series are in the original scale here)
error_mae = mae(test, forecast)
error_rmse = rmse(test, forecast)

print(f'MAE: {error_mae}')
print(f'RMSE: {error_rmse}')

# Plot results: the full observed series with the forecast overlaid
plt.figure(figsize=(12, 6))
series.plot(label='Actual')
forecast.plot(label='Forecast', color='orange')
plt.xlabel('Date')
plt.ylabel('Footfall')
plt.title('Hourly Forecast vs Actual')
plt.legend()
plt.show()

# Additional Feature Engineering - Adding datetime attributes (optional).
# The hourly index is built once and shared by all three attribute series.
hourly_index = pd.date_range(start=series.start_time(), freq='H', periods=len(series))
day_series = datetime_attribute_timeseries(hourly_index, attribute='day')
month_series = datetime_attribute_timeseries(hourly_index, attribute='month')
year_series = datetime_attribute_timeseries(hourly_index, attribute='year')

# Check for seasonality. max_lag is raised to 48 (two days) so that a 24-hour
# cycle falls inside the inspected lag range rather than on its edge.
# NOTE(review): the library default is understood to be max_lag=24 - confirm.
is_seasonal, seasonality_period = check_seasonality(series, max_lag=48)

if is_seasonal:
    print(f"Seasonality detected with a period of {seasonality_period}.")
else:
    print("No significant seasonality detected within the first 48 lags.")

