In [None]:
import pandas as pd
from prophet import Prophet, diagnostics

# Read the data
ped_df_2023 = pd.read_excel('/home/arasal/group-14-sustainable-city-management/raw_data/pedestrian-counts-1-jan-31-december-2023.xlsx')
ped_df_2024 = pd.read_csv('/home/arasal/group-14-sustainable-city-management/raw_data/pedestrian-counts-1-jan-1-feburary-2024.csv')

# Function to transform data
def transform_data(df):
    return df.melt(id_vars='Time', value_vars=list(df.columns)[1:], value_name='y')

# Aggregate data per location
aggregated_data_2023 = ped_df_2023.groupby(ped_df_2023.columns[0]).sum().reset_index()
aggregated_data_2024 = ped_df_2024.groupby(ped_df_2024.columns[0]).sum().reset_index()

# Transform aggregated data
melted_data_2023 = transform_data(aggregated_data_2023)
melted_data_2024 = transform_data(aggregated_data_2024)

# Drop rows with 'OUT' and 'IN' in the variable column
melted_data_2023 = melted_data_2023[~melted_data_2023['variable'].str.endswith(('OUT', 'IN'))]
melted_data_2024 = melted_data_2024[~melted_data_2024['variable'].str.endswith(('OUT', 'IN'))]

# Rename columns
new_names = {'Time': 'ds', 'variable': 'location'}
melted_data_2023.rename(columns=new_names, inplace=True)
melted_data_2024.rename(columns=new_names, inplace=True)

# Create and train the model
model = Prophet()
model.fit(melted_data_2023[['ds', 'y']])

# Make future dataframe
future_df = model.make_future_dataframe(periods=365)

# Predictions
#predictions = model.predict(future_df)

# Cross-validation
df_cv = diagnostics.cross_validation(model, horizon='90 days', period='1 hour', initial='180 days')

# Performance metrics
print(diagnostics.performance_metrics(df_cv).head())


In [None]:
from prophet.serialize import model_to_json

In [None]:
with ('pedestrial_model_json.json', 'w') as fout:5
    fout.write(model_to_json(model))

In [4]:
import pandas as pd
from prophet import Prophet
import os
import pickle

# Read the data
ped_df_2023 = pd.read_excel('/home/arasal/group-14-sustainable-city-management/raw_data/pedestrian-counts-1-jan-31-december-2023.xlsx')
ped_df_2024 = pd.read_csv('/home/arasal/group-14-sustainable-city-management/raw_data/pedestrian-counts-1-jan-1-feburary-2024.csv')

# Function to transform data
def transform_data(df):
    return df.melt(id_vars='Time', value_vars=list(df.columns)[1:], value_name='y')

# Aggregate data per location
def aggregate_data_per_location(df):
    aggregated_data = df.groupby(df.columns[0]).sum().reset_index()
    return aggregated_data

# Drop rows with 'OUT' and 'IN' in the variable column
def drop_out_in_rows(df):
    return df[~df['variable'].str.endswith(('OUT', 'IN'))]

# Rename columns
def rename_columns(df):
    return df.rename(columns={'Time': 'ds', 'variable': 'location'})

# Train Prophet model for each location and store models in pickle files
def train_models_and_save(df, save_path):
    unique_locations = df['location'].unique()
    for location in unique_locations:
        # Filter data for current location
        location_df = df[df['location'] == location]
        
        # Check if there are at least two non-NaN rows for the location
        if location_df['y'].notnull().sum() < 2:
            print(f"Skipping location '{location}' due to insufficient data.")
            continue
        
        # Initialize Prophet model
        model = Prophet()
        
        # Fit the model
        model.fit(location_df[['ds', 'y']])
        
        # Save the model to pickle file
        location = location.replace(" ", '_').replace('/','_')
        with open(f'{save_path}/{location}_model.pickle', 'wb') as f:
            pickle.dump(model, f)
            print(f"Model for location '{location}' saved successfully.")

In [None]:
# Transform, aggregate, and clean the data
melted_data_2023 = transform_data(ped_df_2023)
melted_data_2024 = transform_data(ped_df_2024)
melted_data_2023 = drop_out_in_rows(melted_data_2023)
melted_data_2024 = drop_out_in_rows(melted_data_2024)
melted_data_2023 = rename_columns(melted_data_2023)
melted_data_2024 = rename_columns(melted_data_2024)

# Train models for 2023 data and save them
save_path = './models'
os.makedirs(save_path, exist_ok=True)  # Ensure the directory exists or create it if not

train_models_and_save(melted_data_2023, save_path)

In [None]:
# Load trained Prophet models from pickle files
from prophet.diagnostics import cross_validation, performance_metrics

def load_models(model_dir):
    models = {}
    for filename in os.listdir(model_dir):
        if filename.endswith("_model.pickle"):
            location = filename.replace("_model.pickle", "").replace("_", "/")
            with open(os.path.join(model_dir, filename), 'rb') as f:
                models[location] = pickle.load(f)
    return models

# Make predictions using the loaded models
def make_predictions(models, df_2024):
    predictions = {}
    for location, model in models.items():
        # Filter data for current location
        location_df = df_2024[df_2024['location'] == location]
        future = model.make_future_dataframe(periods=365)
        forecast = model.predict(future)
        #print(forecast)
        predictions[location] = forecast[['ds', 'yhat']]
    return predictions

# Evaluate predictions against the actual values
def evaluate_predictions(predictions, df_2024):
    evaluation_results = {}
    for location, pred_df in predictions.items():
        location_df = df_2024[df_2024['location'] == location]
        merged_df = pred_df.merge(location_df, on='ds', suffixes=('_pred', '_actual'))
        # Compute evaluation metrics (e.g., MAE, RMSE)
        # For example:
        mae = (merged_df['yhat'] - merged_df['y']).abs().mean()
        rmse = ((merged_df['yhat'] - merged_df['y']) ** 2).mean() ** 0.5
        evaluation_results[location] = {'MAE': mae, 'RMSE': rmse}
    return evaluation_results

# Transform, clean, and rename the 2024 dataset
melted_data_2024 = transform_data(ped_df_2024)
melted_data_2024 = drop_out_in_rows(melted_data_2024)
melted_data_2024 = rename_columns(melted_data_2024)

# Load trained models
model_dir = './models'
trained_models = load_models(model_dir)

for k, c in trained_models.items():

    df_cv = cross_validation(trained_models[k], horizon='90 days', period='1 hour', initial='180 days')
    df_p = performance_metrics(df_cv)
    print(df_p.head())

  0%|          | 0/2280 [00:00<?, ?it/s]

14:59:17 - cmdstanpy - INFO - Chain [1] start processing
14:59:17 - cmdstanpy - INFO - Chain [1] done processing
14:59:17 - cmdstanpy - INFO - Chain [1] start processing
14:59:18 - cmdstanpy - INFO - Chain [1] done processing
14:59:18 - cmdstanpy - INFO - Chain [1] start processing
14:59:18 - cmdstanpy - INFO - Chain [1] done processing
14:59:19 - cmdstanpy - INFO - Chain [1] start processing
14:59:19 - cmdstanpy - INFO - Chain [1] done processing
14:59:20 - cmdstanpy - INFO - Chain [1] start processing
14:59:20 - cmdstanpy - INFO - Chain [1] done processing
14:59:21 - cmdstanpy - INFO - Chain [1] start processing
14:59:21 - cmdstanpy - INFO - Chain [1] done processing
14:59:21 - cmdstanpy - INFO - Chain [1] start processing
14:59:21 - cmdstanpy - INFO - Chain [1] done processing
14:59:22 - cmdstanpy - INFO - Chain [1] start processing
14:59:22 - cmdstanpy - INFO - Chain [1] done processing
14:59:23 - cmdstanpy - INFO - Chain [1] start processing
14:59:23 - cmdstanpy - INFO - Chain [1]