In [1]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score
import joblib
import os
from data_split import read_json_from_local, prepare_data, split_data
import matplotlib.pyplot as plt

In [2]:
# Load and prepare data
file_path = 'file_audit.json'  # Make sure this path is correct
data = read_json_from_local(file_path)
prepared_data = prepare_data(data)

print("Columns in prepared_data:")
print(prepared_data.columns.tolist())
print("Shape of prepared_data:", prepared_data.shape)

# Reshape from one row per (date, cycle) to one row per date, with a separate
# column for every (measure, cycle) combination.
index_columns = ['date', 'day_of_week', 'month', 'is_weekday']
value_columns = ['file_count', 'total_records']
cycle_columns = ['file_count_AM', 'file_count_PM',
                 'total_records_AM', 'total_records_PM']

prepared_data = prepared_data.pivot(index=index_columns,
                                    columns='cycle',
                                    values=value_columns).reset_index()

# Flatten the two-level column labels produced by pivot. Deriving the names
# from the labels (instead of assigning a hard-coded positional list) keeps
# every column correctly labelled regardless of the order pivot emits them in.
# Index columns carry an empty second level after reset_index().
prepared_data.columns = [
    f"{value}_{cycle}" if cycle else value
    for value, cycle in prepared_data.columns
]

missing_columns = [name for name in cycle_columns if name not in prepared_data.columns]
assert not missing_columns, f"Pivot did not produce expected columns: {missing_columns}"

# Handle NaN values
print("\nNaN values before handling:")
print(prepared_data.isna().sum())

# Fill NaN values with the per-column median.
# A single DataFrame.fillna call replaces the chained
# `prepared_data[column].fillna(..., inplace=True)` pattern, which operates on
# an intermediate object and stops working in pandas 3.0.
# NOTE(review): medians are computed on the full data set before the
# train/test split, so test rows influence the imputed values - confirm this
# is acceptable.
# NOTE(review): a missing cycle may simply mean no files arrived; confirm the
# median (rather than 0) is the intended fill value.
column_medians = prepared_data[cycle_columns].median()
prepared_data = prepared_data.fillna(column_medians)

print("\nNaN values after handling:")
print(prepared_data.isna().sum())

print("\nColumns after restructuring:")
print(prepared_data.columns.tolist())
print("Shape after restructuring:", prepared_data.shape)

# Split the data for training and testing
train_data, test_data = split_data(prepared_data)

print(f"\nTraining data shape: {train_data.shape}")
print(f"Testing data shape: {test_data.shape}")

Processing dates and times...
Filtering data for the last 5 years...
Adding AM/PM cycle...
Grouping data by date and cycle...
Renaming columns...
Adding features...
Sorting data...
Data preparation complete.
Columns in the prepared data: ['date', 'cycle', 'file_count', 'total_records', 'day_of_week', 'month', 'is_weekday']
Shape of the prepared data: (3291, 7)
Columns in prepared_data:
['date', 'cycle', 'file_count', 'total_records', 'day_of_week', 'month', 'is_weekday']
Shape of prepared_data: (3291, 7)

NaN values before handling:
date                  0
day_of_week           0
month                 0
is_weekday            0
file_count_AM         4
file_count_PM       297
total_records_AM      4
total_records_PM    297
dtype: int64

NaN values after handling:
date                0
day_of_week         0
month               0
is_weekday          0
file_count_AM       0
file_count_PM       0
total_records_AM    0
total_records_PM    0
dtype: int64

Columns after restructuring:
['date', 

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['cycle'] = df['datetime'].dt.strftime('%p')
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  prepared_data[column].fillna(median_value, inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.meth

In [3]:
# Model inputs (calendar features) and the four quantities to predict
features = ['day_of_week', 'month', 'is_weekday']
targets = ['file_count_AM', 'file_count_PM', 'total_records_AM', 'total_records_PM']

# The feature matrices are the same for every target, so select them once.
train_features = train_data[features]
test_features = test_data[features]

# Fit one independent random forest per target and report its test-set MSE.
models = {}
for target in targets:
    # fit() returns the fitted estimator, so it can be stored directly.
    models[target] = RandomForestRegressor(n_estimators=100, random_state=42).fit(
        train_features, train_data[target]
    )

    # Evaluate on the held-out split
    test_predictions = models[target].predict(test_features)
    test_mse = mean_squared_error(test_data[target], test_predictions)
    print(f"Mean Squared Error for {target}: {test_mse}")

Mean Squared Error for file_count_AM: 5.109024511210895
Mean Squared Error for file_count_PM: 2.65724690051325
Mean Squared Error for total_records_AM: 26638259992681.17
Mean Squared Error for total_records_PM: 16331236166794.752


In [4]:
# Persist each trained model to disk as "<target>_model.joblib"
model_path = "models"
os.makedirs(model_path, exist_ok=True)  # no error if the directory already exists

for target_name, fitted_model in models.items():
    artifact_file = os.path.join(model_path, f"{target_name}_model.joblib")
    joblib.dump(fitted_model, artifact_file)

print(f"Models saved locally in: {model_path}")

Models saved locally in: models


In [5]:
# Function to make predictions using locally saved models
def predict_local(input_data):
    """Predict every target using the models stored on disk.

    For each name in the module-level ``targets`` list, the file
    ``<model_path>/<name>_model.joblib`` is loaded and its ``predict`` method is
    called on ``input_data``.

    Parameters
    ----------
    input_data
        Feature data passed unchanged to each model's ``predict``.

    Returns
    -------
    pandas.DataFrame
        One column per target holding that model's predictions.
    """
    def _load_saved_model(target_name):
        # Models are read back from disk on every call.
        return joblib.load(os.path.join(model_path, f"{target_name}_model.joblib"))

    per_target = {
        target_name: _load_saved_model(target_name).predict(input_data)
        for target_name in targets
    }
    return pd.DataFrame(per_target)

# Function to get actual and predicted values for a given date
def get_actual_and_predicted(date, prepared_data, models):
    """Return predicted and, when available, actual values for one date.

    Parameters
    ----------
    date : str or datetime-like
        Anything accepted by ``pd.to_datetime``. Any time-of-day component is
        dropped so the lookup is by calendar day.
    prepared_data : pandas.DataFrame
        Frame with a ``date`` column and one column per target.
    models : dict or None
        Mapping of target name -> fitted estimator exposing ``predict``. When
        non-empty, these in-memory models are used and determine the targets
        reported. When empty or ``None``, predictions come from
        ``predict_local`` (models on disk) for the module-level ``targets``.

    Returns
    -------
    dict
        ``{target: {'predicted': value, 'actual': value_or_'N/A'}}`` where
        ``'actual'`` is the string ``'N/A'`` if no row matches the date.

    Raises
    ------
    ValueError
        If ``date`` cannot be interpreted as a date.
    """
    requested = date
    date = pd.to_datetime(date)
    if pd.isna(date):
        # e.g. an empty string parses to NaT; fail clearly instead of feeding
        # NaN features to the models.
        raise ValueError(f"Could not interpret {requested!r} as a date")
    date = date.normalize()

    # Prepare input data for prediction
    input_data = pd.DataFrame({
        'day_of_week': [date.dayofweek],
        'month': [date.month],
        'is_weekday': [1 if date.dayofweek < 5 else 0]
    })

    # Make predictions: prefer the models handed in by the caller so they are
    # not reloaded from disk on every call.
    if models:
        predicted = {
            name: estimator.predict(input_data)[0]
            for name, estimator in models.items()
        }
    else:
        from_disk = predict_local(input_data)
        predicted = {name: from_disk[name].values[0] for name in targets}

    # Get actual values if available. Both sides are reduced to midnight
    # timestamps so the match works whether the column holds datetimes, plain
    # dates, or date strings.
    row_dates = pd.to_datetime(prepared_data['date']).dt.normalize()
    actual_data = prepared_data[row_dates == date]

    results = {}
    for name, predicted_value in predicted.items():
        results[name] = {
            'predicted': predicted_value,
            'actual': actual_data[name].values[0] if not actual_data.empty else 'N/A'
        }

    return results

In [None]:
# Interactive date input and prediction.
# NOTE: this cell blocks on input() until 'q' is entered, so it will stall a
# non-interactive "Run All".
while True:
    # Strip surrounding whitespace so entries such as " q" or "2025-09-29 "
    # behave as the user expects.
    date_str = input("Enter a date (YYYY-MM-DD) to predict for, or 'q' to quit: ").strip()
    if date_str.lower() == 'q':
        break

    # Parse the date separately so that only genuine parsing failures are
    # reported as a format problem; errors raised later by prediction or
    # formatting are reported by the generic handler below.
    try:
        date = pd.to_datetime(date_str)
    except ValueError:
        print("Invalid date format. Please use YYYY-MM-DD.")
        continue
    if pd.isna(date):
        # Empty input parses to NaT rather than raising.
        print("Invalid date format. Please use YYYY-MM-DD.")
        continue

    try:
        results = get_actual_and_predicted(date, prepared_data, models)

        print(f"\nPredictions and Actual Values for {date.date()}:")
        for target, values in results.items():
            # File counts are printed plainly; record totals get thousands separators.
            number_format = "" if 'file_count' in target else ","
            actual = values['actual']
            # 'N/A' is the only string the lookup returns for a missing actual.
            actual_text = 'N/A' if isinstance(actual, str) else format(int(actual), number_format)

            print(f"{target}:")
            print(f"  Predicted: {format(int(round(values['predicted'])), number_format)}")
            print(f"  Actual: {actual_text}")
            print()
    except Exception as e:
        print(f"An error occurred: {str(e)}")

Enter a date (YYYY-MM-DD) to predict for, or 'q' to quit:  2025-09-29



Predictions and Actual Values for 2025-09-29:
file_count_AM:
  Predicted: 4
  Actual: N/A

file_count_PM:
  Predicted: 3
  Actual: N/A

total_records_AM:
  Predicted: 7,678,487
  Actual: N/A

total_records_PM:
  Predicted: 4,787,655
  Actual: N/A

