In [11]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import mean_absolute_error, mean_squared_error,mean_absolute_percentage_error, r2_score
from statsmodels.tsa.arima.model import ARIMA
from statsmodels.tsa.holtwinters import ExponentialSmoothing
from statsmodels.tsa.statespace.sarimax import SARIMAX
from statsmodels.tsa.statespace.dynamic_factor import DynamicFactor
from prophet import Prophet
from statsmodels.tsa.forecasting.theta import ThetaModel
from sklearn.model_selection import train_test_split
import openai
from pydantic import BaseModel, Field
import re
import json

In [12]:
# OpenAI API Key (Replace with your own key)
OPENAI_API_KEY = "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"
client = openai.OpenAI(api_key=OPENAI_API_KEY)

In [13]:
# 1. Load & Transform Data
def load_and_transform(file_path, sheet_name=0):
    df = pd.read_excel(file_path, sheet_name=sheet_name)
    df = df.melt(id_vars=[df.columns[0]], var_name="Month", value_name="Value")
    df.columns = ["Item", "Month", "Value"]
    df["Month"] = pd.to_datetime(df["Month"], format="%b-%y")  # Adjust format as needed
    return df

In [14]:


class ForecastEntry(BaseModel):
    month: str
    value: float

class ForecastResponse(BaseModel):
    forecast: list[ForecastEntry]
    summary: str






def forecast_with_openai(train, forecast_periods):
    history_text = "\n".join(
        f"In {row.Month.strftime('%b-%y')}, the value was {row.Value}." for _, row in train.iterrows()
    )

    prompt = f"""
    Here is a time-series of financial data:
    {history_text}
    Based on the above pattern, predict the next {forecast_periods} months and provide a summary explanation of the forecast.
    The summary should be a little detailed. How is the trend and seasonality, how they are affecting the months, etc.
    Don't use any model or code, use natural reasoning ability for forecasting.
    Return the response as a JSON object with keys 'forecast' and 'summary'.
    """
    response = client.beta.chat.completions.parse(
        model="gpt-4o",
        messages=[{"role": "user", "content": prompt}],
        response_format={
            "type": "json_schema",
            "json_schema": {
                "name": "forecast_schema",  # Add a descriptive name for your schema
                "schema": {
                    "type": "object",
                    "properties": {
                        "forecast": {
                            "type": "array",
                            "items": {
                                "type": "object",
                                "properties": {
                                    "month": {"type": "string"},
                                    "value": {"type": "number"}
                                },
                                "required": ["month", "value"]
                            }
                        },
                        "summary": {"type": "string"}
                    },
                    "required": ["forecast", "summary"],
                    "additionalProperties": False
                }
            }
        },
        store=True
    )

    response_data = json.loads(response.choices[0].message.content)
    print("==== response ====")
    print(response_data)
    parsed_forecast = ForecastResponse(**response_data)
    return parsed_forecast.forecast, parsed_forecast.summary



In [15]:
# Forecasting Function
def forecast_time_series(df, target_item, forecast_periods=12):
    data = df[df["Item"] == target_item][["Month", "Value"]].sort_values("Month")
    train, test = train_test_split(data, test_size=forecast_periods, shuffle=False)

    models = {
        "ARIMA": ARIMA(train["Value"], order=(1, 1, 1)).fit(),
        "Holt-Winters": ExponentialSmoothing(train["Value"], trend="add", seasonal="add", seasonal_periods=12).fit(),
        "SARIMA": SARIMAX(train["Value"], order=(1, 1, 1), seasonal_order=(1, 1, 1, 12)).fit(),
        "Prophet": Prophet().fit(train.rename(columns={"Month": "ds", "Value": "y"})),
        # "Theta": ThetaModel(train["Value"]).fit(),
        # "Dynamic Factor": DynamicFactor(train["Value"], k_factors=1).fit()
    }

    results = {}
    test_df = pd.DataFrame({"ds": test["Month"]})

    for name, model in models.items():
        if name == "Prophet":
            forecast = model.predict(test_df)["yhat"]
        elif name == "Dynamic Factor":
            forecast = model.predict(start=len(train), end=len(train) + len(test) - 1)
        else:
            forecast = model.forecast(steps=len(test))

        results[name] = {
            "Forecast": forecast.values,
            "MAE": round(mean_absolute_error(test["Value"], forecast), 3),
            "RMSE": round(np.sqrt(mean_squared_error(test["Value"], forecast)),3),
            "MAPE": round(mean_absolute_percentage_error(test["Value"], forecast), 3),
            "R2": round(r2_score(test["Value"], forecast),3)
            
        }

    # OpenAI Forecasting

    openai_forecast, forecast_summary = forecast_with_openai(train, forecast_periods)
    forecast_values = np.array([entry.value for entry in openai_forecast])

    

    results["OpenAI"] = {
        "Forecast": openai_forecast,
        "MAE": round(mean_absolute_error(test["Value"], forecast_values), 3),
        "RMSE": round(np.sqrt(mean_squared_error(test["Value"], forecast_values)), 3),
        "MAPE": round(mean_absolute_percentage_error(test["Value"], forecast_values), 3),
        "R2": round(r2_score(test["Value"], forecast_values),3),
        "Summary": forecast_summary
    }

    return train, test, results

In [16]:
# Visualization
# Visualization
def plot_results(train, test, results):
    plt.figure(figsize=(12, 12))
    plt.plot(train["Month"], train["Value"], label="Train Data", color="blue")
    plt.plot(test["Month"], test["Value"], label="Test Data", color="black", linestyle="dashed")

    for name, res in results.items():
        if name == "OpenAI":
            # Extract values from ForecastEntry objects
            forecast_values = np.array([entry.value for entry in res["Forecast"]], dtype=float)
        else:
            forecast_values = np.array(res["Forecast"], dtype=float)
        plt.plot(test["Month"], forecast_values, label=name)


    plt.legend()
    plt.xlabel("Month")
    plt.ylabel("Value")
    plt.title("Forecasting Comparison (Including OpenAI)")
    plt.grid()
    plt.show()

    metrics_df = pd.DataFrame({name: [res["MAE"], res["RMSE"], res["MAPE"],res["R2"]] for name, res in results.items()},
                              index=["MAE", "RMSE","MAPE","R2"]).T
    return metrics_df

In [None]:
# Usage Example
file_path = "Datasets/SF229792CFL.xlsx"
df = load_and_transform(file_path)
train, test, results = forecast_time_series(df, "Revenue")
print("\n==== Result ====")
print(results)
metrics_df = plot_results(train, test, results)

display(metrics_df)




print("\n=== OpenAI Forecast Summary ===")
print(results["OpenAI"]["Summary"])

ValueError: test_size=12 should be either positive and smaller than the number of samples 0 or a float in the (0, 1) range