In [1]:
from bin.config import *
from bin.forecasting import Model
from bin.producer import Producer
from bin.consumer import Consumer

In [2]:
LIMIT_DATETIME_VALUES: list[dict[str, int]] = [
    {"days": 7},
    {"days": 3},
    {"days": 1},
]

In [3]:
spark_session: SparkSession = (
    SparkSession.builder.appName("Forecasting App")  # type: ignore
    .config("spark.jars.packages", ",".join(SPARK_PACKAGES))
    .config("spark.sql.legacy.timeParserPolicy", "CORRECTED")
    .getOrCreate()
)


In [4]:
con: Consumer = Consumer(CAPTURE_TOPIC, CAPTURE_SCHEMA_LIST, spark_session)
pro: Producer = Producer(PREDICTION_TOPIC, PREDICTION_SCHEMA_LIST, spark_session)


In [5]:
def forecasting_traffic(history_df: pd.DataFrame) -> pd.DataFrame:
    segment_id_value: int = history_df.at[0, "SegmentID"]
    direction_value: str = history_df.at[0, "Direction"]

    max_his_ds: datetime = history_df["ds"].max()

    model: Model = Model(timedelta(**LIMIT_DATETIME_VALUES[2]))
    model.fit(history_df)
    future_pred_df: pd.DataFrame = model.forecasting()

    future_pred_df["SegmentID"] = segment_id_value
    future_pred_df["Direction"] = direction_value

    future_pred_df.loc[:, "min_history_ds"] = model.range_history[0]
    future_pred_df.loc[:, "max_history_ds"] = max_his_ds

    return future_pred_df


In [6]:
def group_by__forecasting(df: DataFrame) -> DataFrame:
    pred_df: DataFrame = (
        df.withColumnsRenamed({"Timestamp": "ds", "Vol": "y"})
        .groupBy("SegmentID", "Direction")
        .applyInPandas(
            func=forecasting_traffic,
            schema=", ".join(FORECASTING_SCHEMA_LIST),
        )
        .withColumnsRenamed({"ds": "Timestamp", "yhat": "prediction_vol"})
        .withColumn("prediction_ds", F.current_timestamp())
    )

    return pred_df


In [7]:
history_df: DataFrame = con.get_history_df()

while True:
    start_time_point_: datetime = datetime.now()
    future_prediction_df: DataFrame = group_by__forecasting(history_df)
    pro.store_dataframe_to_kafka(future_prediction_df)
    time: float = (datetime.now() - start_time_point_).total_seconds()
    print(time)
    sleeping_time: float = max(DELAY * 4 - time, 0)
    sleep(sleeping_time)

69.310463
58.293239
54.293513
59.895232
62.527391
59.052255
60.279827
49.378019
56.192061
45.369206
41.036297
42.423481
44.608354
48.193988
42.718109
37.22834
83.109899
49.020806
43.645338
42.225739
40.057337
39.557947
38.743521
37.715431
40.824382


KeyboardInterrupt: 