In [1]:
from bin.config import *
from bin.forecasting import Model
from bin.producer import Producer
from bin.consumer import Consumer

In [2]:
# Each entry is a set of keyword arguments for `timedelta`, ordered from the
# longest span to the shortest.
# NOTE(review): presumably these are look-back windows for the model's
# history; the meaning is defined by `Model`, which is not visible here.
LIMIT_DATETIME_VALUES: list[dict[str, int]] = [
    {"days": day_count} for day_count in (7, 3, 1)
]

In [3]:
# Build (or reuse an already running) Spark session for this notebook.
# The builder is configured step by step so each option sits on its own line.
_session_builder = SparkSession.builder.appName("Forecasting App")  # type: ignore
# Extra packages are passed to Spark as one comma-separated string.
_session_builder = _session_builder.config(
    "spark.jars.packages", ",".join(SPARK_PACKAGES)
)
_session_builder = _session_builder.config(
    "spark.sql.legacy.timeParserPolicy", "CORRECTED"
)
spark_session: SparkSession = _session_builder.getOrCreate()

In [4]:
# Input side: consumer bound to the capture topic and its schema.
con: Consumer = Consumer(
    spark_session=spark_session,
    schema_list=CAPTURE_SCHEMA_LIST,
    topic=CAPTURE_TOPIC,
)

# Output side: producer bound to the prediction topic and its schema.
pro: Producer = Producer(
    spark_session=spark_session,
    schema_list=PREDICTION_SCHEMA_LIST,
    topic=PREDICTION_TOPIC,
)

In [5]:
def forecasting_traffic(history_df: pd.DataFrame) -> pd.DataFrame:
    """Fit a model on one series' history and return its labelled forecast.

    This is the per-group function handed to ``applyInPandas`` by
    ``group_by__forecasting``, so ``history_df`` holds the rows of a single
    ``(SegmentID, Direction)`` group.

    Args:
        history_df: History rows of one series. Must be non-empty and contain
            the ``SegmentID``, ``Direction`` and ``ds`` columns. Any further
            columns required by ``Model.fit`` are not visible from here
            (presumably ``y`` -- confirm against ``Model``).

    Returns:
        A copy of the frame produced by ``Model.forecasting()`` with four
        columns added: ``SegmentID``, ``Direction``, ``min_history_ds``
        (``model.range_history[0]``) and ``max_history_ds`` (the largest
        ``ds`` found in ``history_df``).

    Raises:
        IndexError: If ``history_df`` has no rows.
    """
    # Positional lookup on purpose: `.at[0, ...]` is label-based and raises
    # KeyError whenever the incoming index has no label 0.
    segment_id_value: int = history_df["SegmentID"].iloc[0]
    direction_value: str = history_df["Direction"].iloc[0]

    max_his_ds: datetime = history_df["ds"].max()

    # The first entry of LIMIT_DATETIME_VALUES is expanded into a timedelta
    # and handed to the model constructor.
    model: Model = Model(timedelta(**LIMIT_DATETIME_VALUES[0]))
    model.fit(history_df)
    future_pred_df: pd.DataFrame = model.forecasting()

    # One `assign` call adds all labels the same way and leaves the frame
    # returned by the model untouched.
    return future_pred_df.assign(
        SegmentID=segment_id_value,
        Direction=direction_value,
        min_history_ds=model.range_history[0],
        max_history_ds=max_his_ds,
    )


In [6]:
def group_by__forecasting(df: DataFrame) -> DataFrame:
    """Forecast every ``(SegmentID, Direction)`` series of a Spark DataFrame.

    Steps, in order:

    1. Rename ``Timestamp`` -> ``ds`` and ``Vol`` -> ``y``.
    2. Group by ``SegmentID`` and ``Direction`` and run
       ``forecasting_traffic`` on each group through ``applyInPandas``,
       using the schema joined from ``FORECASTING_SCHEMA_LIST``.
    3. Rename ``ds`` -> ``Timestamp`` and ``yhat`` -> ``prediction_vol``.
    4. Add a ``prediction_ds`` column holding the current timestamp.

    Args:
        df: Input Spark DataFrame; must provide the ``Timestamp``, ``Vol``,
            ``SegmentID`` and ``Direction`` columns.

    Returns:
        The Spark DataFrame of forecasts described above.
    """
    renamed_history: DataFrame = df.withColumnsRenamed(
        {"Timestamp": "ds", "Vol": "y"}
    )
    series_groups = renamed_history.groupBy("SegmentID", "Direction")

    # applyInPandas takes the output schema as one comma-separated string.
    output_schema: str = ", ".join(FORECASTING_SCHEMA_LIST)
    raw_forecast: DataFrame = series_groups.applyInPandas(
        func=forecasting_traffic,
        schema=output_schema,
    )

    labelled_forecast: DataFrame = raw_forecast.withColumnsRenamed(
        {"ds": "Timestamp", "yhat": "prediction_vol"}
    )
    return labelled_forecast.withColumn("prediction_ds", F.current_timestamp())


In [None]:
# NOTE(review): the history DataFrame is obtained once, before the loop.
# Whether each iteration sees fresh data depends on how
# `Consumer.get_history_df` builds it -- confirm.
history_df: DataFrame = con.get_history_df()

# Endless forecasting cycle: forecast, publish, report how long the iteration
# took, then wait out the remainder of DELAY seconds.
while True:
    start_time_point_: datetime = datetime.now()
    future_prediction_df: DataFrame = group_by__forecasting(history_df)
    pro.store_dataframe_to_kafka(future_prediction_df)
    # Named `elapsed_seconds` rather than `time` so the stdlib module name
    # is not shadowed in the notebook namespace.
    elapsed_seconds: float = (datetime.now() - start_time_point_).total_seconds()
    print(elapsed_seconds)
    # Never sleep a negative amount when an iteration overruns DELAY.
    sleeping_time: float = max(DELAY - elapsed_seconds, 0)
    sleep(sleeping_time)

60.613925
41.946724
39.883609
101.760841
36.762458
39.627119
36.995126
39.621901
40.326314
38.649536
48.633052
51.153944
45.969296
40.159911
38.616403
36.161473
46.925173
42.254812
41.627194
49.011821
37.449618
43.374959
43.134062
43.579632
47.498931
35.081412
39.067044
43.622305
38.040624
37.899777
38.394969
37.60574
36.911409
39.78108
37.247215
37.146867
37.955538
36.991289
37.26367
38.312235
36.193139
36.976447
