In [1]:
# !pip install pystan==2.19.1.1
# !pip install fbprophet
# !pip install plotly
# !pip install --upgrade pretty-confusion-matrix --user

# data path in VM local file system: home/data/in/ggtrends_downloads, home/data/in/weather_downloads
# data path in HDFS: hdfs://cluster-bda4-m/user/root/project/data/in/ggtrends, hdfs://cluster-bda4-m/user/root/project/data/in/weather

# To upload data from VM local file system to HDFS execute command similar to the following one:
# hdfs dfs -copyFromLocal home/data/in/ggtrends_downloads/Instagram_Hamburg_historic.csv hdfs://cluster-bda4-m/user/root/project/data/in/ggtrends

In [18]:
from pyspark.ml import Pipeline
from pyspark.sql import SparkSession
from pyspark.sql import Window
from functools import reduce
from pyspark.sql import DataFrame
from pyspark.sql import functions as F
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.feature import MinMaxScaler
from pyspark.sql.types import FloatType

from fbprophet import Prophet

import os
import numpy as np
import pandas as pd
from statsmodels.tsa.stattools import adfuller
import matplotlib.pyplot as plt

from sklearn import metrics
import seaborn as sns
import time

import os
import ast
import re
from datetime import datetime
from pyspark.sql import SparkSession
from concurrent.futures import TimeoutError

from fbprophet.serialize import model_to_json, model_from_json

from tqdm.notebook import tqdm

In [11]:
# Parameters section

# credentials_path = 'weather-based-forecasting-v2-c4bde37656a7.json'
# os.environ['GOOGLE_APPLICATION_CREDENTIALS'] = credentials_path
timeout = 5.0

model_variables = ["Temperature", "Relative Humidity", "Wind Speed", "ds"]

In [12]:
# Load pre-trained model from HDFS

with open('serialized_model.json', 'r') as fin:
    m = model_from_json(fin.read())  # Load model (test)
    
print(m)

<fbprophet.forecaster.Prophet object at 0x7f4f0ae1e410>


In [13]:
spark = SparkSession \
    .builder \
    .appName("Time series data analysis with Spark") \
    .config("spark.redis.ssl", "true") \
    .getOrCreate()

In [14]:
from google.cloud import pubsub_v1

In [15]:
# Auxiliary functions


def trim_colnames(df):
    colnames = [re.sub("[^a-zA-Z0-9,]", "", i) for i in df.columns]
    df.columns = colnames
    return df


def assign_class(value, decision_boundaries):
    for _range, _class in decision_boundaries.items():
        # for every pair that you see in table
        if _range[0] <= value < _range[1]:
            return _class
        
        
def predict_class(model, df):
    pred = model.predict(df.toPandas())
    decision_boundaries = {
        (-np.inf, -pred.yhat.quantile(0.9)): -1,
        (-pred.yhat.quantile(0.9), pred.yhat.quantile(0.9)): 0,
        (pred.yhat.quantile(0.9), np.inf): 1,
    }

    pred['pred_class'] = pred.yhat.apply((lambda x: assign_class(x, decision_boundaries)))
    
    return pred[['ds', 'pred_class']]


In [16]:
weather_batch = []

def weather_callback(message):

    print(f'Received weather message')
    message.ack()
    weather_batch.append(message.data)

In [None]:
check_count = 0
weather_batch = []
prev_length = len(weather_batch)

max_check = 3

while True:
    if check_count >= max_check: break
    
    # Initialize subscriber
    subscriber = pubsub_v1.SubscriberClient()

    # Weather
    subscription_path_WD = 'projects/weather-based-forecasting-v2/subscriptions/python_forecast_sub'
    streaming_pull_future_WD = subscriber.subscribe(
        subscription_path_WD, callback=weather_callback
    )
    print(f'Listening for messages on {subscription_path_WD}')

    try:
        streaming_pull_future_WD.result(timeout=120)  # wrap subscriber in a 'with' block to automatically call close() when done
    except TimeoutError:        
        streaming_pull_future_WD.cancel()  # trigger the shutdown
        streaming_pull_future_WD.result()  # block until the shutdown is complete
        
    if len(weather_batch) > prev_length:
        
        cnt_diff = len(weather_batch) - prev_length
        
        new_batches = weather_batch[-cnt_diff:]
        
        batch_df = pd.DataFrame()

        for weather_obs in tqdm(new_batches):
            batch_dict = ast.literal_eval(weather_obs.decode('UTF-8'))
            batch_obs_list = list(batch_dict.values())
            for row in tqdm(batch_obs_list):
                row_df = pd.DataFrame([row])

                row_df["ds"] = pd.to_datetime(row_df["Date time"])
                row_df = row_df[model_variables]
                row_df = trim_colnames(row_df)

                batch_df = pd.concat([batch_df, row_df]).reset_index(drop=True)

        # conversion to Spark dataframe
        batch_DF = spark.createDataFrame(batch_df)

        # Prediction

        # transforming a regression problem into a classification one
        pred = predict_class(m, batch_DF)

        # Saving prediction in HDFS
        ts = str(datetime.now().timestamp()).replace(".", "-")
        path = f"hdfs://cluster-bda2-m/user/root/predictions/forecasts/{ts}.parquet"

        spark.createDataFrame(pred).write.parquet(path)
        
        # Update
        prev_length = len(weather_batch)
    else:
        check_count += 1
        print(f"No new messages - check number: {check_count}.")
        time.sleep(60)


Listening for messages on projects/weather-based-forecasting-v2/subscriptions/python_forecast_sub
Received weather message
Received weather message


INFO:google.api_core.bidi:Thread-ConsumeBidirectionalStream exiting


  0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/720 [00:00<?, ?it/s]

  0%|          | 0/720 [00:00<?, ?it/s]

Listening for messages on projects/weather-based-forecasting-v2/subscriptions/python_forecast_sub
Received weather message


INFO:google.api_core.bidi:Thread-ConsumeBidirectionalStream exiting


  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/720 [00:00<?, ?it/s]

Listening for messages on projects/weather-based-forecasting-v2/subscriptions/python_forecast_sub


INFO:google.api_core.bidi:Thread-ConsumeBidirectionalStream exiting


No new messages - check number: 2.
Listening for messages on projects/weather-based-forecasting-v2/subscriptions/python_forecast_sub


INFO:google.api_core.bidi:Thread-ConsumeBidirectionalStream exiting


No new messages - check number: 3.
Listening for messages on projects/weather-based-forecasting-v2/subscriptions/python_forecast_sub


INFO:google.api_core.bidi:Thread-ConsumeBidirectionalStream exiting


No new messages - check number: 4.


In [25]:
# # Initialize subscriber
# subscriber = pubsub_v1.SubscriberClient()

# # Weather
# subscription_path_WD = 'projects/weather-based-forecasting-v2/subscriptions/python_forecast_sub'
# streaming_pull_future_WD = subscriber.subscribe(
#     subscription_path_WD, callback=weather_callback
# )
# print(f'Listening for messages on {subscription_path_WD}')

In [21]:
# # Reading messages from topics using multiple subscribers
# weather_batch = []

# with subscriber:
#     try:
#         streaming_pull_future_WD.result(timeout=120)  # wrap subscriber in a 'with' block to automatically call close() when done
#     except TimeoutError:        
#         streaming_pull_future_WD.cancel()  # trigger the shutdown
#         streaming_pull_future_WD.result()  # block until the shutdown is complete

In [22]:
# # Process a batch of stream data (e.g., forecast data)
# # TODO: bieżące trackowanie długości listy wiadomości i powtarzanie procesu przy zmianie

# batch_df = pd.DataFrame()

# for weather_obs in tqdm(weather_batch):
#     batch_dict = ast.literal_eval(weather_obs.decode('UTF-8'))
#     batch_obs_list = list(batch_dict.values())
#     for row in tqdm(batch_obs_list):
#         row_df = pd.DataFrame([row])
        
#         row_df["ds"] = pd.to_datetime(row_df["Date time"])
#         row_df = row_df[model_variables]
#         row_df = trim_colnames(row_df)
        
#         batch_df = pd.concat([batch_df, row_df]).reset_index(drop=True)
        
# # conversion to Spark dataframe
# batch_DF = spark.createDataFrame(batch_df)

In [23]:
# # Prediction

# # transforming a regression problem into a classification one
# pred = predict_class(m, batch_DF)

# # Saving prediction in HDFS
# ts = str(datetime.now().timestamp()).replace(".", "-")
# path = f"hdfs://cluster-bda2-m/user/root/predictions/forecasts/{ts}.parquet"
    
# spark.createDataFrame(pred).write.parquet(path)