In [None]:
# !pip install pystan==2.19.1.1
# !pip install fbprophet
# !pip install plotly
# !pip install --upgrade pretty-confusion-matrix --user

# data path in VM local file system: home/data/in/ggtrends_downloads, home/data/in/weather_downloads
# data path in HDFS: hdfs://cluster-bda4-m/user/root/project/data/in/ggtrends, hdfs://cluster-bda4-m/user/root/project/data/in/weather

# To upload data from the VM's local file system to HDFS, run a command similar to the following:
# hdfs dfs -copyFromLocal home/data/in/ggtrends_downloads/Instagram_Hamburg_historic.csv hdfs://cluster-bda4-m/user/root/project/data/in/ggtrends

In [1]:
from pyspark.ml import Pipeline
from pyspark.sql import SparkSession
from pyspark.sql import Window
from functools import reduce
from pyspark.sql import DataFrame
from pyspark.sql import functions as F
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.feature import MinMaxScaler
from pyspark.sql.types import FloatType

from fbprophet import Prophet

import os
import numpy as np
import pandas as pd
from statsmodels.tsa.stattools import adfuller
import matplotlib.pyplot as plt

from sklearn import metrics
import seaborn as sns
import time

import os
import ast
import re
from datetime import datetime
from pyspark.sql import SparkSession
from concurrent.futures import TimeoutError

from fbprophet.serialize import model_to_json, model_from_json

from tqdm.notebook import tqdm

  import pandas.util.testing as tm


In [2]:
# Parameters section: tunable values for the cells below.

# Google credentials can be supplied by uncommenting these two lines:
# credentials_path = 'weather-based-forecasting-v2-c4bde37656a7.json'
# os.environ['GOOGLE_APPLICATION_CREDENTIALS'] = credentials_path

# Columns kept from every incoming weather observation before prediction.
model_variables = [
    "Temperature",
    "Relative Humidity",
    "Wind Speed",
    "ds",
]

# NOTE(review): not referenced by the cells visible in this notebook section
# (the polling loop hard-codes its own timeout) — confirm whether still needed.
timeout = 5.0

In [3]:
# Load the pre-trained Prophet model from its JSON serialization.
# NOTE(review): open() is given a bare relative path, so the file is read from
# the kernel's local working directory rather than an hdfs:// URI — confirm
# where 'serialized_model.json' is expected to live.

with open('serialized_model.json', 'r') as fin:
    m = model_from_json(fin.read())  # rebuild the model object from the JSON text
    
print(m)

<fbprophet.forecaster.Prophet object at 0x7f74aefc9090>


In [4]:
# Obtain the SparkSession used by the streaming callback further down.
# NOTE(review): "spark.redis.ssl" is presumably read by a Redis connector;
# no Redis access is visible in this section — confirm it is still required.
spark = (
    SparkSession.builder
    .appName("Time series data analysis with Spark")
    .config("spark.redis.ssl", "true")
    .getOrCreate()
)

In [5]:
from google.cloud import pubsub_v1

In [6]:
# Auxiliary functions


def trim_colnames(df):
    """Strip every character except ASCII letters, digits and commas from column names.

    The frame is modified in place (its ``columns`` attribute is reassigned)
    and the very same object is returned for convenience.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose column labels are strings.

    Returns
    -------
    pandas.DataFrame
        ``df`` itself, with cleaned column labels.
    """
    unwanted_chars = re.compile("[^a-zA-Z0-9,]")
    cleaned = []
    for label in df.columns:
        cleaned.append(unwanted_chars.sub("", label))
    df.columns = cleaned
    return df


def assign_class(value, decision_boundaries):
    """Return the class whose half-open range ``[lower, upper)`` contains ``value``.

    Parameters
    ----------
    value : number
        The value to classify.
    decision_boundaries : dict
        Maps ``(lower, upper)`` pairs to class labels. Ranges are tried in the
        dict's iteration order and the first match wins.

    Returns
    -------
    The matching class label, or ``None`` when no range contains ``value``
    (for example when ``value`` is NaN).
    """
    matches = (
        label
        for bounds, label in decision_boundaries.items()
        if bounds[0] <= value < bounds[1]
    )
    return next(matches, None)
        
        
def predict_class(model, df):
    """Forecast with ``model`` and bucket each forecast into class -1, 0 or 1.

    The input is converted with ``df.toPandas()`` and passed to
    ``model.predict``. With ``q`` the 0.9 quantile of the resulting ``yhat``
    column, each ``yhat`` is labelled via ``assign_class`` using the ranges
    ``[-inf, -q) -> -1``, ``[-q, q) -> 0`` and ``[q, inf) -> 1``.

    NOTE(review): ``q`` is taken from the same batch that is being classified.
    For a one-row batch ``q`` equals that row's ``yhat``, so the label reduces
    to the sign of ``yhat`` — confirm this is intended for streamed
    single observations.

    Parameters
    ----------
    model : object exposing ``predict(pandas.DataFrame)``; the returned frame
        must contain ``yhat`` and ``ds`` columns.
    df : object exposing ``toPandas()`` (a Spark DataFrame in this notebook).

    Returns
    -------
    pandas.DataFrame
        Two columns: ``ds`` and ``pred_class``.
    """
    forecast = model.predict(df.toPandas())

    # Compute the cut-off once; all three ranges are derived from it.
    cutoff = forecast.yhat.quantile(0.9)
    class_by_range = {
        (-np.inf, -cutoff): -1,
        (-cutoff, cutoff): 0,
        (cutoff, np.inf): 1,
    }

    forecast['pred_class'] = forecast.yhat.apply(
        lambda yhat: assign_class(yhat, class_by_range)
    )

    return forecast[['ds', 'pred_class']]


In [7]:
# Predictions produced by weather_callback accumulate here.
weather_batch = []


def weather_callback(message):
    """Handle one weather message: parse it, predict its class, persist the result.

    The message is acknowledged immediately, before any processing. Its payload
    (``message.data``) is decoded as UTF-8 and parsed with
    ``ast.literal_eval``; the parsed observation must provide a "Date time"
    entry plus every column in ``model_variables`` other than "ds". The
    prediction is appended to the module-level ``weather_batch`` list and
    written to HDFS as a Parquet dataset named after the current timestamp.

    NOTE(review): because the ack happens first, a message whose processing
    raises is not redelivered — confirm this is intended.
    NOTE(review): the target host is ``cluster-bda2-m`` while the header
    comments of this notebook refer to ``cluster-bda4-m`` — confirm the cluster.

    Parameters
    ----------
    message : the message object handed to the subscriber callback; only its
        ``ack()`` method and ``data`` attribute are used.
    """
    print('Received weather message')
    message.ack()

    # Turn the textual payload into a one-row pandas frame.
    payload = ast.literal_eval(message.data.decode('UTF-8'))
    observation = pd.DataFrame([payload])

    # Keep only the model inputs, with "ds" parsed from the "Date time" field.
    observation["ds"] = pd.to_datetime(observation["Date time"])
    features = trim_colnames(observation[model_variables])

    # Real-time prediction: the regression output is mapped to a class label.
    features_sdf = spark.createDataFrame(features)
    pred = predict_class(m, features_sdf)

    weather_batch.append(pred)

    # Persist the prediction in HDFS under a timestamp-derived name.
    stamp = str(datetime.now().timestamp())
    ts = stamp.replace(".", "-")
    path = f"hdfs://cluster-bda2-m/user/root/predictions/observations/{ts}.parquet"

    pred_sdf = spark.createDataFrame(pred)
    pred_sdf.write.parquet(path)

In [8]:
# Poll the weather subscription in 120-second listening windows.
# After each window, compare the size of weather_batch with the size seen
# previously: growth means new messages arrived; otherwise the idle counter
# goes up. The loop ends once max_check idle windows have been counted.
check_count = 0
weather_batch = []
prev_length = len(weather_batch)

max_check = 3

# Loop-invariant, so defined once up front.
subscription_path_WD = 'projects/weather-based-forecasting-v2/subscriptions/python_obs_sub'

while check_count < max_check:
    # A fresh client per listening window. The `with` block calls close() on
    # exit, so the client is released even if the window ends with an
    # unexpected error (previously each iteration leaked an open client).
    subscriber = pubsub_v1.SubscriberClient()

    with subscriber:
        # Weather
        streaming_pull_future_WD = subscriber.subscribe(
            subscription_path_WD, callback=weather_callback
        )
        print(f'Listening for messages on {subscription_path_WD}')

        try:
            # Block while weather_callback handles messages in the background.
            streaming_pull_future_WD.result(timeout=120)
        except TimeoutError:
            streaming_pull_future_WD.cancel()  # trigger the shutdown
            streaming_pull_future_WD.result()  # block until the shutdown is complete

    if len(weather_batch) > prev_length:
        prev_length = len(weather_batch)
    else:
        check_count += 1
        print(f"No new messages - check number: {check_count}.")
        # Pause before the next window; pointless after the final check,
        # because the loop is about to exit anyway.
        if check_count < max_check:
            time.sleep(60)


Listening for messages on projects/weather-based-forecasting-v2/subscriptions/python_obs_sub
Received weather message
Received weather message


INFO:google.api_core.bidi:Thread-ConsumeBidirectionalStream exiting


Listening for messages on projects/weather-based-forecasting-v2/subscriptions/python_obs_sub
Received weather message


INFO:google.api_core.bidi:Thread-ConsumeBidirectionalStream exiting


Listening for messages on projects/weather-based-forecasting-v2/subscriptions/python_obs_sub


INFO:google.api_core.bidi:Thread-ConsumeBidirectionalStream exiting


No new messages - check number: 3.


In [9]:
# NOTE(review): this cell only prints 1 — it looks like a leftover kernel
# sanity check; consider removing it from the final notebook.
print(1)

1


In [None]:
# # Initialize subscriber
# subscriber = pubsub_v1.SubscriberClient()

# # Weather
# subscription_path_WD = 'projects/weather-based-forecasting-v2/subscriptions/python_obs_sub'
# streaming_pull_future_WD = subscriber.subscribe(
#     subscription_path_WD, callback=weather_callback
# )
# print(f'Listening for messages on {subscription_path_WD}')

In [None]:
# # Reading messages from topics using multiple subscribers
# weather_batch = []

# with subscriber:  # wrap subscriber in a 'with' block to automatically call close() when done
#     try:
#         streaming_pull_future_WD.result(timeout=120)
#     except TimeoutError:        
#         streaming_pull_future_WD.cancel()  # trigger the shutdown
#         streaming_pull_future_WD.result()  # block until the shutdown is complete