In [1]:
# Estymacja pierwszego modelu i zapis w HDFS

In [1]:
# Prerequisites

# !pip install pystan==2.19.1.1
# !pip install fbprophet
# !pip install plotly
# !pip install --upgrade pretty-confusion-matrix --user

In [2]:
import os
from datetime import datetime
from typing import List, Union

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from fbprophet import Prophet
from fbprophet.serialize import model_to_json, model_from_json
from pretty_confusion_matrix import pp_matrix
from pyspark.sql import SparkSession
from sklearn import metrics
from statsmodels.tsa.stattools import adfuller

  import pandas.util.testing as tm


In [3]:
# Parameters
#
# LOCATION, WEBSITE and TIME are lower-cased and combined into the
# "<website>_<location>_<time>_train" / "<website>_<location>_<time>_model"
# name patterns used by the helper functions and the model save/load cells.

LOCATION = "Hamburg"
WEBSITE = "Instagram"
TIME = "day"

# Names of the date and target columns. They are passed to estimate_prophet,
# which treats every other column of the training frame as an extra regressor.
date_col = 'ds'
y_col = 'y'

In [4]:
# Auxiliary functions


def get_newest_train_fpath(root_dir="hdfs://cluster-bda2-m/user/root"):
    ls = !hdfs dfs -ls modeling/in
    avail_files = [c for c in ls if f"{str.lower(WEBSITE)}_{str.lower(LOCATION)}_{str.lower(TIME)}_train" in c]

    files_dict = {}

    for f in avail_files:
        date, time, name = f.split()[-3:]
        creation_timestamp = datetime.strptime(" ".join([date, time]), "%Y-%m-%d %H:%M")

        files_dict[name] = creation_timestamp

    return os.path.join(root_dir, max(files_dict, key=files_dict.get))


def load_parquet_from_HDFS(spark, fpath: Union[str, List[str]]):
    """Read one or more Parquet paths into a single Spark DataFrame.

    Args:
        spark: object exposing ``read.parquet(*paths)`` (a ``SparkSession``).
        fpath: a single path/URI, or a list/tuple of them. A list is unpacked
            into separate positional arguments because ``parquet`` takes
            ``*paths`` and does not accept a list as one argument.

    Returns:
        Whatever ``spark.read.parquet`` returns (a Spark DataFrame).
    """
    if isinstance(fpath, str):
        return spark.read.parquet(fpath)

    return spark.read.parquet(*fpath)


def estimate_prophet(df_train, date_col, y_col):
    """Fit a Prophet model, using every non-date, non-target column as a regressor.

    Args:
        df_train: training frame exposing ``.columns`` and ``.toPandas()``
            (a Spark DataFrame).
        date_col: name of the date column, excluded from the regressors.
        y_col: name of the target column, excluded from the regressors.
            NOTE(review): these two names are only used for the exclusion;
            Prophet itself requires the columns to be called ``ds`` and ``y``.

    Returns:
        The fitted ``Prophet`` instance.
    """
    excluded = {date_col, y_col}
    # Keep the frame's column order (de-duplicated). A set difference would make
    # the regressor order depend on string hashing, so it could change between
    # interpreter runs and make the fitted/serialized model non-reproducible.
    regressors = list(dict.fromkeys(c for c in df_train.columns if c not in excluded))

    m = Prophet()

    for c in regressors:
        m.add_regressor(c)

    m.fit(df_train.toPandas())
    return m


def symmetric_mean_absolute_percentage_error(A, F):
    """Compute the symmetric mean absolute percentage error (sMAPE) in percent.

    sMAPE = 100 / n * sum(2 * |F - A| / (|A| + |F|))

    Args:
        A: actual values (scalar, list, NumPy array or pandas Series).
        F: forecast values, broadcastable against ``A``.

    Returns:
        The mean over all elements, scaled to percent. Terms that evaluate to
        NaN (0/0 when actual and forecast are both zero, or NaN inputs) count
        as 0. Returns NaN for empty input.
    """
    # Convert to float arrays so lists and scalars work, and so pandas inputs are
    # compared by position rather than aligned by index label.
    actual = np.atleast_1d(np.asarray(A, dtype=float))
    forecast = np.atleast_1d(np.asarray(F, dtype=float))

    with np.errstate(divide='ignore', invalid='ignore'):
        ratio = 2 * np.abs(forecast - actual) / (np.abs(actual) + np.abs(forecast))

    if ratio.size == 0:
        return float('nan')

    ratio = np.where(np.isnan(ratio), 0.0, ratio)
    # Mean over every element (not just the first dimension's length).
    return np.mean(ratio) * 100


def get_version():
    """Return the next free version number for the current model name.

    Looks in the local ``models/`` directory for files named
    ``<website>_<location>_<time>_model<N>.json`` (built from the lower-cased
    module-level ``WEBSITE``, ``LOCATION`` and ``TIME``) and returns the highest
    ``N`` found plus one. Taking the maximum rather than the file count means a
    deleted older version can never cause an existing model to be overwritten.

    Returns:
        int: next version number; 1 when the directory is missing or holds no
        matching file.
    """
    import re

    prefix = f"{str.lower(WEBSITE)}_{str.lower(LOCATION)}_{str.lower(TIME)}_model"
    pattern = re.compile(re.escape(prefix) + r"(\d+)\.json")

    try:
        names = os.listdir("models")
    except FileNotFoundError:
        # No models saved yet.
        return 1

    existing_versions = [int(m.group(1)) for m in map(pattern.fullmatch, names) if m]
    return max(existing_versions, default=0) + 1

In [5]:
# Get (or create) the Spark session used to read the training data.
# NOTE(review): "spark.redis.ssl" looks like a Redis-connector setting, and
# nothing in the visible cells uses Redis — confirm it is still needed.
spark = SparkSession \
    .builder \
    .appName("Time series data analysis with Spark") \
    .config("spark.redis.ssl", "true") \
    .getOrCreate()

In [6]:
# Resolve the HDFS URI of the most recent training file for WEBSITE/LOCATION/TIME.
train_fpath = get_newest_train_fpath()

In [8]:
# Load the training set from HDFS (Parquet) as a Spark DataFrame.
df_train = load_parquet_from_HDFS(spark, train_fpath)

In [9]:
# Fit a Prophet model; every column other than date_col / y_col is added as
# an extra regressor.
model = estimate_prophet(df_train, date_col, y_col)

INFO:fbprophet:Disabling yearly seasonality. Run prophet with yearly_seasonality=True to override this.


In [10]:
# Version number used in the file name of the model saved below.
version = get_version()

In [12]:
# Persist the fitted model as JSON under models/. The directory is created on
# first use so that open() does not fail on a fresh checkout.
os.makedirs('models', exist_ok=True)
with open(f'models/{str.lower(WEBSITE)}_{str.lower(LOCATION)}_{str.lower(TIME)}_model{version}.json', 'w') as fout:
    fout.write(model_to_json(model))  # Save model

In [13]:
# Test
# Round-trip check: reload the model file that was just written and print the
# deserialized object to confirm that loading succeeds.

with open(f'models/{str.lower(WEBSITE)}_{str.lower(LOCATION)}_{str.lower(TIME)}_model{version}.json', 'r') as fin:
    m = model_from_json(fin.read())  # Load model (test)
    
print(m)

<fbprophet.forecaster.Prophet object at 0x7f9f4bf11590>
