# Imports

In [1]:
# From the original code; commented-out imports are not in use
import prophet
import sys

sys.modules['fbprophet'] = prophet
import warnings
#warnings.filterwarnings("ignore")
warnings.simplefilter(action='ignore', category=FutureWarning)


import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
from tqdm import tqdm
import time


from fbprophet import Prophet
from fbprophet.plot import add_changepoints_to_plot
from fbprophet.diagnostics import cross_validation
from fbprophet.diagnostics import performance_metrics

import itertools
import os, sys
sys.path.append("..")



from sklearn.preprocessing import StandardScaler
from supersmoother import SuperSmoother, LinearSmoother


from collections import OrderedDict
from configs.bad_direction_kpi_dict import bad_direction_kpi_dict
from configs.kpi_constraints_dict import kpi_constraints_dict


import logging




logging.disable(sys.maxsize) #turn off prophet infos

  from .autonotebook import tqdm as notebook_tqdm


# From configs

In [2]:
from fbprophet import Prophet
from fbprophet.diagnostics import cross_validation
from fbprophet.diagnostics import performance_metrics
from fbprophet.diagnostics import generate_cutoffs, single_cutoff_forecast

from supersmoother import SuperSmoother


import logging
# Handle to the 'prophet' logger.
# NOTE(review): the imports cell calls logging.disable(sys.maxsize), which already
# suppresses every logger, so this handle is not used to change any level here —
# confirm whether it is still needed.
logger = logging.getLogger('prophet')


def is_weekday(timestamp):
    """Tell whether `timestamp` falls on a working day (Monday through Friday).

    `timestamp` is handed to `pd.to_datetime`, so anything that call can turn
    into a single Timestamp is accepted.
    """
    # pandas numbers the days Monday == 0 ... Sunday == 6, so 5 and 6 are the weekend.
    weekday_number = pd.to_datetime(timestamp).dayofweek
    return weekday_number < 5


def run_prophet_funct(df, params, daily_fourier_order, weekly_fourier_order, is_weekend, country_name):
    """Build a Prophet model, add country holidays and fit it on `df`.

    Parameters
    ----------
    df : pandas.DataFrame
        Training history. The 'ds' column is read here; the frame is then passed to
        `Prophet.fit`. The caller's frame is never modified.
    params : dict
        Extra keyword arguments forwarded verbatim to `Prophet(...)`.
    daily_fourier_order
        Passed as `daily_seasonality` when `is_weekend` is false; otherwise used as the
        `fourier_order` of the two conditional daily seasonalities.
    weekly_fourier_order
        Passed as `weekly_seasonality`.
    is_weekend : bool
        When true, the built-in daily seasonality is switched off and replaced by two
        period-1 seasonalities conditioned on boolean 'weekday' / 'weekend' columns.
    country_name : str
        Forwarded to `add_country_holidays`.

    Returns
    -------
    The fitted Prophet model.
    """
    if is_weekend:
        # Compute the mask once and attach the condition columns to a derived frame,
        # so the caller's DataFrame is not mutated (and no SettingWithCopyWarning is
        # raised when the caller passes a slice).
        weekday_mask = df['ds'].apply(is_weekday)
        df = df.assign(weekday=weekday_mask, weekend=~weekday_mask)
        m = Prophet(**params, daily_seasonality = False, weekly_seasonality = weekly_fourier_order, uncertainty_samples = 0)
        # separate daily patterns for weekdays and weekends
        m.add_seasonality(name='weekday', period=1, fourier_order=daily_fourier_order, condition_name='weekday')
        m.add_seasonality(name='weekend', period=1, fourier_order=daily_fourier_order, condition_name='weekend')
    else:
        m = Prophet(**params, daily_seasonality =daily_fourier_order, weekly_seasonality = weekly_fourier_order, uncertainty_samples = 0)
    # add the holidays
    m.add_country_holidays(country_name=country_name)
    m = m.fit(df)
    return m
    
def make_future(model, end, periods):
    """
    Build the frame of timestamps to predict on: the model's history dates
    followed by `periods` hourly steps after `end`.

    model: the prophet model (only `model.history_dates` is read)
    end: given in pd.datetime format: not unix timestamp, we start our dataframe at end+1h
    periods: how many hours to forecast
    returns: pd.DataFrame with a single 'ds' column
    """
    # Hour steps are expressed as Timedelta objects instead of the 'H' string alias,
    # which recent pandas releases deprecate for both Timedelta units and frequencies.
    one_hour = pd.Timedelta(hours=1)
    # float() keeps accepting everything the old str(periods)+'H' form accepted
    # (ints, numpy ints, floats, numeric strings).
    last_step = end + pd.Timedelta(hours=float(periods))

    dates = pd.date_range(start=end + one_hour, end=last_step, freq=one_hour)
    dates = np.concatenate((np.array(model.history_dates), dates))

    future = pd.DataFrame({'ds': dates})
    return future

In [3]:
# Folder holding the anonymized input data. Defined here so the cell runs on a fresh
# kernel; the value is the folder shown in this cell's recorded read_csv output.
data_loc = '/home/jovyan/work/elbaanh/AIO-dev/data/anonimized2/'

metadata_store = pd.read_csv(data_loc + 'metadata-anon.csv')
# Mixed datatypes in the dimension_name col: floats and OrderedDict as str (need eval)

# Get rid of all irrelevant metadata.
# .copy() makes the filtered frame independent, so the column assignment below does
# not write into a slice of the original frame.
metadata_store = metadata_store[ metadata_store.model_type == 'non_seasonal_trend' ].copy()

# Evaluate str to OrderedDict; makes things so much easier.
# SECURITY: eval() executes whatever expression the CSV cell contains — only run this
# on metadata files from a trusted source.
# Non-string cells (the floats mentioned above) are passed through unchanged, because
# eval() raises TypeError on them.
metadata_store.dimension_name = metadata_store.dimension_name.map(
    lambda element: eval(element) if isinstance(element, str) else element
)

#params
missing_data_percentage_param = 0.3
daily_fourier_order = 0
weekly_fourier_order = 0
is_weekend = False
country_name = 'USA'
percent = 0.1
scores = ['mae'] #['mdape', 'mape', 'smape', 'mae']
predictions_write_to = ''
errors_write_to = ''
write_to = ''
alpha = 1.0


#infos
# `ts` of the first remaining metadata row is taken as the end of the window (unix seconds).
end = pd.to_datetime(metadata_store['ts'].values[0], unit='s')
ts = metadata_store['ts'].values[0]
start = end - pd.Timedelta(weeks=4)  # four-week window ending at `end`
files = metadata_store['path'].unique() # arr of unique files

files = files[ files != "4weeks-lte_eci-1614153600.csv" ] # remove from test list,
                                                          # this is just too large


print("-"*30,"DF READ ✔️","-"*30)

  metadata_store = pd.read_csv('/home/jovyan/work/elbaanh/AIO-dev/data/anonimized2/metadata-anon.csv')


------------------------------ DF READ ✔️ ------------------------------


# UDF

In [4]:
from pyspark.sql.types import *
from pyspark.sql.functions import pandas_udf, PandasUDFType


# Output schema of the sSmoothing grouped-map UDF: smoothed value, date, original value.
sSmoothing_schema = (
    StructType()
    .add("y", FloatType())
    .add("dt", DateType())
    .add("ytop", FloatType())
)



@pandas_udf(sSmoothing_schema, PandasUDFType.GROUPED_MAP)
def sSmoothing(df):
    """Grouped-map pandas UDF: replace outliers in `value` using a SuperSmoother baseline.

    Reads the 'value' and 'dt' columns of the group frame and returns a frame with
    exactly the columns ['y', 'dt', 'ytop'] (the order of sSmoothing_schema):
      * 'ytop' - the original 'value'
      * 'y'    - 'value' where residuals outside the 3*IQR fences were replaced by
                 Akima interpolation of the residuals, added back onto the baseline

    NOTE(review): `np.ones(max_range+1)` and `np.linspace(0, max_range, max_range+1)`
    need an integer `max_range`, so 'dt' is presumably an integer position 0..n-1 here,
    while sSmoothing_schema declares 'dt' as DateType — confirm.
    NOTE(review): `df.ytop - yfit` and `df.y + yfit` combine a column with an array of
    length max_range+1, which presumes the group has exactly max_range+1 rows — confirm.
    """

    #df.index = df["dt"]    # could be the solution if df.index is needed
    print(df.columns)  # debug output of the incoming column names
    df["y"] = df["value"]
    df["range"] = df["dt"]  # df.index
    max_range = df.range.max()


    model = SuperSmoother()
    # third argument is an all-ones array — presumably the per-point uncertainties
    # expected by SuperSmoother.fit; TODO confirm against the supersmoother docs
    model.fit(np.array(df.range), df.y, (np.ones(max_range+1)))
    
    # evaluate the smoother on the grid 0..max_range
    tfit = np.linspace(0, max_range, max_range+1)
    yfit = model.predict(tfit)
    df["ytop"] = df["y"].copy()
    # residual of the raw values against the smoothed baseline
    df.y = df.ytop - yfit
    
    q3, q1 = np.percentile(df.y, [75 ,25])
    IQR = q3 - q1
    # residuals beyond 3*IQR outside the quartiles become NaN and are then interpolated
    df["y"] = np.where(((df.y < q1-3*IQR)|(df.y > q3+3*IQR)), np.nan, df.y)
    df["y"] = df.y.interpolate(method='akima')
    # add the baseline back onto the cleaned residuals
    df.y = df.y + yfit
    df = df.drop(['range'], axis=1)

    # Reorder df to evade schema mismatch
    df = df[["y","dt", "ytop"]]

    return df

In [5]:
# Output schema of `training`: one row of cross-validation metrics per horizon.
training_schema = (
    StructType()
    .add("horizon", IntegerType())
    .add("mse", FloatType())
    .add("rmse", FloatType())
    .add("mae", FloatType())
    .add("mape", FloatType())
    .add("mdape", FloatType())
    .add("coverage", FloatType())
)
# disabled field: StructField("params", MapType(StringType(), FloatType()))

from pyspark.sql.functions import pandas_udf, PandasUDFType

#@pandas_udf(training_schema, PandasUDFType.GROUPED_MAP)
def training(df: pd.DataFrame) -> pd.DataFrame:
    """Grid-search Prophet hyper-parameters on one group and return its CV metrics.

    Parameters
    ----------
    df : pandas.DataFrame
        One group of data with a 'dt' column (timestamps) and a 'value' column.
        The frame is not modified.

    Returns
    -------
    pandas.DataFrame
        Exactly the columns ['horizon', 'mse', 'rmse', 'mae', 'mape', 'mdape',
        'coverage'], in that order. The `performance_metrics` rows of every
        parameter combination are concatenated (the combination itself is not
        recorded). 'horizon' is converted to whole seconds. A metric column that
        `performance_metrics` did not produce is filled with the placeholder 69;
        any extra column it produced is dropped.
        When the history spans 15 days or less, no model is fitted and an empty
        frame with the same columns is returned.
    """
    daily_fourier_order = 0
    weekly_fourier_order = 0
    is_weekend = False
    country_name = "USA"

    healthy_columns = ["horizon", "mse", "rmse", "mae", "mape", "mdape", "coverage"]
    # Sentinel written into metric columns that performance_metrics omitted.
    missing_metric_placeholder = 69

    # Build the Prophet input ('ds', 'y') as a new frame instead of adding columns to `df`.
    history = (
        df[["dt", "value"]]
        .rename(columns={"dt": "ds", "value": "y"})
        .sort_values("ds")
        .reset_index(drop=True)
    )

    init_horizon = (history["ds"].max() - history["ds"].min()).days

    # Cross-validation below uses a 14-day initial window plus a 1-day horizon.
    # With a shorter history there is nothing to evaluate, so return an empty,
    # correctly typed frame before fitting any model.
    if not init_horizon > 15:
        empty = pd.DataFrame({col: pd.Series(dtype="float64") for col in healthy_columns})
        return empty.astype({"horizon": "int64"})

    param_grid = {
        'changepoint_prior_scale': [0.01, 0.1, 1.0],
        'changepoint_range': [0.8, 0.9, 0.95],
    }
    all_params = [dict(zip(param_grid.keys(), v)) for v in itertools.product(*param_grid.values())]

    perf_row_list = []
    for params in all_params:
        m = run_prophet_funct(history, params,
                              daily_fourier_order,
                              weekly_fourier_order, is_weekend,
                              country_name)

        df_cv = cross_validation(m, initial="14 days",
                                 horizon="1 days",
                                 period="1 days")

        perf_row_list.append(performance_metrics(df_cv, rolling_window=1))

    # make one df out of the list of per-combination dfs
    tuning_results = pd.concat(perf_row_list).reset_index(drop=True)

    # convert horizon of datatype timedelta to integer seconds
    tuning_results["horizon"] = tuning_results["horizon"].map(
        lambda timedelta: int(timedelta.total_seconds())
    )

    # Force the exact output column set and order: missing metrics get the
    # placeholder, unexpected extra metrics are dropped.
    return tuning_results.reindex(columns=healthy_columns, fill_value=missing_metric_placeholder)

# Spark hyperparams

In [6]:
def config_grid_gen(i):
    """Return a fresh dict of candidate Spark settings.

    `i` is accepted but not used (the slicing that once consumed it is disabled).
    NOTE(review): 'spark.driver.memory' maps to the plain string "16g", while the
    other three keys map to lists of strings — confirm this asymmetry is intended.
    """
    executor_memory = ["2g", "4g", "7g"]
    executor_instances = ["8", "16", "32", "64", "64"]    # formerly sliced with [:i]
    executor_cores = ["8", "4", "2", "1", "2"]    # formerly sliced with [i-1:]
    driver_memory = "16g"    # alternative considered: "64g"

    # Other knobs that were considered but are switched off:
    #   "spark.task.cpus" : [ "1", "2"]
    #   "spark.python.worker.memory": ["1g", "5g"]
    return dict([
        ("spark.executor.memory", executor_memory),
        ("spark.executor.instances", executor_instances),
        ("spark.executor.cores", executor_cores),
        ("spark.driver.memory", driver_memory),
    ])

In [7]:
# Candidate Spark settings; every key maps to the list of values to combine.
config_grid = dict([
    ("spark.executor.memory", ["2g", "4g"]),
    ("spark.executor.instances", ["16", "32", "64"]),    # [:i]
    ("spark.executor.cores", ["8", "2"]),    # [i-1:]
    ("spark.driver.memory", ["16g"]),    # "64g"
])
# Switched-off knobs:
#   "spark.task.cpus" : [ "1", "2"]
#   "spark.python.worker.memory": ["1g", "5g"]

In [8]:
# Draw five Spark configurations from the full cartesian product of config_grid.
# The generator is seeded so that Restart & Run All picks the same five every time.
# Sampling is with replacement (the default), so a configuration can appear twice.
_all_config_params = [dict(zip(config_grid.keys(), v)) for v in itertools.product(*config_grid.values())]
config_params = np.random.default_rng(seed=0).choice(_all_config_params, size=5)

In [9]:
#config_params = [ config_grid_gen(i) for i in range(5) ]


In [10]:
import findspark
findspark.init("/home/jovyan/work/elbaanh/spark-3.2.1-bin-hadoop3.2/")  # put the path to pyspark here

from pyspark import SparkConf, SparkContext
from pyspark.sql import SparkSession
import time, json

In [11]:
# Aggregation query against the 'spark_datas' temp view: sums `value` for every
# (lte_enodeb_id, variable, dt) combination and orders the rows by the same three keys.
sql_statement_gb3 = """
  SELECT
      lte_enodeb_id,
      variable,
      SUM(value) as value,
      dt
    FROM spark_datas
    GROUP BY lte_enodeb_id, variable, dt
    ORDER BY lte_enodeb_id, variable, dt
  """

In [12]:
#! SparkConf has no effect if spark is launched locally, configuration variables need to be set through $SPARK_HOME/conf/spark-defaults.conf
# Intended Spark settings for this run. In this cell they are only referenced from
# disabled code (the commented-out conf.set loop and the bare-string logging snippets),
# so they are not applied to the SparkConf.
cparam = {  'spark.executor.instances': '4', 'spark.executor.memory' : "16g",
            'spark.executor.cores': '16', 'spark.driver.memory': '120g',
            #'spark.memory.offHeap.enabled': 'true', 'spark.memory.offHeap.size': '40g',
            #'spark.driver.maxResultsSize':'0' 
        }


# Best-effort reset: grab whatever SparkContext exists (or create one) and stop it,
# so a new context can be created below. Any failure here is deliberately ignored.
try:
    sc = SparkContext.getOrCreate()
    sc.stop()
except Exception:
    pass

# Empty configuration: the loop that would copy cparam into it is disabled.
conf = SparkConf()
#for k,v in cparam.items(): #! noeffect
    #conf.set(k, v)

# Create the new context; on failure the error is only printed and execution continues.
try:
    #sc.stop()
    sc = SparkContext(conf=conf)

except Exception as e:
    print(e)
    # The bare string below is disabled code (never executed): it would append the
    # failing configuration to a timing log file.
    """with open("/home/jovyan/work/elbaanh/AIO-dev/AIO-non-seasonal-trend-parallel/spark_times.txt", "a") as f:
        s = "infeasible configuration: " + str(e) + "\t" + json.dumps(cparam)
        f.write(s)"""

# Session handle used for reading data and running SQL below.
spark = SparkSession.builder.getOrCreate()

# ! change path here
# Input parquet (machine-specific absolute path).
spark_datas = spark.read.parquet("/home/jovyan/work/elbaanh/AIO-dev/data/anonimized2/spark/enodeb_datas_long_pa_100.parquet")

# Cast `dt` to a timestamp first, then register the temp view once, so the view
# queried by the SQL statement exposes the cast column.
spark_datas = spark_datas.withColumn("dt", spark_datas["dt"].cast(TimestampType()))
spark_datas.createOrReplaceTempView('spark_datas')


# Start of the timed section; t1 is only read by the disabled logging snippets below.
t1 = time.time()
try:
    # without repartition there's no parallelism
    # Run the aggregation query, repartition by the grouping keys and cache the result.
    stored_sdf = ( spark.sql(sql_statement_gb3).repartition(spark.sparkContext.defaultParallelism, ["variable", "lte_enodeb_id"])).cache()


    # Apply `training` to every (variable, lte_enodeb_id) group and collect all
    # resulting rows to the driver. The bare string right after this statement is
    # disabled code for writing the result to CSV instead of collecting it.
    qqqq = stored_sdf.groupby("variable", "lte_enodeb_id").applyInPandas(training, training_schema).collect() #\
    """.write.option("header", True) \
                                                                        .csv("/home/jovyan/work/elbaanh/AIO-dev/data/duck/datacsv")
                                                                        #"""

    # TODO write the result out to csv, write.csv
    # TODO rdd <- pandas : map
    

    # Disabled code (bare string, never executed): would append the elapsed time and
    # configuration to a timing log file.
    """with open("/home/jovyan/work/elbaanh/AIO-dev/AIO-non-seasonal-trend-parallel/spark_times.txt", "a") as f:
        s = str(time.time() - t1 ) + "  " + str(size)
        f.write(s + "\t" +  json.dumps( cparam ) + "\n")"""


# Any failure of the Spark job is only printed; `qqqq` stays undefined in that case.
except Exception as e:
    print(e)
    # Disabled code (bare string, never executed): would log the failed run.
    """with open("/home/jovyan/work/elbaanh/AIO-dev/AIO-non-seasonal-trend-parallel/spark_times.txt", "a") as f:
        s = str(time.time() - t1 )
        f.write(s + "\t" + json.dumps( cparam ) + "\t ------- failed: "+ str(e) +"\n")"""