# Imports

In [1]:
import prophet
import sys
sys.modules['fbprophet'] = prophet

In [2]:
import warnings
#warnings.filterwarnings("ignore")
warnings.simplefilter(action='ignore', category=FutureWarning)


import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
from tqdm import tqdm
import time



from fbprophet.diagnostics import performance_metrics


import plotly.offline as pyo
import plotly.graph_objs as go

import itertools
import os, sys

from sklearn.preprocessing import StandardScaler
from supersmoother import SuperSmoother, LinearSmoother


from collections import OrderedDict
sys.path.append("..")
from configs.bad_direction_kpi_dict import bad_direction_kpi_dict
from configs.kpi_constraints_dict import kpi_constraints_dict
from configs.functions import make_future, add_cond_trend_version, run_prophet_funct, hparam_tuning



import dask
from dask.distributed import Client

import logging, sys
#logging.disable(sys.maxsize) #turn off prophet infos

In [3]:
pwd

'/Users/eaxxprx/Desktop/Work/EEA Code/aio/Non-seasonal-trend'

In [4]:
file = "4weeks-lte_enodeb_id-anon.csv"
datas = pd.read_csv( "/Users/eaxxprx/Desktop/Work/EEA Code/aio/Data//" + file )
print("-"*30,"DF READ ✔️","-"*30)

------------------------------ DF READ ✔️ ------------------------------


# Dask client init

In [5]:
client = Client( dashboard_address=':44594', n_workers = 68, threads_per_worker = 2) #scheduler_address=':37243'
client

0,1
Client  Scheduler: tcp://127.0.0.1:50682  Dashboard: http://127.0.0.1:44594/status,Cluster  Workers: 68  Cores: 136  Memory: 17.18 GB


# Parameter setup, metadata import

In [11]:
metadata_store = pd.read_csv('/Users/eaxxprx/Desktop/Work/EEA Code/aio/Data/metadata_anon.csv')
# Mixed datatypes in the dimension_name col: floats and OrderedDict as str (need eval)

# Get rid of all irrelevant metadata
metadata_store = metadata_store[ metadata_store.model_type == 'non_seasonal_trend' ]

# evaluate str to OrderedDict
# makes things so much easier
#metadata_store.dimension_name = metadata_store.dimension_name.map(lambda element: eval(element))

#params
missing_data_percentage_param = 0.3
daily_fourier_order = 0
weekly_fourier_order = 0
is_weekend = False
country_name = 'USA'
percent = 0.1
scores = ['mae'] #['mdape', 'mape', 'smape', 'mae']
predictions_write_to = ''
errors_write_to = ''
write_to = ''
alpha = 1.0


#infos
end = pd.to_datetime(metadata_store['ts'].values[0], unit='s')
ts = metadata_store['ts'].values[0]
start = end - pd.Timedelta(4, unit = 'w') # 
files = metadata_store['path'].unique() # arr of unique files

files = files[ files != "4weeks-lte_eci-anon.csv" ] # remove from test list,
                                                          # this is just too large

In [12]:
def _params_to_df_p(model, df):

    df["changepoint_prior_scale"] = model.changepoint_prior_scale
    df["changepoint_range"] = model.changepoint_range
    
    return df

In [13]:
# Oi, You are stepping on thin 🦆ing ice mate: monkeypatched version of fbprophet.diagnostics.cross_validation
from fbprophet.diagnostics import generate_cutoffs, single_cutoff_forecast
import logging
from asyncio import CancelledError
logger = logging.getLogger('prophet')


def my_cross_validation(model, horizon, 
                        period=None, initial=None, 
                        parallel=None, cutoffs=None, 
                        disable_tqdm=False):

    df = model.history.copy().reset_index(drop=True)
    horizon = pd.Timedelta(horizon)

    predict_columns = ['ds', 'yhat']
    if model.uncertainty_samples:
        predict_columns.extend(['yhat_lower', 'yhat_upper'])
        
    # Identify largest seasonality period
    period_max = 0.
    for s in model.seasonalities.values():
        period_max = max(period_max, s['period'])
    seasonality_dt = pd.Timedelta(str(period_max) + ' days')

    if cutoffs is None:
        # Set period
        period = 0.5 * horizon if period is None else pd.Timedelta(period)

        # Set initial
        initial = (
            max(3 * horizon, seasonality_dt) if initial is None
            else pd.Timedelta(initial)
        )

        # Compute Cutoffs
        cutoffs = generate_cutoffs(df, horizon, initial, period)

    else: 
        raise Exception("Unexpected: cutoff should be None") # but you can paste out the current version of this branch, it is not necessary for now

    if initial < seasonality_dt:
            msg = 'Seasonality has period of {} days '.format(period_max)
            msg += 'which is larger than initial window. '
            msg += 'Consider increasing initial.'
            logger.warning(msg)

    try:
        from dask.distributed import get_client
    except ImportError as e:
        raise ImportError("parallel='dask' requires the optional "
                            "dependency dask.") from e
    pool = get_client()
    # delay df and model to avoid large objects in task graph.
    df, model = pool.scatter([df, model])

    iterables = ((df, model, cutoff, horizon, predict_columns)
                     for cutoff in cutoffs)
    iterables = zip(*iterables)


    logger.info("Applying in parallel with %s", pool)
    
    predicts = pool.map(single_cutoff_forecast, *iterables)
            # convert Futures to DataFrames
            # predicts = pool.gather(predicts)

    

    myconcat = lambda predicts: pd.concat(predicts, axis=0).reset_index(drop=True)
    my_perf_metr = lambda df_cv: performance_metrics(df_cv, rolling_window=1)  

    fut = pool.submit(myconcat, predicts)
    df_p_fut = pool.submit(my_perf_metr, fut)
    df_p_fut = pool.submit(_params_to_df_p, model,df_p_fut)

       
    return df_p_fut

In [15]:
def sSmoothing(df):
    df["range"] = df.index
    max_range = df.range.max()
    
    model = SuperSmoother()
    model.fit(np.array(df.range), df.y, (np.ones(max_range+1)))
    
    tfit = np.linspace(0, max_range, max_range+1)
    yfit = model.predict(tfit)
    df["ytop"] = df["y"].copy()
    df.y = df.ytop - yfit
    
    q3, q1 = np.percentile(df.y, [75 ,25])
    IQR = q3 - q1
    df["y"] = np.where(((df.y < q1-3*IQR)|(df.y > q3+3*IQR)), np.nan, df.y)
    df["y"] = df.y.interpolate(method='akima')
    df.y = df.y + yfit
    df = df.drop(['range'], axis=1)
    
    return df

In [16]:
def preprocess_data(datas, row, 
                    start, end):
                    
    kpi = row["kpi_name"]
    dim_dict = row["dimension_name"]

    data = datas[(datas[list(dim_dict)] == pd.Series(dim_dict)).all(axis=1)]
    data = data.dropna( subset = [kpi] ) # dropna: axis = 0 removed - 
                                         # not supported in dask, default anyway
    df = pd.DataFrame()
    df['y'] = data[kpi]
    df['ds'] = pd.to_datetime(data["ts"], unit='s')

    df = df.loc[(df['ds']>=start)&(df['ds']<=end)]
    df = df.sort_values('ds')
    df = df.reset_index(drop=True)

    df = sSmoothing(df)
    return df

In [17]:
def submit_training(df, row, all_params):

    df_p_list = []
    
    for idx, params in enumerate(all_params):
        try:
            df_p = hparam_tuning(df, params, row, parallel = "dask")
            df_p_list.append(df_p)
        except Exception as e:
            print(e)

        
    return df_p_list

In [18]:
def myForecast(df, tuning_results,
                    end, row, score,
                    daily_fourier_order = 0,
                    weekly_fourier_order = 0,
                    is_weekend = False, 
                    country_name = "USA"):

    """
    df - dataframe with ts and kpi value (y)
    tuning_results - one rowed df with model metrics as cols, 
                    scores belonging to one parameter combination as vals
    row - row of metadata
    end - pd.Timestamp end of an interval of something #? end of interval for known data
    score - str, name of score
    """
    kpi = row["kpi_name"]
    dim_dict = row["dimension_name"]

    # Choosing the best model. If there are multiple equally good, pick one randomly
    tuning_results[score+'_rank'] = tuning_results[score].rank()
    tuning_results['rank'] = tuning_results[score+'_rank']
    params = tuning_results.loc[ tuning_results["rank"].idxmin(), ["changepoint_prior_scale", "changepoint_range"] ].to_dict()
    # Fit model with best params, predict future
    m = run_prophet_funct(df, params, daily_fourier_order, weekly_fourier_order, is_weekend, country_name)
    future = make_future(m, end, 168)
    forecast = m.predict(future)

    # Setting bounds 
    df["doy"] = df.ds.dt.dayofyear
    iqr = (df[['doy', 'y']].groupby('doy').quantile(0.75)-df[['doy', 'y']].groupby('doy').quantile(0.25)).median().values[0]
    minimum = -iqr
    maximum = +iqr

    lower = kpi_constraints_dict[kpi][0]
    upper = kpi_constraints_dict[kpi][1]


    #? What happens here?
    additive_condition = add_cond_trend_version(forecast, percent, lower, upper, minimum, maximum, end)
    if additive_condition:

        if len(m.changepoints[np.abs(np.nanmean(m.params['delta'], axis=0)) >= 0.01].values)==0:
            last_changepoint = start

        else:
            last_changepoint = m.changepoints[np.abs(np.nanmean(m.params['delta'], axis=0)) >= 0.01].values[-1]

        last_point = ((forecast.set_index('ds')[last_changepoint:]['trend']+alpha*minimum>lower)
                    & (forecast.set_index('ds')[last_changepoint:]['trend']+alpha*maximum<upper))[::-1].idxmax() 

        forecast.loc[forecast['ds']>last_point, 'trend'] = forecast.loc[forecast['ds']==last_point, 'trend'].values[0]

        forecast['yhat'] = forecast['trend']

    # Throw away out of bound predictions
    forecast['yhat'] = forecast['yhat'].clip(lower = lower, upper = upper)

    scaler =  StandardScaler(with_mean = False) # RobustScaler
    scaler.fit(df['y'].values.reshape(-1,1)) 

    df = df.set_index("ds")
    forecast = forecast.set_index("ds")

    results = pd.DataFrame( index = forecast.index,
                            columns = ["kpi_name", "dimension_name", "ground_truth",
                                        "pred", "error", "trend", "gt_wo_trend", "pred_wo_trend"])

    results["kpi_name"] = [kpi] * len(results)
    results["dimension_name"] = [dim_dict] * len(results)
    results["ground_truth"] = df.y
    results["pred"] = forecast.yhat
    results["error"] = scaler.transform((df['y']-forecast['yhat']).values.reshape(-1,1)).T[0]
    results["trend"] = forecast.trend
    results["gt_wo_trend"] = df.y - forecast.trend              # ground truth without trend
    results["pred_wo_trend"] = forecast.yhat - forecast.trend   # predictions without trend

    future_results = results.loc[ ~results.index.isin(df.index)]
    past_results = results.loc[ df.index ]

    return future_results, past_results

In [19]:
def one_row(row, datas, start, end):
    """
    row - one row of metadata
    start - pd.Timestamp
    """


    df = preprocess_data(datas, row, 
                         start, end)

    param_grid = {  'changepoint_prior_scale': [0.01, 0.1, 1.0],
                    'changepoint_range': [0.8, 0.9, 0.95]       }
    all_params = [dict(zip(param_grid.keys(), v)) for v in itertools.product(*param_grid.values())]
    #tuning_results = dask.delayed(training)( df, row ) 
    tuning_results = dask.delayed(submit_training)( df, row, all_params ) 
    
    return tuning_results
    scores = ["mae"]
    
    forecasted = []
    #for score in scores:
    #    new_forecast = dask.delayed(myForecast)(df, tuning_results,
    #                                end, row, score)
    #    forecasted.append(new_forecast)

    #return forecasted

# The main thing

In [21]:
mdrow_list = []

try: 
    # arr of unique kpis present in file
    kpis = metadata_store.loc[ metadata_store['path'] == file ]['kpi_name'].unique()

    timeseries_counter = 0
    delayed_result_list = []
    tsc_time = time.time()
    for kpi in kpis: #test on 5:6
        #kpi = "tcp_tp_ul_sum"
            # mask out part of metadata df
        mask =  (metadata_store['kpi_name'] == kpi) & \
                (metadata_store['path'] == file)    & \
                (metadata_store['model_type'] == 'non_seasonal_trend')

        metadata_store_kpi = metadata_store.loc[ mask ]


        # in case of mutiple simensions, we need another for here for the keys of the dimension dict e.g.

        metadata_store_kpi['dim_str'] = metadata_store_kpi.apply(lambda x: '_'.join(
            [str(elem) for elem in x['dimension_name'].keys()]), axis=1)

        dim_names_arr = metadata_store_kpi['dim_str'].unique()


        for dimension_name in dim_names_arr:
            metadata_store_dim = metadata_store_kpi.loc[ metadata_store_kpi['dim_str'] == dimension_name ]
            all_errors = pd.DataFrame(index = pd.date_range(start, end, freq = 'H'))
            all_predictions = pd.DataFrame(index = pd.date_range(start, end+pd.Timedelta('1w'), freq = 'H'))


            
            for _, row in metadata_store_dim.iterrows():
                timeseries_counter +=1
                mdrow_list.append(row)
                #delayed_result = one_row( row, datas, start, end )
                #delayed_result_list.append( delayed_result )
            
            
except KeyboardInterrupt:
    print(timeseries_counter, " time series took ", time.time() - tsc_time, "s. Avg: ", (time.time() - tsc_time) / timeseries_counter)
    raise KeyboardInterrupt
print(timeseries_counter, " time series took ", time.time() - tsc_time, "s. Avg: ", (time.time() - tsc_time) / timeseries_counter)

ZeroDivisionError: float division by zero

In [25]:
dim_names_arr

NameError: name 'dim_names_arr' is not defined

# Compute

In [13]:
mdrow_list[0]

ts                                                       1614556800
path                            4weeks-lte_enodeb_id-1614153600.csv
dimension_name                         {'lte_enodeb_id': 1001940.0}
kpi_name                                              tcp_tp_ul_sum
missing_data_ratio_all                                     0.254464
missing_data_ratio_last_week                               0.833333
seasonality_flag                                                0.0
statonarity_flag                                                0.0
missing_data_imputation_flag                                    1.0
table                                              Aggregator table
nan_trimming_flag                                               1.0
ACF_max_difference                                         0.147311
is_it_constant                                                  0.0
model_type                                       non_seasonal_trend
dim_str                                         

In [14]:
# Preprocessing dataframes

df_list = []
for mdrow in mdrow_list[:1000]:
    df = preprocess_data( datas, mdrow, start, end )
    df_list.append(df)

In [15]:
# Hyperparameters

param_grid = {  'changepoint_prior_scale': [0.01, 0.1, 1.0],
                        'changepoint_range': [0.8, 0.9, 0.95]       }
all_params = [dict(zip(param_grid.keys(), v)) for v in itertools.product(*param_grid.values())]

In [16]:
taskgroups = []

for i in range(200):
    taskgroup = dask.delayed(submit_training)(df_list[i], mdrow_list[i], all_params)
    taskgroups.append( taskgroup )

compute = dask.compute( *taskgroups )
results = [ [ task.result() for task in taskgroup ] for taskgroup in compute ]

In [21]:
pd.concat(results[0]).reset_index(drop = True)

Unnamed: 0,horizon,mse,rmse,mae,mape,mdape
0,1 days,244835500000000.0,15647220.0,14169190.0,4.144155,1.770515
1,1 days,266793700000000.0,16333820.0,14848420.0,4.297558,1.825283
2,1 days,249789400000000.0,15804730.0,14305600.0,4.16888,1.768909
3,1 days,130455400000000.0,11421710.0,8971302.0,2.239012,0.71665
4,1 days,129817200000000.0,11393730.0,8963837.0,2.246658,0.704857
5,1 days,134313300000000.0,11589360.0,9132960.0,2.290407,0.723
6,1 days,134545000000000.0,11599350.0,7892837.0,1.387964,0.610806
7,1 days,149605500000000.0,12231330.0,8196457.0,1.562161,0.671244
8,1 days,148118600000000.0,12170400.0,8149954.0,1.554829,0.655024


# Misc

In [None]:
#results = [ [ task.result() for task in taskgroup ] for taskgroup in computes ]
[ [ fut.status != "finished" for p_uuid, fut in compute[ts_uuid].items()] for ts_uuid in compute.keys() ]

([<Future: finished, key: lambda-8b78b1a333ff3e418678af22a55970cf>,
  <Future: finished, key: lambda-5e70054f8f85771e7c19aa378892686e>,
  <Future: finished, key: lambda-937ee98c3a2e86eb0c5e2390296786f0>,
  <Future: finished, key: lambda-db6bdf691d47ec7cc0f734e7f1efd6b8>,
  <Future: finished, key: lambda-f81829679ae45ce0fe9b9dad70149a02>,
  <Future: finished, key: lambda-85f335daa9292f623457bbabd43562a6>,
  <Future: finished, key: lambda-4629861b588bde28d03342a95543c8b2>,
  <Future: finished, key: lambda-9c46427590cf70cba3c5181a84bf7160>,
  <Future: finished, key: lambda-2869bc3ae5c9f54440eca73e15a8c7a4>],
 [<Future: finished, key: lambda-dcf6e55c158f69c403a913566185ba65>,
  <Future: finished, key: lambda-e06d8586ad020728f70fbd68a32275fe>,
  <Future: finished, key: lambda-8652d9cd0f3546f70c62766e4fe80e9c>,
  <Future: finished, key: lambda-2b8099161228a8b7773c276fb3661839>,
  <Future: finished, key: lambda-ef2508bb0fd5b4d40f0de61275a07c44>,
  <Future: finished, key: lambda-f91f0c6462ca3e

In [19]:


num = 70
for mult in range(2):
    t1 = time.time()
    
    
    q = dask.compute([ dask.delayed(submit_training)(df_list[i], mdrow_list[i], all_params) for i in range(len(df_list[mult*num:(mult+1)*num]))])[0]
    #qq = [ [  x.result() for x in subarr ] for subarr in q ]
    print( time.time() - t1, "   --->   ", (time.time() - t1)/num, "s / ts")
#qq = client.gather(q)

55.13321828842163    --->    0.7876174177442278 s / ts
41.5641405582428    --->    0.5937735251017979 s / ts


In [20]:
[ dask.delayed(submit_training)(df_list[i], mdrow_list[i], all_params) for i in range(len(df_list[mult*num:(mult+1)*num])) ]

[Delayed('submit_training-b9a9ae6d-9b8d-442b-8b79-924e766a7223'),
 Delayed('submit_training-33de9d83-1b46-4709-bf1e-1cbd51519d97'),
 Delayed('submit_training-bc0e5862-441b-4db8-ae87-49ee2009fb3c'),
 Delayed('submit_training-b75cf542-5b4b-45b1-9c0d-c703c1fb18da'),
 Delayed('submit_training-49cd915e-1cf4-448b-8ef5-65651cf0bd70'),
 Delayed('submit_training-5cd66e16-5142-41ff-bb82-a7a1305de8ad'),
 Delayed('submit_training-8865c4db-a60b-4efa-8e69-89ac3fca5977'),
 Delayed('submit_training-8f7baaad-04d7-45b8-9965-e4cb4a711119'),
 Delayed('submit_training-aaa389c8-fa33-44f8-a291-24cd32286848'),
 Delayed('submit_training-2b6e44d2-462a-48f9-b4b6-cfa9a504c405'),
 Delayed('submit_training-98c6531f-b8a3-4aa0-bcdb-d1fc57c25527'),
 Delayed('submit_training-29f31fe8-dbd1-4ade-a37b-c05a15ded107'),
 Delayed('submit_training-4605a22b-605c-45ae-9b35-af80f3950591'),
 Delayed('submit_training-2afa6ec6-bcf3-4e64-b57e-26a5ae55a076'),
 Delayed('submit_training-9c2b786f-422b-4712-8272-0cb09bfa9e40'),
 Delayed('

20  86.03   98.84   93.44
   
50  142.43  161.10  157.72

200 538.52  ------  649.16

400 ------  ------  1443.92

In [22]:
metadata_store.dimension_name

18               {'subs_mcc': 204}
51               {'subs_mcc': 330}
59               {'subs_mcc': 450}
69               {'subs_mcc': 520}
74               {'subs_mcc': 520}
                    ...           
1113307    {'lte_eci': 46672259.0}
1113367    {'lte_eci': 21747774.0}
1113617    {'lte_eci': 46672270.0}
1113618    {'lte_eci': 46672270.0}
1113634    {'lte_eci': 46672270.0}
Name: dimension_name, Length: 36887, dtype: object

In [25]:
pd.concat(qq[30])

Unnamed: 0,horizon,mse,rmse,mae,mape,mdape,changepoint_prior_scale,changepoint_range
0,2 days,234353500000.0,484100.760342,386238.108756,0.297749,0.223861,0.01,0.8
0,2 days,236361500000.0,486170.253553,388929.091711,0.299548,0.229261,0.01,0.9
0,2 days,230628000000.0,480237.423703,381218.16165,0.293153,0.226813,0.01,0.95
0,2 days,243975300000.0,493938.593274,393988.193131,0.294679,0.225087,0.1,0.8
0,2 days,243811300000.0,493772.525012,393602.212309,0.2941,0.226601,0.1,0.9
0,2 days,243618200000.0,493576.964559,393231.857399,0.293526,0.2272,0.1,0.95
0,2 days,292379200000.0,540720.959165,442513.100272,0.365475,0.240572,1.0,0.8
0,2 days,304606300000.0,551911.478057,451879.759598,0.36679,0.257058,1.0,0.9
0,2 days,307970600000.0,554950.969759,454577.878164,0.369637,0.257701,1.0,0.95


In [36]:
tuning_results = pd.concat(qq[30]).reset_index()
score = "mae"

tuning_results[score+'_rank'] = tuning_results[score].rank()
tuning_results['rank'] = tuning_results[score+'_rank']
params = tuning_results.loc[ tuning_results["rank"].idxmin(), ["changepoint_prior_scale", "changepoint_range"] ].to_dict()
params.

{'changepoint_prior_scale': 0.01, 'changepoint_range': 0.95}

In [43]:
myForecast(df_list[30], pd.concat(qq[30]).reset_index(drop=True),
            end, mdrow_list[30], "mae")[1]

INFO:fbprophet:Disabling yearly seasonality. Run prophet with yearly_seasonality=True to override this.


Unnamed: 0_level_0,kpi_name,dimension_name,ground_truth,pred,error,trend,gt_wo_trend,pred_wo_trend
ds,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
2021-02-01 00:00:00,tcp_tp_ul_wmean,{'lte_enodeb_id': 97585.0},2.025293e+06,2.168795e+06,-0.202334,2.168795e+06,-143501.117126,0.0
2021-02-01 01:00:00,tcp_tp_ul_wmean,{'lte_enodeb_id': 97585.0},1.680805e+06,2.167459e+06,-0.686175,2.167459e+06,-486654.177097,0.0
2021-02-01 02:00:00,tcp_tp_ul_wmean,{'lte_enodeb_id': 97585.0},1.788899e+06,2.166123e+06,-0.531880,2.166123e+06,-377224.182816,0.0
2021-02-01 03:00:00,tcp_tp_ul_wmean,{'lte_enodeb_id': 97585.0},1.995120e+06,2.164788e+06,-0.239230,2.164788e+06,-169668.369551,0.0
2021-02-01 04:00:00,tcp_tp_ul_wmean,{'lte_enodeb_id': 97585.0},1.553594e+06,2.163452e+06,-0.859892,2.163452e+06,-609858.669328,0.0
...,...,...,...,...,...,...,...,...
2021-02-24 04:00:00,tcp_tp_ul_wmean,{'lte_enodeb_id': 97585.0},1.896636e+06,1.421159e+06,0.670415,1.421159e+06,475476.921292,0.0
2021-02-24 05:00:00,tcp_tp_ul_wmean,{'lte_enodeb_id': 97585.0},1.466139e+06,1.419814e+06,0.065317,1.419814e+06,46324.897992,0.0
2021-02-24 06:00:00,tcp_tp_ul_wmean,{'lte_enodeb_id': 97585.0},1.677014e+06,1.418469e+06,0.364545,1.418469e+06,258545.317970,0.0
2021-02-24 07:00:00,tcp_tp_ul_wmean,{'lte_enodeb_id': 97585.0},1.421657e+06,1.417123e+06,0.006392,1.417123e+06,4533.562247,0.0


In [27]:
mdrow[230]

IndexError: index 230 is out of bounds for axis 0 with size 15

In [None]:
qq = [ pd.concat([  x.result() for x in subarr ], ignore_index = True) for subarr in q ]

In [20]:
res = dask.compute([ dask.delayed(myForecast)(df_list[i], qq[i], end, mdrow_list[i], "mae") for i in range(len(qq))])[0]

In [50]:
tuning_results = qq[0]
score = "mae"
tuning_results[score+'_rank'] = tuning_results[score].rank()
tuning_results['rank'] = tuning_results[score+'_rank']

tuning_results.loc[ tuning_results["rank"].idxmin(), ["changepoint_prior_scale", "changepoint_range"] ].to_dict()

{'changepoint_prior_scale': 1.0, 'changepoint_range': 0.8}

In [1]:
#qq = [ pd.concat([  x.result() for x in subarr ]) for subarr in q ]
#qq[0]
myForecast(qq[0])

NameError: name 'myForecast' is not defined

In [None]:
# 30 mag
# 20: 84, 111, 109 s
# 30: 112 s
# 40: 163 s

# 68 mag
# 10:  63 s -> 6.3 s
# 20:  94, 68, 121, 90, 132, 76, 74, 92, 134, 130, 191, 210
# 50: 143 s -> 2.86 s3
# 60: 216 s -> 3.6 s


BS = 10

ress = []
for idx in range(1):
    t1 = time.time()
    res = dask.compute(delayed_result_list[idx * BS: (idx+1) * BS])[0]
    ress.append(res)

    print(time.time() - t1)

99.2686755657196


In [98]:
dask.compute( one_row(row, datas, start, end) )

([(                             kpi_name              dimension_name  \
   ds                                                                  
   2021-03-01 01:00:00  rtp_delay_ul_sum  {'lte_enodeb_id': 84581.0}   
   2021-03-01 02:00:00  rtp_delay_ul_sum  {'lte_enodeb_id': 84581.0}   
   2021-03-01 03:00:00  rtp_delay_ul_sum  {'lte_enodeb_id': 84581.0}   
   2021-03-01 04:00:00  rtp_delay_ul_sum  {'lte_enodeb_id': 84581.0}   
   2021-03-01 05:00:00  rtp_delay_ul_sum  {'lte_enodeb_id': 84581.0}   
   ...                               ...                         ...   
   2021-03-07 20:00:00  rtp_delay_ul_sum  {'lte_enodeb_id': 84581.0}   
   2021-03-07 21:00:00  rtp_delay_ul_sum  {'lte_enodeb_id': 84581.0}   
   2021-03-07 22:00:00  rtp_delay_ul_sum  {'lte_enodeb_id': 84581.0}   
   2021-03-07 23:00:00  rtp_delay_ul_sum  {'lte_enodeb_id': 84581.0}   
   2021-03-08 00:00:00  rtp_delay_ul_sum  {'lte_enodeb_id': 84581.0}   
   
                        ground_truth       pred  error      

In [16]:
q = np.array(ress).reshape(300,2)
future_result = q[:, 0]
past_result = q[:, 1]       #Alex azt mondta, hogy ez kell az SVM-hez

pd.concat(past_result).to_csv("past_result.csv")


Creating an ndarray from ragged nested sequences (which is a list-or-tuple of lists-or-tuples-or ndarrays with different lengths or shapes) is deprecated. If you meant to do this, you must specify 'dtype=object' when creating the ndarray.



In [24]:
np.mean([163.28425669670105,
151.6457359790802,
110.84023022651672,
109.87729716300964,
125.84026026725769,
172.76928687095642,
126.07701253890991,
146.015367269516,
143.77940678596497,
187.29690790176392,
130.69336223602295,
155.33424377441406])

143.6211139758428

In [15]:
t1 = time.time()

([[{'preds':                      OrderedDict([('lte_enodeb_id', 1001940.0)])
    ds                                                              
    2021-02-01 00:00:00                                     2.786930
    2021-02-01 01:00:00                                     2.788719
    2021-02-01 02:00:00                                     2.790508
    2021-02-01 04:00:00                                     2.794085
    2021-02-01 05:00:00                                     2.795874
    ...                                                          ...
    2021-03-07 20:00:00                                     4.305158
    2021-03-07 21:00:00                                     4.306987
    2021-03-07 22:00:00                                     4.308816
    2021-03-07 23:00:00                                     4.310645
    2021-03-08 00:00:00                                     4.312475
    
    [682 rows x 1 columns],
    'errors': ds
    2021-02-01 00:00:00    0.096708
    2021