# Baseline Prediction 

https://towardsdatascience.com/multivariate-time-series-forecasting-using-random-forest-2372f3ecbad1

# <span style='background :khaki' > Install & Import Libraries </span>

In [1]:
### ***Enviroment Preparation***
# Install Pandas
!pip install pandas
!pip install matplotlib
!pip install seaborn
!pip install scikit-learn
!pip install lightgbm

!pip install dtale

!pip install sktime
!pip install sklego

#!pip install skforecast

# Update pip -- WARNING Resolution
!python.exe -m pip install --upgrade pip





In [2]:
### ***Imports***
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
sns.set()

import dtale

from statsmodels.tsa.seasonal import seasonal_decompose
from statsmodels.graphics.tsaplots import plot_acf, plot_pacf

from datetime import datetime
from datetime import timedelta

from statsmodels.tsa.stattools import adfuller, kpss

from sklego.preprocessing import RepeatingBasisFunction

from lightgbm import LGBMRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import TimeSeriesSplit

from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import r2_score
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import f1_score

import time
# Current Location.. !cd
#C:\Users\ghoyo\Desktop\TFM\Nuevo\Proyect

import warnings
# Silence every warning for the rest of the session; this also hides
# deprecation and metric warnings raised by the libraries used below.
warnings.filterwarnings('ignore')

# Paths
# Absolute, machine-specific location of the input JSON file.
data = r"C:\Users\ghoyo\Desktop\TFM4\Project\GeneratedDfs\dayly_rests_type.json"

# Full Time
# Wall-clock start of the whole notebook run; read again by the final timing cell.
start_full_infi = time.time()

#### <span style="background:skyblue"> Load Data </span>

In [3]:
# Read the input JSON into a DataFrame; the bare last expression displays its (rows, columns) shape.
df = pd.read_json(data)
df.shape

(11504, 2)

#### <span style="background:skyblue"> Little Analysis </span>


In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 11504 entries, 2017-04-04 14:06:56 to 2022-11-24 22:04:22
Data columns (total 2 columns):
 #   Column              Non-Null Count  Dtype
---  ------              --------------  -----
 0   rest_type           11504 non-null  int64
 1   n_streams_listened  11504 non-null  int64
dtypes: int64(2)
memory usage: 269.6 KB


In [5]:
df.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
rest_type,11504.0,2.45984,1.68546,0.0,1.0,2.0,4.0,6.0
n_streams_listened,11504.0,7.129781,8.876177,1.0,2.0,5.0,9.0,191.0


In [6]:
df.head(4)

Unnamed: 0,rest_type,n_streams_listened
2017-04-04 14:06:56,1,2
2017-04-04 14:55:10,1,9
2017-04-04 16:28:27,2,7
2017-04-04 17:28:27,0,18


# <span style='background :khaki' > Random Forest Predictions </span>

In [7]:
# Assumed number of rests to predict for the next day. The cells below reuse it
# as the size of every test window and as the `start` lag count when building
# the lagged frames.
n_rests = 6

#### <span style="background:violet"> Functions </span>

In [8]:
class BlockingTimeSeriesSplit():
    """Cross-validation splitter made of consecutive, non-overlapping blocks.

    The samples are cut into ``n_splits`` equally sized blocks (any remainder
    at the end is left unused). Inside each block the last ``numb`` samples
    form the test set and the samples before them form the training set, so
    no fold ever sees data from another block.

    Parameters
    ----------
    n_splits : int
        Number of blocks (folds).
    numb : int
        Number of samples at the end of each block used for testing.
    """

    def __init__(self, n_splits, numb):
        self.n_splits = n_splits
        self.numb = numb

    def get_n_splits(self, X=None, y=None, groups=None):
        """Return the number of folds; the arguments are accepted only for API compatibility."""
        return self.n_splits

    def split(self, X, y=None, groups=None):
        """Yield ``(train_indices, test_indices)`` integer arrays for every block.

        Raises
        ------
        ValueError
            If ``numb`` does not leave at least one training and one test
            sample per block. Without this check a too-large ``numb`` produces
            a negative slice bound and silently wrong folds.
        """
        n_samples = len(X)
        k_fold_size = n_samples // self.n_splits
        if not 0 < self.numb < k_fold_size:
            raise ValueError(
                f"numb={self.numb!r} must be positive and smaller than the "
                f"block size ({k_fold_size}) obtained from {n_samples} samples "
                f"and n_splits={self.n_splits}"
            )
        indices = np.arange(n_samples)

        for i in range(self.n_splits):
            start = i * k_fold_size
            stop = start + k_fold_size
            mid = stop - self.numb  # first test position inside this block
            yield indices[start:mid], indices[mid:stop]

In [9]:
def tree_feature_importance_class(df, target, split=7, n_s=100, plot = 0, r = 0, sort = 0, random_state=None):
    """Fit a random-forest classifier on a hold-out split and report metrics and feature importances.

    The last ``split`` rows of ``df`` are the test set and all earlier rows
    the training set (row order is preserved, there is no shuffling).

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the target column and every feature column.
    target : str
        Name of the column to predict.
    split : int, default 7
        Number of trailing rows held out for testing.
    n_s : int, default 100
        Number of trees (``n_estimators``).
    plot : int, default 0
        0 draws both plots, 1 only real vs. prediction, 2 only the feature
        importances, any other value draws nothing.
    r : int, default 0
        When 1 the full tuple described below is returned, otherwise only
        the accuracy.
    sort : int, default 0
        0 sorts the importance table by descending importance; any other
        value keeps the feature (column) order.
    random_state : int or None, default None
        Forwarded to ``RandomForestClassifier`` to make a run reproducible.

    Returns
    -------
    float or tuple
        ``accuracy`` alone, or ``(accuracy, precision, recall, f1,
        df_importances)`` when ``r == 1``. Precision, recall and F1 are
        weighted averages; all four metrics are rounded to 3 decimals.
    """
    # Data
    X = df.drop(target, axis=1)
    y = df[target]

    # Split
    X_train, X_test = X.iloc[:-split, :], X.iloc[-split:, :]
    y_train, y_test = y.iloc[:-split], y.iloc[-split:]

    # Create - Train - Predict
    model = RandomForestClassifier(n_estimators=n_s, random_state=random_state)
    model.fit(X_train, y_train)
    predictions = model.predict(X_test)

    # Evaluation Methods - accuracy, precision, recall & F1-score
    accuracy = np.round(accuracy_score(y_test, predictions), 3)
    precision = np.round(precision_score(y_test, predictions, average='weighted'), 3)
    recall = np.round(recall_score(y_test, predictions, average='weighted'), 3)
    f1 = np.round(f1_score(y_test, predictions, average='weighted'), 3)

    # Plot Real Vs Prediction
    if plot == 0 or plot == 1:
        plt.figure(figsize=(6,3))
        plt.title(f' {target}: Random Forest Real vs Prediction - Accuracy {accuracy}', fontsize=20)
        plt.plot(y_test, color='red')
        plt.plot(pd.Series(predictions, index=y_test.index), color='green')
        plt.xlabel('Days', fontsize=12)
        plt.ylabel('Target variable', fontsize=12)
        plt.legend(labels=['Real', 'Prediction'], fontsize=12)
        plt.grid()
        plt.show()

    # Dataframe with Importance of each feature.
    # It is always built, so an unexpected `sort` value can no longer leave
    # `df_importances` undefined for the plot / return below.
    df_importances = pd.DataFrame({
        'feature': X.columns,
        'importance': model.feature_importances_}
    )
    if sort == 0:
        df_importances = df_importances.sort_values(by='importance', ascending=False)

    # Plot Features Importance
    if plot == 0 or plot == 2:
        plt.figure(figsize=(12,8))
        plt.title('Variable Importances', fontsize=12)
        sns.barplot(x=df_importances.importance, y=df_importances.feature, orient='h')
        plt.show()

    if r == 1:
        return accuracy, precision, recall, f1, df_importances

    return accuracy
    

In [10]:
def tree_classification_cv(df, target, cv = 2, cv_split=5, n_s=100, test_size=0.3, random_state=None):
    """Cross-validate a random-forest classifier with a time-ordered splitter.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the target column and every feature column.
    target : str
        Name of the column to predict.
    cv : int, default 2
        1 uses scikit-learn's ``TimeSeriesSplit``, 2 uses
        ``BlockingTimeSeriesSplit``.
    cv_split : int, default 5
        Number of folds.
    n_s : int, default 100
        Number of trees (``n_estimators``).
    test_size : int
        Number of rows in every test window; it is handed to the splitter as
        a row count. NOTE(review): the default 0.3 is kept for signature
        compatibility but is not an integer row count - pass an explicit
        integer.
    random_state : int or None, default None
        Forwarded to ``RandomForestClassifier`` to make a run reproducible.

    Returns
    -------
    tuple of float
        ``(accuracy, precision, recall, f1)`` averaged over the folds;
        precision, recall and F1 use weighted averaging.

    Raises
    ------
    ValueError
        If ``cv`` is neither 1 nor 2.
    """
    # Fail early with a clear message instead of an unbound-variable error
    # at the first use of the splitter.
    if cv not in (1, 2):
        raise ValueError(
            f"cv must be 1 (TimeSeriesSplit) or 2 (BlockingTimeSeriesSplit), got {cv!r}"
        )

    # Data
    X = df.drop(target, axis=1)
    y = df[target]

    # CV Method
    if cv == 1:
        tscv = TimeSeriesSplit(max_train_size=None, n_splits=cv_split, test_size=test_size)
    else:
        tscv = BlockingTimeSeriesSplit(n_splits = cv_split, numb = test_size)

    # Per-fold scores
    c_acc = []
    c_pre = []
    c_rec = []
    c_f1 = []

    # Split CV Data
    for train_index, test_index in tscv.split(X):
        X_train, X_test = X.iloc[train_index], X.iloc[test_index]
        y_train, y_test = y.iloc[train_index], y.iloc[test_index]

        model = RandomForestClassifier(n_estimators=n_s, random_state=random_state)
        model.fit(X_train, y_train)
        y_pred = model.predict(X_test)

        c_acc.append(accuracy_score(y_test, y_pred))
        c_pre.append(precision_score(y_test, y_pred, average='weighted'))
        c_rec.append(recall_score(y_test, y_pred, average='weighted'))
        c_f1.append(f1_score(y_test, y_pred, average='weighted'))

    accuracy = np.mean(c_acc)
    precision = np.mean(c_pre)
    recall = np.mean(c_rec)
    f1 = np.mean(c_f1)

    return accuracy, precision, recall, f1

### <span style="background:lightGreen"> Prediction </span>

#### <span style='background :skyblue' > Generate Lags and Select a Best number of Lags </span>

In [11]:
def generate_lagged_dfs(df, target_col, num_lags, start):
    """Build a list of cumulative lagged copies of ``df``.

    The k-th returned frame contains every column of ``df`` plus the columns
    ``<target_col>_lag_1`` .. ``<target_col>_lag_k``, each one being
    ``df[target_col]`` shifted down by that many rows. ``start + num_lags``
    frames are produced in total and rows containing any missing value are
    removed from each of them. ``df`` itself is not modified.
    """
    # Same iteration order as two consecutive loops over these ranges.
    lag_positions = [*range(0, start), *range(start, start + num_lags)]

    snapshots = []
    current = df
    for position in lag_positions:
        shift_by = position + 1
        # Every snapshot extends a copy of the previous one (or of `df` for the first).
        current = current.copy()
        current[target_col + '_lag_' + str(shift_by)] = df[target_col].shift(shift_by)
        snapshots.append(current)

    # Rows are dropped only after all snapshots exist, so each snapshot was
    # built from the complete (un-dropped) previous one.
    return [snapshot.dropna() for snapshot in snapshots]

In [17]:
# One frame per cumulative lag count: n_rests + 10 frames, the k-th holding lags 1..k of 'rest_type'.
lag_df = generate_lagged_dfs(df, 'rest_type', 10, n_rests)
# Discard the first n_rests - 1 frames so the list starts with the frame that holds n_rests lags.
for i in range(n_rests-1):
    lag_df.pop(0)

In [19]:
lag_df[10].head(10)

Unnamed: 0,rest_type,n_streams_listened,rest_type_lag_1,rest_type_lag_2,rest_type_lag_3,rest_type_lag_4,rest_type_lag_5,rest_type_lag_6,rest_type_lag_7,rest_type_lag_8,rest_type_lag_9,rest_type_lag_10,rest_type_lag_11,rest_type_lag_12,rest_type_lag_13,rest_type_lag_14,rest_type_lag_15,rest_type_lag_16
2017-04-06 18:28:22,1,14,4.0,0.0,4.0,5.0,1.0,1.0,2.0,2.0,2.0,2.0,5.0,1.0,0.0,2.0,1.0,1.0
2017-04-06 19:55:01,1,14,1.0,4.0,0.0,4.0,5.0,1.0,1.0,2.0,2.0,2.0,2.0,5.0,1.0,0.0,2.0,1.0
2017-04-06 20:14:06,0,2,1.0,1.0,4.0,0.0,4.0,5.0,1.0,1.0,2.0,2.0,2.0,2.0,5.0,1.0,0.0,2.0
2017-04-07 05:33:19,5,8,0.0,1.0,1.0,4.0,0.0,4.0,5.0,1.0,1.0,2.0,2.0,2.0,2.0,5.0,1.0,0.0
2017-04-07 06:18:23,0,10,5.0,0.0,1.0,1.0,4.0,0.0,4.0,5.0,1.0,1.0,2.0,2.0,2.0,2.0,5.0,1.0
2017-04-07 06:57:40,1,6,0.0,5.0,0.0,1.0,1.0,4.0,0.0,4.0,5.0,1.0,1.0,2.0,2.0,2.0,2.0,5.0
2017-04-07 09:10:00,3,2,1.0,0.0,5.0,0.0,1.0,1.0,4.0,0.0,4.0,5.0,1.0,1.0,2.0,2.0,2.0,2.0
2017-04-07 10:03:33,1,6,3.0,1.0,0.0,5.0,0.0,1.0,1.0,4.0,0.0,4.0,5.0,1.0,1.0,2.0,2.0,2.0
2017-04-07 10:37:40,0,6,1.0,3.0,1.0,0.0,5.0,0.0,1.0,1.0,4.0,0.0,4.0,5.0,1.0,1.0,2.0,2.0
2017-04-07 13:25:47,3,7,0.0,1.0,3.0,1.0,0.0,5.0,0.0,1.0,1.0,4.0,0.0,4.0,5.0,1.0,1.0,2.0


#### <span style='background :skyblue' > Global Variables </span>

In [20]:
# Number of cross-validation folds (passed as `cv_split` in the CV cells below).
n_split = 14
# Number of repeated fits per lagged frame; the scores are averaged over them.
n_sims = 75

#### <span style='background :skyblue' > Variables Importance </span>

In [None]:
%%time

# Plot the feature importances of one random-forest fit per lagged frame.
for d in lag_df:
    print('DATAFRAME: ', d.columns[0], '-- Last Lag ------------------------------------')
    target = d.columns[0]
    # The helper defined in this notebook is `tree_feature_importance_class`;
    # the previous name `tree_feature_importance` does not exist and raised NameError.
    tree_feature_importance_class(d, target, split=n_rests, plot = 2, r = 1, sort=0)

#### <span style='background :skyblue' > Basic RF </span>

In [None]:
%%time
start_full = time.time()

# One entry per evaluated lagged frame: metrics averaged over all simulations.
lag_acc = []
lag_pre = []
lag_rec = []
lag_f1 = []
lag_vi = []

# NOTE(review): zip() stops at the shorter input, so only the first n_rests
# frames of lag_df are evaluated; `r` is just the position of the frame.
# Confirm that skipping the remaining frames is intended.
for d, r in zip(lag_df, range(n_rests)):
    sim_acc = []
    sim_pre = []
    sim_rec = []
    sim_f1 = []
    sim_vi = []

    # The first column of every lagged frame is the target.
    target = d.columns[0]

    for sim in range(n_sims):
        # plot = 3 disables both plots, r = 1 returns the full metric tuple.
        acc, pre, rec, f1, vi = tree_feature_importance_class(d, target, split=n_rests, plot = 3, r = 1)

        # Save data -- SIM
        sim_acc.append(acc)
        sim_pre.append(pre)
        sim_rec.append(rec)
        sim_f1.append(f1)
        sim_vi.append(vi)

    # Mean importance of every feature across the simulations, matched by
    # feature NAME. Each `vi` table is sorted by importance but keeps its
    # original row labels, so pairing the i-th name of the first table with
    # the value stored under label i attaches importances to the wrong features.
    mean_sim_vi = (
        pd.concat(sim_vi, ignore_index=True)
        .groupby('feature', sort=False, as_index=False)['importance']
        .mean()
    )

    # Save Data -- LAG Loop
    lag_acc.append(np.mean(sim_acc))
    lag_pre.append(np.mean(sim_pre))
    lag_rec.append(np.mean(sim_rec))
    lag_f1.append(np.mean(sim_f1))
    lag_vi.append(mean_sim_vi)

end_full = time.time()

In [None]:
print("full time: ", (end_full-start_full)* 10**3, 'ms')

In [None]:
print(lag_acc)
print(lag_pre)
print(lag_rec)
print(lag_f1)
#print(lag_vi)

#### <span style='background :skyblue' > CV </span>

<span style='background :skyblue' > TSCV </span>

In [None]:
%%time
# Repeated TimeSeriesSplit cross-validation (cv = 1) for the lagged frames.
start_full_tscv = time.time()

# One entry per evaluated lagged frame: metrics averaged over all simulations.
lag_acc_tscv = []
lag_pre_tscv = []
lag_rec_tscv = []
lag_f1_tscv = []
# Never filled in this cell.
lag_vi_tscv = []

# zip() stops at the shorter input, so at most n_rests frames of lag_df are
# processed; `r` is only referenced by the commented-out print below.
for d, r in zip(lag_df, range(n_rests)):
    sim_acc_tscv = []
    sim_pre_tscv = []
    sim_rec_tscv = []
    sim_f1_tscv = []
    # Never filled in this cell.
    sim_vi_tscv = []
    
    # The first column of every lagged frame is used as the target.
    target = d.columns[0]
    
    for sim in range(n_sims):
        #print('-----------------', target, '- sim n:', sim, '- lag: ', r, '-------------------')
        # n_split folds, each with a test window of n_rests rows.
        acc_tscv, pre_tscv, rec_tscv, f1_tscv = tree_classification_cv(d, target, cv = 1, cv_split = n_split, test_size = n_rests)
        
        # Save data -- SIM
        sim_acc_tscv.append(acc_tscv)
        sim_pre_tscv.append(pre_tscv)
        sim_rec_tscv.append(rec_tscv)
        sim_f1_tscv.append(f1_tscv)
    
    # Save Data -- LAG Loop
    lag_acc_tscv.append(np.mean(sim_acc_tscv))
    lag_pre_tscv.append(np.mean(sim_pre_tscv))
    lag_rec_tscv.append(np.mean(sim_rec_tscv))
    lag_f1_tscv.append(np.mean(sim_f1_tscv))
    
    # Save Data -- VARIABLES/FEATURES

    #print(lag_acc_tscv)
    #print(lag_pre_tscv)
    #print(lag_rec_tscv)
    #print(lag_f1_tscv)

# Wall-clock end of the TSCV experiment.
end_full_tscv = time.time()

In [None]:
print("full time: ", (end_full_tscv-start_full_tscv)* 10**3, 'ms')

In [None]:
print(lag_acc_tscv)
print(lag_pre_tscv)
print(lag_rec_tscv)
print(lag_f1_tscv)

<span style='background :skyblue' > BCV </span>

In [None]:
%%time
# Repeated BlockingTimeSeriesSplit cross-validation (cv = 2) for the lagged frames.
start_full_bcv = time.time()

# One entry per evaluated lagged frame: metrics averaged over all simulations.
lag_acc_bcv = []
lag_pre_bcv = []
lag_rec_bcv = []
lag_f1_bcv = []
# Never filled in this cell.
lag_vi_bcv = []

# zip() stops at the shorter input, so at most n_rests frames of lag_df are
# processed; `r` is only referenced by the commented-out print below.
for d, r in zip(lag_df, range(n_rests)):
    sim_acc_bcv = []
    sim_pre_bcv = []
    sim_rec_bcv = []
    sim_f1_bcv = []
    # Never filled in this cell.
    sim_vi_bcv = []
    
    # The first column of every lagged frame is used as the target.
    target = d.columns[0]
    
    for sim in range(n_sims):
        #print('-----------------', target, '- sim n:', sim, '- lag: ', r, '-------------------')
        # n_split blocks, each ending with a test window of n_rests rows.
        acc_bcv, pre_bcv, rec_bcv, f1_bcv = tree_classification_cv(d, target, cv = 2, cv_split = n_split, test_size = n_rests)
        
        # Save data -- SIM
        sim_acc_bcv.append(acc_bcv)
        sim_pre_bcv.append(pre_bcv)
        sim_rec_bcv.append(rec_bcv)
        sim_f1_bcv.append(f1_bcv)
    
    # Save Data -- LAG Loop
    lag_acc_bcv.append(np.mean(sim_acc_bcv))
    lag_pre_bcv.append(np.mean(sim_pre_bcv))
    lag_rec_bcv.append(np.mean(sim_rec_bcv))
    lag_f1_bcv.append(np.mean(sim_f1_bcv))
    
    # Save Data -- VARIABLES/FEATURES

    #print(lag_acc_bcv)
    #print(lag_pre_bcv)
    #print(lag_rec_bcv)
    #print(lag_f1_bcv)

# Wall-clock end of the BCV experiment.
end_full_bcv = time.time()

In [None]:
print("full time: ", (end_full_bcv-start_full_bcv)* 10**3, 'ms')

In [None]:
print(lag_acc_bcv)
print(lag_pre_bcv)
print(lag_rec_bcv)
print(lag_f1_bcv)

##### <span style='background :yellow' > Select best Nºlags and CV Type </span>

##### <span style='background :yellow' > This requires a very large simulation, which takes a long time to run </span>


### <span style='background :orange' > For the time features and cyclical values, we reuse the conclusions reached in the first experiment (N_Streams_Prediction) </span>

# <span style='background :orange' > Is this the best baseline model? </span>

In [None]:
# End total time
end_full_infi = time.time()

In [None]:
# Elapsed wall-clock time of the whole notebook in milliseconds. The
# parentheses are required: without them only the start timestamp was
# multiplied by 10**3 before the subtraction.
print("full time: ", (end_full_infi-start_full_infi) * 10**3, 'ms')

# <span style='background :orange' > Is this the best baseline model? </span>

In [21]:
def create_time_features(df, f=0):
    """Return a copy of ``df`` extended with calendar features taken from its index.

    ``df`` must be indexed by datetimes. ``f`` selects the feature set:

    * ``0`` - year, quarter, month, ISO week, day, dayofyear, dayofweek plus
      the month / quarter start-end flags and the year-start flag;
    * ``1`` - only the seven calendar columns above;
    * ``2`` - day, dayofyear, dayofweek and year (in that order).

    Any other value returns an unmodified copy. The input frame is never
    changed.
    """
    calendar = ('year', 'quarter', 'month', 'week', 'day', 'dayofyear', 'dayofweek')
    # (output column, index attribute) - the column spellings are kept as they are.
    boundary_flags = (
        ('is_month_end', 'is_month_end'),
        ('is_month_start', 'is_month_start'),
        ('is_cuarter_end', 'is_quarter_end'),
        ('is_cuarter_start', 'is_quarter_start'),
        ('is_year_start', 'is_year_start'),
    )
    layouts = (
        (0, [(name, name) for name in calendar] + list(boundary_flags)),
        (1, [(name, name) for name in calendar]),
        (2, [(name, name) for name in ('day', 'dayofyear', 'dayofweek', 'year')]),
    )

    out = df.copy()
    for code, layout in layouts:
        if f == code:
            for column, attribute in layout:
                if attribute == 'week':
                    # The ISO week comes from isocalendar(), not from a plain index attribute.
                    out[column] = out.index.isocalendar().week
                else:
                    out[column] = getattr(out.index, attribute)

    return out

In [22]:
# Select the DataFrame with the best lags
# Create the 3 different sets of time features
# NOTE(review): only the f = 1 feature set is actually built here, on lag_df[10].
dftf2 = create_time_features(lag_df[10], f = 1)


In [27]:
# Persist the frame with lag and time features as JSON (absolute, machine-specific path).
dftf2.to_json(r"C:\Users\ghoyo\Desktop\TFM4\Project\GeneratedDfs\df_resttype.json")