# Baseline Prediction 

https://towardsdatascience.com/multivariate-time-series-forecasting-using-random-forest-2372f3ecbad1

# <span style='background :khaki' > Install & Importa Libraries </span>

In [1]:
### ***Enviroment Preparation***
# Install Pandas
!pip install pandas
!pip install matplotlib
!pip install seaborn
!pip install scikit-learn
!pip install lightgbm

!pip install dtale

!pip install sktime
!pip install sklego

#!pip install skforecast

# Update pip -- WARNING Resolution
!python.exe -m pip install --upgrade pip





In [2]:
### ***Imports***
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
sns.set()

import dtale

from statsmodels.tsa.seasonal import seasonal_decompose
from statsmodels.graphics.tsaplots import plot_acf, plot_pacf

from datetime import datetime
from datetime import timedelta

from statsmodels.tsa.stattools import adfuller, kpss

from sklego.preprocessing import RepeatingBasisFunction

from lightgbm import LGBMRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import TimeSeriesSplit

from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import r2_score
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import f1_score

import time
# Current Location.. !cd
#C:\Users\ghoyo\Desktop\TFM\Nuevo\Proyect

import warnings
warnings.filterwarnings('ignore')

# Paths
data = r"C:\Users\ghoyo\Desktop\TFM4\Project\GeneratedDfs\dayly_rests_type.json"

# Full Time
start_full_infi = time.time()

#### <span style="background:skyblue"> Load Data <span>

In [3]:
df = pd.read_json(data)
df.shape

(11504, 2)

#### <span style="background:skyblue"> Little Analysis <span>


In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 11504 entries, 2017-04-04 14:06:56 to 2022-11-24 22:04:22
Data columns (total 2 columns):
 #   Column              Non-Null Count  Dtype
---  ------              --------------  -----
 0   rest_type           11504 non-null  int64
 1   n_streams_listened  11504 non-null  int64
dtypes: int64(2)
memory usage: 269.6 KB


In [5]:
df.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
rest_type,11504.0,2.45984,1.68546,0.0,1.0,2.0,4.0,6.0
n_streams_listened,11504.0,7.129781,8.876177,1.0,2.0,5.0,9.0,191.0


In [6]:
df.head(4)

Unnamed: 0,rest_type,n_streams_listened
2017-04-04 14:06:56,1,2
2017-04-04 14:55:10,1,9
2017-04-04 16:28:27,2,7
2017-04-04 17:28:27,0,18


# <span style='background :khaki' > Random Forest Predictions </span>

In [7]:
# Assuming number of predicted rest for next day is = 6
n_rests = 6

#### <span style="background:violet"> Functions <span>

In [8]:
class BlockingTimeSeriesSplit():
    def __init__(self, n_splits, numb):
        self.n_splits = n_splits
        self.numb = numb
    
    def get_n_splits(self, X, y, groups):
        return self.n_splits
    
    def split(self, X, y=None, groups=None):
        n_samples = len(X)
        k_fold_size = n_samples // self.n_splits
        indices = np.arange(n_samples)

        margin = 0
        for i in range(self.n_splits):
            start = i * k_fold_size
            stop = start + k_fold_size
            mid = stop-self.numb
            yield indices[start: mid], indices[mid + margin: stop]

In [9]:
def tree_feature_importance_class(df, target, split=7, n_s=100, plot = 0, r = 0, sort = 0):
    # Data
    X = df.drop(target, axis=1)
    y = df[target]
    
    # Split
    X_train, X_test = X.iloc[:-split,:], X.iloc[-split:,:]
    y_train, y_test = y.iloc[:-split], y.iloc[-split:]
    
    # Create - Train - Predict
    model = RandomForestClassifier(n_estimators=n_s)
    model.fit(X_train, y_train)
    predictions = model.predict(X_test)
    
    # Evaluation Methods - accuracy, precision, recall & F1-score
    accuracy = np.round(accuracy_score(y_test, predictions), 3)    
    precision = np.round(precision_score(y_test, predictions, average='weighted'), 3)    
    recall = np.round(recall_score(y_test, predictions, average='weighted'), 3)    
    f1 = np.round(f1_score(y_test, predictions, average='weighted'), 3)    

    #print('Accuracy:', accuracy)
    #print('Precision:', precision)
    #print('Recall:', recall)
    #print('F1-score:', f1)
    
    # Plot Real Vs Prediction
    if plot == 0 or plot == 1:
        fig = plt.figure(figsize=(6,3))
        plt.title(f' {target}: Random Forest Real vs Prediction - Accuracy {accuracy}', fontsize=20)
        plt.plot(y_test, color='red')
        plt.plot(pd.Series(predictions, index=y_test.index), color='green')
        plt.xlabel('Days', fontsize=12)
        plt.ylabel('Target variable', fontsize=12)
        plt.legend(labels=['Real', 'Prediction'], fontsize=12)
        plt.grid()
        plt.show()
    
    # Dataframe with Importance of each feature
    if sort == 0:
        df_importances = pd.DataFrame({
            'feature': df.drop(columns = target).columns,
            'importance': model.feature_importances_}
        ).sort_values(by='importance', ascending=False)
    
    if sort == 1:
        df_importances = pd.DataFrame({
            'feature': df.drop(columns = target).columns,
            'importance': model.feature_importances_}
        )
    
    # Plot Features Importance
    if plot == 0 or plot == 2:
        plt.figure(figsize=(12,8))
        plt.title('Variable Importances', fontsize=12)
        sns.barplot(x=df_importances.importance, y=df_importances.feature, orient='h')
        plt.show()
        
    #  Print Importances
    #print('Accuracy: ', accuracy)
    #print(df_importances.sort_values(by='importance'), '\n')
    
    if r == 1:
        return accuracy, precision, recall, f1, df_importances
    
    return accuracy
    

In [10]:
def tree_classification_cv(df, target, cv = 2, cv_split=5, n_s=100, test_size=0.3):
    # Results
    c_predict = []
    c_acc = []
    c_pre = []
    c_rec = []
    c_f1 = []
    
    # Data
    X = df.drop(target, axis=1)
    y = df[target]
    
    # CV Method
    if(cv == 1):
        tscv = TimeSeriesSplit(max_train_size=None, n_splits=cv_split, test_size=test_size)
    elif(cv == 2):
        tscv = BlockingTimeSeriesSplit(n_splits = cv_split, numb = test_size)
    
    # Split CV Data
    for train_index, test_index in tscv.split(X):
        pre = []
        
        X_train, X_test = X.iloc[train_index], X.iloc[test_index]
        y_train, y_test = y.iloc[train_index], y.iloc[test_index]
        
        model = RandomForestClassifier(n_estimators=n_s)
        model.fit(X_train, y_train)
        y_pred = model.predict(X_test)
        
        c_predict.append(y_pred)
        c_acc.append(accuracy_score(y_test, y_pred))
        c_pre.append(precision_score(y_test, y_pred, average='weighted'))
        c_rec.append(recall_score(y_test, y_pred, average='weighted'))
        c_f1.append(f1_score(y_test, y_pred, average='weighted'))
    
    accuracy = np.mean(c_acc)
    precision = np.mean(c_pre)
    recall = np.mean(c_rec)
    f1 = np.mean(c_f1)
    
    return accuracy, precision, recall, f1

#### <span style='background :skyblue' > Global Variables </span>

In [15]:
n_split = 14
n_sims = 75

### <span style="background:lightGreen"> Prediction BASE <span>

#### <span style='background :skyblue' > BASE </span>

In [40]:
%%time
start_full = time.time()

b_acc = []
b_pre = []
b_rec = []
b_f1 = []
b_vi = []

b_sim_acc = []
b_sim_pre = []
b_sim_rec = []
b_sim_f1 = []
b_sim_vi = []
    
target = df.columns[0]
    
for sim in range(n_sims):
    #print('-----------------', target, '- sim n:', sim, '- lag: ', r, '-------------------')
    acc, pre, rec, f1, vi = tree_feature_importance_class(df, target, split=n_rests, plot = 3, r = 1)
        
    # Save data -- SIM
    b_sim_acc.append(acc)
    b_sim_pre.append(pre)
    b_sim_rec.append(rec)
    b_sim_f1.append(f1)
    b_sim_vi.append(vi)
        
        
mean_sim_vi = pd.DataFrame(columns = ['feature', 'importance'])

# Loop to make variable importance simulation mean
for i,col in zip(range(b_sim_vi[0].shape[0]), b_sim_vi[0]['feature']):
    aux = []
    
    for j in range(n_sims):
        aux.append(b_sim_vi[j]['importance'][i])
    
    mean_sim_vi.loc[i, 'feature'] = col
    mean_sim_vi.loc[i, 'importance'] = np.mean(aux)

#print('\n Mean Variable Importance LAGS DATAFRAMES \n')
#mean_sim_vi.head()
    
# Save Data -- LAG Loop
b_acc.append(np.mean(b_sim_acc))
b_pre.append(np.mean(b_sim_pre))
b_rec.append(np.mean(b_sim_rec))
b_f1.append(np.mean(b_sim_f1))
b_vi.append(mean_sim_vi)
    
# Save Data -- VARIABLES/FEATURES

#print(lag_acc)
#print(lag_pre)
#print(lag_rec)
#print(lag_f1)
#print(lag_vi)

end_full = time.time()

CPU times: total: 20.2 s
Wall time: 43.5 s


In [41]:
print(b_acc)
print(b_pre)
print(b_rec)
print(b_f1)
#print(lag_vi)

[0.020040000000000002]
[0.09996]
[0.020040000000000002]
[0.03336]


#### <span style='background :skyblue' > CV </span>

TSCV

In [18]:
%%time
start_full_tscv = time.time()

bl_acc_tscv = []
bl_pre_tscv = []
bl_rec_tscv = []
bl_f1_tscv = []
bl_vi_tscv = []

bl_sim_acc_tscv = []
bl_sim_pre_tscv = []
bl_sim_rec_tscv = []
bl_sim_f1_tscv = []
bl_sim_vi_tscv = []
    
target = df.columns[0]
    
for sim in range(n_sims):
    #print('-----------------', target, '- sim n:', sim, '- lag: ', r, '-------------------')
    b_acc_tscv, b_pre_tscv, b_rec_tscv, b_f1_tscv = tree_classification_cv(df, target, cv = 1, cv_split = n_split, test_size = n_rests)
        
    # Save data -- SIM
    bl_sim_acc_tscv.append(b_acc_tscv)
    bl_sim_pre_tscv.append(b_pre_tscv)
    bl_sim_rec_tscv.append(b_rec_tscv)
    bl_sim_f1_tscv.append(b_f1_tscv)
    
# Save Data -- LAG Loop
bl_acc_tscv.append(np.mean(bl_sim_acc_tscv))
bl_pre_tscv.append(np.mean(bl_sim_pre_tscv))
bl_rec_tscv.append(np.mean(bl_sim_rec_tscv))
bl_f1_tscv.append(np.mean(bl_sim_f1_tscv))
    
# Save Data -- VARIABLES/FEATURES

#print(b_acc_tscv)
#print(b_pre_tscv)
#print(b_rec_tscv)
#print(b_f1_tscv)

end_full_tscv = time.time()

CPU times: total: 5min 16s
Wall time: 5min 41s


In [19]:
print(np.round(b_acc_tscv,3))
print(b_pre_tscv)
print(b_rec_tscv)
print(b_f1_tscv)

0.131
0.037698412698412696
0.13095238095238096
0.05668934240362812


BCV

In [20]:
%%time
start_full_bcv = time.time()

bl_acc_bcv = []
bl_pre_bcv = []
bl_rec_bcv = []
bl_f1_bcv = []
bl_vi_bcv = []

bl_sim_acc_bcv = []
bl_sim_pre_bcv = []
bl_sim_rec_bcv = []
bl_sim_f1_bcv = []
bl_sim_vi_bcv = []
    
target = df.columns[0]
    
for sim in range(n_sims):
    #print('-----------------', target, '- sim n:', sim, '- lag: ', r, '-------------------')
    b_acc_bcv, b_pre_bcv, b_rec_bcv, b_f1_bcv = tree_classification_cv(df, target, cv = 1, cv_split = n_split, test_size = n_rests)
        
    # Save data -- SIM
    bl_sim_acc_bcv.append(b_acc_bcv)
    bl_sim_pre_bcv.append(b_pre_bcv)
    bl_sim_rec_bcv.append(b_rec_bcv)
    bl_sim_f1_bcv.append(b_f1_bcv)
    
# Save Data -- LAG Loop
bl_acc_bcv.append(np.mean(bl_sim_acc_bcv))
bl_pre_bcv.append(np.mean(bl_sim_pre_bcv))
bl_rec_bcv.append(np.mean(bl_sim_rec_bcv))
bl_f1_bcv.append(np.mean(bl_sim_f1_bcv))
    
# Save Data -- VARIABLES/FEATURES

#print(b_acc_bcv)
#print(b_pre_bcv)
#print(b_rec_bcv)
#print(b_f1_bcv)

end_full_bcv = time.time()

CPU times: total: 5min 25s
Wall time: 5min 46s


In [21]:
print(np.round(bl_acc_bcv,3))
print(bl_pre_bcv)
print(bl_rec_bcv)
print(bl_f1_bcv)

[0.127]
[0.05411640211640211]
[0.12730158730158728]
[0.0602358276643991]


### <span style="background:lightGreen"> Predictiones <span>

#### <span style='background :skyblue' > Generate Lags and Select a Best number of Lags </span>

In [42]:
def generate_lagged_dfs(df, target_col, num_lags, start):
    lagged_dfs = []
    for i in range(0, start):
        if len(lagged_dfs) == 0:
            lagged_df = df.copy()
        else:
            lagged_df = lagged_dfs[-1].copy()
        lagged_df[target_col + '_lag_' + str(i+1)] = df[target_col].shift(i+1)
        lagged_dfs.append(lagged_df)
    for i in range(start, start + num_lags):
        if len(lagged_dfs) == 0:
            lagged_df = df.copy()
        else:
            lagged_df = lagged_dfs[-1].copy()
        lagged_df[target_col + '_lag_' + str(i+1)] = df[target_col].shift(i+1)
        lagged_dfs.append(lagged_df)
    for d in lagged_dfs:
        d.dropna(inplace=True)
        
    return lagged_dfs

In [43]:
lag_df = generate_lagged_dfs(df, 'rest_type', n_rests, 8)
for i in range(n_rests-1):
    lag_df.pop(0)

In [44]:
n_rests

6

In [45]:
lag_df[0].head(10)

Unnamed: 0,rest_type,n_streams_listened,rest_type_lag_1,rest_type_lag_2,rest_type_lag_3,rest_type_lag_4,rest_type_lag_5,rest_type_lag_6
2017-04-05 10:14:38,2,54,5.0,1.0,0.0,2.0,1.0,1.0
2017-04-05 12:07:17,2,16,2.0,5.0,1.0,0.0,2.0,1.0
2017-04-05 14:18:35,2,5,2.0,2.0,5.0,1.0,0.0,2.0
2017-04-05 17:10:01,2,23,2.0,2.0,2.0,5.0,1.0,0.0
2017-04-05 18:22:31,1,10,2.0,2.0,2.0,2.0,5.0,1.0
2017-04-05 19:24:03,1,9,1.0,2.0,2.0,2.0,2.0,5.0
2017-04-06 05:32:24,5,6,1.0,1.0,2.0,2.0,2.0,2.0
2017-04-06 11:49:48,4,10,5.0,1.0,1.0,2.0,2.0,2.0
2017-04-06 12:46:40,0,14,4.0,5.0,1.0,1.0,2.0,2.0
2017-04-06 17:47:23,4,1,0.0,4.0,5.0,1.0,1.0,2.0


#### <span style='background :skyblue' > Variables Importance </span>

In [46]:
%%time

for d in lag_df:
    print('DATAFRAME: ', d.columns[0], '-- Last Lag ------------------------------------')
    target = d.columns[0]
    tree_feature_importance(d, target, split=n_rests, plot = 2, r = 1, sort=0)

DATAFRAME:  rest_type -- Last Lag ------------------------------------


NameError: name 'tree_feature_importance' is not defined

#### <span style='background :skyblue' > Basic RF </span>

In [47]:
%%time
start_full = time.time()

lag_acc = []
lag_pre = []
lag_rec = []
lag_f1 = []
lag_vi = []

for d, r in zip(lag_df, range(n_rests)):
    sim_acc = []
    sim_pre = []
    sim_rec = []
    sim_f1 = []
    sim_vi = []
    
    target = d.columns[0]
    
    for sim in range(n_sims):
        #print('-----------------', target, '- sim n:', sim, '- lag: ', r, '-------------------')
        acc, pre, rec, f1, vi = tree_feature_importance_class(d, target, split=n_rests, plot = 3, r = 1)
        
        # Save data -- SIM
        sim_acc.append(acc)
        sim_pre.append(pre)
        sim_rec.append(rec)
        sim_f1.append(f1)
        sim_vi.append(vi)
        
        
    mean_sim_vi = pd.DataFrame(columns = ['feature', 'importance'])

    # Loop to make variable importance simulation mean
    for i,col in zip(range(sim_vi[0].shape[0]), sim_vi[0]['feature']):
        aux = []
    
        for j in range(n_sims):
            aux.append(sim_vi[j]['importance'][i])
    
        mean_sim_vi.loc[i, 'feature'] = col
        mean_sim_vi.loc[i, 'importance'] = np.mean(aux)

    #print('\n Mean Variable Importance LAGS DATAFRAMES \n')
    #mean_sim_vi.head()
    
    # Save Data -- LAG Loop
    lag_acc.append(np.mean(sim_acc))
    lag_pre.append(np.mean(sim_pre))
    lag_rec.append(np.mean(sim_rec))
    lag_f1.append(np.mean(sim_f1))
    lag_vi.append(mean_sim_vi)
    
    # Save Data -- VARIABLES/FEATURES

    #print(lag_acc)
    #print(lag_pre)
    #print(lag_rec)
    #print(lag_f1)
    #print(lag_vi)

end_full = time.time()

CPU times: total: 10min 18s
Wall time: 19min 32s


In [48]:
print("full time: ", (end_full-start_full)* 10**3, 'ms')

full time:  1172825.243473053 ms


In [49]:
print(lag_acc)
print(lag_pre)
print(lag_rec)
print(lag_f1)
#print(lag_vi)

[0.3998133333333333, 0.2888133333333333, 0.20684, 0.27110666666666666, 0.31997333333333333, 0.33996]
[0.58636, 0.5244266666666666, 0.4521866666666667, 0.5996133333333333, 0.7196266666666666, 0.58948]
[0.3998133333333333, 0.2888133333333333, 0.20684, 0.27110666666666666, 0.31997333333333333, 0.33996]
[0.4728533333333333, 0.3671333333333333, 0.28134666666666663, 0.3621333333333333, 0.4237866666666667, 0.41973333333333335]


In [None]:
lag_df[0]

#### <span style='background :skyblue' > CV </span>

<span style='background :skyblue' > TSCV </span>

In [None]:
%%time
start_full_tscv = time.time()

lag_acc_tscv = []
lag_pre_tscv = []
lag_rec_tscv = []
lag_f1_tscv = []
lag_vi_tscv = []

for d, r in zip(lag_df, range(n_rests)):
    sim_acc_tscv = []
    sim_pre_tscv = []
    sim_rec_tscv = []
    sim_f1_tscv = []
    sim_vi_tscv = []
    
    target = d.columns[0]
    
    for sim in range(n_sims):
        #print('-----------------', target, '- sim n:', sim, '- lag: ', r, '-------------------')
        acc_tscv, pre_tscv, rec_tscv, f1_tscv = tree_classification_cv(d, target, cv = 1, cv_split = n_split, test_size = n_rests)
        
        # Save data -- SIM
        sim_acc_tscv.append(acc_tscv)
        sim_pre_tscv.append(pre_tscv)
        sim_rec_tscv.append(rec_tscv)
        sim_f1_tscv.append(f1_tscv)
    
    # Save Data -- LAG Loop
    lag_acc_tscv.append(np.mean(sim_acc_tscv))
    lag_pre_tscv.append(np.mean(sim_pre_tscv))
    lag_rec_tscv.append(np.mean(sim_rec_tscv))
    lag_f1_tscv.append(np.mean(sim_f1_tscv))
    
    # Save Data -- VARIABLES/FEATURES

    #print(lag_acc_tscv)
    #print(lag_pre_tscv)
    #print(lag_rec_tscv)
    #print(lag_f1_tscv)

end_full_tscv = time.time()

In [None]:
print("full time: ", (end_full_tscv-start_full_tscv)* 10**3, 'ms')

In [None]:
print(np.round(lag_acc_tscv,3))
print(lag_pre_tscv)
print(lag_rec_tscv)
print(lag_f1_tscv)

<span style='background :skyblue' > BCV </span>

In [None]:
%%time
start_full_bcv = time.time()

lag_acc_bcv = []
lag_pre_bcv = []
lag_rec_bcv = []
lag_f1_bcv = []
lag_vi_bcv = []

for d, r in zip(lag_df, range(n_rests)):
    sim_acc_bcv = []
    sim_pre_bcv = []
    sim_rec_bcv = []
    sim_f1_bcv = []
    sim_vi_bcv = []
    
    target = d.columns[0]
    
    for sim in range(n_sims):
        #print('-----------------', target, '- sim n:', sim, '- lag: ', r, '-------------------')
        acc_bcv, pre_bcv, rec_bcv, f1_bcv = tree_classification_cv(d, target, cv = 2, cv_split = n_split, test_size = n_rests)
        
        # Save data -- SIM
        sim_acc_bcv.append(acc_bcv)
        sim_pre_bcv.append(pre_bcv)
        sim_rec_bcv.append(rec_bcv)
        sim_f1_bcv.append(f1_bcv)
    
    # Save Data -- LAG Loop
    lag_acc_bcv.append(np.mean(sim_acc_bcv))
    lag_pre_bcv.append(np.mean(sim_pre_bcv))
    lag_rec_bcv.append(np.mean(sim_rec_bcv))
    lag_f1_bcv.append(np.mean(sim_f1_bcv))
    
    # Save Data -- VARIABLES/FEATURES

    #print(lag_acc_bcv)
    #print(lag_pre_bcv)
    #print(lag_rec_bcv)
    #print(lag_f1_bcv)

end_full_bcv = time.time()

In [None]:
print("full time: ", (end_full_bcv-start_full_bcv)* 10**3, 'ms')

In [None]:
print(np.round(lag_acc_bcv,3))
print(lag_pre_bcv)
print(lag_rec_bcv)
print(lag_f1_bcv)

##### <span style='background :yellow' > Select best Nºlags and CV Type </span>

##### <span style='background :yellow' > Need to make huge Simulation... a lot of time</span>


### <span style='background :orange' > About checking time features and ciclycal values, we will use the results concluded from the first experiment N_Streams_Prediction </span>

# <span style='background :orange' > Is best base model??? </span>

In [36]:
# End total time
end_full_infi = time.time()

In [37]:
print("full time: ", end_full_infi-start_full_infi * 10**3, 'ms')

full time:  -1679556165061.5093 ms


# <span style='background :lightgreen' > Create TF </span>

In [None]:
def create_time_features(df, f=0):
    df = df.copy()
    if f == 0:
        df['year'] = df.index.year
        df['quarter'] = df.index.quarter
        df['month'] = df.index.month
        df['week'] = df.index.isocalendar().week
        df['day'] = df.index.day
        df['dayofyear'] = df.index.dayofyear
        df['dayofweek'] = df.index.dayofweek
    
        df['is_month_end'] = df.index.is_month_end
        df['is_month_start'] = df.index.is_month_start
        df['is_cuarter_end'] = df.index.is_quarter_end
        df['is_cuarter_start'] = df.index.is_quarter_start
        df['is_year_start'] = df.index.is_year_start
    
    if f == 1:
        df['year'] = df.index.year
        df['quarter'] = df.index.quarter
        df['month'] = df.index.month
        df['week'] = df.index.isocalendar().week
        df['day'] = df.index.day
        df['dayofyear'] = df.index.dayofyear
        df['dayofweek'] = df.index.dayofweek
        
    if f == 2:
        df['day'] = df.index.day
        df['dayofyear'] = df.index.dayofyear
        df['dayofweek'] = df.index.dayofweek
        df['year'] = df.index.year
        
        
    return df

### <span style='background :lightgreen' > Base </span>

In [None]:
# Seleccionar Dataframe con mejores lags
# Crear Diferentes 3 sets de TF
dftf1 = create_time_features(lag_df[2])
dftf2 = create_time_features(lag_df[2], f = 1)
dftf3 = create_time_features(lag_df[2], f = 2)

### <span style="background:skyblue"> TF1 <span>

In [29]:
%%time
start_full = time.time()

lag_acc1 = []
lag_pre1 = []
lag_rec1 = []
lag_f11 = []
lag_vi1 = []

sim_acc1 = []
sim_pre1 = []
sim_rec1 = []
sim_f11 = []
sim_vi1 = []
    
target = dftf1.columns[0]
    
for sim in range(n_sims):
    #print('-----------------', target, '- sim n:', sim, '- lag: ', r, '-------------------')
    acc1, pre1, rec1, f11, vi1 = tree_feature_importance_class(dftf1, target, split=n_rests, plot = 3, r = 1)
        
    # Save data -- SIM
    sim_acc1.append(acc1)
    sim_pre1.append(pre1)
    sim_rec1.append(rec1)
    sim_f11.append(f11)
    sim_vi1.append(vi1)
        
        
mean_sim_vi = pd.DataFrame(columns = ['feature', 'importance'])

# Loop to make variable importance simulation mean
for i,col in zip(range(sim_vi[0].shape[0]), sim_vi[0]['feature']):
    aux = []
    
    for j in range(n_sims):
        aux.append(sim_vi[j]['importance'][i])
    
    mean_sim_vi.loc[i, 'feature'] = col
    mean_sim_vi.loc[i, 'importance'] = np.mean(aux)

#print('\n Mean Variable Importance LAGS DATAFRAMES \n')
#mean_sim_vi.head()
    
# Save Data -- LAG Loop
lag_acc1.append(np.mean(sim_acc1))
lag_pre1.append(np.mean(sim_pre1))
lag_rec1.append(np.mean(sim_rec1))
lag_f11.append(np.mean(sim_f11))
lag_vi1.append(mean_sim_vi)
    
# Save Data -- VARIABLES/FEATURES

#print(lag_acc)
#print(lag_pre)
#print(lag_rec)
#print(lag_f1)
#print(lag_vi)

end_full = time.time()

CPU times: total: 1min 53s
Wall time: 4min 1s


### <span style="background:skyblue"> TF2 <span>

In [30]:
%%time
start_full = time.time()

lag_acc2 = []
lag_pre2 = []
lag_rec2 = []
lag_f22 = []
lag_vi2 = []

sim_acc2 = []
sim_pre2 = []
sim_rec2 = []
sim_f22 = []
sim_vi2 = []
    
target = dftf2.columns[0]
    
for sim in range(n_sims):
    #print('-----------------', target, '- sim n:', sim, '- lag: ', r, '-------------------')
    acc2, pre2, rec2, f22, vi2 = tree_feature_importance_class(dftf2, target, split=n_rests, plot = 3, r = 1)
        
    # Save data -- SIM
    sim_acc2.append(acc2)
    sim_pre2.append(pre2)
    sim_rec2.append(rec2)
    sim_f22.append(f22)
    sim_vi2.append(vi2)
        
        
mean_sim_vi = pd.DataFrame(columns = ['feature', 'importance'])

# Loop to make variable importance simulation mean
for i,col in zip(range(sim_vi[0].shape[0]), sim_vi[0]['feature']):
    aux = []
    
    for j in range(n_sims):
        aux.append(sim_vi[j]['importance'][i])
    
    mean_sim_vi.loc[i, 'feature'] = col
    mean_sim_vi.loc[i, 'importance'] = np.mean(aux)

#print('\n Mean Variable Importance LAGS DATAFRAMES \n')
#mean_sim_vi.head()
    
# Save Data -- LAG Loop
lag_acc2.append(np.mean(sim_acc2))
lag_pre2.append(np.mean(sim_pre2))
lag_rec2.append(np.mean(sim_rec2))
lag_f22.append(np.mean(sim_f22))
lag_vi2.append(mean_sim_vi)
    
# Save Data -- VARIABLES/FEATURES

#print(lag_acc)
#print(lag_pre)
#print(lag_rec)
#print(lag_f2)
#print(lag_vi)

end_full = time.time()

CPU times: total: 2min 21s
Wall time: 3min 45s


### <span style="background:skyblue"> TF3 <span>

In [31]:
%%time
start_full = time.time()

lag_acc3 = []
lag_pre3 = []
lag_rec3 = []
lag_f33 = []
lag_vi3 = []

sim_acc3 = []
sim_pre3 = []
sim_rec3 = []
sim_f33 = []
sim_vi3 = []
    
target = dftf3.columns[0]
    
for sim in range(n_sims):
    #print('-----------------', target, '- sim n:', sim, '- lag: ', r, '-------------------')
    acc3, pre3, rec3, f33, vi3 = tree_feature_importance_class(dftf3, target, split=n_rests, plot = 3, r = 1)
        
    # Save data -- SIM
    sim_acc3.append(acc3)
    sim_pre3.append(pre3)
    sim_rec3.append(rec3)
    sim_f33.append(f33)
    sim_vi3.append(vi3)
        
        
mean_sim_vi = pd.DataFrame(columns = ['feature', 'importance'])

# Loop to make variable importance simulation mean
for i,col in zip(range(sim_vi[0].shape[0]), sim_vi[0]['feature']):
    aux = []
    
    for j in range(n_sims):
        aux.append(sim_vi[j]['importance'][i])
    
    mean_sim_vi.loc[i, 'feature'] = col
    mean_sim_vi.loc[i, 'importance'] = np.mean(aux)

#print('\n Mean Variable Importance LAGS DATAFRAMES \n')
#mean_sim_vi.head()
    
# Save Data -- LAG Loop
lag_acc3.append(np.mean(sim_acc3))
lag_pre3.append(np.mean(sim_pre3))
lag_rec3.append(np.mean(sim_rec3))
lag_f33.append(np.mean(sim_f33))
lag_vi3.append(mean_sim_vi)
    
# Save Data -- VARIABLES/FEATURES

#print(lag_acc)
#print(lag_pre)
#print(lag_rec)
#print(lag_f3)
#print(lag_vi)

end_full = time.time()

CPU times: total: 2min
Wall time: 3min 43s


### <span style='background :lightgreen' > TSCV </span>

### <span style="background:skyblue"> TF1 <span>

In [32]:
%%time
start_full_tscv1 = time.time()

lag_acc_tscv1 = []
lag_pre_tscv1 = []
lag_rec_tscv1 = []
lag_f1_tscv1 = []
lag_vi_tscv1 = []

sim_acc_tscv1 = []
sim_pre_tscv1 = []
sim_rec_tscv1 = []
sim_f1_tscv1 = []
sim_vi_tscv1 = []
    
target = dftf1.columns[0]
    
for sim in range(n_sims):
    #print('-----------------', target, '- sim n:', sim, '- lag: ', r, '-------------------')
    acc_tscv1, pre_tscv1, rec_tscv1, f1_tscv1 = tree_classification_cv(dftf1, target, cv = 1, cv_split = n_split, test_size = n_rests)
        
    # Save data -- SIM
    sim_acc_tscv1.append(acc_tscv1)
    sim_pre_tscv1.append(pre_tscv1)
    sim_rec_tscv1.append(rec_tscv1)
    sim_f1_tscv1.append(f1_tscv1)
    
# Save Data -- LAG Loop
lag_acc_tscv1.append(np.mean(sim_acc_tscv1))
lag_pre_tscv1.append(np.mean(sim_pre_tscv1))
lag_rec_tscv1.append(np.mean(sim_rec_tscv1))
lag_f1_tscv1.append(np.mean(sim_f1_tscv1))
    
# Save Data -- VARIABLES/FEATURES

#print(lag_acc_tscv1)
#print(lag_pre_tscv1)
#print(lag_rec_tscv1)
#print(lag_f1_tscv1)

end_full_tscv1 = time.time()

CPU times: total: 20min 52s
Wall time: 59min 43s


### <span style="background:skyblue"> TF2 <span>

In [33]:
%%time
start_full_tscv2 = time.time()

lag_acc_tscv2 = []
lag_pre_tscv2 = []
lag_rec_tscv2 = []
lag_f1_tscv2 = []
lag_vi_tscv2 = []

sim_acc_tscv2 = []
sim_pre_tscv2 = []
sim_rec_tscv2 = []
sim_f1_tscv2 = []
sim_vi_tscv2 = []
    
target = dftf1.columns[0]
    
for sim in range(n_sims):
    #print('-----------------', target, '- sim n:', sim, '- lag: ', r, '-------------------')
    acc_tscv2, pre_tscv2, rec_tscv2, f1_tscv2 = tree_classification_cv(dftf1, target, cv = 1, cv_split = n_split, test_size = n_rests)
        
    # Save data -- SIM
    sim_acc_tscv2.append(acc_tscv2)
    sim_pre_tscv2.append(pre_tscv2)
    sim_rec_tscv2.append(rec_tscv2)
    sim_f1_tscv2.append(f1_tscv2)
    
# Save Data -- LAG Loop
lag_acc_tscv2.append(np.mean(sim_acc_tscv2))
lag_pre_tscv2.append(np.mean(sim_pre_tscv2))
lag_rec_tscv2.append(np.mean(sim_rec_tscv2))
lag_f1_tscv2.append(np.mean(sim_f1_tscv2))
    
# Save Data -- VARIABLES/FEATURES

#print(lag_acc_tscv2)
#print(lag_pre_tscv2)
#print(lag_rec_tscv2)
#print(lag_f1_tscv2)

end_full_tscv2 = time.time()

CPU times: total: 20min 13s
Wall time: 1h 7min 45s


### <span style="background:skyblue"> TF3 <span>

In [34]:
%%time
start_full_tscv3 = time.time()

lag_acc_tscv3 = []
lag_pre_tscv3 = []
lag_rec_tscv3 = []
lag_f1_tscv3 = []
lag_vi_tscv3 = []

sim_acc_tscv3 = []
sim_pre_tscv3 = []
sim_rec_tscv3 = []
sim_f1_tscv3 = []
sim_vi_tscv3 = []
    
target = dftf1.columns[0]
    
for sim in range(n_sims):
    #print('-----------------', target, '- sim n:', sim, '- lag: ', r, '-------------------')
    acc_tscv3, pre_tscv3, rec_tscv3, f1_tscv3 = tree_classification_cv(dftf1, target, cv = 1, cv_split = n_split, test_size = n_rests)
        
    # Save data -- SIM
    sim_acc_tscv3.append(acc_tscv3)
    sim_pre_tscv3.append(pre_tscv3)
    sim_rec_tscv3.append(rec_tscv3)
    sim_f1_tscv3.append(f1_tscv3)
    
# Save Data -- LAG Loop
lag_acc_tscv3.append(np.mean(sim_acc_tscv3))
lag_pre_tscv3.append(np.mean(sim_pre_tscv3))
lag_rec_tscv3.append(np.mean(sim_rec_tscv3))
lag_f1_tscv3.append(np.mean(sim_f1_tscv3))
    
# Save Data -- VARIABLES/FEATURES

#print(lag_acc_tscv3)
#print(lag_pre_tscv3)
#print(lag_rec_tscv3)
#print(lag_f1_tscv3)

end_full_tscv3 = time.time()

CPU times: total: 27min 17s
Wall time: 1h 11min 38s


### <span style='background :lightgreen' > BCV </span>

### <span style="background:skyblue"> TF1 <span>

In [35]:
%%time
start_full_bcv1 = time.time()

lag_acc_bcv1 = []
lag_pre_bcv1 = []
lag_rec_bcv1 = []
lag_f1_bcv1 = []
lag_vi_bcv1 = []

sim_acc_bcv1 = []
sim_pre_bcv1 = []
sim_rec_bcv1 = []
sim_f1_bcv1 = []
sim_vi_bcv1 = []
    
target = dftf1.columns[0]
    
for sim in range(n_sims):
    #print('-----------------', target, '- sim n:', sim, '- lag: ', r, '-------------------')
    acc_bcv1, pre_bcv1, rec_bcv1, f1_bcv1 = tree_classification_cv(dftf1, target, cv = 2, cv_split = n_split, test_size = n_rests)
        
    # Save data -- SIM
    sim_acc_bcv1.append(acc_bcv1)
    sim_pre_bcv1.append(pre_bcv1)
    sim_rec_bcv1.append(rec_bcv1)
    sim_f1_bcv1.append(f1_bcv1)
    
# Save Data -- LAG Loop
lag_acc_bcv1.append(np.mean(sim_acc_bcv1))
lag_pre_bcv1.append(np.mean(sim_pre_bcv1))
lag_rec_bcv1.append(np.mean(sim_rec_bcv1))
lag_f1_bcv1.append(np.mean(sim_f1_bcv1))
    
# Save Data -- VARIABLES/FEATURES

#print(lag_acc_bcv1)
#print(lag_pre_bcv1)
#print(lag_rec_bcv1)
#print(lag_f1_bcv1)

end_full_bcv1 = time.time()

CPU times: total: 3min 45s
Wall time: 4min 5s


### <span style="background:skyblue"> TF2 <span>

In [36]:
%%time
start_full_bcv2 = time.time()

lag_acc_bcv2 = []
lag_pre_bcv2 = []
lag_rec_bcv2 = []
lag_f1_bcv2 = []
lag_vi_bcv2 = []

sim_acc_bcv2 = []
sim_pre_bcv2 = []
sim_rec_bcv2 = []
sim_f1_bcv2 = []
sim_vi_bcv2 = []
    
target = dftf2.columns[0]
    
for sim in range(n_sims):
    #print('-----------------', target, '- sim n:', sim, '- lag: ', r, '-------------------')
    acc_bcv2, pre_bcv2, rec_bcv2, f1_bcv2 = tree_classification_cv(dftf2, target, cv = 2, cv_split = n_split, test_size = n_rests)
        
    # Save data -- SIM
    sim_acc_bcv2.append(acc_bcv2)
    sim_pre_bcv2.append(pre_bcv2)
    sim_rec_bcv2.append(rec_bcv2)
    sim_f1_bcv2.append(f1_bcv2)
    
# Save Data -- LAG Loop
lag_acc_bcv2.append(np.mean(sim_acc_bcv2))
lag_pre_bcv2.append(np.mean(sim_pre_bcv2))
lag_rec_bcv2.append(np.mean(sim_rec_bcv2))
lag_f1_bcv2.append(np.mean(sim_f1_bcv2))
    
# Save Data -- VARIABLES/FEATURES

#print(lag_acc_bcv2)
#print(lag_pre_bcv2)
#print(lag_rec_bcv2)
#print(lag_f1_bcv2)

end_full_bcv2 = time.time()

CPU times: total: 3min 26s
Wall time: 3min 31s


### <span style="background:skyblue"> TF3 <span>

In [37]:
%%time
start_full_bcv3 = time.time()

lag_acc_bcv3 = []
lag_pre_bcv3 = []
lag_rec_bcv3 = []
lag_f1_bcv3 = []
lag_vi_bcv3 = []

sim_acc_bcv3 = []
sim_pre_bcv3 = []
sim_rec_bcv3 = []
sim_f1_bcv3 = []
sim_vi_bcv3 = []
    
target = dftf3.columns[0]
    
for sim in range(n_sims):
    #print('-----------------', target, '- sim n:', sim, '- lag: ', r, '-------------------')
    acc_bcv3, pre_bcv3, rec_bcv3, f1_bcv3 = tree_classification_cv(dftf3, target, cv = 2, cv_split = n_split, test_size = n_rests)
        
    # Save data -- SIM
    sim_acc_bcv3.append(acc_bcv3)
    sim_pre_bcv3.append(pre_bcv3)
    sim_rec_bcv3.append(rec_bcv3)
    sim_f1_bcv3.append(f1_bcv3)
    
# Save Data -- LAG Loop
lag_acc_bcv3.append(np.mean(sim_acc_bcv3))
lag_pre_bcv3.append(np.mean(sim_pre_bcv3))
lag_rec_bcv3.append(np.mean(sim_rec_bcv3))
lag_f1_bcv3.append(np.mean(sim_f1_bcv3))
    
# Save Data -- VARIABLES/FEATURES

#print(lag_acc_bcv3)
#print(lag_pre_bcv3)
#print(lag_rec_bcv3)
#print(lag_f1_bcv3)

end_full_bcv3 = time.time()

CPU times: total: 3min 10s
Wall time: 3min 14s


# Results

In [38]:
for mae, d1, mae1, d2, mae2 in zip(lag_acc1, dftf2, lag_acc2, dftf3, lag_acc3):
    print('TF 0 BCV:',str(dftf1.columns[0]), ': ', mae)
    print('TF 1 BCV:', str(dftf2.columns[0]), ': ', mae1)
    print('TF 2 BCV:',str(dftf3.columns[0]), ': ', mae2)
    print('\n')

TF 0 BCV: rest_type :  0.7332933333333334
TF 1 BCV: rest_type :  0.7444
TF 2 BCV: rest_type :  0.7066933333333334




In [39]:
for mae, d1, mae1, d2, mae2 in zip(lag_acc_tscv1, dftf2, lag_acc_tscv2, dftf3, lag_acc_tscv3):
    print('TF 0 BCV:',str(dftf1.columns[0]), ': ', mae)
    print('TF 1 BCV:', str(dftf2.columns[0]), ': ', mae1)
    print('TF 2 BCV:',str(dftf3.columns[0]), ': ', mae2)
    print('\n')

TF 0 BCV: rest_type :  0.3393650793650795
TF 1 BCV: rest_type :  0.3376190476190476
TF 2 BCV: rest_type :  0.3453968253968254




In [40]:
for mae, d1, mae1, d2, mae2 in zip(lag_acc_bcv1, dftf2, lag_acc_bcv2, dftf3, lag_acc_bcv3):
    print('TF 0 BCV:',str(dftf1.columns[0]), ': ', mae)
    print('TF 1 BCV:', str(dftf2.columns[0]), ': ', mae1)
    print('TF 2 BCV:',str(dftf3.columns[0]), ': ', mae2)
    print('\n')

TF 0 BCV: rest_type :  0.26650793650793647
TF 1 BCV: rest_type :  0.25761904761904764
TF 2 BCV: rest_type :  0.25587301587301586


