## Autoregressive feature engineering

In [None]:
def add_lags(
    df,
    lags,
    column,
    ts_id=None,
    use_32_bit=False
):
    """Append lagged copies of ``column`` to ``df``.

    One column named ``"{column}_lag_{lag}"`` is written into ``df`` (in
    place) for every entry of ``lags``.

    Args:
        df: DataFrame holding ``column`` (and ``ts_id`` when given).
        lags: Iterable of shift amounts passed to ``shift``.
        column: Name of the column to lag.
        ts_id: Optional grouping key; when truthy the shift is done per
            group via ``df.groupby(ts_id)``.
        use_32_bit: When True each new column is cast to ``float32``.

    Returns:
        Tuple ``(df, added_features)`` where ``added_features`` lists the
        new column names in creation order.
    """
    created = []
    for lag in lags:
        name = f"{column}_lag_{lag}"
        # Shift within each series when an id is supplied, globally otherwise.
        source = df.groupby(ts_id)[column] if ts_id else df[column]
        lagged = source.shift(lag)
        df[name] = lagged.astype('float32') if use_32_bit else lagged
        created.append(name)

    return df, created

In [None]:
def add_rolling_features(
    df,
    rolls,
    column,
    agg_funcs=("mean", "std"),
    ts_id=None,
    n_shift=1,
    use_32_bit=False
):
    """Append shifted rolling-window aggregates of ``column`` to ``df``.

    For every window size in ``rolls`` and every aggregation in
    ``agg_funcs`` a column ``"{column}_rolling_{window}_{agg}"`` is written
    into ``df`` (in place). The series is shifted by ``n_shift`` before the
    window is applied.

    Args:
        df: DataFrame holding ``column`` (and ``ts_id`` when given).
        rolls: Iterable of rolling window sizes.
        column: Name of the column to aggregate.
        agg_funcs: Aggregations passed to ``Rolling.agg``.
        ts_id: Optional grouping key. When truthy, both the shift and the
            rolling window are evaluated inside each group, so a window
            never mixes rows that belong to different series.
        n_shift: Number of rows to shift before rolling.
        use_32_bit: When True each new column is cast to ``float32``.

    Returns:
        Tuple ``(df, added_features)`` with the new column names in
        creation order.
    """
    # Materialise once so a one-shot iterable can be reused for every window.
    agg_funcs = list(agg_funcs)
    added_features = []
    for window in rolls:
        for agg in agg_funcs:
            feature_name = f"{column}_rolling_{window}_{agg}"

            def _shifted_rolling(series, window=window, agg=agg):
                return series.shift(n_shift).rolling(window).agg(agg)

            if ts_id:
                # transform() applies shift + rolling per group; rolling over
                # the flat output of groupby().shift() would span group
                # boundaries whenever rows are interleaved or n_shift is 0.
                feature = df.groupby(ts_id)[column].transform(_shifted_rolling)
            else:
                feature = _shifted_rolling(df[column])

            df[feature_name] = feature.astype('float32') if use_32_bit else feature
            added_features.append(feature_name)

    return df, added_features

def add_seasonal_rolling_features(
    df,
    seasonal_periods,
    rolls,
    column,
    agg_funcs=("mean", "std"),
    ts_id=None,
    n_shift=1,
    use_32_bit=False
):
    """Append rolling aggregates of ``column`` shifted by whole seasons.

    For every seasonal period ``sp``, window size and aggregation, a column
    ``"{column}_{sp}_seasonal_rolling_{window}_{agg}"`` is written into
    ``df`` (in place). The series is shifted by ``n_shift * sp`` rows and a
    rolling window is then applied.

    NOTE(review): the window covers consecutive rows of the shifted series,
    not every ``sp``-th row. Confirm whether a seasonally-strided window was
    intended.

    Args:
        df: DataFrame holding ``column`` (and ``ts_id`` when given).
        seasonal_periods: Iterable of season lengths (in rows).
        rolls: Iterable of rolling window sizes.
        column: Name of the column to aggregate.
        agg_funcs: Aggregations passed to ``Rolling.agg``.
        ts_id: Optional grouping key. When truthy, shift and rolling are
            both evaluated inside each group so windows never mix series.
        n_shift: Number of seasons to shift by.
        use_32_bit: When True each new column is cast to ``float32``.

    Returns:
        Tuple ``(df, added_features)`` with the new column names in
        creation order.
    """
    # Materialise once so one-shot iterables survive the nested loops.
    rolls = list(rolls)
    agg_funcs = list(agg_funcs)
    added_features = []
    for sp in seasonal_periods:
        for window in rolls:
            for agg in agg_funcs:
                rolling_name = f"{column}_{sp}_seasonal_rolling_{window}_{agg}"

                def _shifted_rolling(series, sp=sp, window=window, agg=agg):
                    return series.shift(n_shift * sp).rolling(window).agg(agg)

                if ts_id:
                    # Per-group evaluation keeps windows inside one series.
                    feature = df.groupby(ts_id)[column].transform(_shifted_rolling)
                else:
                    feature = _shifted_rolling(df[column])

                df[rolling_name] = feature.astype('float32') if use_32_bit else feature
                added_features.append(rolling_name)

    return df, added_features

def add_ewma(
    df,
    column,
    alphas=(0.5,),
    spans=None,
    ts_id=None,
    n_shift=1,
    use_32_bit=False
):
    """Append exponentially weighted moving averages of ``column`` to ``df``.

    One column is written into ``df`` (in place) per smoothing setting. It
    is named ``"{column}_ewma_span_{span}"`` when a span is used and
    ``"{column}_ewma_alpha_{alpha}"`` otherwise. The series is shifted by
    ``n_shift`` rows before smoothing, and ``adjust=False`` is used.

    Args:
        df: DataFrame holding ``column`` (and ``ts_id`` when given).
        column: Name of the column to smooth.
        alphas: Smoothing factors, used for entries that have no span.
        spans: Optional spans. A non-None span takes precedence over the
            alpha at the same position, because pandas rejects ``ewm``
            calls that receive both.
        ts_id: Optional grouping key. When truthy, shift and smoothing are
            evaluated inside each group so no state carries over from one
            series to the next.
        n_shift: Number of rows to shift before smoothing.
        use_32_bit: When True each new column is cast to ``float32``.

    Returns:
        Tuple ``(df, added_features)`` with the new column names in
        creation order.
    """
    alphas = list(alphas)
    if spans is None:
        spans = [None] * len(alphas)
    else:
        spans = list(spans)
        # Pad so every requested span yields a feature even when only the
        # default alphas were supplied.
        alphas += [None] * (len(spans) - len(alphas))

    added_features = []
    for alpha, span in zip(alphas, spans):
        use_span = span is not None
        # Exactly one decay parameter may be handed to ewm().
        ewm_kwargs = {"span": span} if use_span else {"alpha": alpha}

        def _shifted_ewma(series, ewm_kwargs=ewm_kwargs):
            return series.shift(n_shift).ewm(adjust=False, **ewm_kwargs).mean()

        if ts_id:
            feature = df.groupby(ts_id)[column].transform(_shifted_ewma)
        else:
            feature = _shifted_ewma(df[column])

        ewma_name = f"{column}_ewma_{'span' if use_span else 'alpha'}_{span if use_span else alpha}"
        df[ewma_name] = feature.astype('float32') if use_32_bit else feature
        added_features.append(ewma_name)

    return df, added_features


## Temporal feature engineering

In [None]:
import pandas as pd
import numpy as np
import re
from pandas.tseries.frequencies import to_offset

def time_features_from_frequency_str(freq_str):
    """Return the datetime feature names that suit a pandas frequency string.

    The frequency string is parsed with ``to_offset`` and the resulting
    offset object is matched by type, so multiples such as ``"2D"`` resolve
    like their base frequency.

    Args:
        freq_str: A pandas frequency alias (e.g. ``"D"``, ``"W"``, ``"MS"``).

    Returns:
        A new list of feature names (capitalised ``Series.dt`` attribute
        names). Yearly and unrecognised frequencies give an empty list.
    """
    calendar = ["Month", "Quarter", "Is_quarter_end", "Is_quarter_start", "Is_year_end", "Is_year_start"]
    weekly = calendar + ["Is_month_start", "Week"]
    daily = weekly + ["Day", "Dayofweek", "Dayofyear"]
    hourly = daily + ["Hour"]
    minutely = hourly + ["Minute"]

    offsets = pd.offsets
    # Keyed by offset classes: to_offset() returns an offset object, so a
    # lookup keyed by alias strings can never match it.
    features_by_offsets = (
        ((offsets.YearEnd, offsets.YearBegin, offsets.BYearEnd, offsets.BYearBegin), []),
        ((offsets.QuarterEnd, offsets.QuarterBegin, offsets.BQuarterEnd, offsets.BQuarterBegin), calendar),
        ((offsets.MonthEnd, offsets.MonthBegin, offsets.BMonthEnd, offsets.BMonthBegin), calendar),
        ((offsets.Week,), weekly),
        ((offsets.Day, offsets.BusinessDay), daily),
        ((offsets.Hour,), hourly),
        ((offsets.Minute,), minutely),
    )

    offset = to_offset(freq_str)
    for offset_types, features in features_by_offsets:
        if isinstance(offset, offset_types):
            return list(features)
    return []

def add_lags(df, lags, column, ts_id=None, use_32_bit=False):
    """Write lagged copies of ``column`` into ``df`` and return ``df``.

    A column ``"{column}_lag_{lag}"`` is added in place for each entry of
    ``lags``. When ``ts_id`` is truthy the shift is done per group.

    Note: an earlier ``add_lags`` definition in this file returns
    ``(df, added_features)``; this one returns only ``df`` and replaces it
    once executed.

    Args:
        df: DataFrame holding ``column`` (and ``ts_id`` when given).
        lags: Iterable of shift amounts.
        column: Name of the column to lag.
        ts_id: Optional grouping key for per-series shifting.
        use_32_bit: When True each new column is cast to ``float32``.

    Returns:
        The same ``df`` object with the lag columns added.
    """
    for lag in lags:
        if ts_id:
            lagged = df.groupby(ts_id)[column].shift(lag)
        else:
            lagged = df[column].shift(lag)
        if use_32_bit:
            lagged = lagged.astype('float32')
        df[f"{column}_lag_{lag}"] = lagged
    return df

def add_temporal_features(df, field_name, frequency, add_elapsed=True, prefix=None, drop=True, use_32_bit=False):
    """Expand a datetime column into calendar feature columns.

    The feature names come from ``time_features_from_frequency_str`` and
    each is read from the ``.dt`` accessor of ``df[field_name]``. Columns
    are written into ``df`` in place.

    Args:
        df: DataFrame containing ``field_name``.
        field_name: Name of the datetime column to expand.
        frequency: Frequency string used to choose the feature set.
        add_elapsed: When True, add ``"{prefix}Elapsed"`` computed as the
            int64 view of the column floor-divided by ``10**9``.
            NOTE(review): that equals epoch seconds only for
            nanosecond-resolution datetimes; confirm the column's unit.
        prefix: Column-name prefix. Defaults to ``field_name`` with a
            trailing "date"/"Date" removed, followed by ``"_"``.
        drop: When True, drop ``field_name`` from ``df`` afterwards.
        use_32_bit: When True features are stored as ``float32``; otherwise
            calendar features are ``float64`` and Elapsed is ``int64``.

    Returns:
        The same ``df`` object with the feature columns added.
    """
    prefix = prefix or re.sub(r"[Dd]ate$", "", field_name) + "_"
    feature_dtype = 'float32' if use_32_bit else 'float64'
    accessor = df[field_name].dt
    for n in time_features_from_frequency_str(frequency):
        if n == "Week":
            # Series.dt.week was removed in pandas 2.0; the ISO calendar
            # week is its documented replacement.
            values = accessor.isocalendar().week
        else:
            values = getattr(accessor, n.lower())
        df[prefix + n] = values.astype(feature_dtype)
    if add_elapsed:
        elapsed = df[field_name].astype('int64') // 10**9
        # float32 cannot hold present-day epoch seconds exactly; the cast is
        # kept because callers asked for 32-bit columns.
        df[prefix + "Elapsed"] = elapsed.astype('float32' if use_32_bit else 'int64')
    if drop:
        df.drop(field_name, axis=1, inplace=True)
    return df

def add_fourier_features(df, column_to_encode, max_value=None, n_fourier_terms=1, use_32_bit=False):
    """Add sine/cosine encodings of a cyclical column to ``df``.

    For each term ``i`` in ``1..n_fourier_terms`` the columns
    ``"{column_to_encode}_sin_{i}"`` and ``"{column_to_encode}_cos_{i}"``
    are written into ``df`` in place, using the angle
    ``2 * pi * value * i / max_value``.

    Args:
        df: DataFrame containing ``column_to_encode``.
        column_to_encode: Name of the column; it is cast to ``int`` first.
        max_value: Cycle length. A falsy value falls back to the column's
            maximum.
        n_fourier_terms: Number of sine/cosine pairs to create.
        use_32_bit: When True the new columns are cast to ``float32``.

    Returns:
        The same ``df`` object with the new columns added.
    """
    if not max_value:
        max_value = df[column_to_encode].max()
    cycle_position = df[column_to_encode].astype(int)

    for term in range(1, n_fourier_terms + 1):
        angle = (2 * np.pi * cycle_position * term) / max_value
        sin_name = f"{column_to_encode}_sin_{term}"
        cos_name = f"{column_to_encode}_cos_{term}"
        df[sin_name] = np.sin(angle)
        df[cos_name] = np.cos(angle)
        if use_32_bit:
            df[sin_name] = df[sin_name].astype('float32')
            df[cos_name] = df[cos_name].astype('float32')
    return df


In [None]:
def intersect_list(list1, list2):
    """Return the elements present in both inputs as a list (order unspecified)."""
    common = set(list1) & set(list2)
    return list(common)

def difference_list(list1, list2):
    """Return the elements of ``list1`` that are not in ``list2`` (order unspecified)."""
    only_in_first = set(list1).difference(set(list2))
    return list(only_in_first)

def union_list(list1, list2):
    """Return the distinct elements found in either input (order unspecified)."""
    combined = set(list1) | set(list2)
    return list(combined)

In [None]:
from dataclasses import dataclass, field
import pandas as pd
import numpy as np
from typing import List, Dict, Union
from sklearn.base import BaseEstimator, clone
from sklearn.preprocessing import StandardScaler
import warnings

def intersect_list(a, b):
    """Return the elements shared by ``a`` and ``b`` as a list (order unspecified)."""
    shared = set(a).intersection(set(b))
    return list(shared)

def difference_list(a, b):
    """Return the elements of ``a`` that are absent from ``b`` (order unspecified)."""
    remaining = set(a).difference(set(b))
    return list(remaining)

@dataclass
class MissingValueConfig:
    """Per-column missing-value strategy plus a generic fallback.

    Without the ``@dataclass`` decorator the ``field(...)`` declarations
    stay as raw ``Field`` objects and the class cannot be configured, so
    the decorator is required.
    """

    # Columns filled by propagating the next valid value backwards.
    bfill_columns: List = field(default_factory=list)
    # Columns filled by propagating the last valid value forwards.
    ffill_columns: List = field(default_factory=list)
    # Columns whose missing values are replaced with 0.
    zero_fill_columns: List = field(default_factory=list)

    @staticmethod
    def _present(df: pd.DataFrame, wanted) -> list:
        """Return the columns of ``df`` named in ``wanted``, in frame order."""
        wanted = set(wanted)
        return list(dict.fromkeys(c for c in df.columns if c in wanted))

    def impute_missing_values(self, df: pd.DataFrame):
        """Return a copy of ``df`` with missing values filled.

        The configured backward-, forward- and zero-fill columns are handled
        first (configured names not in ``df`` are ignored). Any column that
        still has missing values is then filled with its mean when numeric,
        or with the string ``"NA"`` when of object dtype.

        Args:
            df: Input frame; it is not modified.

        Returns:
            A new DataFrame with the fills applied.
        """
        df = df.copy()

        bfill_columns = self._present(df, self.bfill_columns)
        if bfill_columns:
            # .bfill()/.ffill() replace the deprecated fillna(method=...).
            df[bfill_columns] = df[bfill_columns].bfill()
        ffill_columns = self._present(df, self.ffill_columns)
        if ffill_columns:
            df[ffill_columns] = df[ffill_columns].ffill()
        zero_fill_columns = self._present(df, self.zero_fill_columns)
        if zero_fill_columns:
            df[zero_fill_columns] = df[zero_fill_columns].fillna(0)

        # Fallback for whatever is still missing after the explicit rules.
        check = df.isnull().any()
        missing_cols = check[check].index.tolist()
        numeric_cols = set(df.select_dtypes([np.number]).columns)
        object_cols = set(df.select_dtypes(["object"]).columns)
        missing_numeric_cols = [c for c in missing_cols if c in numeric_cols]
        missing_object_cols = [c for c in missing_cols if c in object_cols]
        if missing_numeric_cols:
            df[missing_numeric_cols] = df[missing_numeric_cols].fillna(
                df[missing_numeric_cols].mean()
            )
        if missing_object_cols:
            df[missing_object_cols] = df[missing_object_cols].fillna("NA")
        return df

@dataclass
class FeatureConfig:
    """Names of the columns that play each role in a modelling frame."""

    date: List = field(default_factory=list)
    target: str = field(default=None)
    # Falls back to ``target`` in __post_init__ when left as None.
    original_target: str = field(default=None)
    continuous_features: List[str] = field(default_factory=list)
    categorical_features: List[str] = field(default_factory=list)
    boolean_features: List[str] = field(default_factory=list)
    index_cols: List[str] = field(default_factory=list)
    exogenous_features: List[str] = field(default_factory=list)
    # Derived: categorical + continuous + boolean features.
    feature_list: List[str] = field(init=False)

    def __post_init__(self):
        self.feature_list = (
            self.categorical_features + self.continuous_features + self.boolean_features
        )
        if self.original_target is None:
            self.original_target = self.target

    @staticmethod
    def _indexed_column(df: pd.DataFrame, column, index_cols):
        """Return ``column`` indexed by ``index_cols``, or None if absent from ``df``."""
        if column not in df.columns:
            return None
        return df.loc[:, [column] + index_cols].set_index(index_cols, drop=True)

    def get_X_y(self, df: pd.DataFrame, categorical: bool = False, exogenous: bool = False):
        """Split ``df`` into features, target and original target.

        Args:
            df: Frame containing the configured columns.
            categorical: When True, categorical and boolean features are
                included alongside the continuous ones.
            exogenous: When False, ``exogenous_features`` are excluded.

        Returns:
            Tuple ``(X, y, y_orig)``. ``X`` is indexed by ``index_cols``;
            index columns that are not in ``feature_list`` are removed from
            its columns. ``y`` / ``y_orig`` are single-column frames indexed
            by ``index_cols``, or None when the respective column is not in
            ``df``.
        """
        # Work on a copy: extending the attribute itself would permanently
        # add categorical columns to continuous_features on every call.
        feature_list = list(self.continuous_features)
        if categorical:
            feature_list += self.categorical_features + self.boolean_features
        if not exogenous:
            excluded = set(self.exogenous_features)
            feature_list = [f for f in feature_list if f not in excluded]
        # Order-preserving de-duplication keeps the column order stable.
        feature_list = list(dict.fromkeys(feature_list))

        if isinstance(self.index_cols, str):
            index_cols = [self.index_cols]
        else:
            index_cols = list(self.index_cols)
        known_features = set(self.feature_list)
        delete_index_cols = [c for c in index_cols if c not in known_features]

        # .loc needs a list here; set indexers are rejected by current pandas.
        selected = list(dict.fromkeys(feature_list + index_cols))
        X = (
            df.loc[:, selected]
            .set_index(index_cols, drop=False)
            .drop(columns=delete_index_cols)
        )
        y = self._indexed_column(df, self.target, index_cols)
        y_orig = self._indexed_column(df, self.original_target, index_cols)
        return X, y, y_orig

@dataclass
class ModelConfig:
    """Bundle of an estimator and the preprocessing switches that go with it."""

    # Estimator to be cloned and fitted.
    model: BaseEstimator = None
    # Optional display name.
    name: str = None
    # Preprocessing flags read by the consumer of this config.
    normalize: bool = False
    fill_missing: bool = True
    encode_categorical: bool = False
    # Encoder used when ``encode_categorical`` is set.
    categorical_encoder: BaseEstimator = None

    def clone(self):
        """Replace ``self.model`` with a clone of itself and return ``self``.

        The config object is modified in place; no new ``ModelConfig`` is
        created.
        """
        # ``clone`` here resolves to the module-level function, not this method.
        fresh_model = clone(self.model)
        self.model = fresh_model
        return self

class MLForecast:
    def __init__(self, model_config: ModelConfig, feature_config: FeatureConfig, missing_config: MissingValueConfig = None, target_transformer: object = None):
        """Store the configuration objects and prepare the estimator.

        Args:
            model_config: Supplies the estimator and preprocessing flags.
            feature_config: Supplies the feature column names.
            missing_config: Optional missing-value configuration.
            target_transformer: Optional object stored as-is.
        """
        self.model_config, self.feature_config = model_config, feature_config
        self.missing_config, self.target_transformer = missing_config, target_transformer
        # Work on a clone so the estimator held by the config is left untouched.
        self._model = clone(model_config.model)
        # The helper attributes below exist only when their flag is set.
        if model_config.normalize:
            self._scaler = StandardScaler()
        if model_config.encode_categorical:
            self._cat_encoder = model_config.categorical_encoder
            self._encoded_categorical_features = feature_config.categorical_features

    def


In [None]:
import itertools
import math
import random
from typing import Callable, List, Tuple

import numpy as np
import pandas as pd
from scipy import optimize

def calculate_diversity(ens, diversity_matrix, default_div=1):
    """Return the mean pairwise diversity of the members of ``ens``.

    Args:
        ens: Sequence of labels; each pair is looked up with
            ``diversity_matrix.loc[i, j]``.
        diversity_matrix: Label-indexed table of pairwise diversity values.
        default_div: Value returned when ``ens`` has exactly one member.

    Returns:
        ``default_div`` for a single-member ensemble, otherwise the mean
        over all unordered pairs.
    """
    if len(ens) != 1:
        pairwise = []
        for first, second in itertools.combinations(ens, 2):
            pairwise.append(diversity_matrix.loc[first, second])
        return np.mean(pairwise)
    return default_div

def calculate_performance(ens, pred_wide, target, ensemble_func=np.mean, metric_func=None):
    """Score the combined prediction of the ensemble members.

    Args:
        ens: Column labels of the member predictions in ``pred_wide``.
        pred_wide: Table holding one prediction column per model plus the
            ``target`` column.
        target: Label of the column with the actual values.
        ensemble_func: Called as ``ensemble_func(pred_wide[ens], axis=1)``
            to combine the member predictions.
        metric_func: Called as ``metric_func(prediction, actual)``; it must
            be supplied, the ``None`` default is not callable.

    Returns:
        Whatever ``metric_func`` returns.
    """
    members = pred_wide[ens]
    combined = ensemble_func(members, axis=1)
    actual = pred_wide[target]
    score = metric_func(combined, actual)
    return score

def generate_random_candidate(candidates):
    """Return a one-element list holding a randomly sampled candidate."""
    picked = random.sample(candidates, 1)
    return picked

def generate_best_candidate(objective, solution, candidates):
    """Find the candidate whose addition to ``solution`` scores lowest.

    Args:
        objective: Callable taking a list of members and returning a cost.
        solution: Current list of members.
        candidates: Sequence of members that could be added.

    Returns:
        Tuple ``([best_candidate], best_cost)``.
    """
    costs = []
    for option in candidates:
        costs.append(objective(solution + [option]))
    best_position = np.argmin(costs)
    return [candidates[best_position]], np.min(costs)

def _initialize(candidates, objective, init):
    if init == "best":
        cost = [objective([c]) for c in candidates]
        return [candidates[np.argmin(cost)]], np.min(cost)
    elif init == "random":
        c = generate_random_candidate(candidates)
        return c, objective(c)

def greedy_optimization(objective, candidates, verbose=True):
    """Grow an ensemble greedily until no addition keeps the cost from rising.

    Starts from the single best candidate and repeatedly adds the candidate
    that gives the lowest cost, as long as that cost is not worse than the
    current one.

    Args:
        objective: Callable taking a list of members and returning a cost
            (lower is better).
        candidates: Sequence of candidate members; it is not modified.
        verbose: Accepted for interface compatibility; currently unused.

    Returns:
        Tuple ``(solution, solution_eval)``.
    """
    # Search on a private copy so the caller's list is left intact.
    candidates = list(candidates)
    solution, solution_eval = _initialize(candidates, objective, init="best")
    candidates.remove(solution[0])
    while candidates:
        _candidate, candidate_eval = generate_best_candidate(objective, solution, candidates)
        if candidate_eval > solution_eval:
            # The best remaining addition makes things worse. Nothing is
            # removed from the pool in that case, so continuing would
            # re-evaluate the same state forever.
            break
        solution, solution_eval = solution + _candidate, candidate_eval
        candidates.remove(_candidate[0])
    return solution, solution_eval

def stochastic_hillclimbing(objective, candidates, n_iterations=None, init="best", verbose=True, random_state=42):
    """Grow an ensemble by trying random additions and keeping non-worsening ones.

    Args:
        objective: Callable taking a list of members and returning a cost
            (lower is better).
        candidates: Sequence of candidate members; it is not modified.
        n_iterations: Number of random proposals. Defaults to twice the
            number of candidates.
        init: Initialisation strategy forwarded to ``_initialize``.
        verbose: Accepted for interface compatibility; currently unused.
        random_state: Seed passed to ``random.seed``.

    Returns:
        Tuple ``(solution, solution_eval)``.
    """
    random.seed(random_state)
    # Search on a private copy so the caller's list is left intact.
    candidates = list(candidates)
    n_iterations = len(candidates) * 2 if n_iterations is None else n_iterations
    solution, solution_eval = _initialize(candidates, objective, init)
    candidates.remove(solution[0])
    for _ in range(n_iterations):
        if not candidates:
            # Every candidate has been accepted; sampling from an empty
            # pool would raise.
            break
        _candidate = generate_random_candidate(candidates)
        candidate = solution + _candidate
        candidate_eval = objective(candidate)
        if candidate_eval <= solution_eval:
            solution, solution_eval = candidate, candidate_eval
            candidates.remove(_candidate[0])

    return solution, solution_eval

def _decay_temperature(current_temp, alpha, kind="linear"):
    return current_temp - alpha if kind == "linear" else current_temp / alpha

def initialize_temperature_range(objective, candidate_pool, p_range, n_iterations=100):
    """Estimate start and end temperatures from observed cost differences.

    A random ensemble is grown one member at a time; after each addition
    the difference between the cost of the first one-member ensemble and
    the current cost is recorded. The median absolute difference is
    converted to temperatures with ``-avg_diff / log(p)``.

    Args:
        objective: Callable taking a list of members and returning a cost.
        candidate_pool: Sequence of candidate members; it is not modified.
        p_range: Pair of probabilities ``(p_start, p_end)``.
        n_iterations: Maximum number of random additions to probe.

    Returns:
        Tuple ``(t_start, t_end)``.
        NOTE(review): both are 0 when the median difference is 0; callers
        dividing by the temperature should be checked against that case.
    """
    # Probe on a private copy: removing from the caller's list would leave
    # the subsequent search with few or no candidates.
    pool = list(candidate_pool)
    diff_l = []
    candidates = generate_random_candidate(pool)
    candidate_score = objective(candidates)
    for _ in range(n_iterations):
        if not pool:
            # Fewer candidates than iterations; stop instead of sampling
            # from an empty pool.
            break
        cand = generate_random_candidate(pool)
        candidates += cand
        pool.remove(cand[0])
        diff = candidate_score - objective(candidates)
        diff_l.append(diff)
    avg_diff = np.median(np.abs(diff_l))
    return (-avg_diff / math.log(p_range[0]), -avg_diff / math.log(p_range[1]))

def simulated_annealing(objective, candidates, n_iterations, p_range=(0.7, 0.001), t_range=None, init="best", temperature_decay="linear", verbose=True, random_state=42):
    """Grow an ensemble with simulated annealing.

    Random additions are always accepted when they lower the cost and are
    otherwise accepted with probability ``exp(-|diff| / temperature)``.

    Args:
        objective: Callable taking a list of members and returning a cost
            (lower is better).
        candidates: Sequence of candidate members; it is not modified.
        n_iterations: Requested number of proposals; capped at
            ``int(len(candidates) * 1.2)``.
        p_range: Probabilities used to derive ``t_range`` when it is None.
        t_range: Optional ``(t_start, t_end)`` temperatures.
        init: Initialisation strategy forwarded to ``_initialize``.
        temperature_decay: ``"linear"`` or anything else for geometric.
        verbose: Accepted for interface compatibility; currently unused.
        random_state: Seed passed to ``random.seed``.

    Returns:
        Tuple ``(solution, solution_eval)`` for the state held when the
        search stops. NOTE(review): because worse moves can be accepted,
        this is the current state and not necessarily the best one seen.
    """
    random.seed(random_state)
    # Search on a private copy so the caller's list is left intact.
    candidates = list(candidates)
    n_iterations = min(n_iterations, int(len(candidates) * 1.2))
    if t_range is None:
        # Hand over a copy: the probe must not consume the search pool.
        t_range = initialize_temperature_range(objective, list(candidates), p_range)
    # A single iteration has no decay step; avoid dividing by zero.
    decay_steps = max(n_iterations - 1, 1)
    if temperature_decay == "linear":
        alpha = (t_range[0] - t_range[1]) / decay_steps
    else:
        alpha = math.pow((t_range[0] / t_range[1]), 1 / decay_steps)
    best_solution, best_solution_eval = _initialize(candidates, objective, init)
    candidates.remove(best_solution[0])
    current_temp = t_range[0]
    for _ in range(n_iterations):
        if not candidates:
            break
        _candidate = generate_random_candidate(candidates)
        candidate = best_solution + _candidate
        candidate_eval = objective(candidate)
        diff = best_solution_eval - candidate_eval
        accept = diff > 0
        # Evaluate the random acceptance only for a positive temperature so
        # a temperature of zero cannot trigger a division by zero.
        if not accept and current_temp > 0:
            accept = random.uniform(0, 1) < math.exp(-abs(diff) / current_temp)
        if accept:
            best_solution, best_solution_eval = candidate, candidate_eval
            candidates.remove(_candidate[0])
        current_temp = _decay_temperature(current_temp, alpha, temperature_decay)
    return best_solution, best_solution_eval

def find_optimal_combination(candidates, pred_wide, target, metric_fn):
    """Find convex combination weights for the candidate forecasts.

    Minimises ``metric_fn(actual, weighted_forecast)`` with SLSQP, starting
    from equal weights, with each weight bounded to ``[0, 1]`` and the
    weights constrained to sum to 1.

    Args:
        candidates: Column labels of the forecasts in ``pred_wide``.
        pred_wide: Table with one forecast column per candidate plus the
            ``target`` column.
        target: Label of the column with the actual values.
        metric_fn: Callable ``metric_fn(actual, forecast)`` returning the
            loss to minimise.

    Returns:
        Array of optimised weights, one per candidate.
    """
    n_models = len(candidates)

    def loss_function(weights):
        weighted = pred_wide[candidates].values * np.array(weights)
        fc = np.sum(weighted, axis=1)
        return metric_fn(pred_wide[target].values, fc)

    equal_weights = [1 / n_models] * n_models
    sum_to_one = {'type': 'eq', 'fun': lambda w: 1 - sum(w)}
    result = optimize.minimize(
        loss_function,
        x0=equal_weights,
        constraints=(sum_to_one),
        method='SLSQP',
        bounds=[(0.0, 1.0)] * n_models,
        options={'ftol': 1e-10}
    )
    opt_weights = result['x']

    return opt_weights

In [None]:
# Cycle lengths used for the cyclical (sine/cosine) encoding.
days_in_month = 31  # Simplified; you might want to adjust this per month/year
days_in_week = 7
months_in_year = 12

# Encode each calendar column as a sine/cosine pair on its cycle.
for _sin_col, _cos_col, _source_col, _period in (
    ('day_sin', 'day_cos', 'date_day', days_in_month),
    ('weekday_sin', 'weekday_cos', 'date_weekday', days_in_week),
    ('month_sin', 'month_cos', 'date_month', months_in_year),
):
    df[_sin_col] = np.sin(2 * np.pi * df[_source_col] / _period)
    df[_cos_col] = np.cos(2 * np.pi * df[_source_col] / _period)

In [None]:
# Assume df is your time series dataframe and you have a timestamp index
# TARGET_COL is the name of the column you want to predict
# HORIZON is the number of time steps you want to predict into the future

TARGET_COL = 'your_target_column_name'
HORIZON = 1  # Change this based on your specific forecasting horizon
TRAIN_SIZE = 0.9  # Proportion of data to use for training

# Shift the target column to align with the forecast horizon
df['shifted_target'] = df[TARGET_COL].shift(-HORIZON)

# The last HORIZON rows have no future value to predict (their shifted
# target is NaN), so they are excluded from both features and target.
n_usable = len(df) - HORIZON

# Separate features and target
X = df.drop(columns=[TARGET_COL, 'shifted_target']).iloc[:n_usable]
y = df['shifted_target'].iloc[:n_usable]

# Calculate the index to split the data on
split_index = int(n_usable * TRAIN_SIZE)

# Split the data into training and testing sets without shuffling
X_train, X_test = X.iloc[:split_index], X.iloc[split_index:]
y_train, y_test = y.iloc[:split_index], y.iloc[split_index:]

# Check the shape of the datasets
print('Training set shape:', X_train.shape, y_train.shape)
print('Testing set shape:', X_test.shape, y_test.shape)

## General time series forecasting / shifted time series forecasting

In [None]:
# The cell uses these scikit-learn names; import them here so it runs on
# its own.
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import TimeSeriesSplit

# Load your time series dataset
df = pd.read_csv('your_time_series_data.csv')

# Specify the target column and forecast horizon
TARGET_COL = 'your_target_column'
HORIZON = 1  # Example: Predict 1 step ahead

# Shift the target column to align with the forecast horizon
df['shifted_target'] = df[TARGET_COL].shift(-HORIZON)

# Prepare features (X) and target (y). The last HORIZON rows have no future
# value, so both are cut at the same position; trimming X by position and
# y with dropna() would misalign them if the target had other gaps.
n_usable = len(df) - HORIZON
X = df.drop(columns=[TARGET_COL, 'shifted_target']).iloc[:n_usable, :]
y = df['shifted_target'].iloc[:n_usable]

# Initialize time series cross-validator with the desired number of splits
tscv = TimeSeriesSplit(n_splits=5)

# Initialize your model (using Linear Regression as an example)
model = LinearRegression()

# List to store the scores for each fold
mse_scores = []

# Perform time series cross-validation
for train_index, test_index in tscv.split(X):
    # Split data into training and testing sets
    X_train, X_test = X.iloc[train_index], X.iloc[test_index]
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]

    # Fit the model on the training data
    model.fit(X_train, y_train)

    # Make predictions on the testing data
    y_pred = model.predict(X_test)

    # Calculate and store the mean squared error for this fold
    mse = mean_squared_error(y_test, y_pred)
    mse_scores.append(mse)

# Output the evaluation results
print("MSE scores for each fold:", mse_scores)
print("Average MSE:", np.mean(mse_scores))

In [None]:
import matplotlib.pyplot as plt

# Overlay the actual and predicted values on one figure, both plotted
# against the index of y_test.
plt.figure(figsize=(10, 6))
for _values, _label in ((y_test, 'Actual'), (y_pred, 'Predicted')):
    plt.plot(y_test.index, _values, label=_label)
plt.title('Actual vs Predicted')
plt.xlabel('Time')
plt.ylabel('Target Variable')
plt.legend()
plt.show()

## Autoregressive ML approach

In [None]:
def one_step_forecast(df, window):
    """Turn a series into a supervised table of sliding windows.

    Each output row holds ``window`` consecutive values as ``x_1..x_window``
    and the value that follows them as ``y``. Rows are labelled with the
    index entry of the first value in the window.

    Args:
        df: pandas object exposing ``.values``, ``.index`` and ``.iloc``.
        window: Number of consecutive values per row.

    Returns:
        DataFrame with columns ``x_1..x_window`` and ``y``; rows containing
        missing values are dropped.
    """
    values = df.values
    n_rows = len(df) - window
    row_index = df.index[:-window]

    windows = [values[start:start + window] for start in range(n_rows)]
    feature_names = [f'x_{i}' for i in range(1, window + 1)]
    features = pd.DataFrame(
        np.array(windows).reshape(n_rows, -1),
        columns=feature_names,
        index=row_index,
    )

    next_values = df.iloc[window:].values
    target = pd.DataFrame(next_values.reshape(-1), columns=['y'], index=row_index)

    return pd.concat([features, target], axis=1).dropna()

In [None]:
def split_data(df, test_split=0.15):
    """Split ``df`` chronologically into a leading train part and trailing test part.

    Args:
        df: Sliceable sequence (e.g. a DataFrame or list).
        test_split: Fraction of rows assigned to the test part; the count
            is truncated to an integer.

    Returns:
        Tuple ``(train, test)``.
    """
    n_test = int(len(df) * test_split)
    # Slice from an explicit position: with zero test rows, ``df[:-0]``
    # would be empty and ``df[-0:]`` would be everything.
    split_at = len(df) - n_test
    train, test = df[:split_at], df[split_at:]
    return train, test

## Multistep Forecast 

In [None]:
def multi_step_forecast(data, model, steps=10):
    """Forecast several steps ahead by feeding predictions back as inputs.

    At each step the current window is reshaped to a single row, passed to
    ``model.predict``, and the first element of the result becomes the next
    forecast. The window then drops its oldest value and appends the
    forecast.

    Args:
        data: Iterable of the most recent observations (the initial
            window); it is not modified.
        model: Object with a ``predict`` method accepting a 2-D array.
        steps: Number of forecasts to produce.

    Returns:
        NumPy array with ``steps`` forecasts.
    """
    # Roll a private copy so the caller's window is unchanged and any
    # iterable (not only a list) can be passed.
    window = list(data)
    forecast = []
    for _ in range(steps):
        one_step_pred = model.predict(np.array(window).reshape(1, -1))[0]
        forecast.append(one_step_pred)
        window.pop(0)
        window.append(one_step_pred)
    return np.array(forecast)

In [None]:
def create_dataset_two_inputs(x, y, sequence_length, batch_size):
    """Build a batched dataset of (input window, last target) pairs.

    ``x`` and ``y`` are sliced separately, zipped, and cut into sliding
    windows of ``sequence_length + 1`` steps (shift 1, incomplete windows
    dropped). Each example pairs all but the last step of the ``x`` window
    with the last step of the ``y`` window.

    Args:
        x: Input data accepted by ``from_tensor_slices``.
        y: Target data accepted by ``from_tensor_slices``.
        sequence_length: Number of input steps per example.
        batch_size: Number of examples per batch.

    Returns:
        The batched, prefetched dataset.
    """
    window_size = sequence_length + 1

    def _materialize(x_window, y_window):
        # Each window arrives as a nested dataset; batching it once turns
        # it into a single tensor.
        return tf.data.Dataset.zip((x_window.batch(window_size), y_window.batch(window_size)))

    def _split(x_seq, y_seq):
        return x_seq[:-1], y_seq[-1:]

    inputs = tf.data.Dataset.from_tensor_slices(x)
    targets = tf.data.Dataset.from_tensor_slices(y)
    paired = tf.data.Dataset.zip((inputs, targets))
    windows = paired.window(window_size, shift=1, drop_remainder=True)
    examples = windows.flat_map(_materialize).map(_split)
    return examples.batch(batch_size).prefetch(tf.data.AUTOTUNE)

    def create_dataset(x, y, sequence_length, batch_size):
    dataset = tensorflow.data.Dataset.from_tensor_slices((x, y))
    dataset = dataset.window(sequence_length+1, shift=1, drop_remainder=True)
    dataset = dataset.flat_map(lambda x, y: tensorflow.data.Dataset.zip((x.batch(sequence_length+1), y.batch(sequence_length + 1))))
    dataset = dataset.map(lambda x, y: (x[:-1], tensorflow.squeeze(y[-1:], axis = -1)))
    dataset = dataset.batch(batch_size).prefetch(tensorflow.data.AUTOTUNE)
    return dataset
def create_dataset_multiple_outputs(x, y, sequence_length, batch_size, output_steps):
    """Build a batched dataset of (input window, multi-step target) pairs.

    ``(x, y)`` is cut into sliding windows of
    ``sequence_length + output_steps`` steps (shift 1, incomplete windows
    dropped). Each example pairs the ``x`` window without its last
    ``output_steps`` steps with the last ``output_steps`` steps of the
    ``y`` window.

    Args:
        x: Input data accepted by ``from_tensor_slices``.
        y: Target data accepted by ``from_tensor_slices``.
        sequence_length: Number of input steps per example.
        batch_size: Number of examples per batch.
        output_steps: Number of target steps per example.

    Returns:
        The batched, prefetched dataset.
    """
    window_size = sequence_length + output_steps

    def _materialize(x_window, y_window):
        # Batch each nested window dataset once to obtain one tensor.
        return tf.data.Dataset.zip((x_window.batch(window_size), y_window.batch(window_size)))

    def _split(x_seq, y_seq):
        return x_seq[:-output_steps], y_seq[-output_steps:]

    slices = tf.data.Dataset.from_tensor_slices((x, y))
    windows = slices.window(window_size, shift=1, drop_remainder=True)
    examples = windows.flat_map(_materialize).map(_split)
    return examples.batch(batch_size).prefetch(tf.data.AUTOTUNE)

In [None]:
from tensorflow.keras.layers import Bidirectional, Attention, AdditiveAttention, Concatenate
# Names used below that this cell did not import.
from tensorflow.keras.layers import Dense, Input, LSTM
from tensorflow.keras.metrics import MeanAbsoluteError, RootMeanSquaredError
from tensorflow.keras.models import Model

# NOTE(review): x_train.shape[1] is used as the per-step feature count;
# confirm that axis 1 of x_train is the feature axis.
encoder_input = Input(shape=(None, x_train.shape[1]))
encoder_lstm1 = Bidirectional(LSTM(64, return_sequences=True))(encoder_input)
# return_sequences=True keeps one encoder output per time step, which the
# attention layer needs as its value sequence; return_state=True still
# yields the four final states used to seed the decoder.
encoder_lstm2, forward_h, forward_c, backward_h, backward_c = Bidirectional(
    LSTM(32, return_sequences=True, return_state=True)
)(encoder_lstm1)
state_h = Concatenate()([forward_h, backward_h])
state_c = Concatenate()([forward_c, backward_c])
# A Bidirectional LSTM takes four initial states: forward h/c, backward h/c.
encoder_states = [forward_h, forward_c, backward_h, backward_c]

decoder_input = Input(shape = (None, x_train.shape[1]))
decoder_lstm1 = Bidirectional(LSTM(32, return_sequences=True))(decoder_input, initial_state = encoder_states)
decoder_lstm2 = Bidirectional(LSTM(32, return_sequences=True))(decoder_lstm1)

# Attention expects [query, value]: the decoder sequence queries the
# encoder sequence, giving one context vector per decoder step.
attention_layer = Attention()
attention_outputs = attention_layer([decoder_lstm2, encoder_lstm2])
dense_layer = Dense(1)
output_layer = dense_layer(attention_outputs)

encoder_decoder_model = Model(inputs = [encoder_input, decoder_input], outputs = [output_layer])
encoder_decoder_model.compile(optimizer = 'adam', loss = 'mse', metrics = [RootMeanSquaredError(), MeanAbsoluteError()], run_eagerly=True)
encoder_decoder_model.summary()
