Import- Hyperlink to documentation

In [402]:
import numpy as np
import pandas as pd
import random
from sklearn.metrics import mean_absolute_error
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense, Dropout, Conv1D
from tensorflow.keras.optimizers import Adam, RMSprop, SGD
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.preprocessing.sequence import TimeseriesGenerator
import keras.regularizers
import numpy as np
from sklearn.model_selection import TimeSeriesSplit
from sklearn.feature_selection import RFE
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error
import numpy as np
from sklearn.feature_selection import RFE, SequentialFeatureSelector
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error
from scipy import stats
from scipy.stats.mstats import winsorize
from statsmodels.tsa.seasonal import seasonal_decompose
from sklearn.preprocessing import MinMaxScaler

# Supervised Hybrid Production-Level Long Short Term Memory Recurrent Neural Network:
#### _Semi-automating processes for building and optimization_

## Forecasting Absolute Humidity

### Question: Can we predict humidity in San Diego for the next day based on a given humidity, wind speed, wind direction, and pressure for that hour?

An LSTM (Long Short-Term Memory) is a type of recurrent neural network (RNN) that is designed to handle long-term dependencies in sequential data. It uses a memory cell that can store information for long periods of time and a set of gates to control the flow of information into and out of the cell. The gates are used to selectively forget or remember information based on the input and the current state of the cell. LSTMs have been shown to be effective in a wide range of applications including natural language processing, speech recognition, and time series prediction.
The chosen model is a recurrent neural network (RNN) with long short-term memory (LSTM) cells. The reason for this choice is that LSTMs are specifically designed for handling time-series data, which makes them well-suited for forecasting tasks. 
LSTMs have the ability to learn long-term dependencies in the data, which is important for capturing the patterns and trends that may exist in the time-series data. Additionally, LSTMs are capable of handling sequential data of varying lengths, which is useful for this problem since the number of time steps in the input data may vary. LSTMs are often preferred for time-series forecasting problems due to their ability to capture long-term dependencies and their flexibility in handling sequential data.

## 1. Data

In [403]:
import pandas as pd

In [404]:
# Humidity readings; the file provides a 'datetime' column and a 'San Diego' column
humidity = pd.read_csv("clean_data/cleaned_humidity.csv")

In [405]:
# Pressure readings from the cleaned data directory
pressure = pd.read_csv("clean_data/cleaned_pressure.csv")

In [406]:
# Wind-speed readings from the cleaned data directory
wind_speed = pd.read_csv("clean_data/cleaned_ws.csv")

In [407]:
# Wind-direction readings from the cleaned data directory
wind_dir = pd.read_csv("clean_data/cleaned_wr.csv")

In [408]:
# Prefix every column with the name of the quantity it holds so that the four
# frames can sit side by side without column-name collisions.
for measurement, frame in (
    ("humidity", humidity),
    ("pressure", pressure),
    ("wind_dir", wind_dir),
    ("wind_speed", wind_speed),
):
    frame.columns = [f"{measurement}_{column}" for column in frame.columns]

In [409]:
# Place the timestamp column first, followed by every column of the four frames.
# The frames are aligned on their row index, not on the timestamp values.
timestamp_column = humidity[['humidity_datetime']]
humidity_values = humidity.drop(columns='humidity_datetime')
san_diego_df = pd.concat(
    [timestamp_column, humidity_values, pressure, wind_dir, wind_speed],
    axis=1,
)

# Keep the timestamp plus every column whose name contains 'San Diego'
san_diego_columns = ['humidity_datetime']
san_diego_columns += [name for name in san_diego_df.columns if 'San Diego' in name]
san_diego_df = san_diego_df[san_diego_columns]

In [410]:
# Shorten the column names now that only San Diego is left
short_names = {
    "humidity_datetime": "datetime",
    "humidity_San Diego": "humidity",
    "pressure_San Diego": "pressure",
    "wind_dir_San Diego": "wind_dir",
    "wind_speed_San Diego": "wind_speed",
}
san_diego_df = san_diego_df.rename(columns=short_names)

In [411]:
# Hour of day taken from the parsed timestamp
parsed_timestamps = pd.to_datetime(san_diego_df['datetime'])
san_diego_df['hour'] = parsed_timestamps.dt.hour

In [412]:
# The raw timestamp is no longer needed once the hour has been extracted
san_diego_df.drop(columns=['datetime'], inplace=True)

In [413]:
# Weather variables whose distributions are checked with statistical tests
weather_elements = ['humidity', 'pressure', 'wind_speed', 'wind_dir']

# Treat +/-inf as missing. DataFrame.replace is vectorised and avoids the
# element-wise DataFrame.applymap call, which recent pandas releases deprecate.
san_diego_df = san_diego_df.replace([np.inf, -np.inf], np.nan)

def test_distributions(series):
    
    
    series = series[np.isfinite(series)]
    
    distributions = {
        "normal": stats.norm,
        "exponential": stats.expon,
        "logistic": stats.logistic
    }
    
    results = {}
    for dist_name, dist in distributions.items():
        params = dist.fit(series)
        D, p = stats.kstest(series, dist.cdf, args=params)
        results[dist_name] = (D, p)
    
    return results

# Print the KS statistic and p-value of every candidate distribution, per variable
for element in weather_elements:
    print(f"{element} distribution:")
    results = test_distributions(san_diego_df[element])
    summary_lines = [
        f"{dist_name}: D={D}, p-value={p}"
        for dist_name, (D, p) in results.items()
    ]
    for summary_line in summary_lines:
        print(summary_line)
    print()

humidity distribution:
normal: D=0.07225708153364296, p-value=9.521463621270879e-201
exponential: D=0.3549962526547573, p-value=0.0
logistic: D=0.0565723130142094, p-value=3.622505804104062e-123

pressure distribution:
normal: D=0.14996863978091823, p-value=0.0
exponential: D=0.5242211365851986, p-value=0.0
logistic: D=0.09150311488892317, p-value=0.0

wind_speed distribution:
normal: D=0.26540812964324156, p-value=0.0
exponential: D=0.2836830540579809, p-value=0.0
logistic: D=0.24111324355792074, p-value=0.0

wind_dir distribution:
normal: D=0.13003959649634, p-value=0.0
exponential: D=0.22719236582053187, p-value=0.0
logistic: D=0.10435693010217906, p-value=0.0



In [414]:
#Aside from, obviously, hour, there doesn't seem to be any pattern of distribution that I can identify at the moment

## 2. Feature Engineering and Preprocessing:

In [415]:
# Number of rows that contain at least one missing value
san_diego_df.isnull().any(axis=1).sum()

353

In [416]:
# Drop every row that has at least one missing value.
# NOTE(review): if the rows are hourly observations, removing rows leaves gaps in the
# series; row-offset operations such as shift() then no longer correspond to a fixed
# number of hours around those gaps — confirm this is acceptable.
san_diego_df = san_diego_df.dropna() 

In [417]:
# (rows, columns) of the frame at this point
san_diego_df.shape

(44107, 5)

In [418]:
# Target column: the humidity value found 24 rows after the current row.
# The last 24 rows have no such value and therefore receive NaN.
# NOTE(review): 24 rows equal 24 hours only where the rows are consecutive hourly
# readings — verify that no timestamps are missing in between.
san_diego_df['next_day_humidity'] = san_diego_df['humidity'].shift(-24)

In [419]:
# Column names currently present in the frame
san_diego_df.columns

Index(['humidity', 'pressure', 'wind_dir', 'wind_speed', 'hour',
       'next_day_humidity'],
      dtype='object')

In [420]:
#Identifying and quantifying outliers

In [421]:
def detect_outliers_zscore(df, threshold=3):
    """Return the entries of ``df`` whose absolute z-score exceeds ``threshold``.

    The z-score of each value is computed per column from that column's mean
    and (sample) standard deviation.

    Parameters
    ----------
    df : pandas.DataFrame
        Numeric columns to scan.
    threshold : float, default 3
        A value is flagged when ``|z| > threshold``.

    Returns
    -------
    pandas.DataFrame
        Same columns as ``df``. Only rows holding at least one flagged value are
        kept; flagged cells contain the original value, all other cells are NaN.
    """
    # One vectorised pass instead of growing a frame with pd.concat per column.
    z_scores = ((df - df.mean()) / df.std()).abs()
    return df.where(z_scores > threshold).dropna(how='all')

In [422]:
def get_outlier_list(df, threshold):
    """Map each column that has outliers to the list of its outlying values.

    Outliers are whatever ``detect_outliers_zscore(df, threshold)`` reports;
    columns with no non-null entry in that report are left out of the result.
    """
    flagged = detect_outliers_zscore(df, threshold)
    values_by_column = (
        (name, flagged.loc[flagged[name].notnull(), name].values.tolist())
        for name in flagged.columns
    )
    return {name: values for name, values in values_by_column if values}

In [423]:
# Outlying values per column, using a z-score cut-off of 3
outliers = get_outlier_list(san_diego_df, 3)

In [424]:
# Number of outlying values found in each column
for element in outliers:
    values = outliers[element]
    print(f"{element}: {len(values)}")

humidity: 113
pressure: 1127
wind_speed: 363
next_day_humidity: 113


In [425]:
#A small amount of outliers for the given data set- moderation is not necessary

Applying the Winsorize method to moderate outliers as a display of proficiency in selecting and applying statistical models and best data practices
Reasons for Selection: 
1. No normal distribution 
2. To maintain data integrity by only removing the extreme outliers- Extreme outliers might indicate faulty data

https://www.statisticshowto.com/winsorize/

In [426]:
# Fraction of observations to winsorize in each tail, given as (lower, upper).
# scipy's winsorize reads `limits` as the share of data to clip on each side, not as
# quantile positions: clipping at the 1st and 99th percentiles is therefore
# (0.01, 0.01). An upper limit of 0.99 would overwrite the top 99% of every column.
percentiles = [0.01, 0.01]

In [427]:
# Winsorize each weather variable in turn, using the limits held in `percentiles`
for col in weather_elements:
    winsorized_values = winsorize(san_diego_df[col], limits=percentiles)
    san_diego_df[col] = winsorized_values

In [428]:
#Encoding categorical variables

In [429]:
# Circular encoding: express the direction (degrees) as an angle in radians and keep
# its sine and cosine, so that 0 and 360 degrees map to the same point.
wind_dir_angle = 2*np.pi*san_diego_df['wind_dir']/360
san_diego_df['wind_dir_sin'] = np.sin(wind_dir_angle)
san_diego_df['wind_dir_cos'] = np.cos(wind_dir_angle)

In [430]:
# The raw direction is redundant once its sine and cosine are stored
san_diego_df.drop(columns='wind_dir', inplace=True)

In [431]:
# One-hot encode 'hour': one indicator column per distinct hour value
one_hot = pd.get_dummies(san_diego_df["hour"], prefix="hour")
# Append the indicator columns to the right of the existing ones
frames_side_by_side = [san_diego_df, one_hot]
san_diego_df = pd.concat(frames_side_by_side, axis=1)

In [432]:
# The numeric hour is replaced by its indicator columns
san_diego_df.drop(columns="hour", inplace=True)

In [433]:
#Engineering Features

In [434]:
# Lag feature: the humidity value of the preceding row (NaN for the first row)
previous_row_humidity = san_diego_df['humidity'].shift(periods=1)
san_diego_df['prev_hour_humidity'] = previous_row_humidity

seasonal_decompose() is a function from the statsmodels library that decomposes a time series into its trend, seasonality, and residual components. The resulting components can help identify patterns and relationships in the time series data that may not be easily observable in the raw data

In [435]:
# Additive decomposition of humidity with a period of 24 rows.
# two_sided=False makes the trend a trailing moving average, so the trend stored for a
# row is computed from that row and earlier ones only. The default centred filter
# would average in the following rows and leak future humidity into the features.
result = seasonal_decompose(
    san_diego_df['humidity'],
    model='additive',
    period=24,
    two_sided=False,
)

In [436]:
# Store each component of the decomposition as its own feature column:
#   trend       - long-term trend in the humidity data
#   seasonality - recurring seasonal pattern
#   residual    - unexplained variance or randomness
decomposition_columns = {
    'humidity_trend': result.trend,
    'humidity_seasonality': result.seasonal,
    'humidity_residual': result.resid,
}
for column_name, component in decomposition_columns.items():
    san_diego_df[column_name] = component

In [437]:
# Column names after encoding and feature engineering
san_diego_df.columns

Index(['humidity', 'pressure', 'wind_speed', 'next_day_humidity',
       'wind_dir_sin', 'wind_dir_cos', 'hour_0', 'hour_1', 'hour_2', 'hour_3',
       'hour_4', 'hour_5', 'hour_6', 'hour_7', 'hour_8', 'hour_9', 'hour_10',
       'hour_11', 'hour_12', 'hour_13', 'hour_14', 'hour_15', 'hour_16',
       'hour_17', 'hour_18', 'hour_19', 'hour_20', 'hour_21', 'hour_22',
       'hour_23', 'prev_hour_humidity', 'humidity_trend',
       'humidity_seasonality', 'humidity_residual'],
      dtype='object')

In [438]:
# (rows, columns) of the frame after encoding and feature engineering
san_diego_df.shape

(44107, 34)

In [439]:
# Separate features and target into X and y.
# 'next_day_humidity' is built with shift(-24), so its last 24 rows are NaN. Rows
# without a target cannot be used for supervised training (a NaN target turns the
# training loss into NaN), so they are removed before the split.
labelled_rows = san_diego_df.dropna(subset=['next_day_humidity'])
X = labelled_rows.drop('next_day_humidity', axis=1)
y = labelled_rows[['next_day_humidity']]

In [440]:
from sklearn.impute import SimpleImputer

def preprocess_data(X):
    """Return ``X`` as a float array in which every non-finite entry is imputed.

    Infinite values are converted to NaN first and only then are the NaNs
    filled by ``SimpleImputer`` (default strategy: column mean). Imputing before
    the conversion would either leave NaNs in the result or fail outright,
    because ``SimpleImputer`` rejects infinite input.

    Parameters
    ----------
    X : array-like of shape (n_samples, n_features)
        Numeric (or boolean) feature table; it is not modified.

    Returns
    -------
    numpy.ndarray
        Imputed feature matrix.

    NOTE(review): the imputer is fitted on all rows passed in. If this runs on the
    full dataset before a train/test split, the column means include the test rows.
    """
    values = np.asarray(X, dtype=float)

    # Replace infinite values with NaN so the imputer treats them as missing
    values = np.where(np.isfinite(values), values, np.nan)

    # Impute missing values in X
    return SimpleImputer().fit_transform(values)

In [441]:
# Impute the feature table; X becomes the array returned by preprocess_data
X = preprocess_data(X)

In [442]:
#Aside from, obviously, hour, there doesn't seem to be any pattern of distribution that I can identify at the moment

## 3. Model Building:

### Build

_This code defines a build function that trains an LSTM (Long Short-Term Memory) model to predict the target variable y based on input features X. It explores different feature selection functions, cross-validation methods, and hyperparameters to find the best model. The performance metric used for model evaluation is the mean absolute error (MAE)._

#### Broader Picture and Cross Applications
The LSTM model is a type of recurrent neural network (RNN) that excels in learning patterns from time series data. This function is applicable to a variety of time series prediction tasks, such as stock price forecasting, weather prediction, and energy consumption forecasting. By identifying the best feature selection method, cross-validation method, and hyperparameters, organizations can improve the accuracy of their predictions and make better data-driven decisions.

#### Machine Learning Concepts
* **LSTM:** A type of recurrent neural network designed to handle long-term dependencies in sequences, making it suitable for time series data.
* **Feature Selection:** The process of selecting the most important features from the input data to improve model performance and reduce overfitting.
* **Cross-Validation:** A technique to evaluate the performance of a model by partitioning the dataset into multiple subsets, repeatedly training the model on some of the subsets and testing it on the held-out remainder. Combining the scores of all rounds gives a more reliable estimate of the model's performance.
* **Hyperparameters:** Parameters that control the learning process of a model. They are not learned by the model during training but are set beforehand. Examples include the learning rate, batch size, and number of layers in a neural network.

#### Mathematical Concepts and Statistical Analysis
* **Mean Absolute Error (MAE):** A measure of the average difference between the true values and the predicted values. It is calculated as the sum of the absolute differences between the true and predicted values divided by the number of samples.
    _MAE = (1/n) * Σ|y_true - y_pred|_: where y_true is the true target value, y_pred is the predicted value, and n is the number of data points

* **Time Series Cross-Validation:** A cross-validation technique that respects the temporal order of the data, ensuring that the model is trained on past data and evaluated on future data. This helps prevent leakage of information from the future into the past.

#### Reasoning and Decisions Made
1. The code first initializes variables to store the best results found during the search process.
2. It then splits the dataset into training and testing data using time series cross-validation.
3. The input data is transformed into a supervised format suitable for LSTM models.
4. For each combination of feature selection function and hyperparameters, the code:
    - Performs feature selection on the input data.
    - Creates and trains an LSTM model with the selected features and hyperparameters.
    - Evaluates the model using cross-validation and computes the mean absolute error.
5. It updates the best results if the current model has a lower mean absolute error.
6. Finally, the function returns the best results, including feature selection function, mean absolute error, cross-validation method, model, training history, and hyperparameters.

#### Local Outside References
* to_supervised: A user-defined function that converts the input data into a supervised format suitable for LSTM models.
* feature_selection_function: A user-defined function for selecting the best features from the input data.
* get_hyperparameters: A user-defined function for obtaining a list of possible hyperparameter combinations.
* create_lstm_model: A user-defined function that creates and trains an LSTM model with the given input data and hyperparameters.
* cross_validate: A user-defined function that performs cross-validation for the given model and returns the mean absolute error.

#### Inputs and Outputs
* **Inputs:**
    1. X: The input features as a 2D array or dataframe.
    2. y: The target variable as a 1D array or dataframe.
    3. cv_methods: A dictionary containing cross-validation methods.
    4. metric: The performance metric used for model evaluation.

* **Outputs:**
    1. best_feature_selection_result: The best feature selection function found.
    2. best_mae: The lowest mean absolute error achieved.
    3. best_cv_method: The cross-validation method used for the best model.
    4. best_model: The best LSTM model found.
    5. best_history: The training history of the best LSTM model.
    6. best_hyperparams: The hyperparameters used for the best LSTM model.

In [443]:
# Scoring function handed to build(): scikit-learn's mean absolute error
metric = mean_absolute_error

In [444]:
def build(X, y, metric):
    """Search hyperparameter sets for the LSTM forecaster and return the best result.

    For each of the five ``TimeSeriesSplit`` folds, the features and the target
    are min-max scaled (scalers fitted on the training part only), windowed with
    ``to_supervised`` and passed through ``feature_selection_function``. Every
    hyperparameter set from ``generate_hyperparameter_list`` is then trained with
    ``create_lstm_model`` and scored with ``cross_validate`` for each entry of
    ``cv_methods``; the lowest score seen so far is kept.

    Notebook-level names that must exist before this is called: ``input_steps``,
    ``output_steps``, ``cv_methods``, ``to_supervised``, ``create_lstm_model``,
    ``feature_selection_function``, ``generate_hyperparameter_list`` and
    ``cross_validate``.

    Parameters
    ----------
    X : array-like of shape (n_samples, n_features)
        Feature matrix, rows in time order.
    y : pandas.DataFrame
        Single-column target, row-aligned with ``X`` by position.
    metric : callable
        Forwarded unchanged to ``cross_validate``.

    Returns
    -------
    tuple
        ``(best_feature_selection_result, best_mae, best_cv_method, best_model,
        best_history, best_hyperparams)``.
    """
    # Initialize variables for the best results
    best_mae = float('inf')
    best_cv_method = None
    best_model = None
    best_history = None
    best_hyperparams = None
    best_feature_selection_result = None

    # Positional column names; built once because they do not depend on the fold.
    X = pd.DataFrame(np.asarray(X))
    X.columns = ["col" + str(i) for i in range(X.shape[1])]
    feature_names = list(X.columns)

    # cv_methods is described as a dictionary in this notebook; an iterable of
    # (name, method) pairs is accepted as well.
    cv_pairs = list(cv_methods.items()) if isinstance(cv_methods, dict) else list(cv_methods)

    # Split the dataset into training and testing data using time series cross-validation
    tscv = TimeSeriesSplit(n_splits=5)
    for train_index, test_index in tscv.split(X):
        train_data, test_data = X.iloc[train_index], X.iloc[test_index]
        train_target, test_target = y.iloc[train_index], y.iloc[test_index]

        # Fit the scalers on the training part only and reuse them for the test part,
        # so both parts share one scale and no test statistics reach training.
        feature_scaler = MinMaxScaler()
        target_scaler = MinMaxScaler()
        train_data = feature_scaler.fit_transform(train_data)
        test_data = feature_scaler.transform(test_data)
        train_target = target_scaler.fit_transform(train_target)
        test_target = target_scaler.transform(test_target)

        # Convert to DataFrames (the test frames are built from the test arrays)
        train_data = pd.DataFrame(train_data, columns=feature_names)
        train_target = pd.DataFrame(train_target, columns=["target"])
        test_data = pd.DataFrame(test_data, columns=feature_names)
        test_target = pd.DataFrame(test_target, columns=["target"])

        # Create supervised data for LSTM model
        # NOTE(review): to_supervised windows every column it is given, so the output
        # windows contain the feature columns as well as "target" — confirm that
        # feature_selection_function / the model expect that layout.
        train_data_with_target = pd.concat([train_data, train_target], axis=1)
        test_data_with_target = pd.concat([test_data, test_target], axis=1)
        X_train, y_train = to_supervised(train_data_with_target, input_steps, output_steps)
        X_test, y_test = to_supervised(test_data_with_target, input_steps, output_steps)

        # Perform feature selection and prepare data for model training
        X_train, X_test, feature_selection_result = feature_selection_function(X_train, y_train, X_test, y_test)

        # Get hyperparameters for LSTM model
        hyperparameters_list, hyperparameter_options = generate_hyperparameter_list()

        # Loop through hyperparameters
        for hyperparams in hyperparameters_list:

            # Create and train LSTM model for this single configuration.
            # create_lstm_model takes a list of configurations and returns five values.
            model, history, _train_mae, _holdout_mae, _ = create_lstm_model(
                X_train, y_train, X_test, y_test, [hyperparams]
            )

            # Perform final time series cross-validation.
            # NOTE(review): create_lstm_model also calls cross_validate; check whether
            # both evaluations are needed.
            for cv_method_name, cv_method in cv_pairs:
                val_loss, model, history = cross_validate(
                    X, y, hyperparams, cv_method_name, cv_method, input_steps, output_steps, metric
                )

                # Check if the current model is the best so far
                if val_loss < best_mae:
                    best_mae = val_loss
                    best_cv_method = cv_method_name
                    best_model = model
                    best_history = history
                    best_hyperparams = hyperparams
                    best_feature_selection_result = feature_selection_result

        print(best_feature_selection_result, best_mae, best_cv_method, best_model, best_history, best_hyperparams)

    # Return the best results
    return best_feature_selection_result, best_mae, best_cv_method, best_model, best_history, best_hyperparams

In [None]:
# build() returns six values; the feature-selection result comes first.
(
    best_feature_selection_result,
    best_mae,
    best_cv_method,
    best_model,
    best_history,
    best_hyperparams,
) = build(X, y, metric)

In [342]:
# Window lengths used when windowing the data and sizing the model:
# 24 rows of input, 24 rows of output.
input_steps, output_steps = 24, 24

### To Supervised

_This code defines the `to_supervised` function, which converts time series data into a supervised learning format. Given the input `data`, the function creates input/output pairs based on `input_steps` and `output_steps`: `input_steps` is the number of time steps used as input, while `output_steps` is the number of time steps to predict in the future._

#### Broader Picture and Cross Applications
The function is useful for transforming time series data into a format suitable for training machine learning models like LSTM or other sequence-based models. It can be applied to any time series prediction task where the goal is to predict future values based on past values, such as sales forecasting, air quality prediction, or load demand estimation in power grids.

#### Reasoning and Decisions Made
1. The function checks if input_steps and output_steps are valid positive integers.
2. It initializes two empty lists, X and y, for storing input and output data, respectively.
3. The function loops through the data and, for each time step, appends input_steps number of data points as input and output_steps number of data points as output.
4. The input/output pairs are converted to numpy arrays before returning them.

#### Inputs and Outputs
* **Inputs:**
1. data: The time series data as a 1D or 2D array or dataframe.
2. input_steps: The number of time steps to consider for input.
3. output_steps: The number of time steps to predict in the future.
* **Outputs:**
1. X: The input data in supervised learning format as a numpy array.
2. y: The output data (future predictions) in supervised learning format as a numpy array.

In [343]:
def to_supervised(data, input_steps, output_steps):
    """Cut ``data`` into consecutive (input window, output window) pairs.

    For every start position ``i`` that leaves room for both windows, the input
    is ``data[i:i+input_steps]`` and the output is the ``output_steps`` items
    that immediately follow it.

    Parameters
    ----------
    data : sliceable sequence with a length (array, list, DataFrame, ...)
    input_steps : int
        Length of each input window; must be a positive integer.
    output_steps : int
        Length of each output window; must be a positive integer.

    Returns
    -------
    (X, y) : tuple of numpy.ndarray
        Stacked input windows and stacked output windows.

    Raises
    ------
    ValueError
        If either step count is not a positive integer.
    """
    steps_are_ints = isinstance(input_steps, int) and isinstance(output_steps, int)
    if not (steps_are_ints and input_steps > 0 and output_steps > 0):
        raise ValueError("Input steps and output steps must be positive integers.")

    pair_length = input_steps + output_steps
    start_positions = range(len(data) - pair_length + 1)

    X = np.array([data[start:start + input_steps] for start in start_positions])
    y = np.array([data[start + input_steps:start + pair_length] for start in start_positions])
    return X, y

### Creating the Model
_The create_lstm_model function takes training and test data, along with a list of hyperparameters, to create, train, and evaluate an LSTM model. It finds the best hyperparameters by looping through all combinations and comparing their validation losses. The function returns the best LSTM model, training history, evaluation metrics, and the best hyperparameters._

#### Broader Picture and Cross Applications

This function is useful for training and selecting the best LSTM model for time series prediction tasks. It can be applied to a wide range of applications, including sales forecasting, air quality prediction, and load demand estimation in power grids. The function's structure can be adapted for other types of neural networks or machine learning models, with appropriate adjustments to the model architecture and hyperparameters.

#### Reasoning and Decisions Made
1. The function initializes variables to store the best model and hyperparameters.
2. It loops through all combinations of hyperparameters.
3. It creates an LSTM model with the given hyperparameters and trains it using the training data.
4. It performs cross-validation using different methods.
5. It updates the best model if the validation loss improves.
6. It returns the best model, training history, and evaluation metrics.

#### Local Outside References
* to_supervised function
* cross_validate function
* cv_methods dictionary
* input_steps variable
* output_steps variable
* metric variable

#### Inputs and Outputs
* **Inputs:**
1. X_train: The training input data in supervised learning format.
2. y_train: The training output data in supervised learning format.
3. X_test: The test input data in supervised learning format.
4. y_test: The test output data in supervised learning format.
5. hyperparameters_list: A list of dictionaries containing combinations of hyperparameters to try.
* **Outputs:**
1. best_model: The best LSTM model found.
2. history: The training history of the best LSTM model.
3. train_metric: The evaluation metric on the training set for the best LSTM model.
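4. best_mae: The evaluation metric on the validation set for the best LSTM model.
5. best_hyperparams: The hyperparameter dictionary that produced the best LSTM model.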

In [344]:
def _build_stacked_lstm(n_features, n_units, activation, reg, dropout, optimizer, learning_rate):
    """Build and compile the four-layer LSTM regressor used by create_lstm_model.

    Reads the notebook-level ``input_steps`` and ``output_steps``. ``optimizer`` is
    an optimizer class (it is instantiated here with ``learning_rate``).
    """
    def l2_penalty():
        return keras.regularizers.l2(reg)

    model = Sequential()
    # Every LSTM that feeds another LSTM must return the full sequence;
    # only the last one collapses the time axis.
    model.add(LSTM(n_units, activation=activation, input_shape=(input_steps, n_features),
                   kernel_regularizer=l2_penalty(), return_sequences=True))
    model.add(LSTM(n_units, activation=activation, kernel_regularizer=l2_penalty(), return_sequences=True))
    model.add(Dropout(dropout))
    model.add(LSTM(n_units, activation=activation, kernel_regularizer=l2_penalty(), return_sequences=True))
    model.add(Dropout(dropout))
    model.add(LSTM(n_units, activation=activation, kernel_regularizer=l2_penalty()))
    model.add(Dropout(dropout))

    # Dense output layer: one unit per forecast step
    model.add(Dense(output_steps))

    # `learning_rate` is the supported keyword (`lr` is a deprecated alias), and Keras
    # needs a metric it can evaluate on tensors, so 'mae' replaces the sklearn function.
    model.compile(optimizer=optimizer(learning_rate=learning_rate),
                  loss='mean_squared_error', metrics=['mae'])
    return model


def create_lstm_model(X_train, y_train, X_test, y_test, hyperparameters_list, n_models=3):
    """Train an LSTM per hyperparameter set and return the best-scoring one.

    For every dictionary in ``hyperparameters_list`` a stacked LSTM is built and
    fitted (or, with ``'ensembling'`` set, ``n_models`` of them whose test
    predictions are averaged). The configuration is then scored with
    ``cross_validate`` once per entry of the notebook-level ``cv_methods``; when
    ``cv_methods`` is empty the hold-out MAE on ``X_test``/``y_test`` is used.

    Keys read from each hyperparameter dict: ``lstm_units``, ``learning_rate``,
    ``epochs``, ``batch_size``, ``dropout``, ``data_augmentation``, ``activation``,
    ``optimizer`` (an optimizer class), ``regularization``,
    ``early_stopping_patience``, ``pruning``, ``ensembling``.

    Notebook-level names used: ``input_steps``, ``output_steps``, ``cv_methods``,
    ``metric``, ``X``, ``y``, ``cross_validate`` and, when pruning is requested,
    ``create_pruned_lstm``.

    Parameters
    ----------
    X_train, X_test : numpy.ndarray of shape (samples, input_steps, features)
    y_train, y_test : array-like
        Assumed to be (samples, output_steps) so they match the Dense output —
        TODO confirm against the caller.
    hyperparameters_list : list of dict
    n_models : int, default 3
        Number of ensemble members trained when ``'ensembling'`` is true.

    Returns
    -------
    tuple
        ``(best_model, history, train_metric, best_mae, best_hyperparams)`` where
        ``history`` belongs to the best model, and ``train_metric`` / ``best_mae``
        are the MAE of the best model on the training and test data.

    Raises
    ------
    ValueError
        If ``hyperparameters_list`` is empty or ``n_models`` is smaller than 1.
    """
    if n_models < 1:
        raise ValueError("n_models must be at least 1.")

    best_val_loss = np.inf  # The best validation loss observed so far
    best_model = None       # The best model observed so far
    best_history = None     # Training history belonging to best_model
    best_hyperparams = None
    best_cv_method = None

    # cv_methods is described as a dictionary; (name, method) pairs are accepted too.
    cv_pairs = list(cv_methods.items()) if isinstance(cv_methods, dict) else list(cv_methods)

    for hyperparams in hyperparameters_list:
        n_units = hyperparams['lstm_units']
        learning_rate = hyperparams['learning_rate']
        epochs = hyperparams['epochs']
        batch_size = hyperparams['batch_size']
        dropout = hyperparams['dropout']
        data_aug = hyperparams['data_augmentation']
        activation = hyperparams['activation']
        optimizer = hyperparams['optimizer']
        reg = hyperparams['regularization']
        patience = hyperparams['early_stopping_patience']
        pruning = hyperparams['pruning']
        ens = hyperparams['ensembling']

        es = EarlyStopping(monitor='val_loss', mode='min', verbose=1, patience=patience)

        def new_model():
            return _build_stacked_lstm(X_train.shape[2], n_units, activation, reg,
                                       dropout, optimizer, learning_rate)

        # As before, 'data_augmentation' takes precedence over 'ensembling'.
        if ens and not data_aug:
            members = []
            for _ in range(n_models):
                member = new_model()
                history = member.fit(X_train, y_train, batch_size=batch_size, epochs=epochs,
                                     validation_split=0.2, verbose=1)
                members.append(member)

            if pruning:
                # NOTE(review): create_pruned_lstm is not defined in the visible part of
                # the notebook. The second argument is presumably the width of the output
                # layer; output_steps is passed so that the predictions can be averaged
                # with the other members — confirm against its definition.
                pruned_model, pruner = create_pruned_lstm(
                    (X_train.shape[1], X_train.shape[2]), output_steps, 0.2, 0.001, 0.1
                )
                history = pruned_model.fit(X_train, y_train, epochs=100, batch_size=32,
                                           validation_split=0.2, verbose=1, callbacks=[pruner])
                members.append(pruned_model)

            # Evaluate the performance of the ensemble: average the member predictions
            y_pred_ensemble = np.mean([member.predict(X_test) for member in members], axis=0)
            holdout_loss = mean_absolute_error(y_test, y_pred_ensemble)
            print(f'Ensemble MAE: {holdout_loss}')
            model = members[0]
        else:
            # X_train is already windowed (3-D), so it is fitted directly; wrapping it in
            # TimeseriesGenerator would window it a second time.
            # NOTE(review): 'data_augmentation' therefore only selects this branch; no
            # augmentation is applied — decide what it should do.
            model = new_model()
            history = model.fit(X_train, y_train, batch_size=batch_size, epochs=epochs,
                                validation_split=0.2, verbose=1, callbacks=[es])
            holdout_loss = None

        # Cross-validate the configuration using the different methods.
        # X and y here are the notebook-level full feature table and target.
        candidates = []
        for cv_method_name, cv_method in cv_pairs:
            val_loss, cv_model, cv_history = cross_validate(
                X, y, hyperparams, cv_method_name, cv_method, input_steps, output_steps, metric
            )
            print('Validation loss:', val_loss)
            print('Hyperparameters:', hyperparams)
            candidates.append((val_loss, cv_model, cv_history, cv_method_name))

        if not candidates:
            # No cross-validation method configured: fall back to the hold-out score.
            if holdout_loss is None:
                holdout_loss = mean_absolute_error(y_test, model.predict(X_test))
            candidates.append((holdout_loss, model, history, None))

        # Update the best model if validation loss improves
        for val_loss, candidate_model, candidate_history, cv_method_name in candidates:
            if val_loss < best_val_loss:
                best_val_loss = val_loss
                best_model = candidate_model
                best_history = candidate_history
                best_cv_method = cv_method_name
                best_hyperparams = hyperparams

    if best_model is None:
        raise ValueError("No model was evaluated; hyperparameters_list is empty "
                         "or every validation loss was NaN.")

    # Print the best hyperparameters
    print('Selected hyperparameters:', best_hyperparams)
    print('Activation:', best_hyperparams['activation'])
    print('Optimizer:', best_hyperparams['optimizer'].__name__)
    print('Learning rate:', best_hyperparams['learning_rate'])
    print('Best validation loss:', best_val_loss)

    # Evaluation metrics of the best model on the training and the test data
    train_metric = mean_absolute_error(y_train, best_model.predict(X_train))
    best_mae = mean_absolute_error(y_test, best_model.predict(X_test))

    return best_model, best_history, train_metric, best_mae, best_hyperparams

#### Machine Learning Applied

The create_lstm_model function trains a Long Short-Term Memory (LSTM) model for time series prediction. LSTMs are a type of Recurrent Neural Network (RNN) that can learn and remember patterns over long sequences of data, making them particularly suitable for time series data. They overcome the limitations of traditional RNNs by addressing the vanishing and exploding gradient problems during training. This is achieved through the use of memory cells, input gates, output gates, and forget gates that regulate the flow of information and help the LSTM network maintain long-term dependencies.

#### Mathematical Concepts and Statistical Analysis

**Memory Cells:** Memory cells are the central component of LSTM units. They store the long-term information of the sequence. The memory cell state (denoted as C_t) is updated as follows:

C_t = f_t * C_(t-1) + i_t * g_t

Here, f_t is the forget gate output, i_t is the input gate output, and g_t is the candidate memory cell state, computed as g_t = tanh(W_g * [h_(t-1), x_t] + b_g).

**Gates:** LSTM units have three gates: input, output, and forget gates. These gates control the flow of information into, out of, and within the LSTM unit. Gates perform element-wise multiplication using sigmoid activation functions, which output values between 0 and 1.

_**1. Input Gate:**_ Controls the flow of new information into the memory cell. It is calculated as:

i_t = sigmoid(W_i * [h_(t-1), x_t] + b_i)

_**2. Forget Gate:**_ Controls the amount of past information to retain or forget. It is calculated as:

f_t = sigmoid(W_f * [h_(t-1), x_t] + b_f)

_**3. Output Gate:**_ Controls the output of the LSTM unit based on the current memory cell state. It is calculated as:

o_t = sigmoid(W_o * [h_(t-1), x_t] + b_o)

**Memory Cell State Update:** The memory cell state is updated using the input and forget gates, as well as the current input (x_t). The new memory cell state (C_t) is calculated using the equations mentioned above.

**Hidden State Update:** The hidden state (h_t) represents the LSTM's output at time step t. It is calculated as follows:

h_t = o_t * tanh(C_t)

**Training the LSTM Model:** The LSTM model is trained using backpropagation through time (BPTT) and gradient descent optimization. The model learns the optimal weights and biases for the gates and memory cells to minimize a loss function, typically Mean Squared Error (MSE) for time series prediction tasks.

**Regularization:** Regularization techniques like L2 regularization and dropout are employed to prevent overfitting. L2 regularization adds a penalty term to the loss function proportional to the squared magnitude of the weights, while dropout randomly sets a fraction of input units to 0 during training.

**Hyperparameter Tuning:** This function tests multiple combinations of hyperparameters such as the number of LSTM units, learning rate, dropout rate, and more. It selects the best model by comparing validation losses. This process helps to find the most suitable architecture and training settings for the specific problem.

By combining these mathematical concepts and statistical techniques, the create_lstm_model function trains an LSTM model that can capture complex patterns and long-term dependencies in time series data. The function iteratively trains models with different hyperparameters, evaluates their performance using cross-validation, and selects the best model based on the validation loss. This ensures that the chosen LSTM model has the best architecture and training settings among those tested for the specific problem, resulting in a more accurate and robust time series prediction.

#### Cross Applications
_The LSTM model trained by this function can be applied to various time series prediction tasks in different domains, such as:_

1. Stock market prediction
2. Weather forecasting
3. Energy demand forecasting
4. Sales forecasting
5. Traffic flow prediction
6. Human activity recognition

Moreover, the concepts and techniques used in this function can be adapted for other types of neural networks or machine learning models. By making appropriate adjustments to the model architecture, hyperparameters, and optimization techniques, it's possible to create a similar function for training and selecting the best model for other tasks and domains.

### Cross Validation
_The cross_validate function is designed to perform cross-validation on a dataset using a specified cross-validation method to evaluate a model's performance with given hyperparameters. Cross-validation is an essential step in the machine learning process, as it helps prevent overfitting and provides a more reliable estimate of a model's performance on unseen data. This function can be applied to various time series problems and can be adapted to work with different machine learning models and cross-validation techniques._

#### Reasoning and Decisions Made
1. The function initializes three lists to store the mean absolute errors, models, and training history for each fold of the cross-validation process.
2. The function loops through the splits generated by the cross-validation method and prepares the input data for the LSTM model using the previously defined to_supervised function.
3. For each fold, the model is trained and evaluated using the train_and_evaluate_model function, which calculates the mean absolute error, the model, and the training history.
4. The function then calculates the average validation loss across all folds and returns the best model and training history based on the lowest validation loss.

#### Local Outside References:
* to_supervised function
* train_and_evaluate_model function

#### Input
* X: A 2D array of input features
* y: A 1D array of target values
* hyperparams: A dictionary containing hyperparameters for the model
* cv_method_name: A string representing the name of the cross-validation method
* cv_method: A cross-validation method object from scikit-learn
* input_steps: The number of input time steps for the LSTM model
* output_steps: The number of output time steps for the LSTM model
* metric: A performance metric function from scikit-learn

#### Outputs: 
* val_loss: The average validation loss (mean absolute error) across all folds
* best_model: The best performing model (the one from the fold with the lowest error)
* best_history: The training history of the best performing model

In [345]:
def cross_validate(X, y, hyperparams, cv_method_name, cv_method, input_steps, output_steps, metric):
    """Score one hyperparameter set with a cross-validation splitter.

    Every ``(train_idx, val_idx)`` pair produced by ``cv_method.split(X)`` is
    turned into supervised windows with ``to_supervised`` and handed to
    ``train_and_evaluate_model``.

    Parameters
    ----------
    X : array-like
        Feature rows; converted to a NumPy array and indexed positionally.
    y : array-like
        Target values aligned with the rows of ``X``.
    hyperparams : dict
        Forwarded unchanged to ``train_and_evaluate_model``.
    cv_method_name : str
        Label used only in the failure message.
    cv_method : object
        Splitter exposing ``split(X)`` that yields ``(train_idx, val_idx)``.
    input_steps, output_steps : int
        Forwarded to ``to_supervised`` and ``train_and_evaluate_model``.
    metric : callable
        Forwarded to ``train_and_evaluate_model``.

    Returns
    -------
    tuple
        ``(val_loss, best_model, best_history)``. ``val_loss`` is the mean of
        the per-fold errors; the model and history come from the fold with the
        lowest error. When the splitter or the training step raises, or when
        the splitter yields no fold at all, the result is
        ``(np.inf, None, None)`` so the caller can discard this method by
        comparing losses.
    """
    fold_errors = []      # Error reported for each fold
    fold_models = []      # Model trained on each fold
    fold_histories = []   # Training history of each fold

    # Index X the same way y is indexed, so non-ndarray inputs behave consistently
    X = np.asarray(X)
    y = np.asarray(y)

    try:
        # Loop through the splits generated by the cross-validation method
        for train_idx, val_idx in cv_method.split(X):
            # Split the data into training and validation sets
            X_train_cv, X_val_cv = X[train_idx], X[val_idx]
            y_train_cv, y_val_cv = y[train_idx], y[val_idx]

            # Prepare input data for LSTM
            X_train_cv, y_train_cv = to_supervised(X_train_cv, y_train_cv, input_steps, output_steps)
            X_val_cv, y_val_cv = to_supervised(X_val_cv, y_val_cv, input_steps, output_steps)

            # Train and evaluate the model
            fold_mae, model, history = train_and_evaluate_model(
                X_train_cv, y_train_cv, X_val_cv, y_val_cv,
                hyperparams, input_steps, output_steps, metric,
            )
            fold_errors.append(fold_mae)
            fold_models.append(model)
            fold_histories.append(history)

    except Exception as e:
        # Deliberate best-effort: a splitter that cannot be used with this data
        # is reported and scored as infinitely bad instead of aborting the search.
        # Partial results from earlier folds are discarded on purpose.
        print(f"{cv_method_name}: Is incompatible with the model. Removing from consideration. Error: {e}")
        return np.inf, None, None

    # A splitter that produced no folds has nothing to average or rank
    if not fold_errors:
        return np.inf, None, None

    # Average validation loss across all folds, plus the artefacts of the best fold
    val_loss = np.mean(fold_errors)
    best_fold = int(np.argmin(fold_errors))
    return val_loss, fold_models[best_fold], fold_histories[best_fold]

#### Machine Learning Applied

In the context of the cross_validate function, cross-validation is a widely-used technique in machine learning to evaluate a model's performance. It involves partitioning the dataset into multiple subsets or "folds" and iteratively training the model on a subset while testing it on the remaining data. This process helps ensure that the model's performance is not biased towards any specific part of the dataset and provides a more accurate estimate of how well the model generalizes to new, unseen data.

#### Mathematical Concepts and Statistical Analysis:

**Average Validation Loss:** After performing cross-validation, the function calculates the average validation loss across all folds by taking the mean of the mean absolute errors for each fold. This average provides a more reliable estimate of the model's performance on unseen data.

#### Broader Picture and Cross Applications:

The cross_validate function is a versatile tool that can be applied to various time series problems and machine learning models. By allowing users to specify different cross-validation methods, the function can be adapted to suit the unique characteristics and requirements of different datasets and problem domains.

For instance, in the context of an organization, cross-validation can be used to evaluate the performance of machine learning models for forecasting sales, predicting equipment maintenance needs, or analyzing customer behavior patterns. By using cross-validation to optimize model performance, organizations can make more informed decisions, reduce costs, and improve overall efficiency.

Furthermore, the cross_validate function can be extended to work with other machine learning models and cross-validation techniques. This flexibility allows for the exploration of various modeling approaches, ultimately leading to better performance and increased value creation within an organization.

### Best Feature Selector
_The feature_selection_function aims to identify the most important features within a dataset to improve the performance of a machine learning model. The function evaluates three different feature selection methods (Recursive Feature Elimination, Sequential Feature Selection, and Correlation-based feature selection) and selects the method that results in the lowest mean absolute error (MAE)._

#### Local outside references:
* select_features_rfe
* select_features_sfs
* feature_selection_correlation

#### Inputs:
* X_train: Training dataset features
* y_train: Training dataset target variable
* X_test: Test dataset features
* y_test: Test dataset target variable

#### Outputs:
* X_train: Updated training dataset features after applying the best feature selection method
* X_test: Updated test dataset features after applying the best feature selection method
* feature_selection: The name of the best feature selection method
* best_feature_selection_result: The tuple containing the best feature selection method name, the best MAE, and the selected features
* The best feature selection result is printed as a side effect

In [346]:
# Function definition
def feature_selection_function(X_train, y_train, X_test, y_test):
    """Run the three feature selectors and keep the one with the lowest MAE.

    The candidates are ``select_features_rfe``, ``select_features_sfs`` and
    ``feature_selection_correlation``. The first two report their own MAE.
    The correlation filter returns only the reduced data, so it is scored here
    with a ``LinearRegression`` fitted on the reduced training set.

    Parameters
    ----------
    X_train, X_test : array-like
        Training and test features.
    y_train, y_test : array-like
        Training and test targets.

    Returns
    -------
    tuple
        ``(X_train, X_test, best_feature_selection_result, None)`` where the
        first two items are the datasets reduced by the winning method and
        ``best_feature_selection_result`` is
        ``(method_name, best_mae, selected_features)``. The trailing ``None``
        keeps the four-element shape of the original return statement. If no
        method yields a finite MAE the inputs are returned unchanged and the
        method name is ``None``.

    Side effects
    ------------
    Prints ``best_feature_selection_result``.
    """
    # Each candidate: (method label, MAE, reduced X_train, reduced X_test, selected features)
    candidates = []

    # RFE (Recursive Feature Elimination)
    X_train_rfe, X_test_rfe, mae_rfe, features_rfe = select_features_rfe(X_train, y_train, X_test, y_test)
    candidates.append(('Recursive Feature Selection', mae_rfe, X_train_rfe, X_test_rfe, features_rfe))

    # SFS (Sequential Feature Selection)
    X_train_sfs, X_test_sfs, mae_sfs, features_sfs = select_features_sfs(X_train, y_train, X_test, y_test)
    candidates.append(('Sequential Feature Selection', mae_sfs, X_train_sfs, X_test_sfs, features_sfs))

    # Correlation-based feature selection: returns no score, so evaluate it
    # with the same kind of model the other two selectors use.
    X_train_corr, X_test_corr, features_corr = feature_selection_correlation(X_train, y_train, X_test, y_test)
    y_train_2d = np.asarray(y_train).reshape(len(y_train), -1)
    y_test_2d = np.asarray(y_test).reshape(len(y_test), -1)
    corr_model = LinearRegression()
    corr_model.fit(X_train_corr, y_train_2d)
    mae_corr = mean_absolute_error(y_test_2d, corr_model.predict(X_test_corr))
    candidates.append(('Correlation', mae_corr, X_train_corr, X_test_corr, features_corr))

    # Initialize variables to store best results
    fbest_mae = float('inf')
    fbest_method = None
    ffeatures = None
    best_X_train, best_X_test = X_train, X_test

    # Strict "<" keeps the earliest candidate on ties (RFE, then SFS, then correlation)
    for method_name, method_mae, X_train_sel, X_test_sel, selected in candidates:
        if method_mae < fbest_mae:
            fbest_mae = method_mae
            fbest_method = method_name
            ffeatures = selected
            best_X_train, best_X_test = X_train_sel, X_test_sel

    # Store and report the best feature selection results
    best_feature_selection_result = (fbest_method, fbest_mae, ffeatures)
    print(best_feature_selection_result)

    # Return the reduced training and test datasets and the result tuple
    return best_X_train, best_X_test, best_feature_selection_result, None

Machine Learning Concepts:

Recursive Feature Elimination (RFE): RFE is a technique that recursively removes features and builds a model on the remaining features. It uses the model accuracy to identify which features contribute the most to the prediction.

Feature selection is an essential step in the machine learning pipeline as it helps in reducing the dimensionality of the dataset, which in turn reduces the complexity of the model, improves its performance, and reduces the risk of overfitting.
Recursive Feature Elimination (RFE): RFE is a technique that recursively removes features and builds a model on the remaining features. It uses the model accuracy to identify which features contribute the most to the prediction.

Sequential Feature Selection (SFS): SFS is a technique that iteratively adds or removes features based on their contribution to the model's performance. It can be implemented as forward, backward, or bidirectional selection.

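Correlation-based feature selection: In general, this method favours features that correlate strongly with the target variable while having low correlation with each other. The feature_selection_correlation function in this notebook implements only the second part: it drops a feature when its correlation with an earlier feature reaches the chosen threshold, and it does not look at the target.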


Mathematical Concepts and Statistical Analysis:

Mean Absolute Error (MAE): The performance metric used in this function is the mean absolute error, calculated as the average of the absolute differences between the predicted and actual values.

Correlation: Correlation is a measure of the strength and direction of the linear relationship between two variables. It ranges from -1 to 1, where -1 indicates a strong negative relationship, 0 indicates no linear relationship, and 1 indicates a strong positive relationship.

Broader Picture and Cross Applications:

Feature selection techniques are widely applicable across various domains and industries. In the context of an organization, the feature_selection_function can be used to:

Improve model performance: By selecting only the most relevant features, the model can make better predictions, which can lead to better decision-making and increased efficiency.

Reduce computational resources: With fewer features, the model requires less memory and processing power, which can lead to cost savings, especially when dealing with large datasets.

Enhance interpretability: A simpler model with fewer features is easier to understand and explain to stakeholders, facilitating better communication of the results.

Identify important variables: Feature selection can help identify key drivers of a target variable, which can inform the development of new products, services, or strategies.

Explanation for Non-Coders:

The select_features_rfe function uses Recursive Feature Elimination (RFE) for feature selection. It starts by initializing the number of features to select as 4 and the previous Mean Absolute Error (MAE) as infinity. It then iteratively trains an RFE model with a RandomForestRegressor estimator, fitting the model to the training data, and transforming the training and test datasets to include only the selected features. A Linear Regression model is then trained and evaluated on the selected features, calculating the MAE.

The process continues until the MAE is within the range of -1 to 1 or if the new MAE is further from 0 than the previous one. In the latter case, the number of features to select is increased, and the previous MAE is updated. Finally, the function returns the transformed training and test datasets, the MAE, and the selected features.

Broader Picture and Cross Applications:

Recursive Feature Elimination (RFE) is a versatile and widely applicable feature selection technique. It can be used across various domains and industries to enhance model performance, reduce computational resources, improve interpretability, and identify important variables. It is particularly useful in situations where there are a large number of features and a need to identify a smaller subset that contributes the most to the target variable.

Explanation of Machine Learning Concepts:

RFE is a wrapper-based feature selection method that iteratively removes the least important features and fits a model to the remaining features. It uses an estimator, such as a RandomForestRegressor, to rank the importance of each feature. The least important features are eliminated, and the process continues until a specified number of features remain.
Mathematical Concepts and Statistical Analysis:

Recursive Feature Elimination (RFE) is built upon the concept of model-based feature importance. The RandomForestRegressor, which is used as the estimator in this example, measures feature importance by calculating the average impurity decrease across all decision trees in the forest. For a regressor the impurity of a node is the variance of the target values in it (equivalently, the squared error around the node mean); Gini impurity and entropy are the corresponding measures for classification trees.

The Mean Absolute Error (MAE) is used as the evaluation metric in this function. MAE is calculated as the average of the absolute differences between the predicted and actual values:

$$\mathrm{MAE} = \frac{1}{n}\sum_{i=1}^{n} \left| y_i - y'_i \right|$$

where n is the number of samples, y_i is the actual value, and y'_i is the predicted value. MAE is a common metric for regression problems, as it provides an easily interpretable measure of the average prediction error. Because it is an average of absolute values, MAE is never negative.

Reasoning and Decisions Made:

The function begins with an initial number of features to select as 4, and it iteratively increases this number until the stopping condition is met. This process allows the function to search for the best number of features to retain, balancing model complexity and predictive performance.

The stopping condition is based on the MAE. If the MAE is within the range of -1 to 1, the process stops, as this indicates a satisfactory level of prediction error. Alternatively, if the new MAE is further from 0 than the previous one, the process stops, as this indicates that adding more features is not improving the model's performance.

Local Outside References and Things Needed to Be Imported:

Imports:

from sklearn.feature_selection import RFE
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error
import numpy as np
Inputs and Outputs of the Code:

Inputs:
X_train: Training dataset features
y_train: Training dataset target variable
X_test: Test dataset features
Outputs:
X_train: Transformed training dataset with selected features
X_test: Transformed test dataset with selected features
mae: Mean Absolute Error of the model on the test dataset
features: Selected features using RFE

In [347]:
Explanation for Non-Coders:

The select_features_rfe function uses Recursive Feature Elimination (RFE) for feature selection. It starts by initializing the number of features to select as 4 and the previous Mean Absolute Error (MAE) as infinity. It then iteratively trains an RFE model with a RandomForestRegressor estimator, fitting the model to the training data, and transforming the training and test datasets to include only the selected features. A Linear Regression model is then trained and evaluated on the selected features, calculating the MAE.

The process continues until the MAE is within the range of -1 to 1 or if the new MAE is further from 0 than the previous one. In the latter case, the number of features to select is increased, and the previous MAE is updated. Finally, the function returns the transformed training and test datasets, the MAE, and the selected features.

Broader Picture and Cross Applications:

Recursive Feature Elimination (RFE) is a versatile and widely applicable feature selection technique. It can be used across various domains and industries to enhance model performance, reduce computational resources, improve interpretability, and identify important variables. It is particularly useful in situations where there are a large number of features and a need to identify a smaller subset that contributes the most to the target variable.

Explanation of Machine Learning Concepts:

RFE is a wrapper-based feature selection method that iteratively removes the least important features and fits a model to the remaining features. It uses an estimator, such as a RandomForestRegressor, to rank the importance of each feature. The least important features are eliminated, and the process continues until a specified number of features remain.

SyntaxError: invalid syntax (1642003305.py, line 1)

In [348]:
# Feature Selection Function Using Recursive Feature Elimination
def select_features_rfe(X_train, y_train, X_test, y_test, feature_names=None):
    """Select features with Recursive Feature Elimination (RFE).

    Starting from 4 features (or fewer if the data has fewer columns), an RFE
    selector with a ``RandomForestRegressor`` estimator is fitted on the full
    flattened training data, a ``LinearRegression`` is trained on the selected
    columns, and its MAE on the test data is recorded. The number of features
    grows by one per round until one of the stopping rules fires.

    Stopping rules
    --------------
    * the MAE is at most 1 (MAE is never negative, so this is the
      "within -1 to 1" rule);
    * the MAE got worse than the best round so far, in which case the best
      earlier round is returned;
    * every available column has been selected.

    Parameters
    ----------
    X_train, X_test : array-like
        Features; flattened to ``(n_samples, -1)``.
    y_train, y_test : array-like
        Targets; flattened to ``(n_samples, -1)``.
    feature_names : sequence, optional
        Labels for the flattened columns. When omitted, ``X_train.columns`` is
        used if present, then the notebook-level ``X.columns`` if it exists.
        If no label sequence of matching length is found, integer column
        positions are returned instead.

    Returns
    -------
    tuple
        ``(X_train_selected, X_test_selected, mae, features)`` for the best
        round.
    """
    # Resolve column labels before the inputs are flattened into plain arrays
    if feature_names is None:
        feature_names = getattr(X_train, "columns", None)
    if feature_names is None:
        try:
            feature_names = X.columns  # legacy fallback: notebook-level feature frame
        except (NameError, AttributeError):
            feature_names = None

    # Flatten everything to 2D arrays; the originals are never overwritten so
    # every round selects from the complete set of columns
    X_train_2d = np.asarray(X_train).reshape(len(X_train), -1)
    X_test_2d = np.asarray(X_test).reshape(len(X_test), -1)
    y_train_2d = np.asarray(y_train).reshape(len(y_train), -1)
    y_test_2d = np.asarray(y_test).reshape(len(y_test), -1)
    # A single target column is passed to the forest as a 1-D vector
    y_fit = y_train_2d.ravel() if y_train_2d.shape[1] == 1 else y_train_2d

    n_total = X_train_2d.shape[1]
    n_features_to_select = min(4, n_total)  # Initial number of features to select
    best = None  # (X_train_selected, X_test_selected, mae, support mask)

    while True:  # Continue until a stopping condition is met
        # Rank features with a random forest and keep n_features_to_select of them
        rfe = RFE(estimator=RandomForestRegressor(), n_features_to_select=n_features_to_select)
        rfe.fit(X_train_2d, y_fit)
        X_train_sel = rfe.transform(X_train_2d)
        X_test_sel = rfe.transform(X_test_2d)

        # Train and evaluate a linear model on the selected columns
        model = LinearRegression()
        model.fit(X_train_sel, y_train_2d)
        mae = mean_absolute_error(y_test_2d, model.predict(X_test_sel))

        # Adding a feature made things worse: keep the previous best round
        if best is not None and mae > best[2]:
            break
        best = (X_train_sel, X_test_sel, mae, rfe.get_support())

        # Good enough, or nothing left to add
        if mae <= 1 or n_features_to_select >= n_total:
            break
        n_features_to_select += 1

    X_train_best, X_test_best, best_mae, support_mask = best

    # Translate the boolean mask into labels, or positions when labels are unavailable
    if feature_names is not None and len(feature_names) == support_mask.shape[0]:
        features = np.asarray(feature_names)[support_mask]
    else:
        features = np.flatnonzero(support_mask)

    return X_train_best, X_test_best, best_mae, features

In [349]:
# Feature Selection Using Sequential Feature Selection
def select_features_sfs(X_train, y_train, X_test, y_test, feature_names=None):
    """Select features with Sequential Feature Selection (SFS).

    Starting from 4 features (fewer when the data is narrow), a
    ``SequentialFeatureSelector`` with a ``RandomForestRegressor`` estimator is
    fitted on the full flattened training data, a ``LinearRegression`` is
    trained on the selected columns, and its MAE on the test data is recorded.
    The number of features grows by one per round until a stopping rule fires.

    Stopping rules
    --------------
    * the MAE is at most 1 (MAE is never negative, so this is the
      "within -1 to 1" rule);
    * the MAE got worse than the best round so far, in which case the best
      earlier round is returned;
    * the selector's limit is reached (it must select strictly fewer features
      than the data contains).

    Parameters
    ----------
    X_train, X_test : array-like
        Features; flattened to ``(n_samples, -1)``.
    y_train, y_test : array-like
        Targets; flattened to ``(n_samples, -1)``.
    feature_names : sequence, optional
        Labels for the flattened columns. When omitted, ``X_train.columns`` is
        used if present, then the notebook-level ``X.columns`` if it exists.
        If no label sequence of matching length is found, integer column
        positions are returned instead.

    Returns
    -------
    tuple
        ``(X_train_selected, X_test_selected, mae, features)`` for the best
        round.
    """
    # Resolve column labels before the inputs are flattened into plain arrays
    if feature_names is None:
        feature_names = getattr(X_train, "columns", None)
    if feature_names is None:
        try:
            feature_names = X.columns  # legacy fallback: notebook-level feature frame
        except (NameError, AttributeError):
            feature_names = None

    # Flatten everything to 2D arrays; the originals are never overwritten so
    # every round selects from the complete set of columns
    X_train_2d = np.asarray(X_train).reshape(len(X_train), -1)
    X_test_2d = np.asarray(X_test).reshape(len(X_test), -1)
    y_train_2d = np.asarray(y_train).reshape(len(y_train), -1)
    y_test_2d = np.asarray(y_test).reshape(len(y_test), -1)
    # A single target column is passed to the forest as a 1-D vector
    y_fit = y_train_2d.ravel() if y_train_2d.shape[1] == 1 else y_train_2d

    # The selector must pick strictly fewer features than are available
    max_selectable = X_train_2d.shape[1] - 1
    n_features_to_select = max(1, min(4, max_selectable))
    best = None  # (X_train_selected, X_test_selected, mae, support mask)

    while True:
        # Create SFS object with a RandomForestRegressor estimator and select n_features_to_select features
        sfs = SequentialFeatureSelector(estimator=RandomForestRegressor(), n_features_to_select=n_features_to_select)
        sfs.fit(X_train_2d, y_fit)
        X_train_sel = sfs.transform(X_train_2d)
        X_test_sel = sfs.transform(X_test_2d)

        # Train and evaluate a linear model on the selected columns
        model = LinearRegression()
        model.fit(X_train_sel, y_train_2d)
        mae = mean_absolute_error(y_test_2d, model.predict(X_test_sel))

        # Adding a feature made things worse: keep the previous best round
        if best is not None and mae > best[2]:
            break
        best = (X_train_sel, X_test_sel, mae, sfs.get_support())

        # Good enough, or the selector cannot take more features
        if mae <= 1 or n_features_to_select >= max_selectable:
            break
        n_features_to_select += 1

    X_train_best, X_test_best, best_mae, support_mask = best

    # Translate the boolean mask into labels, or positions when labels are unavailable
    if feature_names is not None and len(feature_names) == support_mask.shape[0]:
        features = np.asarray(feature_names)[support_mask]
    else:
        features = np.flatnonzero(support_mask)

    return X_train_best, X_test_best, best_mae, features

In [350]:
def feature_selection_correlation(X_train, y_train, X_test, y_test, feature_names=None):
    """Drop features that are highly correlated with an earlier feature.

    The feature-to-feature correlation matrix of ``X_train`` is computed. For
    each threshold in ``[0.1, 0.3, 0.5]``, column ``j`` is dropped when its
    (signed) correlation with any column ``i < j`` is at least the threshold.
    The threshold that keeps the most columns wins; on ties the lowest
    threshold is kept.

    Parameters
    ----------
    X_train, X_test : array-like of shape (n_samples, n_features)
        Features. Converted to NumPy arrays.
    y_train, y_test : any
        Accepted for signature parity with the other selectors; not used.
    feature_names : sequence, optional
        Labels for the columns. When omitted, ``X_train.columns`` is used if
        present, then the notebook-level ``column_names`` if it exists. If no
        label sequence of matching length is found, integer column positions
        are returned instead.

    Returns
    -------
    tuple
        ``(X_train_selected, X_test_selected, features)``.
    """
    # Resolve column labels before the inputs are converted to plain arrays
    if feature_names is None:
        feature_names = getattr(X_train, "columns", None)
    if feature_names is None:
        try:
            feature_names = column_names  # legacy fallback: notebook-level label array
        except NameError:
            feature_names = None

    X_train = np.asarray(X_train)
    X_test = np.asarray(X_test)

    c_thresholds = [0.1, 0.3, 0.5]

    # Compute the correlation matrix; the small epsilon avoids division by zero
    # for constant columns
    stddev = np.std(X_train, axis=0, ddof=1)
    X_train_norm = (X_train - np.mean(X_train, axis=0)) / (stddev.reshape(1, -1) + 1e-8)
    corr_matrix = np.dot(X_train_norm.T, X_train_norm) / (X_train_norm.shape[0] - 1)

    n_features = corr_matrix.shape[0]
    # True at (i, j) only for i < j, i.e. "column i comes before column j"
    earlier_pair = np.triu(np.ones((n_features, n_features), dtype=bool), k=1)

    best_columns = None
    for c_threshold in c_thresholds:
        # A column is redundant when any earlier column correlates with it at or above the threshold
        redundant = (earlier_pair & (corr_matrix >= c_threshold)).any(axis=0)
        columns = ~redundant

        # Keep the mask that retains the most columns; strict ">" keeps the first on ties
        if best_columns is None or columns.sum() > best_columns.sum():
            best_columns = columns

    # Use the best set of columns found
    X_train = X_train[:, best_columns]
    X_test = X_test[:, best_columns]

    # Translate the boolean mask into labels, or positions when labels are unavailable
    if feature_names is not None and len(feature_names) == n_features:
        features = np.asarray(feature_names)[best_columns]
    else:
        features = np.flatnonzero(best_columns)

    return X_train, X_test, features

In [351]:
import itertools
from tensorflow import keras
def generate_hyperparameter_list():
    """Enumerate the full hyperparameter search grid.

    Returns
    -------
    list of dict
        One dictionary per point of the Cartesian product of the option lists
        below. Keys are the hyperparameter names; the order of the list follows
        ``itertools.product`` over the options in declaration order.
    """
    optimizers = keras.optimizers

    # Search space: hyperparameter name -> candidate values
    search_space = {
        'lstm_units': [32, 64, 128],
        'activation': ['relu', 'sigmoid', 'tanh'],
        'dropout': [0, 0.1, 0.2, 0.3],
        'regularization': [0.001, 0.01, 0.1],
        'optimizer': [optimizers.Adam, optimizers.SGD, optimizers.RMSprop],
        'learning_rate': [0.001, 0.01, 0.1],
        'batch_size': [32, 64],
        'epochs': [50, 100],
        'early_stopping_patience': [5, 10, 15],
        'threshold': [0.1, 0.5, 0.9],
        'data_augmentation': [False, True],
        'ensembling': [False, True],
        'pruning': [False, True],
    }

    names = tuple(search_space)
    candidate_values = tuple(search_space.values())

    # Pair the names with each combination of values
    grid = []
    for combination in itertools.product(*candidate_values):
        grid.append(dict(zip(names, combination)))

    return grid

In [352]:
#4
class HoldoutMethod:
    """Single random train/test split exposing a ``split(X)`` method.

    Parameters
    ----------
    test_size : float, default 0.2
        Fraction of the rows assigned to the test set; the count is
        ``int(test_size * n_samples)``.
    random_state : int or None, default None
        Seed for a reproducible split. With ``None`` the global NumPy random
        state is used, so the split differs between calls unless the caller
        seeded ``np.random``. With a seed, every call to ``split`` on
        same-sized data returns the same indices.
    """

    def __init__(self, test_size=0.2, random_state=None):
        self.test_size = test_size
        self.random_state = random_state

    # Method for splitting the data into train and test sets using the Hold Out method
    def split(self, X):
        """Return ``[(train_indices, test_indices)]`` for the rows of ``X``.

        ``test_indices`` are in the random order in which they were drawn;
        ``train_indices`` are the remaining row positions in ascending order.
        """
        n_samples = X.shape[0]
        n_test = int(self.test_size * n_samples)

        if self.random_state is None:
            test_indices = np.random.choice(n_samples, n_test, replace=False)
        else:
            # A fresh generator per call makes repeated splits identical for a given seed
            rng = np.random.RandomState(self.random_state)
            test_indices = rng.choice(n_samples, n_test, replace=False)

        train_indices = np.setdiff1d(np.arange(n_samples), test_indices)
        # Returns a list containing tuples of train and test indices
        return [(train_indices, test_indices)]

In [353]:
class Pruner:
    """Callable that builds a list of pruning-related callbacks.

    NOTE(review): every call below goes through a module-level name
    ``sparsity`` that is not imported in the cells visible here. It looks like
    a model-pruning library namespace; confirm the import exists before use.
    """

    def __init__(self, threshold):
        # Used as the ``final_sparsity`` of the pruning schedule in ``__call__``
        self.threshold = threshold

    def __call__(self, epoch, logs=None):
        """Build and return the callback list.

        ``epoch`` and ``logs`` are accepted but not used in the body.
        Returns a list holding three callback objects created via ``sparsity``.
        """
        # Pruning schedule: sparsity goes from 0.50 to ``self.threshold``
        # between steps 2000 and 4000
        pruning_params = {'pruning_schedule': sparsity.PolynomialDecay(initial_sparsity=0.50,
                                                                       final_sparsity=self.threshold,
                                                                       begin_step=2000,
                                                                       end_step=4000)}
        # Step-update and summary callbacks; summaries are written under ./logs
        callbacks = [sparsity.UpdatePruningStep(), sparsity.PruningSummaries(log_dir='./logs', profile_batch=0)]
        # NOTE(review): ``pruning_params`` is consumed only here; verify that
        # ``sparsity.PruningCallback`` exists in the library bound to ``sparsity``.
        callbacks.append(sparsity.PruningCallback(pruning_params, verbose=1))

        return callbacks
