In [None]:
# %run path/to/utils.py.ipynb (in this case, I had the utils.py.ipynb notebook in the same folder as the other notebooks)
# Running this will also let you know the installed version of TensorFlow ("TF")
# In my case, I built TF 2.10.0 from source for the Windows environment --> the latest TensorFlow version that supports training models on a GPU in Windows
# Please be aware that you may need to pick the numpy and pandas versions carefully to avoid issues with running this notebook (numpy version = pandas version = ) 
    ## For more details please visit the tensorflow website (build from source - windows): https://www.tensorflow.org/install/source_windows
%run utils.py.ipynb

# Many-to-Many (Sequence-to-Sequence) Bi-LSTM Example

## Contents: 
- Dataset Creation
- Training Model
- Saving Model
- Displaying Results
- Saving Results
- Loading Model
- Evaluating Model
- Additional Examples: (Many-to-One) + LSTM, GRU, Bi-GRU, ANN, Linear Regression

In [None]:
# Path of the Excel workbook that holds the production data
Production_data_path = 'Example_Data.xlsx'

# Open the workbook ONCE. The context manager closes the file handle as soon as
# the selected sheets have been parsed (previously the workbook was opened twice
# and neither handle was ever closed).
with pd.ExcelFile(Production_data_path) as excelfile:
    # All sheet names available in the workbook
    all_sheets = excelfile.sheet_names

    # Let the user select one or more sheet names from the workbook in a GUI
    # RETURN VARIABLE NAME: "user_selected_sheets" --> set as a global variable inside the "select_sheets" function
    user_selected_sheets = select_sheets(sheet_names = all_sheets)
    # from my_methods import user_selected_sheets # importing the global variable after setting it up using the GUI (user_selected_sheets is a global variable that is created after executing the GUI)

    # Fail early with a clear message instead of pandas' generic
    # "No objects to concatenate" error further down.
    if len(user_selected_sheets) == 0:
        raise ValueError('No sheets were selected; select at least one sheet to build Production_data.')

    # Parse every selected worksheet into its own DataFrame
    list_of_dfs = [excelfile.parse(sheet) for sheet in user_selected_sheets]

# Combine all DataFrames into one. The sheet name of every row is kept in a
# 'Sheet_names' column and the row index is rebuilt from 0.
Production_data = pd.concat(list_of_dfs, keys=user_selected_sheets, names=['Sheet_names',None]).reset_index(level=0).reset_index(drop=True)

# Print - High level info on production data
print('\nCountries:')
print(['United States']) # hard-coded; can be extracted from Production_data later
print('\nStates:')
print(['North Dakota']) # hard-coded; can be extracted from Production_data later
print('\nCounties:')
print(list(Production_data['County'].unique()))
print('\nReservoirs:')
print(list(Production_data['targetFormation'].unique()))
print('\nNumber of selected wells:')
print(len(Production_data['API/UWI List'].unique()))

# Print - Low level info on production data
print('\nFirst Production Years Interval:')
print(Production_data['firstProdDate'].min().year,'-',Production_data['firstProdDate'].max().year)
print('\nTrue Vertical Depth (TVD) variation')
print(Production_data['TVD'].min(),'-',Production_data['TVD'].max())

In [None]:
# Configuration for processing the data: train / validation / test fractions,
# the columns to predict, and the control parameters.
# (Original note: processing is based on the train, val, test percentages and
# the remaining window size for the val and test sets.)
train_percent, val_percent, test_percent = 0.6, 0.2, 0.2
label_columns = ['Monthly Oil', 'Monthly Gas', 'Monthly Water']
control_parameters = ['Days']

Processed_Production_data = process_data(
    Production_data = Production_data,
    train_percent = train_percent,
    val_percent = val_percent,
    test_percent = test_percent,
    label_columns = label_columns,
    control_parameters = control_parameters,
)

In [None]:
# Show the processed production data (displayed BEFORE the two columns below are added)
display(Processed_Production_data)

# Derive the month and year of each row's first production date; these are
# added as static variables that can be included in the model.
first_prod_date = Processed_Production_data['firstProdDate']
Processed_Production_data['First_Prod_Month'] = first_prod_date.dt.month
Processed_Production_data['First_Prod_Year'] = first_prod_date.dt.year

In [None]:
# Split data
# The split fractions come from the configuration cell (train_percent,
# val_percent, test_percent) so that this split always agrees with the values
# passed to process_data instead of silently using separate hard-coded copies.
train_df, val_df, test_df = split_data(Processed_Production_data, train_percent=train_percent, val_percent=val_percent, test_percent=test_percent)

# Count the production length of each well: every row receives the number of
# rows that share its 'API/UWI List' value within the same split.
for split_df in (train_df, val_df, test_df):
    split_df['frequency'] = split_df.groupby('API/UWI List')['API/UWI List'].transform('count')

max_train_prod_period = train_df['frequency'].max()
max_val_prod_period = val_df['frequency'].max()
max_test_prod_period = test_df['frequency'].max()
min_train_prod_period = train_df['frequency'].min()
min_val_prod_period = val_df['frequency'].min()
min_test_prod_period = test_df['frequency'].min()

# Observe maximum and minimum length of production period for each well in every split
# Use this info to decide on train_percent, val_percent, test_percent for splitting the data
# Use this info to decide on the maximum window size for training and testing
print('Training Data Max Production Period:')
print(max_train_prod_period)
print('Validation Data Max Production Period:')
print(max_val_prod_period)
print('Test Data Max Production Period:')
print(max_test_prod_period)
print('Training Data Min Production Period:')
print(min_train_prod_period)
print('Validation Data Min Production Period:')
print(min_val_prod_period)
print('Test Data Min Production Period:')
print(min_test_prod_period)

# Create frequency plots for production periods in Training, Validation, and Test data
# Additionally use this plot to adjust train_percent, val_percent, test_percent for splitting the data
print('Time Series Length of Each Well (Train,Val,Test):')
fig, ax = plt.subplots(nrows=3, ncols=1, figsize=(15, 15))
splits_to_plot = (('Training', train_df), ('Validation', val_df), ('Test', test_df))
for axis, (split_name, split_df) in zip(ax, splits_to_plot):
    # One bar per well; tick labels are suppressed because there are too many wells to label
    split_df['API/UWI List'].value_counts().plot(ax=axis, kind='bar', xlabel='Unique Wells', ylabel='Frequency', xticks=[], title=f'{split_name} - Time Series Length Distribution')
fig.tight_layout()
plt.show()

# Observe the training, validation, and test data
# Use this to confirm the data integrity before starting training and testing
print('\nTraining Data:')
display(train_df)
print('Validation Data:')
display(val_df)
print('Test Data:')
display(test_df)

In [None]:
# Normalize the three splits with the 'min-max' method.
# The call returns the (re-bound) train/val/test frames plus train_max and
# train_min, which are passed on to save_pred_results later in the notebook.
train_df, val_df, test_df, train_max, train_min = normalize_data(
    train_df,
    val_df,
    test_df,
    plot_cols = ['Monthly Oil', 'Monthly Gas', 'Monthly Water', 'Days'],
    norm_method = 'min-max',
)

In [None]:
# Spot check: show the 'Monthly Water_norm' values of one well (API/UWI 3306101267) in the training split
train_df.loc[train_df['API/UWI List'].eq(3306101267), 'Monthly Water_norm']

In [None]:
# Dynamic (time-varying) variables to include in the model
dynamic_variables = [
    'Monthly Oil',
    'Monthly Gas',
    'Monthly Water',
    'Days',
]

# Report the chosen variables and how many there are
print('\nDynamic_variables:\n', dynamic_variables, sep='\n')
print('\nDynamic_variables length:', len(dynamic_variables), sep='\n')

In [None]:
# Build the column names used for training: every name gets the '_norm' suffix
# (the '_norm' columns are the ones selected from the frames after normalization).
append_str = '_norm'
input_columns = [name + append_str for name in dynamic_variables]
labels = ['Monthly Oil', 'Monthly Gas', 'Monthly Water'] # change if necessary
# NOTE: this re-binds label_columns (previously the raw, un-suffixed names) to the '_norm' names
label_columns = [name + append_str for name in labels]

print('\nDynamic variables to use in the model:\n')
print(input_columns)
print('\nTarget variables to be predicted @ t = t+1:\n')
print(label_columns)
print('\nControl parameters for prediction of labels:\n')
# Inputs that are not labels, listed in input order. A set difference was used
# before, which gives a hash-dependent (non-reproducible) order as soon as
# there is more than one control parameter.
print([column for column in input_columns if column not in label_columns])

In [None]:
# Window settings for generating the training, validation, and testing batches
# (they control many-to-many vs many-to-one, and one-step-ahead vs multi-step-ahead predictions)
#
#   input_width = label_width and shift = 1 and final rnn layer return_sequences = True
#       --> many-to-many with one step ahead predictions
#   input_width != label_width, label_width = 1 and shift = 1 and final rnn layer return_sequences = False
#       --> many-to-one with one step ahead predictions

# Number of time steps in every input window
input_width = 6

# Number of time steps in every label window
label_width = 6       # many-to-many (sequence-to-sequence)
label_width_one = 1   # many-to-one

# Passed to WindowGenerator as batch_number. Original note: determines how many
# batches to process at once; ideally, the higher the number, the smoother the
# gradient surface becomes.
# NOTE(review): this looks like the batch size (examples per batch) - confirm in the utils notebook.
batch_number = 512

# The shift parameter determines how many steps the window moves when creating
# the datasets for training, validation, and testing. With shift = 1 the
# movement is one-by-one: a window size of 6 covers points 1-to-6 in the inputs
# and points 2-to-7 in the labels (outputs) for forecasting. The next example
# moves one step ahead: points 2-to-7 for the input and 3-to-8 for the label, etc.
shift = 1


In [None]:
# Build the window-based batches (train, val, test)

# Standard RNN, many-to-many: label_width equals input_width
w2 = WindowGenerator(
    input_width = input_width,
    label_width = label_width,
    shift = shift,
    input_columns = input_columns,
    label_columns = label_columns,
    shuffle_data = False,
    batch_number = batch_number,
    train_df = train_df,
    val_df = val_df,
    test_df = test_df,
)

# Standard RNN, many-to-one variant (label_width = label_width_one). It is not
# trained in this script, but the explanation is given further below.
# w2_one = WindowGenerator(input_width = input_width, label_width = label_width_one, shift = shift, input_columns = input_columns, label_columns = label_columns, shuffle_data = False, batch_number = batch_number, train_df=train_df, val_df=val_df, test_df=test_df)

In [None]:
# Fetch the train / validation / test datasets from the window generator once
# and keep them under their own names.
# (Original note: faster training times during bayesian optimization with tensor datasets.)

# Standard RNN (many-to-many)
w2_train, w2_val, w2_test = w2.train, w2.val, w2.test


In [None]:
# An example method for observing the processed training, validation, and test sets (in this case the test set is observed, but the training and validation sets can be observed the same way if desired)
# Warning: Uncomment to run and display the dataset, but be aware that this can be very slow for moderately large datasets! It is suggested to try with a smaller dataset first to check that the training, validation, and test sets are created correctly.
# list(w2_test)

In [None]:
# Shared training setup (same for all the models): input layer, callbacks, optimizer

# Input layer: shape = (input width, number of dynamic features)
i = Input(name='input_0', shape=[input_width, len(dynamic_variables)])

# Validation patience for early stopping (choose this value carefully as it can
# impact the model performance significantly).
patience = 30

# Callback that reduces the learning rate during training to learn more accurately.
# NOTE(review): reduce_lr is created here, but the visible fit call passes only
# early_stopping as a callback - confirm whether it should be added there.
reduce_lr = tf.keras.callbacks.ReduceLROnPlateau(
    monitor='val_loss',
    factor=0.1,
    patience=20,
    min_lr=1e-5,
)

# Early stopping on the validation loss; restores the best weights if the
# training somehow goes way beyond the optimum point.
early_stopping = tf.keras.callbacks.EarlyStopping(
    monitor='val_loss',
    mode='min',
    min_delta=1e-07,
    patience=patience,
    restore_best_weights=True,
)

# Optimizer (ADAM is used here, but a different optimizer can be chosen for testing purposes).
optimizer = tf.keras.optimizers.Adam(
    learning_rate=1e-4,
    beta_1=0.95,
    beta_2=0.999,
    epsilon=1e-07,
    amsgrad=False,
)



In [None]:
# Reviewer Revision - Adding Bi-LSTM
# Two bi-directional LSTM layers; the final layer returns sequences (many-to-many).
# Other create_model arguments that are not used here: ST_Hybrid = False, c = c
Standard_best_Bi_LSTM = create_model(
    final_layer_return_seq = True,
    loss = 'mse',
    metrics = ['mae', 'mape'],
    layers = 2,
    dropout_final_layer = False,
    dropout_value = 0.07490910432625673,
    units = [[244, 250], [132, 245]],  # one pair of unit counts per layer
    Bi_directional = True,
    recurrent_dropout = 'zeros',
    model_type = 'LSTM',
    i = i,
    optimizer = optimizer,
)
Standard_best_Bi_LSTM.summary()

In [None]:
# Train the Bi-LSTM on the training windows and validate on the validation windows.
# NOTE(review): epochs (10) is smaller than the early-stopping patience (30)
# configured earlier in this notebook, so early stopping presumably cannot
# trigger during this run - confirm that this is intended for the example.
history_standard_best_Bi_LSTM = Standard_best_Bi_LSTM.fit(
    w2_train,
    validation_data = w2_val,
    epochs = 10,
    callbacks = [early_stopping],
    verbose = 2,
)

In [None]:
# Evaluate the trained model on the training, validation, and test datasets (in that order).
# Only the value returned by the last call (test set) becomes the cell's
# output value; the return values of the first two calls are discarded.
Standard_best_Bi_LSTM.evaluate(w2_train)
Standard_best_Bi_LSTM.evaluate(w2_val)
Standard_best_Bi_LSTM.evaluate(w2_test)

In [None]:
# Saves a plot of oil, gas, and water phases for each well
# Creates a "Plots" folder in the current working directory (if it does not already exist)
# Can select either all wells to evaluate (index_slice_start = 0 and index_slice = "all") or the "ith" well to the "nth" well (index_slice_start = i and index_slice = n).
# index_slice_start = i (i represents the index of the well to start the evaluation): i = 0 means the first well and i = 5 means the 6th well.


# Current working directory (the folder the notebook is running from)
script_dir = os.getcwd()

# Prepare to save to the "Plots" folder
folder_name = "Plots"
plots_folder_path = os.path.join(script_dir, folder_name)

# Create the "Plots" folder if it does not already exist
os.makedirs(plots_folder_path, exist_ok=True)

print(f"Folder created at: {plots_folder_path}")

# The window settings (input_width, label_width, batch_number) are taken from
# the configuration cell instead of being repeated as literals, so that the
# prediction windows always match the windows the model was trained on.
saved_data  = save_pred_results(input_width = input_width,
                               label_width = label_width,
                               norm_method = 'min-max',
                               input_columns = input_columns,
                               label_columns = label_columns,
                               batch_number = batch_number,
                               index_slice_start = 0, # Chooses which well to begin the evaluation with (0 = first well)
                               path_to_save_pred_plots = plots_folder_path,
                               model = Standard_best_Bi_LSTM,
                               model_name = 'Bi_LSTM_monthly_rates_min_max_sigmoid',
                               index_slice = 5, # Can choose "all" for evaluating the model on all the wells (Warning: slower if there are a lot of wells to evaluate)
                               train_df = train_df,
                               val_df = val_df,
                               test_df = test_df,
                               # NOTE(review): with norm_method = 'min-max' the training minimum is passed as
                               # train_mu and the training maximum as train_sigma - confirm against save_pred_results.
                               train_mu = train_min,
                               train_sigma = train_max,
                               logscale = 'off',
                               normscale = 'off',
                               plot_loss = 'mae',
                               split_loss = 'off')

In [None]:
# An example for saving the training history into a json file so that it can be loaded again later
# Creates the "Model_histories" folder to save the training history (if it doesn't already exist)

# Current working directory (the folder the notebook is running from)
script_dir = os.getcwd()

# Prepare to save to the "Model_histories" folder
folder_name = "Model_histories"
histories_folder_path = os.path.join(script_dir, folder_name)

# Create the "Model_histories" folder if it does not already exist
os.makedirs(histories_folder_path, exist_ok=True)

print(f"Folder created at: {histories_folder_path}")

history_path = histories_folder_path

# Get the dictionary containing each metric and the loss for each epoch
history_dict = history_standard_best_Bi_LSTM.history

# Save it as a json file. The "with" block guarantees that the file is flushed
# and closed; the previous bare open(...) never closed the handle, so the file
# could be incomplete when it was read back.
with open(os.path.join(history_path,'history_Standard_best_Bi_LSTM_Bak_three_min_max_monthly.json'), 'w') as history_file:
    json.dump(history_dict, history_file)

In [None]:
# Example of loading the previous training history
history_path = histories_folder_path

Bi_LSTM_history = "history_Standard_best_Bi_LSTM_Bak_three_min_max_monthly.json"

# Read the json file inside a "with" block so that the file handle is closed
# right after loading (the previous bare open(...) leaked it).
with open(os.path.join(history_path,Bi_LSTM_history), 'r') as history_file:
    history_dict_Bi_LSTM = json.load(history_file)

In [None]:
# Persist the saved_data object returned by save_pred_results.
# (Original note: it includes all of the well data + the computed losses for the
# train, val, and test sets for each phase (oil, gas, water), which allows
# comparing models in the future.)

# Target folder: "All_Results" inside script_dir (created if it does not already exist)
folder_name = "All_Results"
All_Results_folder_path = os.path.join(script_dir, folder_name)
os.makedirs(All_Results_folder_path, exist_ok=True)
save_results_path = All_Results_folder_path

print(f"Folder created at: {All_Results_folder_path}")

# Write the results as a binary pickle file
with open(os.path.join(save_results_path,'Standard_best_Bi_LSTM_min_max_monthly.pickle'), 'wb') as f:
    pickle.dump(saved_data, f)

In [None]:
# Plot the training history that was loaded from the json file
plt.rcParams['figure.figsize'] = [8, 8]
#plt.rcParams.update({'font.size': 20})

# Keys currently available in the history json file (depends on how the model was trained):
# "loss", "val_loss", "mae", "val_mae", "mape", "val_mape"
# Only the loss (MSE = mean squared error) is shown here.
plt.plot(
    history_dict_Bi_LSTM['loss'],
    label = 'Bi-LSTM Train',
    color = 'green',
    lw = 6,
)
plt.plot(
    history_dict_Bi_LSTM['val_loss'],
    label = 'Bi-LSTM Val',
    color = 'red',
    linestyle ='dotted',
    lw = 2,
    marker = '^',
    markevery=1,
    ms = 10,
)

# Titles, axis labels, and tick fonts
plt.title('Model Training', fontsize = 20)
plt.xlabel('Epochs', fontsize = 20)
plt.ylabel('MSE', fontsize = 20)
plt.xticks(fontsize = 20)
plt.yticks(fontsize = 20)
plt.xlim(-1,10)  # hard-coded x-range; adjust when the number of epochs changes
plt.legend(loc='upper right', fontsize = 15)

#plt.show()
# Saves directly into the current working directory. The save path can be modified.
plt.savefig('Min_max_monthly_training_history.png', dpi=300, bbox_inches = 'tight')

In [None]:
# A method to load all the previously saved results

save_results_path = All_Results_folder_path

# possible_selections for "error_type" = ['train_error', 'val_error', 'test_error']
# possible_selections for "error" = ['mse','rmse','mae','mape','nmse','nrmse','nmape','nmae','wmape']
# possible_selections for "phase" = ['Monthly Oil', 'Monthly Gas', 'Monthly Water']
# other possible selections = depends on the available keys (please run the next cell and check the other possibilities)


# Load inside a "with" block so that the file is closed even if unpickling fails.
# NOTE: pickle.load can execute arbitrary code - only load pickle files that
# were created by this notebook (or that come from a trusted source).
with open(os.path.join(save_results_path,'Standard_best_Bi_LSTM_min_max_monthly.pickle'),'rb') as file:
    Standard_best_Bi_LSTM_results = pickle.load(file)

error_type = 'val_error'
error = 'mse' # any of the error names listed under "error" above
phase = 'Monthly Oil'

# Test it: mean of the selected error over all rows of the selected split whose 'phase' equals the selected phase
Standard_best_Bi_LSTM_results[error_type].loc[Standard_best_Bi_LSTM_results[error_type]['phase'] == phase][error].mean()

In [None]:
# Show all the keys available in the loaded results object
# (these are the possible selections for indexing Standard_best_Bi_LSTM_results)
Standard_best_Bi_LSTM_results.keys()

In [None]:
# Show the contents stored under the "train_results" key of the loaded results
Standard_best_Bi_LSTM_results['train_results']

In [None]:
# Saving a model

# Prepare to save to the "Models" folder
folder_name = "Models"
Models_folder_path = os.path.join(script_dir, folder_name)

# Create the "Models" folder if it does not already exist
os.makedirs(Models_folder_path, exist_ok=True)

print(f"Folder created at: {Models_folder_path}")

# Save into the folder that was just created. The previous hard-coded absolute
# path (a user-specific Windows desktop folder) does not exist on other machines
# and left the freshly created "Models" folder unused.
save_path = Models_folder_path
Standard_best_Bi_LSTM.save(os.path.join(save_path,'Standard_best_Bi_LSTM_monthly.h5'))

In [None]:
# Load the model back from the .h5 file that was written into save_path
Loaded_Bi_LSTM = tf.keras.models.load_model(
    os.path.join(save_path, 'Standard_best_Bi_LSTM_monthly.h5')
)

In [None]:
# Evaluate the loaded model on the training, validation, and test datasets (in that order).
# Only the value returned by the last call (test set) becomes the cell's
# output value; the return values of the first two calls are discarded.
Loaded_Bi_LSTM.evaluate(w2_train)
Loaded_Bi_LSTM.evaluate(w2_val)
Loaded_Bi_LSTM.evaluate(w2_test)

# Additional Examples
- Additional Examples: (Many-to-One) + LSTM, GRU, Bi-GRU, ANN, Linear Regression

In [None]:
# Standard RNN (many-to-one) 
# To see the explanation on how to train these models using the create_model function, check the training for Bi_GRU at the very bottom of this notebook.

# w2_train_one = w2_one.train
# w2_val_one = w2_one.val
# w2_test_one = w2_one.test

# If desired, you can modify the functions in the "utils" notebook to create a one-to-one or one-to-many model, although for production forecasting it was observed that many-to-many performs better (examine how the examples above were created for more insights)

In [None]:
# If desired to create a linear model, you can change the model_type = 'LR'. This still runs an ANN but, a single layer with a linear activation function (Not recommended for production forecasting).

# standard_ann = create_model(final_layer_return_seq = True, loss = 'mse', metrics = ['mae','mape'], layers = 2, dropout_final_layer = False, dropout_value = 0.07490910432625673, units = [144,256], Bi_directional = False, recurrent_dropout = 'zeros', model_type = 'Dense', i = i, optimizer = optimizer)
# standard_ann.summary()

In [None]:
# history_standard_ann = standard_ann.fit(w2_train, epochs=10, validation_data=w2_val, verbose=2, callbacks=[early_stopping])

In [None]:
# standard_LSTM = create_model(final_layer_return_seq = True, loss = 'mse', metrics = ['mae','mape'], layers = 2, dropout_final_layer = False, dropout_value = 0.07490910432625673, units = [144,256], Bi_directional = False, recurrent_dropout = 'zeros', model_type = 'LSTM', i = i, optimizer = optimizer) 
# standard_LSTM.summary()

In [None]:
# history_standard_LSTM = standard_LSTM.fit(w2_train, epochs=10, validation_data=w2_val, verbose=2, callbacks=[early_stopping])

In [None]:
# standard_GRU = create_model(final_layer_return_seq = True, loss = 'mse', metrics = ['mae','mape'], layers = 2, dropout_final_layer = False, dropout_value = 0.07490910432625673, units = [144,256], Bi_directional = False, recurrent_dropout = 'zeros', model_type = 'GRU', i = i, optimizer = optimizer) 
# standard_GRU.summary()

In [None]:
# history_standard_GRU = standard_GRU.fit(w2_train, epochs=10, validation_data=w2_val, verbose=2, callbacks=[early_stopping])

In [None]:
# For Bi-directional models -> units = [forward units, backward units] (repeat for each layer)
    # units format = when layers = 2 --> units = [[244,250],[132,245]]
    # In the example below, since layers = 2, there are 2 units moving forward and backward --> units = [[244,250],[132,245]]. 
    # So for layer#1 --> units --> [244,250] --> [forward units, backward units] and the next one ([132,245]) is the same for layer#2
# This example is for many-to-many models (check above on how to create many-to-one models using the window generator)
# For many-to-one models, the final_layer_return_seq = False

# standard_Bi_GRU = create_model(final_layer_return_seq = True, loss = 'mse', metrics = ['mae','mape'], layers = 2, dropout_final_layer = False, dropout_value = 0.07490910432625673, units = [[244,250],[132,245]], Bi_directional = True, recurrent_dropout = 'zeros', model_type = 'GRU', i = i, optimizer = optimizer)
# standard_Bi_GRU.summary()

In [None]:
# history_standard_Bi_GRU = standard_Bi_GRU.fit(w2_train, epochs=10, validation_data=w2_val, verbose=2, callbacks=[early_stopping])