### This file contains the ANN method with $l_q=14$.

1. The hyperparameter searching and training process is commented out for reproduction. Currently, the best hyperparameters (***for Supplementary Table 1***) and the trained models are stored in `/Reproduction/best_hps.csv` and `/Reproduction/TrainedModels/`, respectively. Please have a look at Section 3.5 and Section 4 of README for more information. If the hyperparameter searching and training process is restored, the best hyperparameters will be stored in `/Reproduction/Results/ANN/best_hps.csv` and the trained models will be stored in `/Reproduction/Results/ANN/TrainedModels/`.

2. The results will be stored in `/Reproduction/Results/ANN/`.

In [None]:
import os
import pandas as pd
import numpy as np
import keras_tuner as kt
import tensorflow as tf
import datetime
import csv
from sklearn.preprocessing import StandardScaler
from tensorflow.keras import layers

# Disable pandas' chained-assignment (SettingWithCopy) warning for the whole notebook.
pd.options.mode.chained_assignment = None

In [None]:
# This file requires TF = 2.9.0
# Print the GPU devices TensorFlow can see (an empty list means none were detected)
# together with the installed TensorFlow version, so the requirement above can be checked.
print(tf.config.list_physical_devices('GPU'), tf.__version__)

In [None]:
# Length of the input window handed to split_sequence() and the model
# (presumably the l_q = 14 mentioned in the notebook header).
template_length = 14

# All input and output locations are resolved relative to the directory
# the notebook is started from.
current_dir = os.getcwd()

# Folder that holds the input data files.
path = ''.join([current_dir, '/Data14/'])

# Basic functions for generating shuffled training and validation data.
def split_sequence(sequence, n_steps_infunction):
    """Cut a sequence into sliding input windows and their next-step targets.

    Window i is sequence[i:i + n_steps_infunction] and its target is the
    element that immediately follows it, sequence[i + n_steps_infunction].

    Parameters
    ----------
    sequence : array-like
        Ordered observations; the first axis is the time axis.
    n_steps_infunction : int
        Number of consecutive observations in each input window.

    Returns
    -------
    X : np.ndarray, shape (n_samples, n_steps_infunction, ...)
    y : np.ndarray, shape (n_samples, ...)
        n_samples is max(len(sequence) - n_steps_infunction, 0).

    When the sequence is too short to yield a single window, X is returned
    with shape (0, n_steps_infunction, ...) rather than (0,), so the result
    can still be concatenated along axis 0 with other window matrices.
    """
    sequence = np.asarray(sequence)
    n_samples = max(len(sequence) - n_steps_infunction, 0)

    # Pre-allocating with the full target shape keeps the second dimension
    # even when there are no samples at all.
    X = np.empty((n_samples, n_steps_infunction) + sequence.shape[1:], dtype=sequence.dtype)
    for i in range(n_samples):
        X[i] = sequence[i:i + n_steps_infunction]

    # The target of window i sits at position i + n_steps_infunction.
    y = sequence[n_steps_infunction:n_steps_infunction + n_samples].copy()
    return X, y

def shuffle_xy(x1, y1):
    """Shuffle two equally long arrays with one shared, fixed permutation.

    NumPy's global random generator is re-seeded with 0 on every call, so the
    permutation depends only on the number of samples and each x1[i] stays
    paired with its y1[i].

    Returns the reordered (x1, y1) pair; the inputs are not modified.
    """
    np.random.seed(0)
    n_samples = len(x1)
    assert n_samples == len(y1)
    order = np.random.permutation(n_samples)
    shuffled_x = x1[order]
    shuffled_y = y1[order]
    return shuffled_x, shuffled_y

# Set up random seeds for data splitting (one independent split per seed).
split_rs = [290, 150, 266, 78, 148, 133, 155, 135, 178, 241]

# Set up directories to store result data, one per seed.
# exist_ok=True lets this cell be re-run without failing on directories
# that an earlier run already created.
for rs in split_rs:
    os.makedirs(current_dir+'/Results/ANN/'+str(rs)+'-ResultData/', exist_ok=True)

In [None]:
# Set up data source and hyperparameters.
path = current_dir+'/Data14/'
template_length = 14
# NOTE(review): this_m and this_w3 are not read anywhere in this cell; they are
# kept unchanged in case another cell relies on them.
this_m = 200
this_w3 = 1.1


def _segment_bounds(well_df):
    """Return the segment boundaries of one well table.

    The result holds the index label of every row whose 'Mark' column equals
    'reopening', followed by len(well_df); consecutive entries delimit one
    segment each. Rows before the first 'reopening' row belong to no segment.
    The labels are used as positions with .iloc by the callers, which matches
    the default integer index produced by pd.read_excel below.
    """
    starts = list(well_df[well_df['Mark'] == 'reopening'].index)
    return np.insert(starts, len(starts), len(well_df))


def _build_windows(file_names, data_dir, n_steps):
    """Build the input/target matrices for a list of data files.

    For every segment of every file the series Q/t is cut by split_sequence()
    into windows of n_steps values (inputs) and the value following each
    window (targets).

    Returns (x, y) with x of shape (n_samples, n_steps) and y of shape
    (n_samples,).
    """
    x_parts, y_parts = [], []
    for file_name in file_names:
        well_df = pd.read_excel(data_dir+file_name, header = 0, sheet_name = 0)
        bounds = _segment_bounds(well_df)
        rates = well_df['Q']/well_df['t']
        for start, stop in zip(bounds[:-1], bounds[1:]):
            segment = rates.iloc[start:stop].values
            # A segment needs more than n_steps values to yield a single
            # (window, target) pair; shorter ones contribute nothing.
            if len(segment) <= n_steps:
                continue
            x_new, y_new = split_sequence(segment, n_steps)
            x_parts.append(x_new)
            y_parts.append(y_new)

    if not x_parts:
        return np.empty([0, n_steps]), np.empty([0,])
    # One concatenation at the end instead of re-copying the growing matrix
    # after every segment.
    return np.concatenate(x_parts, axis=0), np.concatenate(y_parts, axis=0)


# The file listing and the pad table do not depend on the seed, so they are
# built once. A pad name is a file name without its last 7 characters.
files = sorted(os.listdir(path))
if not files:
    raise FileNotFoundError('No input files found in ' + path)
pads = [elt[:-7] for elt in files]
files_df = pd.DataFrame({'filename':files, 'pad':pads})

# Number of files per pad, attached to the sorted list of unique pads.
num_wells_in_pad_df = pd.DataFrame(files_df['pad'].value_counts()).reset_index()
num_wells_in_pad_df.columns = ['pad', 'count']
unique_pads = np.unique(pads)
unique_pads_df = pd.DataFrame({'pad':unique_pads})
unique_pads_df = pd.merge(unique_pads_df, num_wells_in_pad_df, on='pad')

for rs in split_rs:
    ## Splitting
    # Full set --> full training set and test set
    print('Calculating seed', rs, 'at', datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S'))

    # Shuffle whole pads (not single files) so that all files of a pad end up
    # on the same side of the split. The seed fixes the shuffle.
    np.random.seed(rs)
    unique_pads_df_shuffled = unique_pads_df.sample(frac=1).reset_index(drop=True)

    # Take shuffled pads until they hold at least 300 files in total; the pad
    # that crosses the threshold is included.
    counter = 0
    for idx, n_files in enumerate(unique_pads_df_shuffled['count']):
        counter += n_files
        if counter >= 300:
            break
    end_of_training = idx

    pad_order = list(unique_pads_df_shuffled['pad'])
    train_files_shuffled = [
        file
        for pad_name in pad_order[:end_of_training+1]
        for file in files
        if file[:-7] == pad_name
    ]
    test_files_shuffled = [
        file
        for pad_name in pad_order[end_of_training+1:]
        for file in files
        if file[:-7] == pad_name
    ]

    # Full training set --> training set and validation set
    # The cut is placed after the first file at position >= 239 that is the
    # last file of a pad holding at least two files (same pad as the previous
    # file, different pad from the next one), so no pad is split.
    end_of_validation = None
    for idx in range(239, len(train_files_shuffled)-1):
        same_pad_as_previous = train_files_shuffled[idx][:-7] == train_files_shuffled[idx-1][:-7]
        same_pad_as_next = train_files_shuffled[idx][:-7] == train_files_shuffled[idx+1][:-7]
        if same_pad_as_previous and not same_pad_as_next:
            end_of_validation = idx
            break
    if end_of_validation is None:
        # Without this check the cut position of a previous seed would be
        # reused silently (or a NameError raised on the first seed).
        raise ValueError(
            'No pad boundary found at or after position 239 among the '
            + str(len(train_files_shuffled)) + ' full-training files for seed ' + str(rs)
            + '; cannot split off a validation set.'
        )

    val_files_shuffled = train_files_shuffled[end_of_validation+1:]
    train_files_shuffled = train_files_shuffled[:end_of_validation+1]

    print('len(train_files_shuffled):', len(train_files_shuffled),'          len(val_files_shuffled):', len(val_files_shuffled), '          len(test_files_shuffled):', len(test_files_shuffled))
    print(test_files_shuffled)

    ## Calculate training data.
    print('Calculating training matrix', 'at', datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S'))
    x_train, y_train = _build_windows(train_files_shuffled, path, template_length)
    x_train, y_train = shuffle_xy(x_train, y_train)

    ## Calculate validation data.
    print('Calculating validation matrix', 'at', datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S'))
    x_val, y_val = _build_windows(val_files_shuffled, path, template_length)
    x_val, y_val = shuffle_xy(x_val, y_val)

    print('len(x_train):', x_train.shape, '          len(y_train):', y_train.shape, '          len(x_val):', x_val.shape, '          len(y_val):', y_val.shape)

    # The scaler is fitted on the training inputs only and then reused for the
    # validation inputs and for every forecasting window below.
    S = StandardScaler().fit(x_train)
    x_train = S.transform(x_train)
    x_val = S.transform(x_val)

    ## Model training.
    tf.keras.utils.set_random_seed(317)
    tf.config.experimental.enable_op_determinism()

    # The hyperparameter searching and training process is commented out for reproduction.
    '''def buildModel(hp):
        model_input = layers.Input(shape=(template_length, 1))
        lstm = layers.LSTM(hp.Choice('lstm_units', [75, 100, 125, 150, 175, 200]), input_shape=(template_length, 1))(model_input)
        dense = layers.Dense(hp.Choice('dense_units', [10, 20, 30, 40, 50]), activation='relu')(lstm)
        model_output = layers.Dense(1, activation='relu')(dense)

        model = tf.keras.models.Model(model_input, model_output)
        opt = tf.keras.optimizers.Adam(hp.Choice('lr', [0.01, 0.001, 0.0001]))
        model.compile(optimizer=opt, loss=tf.keras.losses.MeanSquaredError())
            
        return model
    
    tuner = kt.Hyperband(
        buildModel,
        objective='val_loss',
        max_epochs=1000,
        directory=current_dir+'/Results/ANN/tuner-'+str(rs)
        )
    
    tuner_stopping = tf.keras.callbacks.EarlyStopping(monitor='val_loss', patience=20)
    print('Searching best hyperparameters', 'at', datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S'))
    tuner.search(x_train, y_train, epochs=1000, batch_size=256, validation_data=(x_val, y_val), callbacks=[tuner_stopping], verbose=0)
    best_hps = tuner.get_best_hyperparameters()[0]
    print(best_hps.values)

    with open(current_dir+'/Results/ANN/best_hps.csv','a') as f:
        writer = csv.writer(f)
        writer.writerow([rs, best_hps.values['lstm_units'], best_hps.values['dense_units'], best_hps.values['lr']])
        f.close()

    print('Training', 'at', datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S'))
    stopping = tf.keras.callbacks.EarlyStopping(
        monitor = 'val_loss',
        patience = 20
    )

    checkpoint = tf.keras.callbacks.ModelCheckpoint(
        filepath = current_dir+'/Results/ANN/TrainedModels/ANN_model_' + str(rs) +'.h5',
        monitor = 'val_loss',
        save_best_only = True
    )
    
    lstm_model = tuner.hypermodel.build(best_hps)
    history = lstm_model.fit(x_train, y_train, epochs = 1000, batch_size = 256, validation_data = (x_val, y_val), callbacks = [checkpoint, stopping], verbose=0)
    
    trained_lstm_model = tf.keras.models.load_model(current_dir+'/Results/ANN/TrainedModels/ANN_model_' + str(rs) +'.h5')'''

    # If the previous hyperparameter searching and training process is restored, please comment out the following line:
    trained_lstm_model = tf.keras.models.load_model(current_dir+'/TrainedModels/ANN_model_' + str(rs) +'.h5')

    ## Forecasting
    print('Forecasting', 'at', datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S'))
    for m, test_file in enumerate(test_files_shuffled):

        df = pd.read_excel(path+test_file, header = 0, sheet_name = 0)
        df['q'] = df['Q']/df['t']

        print('=====Calculating well=====', m, test_file, 'at', datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S'))
        reopenings = _segment_bounds(df)

        forecasts_multisteps_this_well = []
        y_true_all_this_well = []
        prod_times_this_well = []
        markers_this_well = []

        for start, stop in zip(reopenings[:-1], reopenings[1:]):
            sub_df = df.iloc[start:stop]

            y_true_all = sub_df['q'].values
            prod_times = sub_df['t'].values

            # The first template_length true values seed the forecast and are
            # copied to the output as 'initial'. A segment shorter than that
            # is copied entirely as 'initial' and gets no forecast.
            window = y_true_all[:template_length]
            forecasts_multisteps = list(window)
            markers = ['initial'] * len(forecasts_multisteps)

            # Recursive multi-step forecast: each prediction is appended to
            # the window and the oldest value dropped, so later steps are fed
            # earlier predictions instead of true values.
            for _ in range(len(sub_df)-template_length):
                markers.append('forecast')
                history_data = window.reshape(1, template_length)
                forecast = trained_lstm_model.predict(S.transform(history_data), verbose=0)[0][0]
                forecasts_multisteps.append(forecast)
                window = np.append(window[1:], forecast)

            # At this point every per-segment sequence has len(sub_df) entries.
            y_true_all_this_well.extend(y_true_all)
            forecasts_multisteps_this_well.extend(forecasts_multisteps)
            prod_times_this_well.extend(prod_times)
            markers_this_well.extend(markers)

        # Results
        multi_step_result_df = pd.DataFrame()
        multi_step_result_df['True'] = y_true_all_this_well
        multi_step_result_df['Pred'] = forecasts_multisteps_this_well
        multi_step_result_df['t'] = prod_times_this_well
        multi_step_result_df['Mark'] = markers_this_well
        # Cumulative columns: running sum of (value * t) over the whole file.
        multi_step_result_df['TrueCumu'] = (multi_step_result_df['True']*multi_step_result_df['t']).cumsum()
        multi_step_result_df['PredCumu'] = (multi_step_result_df['Pred']*multi_step_result_df['t']).cumsum()

        # The context manager closes the workbook even if writing fails.
        with pd.ExcelWriter(current_dir+'/Results/ANN/'+str(rs)+'-ResultData/ResultData-'+str(m)+'-'+test_file) as writer:
            multi_step_result_df.to_excel(writer, float_format='%.5f', header=True, index=False)