In [None]:
import os
import gc
from datetime import datetime
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from tqdm import tqdm
from tensorflow import config
from tensorflow import keras
from tensorflow.keras import Model, optimizers, Sequential
from tensorflow.keras.layers import Dense, Concatenate
from tensorflow.keras.layers import LSTM
from tensorflow.keras.callbacks import EarlyStopping
from sklearn.preprocessing import MinMaxScaler
import pyarrow.ipc as ipc

In [None]:
# Ask TensorFlow to allocate GPU memory on demand instead of reserving it all up front.
# Iterating over the device list (instead of indexing [0]) makes this a no-op on a
# CPU-only machine rather than an IndexError, and applies the same setting to every
# visible GPU.
physicalDevices = config.experimental.list_physical_devices('GPU')
for gpu_device in physicalDevices:
    config.experimental.set_memory_growth(gpu_device, True)

In [None]:
def read_feather_in_chunks(filepath, batches_per_chunk=2):
    """Stream an Arrow IPC (Feather) file as pandas DataFrames.

    Record batches are read one at a time and glued together in groups of
    ``batches_per_chunk`` so that the whole file never has to be in memory.

    Parameters
    ----------
    filepath : str or path-like
        Location of the file, passed straight to ``ipc.RecordBatchFileReader``.
    batches_per_chunk : int, default 2
        Number of consecutive record batches concatenated into each yielded
        DataFrame. The default reproduces the original pair-wise grouping.

    Yields
    ------
    pandas.DataFrame
        The concatenated record batches with a fresh 0..n-1 index. The final
        chunk may contain fewer than ``batches_per_chunk`` batches; it is still
        yielded so that no rows are silently dropped (previously a trailing
        odd batch, or a file with a single batch, produced nothing).

    Raises
    ------
    ValueError
        If ``batches_per_chunk`` is smaller than 1.
    """
    if batches_per_chunk < 1:
        raise ValueError("batches_per_chunk must be a positive integer")

    with ipc.RecordBatchFileReader(filepath) as reader:
        print(reader.num_record_batches)
        pending = []
        for batch_index in range(reader.num_record_batches):
            pending.append(
                reader.get_batch(batch_index).to_pandas(use_threads=True, timestamp_as_object=True)
            )
            if len(pending) == batches_per_chunk:
                yield pd.concat(pending, ignore_index=True)
                pending = []

        # Flush whatever is left when the batch count is not a multiple of the chunk size.
        if pending:
            yield pd.concat(pending, ignore_index=True)

In [None]:
# Columns cast to int before scaling.
INT_COLUMNS = ['gravel', 'clay', 'silt', 'sand', 'awc', 'cec_soil', 'texture_class',
               'CO2', 'plant-day', 'maturity-day']
# Helper/index columns that are not model inputs.
DROPPED_COLUMNS = ['index', 'time', 'lat', 'lon', 'index_x', 'index_y', 'spatial_ref', 'W', 'T']
STATIC_COLUMNS = ['plant-day', 'maturity-day', 'CO2', 'N', 'A', 'texture_class', 'soil_ph',
                  'soil_caco3', 'cec_soil', 'oc', 'awc', 'sand', 'silt', 'clay', 'gravel']
LABEL_COLUMNS = ['yield_mai']
WEATHER_COLUMNS = ['tasmax', 'tasmin', 'pr', 'gdd']


def _build_model(n_dynamic_features, n_static_features):
    """Build and compile the two-branch network (LSTM stack + dense stack, then merged)."""
    # The weather features are fed to the LSTM as a sequence of length
    # n_dynamic_features with a single channel.
    dynamic_input = keras.Input(shape=(n_dynamic_features, 1), dtype='float32')
    inner_lstm1 = LSTM(200, return_sequences=True)(dynamic_input)
    inner_lstm2 = LSTM(200, return_sequences=True)(inner_lstm1)
    lstm_out = LSTM(200, return_sequences=False)(inner_lstm2)

    # `(n,)` is a real one-element tuple; the original `(n)` was just an int.
    static_input = keras.Input(shape=(n_static_features,))
    inner_stat1 = Dense(200, activation='selu')(static_input)
    inner_stat1 = Dense(200, activation='selu')(inner_stat1)
    inner_stat2 = Dense(200, activation='selu')(inner_stat1)

    x = Concatenate()([lstm_out, inner_stat2])
    x = Dense(200, activation='selu')(x)
    x = Dense(200, activation='selu')(x)
    x = Dense(200, activation='selu')(x)
    dynamic_output = Dense(1, activation='selu')(x)

    network = Model(inputs=[dynamic_input, static_input], outputs=[dynamic_output])
    network.compile(loss=keras.metrics.mean_squared_error,
                    optimizer=optimizers.Adam(learning_rate=1e-5),
                    metrics=[keras.metrics.RootMeanSquaredError(name='rmse'), 'mae'])
    return network


flag = 1  # 1 until the first chunk has been trained; incremented after every chunk.
path = "./Inputs/final_inputs/"
# One TensorBoard log directory for the whole run (defined up front so that every
# chunk, not only the first, can rely on it).
logs = "logs/" + datetime.now().strftime("%Y%m%d-%H%M%S")
# Directory of the most recently saved model. Updated after every save so that
# training can continue across chunks and across input files regardless of how
# many chunks a file contains (the original only remembered chunk number 12).
prev_model = None

# Sorted so that the order in which files are trained on is reproducible;
# os.listdir returns entries in arbitrary order.
for input_file in sorted(os.listdir(path)):
    batch_num = 1
    # Strips the last 8 characters of the file name - presumably the ".feather"
    # extension; confirm if other file types can appear in this directory.
    run_name = input_file[:-8]
    os.makedirs(os.path.join("./graphs", run_name), exist_ok=True)
    for batch in read_feather_in_chunks(os.path.join(path, input_file)):
        model_dir = os.path.join("./saved_model", run_name, str(batch_num) + '_batch')
        os.makedirs(model_dir, exist_ok=True)

        print("[INFO]. Pre-Processing Batch-{} Inputs.".format(batch_num))
        # Vectorised form of the original per-row loop; columns are still addressed
        # by position. Per the original comment column 10 is W and column 27 is pr;
        # column 11 is presumably T and 25/26 presumably tasmax/tasmin - TODO confirm.
        offset = batch.iloc[:, 11]
        batch.iloc[:, 25] = batch.iloc[:, 25] + offset
        batch.iloc[:, 26] = batch.iloc[:, 26] + offset
        w_percent = batch.iloc[:, 10]
        precipitation = batch.iloc[:, 27]
        # (1 + W/100) * pr, skipped where W is +inf.
        batch.iloc[:, 27] = np.where(w_percent != np.inf,
                                     (1 + w_percent / 100) * precipitation,
                                     precipitation)

        batch = batch.drop(columns=DROPPED_COLUMNS)
        batch = batch.astype({column: int for column in INT_COLUMNS})

        static_data_input = batch[STATIC_COLUMNS]
        static_data_label = batch[LABEL_COLUMNS]
        weather_array_1 = batch[WEATHER_COLUMNS]
        # Drop the reference first, then collect, so the memory can actually be reclaimed.
        del batch
        gc.collect()

        # Scaling static and dynamic data. Separate scalers give the same numbers as
        # re-fitting a single one, but keep the label scaler available for
        # inverse-transforming predictions.
        # NOTE(review): the scalers are fit on the full chunk (train + test rows) and
        # refit for every chunk, so the scaling differs between chunks.
        static_scaler = MinMaxScaler(feature_range=(0.01, 1))
        label_scaler = MinMaxScaler(feature_range=(0.01, 1))
        dynamic_scaler = MinMaxScaler(feature_range=(0.01, 1))
        scaled_static_data = static_scaler.fit_transform(static_data_input)
        scaled_static_label = label_scaler.fit_transform(static_data_label)
        scaled_dynamic_data = dynamic_scaler.fit_transform(weather_array_1)
        # Kept for any later cell that still refers to `scaler`; as before, it is the
        # scaler that was last fit on the weather columns.
        scaler = dynamic_scaler
        del static_data_input, static_data_label, weather_array_1
        gc.collect()

        # Splitting the batch in training and testing set (ordered split, no shuffling).
        test_size = 0.2
        fract = 1 - test_size
        # All three arrays come from the same frame, so one split index serves them all.
        split_at = int(len(scaled_static_data) * fract)

        static_X_train = scaled_static_data[:split_at]
        static_X_test = scaled_static_data[split_at:]

        static_Y_train = scaled_static_label[:split_at]
        static_Y_test = scaled_static_label[split_at:]

        dynamic_X_train = scaled_dynamic_data[:split_at]
        dynamic_X_test = scaled_dynamic_data[split_at:]

        del scaled_static_data, scaled_static_label, scaled_dynamic_data
        gc.collect()

        # Defining the neural network on the very first chunk; afterwards continue
        # from the most recently saved model.
        if flag == 1:
            model = _build_model(dynamic_X_train.shape[1], static_X_train.shape[1])
        else:
            if batch_num == 1:
                print("[INFO]. Input File has been completed. Moving onto the new input file.")
            model = keras.models.load_model(prev_model)

        es = EarlyStopping(monitor = 'val_loss', mode = 'min', verbose = 1, patience = 3)
        tboard_callback = keras.callbacks.TensorBoard(log_dir = logs, histogram_freq = 1, profile_batch = '500,520')

        # Training the model on the dataset.
        history = model.fit(x = [dynamic_X_train, static_X_train], y = static_Y_train, validation_split = 0.2, epochs = 20, callbacks = [tboard_callback, es], batch_size = 64)

        plt.plot(history.history['loss'])
        plt.plot(history.history['val_loss'])
        plt.title('model loss')
        plt.ylabel('loss')
        plt.xlabel('epoch')
        # The second curve is val_loss from validation_split, not the held-out test set.
        plt.legend(['train', 'validation'], loc='upper left')
        plt.savefig(os.path.join("./graphs", run_name, "Batch-{}_loss_stats.jpg".format(batch_num)))
        plt.clf()

        # Saving the model after each chunk and remembering where it went.
        model.save(model_dir)
        prev_model = model_dir

        gc.collect()
        flag += 1
        batch_num += 1

In [None]:
# Report the shapes of the train/test arrays left behind by the last processed chunk.
# Each entry is (caption, array); captions starting with "\n" open a new group.
shape_report = (
    ("static_X_train Shape : ", static_X_train),
    ("y_train", static_Y_train),
    ("\nstatic_X_test Shape  : ", static_X_test),
    ("y_test", static_Y_test),
    ("\ndynamic_X_train Shape : ", dynamic_X_train),
    ("dynamic_X_test Shape  : ", dynamic_X_test),
)
for caption, array in shape_report:
    print(caption, array.shape)