# **SET UP**

In [None]:
import numpy as np
import pandas as pd
from keras.models import Sequential
from keras.layers import LSTM, Dense, Dropout, Bidirectional
from keras.optimizers import Adam
from keras.callbacks import EarlyStopping, ReduceLROnPlateau
from sklearn.preprocessing import MinMaxScaler
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from matplotlib import pyplot as plt
from matplotlib.dates import date2num
import os

In [None]:
!pip install scienceplots

In [None]:
# scienceplots is imported only for its import-time effect; nothing below
# references the module object. NOTE(review): presumably this is what makes
# the 'science' / 'no-latex' style names resolvable -- confirm against the
# scienceplots documentation.
import scienceplots

PLOT_STYLES = ['science', 'no-latex']
plt.style.use(PLOT_STYLES)

# Larger default font for every figure drawn afterwards.
plt.rcParams['font.size'] = 16

# **DataAnalyzer class**

In [None]:
class DataAnalyzer:
    """Load well-monitoring data and forecast an analyte's log-concentration per well.

    The table must contain ``COLLECTION_DATE`` and ``RESULT`` columns (these
    two names are hard-coded); the names of the well and analyte columns are
    supplied by the caller of each method.
    """

    def __init__(self, data_or_url):
        """Store the data set, loading it first when a path/URL string is given.

        Args:
            data_or_url: Path or URL of a CSV/Excel file, or an already loaded
                ``pandas.DataFrame`` (stored as-is, not copied).

        Raises:
            ValueError: If the argument is neither a ``str`` nor a DataFrame,
                or if the file extension is not supported by ``load_data``.
        """
        if isinstance(data_or_url, str):
            self.data_url = data_or_url
            self.data = self.load_data()
        elif isinstance(data_or_url, pd.DataFrame):
            # Keep the attribute defined even when no file was involved.
            self.data_url = None
            self.data = data_or_url
        else:
            raise ValueError("Input must be a file path or a pandas DataFrame")

    def load_data(self):
        """Read ``self.data_url`` with the pandas reader matching its extension.

        Returns:
            The loaded ``pandas.DataFrame``.

        Raises:
            ValueError: If the extension is not csv/xlsx/xlsm/xltx/xltm.
        """
        # Lower-cased so that e.g. ``DATA.CSV`` is recognised as well.
        file_extension = self.data_url.rsplit('.', 1)[-1].lower()
        if file_extension == 'csv':
            return pd.read_csv(self.data_url)
        if file_extension in ('xlsx', 'xlsm', 'xltx', 'xltm'):
            return pd.read_excel(self.data_url)
        raise ValueError("Unsupported file format")

    def preprocess_data(self, well_name_column, analyte_name_column, well_name, analyte_name):
        """Return the cleaned, chronologically ordered series for one well/analyte.

        Rows with a missing or unparseable ``COLLECTION_DATE``, a missing or
        non-numeric ``RESULT``, or a non-positive ``RESULT`` are dropped. A
        ``RESULT_LOG`` column holding ``log10(RESULT)`` is added.

        Args:
            well_name_column: Name of the column identifying the well.
            analyte_name_column: Name of the column identifying the analyte.
            well_name: Well to keep.
            analyte_name: Analyte to keep.

        Returns:
            A new DataFrame (the stored data is not modified) sorted by
            ``COLLECTION_DATE``; original index labels are preserved.
        """
        selection = (self.data[well_name_column] == well_name) & (self.data[analyte_name_column] == analyte_name)
        filtered_data = self.data[selection].copy()
        filtered_data['COLLECTION_DATE'] = pd.to_datetime(filtered_data['COLLECTION_DATE'], errors='coerce')
        # Coerce so that text entries become NaN instead of breaking the ``> 0`` test.
        filtered_data['RESULT'] = pd.to_numeric(filtered_data['RESULT'], errors='coerce')
        filtered_data = filtered_data.dropna(subset=['COLLECTION_DATE', 'RESULT'])
        # log10 is only defined for strictly positive concentrations.
        filtered_data = filtered_data[filtered_data['RESULT'] > 0]
        # Differencing and sequence building downstream assume time order;
        # mergesort is stable, so same-day samples keep their file order.
        filtered_data = filtered_data.sort_values(by='COLLECTION_DATE', kind='mergesort')
        filtered_data['RESULT_LOG'] = np.log10(filtered_data['RESULT'])
        return filtered_data

    def create_sequences(self, data, seq_length):
        """Build sliding windows and their next-step targets from a 1-D series.

        Args:
            data: Sequence supporting slicing and ``len`` (e.g. a NumPy array).
            seq_length: Number of consecutive values in each input window.

        Returns:
            ``(xs, ys)`` as NumPy arrays: ``xs[i]`` is ``data[i:i+seq_length]``
            and ``ys[i]`` is ``data[i+seq_length]``. Both are empty when
            ``len(data) <= seq_length``.
        """
        n_windows = len(data) - seq_length
        xs = [data[start:start + seq_length] for start in range(n_windows)]
        ys = [data[start + seq_length] for start in range(n_windows)]
        return np.array(xs), np.array(ys)

    @staticmethod
    def _build_model(seq_length):
        """Build and compile the two-layer bidirectional LSTM fitted for each well."""
        model = Sequential()
        model.add(Bidirectional(LSTM(64, return_sequences=True), input_shape=(seq_length, 1)))
        model.add(Dropout(0.2))
        model.add(Bidirectional(LSTM(64)))
        model.add(Dropout(0.2))
        model.add(Dense(1))
        model.compile(optimizer=Adam(learning_rate=0.001), loss='mse')
        return model

    def lstm_slope_forecast(self, well_name_column, analyte_name_column, analyte_name, seq_length=10, n_bootstrap=100, test_years=4):
        """Fit a bidirectional LSTM per well and plot its forecast of the hold-out period.

        For every well that reports ``analyte_name`` the method:

        1. holds out the last ``test_years`` years (relative to the well's
           latest sample) as the test set and trains on the rest;
        2. min-max scales ``RESULT_LOG`` (fitted on the training part only)
           and trains the network on windows of first differences;
        3. forecasts recursively, one step per test sample, starting from the
           last training value (the time gap between samples is not an input);
        4. builds a 95% band by resampling training residuals around the
           point forecast, and fits a linear regression on the dates as a
           baseline;
        5. saves the figure to ``plots/`` and prints the training MSE and R^2.

        Wells with no usable data, an empty train or test part, or too few
        training samples to form one window are skipped with a message.

        Args:
            well_name_column: Name of the column identifying the well.
            analyte_name_column: Name of the column identifying the analyte.
            analyte_name: Analyte to forecast.
            seq_length: Number of past deltas fed to the network.
            n_bootstrap: Number of bootstrap replicates for the band.
            test_years: Length of the hold-out period in whole years.

        Returns:
            DataFrame with one row per processed well: ``well_name``,
            ``analyte_name``, ``model_name`` and ``plot_path``.
        """
        well_names = self.data[self.data[analyte_name_column] == analyte_name][well_name_column].unique()
        results = []

        all_mse = []
        all_r2 = []

        for well_name in well_names:
            filtered_data = self.preprocess_data(well_name_column, analyte_name_column, well_name, analyte_name)
            print(f'Data size after preprocessing for {well_name} and {analyte_name}: {filtered_data.shape}')

            if filtered_data.empty:
                print(f'No data for well: {well_name} and analyte: {analyte_name}')
                continue

            cutoff_date = filtered_data['COLLECTION_DATE'].max() - pd.DateOffset(years=test_years)
            train_data = filtered_data[filtered_data['COLLECTION_DATE'] <= cutoff_date].copy()
            test_data = filtered_data[filtered_data['COLLECTION_DATE'] > cutoff_date].copy()

            if train_data.empty or test_data.empty:
                print(f'Not enough data to split into training and testing sets for well: {well_name} and analyte: {analyte_name}')
                continue

            # The scaler is fitted on the training part only, so the test
            # values do not leak into the scaling.
            scaler = MinMaxScaler(feature_range=(0, 1))
            train_data['RESULT_LOG'] = scaler.fit_transform(train_data['RESULT_LOG'].values.reshape(-1, 1)).ravel()
            test_data['RESULT_LOG'] = scaler.transform(test_data['RESULT_LOG'].values.reshape(-1, 1)).ravel()

            # The network learns sample-to-sample changes; the first row has
            # no predecessor and gets a delta of 0.
            train_data['DELTA_RESULT_LOG'] = train_data['RESULT_LOG'].diff().fillna(0)

            X_train, y_train = self.create_sequences(train_data['DELTA_RESULT_LOG'].values, seq_length)
            if len(X_train) == 0:
                print(f'Not enough training data to create sequences for well: {well_name} and analyte: {analyte_name}')
                continue

            X_train = np.reshape(X_train, (X_train.shape[0], X_train.shape[1], 1))

            model = self._build_model(seq_length)

            # Without a validation split there is no ``val_loss``; fall back
            # to the training loss so both callbacks still have a metric.
            use_validation = len(X_train) > 5
            monitored_metric = 'val_loss' if use_validation else 'loss'
            early_stopping = EarlyStopping(monitor=monitored_metric, patience=10, restore_best_weights=True)
            reduce_lr = ReduceLROnPlateau(monitor=monitored_metric, factor=0.2, patience=5, min_lr=0.0001)
            validation_split = 0.2 if use_validation else 0

            model.fit(X_train, y_train, epochs=200, batch_size=32, verbose=2, validation_split=validation_split, callbacks=[early_stopping, reduce_lr])

            # Baseline: straight line through the (unscaled) training log-values.
            lin_reg_model = LinearRegression()
            x_lin_reg = np.array([date2num(d) for d in train_data['COLLECTION_DATE']]).reshape(-1, 1)
            y_lin_reg = scaler.inverse_transform(train_data['RESULT_LOG'].values.reshape(-1, 1)).flatten()
            lin_reg_model.fit(x_lin_reg, y_lin_reg)

            # Recursive forecast: each predicted delta is accumulated onto the
            # running level and fed back as the newest element of the window.
            predictions = []
            input_sequence = train_data['DELTA_RESULT_LOG'].values[-seq_length:]
            last_value = train_data['RESULT_LOG'].values[-1]
            for _ in range(len(test_data)):
                input_sequence_reshaped = input_sequence.reshape((1, seq_length, 1))
                delta_prediction = model.predict(input_sequence_reshaped, verbose=0)[0, 0]
                last_value += delta_prediction
                predictions.append(last_value)
                input_sequence = np.append(input_sequence[1:], delta_prediction)

            predictions = scaler.inverse_transform(np.array(predictions).reshape(-1, 1))

            # Residuals are in scaled units while ``predictions`` is already
            # back in log10 units; dividing by the scaler's factor converts a
            # scaled difference into a log10 difference.
            train_fit = model.predict(X_train, verbose=0).flatten()
            residuals = (y_train - train_fit) / scaler.scale_[0]

            boot_residuals = np.random.choice(residuals, size=(n_bootstrap, len(predictions)), replace=True)
            boot_predictions = predictions.flatten() + boot_residuals

            lower_bound = np.percentile(boot_predictions, 2.5, axis=0)
            upper_bound = np.percentile(boot_predictions, 97.5, axis=0)

            x_forecast = np.array([date2num(d) for d in test_data['COLLECTION_DATE']]).reshape(-1, 1)
            lin_reg_predictions = lin_reg_model.predict(x_forecast)

            figure = plt.figure(figsize=(10, 6))
            historical_data_unscaled_train = scaler.inverse_transform(train_data['RESULT_LOG'].values.reshape(-1, 1)).flatten()
            historical_data_unscaled_test = scaler.inverse_transform(test_data['RESULT_LOG'].values.reshape(-1, 1)).flatten()
            plt.plot(train_data['COLLECTION_DATE'], historical_data_unscaled_train, label='Training Data', color='blue')
            plt.scatter(train_data['COLLECTION_DATE'], historical_data_unscaled_train, color='blue')
            plt.plot(test_data['COLLECTION_DATE'], historical_data_unscaled_test, label='Testing Data', color='purple')
            plt.scatter(test_data['COLLECTION_DATE'], historical_data_unscaled_test, color='purple')
            plt.plot(test_data['COLLECTION_DATE'], predictions, label='Bidirectional LSTM Forecast', linestyle='--', color='red')
            plt.fill_between(test_data['COLLECTION_DATE'], lower_bound, upper_bound, color='orange', alpha=0.3, label='95% Confidence Interval')
            plt.plot(test_data['COLLECTION_DATE'], lin_reg_predictions, label='Linear Regression Forecast', linestyle='--', color='orange')
            plt.xlabel('Date')
            plt.ylabel('Log-Concentration')
            plt.title(f'{well_name} - {analyte_name} Forecast')
            plt.legend()

            save_dir = 'plots'
            os.makedirs(save_dir, exist_ok=True)
            plot_path = os.path.join(save_dir, f'forecast_{well_name}_{analyte_name}_Bidirectional_LSTM.png')
            plt.savefig(plot_path)
            plt.show()
            # One figure per well: release it so figures do not pile up in memory.
            plt.close(figure)

            print(f'Saved plot for well: {well_name} at {plot_path}')
            results.append({
                'well_name': well_name,
                'analyte_name': analyte_name,
                'model_name': 'Bidirectional LSTM',
                'plot_path': plot_path
            })

            # These are in-sample scores (training windows, scaled delta
            # units), not a measure of forecast accuracy on the test period.
            mse = mean_squared_error(y_train, train_fit)
            r2 = r2_score(y_train, train_fit)
            all_mse.append(mse)
            all_r2.append(r2)

            print(f'MSE for well {well_name} with Bidirectional LSTM: {mse}')
            print(f'R^2 for well {well_name} with Bidirectional LSTM: {r2}')

        # NOTE(review): besides NaN, wells with R^2 <= 0 (and MSE == 0) are
        # left out of the averages, which biases the mean R^2 upwards --
        # confirm this is intended.
        filtered_mse = [mse for mse in all_mse if not np.isnan(mse) and mse > 0]
        filtered_r2 = [r2 for r2 in all_r2 if not np.isnan(r2) and r2 > 0]

        avg_mse = np.mean(filtered_mse) if filtered_mse else float('nan')
        avg_r2 = np.mean(filtered_r2) if filtered_r2 else float('nan')

        print(f'Average MSE: {avg_mse}')
        print(f'Average R^2: {avg_r2}')

        return pd.DataFrame(results)

In [None]:
# Build the analyzer from a CSV file fetched over HTTPS; the string argument
# makes the constructor load the table via DataAnalyzer.load_data().
data_analyzer = DataAnalyzer(
    data_or_url=(
        'https://raw.githubusercontent.com/ALTEMIS-DOE/pylenm/master/'
        'notebooks/data/FASB_Data_thru_3Q2015_Reduced_Demo.csv'
    )
)

# **LSTM model**

In [None]:
# Run the per-well forecast for STRONTIUM-90 (wells keyed by STATION_ID) with
# the default window length and bootstrap count, then print the returned
# table of saved plots.
forecast_results = data_analyzer.lstm_slope_forecast('STATION_ID', 'ANALYTE_NAME', 'STRONTIUM-90')
print(forecast_results)

In [None]:
# Same forecast as above in terms of settings, this time for IODINE-129;
# prints the returned table of saved plots.
forecast_results_2 = data_analyzer.lstm_slope_forecast('STATION_ID', 'ANALYTE_NAME', 'IODINE-129')
print(forecast_results_2)