In [1]:
import pandas as pd
from datetime import datetime
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
import os
import pickle

## General variables

In [2]:
# Folder containing the raw training data that gets versioned
data_folder = '../../../Data/Training'
# Destination folder for the versioned (pickled) datasets
pickle_data_path = f'{data_folder}/wavelet_pickle_data'

# Pit identifiers 1-4 (not referenced by the cells visible in this notebook)
pits = [1, 2, 3, 4]
# Redox sensor identifiers 1-5; used to build the per-sensor column names
redox_sensors = [1, 2, 3, 4, 5]
# First instant of 2023: rows before it are treated as 2022 data, rows from it onward as 2023 data
dt_2023 = datetime(year=2023, month=1, day=1)

## Support functions

In [3]:
def create_folder(path: str):
    """Create the directory `path` (and any missing parents) if it does not exist yet.

    Calling it again for an existing directory is a no-op.

    Args:
        path: Directory to create.

    Raises:
        FileExistsError: If `path` already exists but is not a directory.
    """
    # exist_ok=True replaces the separate "exists?" check, which was racy:
    # another process could create the folder between the check and makedirs.
    os.makedirs(path, exist_ok=True)

## Create directories

In [4]:
# Root folder for all versioned data
create_folder(pickle_data_path)

for sub_folder in ('2022', '2022_sensors', '2023', '2023_sensors'):
    # Every dataset gets a main folder and a 'Scaled' folder underneath it
    main_path = f'{pickle_data_path}/{sub_folder}'
    scaled_path = f'{main_path}/Scaled'
    folders_to_create = [main_path, scaled_path]

    # Only the 2022 datasets additionally get 'Prediction' folders
    if '2022' in sub_folder:
        folders_to_create.append(f'{main_path}/Prediction')
        folders_to_create.append(f'{scaled_path}/Prediction')

    for folder in folders_to_create:
        create_folder(folder)

# Generate data

Training data excludes rows with redox values above 900 (only values of 900 or below are kept). We chose this approach so that the ML model focuses only on finding redox errors (fluctuations within the normal value range) and not on out-of-range readings.

## Training data support functions

In [5]:
def get_y(data: pd.DataFrame):
    """Return target value for all sensor training"""
    return data[data['TIMESTAMP'] < dt_2023].loc[:,[f'Redox_error_flag']]

def get_X(data: pd.DataFrame, get_2023: bool = False):
    """Return training features for all sensor training"""
    cols_to_remove = ['log_redox(1)', 'log_redox(2)', 'log_redox(3)', 'log_redox(4)', 'log_redox(5)',
                      'Redox_error_flag(1)', 'Redox_error_flag(2)', 'Redox_error_flag(3)', 'Redox_error_flag(4)',
                      'Redox_error_flag(5)', 'Redox_error_flag_available', 'Redox_error_flag', 'TIMESTAMP_DIFF']

    if get_2023:
        return data.loc[data['TIMESTAMP'] >= dt_2023].loc[:,~data.columns.isin(cols_to_remove)]
    return data.loc[data['TIMESTAMP'] < dt_2023].loc[:,~data.columns.isin(cols_to_remove)]

def get_y_sensor(data: pd.DataFrame, redox_sensor: int):
    """Return the error-flag target of a single redox sensor.

    Only rows whose 'TIMESTAMP' lies before `dt_2023` are returned, as a
    one-column DataFrame holding 'Redox_error_flag(<redox_sensor>)'.
    """
    target_column = f'Redox_error_flag({redox_sensor})'
    before_2023 = data['TIMESTAMP'] < dt_2023
    return data.loc[before_2023, [target_column]]

def get_X_sensor(data: pd.DataFrame, redox_sensor: int, get_2023: bool = False):
    """Return the feature columns for a single redox sensor.

    The result holds the sensor's own measurements, the measurements shared
    by all sensors, the sensor's sigma and wavelet-period features, and the
    'TIMESTAMP' and 'pit_number' columns. Rows are restricted to timestamps
    before `dt_2023` by default, or at/after `dt_2023` when `get_2023` is True.
    """
    sensor_measurements = [f'Redox_Avg({redox_sensor})', f'Temp_T12_Avg({redox_sensor})',
                           f'EC_Avg({redox_sensor})', f'Matric_potential_Avg({redox_sensor})']
    shared_measurements = ['Water_level_Avg', 'Temp_ottpls_Avg', 'BatterymV_Min', f'WC{redox_sensor}']
    # Order: sigma_b_24, sigma_f_24, sigma_b_12, sigma_f_12
    sigma_features = [f'Redox_Avg({redox_sensor})_sigma_{direction}_{window}'
                      for window in (24, 12) for direction in ('b', 'f')]
    wave_features = [f'Wave_period_{period}({redox_sensor})'
                     for period in ('0.5', '0.7', '0.9', '1.1', '1.5', '1.9', '2.5', '3.3', '4.4')]
    cols = sensor_measurements + shared_measurements + sigma_features + wave_features + ['TIMESTAMP', 'pit_number']

    timestamps = data['TIMESTAMP']
    # Two explicit comparisons (no negation), so rows with a missing timestamp fall in neither set
    row_mask = (timestamps >= dt_2023) if get_2023 else (timestamps < dt_2023)
    return data.loc[row_mask, cols]

def get_data():
    """Load the full raw training dataset (including the custom added features).

    Reads 'Raw_training_data_full_wavelet.csv' from `data_folder` and parses
    the 'TIMESTAMP' column as datetimes.
    """
    csv_path = f'{data_folder}/Raw_training_data_full_wavelet.csv'
    return pd.read_csv(csv_path, parse_dates=['TIMESTAMP'])

def get_training_data(X_train: pd.DataFrame, X_test: pd.DataFrame, y_train: pd.DataFrame, y_test: pd.DataFrame, X_train_scaled: pd.DataFrame, X_test_scaled: pd.DataFrame, max_redox: float = 900):
    """Return the training/testing data without rows that hold too-high redox values.

    A row is kept only when all of 'Redox_Avg(1)' .. 'Redox_Avg(5)' are
    <= `max_redox` (a value equal to the limit is kept; a missing value drops
    the row). This makes sure that we only target redox errors and not also
    too high (faulty) values. The helper columns 'TIMESTAMP' and 'pit_number'
    are removed from all four feature frames.

    The input frames are NOT modified; new frames are returned.

    Args:
        X_train, X_test: Unscaled feature frames; they decide which rows are kept.
        y_train, y_test: Targets, indexed like the matching feature frame.
        X_train_scaled, X_test_scaled: Scaled feature frames, indexed like the
            matching unscaled frame.
        max_redox: Highest redox value that is still kept (default 900).

    Returns:
        Tuple (X_train, X_test, y_train, y_test, X_train_scaled, X_test_scaled)
        restricted to the kept rows.

    Raises:
        KeyError: If a feature frame lacks 'TIMESTAMP', 'pit_number' or one of
            the 'Redox_Avg(n)' columns.
    """
    helper_columns = ['TIMESTAMP', 'pit_number']
    # drop() without inplace returns new frames, so the caller's frames keep
    # their columns and the function can safely be called more than once.
    X_train = X_train.drop(columns=helper_columns)
    X_test = X_test.drop(columns=helper_columns)
    X_train_scaled = X_train_scaled.drop(columns=helper_columns)
    X_test_scaled = X_test_scaled.drop(columns=helper_columns)

    redox_columns = [f'Redox_Avg({sensor})' for sensor in range(1, 6)]
    # The unscaled frames decide which index labels survive; the same labels
    # are then selected from the targets and the scaled frames.
    train_ids = X_train.index[(X_train[redox_columns] <= max_redox).all(axis=1)]
    test_ids = X_test.index[(X_test[redox_columns] <= max_redox).all(axis=1)]
    return (X_train.loc[train_ids], X_test.loc[test_ids], y_train.loc[train_ids], y_test.loc[test_ids], X_train_scaled.loc[train_ids], X_test_scaled.loc[test_ids])

def get_training_data_sensor(X_train: pd.DataFrame, X_test: pd.DataFrame, y_train: pd.DataFrame, y_test: pd.DataFrame, X_train_scaled: pd.DataFrame, X_test_scaled: pd.DataFrame, redox_sensor: int, max_redox: float = 900):
    """Return one sensor's training/testing data without rows that hold too-high redox values.

    A row is kept only when 'Redox_Avg(<redox_sensor>)' is <= `max_redox`
    (a value equal to the limit is kept; a missing value drops the row). This
    makes sure that we only target redox errors and not also too high (faulty)
    values. The helper columns 'TIMESTAMP' and 'pit_number' are removed from
    all four feature frames.

    The input frames are NOT modified; new frames are returned.

    Args:
        X_train, X_test: Unscaled feature frames; they decide which rows are kept.
        y_train, y_test: Targets, indexed like the matching feature frame.
        X_train_scaled, X_test_scaled: Scaled feature frames, indexed like the
            matching unscaled frame.
        redox_sensor: Number of the sensor whose redox column is checked.
        max_redox: Highest redox value that is still kept (default 900).

    Returns:
        Tuple (X_train, X_test, y_train, y_test, X_train_scaled, X_test_scaled)
        restricted to the kept rows.

    Raises:
        KeyError: If a feature frame lacks 'TIMESTAMP', 'pit_number' or the
            sensor's 'Redox_Avg(n)' column.
    """
    helper_columns = ['TIMESTAMP', 'pit_number']
    # drop() without inplace returns new frames, so the caller's frames keep
    # their columns and the function can safely be called more than once.
    X_train = X_train.drop(columns=helper_columns)
    X_test = X_test.drop(columns=helper_columns)
    X_train_scaled = X_train_scaled.drop(columns=helper_columns)
    X_test_scaled = X_test_scaled.drop(columns=helper_columns)

    redox_column = f'Redox_Avg({redox_sensor})'
    # The unscaled frames decide which index labels survive; the same labels
    # are then selected from the targets and the scaled frames.
    train_ids = X_train.index[X_train[redox_column] <= max_redox]
    test_ids = X_test.index[X_test[redox_column] <= max_redox]
    return (X_train.loc[train_ids], X_test.loc[test_ids], y_train.loc[train_ids], y_test.loc[test_ids], X_train_scaled.loc[train_ids], X_test_scaled.loc[test_ids])

def custom_scaler(df: pd.DataFrame, redox_sensor: int = 0, fit_on: "pd.DataFrame | None" = None):
    """MinMax scaling for features.

    Returns a copy of `df` in which the feature columns are MinMax-scaled;
    all other columns are copied unchanged and `df` itself is not modified.

    Args:
        df: Frame to scale. It must contain every feature column listed below.
        redox_sensor: When > 0, only that sensor's features (plus the shared
            'Water_level_Avg', 'Temp_ottpls_Avg', 'BatterymV_Min') are scaled.
            Otherwise the features of sensors 1-5 are scaled.
        fit_on: Optional frame used to fit the scaler (for example the
            training split), so that other data is scaled with the same
            min/max. When omitted, the scaler is fitted on `df` itself, which
            is the previous behaviour. With `fit_on`, values of `df` outside
            the fitted range end up outside [0, 1].

    Raises:
        KeyError: If `df` (or `fit_on`) lacks one of the feature columns.
    """
    sensors = [redox_sensor] if redox_sensor > 0 else list(range(1, 6))

    # Shared measurements first, then each sensor's own features. MinMax
    # scaling works per column, so the column order does not affect the result.
    features_to_scale = ['Water_level_Avg', 'Temp_ottpls_Avg', 'BatterymV_Min']
    for sensor in sensors:
        features_to_scale += [f'Redox_Avg({sensor})', f'Temp_T12_Avg({sensor})', f'EC_Avg({sensor})',
                              f'Matric_potential_Avg({sensor})', f'WC{sensor}']
        features_to_scale += [f'Redox_Avg({sensor})_sigma_{direction}_{window}'
                              for window in (24, 12) for direction in ('b', 'f')]
        features_to_scale += [f'Wave_period_{period}({sensor})'
                              for period in ('0.5', '0.7', '0.9', '1.1', '1.5', '1.9', '2.5', '3.3', '4.4')]

    reference = df if fit_on is None else fit_on
    scaler = MinMaxScaler().fit(reference[features_to_scale])

    scaled_data = df.copy()
    scaled_data[features_to_scale] = scaler.transform(df[features_to_scale])

    return scaled_data

## Get data and train test split

In [6]:
# Load the complete dataset once; every split below is derived from this frame
full_data = get_data()

## 2022 Full train test split

In [7]:
# Features and combined error-flag target for all rows dated before 2023
X = get_X(full_data)
y = get_y(full_data)

# 70/30 split, stratified on the target; random_state is fixed so the split is reproducible
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.30, random_state=1, stratify=y)

# Scale data
# NOTE(review): custom_scaler is called separately for each split, so the test split is
# scaled with its own min/max instead of the training split's - confirm this is intended.
X_test_scaled = custom_scaler(X_test)
X_train_scaled = custom_scaler(X_train)

# Save prediction data
# These files are written before get_training_data is applied, so they hold the complete
# test split: all rows, and still including the TIMESTAMP and pit_number columns.
X_test.to_pickle(f'{pickle_data_path}/2022/Prediction/X_test.pkl')
y_test.to_pickle(f'{pickle_data_path}/2022/Prediction/y_test.pkl')

X_test_scaled.to_pickle(f'{pickle_data_path}/2022/Scaled/Prediction/X_test_scaled.pkl')

# Get training data
# Removes TIMESTAMP/pit_number and the rows with too-high redox values from both splits
X_train, X_test, y_train, y_test, X_train_scaled, X_test_scaled = get_training_data(X_train, X_test, y_train, y_test, X_train_scaled, X_test_scaled)

# Save training data
X_train.to_pickle(f'{pickle_data_path}/2022/X_train.pkl')
X_test.to_pickle(f'{pickle_data_path}/2022/X_test.pkl')
y_train.to_pickle(f'{pickle_data_path}/2022/y_train.pkl')
y_test.to_pickle(f'{pickle_data_path}/2022/y_test.pkl')

X_train_scaled.to_pickle(f'{pickle_data_path}/2022/Scaled/X_train_scaled.pkl')
X_test_scaled.to_pickle(f'{pickle_data_path}/2022/Scaled/X_test_scaled.pkl')

## 2022 specific sensors train test split

In [8]:
# Repeat the 2022 split once per redox sensor, using only that sensor's features and flag
for redox_sensor in redox_sensors:
    X = get_X_sensor(full_data, redox_sensor)
    y = get_y_sensor(full_data, redox_sensor)
    
    # 70/30 split, stratified on the sensor's flag; random_state is fixed so the split is reproducible
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.30, random_state=1, stratify=y)
    
    # Scale data
    # NOTE(review): custom_scaler is called separately for each split, so the test split is
    # scaled with its own min/max instead of the training split's - confirm this is intended.
    X_test_scaled = custom_scaler(X_test, redox_sensor)
    X_train_scaled = custom_scaler(X_train, redox_sensor)

    # Save prediction data
    # These files are written before get_training_data_sensor is applied, so they hold the
    # complete test split: all rows, and still including the TIMESTAMP and pit_number columns.
    X_test.to_pickle(f'{pickle_data_path}/2022_sensors/Prediction/X_test_sensor_{redox_sensor}.pkl')
    y_test.to_pickle(f'{pickle_data_path}/2022_sensors/Prediction/y_test_sensor_{redox_sensor}.pkl')

    X_test_scaled.to_pickle(f'{pickle_data_path}/2022_sensors/Scaled/Prediction/X_test_scaled_sensor_{redox_sensor}.pkl')

    # Get training data
    # Removes TIMESTAMP/pit_number and the rows where this sensor's redox value is too high
    X_train, X_test, y_train, y_test, X_train_scaled, X_test_scaled = get_training_data_sensor(X_train, X_test, y_train, y_test, X_train_scaled, X_test_scaled, redox_sensor)

    # Save training data
    X_train.to_pickle(f'{pickle_data_path}/2022_sensors/X_train_sensor_{redox_sensor}.pkl')
    X_test.to_pickle(f'{pickle_data_path}/2022_sensors/X_test_sensor_{redox_sensor}.pkl')
    y_train.to_pickle(f'{pickle_data_path}/2022_sensors/y_train_sensor_{redox_sensor}.pkl')
    y_test.to_pickle(f'{pickle_data_path}/2022_sensors/y_test_sensor_{redox_sensor}.pkl')

    X_train_scaled.to_pickle(f'{pickle_data_path}/2022_sensors/Scaled/X_train_scaled_sensor_{redox_sensor}.pkl')
    X_test_scaled.to_pickle(f'{pickle_data_path}/2022_sensors/Scaled/X_test_scaled_sensor_{redox_sensor}.pkl')

## Redox error flag counts in training and testing data (For full data)

In [9]:
# Count the rows flagged as redox errors in the saved 2022 training and testing targets.
# pd.read_pickle opens and closes the file itself, so no file handle is left open, and the
# paths get their own names instead of overwriting the y_train / y_test DataFrames.
y_train_path = f'{pickle_data_path}/2022/y_train.pkl'
train_y = pd.read_pickle(y_train_path)
training_y = int((train_y['Redox_error_flag'] == True).sum())

y_test_path = f'{pickle_data_path}/2022/y_test.pkl'
test_y = pd.read_pickle(y_test_path)
testing_y = int((test_y['Redox_error_flag'] == True).sum())

# NOTE: the value printed as "Percentage in testing" is the ratio testing_y / training_y
# (flagged test rows per flagged training row), not the share of all flagged rows.
print(f'\nTraining_Y: {training_y}\nTesting_Y: {testing_y}\nPercentage in testing: {testing_y/training_y}\n')


Training_Y: 7336
Testing_Y: 3157
Percentage in testing: 0.4303435114503817



## Redox error flag counts in training and testing data (For sensor separated data)

In [10]:
# Count, per sensor, the rows flagged as redox errors in the saved 2022 training and testing targets.
# pd.read_pickle opens and closes the file itself, so no file handle is left open, and the
# paths get their own names instead of overwriting the y_train / y_test DataFrames.
for sensor in redox_sensors:
    flag_column = f'Redox_error_flag({sensor})'

    y_train_path = f'{pickle_data_path}/2022_sensors/y_train_sensor_{sensor}.pkl'
    train_y = pd.read_pickle(y_train_path)
    training_y = int((train_y[flag_column] == True).sum())

    y_test_path = f'{pickle_data_path}/2022_sensors/y_test_sensor_{sensor}.pkl'
    test_y = pd.read_pickle(y_test_path)
    testing_y = int((test_y[flag_column] == True).sum())

    # NOTE: the value printed as "Percentage in testing" is the ratio testing_y / training_y
    # (flagged test rows per flagged training row), not the share of all flagged rows.
    print(f'Sensor {sensor}: \n\tTraining_Y: {training_y}\n\tTesting_Y: {testing_y}\n\tPercentage in testing: {testing_y/training_y}\n')

Sensor 1: 
	Training_Y: 2494
	Testing_Y: 1069
	Percentage in testing: 0.4286287089013633

Sensor 2: 
	Training_Y: 8806
	Testing_Y: 3774
	Percentage in testing: 0.42857142857142855

Sensor 3: 
	Training_Y: 9571
	Testing_Y: 4104
	Percentage in testing: 0.4287953191933967

Sensor 4: 
	Training_Y: 9599
	Testing_Y: 4116
	Percentage in testing: 0.4287946661110532

Sensor 5: 
	Training_Y: 9625
	Testing_Y: 4126
	Percentage in testing: 0.42867532467532465



# Get 2023 prediction data

## 2023 full Testing

In [11]:
# All-sensor features for the rows dated 2023 or later, stored unscaled first
X_2023 = get_X(full_data, get_2023=True)
X_2023.to_pickle(f'{pickle_data_path}/2023/test.pkl')

# Scaled copy of the same rows.
# NOTE(review): the scaler is fitted on the 2023 rows themselves, not on the 2022 training data.
X_2023_scaled = custom_scaler(df=X_2023)

X_2023_scaled.to_pickle(f'{pickle_data_path}/2023/Scaled/test_scaled.pkl')

## 2023 specific sensors Testing

In [12]:
# Per-sensor features for the rows dated 2023 or later
for redox_sensor in redox_sensors:
    # Unscaled features of this sensor
    X_2023 = get_X_sensor(full_data, redox_sensor, get_2023=True)
    X_2023.to_pickle(f'{pickle_data_path}/2023_sensors/test_sensor_{redox_sensor}.pkl')

    # Scaled copy of the same rows.
    # NOTE(review): the scaler is fitted on the 2023 rows themselves, not on the 2022 training data.
    X_2023_scaled = custom_scaler(df=X_2023, redox_sensor=redox_sensor)

    X_2023_scaled.to_pickle(f'{pickle_data_path}/2023_sensors/Scaled/test_scaled_sensor_{redox_sensor}.pkl')