In [1]:
import pandas as pd
from datetime import datetime
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
import os
import pickle

## General variables

In [2]:
# Relative path of the folder holding the raw training CSV.
training_data_folder = '../../../Data/Training'
# Every pickle written by this notebook goes below this folder.
pickle_data_path = f'{training_data_folder}/pickle_data'

# 1-based identifiers. NOTE(review): `pits` is not referenced in the cells shown here.
pits = [1, 2, 3, 4]
redox_sensors = [1, 2, 3, 4, 5]

# Cut-off instant: rows strictly before it form the 2022 split, rows at or after it are the 2023 data.
dt_2023 = datetime(2023, 1, 1)

## Support functions

In [3]:
def get_y(data: pd.DataFrame, redox_sensor: int):
    """Return the pre-2023 error flag of one redox sensor.

    Rows whose ``TIMESTAMP`` is strictly before ``dt_2023`` are kept, and the
    result is a one-column DataFrame holding ``Redox_error_flag(<redox_sensor>)``.
    """
    before_2023 = data['TIMESTAMP'] < dt_2023
    target_col = f'Redox_error_flag({redox_sensor})'
    return data.loc[before_2023, [target_col]]

def get_X(data: pd.DataFrame, redox_sensor: int, get_2023: bool = False):
    """Select the columns kept for one redox sensor, for either side of ``dt_2023``.

    Args:
        data: Frame containing ``TIMESTAMP`` and the per-sensor columns listed below.
        redox_sensor: Sensor number used to build the per-sensor column names.
        get_2023: When true, return rows with ``TIMESTAMP >= dt_2023``;
            otherwise return rows with ``TIMESTAMP < dt_2023``.

    Returns:
        DataFrame with the selected rows and columns. Note that the sensor's
        own error flag and the overall ``Redox_error_flag`` are part of it.
    """
    redox = f'Redox_Avg({redox_sensor})'
    selected = [
        'TIMESTAMP',
        redox,
        f'Temp_T12_Avg({redox_sensor})',
        f'EC_Avg({redox_sensor})',
        f'Matric_potential_Avg({redox_sensor})',
        'Water_level_Avg',
        'Temp_ottpls_Avg',
        'BatterymV_Min',
        f'WC{redox_sensor}',
        'pit_number',
        f'Redox_error_flag({redox_sensor})',
        'Redox_error_flag',
        f'{redox}_sigma_b_24',
        f'{redox}_sigma_f_24',
        f'{redox}_sigma_b_12',
        f'{redox}_sigma_f_12',
    ]

    timestamps = data['TIMESTAMP']
    row_mask = timestamps >= dt_2023 if get_2023 else timestamps < dt_2023
    return data.loc[row_mask].loc[:, selected]

def get_data():
    """Read the raw training CSV and keep rows where every redox reading is at most 900.

    ``TIMESTAMP`` is parsed as a datetime column. A row is kept only when
    ``Redox_Avg(1)`` .. ``Redox_Avg(5)`` are all ``<= 900``; a missing value
    compares as False, so rows with a NaN in any of those columns are dropped too.
    """
    raw = pd.read_csv(f'{training_data_folder}/Raw_training_data_full.csv', parse_dates=['TIMESTAMP'])
    redox_cols = [f'Redox_Avg({sensor})' for sensor in range(1, 6)]
    within_limit = (raw[redox_cols] <= 900).all(axis=1)
    return raw.loc[within_limit]

def custom_scaler(df: pd.DataFrame, redox_sensor: int):
    """Return a copy of ``df`` with the numeric feature columns min-max scaled.

    A new ``MinMaxScaler`` is fitted on every call, so the scaling range is
    taken from ``df`` itself. Columns outside the list below are copied over
    unchanged, and ``df`` is not modified.
    """
    redox = f'Redox_Avg({redox_sensor})'
    numeric_cols = [
        redox,
        f'Temp_T12_Avg({redox_sensor})',
        f'EC_Avg({redox_sensor})',
        f'Matric_potential_Avg({redox_sensor})',
        'Water_level_Avg',
        'Temp_ottpls_Avg',
        'BatterymV_Min',
        f'WC{redox_sensor}',
        f'{redox}_sigma_b_24',
        f'{redox}_sigma_f_24',
        f'{redox}_sigma_b_12',
        f'{redox}_sigma_f_12',
    ]

    scaler = MinMaxScaler()
    result = df.copy()
    result[numeric_cols] = scaler.fit_transform(df[numeric_cols])
    return result

## Get full training data

In [4]:
# Read and filter the raw training CSV via get_data(); the split cells below take their rows from `full_data`.
full_data = get_data()

  full_data = get_data()


## Removed columns

In [5]:
# Column-name patterns listed for reference. These are plain strings, not
# f-strings, so '{num != redox_sensor}' is a literal placeholder.
# NOTE(review): presumably it stands for "any sensor number other than the
# selected one", i.e. the columns left out of a per-sensor frame — confirm.
removed_cols = [
    'Redox_Avg({num != redox_sensor})',
    'Temp_T12_Avg({num != redox_sensor})',
    'EC_Avg({num != redox_sensor})',
    'Matric_potential_Avg({num != redox_sensor})',
    'WC{num != redox_sensor}',
    'log_redox(all)',
    'Redox_error_flag({num != redox_sensor})',
    'Redox_error_flag_available',
    'TIMESTAMP_DIFF',
    'Redox_Avg({num != redox_sensor})_sigma_b_24',
    'Redox_Avg({num != redox_sensor})_sigma_f_24',
    'Redox_Avg({num != redox_sensor})_sigma_b_12',
    'Redox_Avg({num != redox_sensor})_sigma_f_12',
]
removed_cols

['Redox_Avg({num != redox_sensor})',
 'Temp_T12_Avg({num != redox_sensor})',
 'EC_Avg({num != redox_sensor})',
 'Matric_potential_Avg({num != redox_sensor})',
 'WC{num != redox_sensor}',
 'log_redox(all)',
 'Redox_error_flag({num != redox_sensor})',
 'Redox_error_flag_available',
 'TIMESTAMP_DIFF',
 'Redox_Avg({num != redox_sensor})_sigma_b_24',
 'Redox_Avg({num != redox_sensor})_sigma_f_24',
 'Redox_Avg({num != redox_sensor})_sigma_b_12',
 'Redox_Avg({num != redox_sensor})_sigma_f_12']

## Create directories

In [6]:
def create_folder(path: str):
    """Create the directory ``path`` (and any missing parents) if it is not there yet.

    Args:
        path: Directory to create.

    Raises:
        FileExistsError: If ``path`` already exists but is not a directory.
    """
    # exist_ok=True makes the call idempotent without a separate exists() check,
    # which could race with another process and silently accepted a regular file.
    os.makedirs(path, exist_ok=True)

# Resulting layout: <pickle_data_path>/2022, <pickle_data_path>/2022/Scaled and <pickle_data_path>/2023.
create_folder(pickle_data_path)
for year_dir in ('2022', '2023'):
    year_path = f'{pickle_data_path}/{year_dir}'
    create_folder(year_path)
    if '2022' not in year_dir:
        continue
    # Only the 2022 split gets a sub-folder for scaled frames.
    create_folder(f'{year_path}/Scaled')

## 2022 train test split

In [7]:
# For every redox sensor: split the pre-2023 rows 70/30 (stratified on the
# sensor's error flag), pickle the raw splits, then pickle min-max scaled copies.
for redox_sensor in redox_sensors:
    X = get_X(full_data, redox_sensor)
    y = get_y(full_data, redox_sensor)

    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.30, random_state=1, stratify=y[f'Redox_error_flag({redox_sensor})'])

    X_train.to_pickle(f'{pickle_data_path}/2022/X_train_sensor_{redox_sensor}.pkl')
    X_test.to_pickle(f'{pickle_data_path}/2022/X_test_sensor_{redox_sensor}.pkl')
    y_train.to_pickle(f'{pickle_data_path}/2022/y_train_sensor_{redox_sensor}.pkl')
    y_test.to_pickle(f'{pickle_data_path}/2022/y_test_sensor_{redox_sensor}.pkl')

    # Everything returned by get_X is scaled except the timestamp, the pit id
    # and the two error-flag columns.
    unscaled_cols = ['TIMESTAMP', 'pit_number', f'Redox_error_flag({redox_sensor})', 'Redox_error_flag']
    features_to_scale = [col for col in X_train.columns if col not in unscaled_cols]

    # Fit the scaler on the training split ONLY and reuse it for the test split.
    # Fitting a second scaler on the test rows would map train and test with
    # different min/max ranges and leak test-set statistics into preprocessing.
    scaler = MinMaxScaler().fit(X_train[features_to_scale])

    X_train_scaled = X_train.copy()
    X_train_scaled[features_to_scale] = scaler.transform(X_train[features_to_scale])
    X_test_scaled = X_test.copy()
    X_test_scaled[features_to_scale] = scaler.transform(X_test[features_to_scale])

    X_train_scaled.to_pickle(f'{pickle_data_path}/2022/Scaled/X_train_scaled_sensor_{redox_sensor}.pkl')
    X_test_scaled.to_pickle(f'{pickle_data_path}/2022/Scaled/X_test_scaled_sensor_{redox_sensor}.pkl')

## 2023 Testing

In [8]:
# Pickle the rows from 2023 onwards, one file per redox sensor.
for redox_sensor in redox_sensors:
    X_2023 = get_X(full_data, redox_sensor, get_2023=True)
    # Save the 2023 frame built above. Writing `X` here would instead store the
    # pre-2023 frame left over from the train/test split cell.
    X_2023.to_pickle(f'{pickle_data_path}/2023/sensor_{redox_sensor}.pkl')

## Redox error flag counts in training and testing data

In [9]:
def _count_flagged(pickle_path: str, flag_col: str) -> int:
    """Load a pickled DataFrame and return how many rows have ``flag_col`` equal to True."""
    # The context manager closes the file handle after loading. The files read
    # here are the 2022 split pickles written earlier in this notebook.
    with open(pickle_path, 'rb') as handle:
        frame = pickle.load(handle)
    return len(frame[frame[flag_col] == True])


# Report, per sensor, how many flagged rows ended up in each saved 2022 split.
# Paths are kept in local names so the X_train / X_test / y_train / y_test
# DataFrames from the split cell are not overwritten with strings.
for sensor in redox_sensors:
    flag_col = f'Redox_error_flag({sensor})'
    split_dir = f'{pickle_data_path}/2022'

    training_X = _count_flagged(f'{split_dir}/X_train_sensor_{sensor}.pkl', flag_col)
    training_y = _count_flagged(f'{split_dir}/y_train_sensor_{sensor}.pkl', flag_col)
    testing_X = _count_flagged(f'{split_dir}/X_test_sensor_{sensor}.pkl', flag_col)
    testing_y = _count_flagged(f'{split_dir}/y_test_sensor_{sensor}.pkl', flag_col)

    # NOTE(review): the value printed as "Percentage in testing" is the ratio of
    # flagged test rows to flagged training rows (about 0.43 for a 30/70 split),
    # not the share of flagged rows that are in the test split.
    # Guard against a sensor with no flagged training rows.
    test_to_train_ratio = testing_X / training_X if training_X else float('nan')

    print(f'Sensor {sensor}: \n\tTraining_X: {training_X}\n\tTraining_Y: {training_y}\n\tTesting_X: {testing_X}\n\tTesting_Y: {testing_y}\n\tPercentage in testing: {test_to_train_ratio}\n')

Sensor 1: 
	Training_X: 2494
	Training_Y: 2494
	Testing_X: 1069
	Testing_Y: 1069
	Percentage in testing: 0.4286287089013633

Sensor 2: 
	Training_X: 7281
	Training_Y: 7281
	Testing_X: 3120
	Testing_Y: 3120
	Percentage in testing: 0.4285125669550886

Sensor 3: 
	Training_X: 7295
	Training_Y: 7295
	Testing_X: 3126
	Testing_Y: 3126
	Percentage in testing: 0.4285126799177519

Sensor 4: 
	Training_X: 7295
	Training_Y: 7295
	Testing_X: 3126
	Testing_Y: 3126
	Percentage in testing: 0.4285126799177519

Sensor 5: 
	Training_X: 7295
	Training_Y: 7295
	Testing_X: 3126
	Testing_Y: 3126
	Percentage in testing: 0.4285126799177519

