In [1]:
import pandas as pd
from datetime import datetime
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
import os

In [2]:
# Folder holding the raw training CSV; the pickle output tree is created beneath it.
# This is a relative path, so it resolves against the kernel's working directory.
training_data_folder = '../../../Data/Training'

In [3]:
def get_y(data: pd.DataFrame, redox_sensor: int):
    """Return the error-flag column of one redox sensor as a one-column frame.

    Args:
        data: Frame containing a ``Redox_error_flag(<redox_sensor>)`` column.
        redox_sensor: Sensor number used to build the column name.

    Returns:
        A DataFrame (not a Series) holding only that column, all rows kept.
    """
    target_columns = [f'Redox_error_flag({redox_sensor})']
    # A list selector keeps the result two-dimensional.
    return data.loc[:, target_columns]

def get_X(data: pd.DataFrame, redox_sensor: int):
    """Select the columns kept for one redox sensor, in a fixed order.

    Args:
        data: Frame containing every column listed below.
        redox_sensor: Sensor number substituted into the per-sensor column names.

    Returns:
        A DataFrame with all rows of ``data`` and only the selected columns.

    NOTE(review): the selection includes ``Redox_error_flag(<redox_sensor>)``,
    which is the same column ``get_y`` returns as the target, plus the aggregate
    ``Redox_error_flag``. Confirm downstream code drops them before fitting a model.
    """
    cols = [
        'TIMESTAMP',
        # Measurements of the selected sensor.
        f'Redox_Avg({redox_sensor})',
        f'Temp_T12_Avg({redox_sensor})',
        f'EC_Avg({redox_sensor})',
        f'Matric_potential_Avg({redox_sensor})',
        # Columns that do not carry a sensor number.
        'Water_level_Avg',
        'Temp_ottpls_Avg',
        'BatterymV_Min',
        f'WC{redox_sensor}',
        'pit_number',
        # Error flags: per-sensor and overall.
        f'Redox_error_flag({redox_sensor})',
        'Redox_error_flag',
        # Sigma columns derived from the sensor's redox signal.
        f'Redox_Avg({redox_sensor})_sigma_b_24',
        f'Redox_Avg({redox_sensor})_sigma_f_24',
        f'Redox_Avg({redox_sensor})_sigma_b_12',
        f'Redox_Avg({redox_sensor})_sigma_f_12',
    ]
    return data.loc[:, cols]

def get_train_2022_test_2023_split(data: pd.DataFrame, redox_sensor: int, random_state: int):
    dt_2023 = datetime(2023, 1,1, 0, 0, 0)
    data_2022 = data.loc[data['TIMESTAMP'] < dt_2023].sample(frac=1, random_state=random_state)
    data_2023 = data.loc[data['TIMESTAMP'] >= dt_2023].sample(frac=1, random_state=random_state)
    X_train = get_X(data_2022, redox_sensor)
    y_train = get_y(data_2022, redox_sensor)
    X_test = get_X(data_2023, redox_sensor)
    y_test = get_y(data_2023, redox_sensor)
    return (X_train, X_test, y_train, y_test)

def get_data():
    """Read the full raw training CSV into a DataFrame.

    The file is ``Raw_training_data_full.csv`` inside ``training_data_folder``;
    its ``TIMESTAMP`` column is parsed as datetimes.

    Returns:
        The loaded DataFrame.
    """
    csv_path = f'{training_data_folder}/Raw_training_data_full.csv'
    date_columns = ['TIMESTAMP']
    return pd.read_csv(csv_path, parse_dates=date_columns)

In [4]:
# Load the raw training table once; every split cell below reads from `data`.
# NOTE(review): the saved output under this cell looks like the tail of a warning
# raised by this call (possibly a pandas DtypeWarning) — TODO confirm and, if so,
# consider passing explicit dtypes in get_data.
data = get_data()

  data = get_data()


## Removed columns

In [5]:
# Descriptive list of the columns that are left out of each per-sensor frame.
# The entries are plain strings (not f-strings): "{num != redox_sensor}" is a
# placeholder meaning "any sensor number other than the selected one".
removed_cols = [
    'Redox_Avg({num != redox_sensor})',
    'Temp_T12_Avg({num != redox_sensor})',
    'EC_Avg({num != redox_sensor})',
    'Matric_potential_Avg({num != redox_sensor})',
    'WC{num != redox_sensor}',
    'log_redox(all)',
    'Redox_error_flag({num != redox_sensor})',
    'Redox_error_flag_available',
    'TIMESTAMP_DIFF',
    'Redox_Avg({num != redox_sensor})_sigma_b_24',
    'Redox_Avg({num != redox_sensor})_sigma_f_24',
    'Redox_Avg({num != redox_sensor})_sigma_b_12',
    'Redox_Avg({num != redox_sensor})_sigma_f_12',
]
removed_cols

['Redox_Avg({num != redox_sensor})',
 'Temp_T12_Avg({num != redox_sensor})',
 'EC_Avg({num != redox_sensor})',
 'Matric_potential_Avg({num != redox_sensor})',
 'WC{num != redox_sensor}',
 'log_redox(all)',
 'Redox_error_flag({num != redox_sensor})',
 'Redox_error_flag_available',
 'TIMESTAMP_DIFF',
 'Redox_Avg({num != redox_sensor})_sigma_b_24',
 'Redox_Avg({num != redox_sensor})_sigma_f_24',
 'Redox_Avg({num != redox_sensor})_sigma_b_12',
 'Redox_Avg({num != redox_sensor})_sigma_f_12']

## Scaler

In [6]:
def custom_scaler(df: pd.DataFrame, redox_sensor: int, fit_on: 'pd.DataFrame | None' = None):
    """Min-max scale the numeric feature columns of one sensor's frame.

    Only the columns listed in ``features_to_scale`` are rescaled; every other
    column is copied through unchanged. ``df`` itself is not modified.

    Args:
        df: Frame to transform. Must contain all columns in ``features_to_scale``.
        redox_sensor: Sensor number substituted into the per-sensor column names.
        fit_on: Optional frame the scaler is fitted on. When omitted, the scaler
            is fitted on ``df`` itself, which is the original behaviour. Pass the
            training frame here when scaling a test frame so both are mapped with
            the same min/max and no test-set statistics enter the transform.

    Returns:
        A copy of ``df`` with the selected columns replaced by their scaled values.
    """
    features_to_scale = [f'Redox_Avg({redox_sensor})', f'Temp_T12_Avg({redox_sensor})', f'EC_Avg({redox_sensor})', f'Matric_potential_Avg({redox_sensor})',
                         'Water_level_Avg', 'Temp_ottpls_Avg', 'BatterymV_Min', f'WC{redox_sensor}', f'Redox_Avg({redox_sensor})_sigma_b_24',
                         f'Redox_Avg({redox_sensor})_sigma_f_24', f'Redox_Avg({redox_sensor})_sigma_b_12', f'Redox_Avg({redox_sensor})_sigma_f_12']

    # Learn the per-column ranges from the reference frame, then apply them to df.
    # With fit_on=None this is equivalent to a single fit_transform on df.
    reference = df if fit_on is None else fit_on
    scaler = MinMaxScaler().fit(reference[features_to_scale])

    scaled_data = df.copy()
    scaled_data[features_to_scale] = scaler.transform(df[features_to_scale])

    return scaled_data

## Create directories

In [7]:
# Root folder for every pickle written by the cells below.
pickle_data_path = f'{training_data_folder}/pickle_data'

def create_folder(path: str):
    """Create ``path`` (and any missing parents) if it does not exist yet.

    Calling this on an existing directory is a no-op, so the cell can be re-run.

    Args:
        path: Directory to create.

    Raises:
        FileExistsError: If ``path`` exists but is not a directory. This fails
            here rather than later, when a pickle is written into it.
    """
    # exist_ok avoids the check-then-create race of os.path.exists + makedirs.
    os.makedirs(path, exist_ok=True)

# Build the output tree: <pickle_data_path>/{Mixed,Year}/Scaled.
create_folder(pickle_data_path)
for sub_folder in ('Mixed', 'Year'):
    path = f'{pickle_data_path}/{sub_folder}'
    sub_path = f'{path}/Scaled'
    # Parent first, then its Scaled child.
    for folder in (path, sub_path):
        create_folder(folder)

## Mixed train test split

In [8]:
# For sensors 1..5: draw a seeded random 70/30 split over all rows, pickle the
# raw frames, then pickle min-max scaled copies of the feature frames.
# NOTE(review): custom_scaler fits a fresh MinMaxScaler on each frame it is
# given, so the train and test frames are scaled with different min/max values.
# Consider fitting on the training frame only.
for i in range(1, 6):
    X_train, X_test, y_train, y_test = train_test_split(
        get_X(data, i),
        get_y(data, i),
        test_size=0.30,
        random_state=1,
    )

    raw_outputs = [
        (X_train, f'{pickle_data_path}/Mixed/X_train_mixed_sensor_{i}.pkl'),
        (X_test, f'{pickle_data_path}/Mixed/X_test_mixed_sensor_{i}.pkl'),
        (y_train, f'{pickle_data_path}/Mixed/y_train_mixed_sensor_{i}.pkl'),
        (y_test, f'{pickle_data_path}/Mixed/y_test_mixed_sensor_{i}.pkl'),
    ]
    for frame, target in raw_outputs:
        frame.to_pickle(target)

    X_train_scaled = custom_scaler(X_train, i)
    X_test_scaled = custom_scaler(X_test, i)

    scaled_outputs = [
        (X_train_scaled, f'{pickle_data_path}/Mixed/Scaled/X_train_scaled_mixed_sensor_{i}.pkl'),
        (X_test_scaled, f'{pickle_data_path}/Mixed/Scaled/X_test_scaled_mixed_sensor_{i}.pkl'),
    ]
    for frame, target in scaled_outputs:
        frame.to_pickle(target)

## 2022 Training & 2023 Testing

In [9]:
# For sensors 1..5: train on rows before 2023-01-01 and test on rows from that
# date onwards, pickle the raw frames, then pickle min-max scaled copies of the
# feature frames.
# NOTE(review): custom_scaler fits a fresh MinMaxScaler on each frame it is
# given, so the train and test frames are scaled with different min/max values.
# Consider fitting on the training frame only.
for i in range(1, 6):
    X_train, X_test, y_train, y_test = get_train_2022_test_2023_split(
        data, i, random_state=1
    )

    raw_outputs = [
        (X_train, f'{pickle_data_path}/Year/X_train_2022_sensor_{i}.pkl'),
        (X_test, f'{pickle_data_path}/Year/X_test_2023_sensor_{i}.pkl'),
        (y_train, f'{pickle_data_path}/Year/y_train_2022_sensor_{i}.pkl'),
        (y_test, f'{pickle_data_path}/Year/y_test_2023_sensor_{i}.pkl'),
    ]
    for frame, target in raw_outputs:
        frame.to_pickle(target)

    X_train_scaled = custom_scaler(X_train, i)
    X_test_scaled = custom_scaler(X_test, i)

    scaled_outputs = [
        (X_train_scaled, f'{pickle_data_path}/Year/Scaled/X_train_scaled_2022_sensor_{i}.pkl'),
        (X_test_scaled, f'{pickle_data_path}/Year/Scaled/X_test_scaled_2023_sensor_{i}.pkl'),
    ]
    for frame, target in scaled_outputs:
        frame.to_pickle(target)