### Imports

In [1]:
import pandas as pd
import numpy as np
#from scipy import 
#from sklearn import
from datetime import datetime
import re
# pd.set_option("display.max_rows", 100)
# from IPython.core.display import display

### General variables

In [2]:
# Folder holding the input CSV files, relative to this notebook.
data_folder = '../../../Data/'
# When True, every filtering helper prints its own row-count summary.
print_text_result = False

# Raw files: pits 1-4 for 2022 followed by pits 1-4 for 2023.
raw_data_names = [f'pit{pit}_data-{year}' for year in (2022, 2023) for pit in range(1, 5)]
# Cleaned files: pits 1-4, 2022 only.
clean_data_names = [f'VII_PIT{pit}_2022' for pit in range(1, 5)]
# (raw name, cleaned name) pairs; None marks a raw file without a cleaned counterpart.
df_pairs = [
    *zip(raw_data_names[:4], clean_data_names),
    *((raw_name, None) for raw_name in raw_data_names[4:]),
]

# Target dtypes that fix_types applies, in order, to every column after the first one.
col_types = ['int64', 'int64', 'int64', 'int64', 'int64', 'int64', 'int64', 'float64', 'int64', 'int64',
             'float64', 'int64', 'int64', 'float64', 'int64', 'int64', 'float64', 'int64', 'int64', 'float64',
             'int64', 'float64', 'float64', 'float64', 'float64', 'int64', 'float64', 'float64', 'float64', 'float64',
             'float64', 'float64', 'float64', 'float64', 'float64', 'int64', 'bool', 'bool', 'int64', 'float64',
             'float64', 'float64', 'float64', 'float64', 'float64', 'float64', 'bool']

### Helper functions

In [3]:

def print_result(header: str, init_count: int, new_count: int):
    """
    Print a row-count summary for one filtering step.

    :param header: title line printed above the counts
    :param init_count: number of rows before filtering
    :param new_count: number of rows after filtering
    """
    deleted = init_count - new_count
    print(
        header,
        f'\tinit row cnt: {init_count}',
        f'\t# of rows deleted: {deleted}',
        f'\tresult row count : {new_count}',
        sep='\n',
    )

def remove_pit_suffix(name: str) -> str:
    """
    Strip a trailing '_pit<digits>' suffix from a column header.

    :param name: column header
    :return: the header without the suffix, or the header unchanged if it has none
    """
    suffix = re.search(r'_pit\d+$', name)
    return name if suffix is None else name[:suffix.start()]

def filter_dates(df: pd.DataFrame, file_name: str) -> int:
    """
    Drop, in place, the rows whose TIMESTAMP lies on the wrong side of 2023-01-01 00:00:00.

    File names containing '2022' lose every row at or after that instant;
    all other file names lose every row before it.

    :param df: DataFrame with a 'TIMESTAMP' column; modified in place
    :param file_name: name used to decide which side of the boundary to keep
    :return: number of rows removed
    """
    rows_before = len(df)
    year_boundary = datetime(2023, 1, 1, 0, 0, 0)

    if '2022' in file_name:
        wrong_year = df['TIMESTAMP'] >= year_boundary
    else:
        wrong_year = df['TIMESTAMP'] < year_boundary
    df.drop(df[wrong_year].index, inplace=True)

    rows_after = len(df)
    if print_text_result:
        print_result('FILTERING DATES BY YEAR:', rows_before, rows_after)

    return rows_before - rows_after

def filter_duplicate_date(df: pd.DataFrame) -> int:
    """
    Drop, in place, every row whose TIMESTAMP repeats that of an earlier row.

    :param df: DataFrame with a 'TIMESTAMP' column; modified in place
    :return: number of rows removed
    """
    rows_before = len(df)

    # keep='first' is the pandas default, spelled out here for the reader.
    df.drop_duplicates(subset='TIMESTAMP', keep='first', inplace=True)

    rows_after = len(df)
    if print_text_result:
        print_result('FILTERING DUPLICATE DATES:', rows_before, rows_after)

    return rows_before - rows_after

def filter_missing_values(df: pd.DataFrame, file_name: str, remove_empy=True) -> int:
    """
    Drop, in place, every row containing a missing value, for raw files only.

    Files whose name contains 'VII' are left untouched.

    :param df: DataFrame to filter; modified in place
    :param file_name: name used to tell raw files from 'VII' files
    :param remove_empy: when False, nothing is dropped
    :return: number of rows removed
    """
    rows_before = len(df)

    should_drop = remove_empy and 'VII' not in file_name
    if should_drop:
        df.dropna(inplace=True)
    # TODO: otherwise filter the cleaned data and fill its missing values.

    rows_after = len(df)
    if print_text_result:
        print_result('FILTERING MISSING VALUES:', rows_before, rows_after)

    return rows_before - rows_after

def filter_df(df: pd.DataFrame, file_name: str) -> dict:
    """
    Run the three row filters on the DataFrame, in place, and report what each removed.

    The filters run in a fixed order: year boundary, duplicate timestamps, missing values.

    :param df: DataFrame to filter; modified in place
    :param file_name: name forwarded to the filters that depend on it
    :return: dict with keys 'dates_removed', 'duplicates_removed' and
             'missing_data_removed', each mapping to a removed-row count
    """
    # A dict display evaluates its values top to bottom, which preserves the filter order.
    return {
        'dates_removed': filter_dates(df, file_name),
        'duplicates_removed': filter_duplicate_date(df),
        'missing_data_removed': filter_missing_values(df, file_name),
    }

def fix_types(df: pd.DataFrame, file_name: str) -> pd.DataFrame:
    """
    Cast the columns of a raw DataFrame to the dtypes listed in col_types.

    The first column is skipped; the remaining columns are paired with col_types
    by position. Files whose name contains 'VII' are returned unchanged.

    :param df: DataFrame to convert; modified in place
    :param file_name: name used to tell raw files from 'VII' files
    :return: the same DataFrame object
    """
    if 'VII' in file_name:
        return df

    columns_to_cast = list(df.columns.array)[1:]
    for col_name, target in zip(columns_to_cast, col_types):
        column = df[col_name]
        if target == 'int64':
            # NOTE(review): the float64 hop presumably lets float-like values convert to int - confirm.
            converted = column.astype('float64').astype('int64')
        elif target == 'float64':
            converted = column.astype('float64')
        else:
            converted = column.astype('bool')
        df[col_name] = converted
    return df

def add_pit_column(df: pd.DataFrame, file_name: str) -> pd.DataFrame:
    """
    Add a constant integer 'pit_number' column derived from a raw file name.

    Files whose name contains 'VII' are returned unchanged. For every other file
    the number is read from the digits following the leading 'pit' prefix, so
    multi-digit pit numbers are supported ('pit12_data-2023' -> 12).

    :param df: DataFrame to annotate; modified in place
    :param file_name: file name without extension, e.g. 'pit1_data-2022'
    :return: the same DataFrame object
    :raises ValueError: if a non-'VII' file name does not start with 'pit<digits>'
    """
    if 'VII' in file_name:
        return df

    # Parse all digits after the prefix instead of taking the single character at index 3.
    pit_match = re.match(r'pit(\d+)', file_name)
    if pit_match is None:
        raise ValueError(f'Cannot determine pit number from file name {file_name!r}')

    df['pit_number'] = int(pit_match.group(1))
    return df

def merge_raw_cleaned(raw_dfs, clean_dfs, df_pairs: list[tuple[str, str]] = df_pairs):
    """
    Attach per-sensor redox error flags to each raw DataFrame.

    For a pair with a cleaned counterpart, 'Redox_error_flag(x)' (x = 1..5) is True
    where the cleaned row has 'Redox_error_flag' set and 'Redox_Avg(x)' is missing;
    the flags are left-joined onto the raw frame by TIMESTAMP, so raw rows without a
    matching cleaned timestamp get NaN flags. For a pair without a cleaned
    counterpart, all five flags are False. 'Redox_error_flag_available' records
    which of the two cases applied.

    The input DataFrames are not modified; every returned frame is a new object.

    :param raw_dfs: mapping from raw file name to DataFrame
    :param clean_dfs: mapping from cleaned file name to DataFrame
    :param df_pairs: (raw name, cleaned name) pairs; the cleaned name may be None
    :return: list of DataFrames, one per pair, in the order of df_pairs
    """
    flag_cols = [f'Redox_error_flag({x})' for x in range(1, 6)]
    merged_dfs = []

    for raw_name, clean_name in df_pairs:
        raw = raw_dfs[raw_name]

        if clean_name is None:
            # Copy so the caller's raw frame does not silently gain columns.
            merged = raw.copy()
            for flag_col in flag_cols:
                merged[flag_col] = False
            merged['Redox_error_flag_available'] = False
        else:
            cleaned = clean_dfs[clean_name]
            # '== True' is kept on purpose: it maps missing or non-boolean entries to False.
            row_flagged = cleaned['Redox_error_flag'] == True
            # Build the flags in a separate frame instead of writing into the cleaned input.
            flags = cleaned[['TIMESTAMP']].copy()
            for x, flag_col in enumerate(flag_cols, start=1):
                flags[flag_col] = row_flagged & cleaned[f'Redox_Avg({x})'].isna()
            merged = raw.merge(
                flags,
                how='left',
                left_on='TIMESTAMP',
                right_on='TIMESTAMP'
            )
            merged['Redox_error_flag_available'] = True

        merged_dfs.append(merged)

    return merged_dfs

def add_redox_log_cols(df: pd.DataFrame) -> pd.DataFrame:
    """
    Add sign and log-magnitude columns for 'Redox_Avg(1)' .. 'Redox_Avg(5)'.

    For each x in 1..5, 'neg_log(x)' is True where the value is negative and
    'log_redox(x)' is the natural log of its absolute value. np.log returns -inf
    for a zero input and NaN for a missing one.

    :param df: DataFrame holding the five 'Redox_Avg(x)' columns; modified in place
    :return: the same DataFrame object
    """
    for idx in range(1, 6):
        redox = df[f'Redox_Avg({idx})']
        df[f'neg_log({idx})'] = redox < 0
        df[f'log_redox({idx})'] = np.log(redox.abs())
    return df

def load_data(file_names: list[str], data_folder: str) -> dict[str, pd.DataFrame]:
    """
    Load each CSV, filter and enrich it, and print a report of the removed rows.

    Per file: read '<data_folder><file_name>.csv' with TIMESTAMP parsed as dates,
    strip the '_pit<n>' suffix from the column names, run the row filters, fix the
    dtypes, add the pit number and add the redox log columns. The printed report
    has one row per file with the count removed by each filter plus a 'Total'.

    :param file_names: list of file names (without the '.csv' extension)
    :param data_folder: folder where the data is located; must end with a path separator
    :return: dictionary mapping each file name to its processed DataFrame
    """
    dfs = dict()
    # Collected per file and turned into a frame once, instead of pd.concat inside the loop.
    report_index = []
    report_rows = []

    for file_name in file_names:
        df = pd.read_csv(data_folder+file_name+'.csv', parse_dates=['TIMESTAMP'])
        df.rename(mapper=remove_pit_suffix, axis='columns', inplace=True)

        if print_text_result: print(f'===== {file_name} =====')

        stats = filter_df(df, file_name)
        df = fix_types(df, file_name)
        df = add_pit_column(df, file_name)
        df = add_redox_log_cols(df)
        report_index.append(file_name)
        report_rows.append(stats)
        dfs[file_name] = df

        if print_text_result: print('\n')

    report_df = pd.DataFrame(report_rows, index=report_index)
    report_df = report_df.assign(Total = lambda x: (x.sum(axis=1)))
    print(report_df)

    return dfs

### Load data

In [4]:
# Raw files are loaded first, then the cleaned ones; each call prints its own removal report.
raw_data = load_data(file_names=raw_data_names, data_folder=data_folder)
clean_data = load_data(file_names=clean_data_names, data_folder=data_folder)

  raw_data = load_data(raw_data_names, data_folder)
  result = getattr(ufunc, method)(*inputs, **kwargs)
  result = getattr(ufunc, method)(*inputs, **kwargs)
  raw_data = load_data(raw_data_names, data_folder)
  raw_data = load_data(raw_data_names, data_folder)
  raw_data = load_data(raw_data_names, data_folder)
  raw_data = load_data(raw_data_names, data_folder)


                dates_removed  duplicates_removed  missing_data_removed  Total
pit1_data-2022              1                  12                   267    280
pit2_data-2022              1                  24                  4348   4373
pit3_data-2022              1                  12                   167    180
pit4_data-2022              1                  12                   125    138
pit1_data-2023            287                  15                   279    581
pit2_data-2023           3248               25552                  7215  36015
pit3_data-2023            287                  88                   279    654
pit4_data-2023            287                 196                  1143   1626


  clean_data = load_data(clean_data_names, data_folder)
  result = getattr(ufunc, method)(*inputs, **kwargs)
  result = getattr(ufunc, method)(*inputs, **kwargs)
  clean_data = load_data(clean_data_names, data_folder)


               dates_removed  duplicates_removed  missing_data_removed  Total
VII_PIT1_2022              1                 192                     0    193
VII_PIT2_2022              1                  24                     0     25
VII_PIT3_2022              1                  12                     0     13
VII_PIT4_2022              1                  12                     0     13


  clean_data = load_data(clean_data_names, data_folder)


### Check data

In [5]:
# Preview the first rows of one processed raw frame as a sanity check.
raw_data['pit1_data-2022'].head()

Unnamed: 0,TIMESTAMP,RECORD,Redox_Avg(1),Redox_Avg(2),Redox_Avg(3),Redox_Avg(4),Redox_Avg(5),CCVWC_Avg(1),Temp_T12_Avg(1),EC_Avg(1),...,WC2,WC3,WC4,WC5,pit_number,log_redox(1),log_redox(2),log_redox(3),log_redox(4),log_redox(5)
0,2022-04-12 09:00:00,31570,138,301,176,84,61,2404,0.2,79,...,0.393623,0.435904,0.45879,0.504951,1,4.927254,5.70711,5.170484,4.430817,4.110874
1,2022-04-12 09:05:00,31571,138,301,176,84,61,2405,0.2,79,...,0.393623,0.435904,0.458402,0.504563,1,4.927254,5.70711,5.170484,4.430817,4.110874
2,2022-04-12 09:10:00,31572,138,301,176,84,61,2405,0.2,80,...,0.393623,0.435904,0.458402,0.504951,1,4.927254,5.70711,5.170484,4.430817,4.110874
3,2022-04-12 09:15:00,31573,138,302,175,84,61,2405,0.2,80,...,0.394011,0.435904,0.45879,0.504951,1,4.927254,5.710427,5.164786,4.430817,4.110874
4,2022-04-12 09:20:00,31574,138,301,175,84,61,2405,0.2,80,...,0.393623,0.435904,0.45879,0.504951,1,4.927254,5.70711,5.164786,4.430817,4.110874


### Dtypes in each dataframe

    NOTE: The dtypes of the cleaned data have not been changed yet. We still need to decide how to handle missing values in the columns that should be converted from the original float64 to int64.

In [6]:
# Tab-separated dtype table: a header of column positions, then one numbered row per frame.
column_positions = '\t'.join(str(pos) for pos in range(2, 49))
print('\t\t\t'+'1\t\t'+column_positions)

all_frames = [*raw_data.items(), *clean_data.items()]
for row_no, (frame_name, frame) in enumerate(all_frames, start=1):
    dtype_cells = '\t'.join(str(dtype) for dtype in frame.dtypes.array)
    print(f'{frame_name}\t{row_no}\t'+dtype_cells)

			1		2	3	4	5	6	7	8	9	10	11	12	13	14	15	16	17	18	19	20	21	22	23	24	25	26	27	28	29	30	31	32	33	34	35	36	37	38	39	40	41	42	43	44	45	46	47	48
pit1_data-2022	1	datetime64[ns]	int64	int64	int64	int64	int64	int64	int64	float64	int64	int64	float64	int64	int64	float64	int64	int64	float64	int64	int64	float64	int64	float64	float64	float64	float64	int64	float64	float64	float64	float64	float64	float64	float64	float64	float64	int64	bool	bool	int64	float64	float64	float64	float64	float64	float64	float64	int64	float64	float64	float64	float64	float64
pit2_data-2022	2	datetime64[ns]	int64	int64	int64	int64	int64	int64	int64	float64	int64	int64	float64	int64	int64	float64	int64	int64	float64	int64	int64	float64	int64	float64	float64	float64	float64	int64	float64	float64	float64	float64	float64	float64	float64	float64	float64	int64	bool	bool	int64	float64	float64	float64	float64	float64	float64	float64	int64	float64	float64	float64	float64	float64
pit3_data-2022	3	datetime64[ns]	int64	int64	int64	int64	i

### Remove rows from the raw data whose timestamps are not present in the cleaned data

In [7]:
# Keep only the raw rows whose TIMESTAMP also occurs in the paired cleaned frame.
for raw_data_name, clean_data_name in df_pairs:
    if not clean_data_name:
        # No cleaned counterpart, so there is nothing to align against.
        continue

    print(f'Checking {raw_data_name} vs {clean_data_name}')
    raw_frame = raw_data[raw_data_name]
    clean_timestamps = clean_data[clean_data_name]['TIMESTAMP'].to_numpy()
    prev_row_count = len(raw_frame)

    in_cleaned = raw_frame['TIMESTAMP'].isin(clean_timestamps)
    raw_data[raw_data_name] = raw_frame.loc[in_cleaned]
    print(f'\t Rows removed {prev_row_count-len(raw_data[raw_data_name])}')

Checking pit1_data-2022 vs VII_PIT1_2022
	 Rows removed 0
Checking pit2_data-2022 vs VII_PIT2_2022
	 Rows removed 0
Checking pit3_data-2022 vs VII_PIT3_2022
	 Rows removed 0
Checking pit4_data-2022 vs VII_PIT4_2022
	 Rows removed 1916


### Add redox error flag columns and merge them with raw data

In [60]:
# One frame per entry of df_pairs, each carrying the Redox_error_flag(1..5) columns.
merged_dfs = merge_raw_cleaned(raw_data, clean_data, df_pairs)

### Combine all raw data

In [61]:
# Stack all merged frames with a single concat rather than growing a frame inside a loop.
# An empty list still yields an empty DataFrame, as the loop version did.
raw = pd.concat(merged_dfs) if merged_dfs else pd.DataFrame()

# NOTE(review): data_folder already ends with '/', so this path contains '//' - harmless
# on common file systems, but worth normalising together with data_folder.
training_folder_path = f'{data_folder}/Training/'
raw.to_csv(f'{training_folder_path}Raw_training_data_full.csv')