In [10]:
import re
import pandas as pd
from datetime import datetime

In [19]:
# Read the processed dataset and cast MEAS_DT to a nanosecond-resolution datetime column.
df = pd.read_csv("data/df_hack_final_processed.csv").astype({'MEAS_DT': 'datetime64[ns]'})

In [None]:
def get_dataframe_per_fm(df, i, j):
    """Select the columns of ``df`` that belong to the unit identified by ``i.j``.

    Three groups of columns are kept, in this order:

    1. columns whose name contains no digit at all;
    2. columns whose name contains the identifier ``"<i>.<j>"``;
    3. columns whose name ends with ``"_<i>"``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with string column names.
    i, j : int or str
        The two parts of the identifier. They are matched literally.

    Returns
    -------
    pandas.DataFrame
        ``df`` restricted to the selected columns. A column that satisfies
        more than one rule appears only once, at its first position.
    """
    first = re.escape(str(i))
    second = re.escape(str(j))

    # Digit guards on both sides keep "1.1" from matching inside "11.1" or "1.12".
    id_pattern = re.compile(rf'(?<!\d){first}\.{second}(?!\d)')
    suffix_pattern = re.compile(rf'_{first}$')

    columns = df.columns.tolist()
    non_digit_features = [col for col in columns if not re.search(r'\d', col)]
    id_columns = [col for col in columns if id_pattern.search(col)]
    suffix_columns = [col for col in columns if suffix_pattern.search(col)]

    # A name can satisfy both the "<i>.<j>" and the "_<i>" rule; dict.fromkeys
    # removes the repeat while preserving first-seen order, so the result never
    # carries duplicated columns.
    selected = list(dict.fromkeys(non_digit_features + id_columns + suffix_columns))
    return df[selected]

# One sub-frame per "fm<i>.<j>" key; the second index varies slowest, so the
# keys come out as fm1.1 ... fm6.1 followed by fm1.2 ... fm6.2.
dfd = {
    f"fm{first_idx}.{second_idx}": get_dataframe_per_fm(df, first_idx, second_idx)
    for second_idx in range(1, 3)
    for first_idx in range(1, 7)
}

In [21]:
# Read the test file and cast MEAS_DT to a nanosecond-resolution datetime column.
test_dataframe = pd.read_csv("data/test.csv").astype({'MEAS_DT': 'datetime64[ns]'})

In [22]:
# Same "fm<i>.<j>" partitioning as for the training data, applied to the test frame.
dfd_test = {
    f"fm{first_idx}.{second_idx}": get_dataframe_per_fm(test_dataframe, first_idx, second_idx)
    for second_idx in range(1, 3)
    for first_idx in range(1, 7)
}

In [23]:
def train_test_split(train_, test_):
    """Split ``train_`` into training and test rows using the timestamps in ``test_``.

    Parameters
    ----------
    train_ : pandas.DataFrame
        Full data set; must contain a ``MEAS_DT`` column.
    test_ : pandas.DataFrame
        Frame whose ``MEAS_DT`` values mark the rows held out for testing.

    Returns
    -------
    dict
        ``"train"``: the rows of ``train_`` whose ``MEAS_DT`` does not occur
        in ``test_``. Columns, dtypes and index are those of ``train_``.
        ``"test"``: inner merge of ``test_`` with ``train_`` on ``MEAS_DT``.
        Columns present in both frames get the suffix ``_y`` (value from
        ``test_``) and ``_X`` (value from ``train_``).
    """
    # Filter train_ directly instead of concatenating both frames and dropping
    # every duplicated timestamp: that approach also kept rows that exist only
    # in test_ and discarded training rows whose timestamp merely repeats
    # inside train_.
    held_out = train_['MEAS_DT'].isin(test_['MEAS_DT'])
    train = train_.loc[~held_out]

    test = pd.merge(test_, train_, how='inner', on='MEAS_DT', suffixes=('_y', '_X'))
    return {
        "train": train,
        "test": test,
    }

In [24]:
# Pair every per-key training frame with the test frame stored under the same key.
dfd_tt = {
    fm_key: train_test_split(dfd[fm_key], dfd_test[fm_key])
    for fm_key in dfd
}

## CatBoost

In [29]:
from catboost import CatBoostRegressor, Pool
from sklearn.metrics import mean_absolute_error
import numpy as np

In [30]:
from collections import defaultdict

In [31]:
# Fitted regressors, keyed by "<machine>:<target column>".
models = {}
# Test-set error per target column. Plain dicts: a defaultdict created without
# a default factory behaves exactly like a dict, so the extra type only misleads.
mae_values = {}
mape_values = {}

In [32]:
def mean_absolute_percentage_error(y_true, y_pred):
    """Return the mean absolute percentage error, expressed in percent.

    Computes ``mean(|y_true - y_pred| / |y_true|) * 100``. Both arguments must
    support element-wise arithmetic (e.g. NumPy arrays). Entries of ``y_true``
    equal to zero are divided by as-is; no special handling is applied.
    """
    relative_error = (y_true - y_pred) / y_true
    return np.mean(np.abs(relative_error)) * 100

In [41]:
# List the "fm<i>.<j>" keys of the train/test split dictionary.
dfd_tt.keys()

dict_keys(['fm1.1', 'fm2.1', 'fm3.1', 'fm4.1', 'fm5.1', 'fm6.1', 'fm1.2', 'fm2.2', 'fm3.2', 'fm4.2', 'fm5.2', 'fm6.2'])

In [None]:
# Train one CatBoost regressor per (machine key, target column) pair, print its
# train/test MAE and MAPE, and store the model and the test errors.
for floatmachine_name, split in dfd_tt.items():

    train = split['train']
    test = split['test']

    # Targets are the columns shared by both frames of the split: the merge in
    # train_test_split renamed them to "<name>_y" and "<name>_X". Matching the
    # suffix pair exactly (instead of testing for "_y" anywhere in the name)
    # keeps a column with "_y" in the middle of its name from being mistaken
    # for a target and mangled by the suffix removal.
    merged_columns = set(test.columns)
    output_features = [
        col[:-len("_y")]
        for col in test.columns
        if col.endswith("_y") and col[:-len("_y")] + "_X" in merged_columns
    ]
    train = train.dropna(subset=output_features)

    # The input columns are identical for every target of this machine, so the
    # list is built once. MEAS_DT, every column whose name contains "min" or
    # "max", and every target column are left out of the inputs.
    target_names = set(output_features)
    features_to_train_on = [
        f for f in train.columns.tolist()
        if f != 'MEAS_DT' and "min" not in f and "max" not in f and f not in target_names
    ]

    for output_feature in output_features:
        print(f"training {floatmachine_name} : {output_feature}...")

        # In the test frame the label taken from the training data carries the "_X" suffix.
        test_label = test[output_feature + "_X"]

        train_pool = Pool(train[features_to_train_on], label=train[output_feature])
        test_pool = Pool(test[features_to_train_on], label=test_label)

        model = CatBoostRegressor()

        # NOTE(review): the test split doubles as eval_set with use_best_model=True,
        # so the iteration count is chosen on the same rows the test metrics are
        # computed on; the reported test errors are therefore optimistic.
        # Consider a separate validation split.
        model.fit(train_pool, eval_set=test_pool, use_best_model=True, plot=False, silent=True)

        train_predictions = model.predict(train_pool)
        test_predictions = model.predict(test_pool)

        train_mae = mean_absolute_error(train[output_feature], train_predictions)
        test_mae = mean_absolute_error(test_label, test_predictions)

        train_mape = mean_absolute_percentage_error(train[output_feature], train_predictions)
        test_mape = mean_absolute_percentage_error(test_label, test_predictions)

        mae_values[output_feature] = test_mae
        mape_values[output_feature] = test_mape

        print(f"Metrics for {floatmachine_name} : {output_feature}")
        print(f"Train MAE: {round(train_mae, 4)}, Train MAPE: {round(train_mape, 4)}%")
        print(f"Test MAE: {round(test_mae, 4)}, Test MAPE: {round(test_mape, 4)}%")
        print("----------------------------------//----------------------------------")

        models[f"{floatmachine_name}:{output_feature}"] = model

training fm1.1 : Ni_1.1C_min...
Metrics for fm1.1 : Ni_1.1C_min
Train MAE: 0.0831, Train MAPE: 3.1357%
Test MAE: 0.074, Test MAPE: 2.7001%
----------------------------------//----------------------------------
training fm1.1 : Ni_1.1C_max...
Metrics for fm1.1 : Ni_1.1C_max
Train MAE: 0.0818, Train MAPE: 2.6839%
Test MAE: 0.0813, Test MAPE: 2.6651%
----------------------------------//----------------------------------
training fm1.1 : Cu_1.1C_min...
Metrics for fm1.1 : Cu_1.1C_min
Train MAE: 0.0436, Train MAPE: 0.9841%
Test MAE: 0.0472, Test MAPE: 1.0607%
----------------------------------//----------------------------------
training fm1.1 : Cu_1.1C_max...
Metrics for fm1.1 : Cu_1.1C_max
Train MAE: 0.049, Train MAPE: 1.0463%
Test MAE: 0.0536, Test MAPE: 1.1567%
----------------------------------//----------------------------------
training fm2.1 : Cu_2.1T_min...
Metrics for fm2.1 : Cu_2.1T_min
Train MAE: 0.0048, Train MAPE: 1.2941%
Test MAE: 0.0052, Test MAPE: 1.5664%
------------------

In [34]:
# Average the stored per-target test errors and report them rounded to 4 decimals.
mae_list = list(mae_values.values())
mape_list = list(mape_values.values())

mean_mae = sum(mae_list) / len(mae_list)
mean_mape = sum(mape_list) / len(mape_list)

print(f"Mean MAE {round(mean_mae, 4)}")
print(f"Mean MAPE {round(mean_mape, 4)}%")

Mean MAE 0.061
Mean MAPE 2.5243%
