In [17]:
import re
import pandas as pd
from datetime import datetime

In [48]:
import warnings
warnings.filterwarnings("ignore")

In [18]:
df = pd.read_csv("data/df_hack_final_processed.csv")
df = df.astype({'MEAS_DT': 'datetime64[ns]'})

In [19]:
interaction_graph = {
    "fm1" : [],
    "fm2" : ["Cu_1.1C", "Ni_1.1C", "Cu_1.2C", "Ni_1.2C"],
    "fm3" : ["Cu_2.1C", "Ni_2.1C", "Cu_2.2C", "Ni_2.2C"],
    "fm4" : ["Cu_2.1C", "Ni_2.1C", "Cu_2.2C", "Ni_2.2C", "Cu_3.1C", "Ni_3.1C", "Cu_3.2C", "Ni_3.2C", "Ni_6.1C", "Ni_6.2C"],
    "fm5" : ["Ni_4.1C", "Ni_4.1T"],
    "fm6" : ["Ni_5.1C", "Ni_5.1T", "Ni_5.2C", "Ni_5.2T"]
    }

In [20]:
def get_dataframe_per_fm(df, i, j, train=True):
    non_digit_features = [col for col in df.columns.tolist() if not re.findall(r'\d+', col)]
    filtered_columns_1 = [col for col in df.columns.tolist() if re.findall(f'{i}\\.{j}', col)]
    filtered_columns_2 = [col for col in df.columns.tolist() if re.findall(f'_{i}$', col)]
    add_features = []
    if train:
        add_features = interaction_graph[f'fm{i}']

    return df[non_digit_features+filtered_columns_1+filtered_columns_2 + add_features]

dfd = {f"fm{i}.{j}" : get_dataframe_per_fm(df, i, j) for j in range(1,3) for i in range(1,7)}

In [21]:
test_dataframe = pd.read_csv("data/test.csv")
test_dataframe = test_dataframe.astype({'MEAS_DT': 'datetime64[ns]'})

In [22]:
dfd_test = {f"fm{i}.{j}" : get_dataframe_per_fm(test_dataframe, i, j, False) for j in range(1, 3) for i in range(1, 7)}

In [23]:
def train_test_split(train_, test_):
    train = pd.concat([train_, test_]).drop_duplicates(subset='MEAS_DT', keep=False)
    test = pd.merge(test_, train_, how='inner', on='MEAS_DT', suffixes=('_y', '_X'))
    return {
        "train" : train, 
        "test" : test
        }

In [24]:
dfd_tt = {k : train_test_split(df_train_fm, dfd_test[k]) for k, df_train_fm in dfd.items()}

## CatBoost and Prophet as an Ensemble

Самая базовая идея с предсказыванием временных рядов. Сделать ансамбль усреднив ответы двух моделей, либо понять где например работает лучше одна, а где другая.

In [25]:
from catboost import CatBoostRegressor, Pool
from sklearn.metrics import mean_absolute_error
import numpy as np
from prophet import Prophet

In [26]:
from collections import defaultdict

In [27]:
boosting_models = {}
prophet_models = {}
mae_values = defaultdict()
mape_values = defaultdict()

In [28]:
def mean_absolute_percentage_error(y_true, y_pred):
    return np.mean(np.abs((y_true - y_pred) / y_true)) * 100

In [None]:
def train_prophet(train, test, output_features):
    """
    следующий способ:
    сделать 1 модель на концентрацию.
    """
    # output_features = list(set([feature.replace("_min", "").replace("_max", "") for feature in output_features]))
    models = {}
    predictions = {}
    for feature in output_features:
        print(feature)
        train_ = train[['MEAS_DT', feature]]
        train_.columns = ['ds', 'y']
        m = Prophet().fit(train_)
        future = m.make_future_dataframe(periods=len(test), freq='H', include_history=False)
        forecast = m.predict(future)
        models[feature] = m
        predictions[feature] = forecast['yhat'].values
        # predictions[feature] = {
        #     'yhat': forecast['yhat'].values,
        #     'yhat_lower': forecast['yhat_lower'].values,
        #     'yhat_upper': forecast['yhat_upper'].values
        # }
    return models, predictions

def train_catboost(train, test, output_features):
    models = {}
    for output_feature in output_features:
        features_to_train_on = train.columns.tolist()
        features_to_train_on.remove('MEAS_DT')
        features_to_train_on = [f for f in features_to_train_on if not "min" in f and "max" not in f]

        train_pool = Pool(train[features_to_train_on], label=train[output_feature])
        test_pool = Pool(test[features_to_train_on], label=test[output_feature + "_X"])

        model = CatBoostRegressor(loss_function='MAE')
        model.fit(train_pool, eval_set=test_pool, use_best_model=True, plot=False, silent=True)
        models[output_feature] = model
        print(f"catboost trained {output_feature}")
    return models

In [None]:
def ensemble_predictions(prophet_predictions, catboost_models, train, test, output_features, weight_prophet=0.5):
    ensemble_predictions = {}
    for output_feature in output_features:
        prophet_yhat = prophet_predictions[output_feature]

        features_to_train_on = train.columns.tolist()
        features_to_train_on.remove('MEAS_DT')
        features_to_train_on = [f for f in features_to_train_on if not "min" in f and "max" not in f]
        test_pool = Pool(test[features_to_train_on])
        catboost_predictions = catboost_models[output_feature].predict(test_pool)

        ensemble_predictions[output_feature] = (prophet_yhat + catboost_predictions) / 2
    return ensemble_predictions

In [61]:
for floatmachine_name in dfd_tt.keys():
    train = dfd_tt[floatmachine_name]['train']
    test = dfd_tt[floatmachine_name]['test']

    output_features = [f.replace("_y", "") for f in test.columns.tolist() if "_y" in f]
    train = train.dropna(subset=output_features)

    prophet_models, prophet_predictions = train_prophet(train, test, output_features)

    catboost_models = train_catboost(train, test, output_features)

    ensemble_preds = ensemble_predictions(prophet_predictions, catboost_models, train, test, output_features)

    for output_feature in output_features:
        test_mae = mean_absolute_error(test[output_feature + "_X"], ensemble_preds[output_feature])
        test_mape = mean_absolute_percentage_error(test[output_feature + "_X"], ensemble_preds[output_feature])
        
        mae_values[output_feature] =  test_mae
        mape_values[output_feature] =  test_mape
        
        print(f"Metrics for {floatmachine_name} : {output_feature}")
        print(f"Test MAE: {round(test_mae, 4)}, Test MAPE: {round(test_mape, 4)}%")
        print("----------------------------------//----------------------------------")

Ni_1.1C_min


08:06:09 - cmdstanpy - INFO - Chain [1] start processing
08:06:14 - cmdstanpy - INFO - Chain [1] done processing


Ni_1.1C_max


08:06:15 - cmdstanpy - INFO - Chain [1] start processing
08:06:21 - cmdstanpy - INFO - Chain [1] done processing


Cu_1.1C_min


08:06:22 - cmdstanpy - INFO - Chain [1] start processing
08:06:27 - cmdstanpy - INFO - Chain [1] done processing


Cu_1.1C_max


08:06:29 - cmdstanpy - INFO - Chain [1] start processing
08:06:32 - cmdstanpy - INFO - Chain [1] done processing


catboost trained Ni_1.1C_min
catboost trained Ni_1.1C_max
catboost trained Cu_1.1C_min
catboost trained Cu_1.1C_max
(6740,)
(6740,)
(6740,)
(6740,)
Metrics for fm1.1 : Ni_1.1C_min
Test MAE: 0.135, Test MAPE: 4.8947%
----------------------------------//----------------------------------
Metrics for fm1.1 : Ni_1.1C_max
Test MAE: 0.262, Test MAPE: 8.2397%
----------------------------------//----------------------------------
Metrics for fm1.1 : Cu_1.1C_min
Test MAE: 0.2831, Test MAPE: 6.2573%
----------------------------------//----------------------------------
Metrics for fm1.1 : Cu_1.1C_max
Test MAE: 0.1733, Test MAPE: 3.8311%
----------------------------------//----------------------------------
Cu_2.1T_min


08:06:42 - cmdstanpy - INFO - Chain [1] start processing
08:06:46 - cmdstanpy - INFO - Chain [1] done processing


Cu_2.1T_max


08:06:47 - cmdstanpy - INFO - Chain [1] start processing
08:06:52 - cmdstanpy - INFO - Chain [1] done processing


catboost trained Cu_2.1T_min
catboost trained Cu_2.1T_max
(6740,)
(6740,)
Metrics for fm2.1 : Cu_2.1T_min
Test MAE: 0.0845, Test MAPE: 27.1503%
----------------------------------//----------------------------------
Metrics for fm2.1 : Cu_2.1T_max
Test MAE: 0.0869, Test MAPE: 26.2145%
----------------------------------//----------------------------------
Cu_3.1T_min


08:06:58 - cmdstanpy - INFO - Chain [1] start processing
08:07:03 - cmdstanpy - INFO - Chain [1] done processing


Cu_3.1T_max


08:07:04 - cmdstanpy - INFO - Chain [1] start processing
08:07:06 - cmdstanpy - INFO - Chain [1] done processing


catboost trained Cu_3.1T_min
catboost trained Cu_3.1T_max
(6740,)
(6740,)
Metrics for fm3.1 : Cu_3.1T_min
Test MAE: 0.3739, Test MAPE: 36.7118%
----------------------------------//----------------------------------
Metrics for fm3.1 : Cu_3.1T_max
Test MAE: 0.3823, Test MAPE: 20.1664%
----------------------------------//----------------------------------
Ni_4.1T_min


08:07:12 - cmdstanpy - INFO - Chain [1] start processing
08:07:17 - cmdstanpy - INFO - Chain [1] done processing


Ni_4.1T_max


08:07:18 - cmdstanpy - INFO - Chain [1] start processing
08:07:23 - cmdstanpy - INFO - Chain [1] done processing


Ni_4.1C_min


08:07:24 - cmdstanpy - INFO - Chain [1] start processing
08:07:29 - cmdstanpy - INFO - Chain [1] done processing


Ni_4.1C_max


08:07:30 - cmdstanpy - INFO - Chain [1] start processing
08:07:35 - cmdstanpy - INFO - Chain [1] done processing


catboost trained Ni_4.1T_min
catboost trained Ni_4.1T_max
catboost trained Ni_4.1C_min
catboost trained Ni_4.1C_max
(6740,)
(6740,)
(6740,)
(6740,)
Metrics for fm4.1 : Ni_4.1T_min
Test MAE: 0.1024, Test MAPE: 13.98%
----------------------------------//----------------------------------
Metrics for fm4.1 : Ni_4.1T_max
Test MAE: 0.106, Test MAPE: 13.8808%
----------------------------------//----------------------------------
Metrics for fm4.1 : Ni_4.1C_min
Test MAE: 0.4799, Test MAPE: 10.1226%
----------------------------------//----------------------------------
Metrics for fm4.1 : Ni_4.1C_max
Test MAE: 0.5281, Test MAPE: 10.693%
----------------------------------//----------------------------------
Ni_5.1T_min


08:07:46 - cmdstanpy - INFO - Chain [1] start processing
08:07:48 - cmdstanpy - INFO - Chain [1] done processing


Ni_5.1T_max


08:07:49 - cmdstanpy - INFO - Chain [1] start processing
08:07:52 - cmdstanpy - INFO - Chain [1] done processing


Ni_5.1C_min


08:07:53 - cmdstanpy - INFO - Chain [1] start processing
08:07:57 - cmdstanpy - INFO - Chain [1] done processing


Ni_5.1C_max


08:07:58 - cmdstanpy - INFO - Chain [1] start processing
08:08:02 - cmdstanpy - INFO - Chain [1] done processing


catboost trained Ni_5.1T_min
catboost trained Ni_5.1T_max
catboost trained Ni_5.1C_min
catboost trained Ni_5.1C_max
(6740,)
(6740,)
(6740,)
(6740,)
Metrics for fm5.1 : Ni_5.1T_min
Test MAE: 0.0956, Test MAPE: 13.8169%
----------------------------------//----------------------------------
Metrics for fm5.1 : Ni_5.1T_max
Test MAE: 0.104, Test MAPE: 14.3625%
----------------------------------//----------------------------------
Metrics for fm5.1 : Ni_5.1C_min
Test MAE: 0.4471, Test MAPE: 6.7529%
----------------------------------//----------------------------------
Metrics for fm5.1 : Ni_5.1C_max
Test MAE: 0.1745, Test MAPE: 2.5194%
----------------------------------//----------------------------------
Ni_6.1T_min


08:08:14 - cmdstanpy - INFO - Chain [1] start processing
08:08:18 - cmdstanpy - INFO - Chain [1] done processing


Ni_6.1T_max


08:08:19 - cmdstanpy - INFO - Chain [1] start processing
08:08:25 - cmdstanpy - INFO - Chain [1] done processing


Ni_6.1C_min


08:08:26 - cmdstanpy - INFO - Chain [1] start processing
08:08:31 - cmdstanpy - INFO - Chain [1] done processing


Ni_6.1C_max


08:08:32 - cmdstanpy - INFO - Chain [1] start processing
08:08:37 - cmdstanpy - INFO - Chain [1] done processing


catboost trained Ni_6.1T_min
catboost trained Ni_6.1T_max
catboost trained Ni_6.1C_min
catboost trained Ni_6.1C_max
(6740,)
(6740,)
(6740,)
(6740,)
Metrics for fm6.1 : Ni_6.1T_min
Test MAE: 0.2029, Test MAPE: 15.2046%
----------------------------------//----------------------------------
Metrics for fm6.1 : Ni_6.1T_max
Test MAE: 0.1819, Test MAPE: 13.2664%
----------------------------------//----------------------------------
Metrics for fm6.1 : Ni_6.1C_min
Test MAE: 0.3363, Test MAPE: 3.6326%
----------------------------------//----------------------------------
Metrics for fm6.1 : Ni_6.1C_max
Test MAE: 0.3988, Test MAPE: 4.1593%
----------------------------------//----------------------------------
Ni_1.2C_min


08:08:47 - cmdstanpy - INFO - Chain [1] start processing
08:08:51 - cmdstanpy - INFO - Chain [1] done processing


Ni_1.2C_max


08:08:52 - cmdstanpy - INFO - Chain [1] start processing
08:08:57 - cmdstanpy - INFO - Chain [1] done processing


Cu_1.2C_min


08:08:58 - cmdstanpy - INFO - Chain [1] start processing
08:09:03 - cmdstanpy - INFO - Chain [1] done processing


Cu_1.2C_max


08:09:04 - cmdstanpy - INFO - Chain [1] start processing
08:09:10 - cmdstanpy - INFO - Chain [1] done processing


catboost trained Ni_1.2C_min
catboost trained Ni_1.2C_max
catboost trained Cu_1.2C_min
catboost trained Cu_1.2C_max
(6740,)
(6740,)
(6740,)
(6740,)
Metrics for fm1.2 : Ni_1.2C_min
Test MAE: 0.3656, Test MAPE: 13.2996%
----------------------------------//----------------------------------
Metrics for fm1.2 : Ni_1.2C_max
Test MAE: 0.3885, Test MAPE: 12.9046%
----------------------------------//----------------------------------
Metrics for fm1.2 : Cu_1.2C_min
Test MAE: 0.4636, Test MAPE: 9.9606%
----------------------------------//----------------------------------
Metrics for fm1.2 : Cu_1.2C_max
Test MAE: 0.8669, Test MAPE: 17.5098%
----------------------------------//----------------------------------
Cu_2.2T_min


08:09:19 - cmdstanpy - INFO - Chain [1] start processing
08:09:26 - cmdstanpy - INFO - Chain [1] done processing


Cu_2.2T_max


08:09:27 - cmdstanpy - INFO - Chain [1] start processing
08:09:32 - cmdstanpy - INFO - Chain [1] done processing


catboost trained Cu_2.2T_min
catboost trained Cu_2.2T_max
(6740,)
(6740,)
Metrics for fm2.2 : Cu_2.2T_min
Test MAE: 0.0402, Test MAPE: 8.8731%
----------------------------------//----------------------------------
Metrics for fm2.2 : Cu_2.2T_max
Test MAE: 0.0396, Test MAPE: 8.2818%
----------------------------------//----------------------------------
Cu_3.2T_min


08:09:38 - cmdstanpy - INFO - Chain [1] start processing
08:09:42 - cmdstanpy - INFO - Chain [1] done processing


Cu_3.2T_max


08:09:44 - cmdstanpy - INFO - Chain [1] start processing
08:09:47 - cmdstanpy - INFO - Chain [1] done processing


catboost trained Cu_3.2T_min
catboost trained Cu_3.2T_max
(6740,)
(6740,)
Metrics for fm3.2 : Cu_3.2T_min
Test MAE: 0.1538, Test MAPE: 13.9041%
----------------------------------//----------------------------------
Metrics for fm3.2 : Cu_3.2T_max
Test MAE: 0.2287, Test MAPE: 14.7755%
----------------------------------//----------------------------------
Ni_4.2T_min


08:09:53 - cmdstanpy - INFO - Chain [1] start processing
08:09:56 - cmdstanpy - INFO - Chain [1] done processing


Ni_4.2T_max


08:09:57 - cmdstanpy - INFO - Chain [1] start processing
08:10:00 - cmdstanpy - INFO - Chain [1] done processing


Ni_4.2C_min


08:10:01 - cmdstanpy - INFO - Chain [1] start processing
08:10:06 - cmdstanpy - INFO - Chain [1] done processing


Ni_4.2C_max


08:10:07 - cmdstanpy - INFO - Chain [1] start processing
08:10:14 - cmdstanpy - INFO - Chain [1] done processing


catboost trained Ni_4.2T_min
catboost trained Ni_4.2T_max
catboost trained Ni_4.2C_min
catboost trained Ni_4.2C_max
(6740,)
(6740,)
(6740,)
(6740,)
Metrics for fm4.2 : Ni_4.2T_min
Test MAE: 0.0608, Test MAPE: 7.9771%
----------------------------------//----------------------------------
Metrics for fm4.2 : Ni_4.2T_max
Test MAE: 0.0768, Test MAPE: 9.2245%
----------------------------------//----------------------------------
Metrics for fm4.2 : Ni_4.2C_min
Test MAE: 0.4494, Test MAPE: 11.3246%
----------------------------------//----------------------------------
Metrics for fm4.2 : Ni_4.2C_max
Test MAE: 0.3363, Test MAPE: 8.0155%
----------------------------------//----------------------------------
Ni_5.2T_min


08:10:25 - cmdstanpy - INFO - Chain [1] start processing
08:10:28 - cmdstanpy - INFO - Chain [1] done processing


Ni_5.2T_max


08:10:30 - cmdstanpy - INFO - Chain [1] start processing
08:10:33 - cmdstanpy - INFO - Chain [1] done processing


Ni_5.2C_min


08:10:34 - cmdstanpy - INFO - Chain [1] start processing
08:10:39 - cmdstanpy - INFO - Chain [1] done processing


Ni_5.2C_max


08:10:40 - cmdstanpy - INFO - Chain [1] start processing
08:10:46 - cmdstanpy - INFO - Chain [1] done processing


catboost trained Ni_5.2T_min
catboost trained Ni_5.2T_max
catboost trained Ni_5.2C_min
catboost trained Ni_5.2C_max
(6740,)
(6740,)
(6740,)
(6740,)
Metrics for fm5.2 : Ni_5.2T_min
Test MAE: 0.1069, Test MAPE: 15.3785%
----------------------------------//----------------------------------
Metrics for fm5.2 : Ni_5.2T_max
Test MAE: 0.111, Test MAPE: 15.2005%
----------------------------------//----------------------------------
Metrics for fm5.2 : Ni_5.2C_min
Test MAE: 0.265, Test MAPE: 3.7501%
----------------------------------//----------------------------------
Metrics for fm5.2 : Ni_5.2C_max
Test MAE: 0.2854, Test MAPE: 3.8012%
----------------------------------//----------------------------------
Ni_6.2T_min


08:10:55 - cmdstanpy - INFO - Chain [1] start processing
08:11:01 - cmdstanpy - INFO - Chain [1] done processing


Ni_6.2T_max


08:11:02 - cmdstanpy - INFO - Chain [1] start processing
08:11:07 - cmdstanpy - INFO - Chain [1] done processing


Ni_6.2C_min


08:11:08 - cmdstanpy - INFO - Chain [1] start processing
08:11:13 - cmdstanpy - INFO - Chain [1] done processing


Ni_6.2C_max


08:11:14 - cmdstanpy - INFO - Chain [1] start processing
08:11:18 - cmdstanpy - INFO - Chain [1] done processing


catboost trained Ni_6.2T_min
catboost trained Ni_6.2T_max
catboost trained Ni_6.2C_min
catboost trained Ni_6.2C_max
(6740,)
(6740,)
(6740,)
(6740,)
Metrics for fm6.2 : Ni_6.2T_min
Test MAE: 0.1619, Test MAPE: 14.2272%
----------------------------------//----------------------------------
Metrics for fm6.2 : Ni_6.2T_max
Test MAE: 0.1346, Test MAPE: 10.6873%
----------------------------------//----------------------------------
Metrics for fm6.2 : Ni_6.2C_min
Test MAE: 0.1457, Test MAPE: 1.5719%
----------------------------------//----------------------------------
Metrics for fm6.2 : Ni_6.2C_max
Test MAE: 0.1688, Test MAPE: 1.7662%
----------------------------------//----------------------------------


In [62]:
mae_list = [v for _,v in mae_values.items()]
mape_list = [v for _,v in mape_values.items()]

print(f"Mean MAE {round(sum(mae_list) / len(mae_list), 4)}")
print(f"Mean MAPE {round(sum(mape_list) / len(mape_list), 4)}%")

Mean MAE 0.2447
Mean MAPE 11.4573%
