In [17]:
import re
import pandas as pd
from datetime import datetime

In [48]:
import warnings
warnings.filterwarnings("ignore")

In [18]:
df = pd.read_csv("data/df_hack_final_processed.csv")
df = df.astype({'MEAS_DT': 'datetime64[ns]'})

In [19]:
interaction_graph = {
    "fm1" : [],
    "fm2" : ["Cu_1.1C", "Ni_1.1C", "Cu_1.2C", "Ni_1.2C"],
    "fm3" : ["Cu_2.1C", "Ni_2.1C", "Cu_2.2C", "Ni_2.2C"],
    "fm4" : ["Cu_2.1C", "Ni_2.1C", "Cu_2.2C", "Ni_2.2C", "Cu_3.1C", "Ni_3.1C", "Cu_3.2C", "Ni_3.2C", "Ni_6.1C", "Ni_6.2C"],
    "fm5" : ["Ni_4.1C", "Ni_4.1T"],
    "fm6" : ["Ni_5.1C", "Ni_5.1T", "Ni_5.2C", "Ni_5.2T"]
    }

In [20]:
def get_dataframe_per_fm(df, i, j, train=True):
    non_digit_features = [col for col in df.columns.tolist() if not re.findall(r'\d+', col)]
    filtered_columns_1 = [col for col in df.columns.tolist() if re.findall(f'{i}\\.{j}', col)]
    filtered_columns_2 = [col for col in df.columns.tolist() if re.findall(f'_{i}$', col)]
    add_features = []
    if train:
        add_features = interaction_graph[f'fm{i}']

    return df[non_digit_features+filtered_columns_1+filtered_columns_2 + add_features]

dfd = {f"fm{i}.{j}" : get_dataframe_per_fm(df, i, j) for j in range(1,3) for i in range(1,7)}

In [21]:
test_dataframe = pd.read_csv("data/test.csv")
test_dataframe = test_dataframe.astype({'MEAS_DT': 'datetime64[ns]'})

In [22]:
dfd_test = {f"fm{i}.{j}" : get_dataframe_per_fm(test_dataframe, i, j, False) for j in range(1, 3) for i in range(1, 7)}

In [23]:
def train_test_split(train_, test_):
    train = pd.concat([train_, test_]).drop_duplicates(subset='MEAS_DT', keep=False)
    test = pd.merge(test_, train_, how='inner', on='MEAS_DT', suffixes=('_y', '_X'))
    return {
        "train" : train, 
        "test" : test
        }

In [24]:
dfd_tt = {k : train_test_split(df_train_fm, dfd_test[k]) for k, df_train_fm in dfd.items()}

## CatBoost and Prophet as an Ensemble

Самая базовая идея с предсказыванием временных рядов. Сделать ансамбль усреднив ответы двух моделей, либо понять где например работает лучше одна, а где другая.

In [25]:
from catboost import CatBoostRegressor, Pool
from sklearn.metrics import mean_absolute_error
import numpy as np
from prophet import Prophet

In [26]:
from collections import defaultdict

In [27]:
boosting_models = {}
prophet_models = {}
mae_values = defaultdict()
mape_values = defaultdict()

In [28]:
def mean_absolute_percentage_error(y_true, y_pred):
    return np.mean(np.abs((y_true - y_pred) / y_true)) * 100

In [51]:
output_features

['Ni_1.1C_min', 'Ni_1.1C_max', 'Cu_1.1C_min', 'Cu_1.1C_max']

In [59]:
def train_prophet(train, test, output_features):
    """
    самый простой способ профетом пока предиктить просто min и max
    """
    # output_features = list(set([feature.replace("_min", "").replace("_max", "") for feature in output_features]))
    models = {}
    predictions = {}
    for feature in output_features:
        print(feature)
        train_ = train[['MEAS_DT', feature]]
        train_.columns = ['ds', 'y']
        m = Prophet().fit(train_)
        future = m.make_future_dataframe(periods=len(test), freq='H', include_history=False)
        forecast = m.predict(future)
        models[feature] = m
        predictions[feature] = forecast['yhat'].values
        # predictions[feature] = {
        #     'yhat': forecast['yhat'].values,
        #     'yhat_lower': forecast['yhat_lower'].values,
        #     'yhat_upper': forecast['yhat_upper'].values
        # }
    return models, predictions

def train_catboost(train, test, output_features):
    models = {}
    for output_feature in output_features:
        features_to_train_on = train.columns.tolist()
        features_to_train_on.remove('MEAS_DT')
        features_to_train_on = [f for f in features_to_train_on if not "min" in f and "max" not in f]

        train_pool = Pool(train[features_to_train_on], label=train[output_feature])
        test_pool = Pool(test[features_to_train_on], label=test[output_feature + "_X"])

        model = CatBoostRegressor(loss_function='MAE')
        model.fit(train_pool, eval_set=test_pool, use_best_model=True, plot=False, silent=True)
        models[output_feature] = model
        print(f"catboost trained {output_feature}")
    return models

In [60]:
def ensemble_predictions(prophet_predictions, catboost_models, train, test, output_features, weight_prophet=0.5):
    ensemble_predictions = {}
    for output_feature in output_features:
        prophet_yhat = prophet_predictions[output_feature]

        features_to_train_on = train.columns.tolist()
        features_to_train_on.remove('MEAS_DT')
        features_to_train_on = [f for f in features_to_train_on if not "min" in f and "max" not in f]
        test_pool = Pool(test[features_to_train_on])
        catboost_predictions = catboost_models[output_feature].predict(test_pool)

        print(prophet_yhat.shape)
        ensemble_predictions[output_feature] = (prophet_yhat + catboost_predictions) / 2
    return ensemble_predictions

In [None]:
for floatmachine_name in dfd_tt.keys():
    train = dfd_tt[floatmachine_name]['train']
    test = dfd_tt[floatmachine_name]['test']

    output_features = [f.replace("_y", "") for f in test.columns.tolist() if "_y" in f]
    train = train.dropna(subset=output_features)

    prophet_models, prophet_predictions = train_prophet(train, test, output_features)

    catboost_models = train_catboost(train, test, output_features)

    ensemble_preds = ensemble_predictions(prophet_predictions, catboost_models, train, test, output_features)

    for output_feature in output_features:
        test_mae = mean_absolute_error(test[output_feature + "_X"], ensemble_preds[output_feature])
        test_mape = mean_absolute_percentage_error(test[output_feature + "_X"], ensemble_preds[output_feature])
        
        mae_values[output_feature] =  test_mae
        mape_values[output_feature] =  test_mape
        
        print(f"Metrics for {floatmachine_name} : {output_feature}")
        print(f"Test MAE: {round(test_mae, 4)}, Test MAPE: {round(test_mape, 4)}%")
        print("----------------------------------//----------------------------------")

Ni_1.1C_min


08:06:09 - cmdstanpy - INFO - Chain [1] start processing


In [59]:
mae_list = [v for _,v in mae_values.items()]
mape_list = [v for _,v in mape_values.items()]

print(f"Mean MAE {round(sum(mae_list) / len(mae_list), 4)}")
print(f"Mean MAPE {round(sum(mape_list) / len(mape_list), 4)}%")

Mean MAE 0.0606
Mean MAPE 2.5088%
