In [42]:
import re
import pandas as pd
from datetime import datetime

In [43]:
# Load the processed CSV and cast MEAS_DT to a timestamp dtype in one chained expression.
df = pd.read_csv("data/df_hack_final_processed.csv").astype({'MEAS_DT': 'datetime64[ns]'})

In [44]:
# Extra input columns per "fm<i>" key. get_dataframe_per_fm(..., train=True)
# appends the listed columns to the columns it selects for that fm.
# NOTE(review): presumably these are the upstream streams feeding each machine
# -- confirm against the process flowsheet.
interaction_graph = {
    "fm1": [],
    "fm2": [
        "Cu_1.1C", "Ni_1.1C",
        "Cu_1.2C", "Ni_1.2C",
    ],
    "fm3": [
        "Cu_2.1C", "Ni_2.1C",
        "Cu_2.2C", "Ni_2.2C",
    ],
    "fm4": [
        "Cu_2.1C", "Ni_2.1C",
        "Cu_2.2C", "Ni_2.2C",
        "Cu_3.1C", "Ni_3.1C",
        "Cu_3.2C", "Ni_3.2C",
        "Ni_6.1C", "Ni_6.2C",
    ],
    "fm5": [
        "Ni_4.1C", "Ni_4.1T",
    ],
    "fm6": [
        "Ni_5.1C", "Ni_5.1T",
        "Ni_5.2C", "Ni_5.2T",
    ],
}

In [45]:
def get_dataframe_per_fm(df, i, j, train=True, graph=None):
    """Select the columns of ``df`` that belong to the unit "fm{i}.{j}".

    The returned frame contains, in this order and without repeated labels:

    1. every column whose name contains no digit at all (e.g. ``MEAS_DT``);
    2. every column whose name contains the token ``"{i}.{j}"``; the token must
       not be glued to further digits, so ``i=2, j=1`` matches ``"Cu_2.1C"``
       but not ``"Cu_12.1C"``;
    3. every column whose name ends with ``"_{i}"``;
    4. when ``train`` is true, the extra columns listed under ``"fm{i}"`` in
       ``graph``.

    Parameters
    ----------
    df : pandas.DataFrame
        Source table; column names must be strings.
    i, j : int
        The two indices that form the "i.j" token in column names.
    train : bool, default True
        Whether to append the extra columns from ``graph``.
    graph : dict[str, list[str]] or None, default None
        Mapping ``"fm{i}" -> extra column names``. ``None`` means the
        module-level ``interaction_graph``; it is only looked up when
        ``train`` is true.

    Returns
    -------
    pandas.DataFrame
        ``df`` restricted to the selected columns.

    Raises
    ------
    KeyError
        If ``graph`` has no ``"fm{i}"`` entry or lists a column absent from ``df``.
    """
    columns = df.columns.tolist()

    has_digit = re.compile(r'\d')
    # Digit look-arounds keep "2.1" from matching inside "12.1" or "2.10".
    unit_token = re.compile(rf'(?<!\d){i}\.{j}(?!\d)')
    machine_suffix = re.compile(rf'_{i}$')

    shared_columns = [col for col in columns if not has_digit.search(col)]
    unit_columns = [col for col in columns if unit_token.search(col)]
    machine_columns = [col for col in columns if machine_suffix.search(col)]

    extra_columns = []
    if train:
        if graph is None:
            graph = interaction_graph
        # Copy so the caller's graph is never aliased by the selection list.
        extra_columns = list(graph[f'fm{i}'])

    # dict.fromkeys drops repeats while keeping first-seen order; a repeated
    # label would otherwise produce duplicated columns in the result.
    selected = list(dict.fromkeys(
        shared_columns + unit_columns + machine_columns + extra_columns
    ))
    return df[selected]

# One training frame per "fm<i>.<j>" key; the second index varies slowest, so
# keys are created as fm1.1 ... fm6.1 followed by fm1.2 ... fm6.2.
dfd = {
    f"fm{fm_idx}.{sub_idx}": get_dataframe_per_fm(df, fm_idx, sub_idx)
    for sub_idx in (1, 2)
    for fm_idx in (1, 2, 3, 4, 5, 6)
}

In [None]:
# Inspect two columns of the fm1.1 training frame (presumably next-step targets,
# judging by the "_next" suffix -- TODO confirm).
# Only the last expression of a notebook cell is rendered, so the Cu lookup
# below produces no visible output.
# NOTE(review): the saved output under this cell lists no "*_next" columns --
# confirm these columns exist before relying on this cell.
dfd['fm1.1']['Cu_1.1C_next']
dfd['fm1.1']['Ni_1.1C_next']

Unnamed: 0,MEAS_DT,Cu_oreth,Ni_oreth,Ore_mass,Cu_resth,Ni_resth,Ni_rec,Cu_1.1C,Ni_1.1C,Cu_1.1C_min,Cu_1.1C_max,Ni_1.1C_min,Ni_1.1C_max,Mass_1,Dens_1
0,2024-01-01 00:00:00,2.6097,1.5313,1096.5,1.8008,8.2067,,4.4612,2.5949,4.4,4.6,2.4,2.7,1240.597656,1.388724
1,2024-01-01 00:15:00,2.5548,1.4842,1123.0,1.8160,7.8830,,4.5157,2.5875,4.4,4.6,2.4,2.7,1205.422363,1.375562
2,2024-01-01 00:30:00,2.5109,1.4355,840.0,1.9386,7.7173,0.970170,4.4968,2.6646,4.4,4.6,2.4,2.7,1188.762573,1.371214
3,2024-01-01 00:45:00,2.4765,1.3852,824.0,2.0530,7.8480,0.968639,4.5375,2.6634,4.4,4.6,2.4,2.7,1151.888672,1.374260
4,2024-01-01 01:00:00,2.3585,1.3368,0.0,2.1132,7.9884,0.974205,4.5612,2.6661,4.4,4.6,2.4,2.7,1104.101318,1.357874
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
30331,2024-11-11 22:45:00,2.7133,1.4200,1231.0,1.3916,8.2084,0.939579,4.5610,2.6709,4.4,4.7,2.6,2.9,1377.498413,1.405367
30332,2024-11-11 23:00:00,2.7582,1.5234,1149.5,1.3975,8.0928,0.944490,4.4902,2.6537,4.5,5.0,2.5,3.3,1412.641235,1.419056
30333,2024-11-11 23:15:00,2.7487,1.5030,1210.5,1.3968,8.1204,0.943409,4.5264,2.6525,4.5,5.0,2.5,3.3,1365.101562,1.401895
30334,2024-11-11 23:30:00,2.7067,1.5257,1389.5,1.3965,8.0961,0.943442,4.4615,2.6480,4.5,5.0,2.5,3.3,1382.615723,1.406324


In [19]:
# Load the test CSV and cast MEAS_DT to a timestamp dtype in one chained expression.
test_dataframe = pd.read_csv("data/test.csv").astype({'MEAS_DT': 'datetime64[ns]'})

In [20]:
# Per-"fm<i>.<j>" view of the test table; train=False skips the extra
# interaction_graph columns.
dfd_test = {
    f"fm{fm_idx}.{sub_idx}": get_dataframe_per_fm(test_dataframe, fm_idx, sub_idx, train=False)
    for sub_idx in (1, 2)
    for fm_idx in (1, 2, 3, 4, 5, 6)
}

In [21]:
def train_test_split(train_, test_):
    """Split ``train_`` by the timestamps that appear in ``test_``.

    Parameters
    ----------
    train_ : pandas.DataFrame
        Full table; must contain a ``MEAS_DT`` column.
    test_ : pandas.DataFrame
        Table whose ``MEAS_DT`` values mark the held-out rows.

    Returns
    -------
    dict
        ``"train"``: the rows of ``train_`` whose ``MEAS_DT`` does not occur in
        ``test_`` (columns and index of ``train_`` are kept).
        ``"test"``: inner join of ``test_`` with ``train_`` on ``MEAS_DT``;
        columns present in both get the suffix ``_y`` (value from ``test_``)
        or ``_X`` (value from ``train_``).
    """
    # Anti-join on the timestamp. The previous concat + drop_duplicates(keep=False)
    # was a symmetric difference: it also pulled test-only timestamps into the
    # training set and dropped train rows that merely repeated a timestamp.
    held_out = train_['MEAS_DT'].isin(test_['MEAS_DT'])
    train = train_.loc[~held_out].copy()

    test = pd.merge(test_, train_, how='inner', on='MEAS_DT', suffixes=('_y', '_X'))
    return {
        "train": train,
        "test": test,
    }

In [22]:
# Pair every per-fm training frame with the test frame of the same key and
# split it into {"train": ..., "test": ...}.
dfd_tt = {
    fm_key: train_test_split(fm_train, dfd_test[fm_key])
    for fm_key, fm_train in dfd.items()
}

## CatBoost and Prophet as a feature

Самая базовая идея при предсказании временных рядов — дать в качестве фичи для трейна значение, показывающее, куда пойдет концентрация на следующем шаге.

- во время обучения катбуста я учу его на концентрации на текущем шаге и на следующем шаге

- во время инференса катбуста (то есть на тесте, хотя на этом наборе данных можно взять и трейн из будущего) я использую профет, чтобы предсказать, куда пойдет следующий шаг

In [34]:
from catboost import CatBoostRegressor, Pool
from sklearn.metrics import mean_absolute_error
import numpy as np
from prophet import Prophet

In [24]:
from collections import defaultdict

In [None]:
# Containers filled in by the training loop.
boosting_models = {}   # fitted CatBoost models, keyed by fm name
prophet_models = {}    # fitted Prophet models, keyed by feature name
# Test-set metrics keyed by output feature. Plain dicts: a defaultdict created
# without a default factory never supplies defaults, so it only obscured the type.
mae_values = {}
mape_values = {}

In [39]:
def train_prophet(train, output_features, model_factory=None):
    """Fit one time-series model per base feature behind ``output_features``.

    A trailing ``"_min"`` or ``"_max"`` is stripped from every name in
    ``output_features``; each distinct remaining name is a column of ``train``
    that gets its own model, fitted on a two-column frame ``ds`` (from
    ``MEAS_DT``) and ``y`` (the feature values).

    Parameters
    ----------
    train : pandas.DataFrame
        Must contain ``MEAS_DT`` and every base feature column. It is not modified.
    output_features : iterable of str
        Target names, optionally suffixed with ``_min`` / ``_max``.
    model_factory : callable or None, default None
        Zero-argument callable returning an unfitted model whose ``fit(frame)``
        returns the fitted model. ``None`` means ``Prophet``. Pass e.g.
        ``lambda: Prophet(changepoint_prior_scale=0.01)`` to tune the model.

    Returns
    -------
    dict
        ``{base_feature: fitted_model}`` in first-seen order of the base features.

    Notes
    -----
    The original author suggests trying a logistic variant if time allows
    (presumably Prophet's logistic growth -- TODO confirm).
    """
    if model_factory is None:
        model_factory = Prophet

    def _base_name(feature):
        # Strip the bound suffix only at the end of the name.
        for suffix in ("_min", "_max"):
            if feature.endswith(suffix):
                return feature[:-len(suffix)]
        return feature

    # dict.fromkeys de-duplicates with a stable order, unlike set().
    base_features = list(dict.fromkeys(_base_name(f) for f in output_features))

    models = {}
    for feature in base_features:
        # rename() builds a new frame, so the caller's columns are never touched.
        series = train[['MEAS_DT', feature]].rename(columns={'MEAS_DT': 'ds', feature: 'y'})
        models[feature] = model_factory().fit(series)

    return models

In [36]:
def mean_absolute_percentage_error(y_true, y_pred):
    """Mean absolute percentage error, in percent.

    Computes ``mean(|y_true - y_pred| / |y_true|) * 100`` element by element.

    Parameters
    ----------
    y_true, y_pred : array-like
        Actual and predicted values. They are compared by position; pandas
        index labels are ignored.

    Returns
    -------
    float
        The MAPE over all positions where the ratio is defined. Positions with
        ``y_true == 0`` or with a NaN on either side are skipped. Returns NaN
        when no position is usable (including empty input).
    """
    # Flatten to positional float arrays so two Series with different indices
    # are not silently label-aligned into NaNs.
    actual = np.asarray(y_true, dtype=float).ravel()
    predicted = np.asarray(y_pred, dtype=float).ravel()

    # Division by zero is expected here and masked out right below.
    with np.errstate(divide='ignore', invalid='ignore'):
        ratios = np.abs((actual - predicted) / actual)

    usable = ~np.isnan(ratios) & (actual != 0)
    if not usable.any():
        return float('nan')
    return np.mean(ratios[usable]) * 100

In [None]:
# Train, per fm key, one set of Prophet models and one CatBoost regressor per
# output feature, and record train/test MAE and MAPE.
for floatmachine_name, split in dfd_tt.items():

    test = split['test']

    # train_test_split appends "_y" to the columns that came from the test file;
    # strip exactly that suffix (and nothing else) to recover the target names.
    output_features = [f[:-len("_y")] for f in test.columns.tolist() if f.endswith("_y")]
    train = split['train'].dropna(subset=output_features)

    # The Prophet models depend only on this fm's training frame, so fit them
    # once per fm instead of refitting all of them for every target.
    prophet_models.update(train_prophet(train, output_features))

    # CatBoost inputs: everything except the timestamp and any column whose
    # name contains "min" or "max". Also constant across targets.
    features_to_train_on = [
        f for f in train.columns.tolist()
        if f != 'MEAS_DT' and "min" not in f and "max" not in f
    ]
    train_inputs = train[features_to_train_on]
    test_inputs = test[features_to_train_on]

    # One model per target; previously each model overwrote the previous one
    # under the fm key, so only the last target's model survived.
    pair_of_boosting_models = {}
    for output_feature in output_features:
        print(f"training {floatmachine_name} : {output_feature}...")

        # In the merged test frame the "_X" column holds the training table's
        # value for the target; it serves as the test label.
        train_label = train[output_feature]
        test_label = test[output_feature + "_X"]

        train_pool = Pool(train_inputs, label=train_label)
        test_pool = Pool(test_inputs, label=test_label)

        model = CatBoostRegressor(loss_function='MAE')

        # NOTE(review): the test pool is also the eval_set used by
        # use_best_model, so the test metrics below are optimistic. Consider a
        # separate validation split.
        model.fit(train_pool, eval_set=test_pool, use_best_model=True, plot=False, silent=True)

        # make prediction and evaluation
        train_predictions = model.predict(train_pool)
        test_predictions = model.predict(test_pool)

        train_mae = mean_absolute_error(train_label, train_predictions)
        test_mae = mean_absolute_error(test_label, test_predictions)

        train_mape = mean_absolute_percentage_error(train_label, train_predictions)
        test_mape = mean_absolute_percentage_error(test_label, test_predictions)

        mae_values[output_feature] = test_mae
        mape_values[output_feature] = test_mape

        print(f"Metrics for {floatmachine_name} : {output_feature}")
        print(f"Train MAE: {round(train_mae, 4)}, Train MAPE: {round(train_mape, 4)}%")
        print(f"Test MAE: {round(test_mae, 4)}, Test MAPE: {round(test_mape, 4)}%")
        print("----------------------------------//----------------------------------")

        pair_of_boosting_models[output_feature] = model

    # boosting_models[fm_name][output_feature] -> fitted CatBoostRegressor
    boosting_models[floatmachine_name] = pair_of_boosting_models

training fm1.1 : Ni_1.1C_min...


KeyboardInterrupt: 

In [59]:
# Average the per-target test metrics collected by the training loop.
mae_list = list(mae_values.values())
mape_list = list(mape_values.values())

# Guard against an empty run (e.g. training interrupted before the first
# target finished), which previously raised ZeroDivisionError.
if mae_list and mape_list:
    print(f"Mean MAE {round(sum(mae_list) / len(mae_list), 4)}")
    print(f"Mean MAPE {round(sum(mape_list) / len(mape_list), 4)}%")
else:
    print("No metrics collected - run the training cell to completion first.")

Mean MAE 0.0606
Mean MAPE 2.5088%
