In [84]:
import re
import pandas as pd
from datetime import datetime

In [142]:
# Read the processed training table and cast MEAS_DT to datetime64[ns] in one chain.
df = (
    pd.read_csv("data/df_hack_final_processed.csv")
    .astype({'MEAS_DT': 'datetime64[ns]'})
)

In [143]:
# Extra input columns per "fm<i>" group. get_dataframe_per_fm appends
# interaction_graph[f'fm{i}'] to its column selection when called with train=True.
# NOTE(review): presumably these are measurements from other machines that
# influence machine i — confirm against the process description.
interaction_graph = {
    "fm1" : [],
    "fm2" : ["Cu_1.1C", "Ni_1.1C", "Cu_1.2C", "Ni_1.2C"],
    "fm3" : ["Cu_2.1C", "Ni_2.1C", "Cu_2.2C", "Ni_2.2C"],
    "fm4" : ["Cu_2.1C", "Ni_2.1C", "Cu_2.2C", "Ni_2.2C", "Cu_3.1C", "Ni_3.1C", "Cu_3.2C", "Ni_3.2C", "Ni_6.1C", "Ni_6.2C"],
    "fm5" : ["Ni_4.1C", "Ni_4.1T"],
    "fm6" : ["Ni_5.1C", "Ni_5.1T", "Ni_5.2C", "Ni_5.2T"]
    }

In [144]:
def get_dataframe_per_fm(df, i, j, train=True):
    """Return the slice of ``df`` relevant to line ``i.j``.

    Columns are selected, in this order:
      1. every column whose name contains no digit,
      2. every column whose name contains ``"<i>.<j>"``,
      3. every column whose name ends with ``"_<i>"``,
      4. when ``train`` is true, the columns listed in
         ``interaction_graph["fm<i>"]``.

    Args:
        df: source DataFrame.
        i: machine number used in the name patterns and the graph key.
        j: line number within machine ``i``.
        train: whether to append the ``interaction_graph`` columns.

    Returns:
        ``df`` restricted to the selected columns.
    """
    all_columns = df.columns.tolist()
    line_pattern = f'{i}\\.{j}'
    group_pattern = f'_{i}$'

    selected = [name for name in all_columns if re.search(r'\d+', name) is None]
    selected += [name for name in all_columns if re.search(line_pattern, name)]
    selected += [name for name in all_columns if re.search(group_pattern, name)]
    if train:
        # `selected` is a fresh list, so extending it leaves the graph entry untouched.
        selected += interaction_graph[f'fm{i}']

    return df[selected]

# One training frame per "fm<i>.<j>" key; j is the outer loop, so all ".1"
# keys are inserted before the ".2" keys.
dfd = {
    f"fm{i}.{j}": get_dataframe_per_fm(df, i, j)
    for j in range(1, 3)
    for i in range(1, 7)
}

In [145]:
# Read the test table and cast MEAS_DT to datetime64[ns] in one chain.
test_dataframe = (
    pd.read_csv("data/test.csv")
    .astype({'MEAS_DT': 'datetime64[ns]'})
)

In [146]:
# Same "fm<i>.<j>" keys as the training dictionary, built from the test frame
# without the interaction_graph columns (train=False).
dfd_test = {
    f"fm{i}.{j}": get_dataframe_per_fm(test_dataframe, i, j, train=False)
    for j in range(1, 3)
    for i in range(1, 7)
}

In [147]:
def train_test_split(train_, test_):
    """Split two frames into train and test parts by their ``MEAS_DT`` values.

    Args:
        train_: frame with features (and targets); must contain ``MEAS_DT``.
        test_: frame with the timestamps to hold out; must contain ``MEAS_DT``.

    Returns:
        dict with two entries:
          * ``"train"``: rows of the stacked frames whose ``MEAS_DT`` occurs
            exactly once in the stack. ``keep=False`` drops every copy of a
            repeated timestamp, including repeats inside a single input.
          * ``"test"``: inner join of ``test_`` and ``train_`` on ``MEAS_DT``;
            columns present in both get ``_y`` (from ``test_``) or ``_X``
            (from ``train_``) appended.
    """
    stacked = pd.concat([train_, test_])
    train_only = stacked.drop_duplicates(subset='MEAS_DT', keep=False)
    overlap = test_.merge(train_, how='inner', on='MEAS_DT', suffixes=('_y', '_X'))
    return dict(train=train_only, test=overlap)

In [8]:
# Pair each per-line training frame with the test frame of the same key and split them.
dfd_tt = {
    fm_key: train_test_split(train_frame, dfd_test[fm_key])
    for fm_key, train_frame in dfd.items()
}

## CatBoost

In [9]:
from catboost import CatBoostRegressor, Pool
from sklearn.metrics import mean_absolute_error
import numpy as np

In [10]:
from collections import defaultdict

In [11]:
# Fitted models keyed by "<fm name>:<target column>".
models = {}
# Test-set metrics keyed by target column name. Plain dicts: a defaultdict
# created without a default factory behaves exactly like a dict anyway.
mae_values = {}
mape_values = {}

In [12]:
def mean_absolute_percentage_error(y_true, y_pred):
    """Mean absolute percentage error, in percent.

    Both inputs are converted to float NumPy arrays first, so that
      * plain Python sequences are accepted, and
      * two pandas Series are compared element by element in order rather
        than being aligned on their index labels (which produces NaN when
        the labels differ).

    Args:
        y_true: array-like of reference values.
        y_pred: array-like of predictions, same length as ``y_true``.

    Returns:
        ``mean(|y_true - y_pred| / |y_true|) * 100`` as a NumPy float.
        Zeros in ``y_true`` are not special-cased: they yield ``inf``/``nan``,
        as with any NumPy division by zero.
    """
    y_true = np.asarray(y_true, dtype=float)
    y_pred = np.asarray(y_pred, dtype=float)
    return np.mean(np.abs((y_true - y_pred) / y_true)) * 100

In [13]:
# Fit one CatBoost MAE regressor per (fm line, target column) and record metrics.
for floatmachine_name, split in dfd_tt.items():
    test = split['test']

    # Targets are the columns the test frame contributed to the merge; they carry
    # the "_y" suffix there and their unsuffixed name in the training frame.
    output_features = [name.replace("_y", "") for name in test.columns.tolist() if "_y" in name]
    train = split['train'].dropna(subset=output_features)

    # Inputs: every training column except the timestamp and anything whose name
    # contains "min" or "max". The list does not depend on the target, so it is
    # built once per fm line.
    features_to_train_on = [
        name for name in train.columns.tolist()
        if name != 'MEAS_DT' and "min" not in name and "max" not in name
    ]

    for output_feature in output_features:
        print(f"training {floatmachine_name} : {output_feature}...")

        # Ground truth for the held-out rows is the "_X" column from the merge.
        test_label = output_feature + "_X"

        train_pool = Pool(train[features_to_train_on], label=train[output_feature])
        test_pool = Pool(test[features_to_train_on], label=test[test_label])

        # NOTE(review): the same frame serves as eval_set (with use_best_model=True)
        # and as the source of the reported test metrics, so those metrics are
        # likely optimistic — confirm whether a separate validation split is wanted.
        model = CatBoostRegressor(loss_function='MAE')
        model.fit(train_pool, eval_set=test_pool, use_best_model=True, plot=False, silent=True)

        train_predictions = model.predict(train_pool)
        test_predictions = model.predict(test_pool)

        train_mae = mean_absolute_error(train[output_feature], train_predictions)
        train_mape = mean_absolute_percentage_error(train[output_feature], train_predictions)
        test_mae = mean_absolute_error(test[test_label], test_predictions)
        test_mape = mean_absolute_percentage_error(test[test_label], test_predictions)

        mae_values[output_feature] = test_mae
        mape_values[output_feature] = test_mape

        print(f"Metrics for {floatmachine_name} : {output_feature}")
        print(f"Train MAE: {round(train_mae, 4)}, Train MAPE: {round(train_mape, 4)}%")
        print(f"Test MAE: {round(test_mae, 4)}, Test MAPE: {round(test_mape, 4)}%")
        print("----------------------------------//----------------------------------")

        models[f"{floatmachine_name}:{output_feature}"] = model

training fm1.1 : Ni_1.1C_min...
Metrics for fm1.1 : Ni_1.1C_min
Train MAE: 0.0764, Train MAPE: 2.9078%
Test MAE: 0.0716, Test MAPE: 2.6358%
----------------------------------//----------------------------------
training fm1.1 : Ni_1.1C_max...
Metrics for fm1.1 : Ni_1.1C_max
Train MAE: 0.0805, Train MAPE: 2.6062%
Test MAE: 0.0782, Test MAPE: 2.5392%
----------------------------------//----------------------------------
training fm1.1 : Cu_1.1C_min...
Metrics for fm1.1 : Cu_1.1C_min
Train MAE: 0.0417, Train MAPE: 0.9467%
Test MAE: 0.0416, Test MAPE: 0.9407%
----------------------------------//----------------------------------
training fm1.1 : Cu_1.1C_max...
Metrics for fm1.1 : Cu_1.1C_max
Train MAE: 0.0418, Train MAPE: 0.8851%
Test MAE: 0.0468, Test MAPE: 1.0041%
----------------------------------//----------------------------------
training fm2.1 : Cu_2.1T_min...
Metrics for fm2.1 : Cu_2.1T_min
Train MAE: 0.0041, Train MAPE: 1.0908%
Test MAE: 0.0044, Test MAPE: 1.3156%
----------------

In [14]:
# Average the per-target test metrics collected during training.
mae_list = list(mae_values.values())
mape_list = list(mape_values.values())

mean_mae = sum(mae_list) / len(mae_list)
mean_mape = sum(mape_list) / len(mape_list)

print(f"Mean MAE {round(mean_mae, 4)}")
print(f"Mean MAPE {round(mean_mape, 4)}%")

Mean MAE 0.0561
Mean MAPE 2.2451%


### create submission

In [148]:
# Write each fitted model's predictions into the matching column of test_dataframe.
for floatmachine_name, split in dfd_tt.items():
    test = split['test']

    # Targets are the columns the test frame contributed to the merge ("_y" suffix).
    output_features = [name.replace("_y", "") for name in test.columns.tolist() if "_y" in name]

    # Same input columns as at training time: everything in the training frame
    # except the timestamp and any column whose name contains "min" or "max".
    features_to_train_on = [
        name for name in split['train'].columns.tolist()
        if name != 'MEAS_DT' and "min" not in name and "max" not in name
    ]

    # Prediction needs no labels, and the feature matrix is identical for every
    # target of this fm line, so one unlabeled Pool is built and reused.
    test_pool = Pool(test[features_to_train_on])

    for output_feature in output_features:
        test_predictions = models[f"{floatmachine_name}:{output_feature}"].predict(test_pool)

        # Predictions are assigned by position, so the merged test frame must
        # have exactly one row per row of test_dataframe.
        if len(test_predictions) != len(test_dataframe):
            raise ValueError(
                f"{floatmachine_name}:{output_feature}: got {len(test_predictions)} predictions "
                f"for {len(test_dataframe)} submission rows"
            )

        test_dataframe[output_feature] = test_predictions

### Apply constraints

In [149]:
# Preview the first rows of test_dataframe after the prediction columns were written.
test_dataframe.head()

Unnamed: 0,MEAS_DT,Ni_1.1C_min,Ni_1.1C_max,Cu_1.1C_min,Cu_1.1C_max,Ni_1.2C_min,Ni_1.2C_max,Cu_1.2C_min,Cu_1.2C_max,Cu_2.1T_min,...,Ni_5.2C_min,Ni_5.2C_max,Ni_6.1T_min,Ni_6.1T_max,Ni_6.1C_min,Ni_6.1C_max,Ni_6.2T_min,Ni_6.2T_max,Ni_6.2C_min,Ni_6.2C_max
0,2024-01-19 12:15:00,2.481546,2.715016,4.713474,4.894614,2.470214,2.775176,4.217431,4.572527,0.432215,...,6.285106,6.616127,1.159235,1.198409,8.750826,9.022657,1.044972,1.116736,9.016537,9.186243
1,2024-01-19 12:30:00,2.482487,2.701063,4.729127,4.898569,2.487303,2.757664,4.222346,4.518989,0.434375,...,6.134394,6.478068,1.168488,1.207221,8.819671,9.079372,0.986264,1.066936,8.990309,9.150371
2,2024-01-19 12:45:00,2.471117,2.674688,4.731913,4.900725,2.487283,2.76434,4.216692,4.540899,0.432317,...,6.191514,6.529948,1.168365,1.207964,8.836875,9.095844,1.023982,1.099386,8.984703,9.148937
3,2024-01-19 13:00:00,2.538296,2.757147,4.800525,4.930732,2.471932,2.768727,4.197948,4.583269,0.432662,...,6.234217,6.54192,1.1853,1.226144,8.757807,9.031335,1.027313,1.102029,8.979641,9.147365
4,2024-01-19 13:15:00,2.41003,2.634403,4.854644,5.02006,2.504131,2.776939,4.321796,4.620784,0.440222,...,6.303597,6.633395,1.172602,1.215029,8.771203,9.060555,1.008023,1.095601,9.001304,9.177726


#### sliding window

In [150]:
# Number of consecutive rows that share one averaged value.
window_size = 8

def process_column(column):
    """Replace each block of ``window_size`` rows by the block's mean.

    The column is cut into consecutive, non-overlapping blocks by position;
    the last block may be shorter than ``window_size``.

    Args:
        column: pandas Series to smooth.

    Returns:
        A list with ``len(column)`` entries, where each entry is the mean of
        the block its position falls into.
    """
    smoothed = []
    for start in range(0, len(column), window_size):
        block = column.iloc[start:start + window_size]
        # Repeat the block mean once per row actually present in the block.
        smoothed.extend([block.mean()] * len(block))
    return smoothed


In [151]:
# Block-average every column except the first one (MEAS_DT in the frame preview).
value_columns = test_dataframe.columns[1:]
for col in value_columns:
    test_dataframe[col] = process_column(test_dataframe[col])

In [152]:
# Preview the frame after block averaging: rows in the same window now share values.
test_dataframe.head()

Unnamed: 0,MEAS_DT,Ni_1.1C_min,Ni_1.1C_max,Cu_1.1C_min,Cu_1.1C_max,Ni_1.2C_min,Ni_1.2C_max,Cu_1.2C_min,Cu_1.2C_max,Cu_2.1T_min,...,Ni_5.2C_min,Ni_5.2C_max,Ni_6.1T_min,Ni_6.1T_max,Ni_6.1C_min,Ni_6.1C_max,Ni_6.2T_min,Ni_6.2T_max,Ni_6.2C_min,Ni_6.2C_max
0,2024-01-19 12:15:00,2.504468,2.718727,4.796418,4.95892,2.56272,2.848913,4.386671,4.726288,0.437108,...,6.265971,6.597801,1.169664,1.209819,8.782473,9.058255,1.003522,1.084081,9.004448,9.173535
1,2024-01-19 12:30:00,2.504468,2.718727,4.796418,4.95892,2.56272,2.848913,4.386671,4.726288,0.437108,...,6.265971,6.597801,1.169664,1.209819,8.782473,9.058255,1.003522,1.084081,9.004448,9.173535
2,2024-01-19 12:45:00,2.504468,2.718727,4.796418,4.95892,2.56272,2.848913,4.386671,4.726288,0.437108,...,6.265971,6.597801,1.169664,1.209819,8.782473,9.058255,1.003522,1.084081,9.004448,9.173535
3,2024-01-19 13:00:00,2.504468,2.718727,4.796418,4.95892,2.56272,2.848913,4.386671,4.726288,0.437108,...,6.265971,6.597801,1.169664,1.209819,8.782473,9.058255,1.003522,1.084081,9.004448,9.173535
4,2024-01-19 13:15:00,2.504468,2.718727,4.796418,4.95892,2.56272,2.848913,4.386671,4.726288,0.437108,...,6.265971,6.597801,1.169664,1.209819,8.782473,9.058255,1.003522,1.084081,9.004448,9.173535


#### Roundings

In [153]:
import re
import math

# Rounding step per column family. The rounding loop builds the lookup key from
# a column name as column[:5] + "*" + column[6] (e.g. "Ni_1.1C_min" -> "Ni_1.*C"),
# so these keys are matched literally and not as regular expressions.
# Columns whose key is not listed here are left unrounded.
increments = {
    'Ni_1.*C': 0.1,
    'Cu_1.*C': 0.1,
    'Cu_2.*T': 0.01,
    'Cu_3.*T': 0.05,
    'Ni_4.*T': 0.01,
    'Ni_4.*C': 0.05,
    'Ni_5.*T': 0.01,
    'Ni_5.*C': 0.05,
    'Ni_6.*T': 0.01,
    'Ni_6.*C': 0.05
}

def round_up(value, increment):
    """Round ``value`` up to the nearest multiple of ``increment``.

    Binary floating point cannot represent steps such as 0.1 exactly, so the
    quotient is snapped to 9 decimals before taking the ceiling (a value
    already on the grid stays there) and the product is rounded to 10
    decimals (the result is ``2.8`` rather than ``2.8000000000000003``).

    Args:
        value: number to round. Falsy values (``0``, ``None``) and NaN are
            returned unchanged.
        increment: positive step size.

    Returns:
        The smallest multiple of ``increment`` that is >= ``value``, within
        the tolerance above, or ``value`` itself for falsy/NaN input.
    """
    # `value != value` is true only for NaN, which math.ceil cannot handle.
    if not value or value != value:
        return value
    steps = math.ceil(round(value / increment, 9))
    return round(steps * increment, 10)

def round_down(value, increment):
    """Round ``value`` down to the nearest multiple of ``increment``.

    Binary floating point cannot represent steps such as 0.1 exactly:
    ``0.3 / 0.1`` evaluates to ``2.9999999999999996``, whose plain floor would
    lose a whole step. The quotient is therefore snapped to 9 decimals before
    taking the floor, and the product is rounded to 10 decimals to remove
    representation noise from the result.

    Args:
        value: number to round. Falsy values (``0``, ``None``) and NaN are
            returned unchanged.
        increment: positive step size.

    Returns:
        The largest multiple of ``increment`` that is <= ``value``, within the
        tolerance above, or ``value`` itself for falsy/NaN input.
    """
    # `value != value` is true only for NaN, which math.floor cannot handle.
    if not value or value != value:
        return value
    steps = math.floor(round(value / increment, 9))
    return round(steps * increment, 10)


# Snap "*min" columns down and "*max" columns up to the step of their column family.
for column in test_dataframe.columns:
    # Names of six characters or fewer have no character at index 6.
    if len(column) <= 6:
        continue

    # e.g. "Ni_1.1C_min" -> "Ni_1.*C"
    family = column[:5] + "*" + column[6]
    if family not in increments:
        continue
    step = increments[family]

    if column.endswith("min"):
        rounder = round_down
    elif column.endswith("max"):
        rounder = round_up
    else:
        continue

    test_dataframe[column] = test_dataframe[column].apply(rounder, args=(step,))

In [155]:
# Preview the frame after the min/max columns were rounded to their increments.
test_dataframe.head()

Unnamed: 0,MEAS_DT,Ni_1.1C_min,Ni_1.1C_max,Cu_1.1C_min,Cu_1.1C_max,Ni_1.2C_min,Ni_1.2C_max,Cu_1.2C_min,Cu_1.2C_max,Cu_2.1T_min,...,Ni_5.2C_min,Ni_5.2C_max,Ni_6.1T_min,Ni_6.1T_max,Ni_6.1C_min,Ni_6.1C_max,Ni_6.2T_min,Ni_6.2T_max,Ni_6.2C_min,Ni_6.2C_max
0,2024-01-19 12:15:00,2.5,2.8,4.7,5.0,2.5,2.9,4.3,4.8,0.43,...,6.25,6.6,1.16,1.21,8.75,9.1,1.0,1.09,9.0,9.2
1,2024-01-19 12:30:00,2.5,2.8,4.7,5.0,2.5,2.9,4.3,4.8,0.43,...,6.25,6.6,1.16,1.21,8.75,9.1,1.0,1.09,9.0,9.2
2,2024-01-19 12:45:00,2.5,2.8,4.7,5.0,2.5,2.9,4.3,4.8,0.43,...,6.25,6.6,1.16,1.21,8.75,9.1,1.0,1.09,9.0,9.2
3,2024-01-19 13:00:00,2.5,2.8,4.7,5.0,2.5,2.9,4.3,4.8,0.43,...,6.25,6.6,1.16,1.21,8.75,9.1,1.0,1.09,9.0,9.2
4,2024-01-19 13:15:00,2.5,2.8,4.7,5.0,2.5,2.9,4.3,4.8,0.43,...,6.25,6.6,1.16,1.21,8.75,9.1,1.0,1.09,9.0,9.2


In [159]:
# (rows, columns) of the frame that is about to be exported.
test_dataframe.shape

(6740, 41)

In [157]:
# Total number of missing cells across all columns of the frame.
test_dataframe.isna().sum().sum()

0

In [158]:
# Write the final frame to CSV.
# NOTE(review): index=None is presumably treated like index=False (no index
# column written) — confirm; also assumes the test_results/ directory exists.
test_dataframe.to_csv('test_results/test.csv', index=None)