In [1]:
# Визуализации
from matplotlib import pyplot as plt
import seaborn as sns

In [2]:
# Базовые библиотеки
import math
import pathlib
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_percentage_error

In [3]:
# Базовый ввод данных
DATA_DIR = pathlib.Path("./data")  # Откуда считываем данные
RS = 42
train = pd.read_parquet(DATA_DIR.joinpath("train.parquet"))
CAT = ["feature4"]
TARGETS = ["target0", "target1"]  # список таргетов для модели
FTS = train.filter(like="feature").columns.difference(CAT)  # Выбираем фичи для модели
train["gas"] = 0
train.loc[train.feature4=="gas2", "gas"] = 1
FTS = FTS.union(["gas"])  # список фичей для модели


In [4]:
X_tr, X_val, y_tr, y_val = train_test_split(train[FTS], train[TARGETS], train_size=0.5, random_state=RS)  # разбивка на трейн и тест

In [27]:
import datetime
print(datetime.datetime.now())

2023-06-04 21:56:49.487250


In [5]:
def tune_function(fn, a, b):
    if fn == 'log':
        return lambda x: math.log(x-a+b)
    elif fn == 'none':
        return lambda x: x
    elif fn =='sqr':
        return lambda x: x**2/a+b
    else:
        return lambda x: x

In [6]:
tuner0 = {
    'feature0': {'fn': 'log', 'a': 0, 'b': 50},
    'feature1': {'fn': 'log', 'a': 15, 'b': 0},
    'feature2': {'fn': 'log', 'a': 47, 'b': 0},
    'feature3': {'fn': 'log', 'a': 0, 'b': 52},
    'feature5': {'fn': 'log', 'a': 20, 'b': 0},
    'feature6': {'fn': 'none', 'a': 0, 'b': 0},
    'feature7': {'fn': 'log', 'a': 48.44, 'b': 0},
    'feature8': {'fn': 'none', 'a': 0, 'b': 0},
    'feature9': {'fn': 'none', 'a': 0, 'b': 0},
    'feature10': {'fn': 'none', 'a': 0, 'b': 0},
    'feature11': {'fn': 'log', 'a': 0, 'b': 21},
    'feature12': {'fn': 'log', 'a': 25, 'b': 0},
    'feature13': {'fn': 'none', 'a': 0, 'b': 0},
    'feature14': {'fn': 'none', 'a': 0, 'b': 0},
    'feature15': {'fn': 'none', 'a': 0, 'b': 0},
    'feature16': {'fn': 'log', 'a': 0, 'b': 0},
    'feature17': {'fn': 'log', 'a': 0, 'b': 15},
    'feature18': {'fn': 'log', 'a': 0, 'b': 5},
    'feature19': {'fn': 'none', 'a': 0, 'b': 0},
    'feature20': {'fn': 'none', 'a': 0, 'b': 50},
    'feature21': {'fn': 'none', 'a': 0, 'b': 0},
    'feature22': {'fn': 'none', 'a': 0, 'b': 0},
    'feature23': {'fn': 'log', 'a': 32, 'b': 0},
    'feature24': {'fn': 'none', 'a': 0, 'b': 0}
}

In [7]:
tuner1 = {
    'feature0': {'fn': 'log', 'a': 0, 'b': 50},
    'feature1': {'fn': 'log', 'a': 15, 'b': 0},
    'feature2': {'fn': 'log', 'a': 47, 'b': 0},
    'feature3': {'fn': 'log', 'a': 0, 'b': 52},
    'feature5': {'fn': 'log', 'a': 20, 'b': 0},
    'feature6': {'fn': 'none', 'a': 0, 'b': 0},
    'feature7': {'fn': 'log', 'a': 48.44, 'b': 0},
    'feature8': {'fn': 'none', 'a': 0, 'b': 0},
    'feature9': {'fn': 'none', 'a': 0, 'b': 50},
    'feature10': {'fn': 'none', 'a': 0, 'b': 0},
    'feature11': {'fn': 'log', 'a': 0, 'b': 21},
    'feature12': {'fn': 'log', 'a': 25, 'b': 0},
    'feature13': {'fn': 'sqr', 'a': 120, 'b': 0},
    'feature14': {'fn': 'none', 'a': 0, 'b': 0},
    'feature15': {'fn': 'none', 'a': 0, 'b': 0},
    'feature16': {'fn': 'log', 'a': 0, 'b': 0},
    'feature17': {'fn': 'log', 'a': 0, 'b': 15},
    'feature18': {'fn': 'log', 'a': 0, 'b': 5},
    'feature19': {'fn': 'none', 'a': 0, 'b': 0},
    'feature20': {'fn': 'none', 'a': 0, 'b': 50},
    'feature21': {'fn': 'none', 'a': 0, 'b': 0},
    'feature22': {'fn': 'none', 'a': 0, 'b': 0},
    'feature23': {'fn': 'log', 'a': 32, 'b': 0},
    'feature24': {'fn': 'none', 'a': 0, 'b': 0}    
}

In [8]:
def data_tune(df, tuner):
    df_out = df.copy()
    for column, func in tuner.items():
        # print(column)
        # print(df_out[column].min())
        func_type = func['fn']
        a = func['a']        
        b = func['b']  
        # print(a, b)
        df_out[column] = df_out[column].apply(tune_function(func_type, a=a, b=b))
        
    return df_out
    

In [9]:
tuner = {"target0": tuner0, 
         "target1": tuner1}

data = {}
val_data = {}
for tg in TARGETS:    
    data[tg] = data_tune(X_tr, tuner[tg])
    val_data[tg] = data_tune(X_val, tuner[tg])


## Нормализация

In [10]:
from sklearn.preprocessing import StandardScaler
import pickle

In [11]:
scalers = {}
for tg in TARGETS:    
    std_scaler = StandardScaler()
    std_scaler.fit(data[tg][FTS])
    data[tg][FTS] = std_scaler.transform(data[tg][FTS])
    val_data[tg][FTS] = std_scaler.transform(val_data[tg][FTS])
    
    scalers[tg] = std_scaler
    with open(f'scaler_{tg}.pickle', 'wb') as f:
        pickle.dump(std_scaler, f)

## Модель


In [12]:
from sklearn.neural_network import MLPRegressor

In [13]:
data[tg][FTS].columns

Index(['feature0', 'feature1', 'feature10', 'feature11', 'feature12',
       'feature13', 'feature14', 'feature15', 'feature16', 'feature17',
       'feature18', 'feature19', 'feature2', 'feature20', 'feature21',
       'feature22', 'feature23', 'feature24', 'feature3', 'feature5',
       'feature6', 'feature7', 'feature8', 'feature9', 'gas'],
      dtype='object')

In [16]:
val_data[tg][FTS].columns

Index(['feature0', 'feature1', 'feature10', 'feature11', 'feature12',
       'feature13', 'feature14', 'feature15', 'feature16', 'feature17',
       'feature18', 'feature19', 'feature2', 'feature20', 'feature21',
       'feature22', 'feature23', 'feature24', 'feature3', 'feature5',
       'feature6', 'feature7', 'feature8', 'feature9', 'gas'],
      dtype='object')

In [15]:
y_tr[tg]

32437      6.591908
3862      32.869708
142669    52.239636
82407      1.844760
148010     4.921912
            ...    
119879     4.189427
103694    37.123909
131932     4.369440
146867    41.967148
121958     6.730656
Name: target1, Length: 76708, dtype: float64

In [17]:
y_tr

Unnamed: 0,target0,target1
32437,31.802060,6.591908
3862,73.901540,32.869708
142669,82.801944,52.239636
82407,22.129998,1.844760
148010,27.839774,4.921912
...,...,...
119879,23.377460,4.189427
103694,75.559708,37.123909
131932,23.061055,4.369440
146867,71.048091,41.967148


In [18]:
y_val

Unnamed: 0,target0,target1
37090,78.808773,44.337961
61708,73.675072,36.809781
56423,86.550986,52.653176
17732,80.526145,51.910781
62874,76.246480,64.766980
...,...,...
53683,22.579341,2.861112
132894,88.243660,52.671246
33184,24.461949,7.425830
130623,20.666541,1.510098


In [19]:
%%time
models = {}
tr_preds = {}
val_preds = {}

for tg in TARGETS:
    print(f"{tg}", "=" * 10)
    mlp = MLPRegressor(hidden_layer_sizes=(26, 1024, 512, 256), random_state=RS, verbose=True, n_iter_no_change=50, tol=0.00005, max_iter=300, warm_start=True)    
    mlp.fit(data[tg][FTS], y_tr[tg]) 

    tr_preds[tg] = mlp.predict(data[tg][FTS])
    val_preds[tg] = mlp.predict(val_data[tg][FTS])
    
    with open(f'tune_mini_{tg}.pickle', 'wb') as f:
        pickle.dump(mlp, f)

    models[tg] = mlp
    
    print(f"MAPE (train {tg}): {mean_absolute_percentage_error(y_tr[tg], tr_preds[tg]) * 100:.3f} %")
    print(f"MAPE (val {tg}): {mean_absolute_percentage_error(y_val[tg], val_preds[tg]) * 100:.3f} %")

Iteration 1, loss = 47.77065282
Iteration 2, loss = 0.69333168
Iteration 3, loss = 0.39806794
Iteration 4, loss = 0.30512106
Iteration 5, loss = 0.25276489
Iteration 6, loss = 0.23942497
Iteration 7, loss = 0.21972058
Iteration 8, loss = 0.19598012
Iteration 9, loss = 0.18105652
Iteration 10, loss = 0.18752838
Iteration 11, loss = 0.17367579
Iteration 12, loss = 0.13383710
Iteration 13, loss = 0.14485145
Iteration 14, loss = 0.16575252
Iteration 15, loss = 0.14811094
Iteration 16, loss = 0.11699339
Iteration 17, loss = 0.11909832
Iteration 18, loss = 0.13106994
Iteration 19, loss = 0.12204702
Iteration 20, loss = 0.09683883
Iteration 21, loss = 0.10061622
Iteration 22, loss = 0.10384789
Iteration 23, loss = 0.11080918
Iteration 24, loss = 0.08712971
Iteration 25, loss = 0.09457540
Iteration 26, loss = 0.08961861
Iteration 27, loss = 0.10556569
Iteration 28, loss = 0.07471263
Iteration 29, loss = 0.09900719
Iteration 30, loss = 0.07733388
Iteration 31, loss = 0.06955450
Iteration 32, lo

NameError: name 'val_pred' is not defined

In [20]:
print(f"MAPE (val {tg}): {mean_absolute_percentage_error(y_val[tg], val_preds[tg]) * 100:.3f} %")

MAPE (val target0): 0.349 %


In [21]:
tg = 'target1'
print(f"{tg}", "=" * 10)
mlp = MLPRegressor(hidden_layer_sizes=(26, 1024, 512, 256), random_state=RS, verbose=True, n_iter_no_change=50, tol=0.00005, max_iter=300, warm_start=True)    
mlp.fit(data[tg][FTS], y_tr[tg]) 

tr_preds[tg] = mlp.predict(data[tg][FTS])
val_preds[tg] = mlp.predict(val_data[tg][FTS])

with open(f'tune_mini_{tg}.pickle', 'wb') as f:
    pickle.dump(mlp, f)

models[tg] = mlp

print(f"MAPE (train {tg}): {mean_absolute_percentage_error(y_tr[tg], tr_preds[tg]) * 100:.3f} %")
print(f"MAPE (val {tg}): {mean_absolute_percentage_error(y_val[tg], val_preds[tg]) * 100:.3f} %")

Iteration 1, loss = 16.36847566
Iteration 2, loss = 0.49065673
Iteration 3, loss = 0.39879318
Iteration 4, loss = 0.35788786
Iteration 5, loss = 0.34153834
Iteration 6, loss = 0.29013127
Iteration 7, loss = 0.29354053
Iteration 8, loss = 0.26708897
Iteration 9, loss = 0.25362942
Iteration 10, loss = 0.26261696
Iteration 11, loss = 0.24445817
Iteration 12, loss = 0.25543328
Iteration 13, loss = 0.23305379
Iteration 14, loss = 0.20979912
Iteration 15, loss = 0.21528445
Iteration 16, loss = 0.22679329
Iteration 17, loss = 0.23167875
Iteration 18, loss = 0.23721660
Iteration 19, loss = 0.20784882
Iteration 20, loss = 0.22737183
Iteration 21, loss = 0.22540192
Iteration 22, loss = 0.19194393
Iteration 23, loss = 0.19435074
Iteration 24, loss = 0.22590799
Iteration 25, loss = 0.19299731
Iteration 26, loss = 0.19773706
Iteration 27, loss = 0.20661440
Iteration 28, loss = 0.18856971
Iteration 29, loss = 0.19470892
Iteration 30, loss = 0.20216475
Iteration 31, loss = 0.19538391
Iteration 32, lo



MAPE (train target1): 1.333 %
MAPE (val target1): 1.429 %


In [78]:
# with open(f'bad_tune_mini_{tg}.pickle', 'wb') as f:
#     pickle.dump(mlp, f)

In [None]:
# X_tr, X_val, y_tr, y_val = train_test_split(train[FTS], train[TARGETS], train_size=0.5, random_state=RS)  # разбивка на трейн и тест

In [None]:
# print(f"MAPE (train): {mean_absolute_percentage_error(y_tr, tr_preds) * 100:.3f} %")
# print(f"MAPE (val): {mean_absolute_percentage_error(y_val, val_preds) * 100:.3f} %")

## Полное обучение

In [48]:
tuner = {"target0": tuner0, 
         "target1": tuner1}

data = {}
val_data = {}
for tg in TARGETS:    
    data[tg] = data_tune(train, tuner[tg])


In [49]:
scalers = {}
for tg in TARGETS:    
    std_scaler = StandardScaler()
    std_scaler.fit(data[tg][FTS])
    data[tg][FTS] = std_scaler.transform(data[tg][FTS])
    
    scalers[tg] = std_scaler
    with open(f'full_scaler_{tg}.pickle', 'wb') as f:
        pickle.dump(std_scaler, f)

In [50]:
%%time
models = {}
tr_preds = {}
val_preds = {}

for tg in TARGETS:
    print(f"{tg}", "=" * 10)
    mlp = MLPRegressor(hidden_layer_sizes=(26, 1024, 512, 256), random_state=RS, verbose=True, n_iter_no_change=50, tol=0.00005, max_iter=300, warm_start=True)    
    mlp.fit(data[tg][FTS], y_tr[tg]) 

    tr_preds[tg] = mlp.predict(data[tg][FTS])
    # val_pred[tg] = mlp.predict(val_data[tg])
    
    with open(f'tune_full_{tg}.pickle', 'wb') as f:
        pickle.dump(mlp, f)

    models[tg] = mlp
    
    print(f"MAPE (train {tg}): {mean_absolute_percentage_error(y_tr[tg], tr_preds[tg]) * 100:.3f} %")
    # print(f"MAPE (val {tg}): {mean_absolute_percentage_error(y_val, val_pred[tg]) * 100:.3f} %")



ValueError: could not convert string to float: 'gas1'