In [1]:
import os
import pickle
import numpy as np
import tensorflow as tf

from models.mlp import emb_mlp

from utils.args import get_args
from utils.config import load_config

# Names of the numerical columns (presumably of the merged dataset — TODO confirm).
# NOTE(review): not referenced by any cell visible here.
NUMERICAL_KEYS = ['bid', 'budget', 'engagements','page_views', 'clicks', 
                  'active_days', 'media_spend', 'media_cpc','cpe']

# Names of the categorical columns (presumably the ones that get encoded/embedded — TODO confirm).
# NOTE(review): not referenced by any cell visible here; the per-target lists
# actually used for training are read from the config instead.
CATEGORICAL_KEYS = ['group', 'item', 'channel', 'date', 'headline', 'storySummary',
                    'IABCategory', 'targetGeo', 'targetInterest', 'targetAge', 'targetOs', 
                    'targetDevices','targetGender', 'targetLanguages', 'CATEGORY_1']

import matplotlib.pyplot as plt

# Make every tf.function run eagerly (op by op) instead of as a traced graph.
# This is a global switch: easier to debug, but typically slower to train.
tf.config.run_functions_eagerly(True)

In [2]:
def save_pickle(filename, file):
    """Serialize an object to disk with pickle.

    Args:
        filename: Destination path. Opened in binary write mode, so an
            existing file is overwritten.
        file: The Python object to pickle (parameter name kept for
            backward compatibility; it is the object, not a file handle).

    Raises:
        OSError: If the destination cannot be opened for writing.
        pickle.PicklingError / TypeError / AttributeError: If the object
            cannot be pickled.
    """
    # The context manager guarantees the handle is closed even when
    # pickle.dump raises part-way through.
    with open(filename, "wb") as filehandler:
        pickle.dump(file, filehandler)

def load_pickle(filename):
    """Load and return a pickled object from disk.

    Args:
        filename: Path of the pickle file to read.

    Returns:
        The object that was stored in the file.

    Raises:
        OSError: If the file cannot be opened (e.g. it does not exist).
        pickle.UnpicklingError / EOFError: If the content is not a valid pickle.
    """
    # SECURITY: unpickling executes arbitrary code embedded in the file;
    # only load pickles produced by this project / trusted sources.
    # The context manager closes the handle even when pickle.load raises.
    with open(filename, 'rb') as file:
        return pickle.load(file)

In [3]:
def log_loss(config, history, _flag):
    """Append the losses of the best-validation epoch to the experiment log.

    The entry is appended to ``experiments/<config["name"]>/exp_loss.log``.

    Args:
        config: Mapping providing the experiment name under ``"name"``.
        history: Object exposing a ``history`` dict with ``'loss'`` and
            ``'val_loss'`` sequences of equal length (one value per epoch).
        _flag: Label of the trained target; written upper-cased in the entry.

    Raises:
        KeyError: If ``'loss'`` or ``'val_loss'`` is missing from the history.
        ValueError: If ``'val_loss'`` is empty or contains only NaN values.
    """
    filepath = os.path.join("experiments", config["name"], "exp_loss.log")

    # First epoch with the lowest validation loss. nanargmin skips NaN
    # epochs (e.g. a diverged run) instead of failing to find any match.
    _idx = int(np.nanargmin(history.history['val_loss']))
    _loss = history.history['loss'][_idx]
    _val_loss = history.history['val_loss'][_idx]

    _str = f"{_flag.upper()} {config['name']}\nTrain Loss: {_loss} | Valid Loss: {_val_loss}\n"

    # Make sure the experiment folder exists so the append cannot fail on a
    # fresh experiment.
    os.makedirs(os.path.dirname(filepath), exist_ok=True)
    with open(filepath, "a") as fhandle:
        fhandle.write(_str)

def get_train_params(config, model_weights_path):
    """Create the learning-rate schedule and the Keras callbacks for a run.

    Args:
        config: Experiment configuration; reads ``config["name"]`` and the
            ``learning_rate``, ``decay_steps`` and ``decay_rate`` entries of
            ``config["train"]``.
        model_weights_path: File path handed to the checkpoint callback.

    Returns:
        A ``(learning_rate_schedule, callbacks)`` tuple where ``callbacks`` is
        ``[TensorBoard, ModelCheckpoint]`` in that order.
    """
    train_cfg = config["train"]
    lr_schedule = tf.keras.optimizers.schedules.ExponentialDecay(
        initial_learning_rate=train_cfg["learning_rate"],
        decay_steps=train_cfg["decay_steps"],
        decay_rate=train_cfg["decay_rate"],
    )

    # TensorBoard event files go to experiments/<name>/logs.
    tensorboard_cb = tf.keras.callbacks.TensorBoard(
        log_dir=os.path.join("experiments", config['name'], "logs")
    )

    # Only write a checkpoint when the validation loss improves.
    checkpoint_cb = tf.keras.callbacks.ModelCheckpoint(
        model_weights_path,
        save_best_only=True,
        monitor='val_loss',
        mode='min',
    )

    return lr_schedule, [tensorboard_cb, checkpoint_cb]


In [9]:
# engage_keys = ["group", "item", "channel", "headline", "storySummary"]
# spend_keys = ["channel", "headline", "storySummary"]

def select_keys(encoded_values, _keys):
    """Pick a subset of encoded features and report their original positions.

    Args:
        encoded_values: Mapping of feature name -> encoded value(s).
        _keys: Iterable of feature names to keep, in the desired order.

    Returns:
        A ``(filtered_encoded_values, encoded_indexes)`` tuple:
        ``filtered_encoded_values`` is a dict restricted to ``_keys`` (in the
        order of ``_keys``) and ``encoded_indexes`` lists, for each key, its
        position in the key order of ``encoded_values``.

    Raises:
        KeyError: If a requested key is not present in ``encoded_values``.
    """
    filtered_encoded_values = {_key: encoded_values[_key] for _key in _keys}
    # Build the name -> position lookup once instead of re-listing and
    # linearly scanning the keys for every requested name.
    position_of = {name: pos for pos, name in enumerate(encoded_values.keys())}
    encoded_indexes = [position_of[_key] for _key in _keys]
    return filtered_encoded_values, encoded_indexes


def _assemble_model_inputs(num_features, cat_features, bert_features, use_bert):
    """Build the input list for the model: numerical block, one array per categorical column, then (optionally) BERT features."""
    inputs = [num_features]
    inputs.extend(cat_features[:, idx] for idx in range(cat_features.shape[1]))
    if use_bert:
        inputs.append(bert_features)
    return inputs


def train_target_model(config, encoded_values_indexes, _flag): # "engagements or media_spend"
    """Build, train and persist the model for one target.

    Steps: build the network, compile it with Adam + mean squared error,
    load the pre-processed pickle, fit, reload the best checkpoint, save the
    model as ``.keras``, pickle the training history and append the best
    losses to the experiment log.

    Args:
        config: Experiment configuration. Reads ``config["name"]``,
            ``config["pre_process"]["processed_data_path"]`` and the
            ``use_bert``, ``use_test_as_valid``, ``batch_size``, ``epochs`` and
            ``valid_split`` entries of ``config["train"]`` (plus whatever
            ``get_train_params`` and the model builders read).
        encoded_values_indexes: ``(encoded_values, encoded_indexes)`` pair as
            returned by ``select_keys``; the indexes select the categorical
            columns that are fed to the model.
        _flag: Name of the target; used to index the label containers and to
            name the output folder ``model_<_flag>``.

    Returns:
        None. Artifacts are written under ``experiments/<name>/model_<_flag>/``.
    """
    encoded_values, encoded_indexes = encoded_values_indexes
    use_bert = config["train"]["use_bert"]

    if use_bert:
        # NOTE(review): `bert_mlp` is not imported in the cells visible here
        # (only `emb_mlp` is) — this branch raises NameError unless it is
        # defined elsewhere; confirm the import.
        model = bert_mlp(config, encoded_values)
    else:
        model = emb_mlp(config, encoded_values)

    model.summary()

    model_weights_path = os.path.join("experiments", config["name"], f"model_{_flag}/model.hdf5")
    _learning_rate, _callbacks = get_train_params(config, model_weights_path)

    _opt  = tf.keras.optimizers.Adam(learning_rate=_learning_rate)
    _loss = tf.keras.losses.MeanSquaredError()

    model.compile(optimizer=_opt, loss=_loss)

    # Data: element 0 is the train split, element 1 the test split; each one
    # unpacks into (numerical, categorical, bert, targets).
    data_path = config["pre_process"]["processed_data_path"]
    pkl_data = load_pickle(data_path)

    train_num, train_cat, train_bert, train_true = pkl_data[0]
    test_num, test_cat, test_bert, test_true = pkl_data[1]

    # Keep only the categorical columns selected for this target.
    train_cat = train_cat[:, encoded_indexes]
    test_cat = test_cat[:, encoded_indexes]

    # Keep only the labels of the target being trained.
    train_true = train_true[_flag]
    test_true = test_true[_flag]

    train_data = _assemble_model_inputs(train_num, train_cat, train_bert, use_bert)

    if config["train"]["use_test_as_valid"]:
        test_data = _assemble_model_inputs(test_num, test_cat, test_bert, use_bert)
        _valid_data = (test_data, test_true)
    else:
        _valid_data = None

    # NOTE: per the Keras docs, `validation_data` takes precedence over
    # `validation_split` when both are given, so the split only applies when
    # `use_test_as_valid` is off.
    history = model.fit(
            x=train_data,
            y=train_true,
            batch_size=config["train"]["batch_size"],
            epochs=config["train"]["epochs"],
            verbose="auto",
            callbacks=_callbacks,
            validation_split=config["train"]["valid_split"],
            validation_data=_valid_data,
            shuffle=True,
            initial_epoch=0
        )

    # Restore the best checkpoint before exporting the final model.
    model.load_weights(model_weights_path)
    model_path = model_weights_path.replace(".hdf5",".keras")
    model.save(model_path)

    # The whole History object is pickled (readers access `.history` on it).
    history_path = model_weights_path.replace("model.hdf5", "train_history.pkl")
    save_pickle(history_path, history)
    log_loss(config, history, _flag)

In [13]:
# Load the experiment configuration from configs/baseline.json.
config_filepath = "configs/baseline.json"
config = load_config(config_filepath)

# Override the number of epochs for this run; only the in-memory config is
# changed in this cell.
config["train"]["epochs"] = 100

# Echo the effective configuration so the run is traceable from the output.
print("Changed Config: \n\n", config)

Changed Config: 

 {'name': 'baseline', 'seed': 42, 'datapath': '../problem_merged_data.csv', 'pre_process': {'processed_data_path': 'processed_data/data_filtered_split0.8.pkl', 'num_input': ['bid', 'budget'], 'cat_input': None, 'engage_cat_input': ['group', 'item', 'channel', 'date', 'headline', 'storySummary', 'IABCategory', 'targetGeo', 'targetInterest', 'targetAge', 'targetOs', 'targetDevices', 'targetGender', 'targetLanguages', 'CATEGORY_1'], 'spend_cat_input': ['group', 'item', 'channel', 'date', 'headline', 'storySummary', 'IABCategory', 'targetGeo', 'targetInterest', 'targetAge', 'targetOs', 'targetDevices', 'targetGender', 'targetLanguages', 'CATEGORY_1'], 'target_output': ['engagements', 'media_spend'], 'train_split': 0.8, 'remove_outliers': False, 'std_limit': 3}, 'train': {'cat_emb_dim': 4, 'learning_rate': 0.001, 'decay_steps': 20, 'decay_rate': 0.98, 'dense_dims': [1024, 1024, 1024, 1024, 1024, 1024, 16], '_drop_out_rate': 0.2, 'use_test_as_valid': True, 'valid_split': 0.

In [14]:
# Targets to train, as listed in the config.
desired_models = config["pre_process"]["target_output"]

# (target name, config entry holding that target's categorical inputs)
_CAT_INPUT_CONFIG_KEYS = (
    ("engagements", "engage_cat_input"),
    ("media_spend", "spend_cat_input"),
)

# The [:1] slice restricts this run to the first listed target only.
for _flag in desired_models[:1]:
    encoded_values_path = os.path.join("experiments", config["name"], "encoded_values.pkl")
    encoded_values = load_pickle(encoded_values_path)
    print(encoded_values.keys())

    # Look up the categorical-input list configured for this target; unknown
    # targets fall back to every encoded feature.
    _cat_input_key = next(
        (_cfg_key for _target, _cfg_key in _CAT_INPUT_CONFIG_KEYS if _flag == _target),
        None,
    )
    if _cat_input_key is None:
        _keys = list(encoded_values.keys())
    else:
        _keys = config["pre_process"][_cat_input_key]

    encoded_values_indexes = select_keys(encoded_values, _keys)

    train_target_model(config, encoded_values_indexes, _flag)

dict_keys(['group', 'item', 'channel', 'date', 'headline', 'storySummary', 'IABCategory', 'targetGeo', 'targetInterest', 'targetAge', 'targetOs', 'targetDevices', 'targetGender', 'targetLanguages', 'CATEGORY_1'])
Model: "model_2"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_34 (InputLayer)          [(None, 1)]          0           []                               
                                                                                                  
 input_35 (InputLayer)          [(None, 1)]          0           []                               
                                                                                                  
 input_36 (InputLayer)          [(None, 1)]          0           []                               
                                                                             

 flatten_2 (Flatten)            (None, 60)           0           ['concatenate_4[0][0]']          
                                                                                                  
 concatenate_5 (Concatenate)    (None, 62)           0           ['input_33[0][0]',               
                                                                  'flatten_2[0][0]']              
                                                                                                  
 dense_16 (Dense)               (None, 1024)         64512       ['concatenate_5[0][0]']          
                                                                                                  
 dropout_14 (Dropout)           (None, 1024)         0           ['dense_16[0][0]']               
                                                                                                  
 dense_17 (Dense)               (None, 1024)         1049600     ['dropout_14[0][0]']             
          

Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78/100
Epoch 79/100
Epoch 80/100
Epoch 81/100
Epoch 82/100
Epoch 83/100
Epoch 84/100
Epoch 85/100
Epoch 86/100
Epoch 87/100
Epoch 88/100
Epoch 89/100
Epoch 90/100
Epoch 91/100
Epoch 92/100
Epoch 93/100
Epoch 94/100
Epoch 95/100
Epoch 96/100
Epoch 97/100
Epoch 98/100
Epoch 99/100
Epoch 100/100
INFO:tensorflow:Assets written to: ram://867a8546-13ad-40b1-9712-31ee795502b6/assets


In [15]:
# # For History evaluation
# history_path = os.path.join("experiments", "baseline", "model_engagements", "train_history.pkl")

# history = load_pickle(history_path)

# plt.figure(figsize=(5, 3))
# plt.plot(history.history['loss'])
# plt.plot(history.history['val_loss'])
# plt.title('Engagements Model Training')
# plt.ylabel('MSE')
# plt.xlabel('Epoch')
# plt.legend(['Train', 'Val'], loc='upper left')

# plt.grid()
# plt.show()