# Embeddings Model Pipeline

**Instructions:** Either fill out the config files in this folder before the run, or manually update the various config dictionaries with hardcoded values in this Jupyter notebook.

**My recommendation:** Fill out the config files first and then you can make the iterative and interactive changes in this script.

In [None]:
import yaml
import pymysql
import time
import datetime as dt
import gadgets as gd
import pandas as pd
import numpy as np
import pickle
import nltk
import pymysql
import os
from nltk.tokenize import word_tokenize
from nltk.tokenize import RegexpTokenizer
from nltk.corpus  import stopwords
from nltk.stem import PorterStemmer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
import tensorflow as tf
from tensorflow.keras.models import Model
from tensorflow.keras.preprocessing import sequence
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.layers import GRU, RNN, Conv1D, GlobalAveragePooling1D
from sklearn.model_selection import cross_val_score
from sklearn.metrics import mean_squared_error
from tensorflow.keras.models import Sequential, Model
from tensorflow.keras.layers import Dense, Dropout, BatchNormalization, Input, Flatten, LSTM, Lambda, Embedding,concatenate 
from tensorflow.keras.preprocessing import sequence
from tensorflow.keras.optimizers import SGD, Adam, RMSprop,Adagrad,Adamax,Nadam
from tensorflow.keras import utils
from tensorflow.keras.utils import plot_model
from tensorflow.keras.callbacks import EarlyStopping,ModelCheckpoint,ReduceLROnPlateau,LearningRateScheduler
from tensorflow.keras.preprocessing.text import Tokenizer
import pickle

# Thread / device budget shared by every option of the TF1-compat session below.
jobs = 20

_session_options = {
    'intra_op_parallelism_threads': jobs,
    'inter_op_parallelism_threads': jobs,
    'allow_soft_placement': True,
    'device_count': {'CPU': jobs},
}
config = tf.compat.v1.ConfigProto(**_session_options)

# Register the configured session as the one the Keras backend uses.
session = tf.compat.v1.Session(config=config)
tf.compat.v1.keras.backend.set_session(session)

In [None]:
def _load_yaml_config(path):
    """Parse the YAML file at ``path`` (relative to the working directory) and return its content."""
    with open(path, "r") as config_file:
        return yaml.safe_load(config_file)


# One config object per pipeline stage; each is read from its own YAML file.
data_config = _load_yaml_config("read_data_config.yml")

model_info_config = _load_yaml_config("model_info_table_config.yml")

model_config = _load_yaml_config("train_model_config.yml")

pipeline_config = _load_yaml_config("run_model_pipeline_config.yml")

process_config = _load_yaml_config("process_data_config.yml")

## Pre-Run

#### Creates folder structure for model

In [None]:
def make_folder_structure(model_train_date):
    """Create the output folder tree for one model run and return its path.

    The run folder lives under ``<cwd>/models/`` and is named after
    ``model_train_date`` with the space replaced by an underscore, so the
    folder name matches the training timestamp recorded elsewhere.
    (Previously the argument was ignored and the clock was read a second
    time, which produced a slightly different timestamp.)

    Args:
        model_train_date: Timestamp (string or datetime) identifying the run.
            When empty/None the current time is used.

    Returns:
        The full path of the run folder, which contains the sub-folders
        ``encoders``, ``scalers``, ``imputers`` and ``tokenizers``.
    """
    model_data_base_path = os.getcwd() + '/models/'

    stamp = model_train_date if model_train_date else dt.datetime.now()
    model_folder = "_".join(str(stamp).split(" "))
    model_full_path = model_data_base_path + model_folder

    # makedirs creates the missing parents (``models/`` and the run folder)
    # and tolerates folders that already exist.
    for sub_folder in ("encoders", "scalers", "imputers", "tokenizers"):
        os.makedirs(model_full_path + "/" + sub_folder, exist_ok=True)

    return model_full_path

In [None]:
# Timestamp identifying this training run.
model_train_date = str(dt.datetime.now())

# Expiry date = today + the configured retraining interval (in days), kept as a date-only string.
_retrain_interval = dt.timedelta(days = pipeline_config['model_train_interval'])
model_expires_date = str((dt.datetime.now() + _retrain_interval).date())

# create path to save stuff in
model_path = make_folder_structure(model_train_date)
print(model_path)

## Data Fetching

#### If you want to Hardcode training, test, validation dates, run this next cell. (if skipped, the config file options will be used)

In [None]:
# Hard-coded date windows (all bounds inclusive) that override the values from
# process_data_config.yml for this interactive run.
date_dict = {}
# Window that is scored/uploaded (split_by_date falls back to it as the test window).
date_dict['prediction_date_end'] = dt.date(2021, 3, 5)
date_dict['prediction_date_start'] = dt.date(2021, 3, 2)

# NOTE(review): validation starts 2021-02-27 while training ends 2021-02-28, so the
# two windows overlap by two days — confirm this is intended before trusting the
# validation metrics.
date_dict['validation_date_end'] = dt.date(2021, 3, 1)
date_dict['validation_date_start'] = dt.date(2021, 2, 27)

date_dict['train_date_end'] = dt.date(2021, 2, 28)
date_dict['train_date_start'] = dt.date(2021, 2, 10)

# Keys from date_dict win over the same keys loaded from the config file.
process_config = {**process_config, **date_dict}

##### Utility functions for reading data below

In [None]:
def query_formatter(**kwargs):
    """Fill the ``sme_query_template`` from ``data_config`` with the given keyword values."""
    template = data_config["sme_query_template"]
    return template.format(**kwargs)

def connect_db(db_dict):
    """Open a pymysql connection, passing ``db_dict`` as the connection keyword arguments."""
    connection = pymysql.connect(**db_dict)
    return connection

def make_date_list(start_date: str, end_date: str, format='%Y-%m-%d'):
    """Build consecutive one-day windows between two dates.

    Each row of the result is ``[day, day + 1]``; the first row starts at
    ``start_date`` and the last row ends at ``end_date - 1 day``.

    Both inputs are parsed exactly once with ``format``. (The previous
    version normalised them to ISO strings and then parsed those strings with
    ``format`` again, which raised for any format other than ``%Y-%m-%d``.)

    Args:
        start_date: First day, as a string in ``format``.
        end_date: Exclusive upper bound, as a string in ``format``.
        format: ``strptime`` pattern of both inputs.

    Returns:
        ``numpy`` array of dtype ``datetime64[D]`` and shape ``(n, 2)``;
        ``n`` is 0 when the range spans fewer than two days.
    """
    first_day = dt.datetime.strptime(start_date, format).date()
    last_day = dt.datetime.strptime(end_date, format).date()
    one_day = dt.timedelta(days=1)

    window_starts = np.arange(first_day.isoformat(),
                              (last_day - one_day).isoformat(),
                              dtype='datetime64[D]')
    window_ends = np.arange((first_day + one_day).isoformat(),
                            last_day.isoformat(),
                            dtype='datetime64[D]')

    date_range = np.concatenate(
        [window_starts.reshape(-1, 1), window_ends.reshape(-1, 1)],
        axis=1)

    return date_range

def make_date_list_from_datetime(start_date, end_date):
    """Build the one-day window list for date objects.

    The end date is pushed out by two days before delegating to
    ``make_date_list``, whose last window stops one day short of its end argument.
    """
    padded_end = end_date + dt.timedelta(2)
    return make_date_list(str(start_date), str(padded_end))

In [None]:
def make_data_dict(start_date, end_date):
    '''
    Makes the data dictionary that is used in formatting the multi-threaded sme queries.

    Starts from a copy of ``data_config`` and adds the date bounds, the
    pre-joined program / feature lists, the formatted query (``sme_query``),
    the per-day windows (``date_list``) and a connection factory (``db_connector``).
    '''
    def sql_list_str(items):
        # bare identifiers, one per line
        return ",\n".join(items)

    def list_str(items):
        # double-quoted values, one per line
        return '"' + '",\n"'.join(items) + '"'

    data_dict = dict(data_config)
    data_dict["date_start"] = start_date
    data_dict["date_end"] = end_date
    data_dict["programid_list_str"] = list_str(data_config['inscope_programs'])
    data_dict["feature_list_str"] = sql_list_str(data_config['raw_feature_list'])
    data_dict["eng_feature_list_str"] = sql_list_str(data_config['eng_feature_list'])

    def _connector(**kwargs):
        # keyword arguments are accepted but ignored; the connection always uses db_args
        return connect_db(data_dict['db_args'])

    data_dict['sme_query'] = query_formatter(**data_dict)
    data_dict['date_list'] = make_date_list_from_datetime(start_date, end_date)
    data_dict['db_connector'] = _connector
    return data_dict
def read_data(data_dict):
    '''
    Runs multi-threaded sme queries based on the given data dictionary.

    Prints a start and a finish timestamp around the read and returns the
    result after passing it through ``gd.sql.trans.lower_columns``.
    '''
    production_reader = gd.sql.MultiSQL(connector=data_dict['db_connector'])
    print(">>> start: {}".format(time.asctime()))

    query_gen = gd.sql.make_query_gen(
        data_dict['sme_query'],
        vals=data_dict['date_list'],
    )
    raw = production_reader.get_data(
        query_gen=query_gen,
        threads=data_dict['read_threads'],
    )
    data = gd.sql.trans.lower_columns(raw)

    print("<<< finished sme read: {}".format(time.asctime()))
    return data

#### Reads data and saves it into All Data

In [None]:
# read data: a single pull from the first training day through the last prediction day
_read_window = make_data_dict(
    process_config['train_date_start'],
    process_config['prediction_date_end'],
)
all_data = read_data(_read_window)

## Data Pre-processing

0. Indexes data based on index specified in config files. can be multi-index.
1. Splits the data into train, validation, test
2. Splits data by type (numerical, categorical, text, utility) as specified in the config file
3. Imputes numerical data, scales numerical data using the Standard scaler
4. Imputes missing categories with an 'NL' for no label. Creates smalls for categorical variables, then One Hot Encodes the non-smalls categories.
5. Cleans text fields (keeps only alphanumeric characters), removes stop words, and tokenizes. Note: a Porter stemmer is set up in the code, but stemming is not currently applied to the tokens.
6. utility features are skipped over and attached after other data processed. (these are things like vdn, on_off, v_calltime_dt, programid)

In [None]:
def find_non_rare_labels(df, variable, tolerance):
    """Return the values of column ``variable`` whose share of the rows of ``df`` is above ``tolerance``."""
    share_by_label = df.groupby([variable])[variable].count() / len(df)
    is_frequent = (share_by_label > tolerance).values
    return list(share_by_label.index.values[is_frequent])


def remove_dates(df, params):
    """Drop the rows whose date is listed in ``params['excluded_dates']``.

    The date column (``params['date_feature']``) is compared as text, so the
    excluded dates must be written the same way the column renders as strings.

    Args:
        df: Frame containing the date column.
        params: Processing config; ``excluded_dates`` may be missing or None,
            in which case no row is removed.

    Returns:
        A frame containing only the rows that are kept.
    """
    excluded = params.get('excluded_dates') or []
    dates_as_text = df[params['date_feature']].astype('str')
    # `~` is the boolean inversion operator; unary minus on a boolean mask is not.
    keep = ~dates_as_text.isin(list(excluded))
    return df.loc[keep, :]

def split_by_date(df, params, drop_date = True):
    """Slice ``df`` into train / validation / test frames by inclusive date windows.

    Each window is read from ``params`` as ``<name>_date_start`` /
    ``<name>_date_end``. When the test bounds are absent, the prediction
    bounds are copied into ``params`` under the test keys (the dict is
    modified in place). The date column is removed from the returned copies
    unless ``drop_date`` is False.
    """
    # Fall back to the prediction window when no explicit test window is configured.
    for edge in ('start', 'end'):
        test_key = 'test_date_' + edge
        fallback_key = 'prediction_date_' + edge
        if test_key not in params and fallback_key in params:
            params[test_key] = params[fallback_key]

    date_col = params['date_feature']

    def _window(name):
        lower = str(params[name + '_date_start'])
        upper = str(params[name + '_date_end'])
        in_window = (df[date_col] >= lower) & (df[date_col] <= upper)
        return df[in_window].copy()

    parts = [_window(name) for name in ('train', 'validation', 'test')]

    if drop_date:
        for part in parts:
            part.drop(date_col, axis = 1, inplace = True)

    return tuple(parts)

def append_smalls(categories):
    """Return ``categories`` with the catch-all label ``'smalls'`` appended if it is missing.

    Args:
        categories: 1-D numpy array of category labels.

    Returns:
        The same array object when ``'smalls'`` is already present, otherwise
        a new array ending in ``'smalls'``.
    """
    if 'smalls' in categories:
        return categories

    extended = [*categories, 'smalls']
    if categories.dtype.kind == 'U':
        # Fixed-width unicode arrays would truncate 'smalls' to the old width
        # (e.g. '<U1' -> 's'), so let numpy pick a width that fits.
        return np.array(extended)
    return np.array(extended, dtype = categories.dtype)
    
def process_num_preds(num, params, model_path):
    """Prepare numeric features for prediction with the artefacts saved at training time.

    Loads the fitted scaler and the per-column imputation values from
    ``model_path``, treats ``'NA'`` and ``''`` as missing, fills the gaps,
    casts to float32 and applies the scaler.

    Args:
        num: Frame with the numeric features plus the date column.
        params: Processing config (``date_feature`` is read).
        model_path: Run folder containing ``scalers/`` and ``imputers/``.

    Returns:
        Scaled frame with the input's index and the surviving columns.
    """
    # NOTE: pickle executes code on load; only point model_path at trusted run folders.
    with open(model_path + '/scalers/standard_scaler.pickle', 'rb') as handle:
        sc = pickle.load(handle)

    num = num.replace('NA', np.nan)
    num = num.replace('', np.nan)
    num.drop(params['date_feature'], axis = 1, inplace = True)

    # Iterate over a snapshot because columns may be dropped inside the loop.
    for col in list(num.columns):
        try:
            with open(model_path + '/imputers/mode_' + col +'.pickle', 'rb') as handle:
                mode = pickle.load(handle)

            num[col] = num[col].fillna(mode)
            num[col] = num[col].astype(np.float32)
        except Exception:
            # Best effort, as before: a column without an imputer or with
            # non-numeric content is dropped. Narrowed from a bare ``except`` so
            # that KeyboardInterrupt / SystemExit are no longer swallowed.
            num.drop([col], axis = 1, inplace = True)
            print('{} dropped!'.format(col))

    num_cols = num.columns
    df_num = sc.transform(num)
    df_num = pd.DataFrame(df_num, columns = num_cols, index = num.index)

    return df_num

def process_num(num, params, model_path):
    """Impute, split and scale the numeric features, saving the fitted artefacts.

    ``'NA'`` and ``''`` are treated as missing. Every column except the date
    column is cast to float32 and filled with its mode; the mode is pickled
    to ``imputers/mode_<col>.pickle``. Columns for which any of that fails are
    dropped. The frame is then split by date, a ``StandardScaler`` is fitted
    on the training part only, applied to all three parts and pickled to
    ``scalers/standard_scaler.pickle``.

    Args:
        num: Frame with the numeric features plus the date column.
        params: Processing config (date feature and date windows).
        model_path: Run folder containing ``scalers/`` and ``imputers/``.

    Returns:
        Tuple ``(train, validation, test)`` of scaled frames.
    """
    sc = StandardScaler()

    num = num.replace('NA', np.nan)
    num = num.replace('', np.nan)

    # NOTE(review): the mode is computed before the date split, i.e. over
    # train + validation + test rows — confirm this is acceptable.
    # Iterate over a snapshot because columns may be dropped inside the loop.
    for col in list(num.columns):
        if col == params['date_feature']:
            continue
        try:
            num[col] = num[col].astype(np.float32)
            mode = num[col].mode()[0]
            num[col] = num[col].fillna(mode)
            with open(model_path + '/imputers/mode_' + col +'.pickle', 'wb') as handle:
                pickle.dump(mode, handle, protocol=pickle.HIGHEST_PROTOCOL)
        except Exception:
            # Narrowed from a bare ``except`` so that KeyboardInterrupt /
            # SystemExit are no longer swallowed; the best-effort drop is kept.
            num.drop([col], axis = 1, inplace = True)
            print('{} dropped!'.format(col))

    num_train, num_val, num_test = split_by_date(num, params)
    num_cols = num_train.columns

    sc.fit(num_train)

    df_train_num = pd.DataFrame(sc.transform(num_train), columns = num_cols, index = num_train.index)
    df_val_num = pd.DataFrame(sc.transform(num_val), columns = num_cols, index = num_val.index)
    df_test_num = pd.DataFrame(sc.transform(num_test), columns = num_cols, index = num_test.index)

    with open(model_path + '/scalers/standard_scaler.pickle', 'wb') as handle:
        pickle.dump(sc, handle, protocol=pickle.HIGHEST_PROTOCOL)

    return df_train_num, df_val_num, df_test_num

def process_cat_preds(cat, params, model_path):
    """Encode categorical features for prediction with the artefacts saved at training time.

    Missing values (``'NA'``, ``''``, NaN) become ``'NL'``; any value that is
    not in the column's saved list of frequent categories becomes
    ``'smalls'``; the saved one-hot encoder is then applied.

    Args:
        cat: Frame with the categorical features plus the date column.
        params: Processing config (``date_feature`` is read).
        model_path: Run folder containing ``encoders/`` and ``imputers/``.

    Returns:
        One-hot encoded frame with the input's index.
    """
    # NOTE: pickle executes code on load; only point model_path at trusted run folders.
    with open(model_path + '/encoders/one_hot_encoder.pickle', 'rb') as handle:
        ohe = pickle.load(handle)

    cat = cat.replace('NA', np.nan)
    cat = cat.replace('', np.nan)
    cat = cat.replace(np.nan, 'NL')
    cat.drop(params['date_feature'], axis = 1, inplace = True)

    for col in cat.columns:
        with open(model_path + '/imputers/smalls_' + col +'.pickle', 'rb') as handle:
            frequent_cat = pickle.load(handle)

        cat[col] = np.where(cat[col].isin(frequent_cat), cat[col], 'smalls')

    cat_cols = cat.columns
    # ``get_feature_names`` was removed from scikit-learn; prefer its
    # replacement and keep the old name as a fallback for old installs.
    if hasattr(ohe, 'get_feature_names_out'):
        ohe_cols = ohe.get_feature_names_out(cat_cols.values)
    else:
        ohe_cols = ohe.get_feature_names(cat_cols.values)

    df_cat = ohe.transform(cat.values)
    df_cat = pd.DataFrame(df_cat, columns = ohe_cols, index = cat.index)

    return df_cat


def process_cat(cat, params, model_path):
    """Group rare categories, split by date and one-hot encode, saving the fitted artefacts.

    Missing values (``'NA'``, ``''``, NaN) become ``'NL'``. For every column
    except the date column, values whose share of rows is not above
    ``params['smalls']`` are replaced by ``'smalls'`` and the list of frequent
    values is pickled to ``imputers/smalls_<col>.pickle``. The encoder's
    categories are the sorted values seen in the training part plus
    ``'smalls'``; the fitted encoder is pickled to
    ``encoders/one_hot_encoder.pickle``.

    The categories are passed to the encoder explicitly instead of editing
    ``categories_`` after ``fit``: the encoder's output width is fixed at fit
    time in recent scikit-learn releases, so a post-fit edit is not reliable.
    The constructor and feature-name calls fall back to the old spellings so
    that old installs keep working.

    Args:
        cat: Frame with the categorical features plus the date column.
        params: Processing config (date feature, date windows, ``smalls``).
        model_path: Run folder containing ``encoders/`` and ``imputers/``.

    Returns:
        Tuple ``(train, validation, test)`` of one-hot encoded frames.
    """
    cat = cat.replace('NA', np.nan)
    cat = cat.replace('', np.nan)
    cat = cat.replace(np.nan, 'NL')

    for col in cat.columns:
        if col != params['date_feature']:
            frequent_cat = find_non_rare_labels(cat, col, params['smalls'])
            cat[col] = np.where(cat[col].isin(frequent_cat), cat[col], 'smalls')

            with open(model_path + '/imputers/smalls_' + col +'.pickle', 'wb') as handle:
                pickle.dump(frequent_cat, handle, protocol=pickle.HIGHEST_PROTOCOL)

    cat_train, cat_val, cat_test = split_by_date(cat, params)
    cat_cols = cat_train.columns

    # Same category lists the encoder would learn from the training data
    # (sorted unique values), with 'smalls' guaranteed to be present.
    known_categories = [
        append_smalls(np.array(sorted(cat_train[col].unique()), dtype = object))
        for col in cat_cols
    ]
    try:
        ohe = OneHotEncoder(categories = known_categories, sparse_output = False)
    except TypeError:
        # older scikit-learn: the argument is still called ``sparse``
        ohe = OneHotEncoder(categories = known_categories, sparse = False)

    ohe.fit(cat_train)

    if hasattr(ohe, 'get_feature_names_out'):
        ohe_cols = ohe.get_feature_names_out(cat_cols.values)
    else:
        ohe_cols = ohe.get_feature_names(cat_cols.values)

    df_train_cat = pd.DataFrame(ohe.transform(cat_train.values), columns = ohe_cols, index = cat_train.index)
    df_val_cat = pd.DataFrame(ohe.transform(cat_val.values), columns = ohe_cols, index = cat_val.index)
    df_test_cat = pd.DataFrame(ohe.transform(cat_test.values), columns = ohe_cols, index = cat_test.index)

    with open(model_path + '/encoders/one_hot_encoder.pickle', 'wb') as handle:
        pickle.dump(ohe, handle, protocol=pickle.HIGHEST_PROTOCOL)

    return df_train_cat, df_val_cat, df_test_cat

def clean_text(text_input, stem=False):
    """Normalise free-text values into space-separated tokens.

    Each value is lower-cased, stripped, tokenised with ``word_tokenize``,
    reduced to ``\\w+`` tokens and filtered against the English stop-word
    list. Missing values (None / NaN) become an empty string instead of
    raising; other non-string values are converted with ``str``.

    Args:
        text_input: pandas Series of raw text values.
        stem: When True, each remaining token is Porter-stemmed. Off by
            default, which matches the previous behaviour (the stemmer was
            created but never applied).

    Returns:
        Series of cleaned sentences carrying the index of ``text_input``.
    """
    cleaner = RegexpTokenizer(r'\w+')
    # a set makes the per-token membership test constant time
    stop_words = set(stopwords.words('english'))
    stemmer = PorterStemmer() if stem else None

    sentences = []
    for raw in text_input:
        text = "" if pd.isna(raw) else str(raw).lower().strip()
        tokens = [tok for tok in word_tokenize(text) if tok]
        words = cleaner.tokenize(" ".join(tokens))
        kept = [w for w in words if w not in stop_words]
        if stemmer is not None:
            kept = [stemmer.stem(w) for w in kept]
        sentences.append(" ".join(kept))

    return pd.Series(data = sentences, index = text_input.index)

def process_nlp(nlp, params, prediction_only = False):
    """Clean every text column of ``nlp`` in place (the date column is left untouched).

    Returns the cleaned frame itself when ``prediction_only`` is set,
    otherwise the train / validation / test split produced by ``split_by_date``.
    """
    text_cols = [col for col in nlp.columns if col != params['date_feature']]
    for col in text_cols:
        nlp[col] = clean_text(nlp[col])

    if not prediction_only:
        nlp_train, nlp_val, nlp_test = split_by_date(nlp, params)
        return nlp_train, nlp_val, nlp_test

    return nlp

def process_data_train(in_df, params, model_path):
    """Run the full training-time preprocessing and return ``(train, val, test)``.

    Works on a copy of ``in_df``: sets the configured index, parses the date
    column, casts the target to float64 and removes excluded dates. The
    utility, text, categorical and numeric feature groups are then processed
    separately and re-joined column-wise for each split.
    """
    df = in_df.copy()
    df.set_index(params['index_features'], inplace = True)

    date_col = params['date_feature']
    target_col = params['target_feature']
    df[date_col] = pd.to_datetime(df[date_col])
    df[target_col] = df[target_col].astype(np.float64)

    df = remove_dates(df, params)

    def _group(feature_cols):
        # every group carries the date column so it can be split by date later
        return df[feature_cols + [date_col]].copy()

    util = _group(params['utility_features'] + [target_col])
    nlp = _group(params['nlp_features'])
    cat = _group(params['categoric_features'])
    num = _group(params['numeric_features'])

    cat_parts = process_cat(cat, params, model_path)
    num_parts = process_num(num, params, model_path)
    nlp_parts = process_nlp(nlp, params)
    util_parts = split_by_date(util, params, drop_date = False)

    train, val, test = [
        pd.concat([util_part, cat_part, nlp_part, num_part], axis = 1)
        for util_part, cat_part, nlp_part, num_part
        in zip(util_parts, cat_parts, nlp_parts, num_parts)
    ]

    return train, val, test

def process_data_predict(in_df, params, model_path):
    """Run the prediction-time preprocessing and return one combined frame.

    Works on a copy of ``in_df``: sets the configured index, parses the date
    column and casts the target to float64. Categorical and numeric features
    are transformed with the artefacts saved under ``model_path``, text is
    cleaned, utility columns pass through unchanged, and everything is joined
    column-wise.
    """
    df = in_df.copy()
    df.set_index(params['index_features'], inplace = True)

    date_col = params['date_feature']
    target_col = params['target_feature']
    df[date_col] = pd.to_datetime(df[date_col])
    df[target_col] = df[target_col].astype(np.float64)

    def _group(feature_cols):
        return df[feature_cols + [date_col]].copy()

    util_pred = _group(params['utility_features'] + [target_col])
    nlp = _group(params['nlp_features'])
    cat = _group(params['categoric_features'])
    num = _group(params['numeric_features'])

    cat_pred = process_cat_preds(cat, params, model_path)
    num_pred = process_num_preds(num, params, model_path)
    nlp_pred = process_nlp(nlp, params, prediction_only = True)

    return pd.concat([util_pred, cat_pred, nlp_pred, num_pred], axis = 1)

def process_data(in_df, params, model_path, prediction_only = False):
    """Entry point for preprocessing.

    Registers ``params['nltk_data_dir']`` on the NLTK search path, then runs
    either the prediction-only pipeline or the training pipeline.
    """
    nltk.data.path.append(params['nltk_data_dir'])

    pipeline = process_data_predict if prediction_only else process_data_train
    return pipeline(in_df, params, model_path)

In [None]:
# process all data and save imputers, etc
# Runs the training branch of process_data: returns the train / validation / test
# frames and writes the imputers, scaler and encoder under model_path.
train, val, test = process_data(all_data, process_config, model_path)

## Trains Model and output predictions

In [None]:
def build_seqs_pred(text, col, params, model_path):
    """Turn ``text`` into padded integer sequences with the tokenizer saved for column ``col``.

    Sequences are padded and truncated at the end to ``params['max_length']``.
    """
    tokenizer_file = model_path + '/tokenizers/text_tokenizer_' + col + '.pickle'
    with open(tokenizer_file, 'rb') as handle:
        tokenizer = pickle.load(handle)

    return sequence.pad_sequences(
        tokenizer.texts_to_sequences(text),
        maxlen = params['max_length'],
        padding='post',
        truncating='post',
    )
    
def build_seqs(text_train, text_val, text_test, col, params, model_path):
    """Fit a tokenizer on the training text and encode all three splits.

    The vocabulary is capped at ``params['max_words']`` with ``<OOV>`` as the
    out-of-vocabulary token; sequences are padded and truncated at the end to
    ``params['max_length']``. The fitted tokenizer is pickled under
    ``tokenizers/text_tokenizer_<col>.pickle``.

    Returns the padded sequences for train, validation and test.
    """
    tokenizer = Tokenizer(num_words=params['max_words'], oov_token = '<OOV>')
    tokenizer.fit_on_texts(text_train)

    def _encode(texts):
        as_ids = tokenizer.texts_to_sequences(texts)
        return sequence.pad_sequences(as_ids, maxlen=params['max_length'], padding='post', truncating='post')

    train_sequences_padded = _encode(text_train)
    val_sequences_padded = _encode(text_val)
    test_sequences_padded = _encode(text_test)

    tokenizer_file = model_path + '/tokenizers/text_tokenizer_' + col + '.pickle'
    with open(tokenizer_file, 'wb') as handle:
        pickle.dump(tokenizer, handle, protocol=pickle.HIGHEST_PROTOCOL)

    return train_sequences_padded, val_sequences_padded, test_sequences_padded
    
def make_callbacks(params):
    """Return the Keras callbacks for training.

    Early stopping on ``val_mse`` is always included. ``params['lr_modifier']``
    adds the ``scheduler``-based learning-rate decay (``'decay'``) or
    reduce-on-plateau (``'reducer'``); any other value adds nothing.
    """
    early_stp = EarlyStopping(monitor='val_mse', min_delta=100,
                              patience=5, verbose=1, mode='auto', restore_best_weights=True)
    reducer = ReduceLROnPlateau(monitor="loss", factor=0.1, patience=2, min_delta=100, mode='min')
    decay = LearningRateScheduler(scheduler)

    callbacks = [early_stp]
    modifier = params['lr_modifier']
    if modifier == 'decay':
        callbacks.append(decay)
    elif modifier == 'reducer':
        callbacks.append(reducer)
    return callbacks
    
def scheduler(epoch, lr):
    """Learning-rate schedule: unchanged for epochs 0-4, afterwards ``lr`` divided by the epoch number."""
    return lr if epoch < 5 else lr / epoch

def build_text_model(inp, params):
    """Build the text branch on top of input ``inp``: embedding -> stacked GRUs -> Dense(8).

    GRU ``i`` has ``embedding_dim / 2**i`` units; every GRU except the last
    one returns full sequences so the next GRU can consume them.
    """
    embedding = Embedding(params['max_words'], params['embedding_dim'], input_length = params['max_length'])
    text_model = embedding(inp)

    for layer_num in np.arange(0, params['n_layers_gru']):
        layer_size = int(params['embedding_dim'] / (2**layer_num))
        is_last = layer_num == (params['n_layers_gru'] - 1)
        gru = GRU(layer_size, return_sequences = not is_last)
        text_model = gru(text_model)

    head = Dense(8, activation = params['activation'])
    return head(text_model)
    
def build_nlp_model(train_shape, nlp_len, params):
    """Build and compile the combined tabular + text regression model.

    The tabular branch is a stack of dense layers whose widths halve from
    ``2**(n_layers + 2)`` down to ``2**3``; dropout follows the first layer
    and every later layer wider than 100 units. One text branch (see
    ``build_text_model``) is created per text feature. All branches are
    concatenated into a single-unit output and the model is compiled with
    mean squared error.

    Args:
        train_shape: Number of tabular input features.
        nlp_len: Number of text inputs / text branches.
        params: Model config (layers, activation, dropout, optimizer, ...).

    Returns:
        The compiled Keras model; inputs are the tabular input followed by
        the text inputs.

    Raises:
        ValueError: If ``params['optimizer']`` is not a supported name.
            (Previously an unknown name silently became ``optimizer=None``
            and only failed later inside ``compile``.)
    """
    optimizer_classes = {
        'SGD': SGD,
        'Adam': Adam,
        'RMSprop': RMSprop,
        'Adagrad': Adagrad,
        'Adamax': Adamax,
        'Nadam': Nadam,
    }
    optimizer_name = params['optimizer']
    if optimizer_name not in optimizer_classes:
        raise ValueError(
            "Unsupported optimizer {!r}; expected one of {}".format(
                optimizer_name, sorted(optimizer_classes)))

    inp_feats = Input(shape = (train_shape,))

    model = (Dense(2**(params['n_layers'] + 2), activation = params['activation']))(inp_feats)
    model = (Dropout(params['dropout']))(model)

    for layer_num in np.flip(np.arange(2, params['n_layers'] + 1)):
        layer_size = 2**(layer_num + 1)
        model = (Dense(layer_size, activation = params['activation']))(model)
        if layer_size > 100:
            model = (Dropout(params['dropout']))(model)

    # Building as many text models as there are in nlp seqs
    text_inputs = [Input(shape = (params['max_length'], )) for _ in range(nlp_len)]
    text_models = [build_text_model(inp, params) for inp in text_inputs]

    model = concatenate([model] + text_models)
    model = (Dense(1, activation=params['output_activation']))(model)
    model = Model(inputs = [inp_feats] + text_inputs, outputs = model)

    optimizer = optimizer_classes[optimizer_name](learning_rate=params['learning_rate'])
    model.compile(loss='mean_squared_error', optimizer=optimizer, metrics=['mse'])

    return model
    
def split_data_by_type(df, process_config):
    """Split a processed frame into model features, target, text columns and utility columns.

    Returns copies ``(X, y, nlp, util)``: ``X`` holds the numeric plus one-hot
    columns, ``util`` the utility columns plus the date column.
    """
    feature_cols = process_config['numeric_features'] + process_config['cat_ohe_features']
    utility_cols = process_config['utility_features'] + [process_config['date_feature']]

    X = df[feature_cols].copy()
    y = df[process_config['target_feature']].copy()
    nlp = df[process_config['nlp_features']].copy()
    util = df[utility_cols].copy()

    return X, y, nlp, util
 
def prep_data_pred(pred, model_path, process_config, model_config):
    """Prepare model inputs for prediction.

    Stores in ``process_config['cat_ohe_features']`` every column of ``pred``
    whose name contains one of the categorical feature names, splits the frame
    by type and encodes each text column with its saved tokenizer.

    Returns ``(X, y, nlp_seqs, util)``.
    """
    categoric_names = process_config['categoric_features']
    process_config['cat_ohe_features'] = [
        column for column in pred.columns
        if any(name in column for name in categoric_names)
    ]

    X, y, nlp, util = split_data_by_type(pred, process_config)

    nlp_seqs = [
        build_seqs_pred(nlp[col], col, model_config, model_path)
        for col in nlp.columns
    ]

    return (X, y, nlp_seqs, util)
    
    
def prep_data(train, val, test, model_path, process_config, model_config):
    """Prepare model inputs for training.

    Stores in ``process_config['cat_ohe_features']`` every column of ``train``
    whose name contains one of the categorical feature names, splits each
    frame by type and builds padded sequences per text column (the tokenizer
    is fitted on the training text and saved by ``build_seqs``).

    Returns a flat 12-tuple: ``X, y, nlp_seqs, util`` for train, then
    validation, then test.
    """
    categoric_names = process_config['categoric_features']
    process_config['cat_ohe_features'] = [
        column for column in train.columns
        if any(name in column for name in categoric_names)
    ]

    (X_train, y_train, nlp_train, util_train), \
    (X_val, y_val, nlp_val, util_val), \
    (X_test, y_test, nlp_test, util_test) = [
        split_data_by_type(frame, process_config) for frame in (train, val, test)
    ]

    nlp_seqs_train, nlp_seqs_val, nlp_seqs_test = [], [], []
    buckets = (nlp_seqs_train, nlp_seqs_val, nlp_seqs_test)
    for col in nlp_train.columns:
        per_split = build_seqs(nlp_train[col], nlp_val[col], nlp_test[col], col, model_config, model_path)
        for bucket, seq in zip(buckets, per_split):
            bucket.append(seq)

    return (X_train, y_train, nlp_seqs_train, util_train,
            X_val, y_val, nlp_seqs_val, util_val,
            X_test, y_test, nlp_seqs_test, util_test)

def run_model(train, val, test, model_path, process_config, model_config):
    """Train the model, save it under ``<model_path>/model`` and score all three splits.

    Returns the fitted model plus the train / validation / test utility
    frames, each extended with a ``target`` and a ``prediction`` column.
    """
    (X_train, y_train, nlp_seqs_train, util_train,
     X_val, y_val, nlp_seqs_val, util_val,
     X_test, y_test, nlp_seqs_test, util_test) = prep_data(
        train, val, test, model_path, process_config, model_config)

    # Model inputs: the tabular block first, then one sequence array per text column.
    train_inputs = [X_train] + nlp_seqs_train
    val_inputs = [X_val] + nlp_seqs_val
    test_inputs = [X_test] + nlp_seqs_test

    model = build_nlp_model(X_train.shape[1], len(nlp_seqs_train), model_config)
    model.fit(
        train_inputs,
        y_train,
        validation_data = (val_inputs, y_val),
        epochs = model_config['epochs'],
        batch_size = model_config['batch_size'],
        callbacks = make_callbacks(model_config),
        verbose = model_config['verbose']
    )

    model.save(model_path + "/model")

    train_preds = model.predict(train_inputs)
    val_preds = model.predict(val_inputs)
    test_preds = model.predict(test_inputs)

    scored = (
        (util_train, y_train, train_preds),
        (util_val, y_val, val_preds),
        (util_test, y_test, test_preds),
    )
    for util_frame, truth, preds in scored:
        util_frame['target'] = truth
        util_frame['prediction'] = preds

    return model, util_train, util_val, util_test

def run_model_predict(pred, model_path, process_config, model_config):
    """Load the model saved under ``<model_path>/model`` and score ``pred``.

    Returns the loaded model and the utility frame extended with a ``target``
    and a ``prediction`` column.
    """
    X, y, nlp_seqs, util = prep_data_pred(pred, model_path, process_config, model_config)

    model = tf.keras.models.load_model(model_path + "/model")

    # tabular block first, then one sequence array per text column
    model_inputs = [X] + nlp_seqs
    preds = model.predict(model_inputs)

    util['target'] = y
    util['prediction'] = preds

    return model, util

In [None]:
# Train the model, save it under model_path, and collect one frame per split
# holding the utility columns plus the 'target' and 'prediction' columns.
model, out_train, out_val, out_test = run_model(train, val, test, model_path, process_config, model_config)

In [None]:
# Print the architecture of the trained model.
model.summary()

In [None]:
# Draw the model graph, including the tensor shapes of every layer.
tf.keras.utils.plot_model(model, show_shapes=True)

## Gets Validations

In [None]:
def w_cov(x, y, w):
    """Weighted covariance of ``x`` and ``y`` with weights ``w`` (normalised by the sum of the weights)."""
    x_centered = x - np.average(x, weights = w)
    y_centered = y - np.average(y, weights = w)
    return np.sum(w * x_centered * y_centered) / np.sum(w)
def w_corr(x, y, w):
    """Weighted Pearson correlation: weighted covariance divided by the product of the weighted standard deviations."""
    covariance = w_cov(x, y, w)
    variance_product = w_cov(x, x, w)*w_cov(y, y, w)
    return covariance / np.sqrt(variance_product)
def get_weighted_corr(df, pred, truth):
    """Correlation between columns ``pred`` and ``truth`` of ``df``, weighting each row by ``calls_off + calls_on``."""
    call_volume = df['calls_off'] + df['calls_on']
    return w_corr(df[pred], df[truth], call_volume)
    
def get_overall_lift(gain_df):
    """Aggregate increments and call counts per ``v_calltime_dt`` and derive the lifts.

    ``pred_lift`` and ``true_lift`` are the summed predicted / true increments
    divided by the summed ``calls_on``.
    """
    aggregations = {
        'pred_incrs': ('prediction_incr', np.sum),
        'true_incrs': ('target_incr', np.sum),
        'calls_on': ('calls_on', np.sum),
        'calls_off': ('calls_off', np.sum),
    }
    per_day = gain_df.groupby('v_calltime_dt').agg(**aggregations)

    for lift_col, incr_col in (('pred_lift', 'pred_incrs'), ('true_lift', 'true_incrs')):
        per_day[lift_col] = per_day[incr_col] / per_day['calls_on']

    return per_day
    
def get_validation_data(out_train, out_val, out_test):
    """Compute the RMSE between ``target`` and ``prediction`` for train and validation.

    The RMSE is computed directly with numpy: the ``squared=False`` argument of
    scikit-learn's ``mean_squared_error`` used previously is no longer available
    in current releases.

    Args:
        out_train: Frame with ``target`` and ``prediction`` columns.
        out_val: Frame with ``target`` and ``prediction`` columns.
        out_test: Accepted for interface compatibility; not evaluated.

    Returns:
        Dict with the float entries ``train_rmse`` and ``validation_rmse``.

    Raises:
        ValueError: If a frame is empty or holds NaN / infinite values
            (the same inputs the scikit-learn metric rejects).
    """
    def _rmse(frame):
        truth = np.asarray(frame['target'], dtype = np.float64).ravel()
        guess = np.asarray(frame['prediction'], dtype = np.float64).ravel()
        if truth.size == 0:
            raise ValueError("Cannot compute RMSE on an empty frame")
        if not (np.isfinite(truth).all() and np.isfinite(guess).all()):
            raise ValueError("target / prediction contain NaN or infinite values")
        return float(np.sqrt(np.mean((truth - guess) ** 2)))

    val_data = {}

    # RMSES
    val_data['train_rmse'] = _rmse(out_train)
    val_data['validation_rmse'] = _rmse(out_val)

    return val_data

In [None]:
# get validations
# val_data holds the train and validation RMSE; out_test is passed but not scored.
val_data = get_validation_data(out_train, out_val, out_test)

## Upload model data into table

In [None]:
def update_metadata(metadata, model_info_config):
    """Insert one row describing the trained model into the model-info table.

    The statement is ``model_info_config['insert_model_info_query']`` filled
    with ``metadata`` via ``str.format``. The cursor and the connection are
    closed even when a statement fails (previously they leaked on error).

    Args:
        metadata: Values for the placeholders of the insert query.
        model_info_config: Config holding ``insert_model_info_query`` and the
            pymysql connection arguments under ``db_args``.
    """
    # NOTE: the query is built by plain string formatting, so every metadata
    # value must come from a trusted source.
    model_data_update = model_info_config['insert_model_info_query'].format(**metadata)
    model_info_conn = pymysql.connect(**model_info_config['db_args'])

    try:
        cur = model_info_conn.cursor()
        try:
            cur.execute('SET autocommit = 1')
            cur.execute('set sql_safe_updates = 0')
            cur.execute(model_data_update)
        finally:
            cur.close()
    finally:
        model_info_conn.close()
    

In [None]:
# get, and upload metadata
# Later entries override earlier ones on key clashes: model-info config first,
# then the validation metrics, then the processing config, then the run-specific
# values below.
metadata = {
    **model_info_config,
    **val_data,
    **process_config,
    'model_trained_on': model_train_date,
    'model_expires_on': model_expires_date,
    'model_output_base_dir': model_path
}
update_metadata(metadata, model_info_config)

## Upload predictions

In [None]:
def upload_preds(pred_data, process_config, model_info_config, pipeline_config):
    """Write one row per prediction to the output table.

    For every row of ``pred_data`` (index reset to columns) the statement
    ``pipeline_config['output_query']`` is filled with the row's index-feature
    values, its ``prediction`` and all ``pipeline_config`` entries, then
    executed. The cursor and the connection are closed even when a statement
    fails (previously they leaked on error).

    Args:
        pred_data: Frame with a ``prediction`` column, indexed by the index features.
        process_config: Processing config (``index_features`` is read).
        model_info_config: Config holding the pymysql ``db_args``.
        pipeline_config: Config holding ``output_query`` and its other placeholders.
    """
    index_features = process_config['index_features']
    model_info_conn = pymysql.connect(**model_info_config['db_args'])
    try:
        cur = model_info_conn.cursor()
        try:
            cur.execute('SET autocommit = 1')
            cur.execute('set sql_safe_updates = 0')

            for index, row in pred_data.reset_index().iterrows():
                output_data = dict(zip(index_features, row[index_features]))
                output_data['prediction'] = row['prediction']
                # NOTE: the query is built by plain string formatting, so the
                # values must come from a trusted source.
                insert_query = pipeline_config['output_query'].format(
                    **output_data,
                    **pipeline_config
                )
                cur.execute(insert_query)
        finally:
            cur.close()
    finally:
        model_info_conn.close()

In [None]:
# upload predictions
# Only the test-window rows (out_test) are uploaded.
upload_preds(out_test, process_config, model_info_config, pipeline_config)