<a href="https://colab.research.google.com/github/gr3ybr0w/cookbook/blob/master/Machine_Learning/Neural_Networks/TensorFlow/TFestimatorAPI.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [0]:
import os
import shutil
import datetime
import numpy as np
import pandas as pd
from sklearn.metrics import mean_squared_error
import tensorflow as tf
from tensorflow.python.data import Dataset
import pickle
import psutil
# Only surface TensorFlow log messages at ERROR level.
tf.logging.set_verbosity(tf.logging.ERROR)

import warnings
# Suppress every Python warning for the rest of the notebook.
warnings.filterwarnings('ignore')
# Do not cap the number of columns pandas shows when displaying a DataFrame.
pd.options.display.max_columns = None

In [0]:
# config
# When True, killTensorboard() is called before training starts.
KILL_TENSORBOARD = True
# When True, MODEL_TYPE_DIR is deleted before training starts.
FRESH_START = True
# Number of passes over the training data; used to derive the total step count.
EPOCHS = 100
# Rows per training batch.
BATCH_SIZE = 40
# Passed as `hidden_units` to the DNNRegressor (one entry per hidden layer).
HIDDEN_UNITS = [1028, 1028, 1028]
# Learning rate handed to the Adam optimizer.
LEARNING_RATE = 0.0001
# Passed as `dropout` to the DNNRegressor.
DROPOUT = 0.25
# TensorFlow version string, recorded in the printout and in info.txt.
VERSION = tf.__version__
# All project paths are derived from the parent of the current working directory.
CWD = os.getcwd()
PARENT_DIR = os.path.split(CWD)[0]
# UTC timestamp taken when this cell runs; later used to name the model-zoo snapshot folder.
DATETIME = datetime.datetime.utcnow()
DATA_DIR = os.path.join(PARENT_DIR, 'data')
MODEL_DIR = os.path.join(PARENT_DIR, 'models')
# Name of the CSV column used as the regression label.
TARGET_COLUMN = 'sumUKQuantity'
# Directory the estimator writes its checkpoints and exports to.
MODEL_TYPE_DIR = os.path.join(MODEL_DIR, 'UKpreds')
# Root directory that trained-model snapshots are copied into.
MODEL_ZOO = os.path.join(PARENT_DIR, 'model_zoo')

# Echo the run configuration, one "label: value" line per setting.
# A single print of newline-joined lines produces the same output as one print per line.
print("\n".join(
    template.format(value)
    for template, value in [
        ("Kill tensorboard: {}", KILL_TENSORBOARD),
        ("Starting a fresh: {}", FRESH_START),
        ("Running for {} epochs", EPOCHS),
        ("Batch size: {}", BATCH_SIZE),
        ("Hidden Units: {}", str(HIDDEN_UNITS)),
        ("Learning rate: {}", LEARNING_RATE),
        ("Dropout: {}", DROPOUT),
        ("Using TensorFlow version: {}", VERSION),
        ("Current working directory: {}", CWD),
        ("Parent director: {}", PARENT_DIR),
        ("Date time: {}", DATETIME),
        ("Data directory: {}", DATA_DIR),
        ("Model directory: {}", MODEL_DIR),
        ("Target column {}", TARGET_COLUMN),
        ("Model type directory: {}", MODEL_TYPE_DIR),
        ("Model zoo directory: {}", MODEL_ZOO),
    ]
))

In [0]:
def killTensorboard(procname="tensorboard.exe"):
    """Kill every running process whose name equals ``procname``.

    Args:
        procname: Exact process name to match. Defaults to the Windows
            executable name ``"tensorboard.exe"``; pass a different name on
            other platforms.

    Processes that disappear while the scan is running, or that the current
    user is not allowed to inspect or kill, are skipped instead of aborting
    the whole scan.
    """
    for proc in psutil.process_iter():
        try:
            # check whether the process name matches
            if proc.name() == procname:
                proc.kill()
        except (psutil.NoSuchProcess, psutil.AccessDenied):
            # The process list is a snapshot: a process may already have exited
            # (NoSuchProcess also covers zombies) or belong to another user.
            continue

In [0]:
# Optionally stop any running TensorBoard process first.
if KILL_TENSORBOARD:
    killTensorboard()

# Optionally wipe the model directory; ignore_errors=True makes a missing directory a no-op.
if FRESH_START:
    shutil.rmtree(MODEL_TYPE_DIR, ignore_errors = True)

In [0]:
# Load the pickled feature-engineering parameters. The rest of the notebook
# reads them by key, e.g. 'UnitCostPrice_min' / 'UnitCostPrice_max' and 'traindf_len'.
# NOTE(security): pickle.load can execute arbitrary code; only load a file you produced yourself.
# The `with` block guarantees the file handle is closed even if unpickling fails.
with open(os.path.join(DATA_DIR, 'feateng_parameters'), 'rb') as fileObject:
    feature_params = pickle.load(fileObject)

In [0]:
def get_vocab_file(col_name):
    """Return the path of the vocabulary file for categorical column ``col_name``.

    The file is ``<DATA_DIR>/catfiles/<col_name>cats.txt``.
    """
    vocab_dir = os.path.join(DATA_DIR, 'catfiles')
    return os.path.join(vocab_dir, ''.join([col_name, 'cats.txt']))

In [0]:
def create_DNNfeature_cols():
    """Build the list of feature columns handed to the DNN regressor.

    Returns:
        A list with the three numeric columns first, followed by one
        ``indicator_column`` per categorical column, in the order given by
        ``categorical_keys`` below.
    """
    fc = tf.feature_column

    # Continuous columns ('Year' is not used as a feature; it was commented out).
    numeric_keys = ['UnitCostPrice', 'RRP_GBP', 'ExVatSalesValue']

    # Categorical columns, in output order.
    categorical_keys = [
        'Week', 'Quarter', 'UKSize', 'BaseColour', 'ProductBrand',
        'Department', 'ProductType', 'ProductSubType', 'Silo', 'SubSilo',
        'Level', 'Sport', 'Supplier',
    ]

    def categorical(key):
        # 'Quarter' has a fixed inline vocabulary; every other column reads
        # its vocabulary from the file returned by get_vocab_file.
        if key == 'Quarter':
            return fc.categorical_column_with_vocabulary_list(
                key=key, vocabulary_list=['1','2','3','4'])
        return fc.categorical_column_with_vocabulary_file(
            key=key, vocabulary_file=get_vocab_file(key), vocabulary_size=None)

    feature_columns = [fc.numeric_column(key=key) for key in numeric_keys]
    categorical_columns = [categorical(key) for key in categorical_keys]
    feature_columns.extend(
        fc.indicator_column(column) for column in categorical_columns)

    return feature_columns

In [0]:
# Expected column layout of the CSV files (presumably mirrors their header row;
# the readers below pass column_names=None, so this list is not handed to TensorFlow).
CSV_COLUMNS = ['StockID','QuickRef','ProductBrand','Department','ProductType',
               'ProductSubType','Silo','SubSilo','Level','BaseColour',
               'Sport','Supplier','UKSize','UnitCostPrice','ExVatSalesValue',
               'RRP_GBP','Quarter','Week','sumUKQuantity','sumNonUKQuantity','totalqty']

# Columns read as floats; every other column is read as a string.
_NUMERIC_CSV_COLUMNS = frozenset(['UnitCostPrice', 'ExVatSalesValue', 'RRP_GBP',
                                  'sumUKQuantity', 'sumNonUKQuantity', 'totalqty'])

# Per-column defaults, one entry per CSV_COLUMNS position: [0.0] for numeric
# columns and ['UNKNOWN'] for string columns. Deriving the list from CSV_COLUMNS
# keeps the two aligned and removes the 'UKKNOWN' typo the hand-written list
# had for 'Quarter'.
DEFAULTS = [[0.0] if column in _NUMERIC_CSV_COLUMNS else ['UNKNOWN']
            for column in CSV_COLUMNS]


def _make_csv_dataset(filename, batch_size, **overrides):
    """Create a ``make_csv_dataset`` reader for ``DATA_DIR/filename``.

    Holds the reader options shared by every input function in this notebook
    so they are defined in exactly one place.

    Args:
        filename: CSV file name inside ``DATA_DIR``.
        batch_size: Number of rows per batch.
        **overrides: Extra or overriding keyword arguments forwarded to
            ``tf.contrib.data.make_csv_dataset`` (e.g. ``shuffle``, ``num_epochs``).

    Returns:
        Whatever ``tf.contrib.data.make_csv_dataset`` returns, with
        ``TARGET_COLUMN`` split off as the label.

    NOTE(review): the features are not passed through ``add_engineered`` here,
    while ``serving_input_fn`` does apply it. Presumably the CSV files already
    contain min-max scaled values; confirm, otherwise training and serving
    see differently scaled inputs.
    """
    options = dict(
        batch_size=batch_size,
        column_names=None,            # infer column names from the CSV header
        column_defaults=DEFAULTS,
        label_name=TARGET_COLUMN,
        num_parallel_reads=8,
        prefetch_buffer_size=1000,
        sloppy=True,
        num_rows_for_inference=None,
    )
    options.update(overrides)
    return tf.contrib.data.make_csv_dataset(
        file_pattern=os.path.join(DATA_DIR, filename), **options)


def _make_single_pass_dataset(filename, batch_size):
    """Reader for evaluation: a single unshuffled pass over ``filename``."""
    return _make_csv_dataset(filename, batch_size, shuffle=False, num_epochs=1)


def get_train():
    """Training input: ``traindf.csv`` in batches of ``BATCH_SIZE``.

    ``shuffle`` and ``num_epochs`` are left at the library defaults.
    """
    return _make_csv_dataset('traindf.csv', BATCH_SIZE)


def get_test():
    """Test input: one unshuffled pass over ``test_df.csv`` in batches of ``BATCH_SIZE``."""
    return _make_single_pass_dataset('test_df.csv', BATCH_SIZE)


def get_test_complete():
    """Test input with batch size ``feature_params['testdf_len']``.

    Presumably that value is the row count of ``test_df.csv``, which would
    make this a single batch; verify against the file that produced
    ``feature_params``.
    """
    return _make_single_pass_dataset('test_df.csv', feature_params.get('testdf_len'))


def get_valid_complete():
    """Validation input with batch size ``feature_params['valid_df_len']``.

    Presumably that value is the row count of ``valid_df.csv``, which would
    make this a single batch; verify against the file that produced
    ``feature_params``.
    """
    return _make_single_pass_dataset('valid_df.csv', feature_params.get('valid_df_len'))

In [0]:
def add_engineered(features):
    """Min-max scale the three numeric features of ``features`` in place.

    Each of 'UnitCostPrice', 'RRP_GBP' and 'ExVatSalesValue' is replaced by
    ``(value - min) / (max - min)``, with ``<name>_min`` and ``<name>_max``
    read from ``feature_params``.

    Args:
        features: Mapping from feature name to value; it is modified in place.

    Returns:
        The same ``features`` mapping.
    """
    for name in ('UnitCostPrice', 'RRP_GBP', 'ExVatSalesValue'):
        shifted = features[name] - feature_params[name + '_min']
        value_range = feature_params[name + '_max'] - feature_params[name + '_min']
        features[name] = shifted / value_range
    return features

In [0]:
def serving_input_fn():
    """Build the ``ServingInputReceiver`` used when the model is exported.

    One placeholder of shape ``[None]`` is created per model input. The model
    receives a copy of those inputs passed through ``add_engineered``; the
    receiver tensors themselves are left untouched.
    """
    # (input name, dtype) pairs, in placeholder creation order.
    input_spec = [
        ('ProductBrand', tf.string),
        ('Department', tf.string),
        ('ProductType', tf.string),
        ('ProductSubType', tf.string),
        ('Silo', tf.string),
        ('SubSilo', tf.string),
        ('Level', tf.string),
        ('BaseColour', tf.string),
        ('Sport', tf.string),
        ('Supplier', tf.string),
        ('UKSize', tf.string),
        ('UnitCostPrice', tf.float32),
        ('ExVatSalesValue', tf.float32),
        ('RRP_GBP', tf.float32),
        ('Quarter', tf.string),
        ('Week', tf.string),
    ]
    receiver_tensors = {name: tf.placeholder(dtype, [None])
                        for name, dtype in input_spec}

    # Scale a shallow copy so the raw placeholders stay the receiver tensors.
    scaled_features = add_engineered(dict(receiver_tensors))
    return tf.estimator.export.ServingInputReceiver(
        features=scaled_features,
        receiver_tensors=receiver_tensors)

In [0]:
# Create estimator train and evaluate function
def train_and_evaluate(output_dir, num_train_steps):
    """Train a ``DNNRegressor`` with periodic evaluation and best-model export.

    Args:
        output_dir: Directory the estimator uses as its ``model_dir``.
        num_train_steps: Total number of training steps. A float is accepted
            and rounded up to the next whole step; ``None`` is passed through
            unchanged.

    Returns:
        The estimator (wrapped with the extra 'rmse' metric) after
        ``tf.estimator.train_and_evaluate`` has returned.
    """
    session_config = tf.ConfigProto()
    session_config.gpu_options.per_process_gpu_memory_fraction = 0.2
    session_config.intra_op_parallelism_threads = 1
    session_config.inter_op_parallelism_threads = 1

    # Half an epoch worth of steps. The estimator API documents step counts as
    # positive ints, so convert the numpy float and never let it round to 0.
    half_epoch_steps = max(1, int(np.round((feature_params['traindf_len'] / BATCH_SIZE) / 2)))

    # Rounding up keeps the stopping step of a fractional step count unchanged.
    max_steps = None if num_train_steps is None else int(np.ceil(num_train_steps))

    my_config = tf.estimator.RunConfig(save_checkpoints_steps=half_epoch_steps,
                                       keep_checkpoint_max=1,
                                       session_config=session_config)

    estimator = tf.estimator.DNNRegressor(hidden_units=HIDDEN_UNITS,
                                          activation_fn=tf.nn.relu,
                                          model_dir=output_dir,
                                          feature_columns=create_DNNfeature_cols(),
                                          optimizer=tf.train.AdamOptimizer(learning_rate=LEARNING_RATE),
                                          dropout=DROPOUT,
                                          config=my_config
                                          )

    # Add rmse evaluation metric
    def rmse(labels, predictions):
        pred_values = tf.cast(predictions['predictions'], tf.float64)
        return {'rmse': tf.metrics.root_mean_squared_error(tf.cast(labels, tf.float64), pred_values)}
    estimator = tf.contrib.estimator.add_metrics(estimator, rmse)

    # Hook that stops training when the evaluation 'rmse' has not decreased
    # for 40000 steps; the condition is checked every 60 seconds.
    hook = tf.contrib.estimator.stop_if_no_decrease_hook(estimator,
                                                         metric_name='rmse',
                                                         max_steps_without_decrease=40000,
                                                         eval_dir=None,
                                                         min_steps=0,
                                                         run_every_secs=60,
                                                         run_every_steps=None)
    train_spec = tf.estimator.TrainSpec(input_fn=get_train, max_steps=max_steps, hooks=[hook])

    exporter = tf.estimator.BestExporter('exports', serving_input_fn)
#     exporter = tf.estimator.LatestExporter('UK', serving_input_fn)

    eval_spec = tf.estimator.EvalSpec(input_fn=get_test_complete,
                                      steps=half_epoch_steps,
                                      exporters=exporter, # set the model to be exported
                                      start_delay_secs = 0, # start evaluating after N seconds,
                                      throttle_secs = 0  # evaluate every N seconds
                                      )

    tf.estimator.train_and_evaluate(estimator, train_spec, eval_spec)
    return estimator

In [0]:
# Total number of training steps: EPOCHS passes over 'traindf_len' rows,
# BATCH_SIZE rows per step. `/` makes this a float under Python 3.
Steps = (EPOCHS * feature_params['traindf_len']) / BATCH_SIZE
Steps

In [0]:
# Number of training steps (batches of BATCH_SIZE rows) in one epoch; displayed for reference only.
(feature_params['traindf_len'] / BATCH_SIZE)

In [0]:
# Train for `Steps` steps, writing checkpoints and exports under MODEL_TYPE_DIR.
estimator = train_and_evaluate(output_dir=MODEL_TYPE_DIR, num_train_steps=Steps)

## testing

In [0]:
# Evaluate the trained estimator on the test set read by get_test_complete.
# The validation and training evaluations below are currently disabled.
test_evaluations = estimator.evaluate(input_fn=get_test_complete)
# validation_evaluations = estimator.evaluate(input_fn=get_valid_complete)

# print("Training data evaluations: {}".format(train_evaluations))
print("Test data evaluations:\n {}".format(test_evaluations))
# print("Validation data evaluations:\n {}".format(validation_evaluations))

In [0]:
# creating an info file to be copied to the model zoo

# Build the path with os.path.join: a hard-coded '\\' separator only works on
# Windows. Elsewhere it would create a file literally named 'UKpreds\info.txt'
# next to the model directory instead of inside it.
with open(os.path.join(MODEL_TYPE_DIR, 'info.txt'), 'w') as info_file:
    info_file.write('\nConfig for Initbuy')
    info_file.write('\nEpochs: {}'.format(EPOCHS))
    info_file.write('\nBatch size: {}'.format(BATCH_SIZE))
    info_file.write('\nHidden Units: {}'.format(HIDDEN_UNITS))
    info_file.write('\nLearning rate: {}'.format(LEARNING_RATE))
    info_file.write('\nDropout: {}'.format(DROPOUT))
    info_file.write('\nTF version: {}'.format(VERSION))
    info_file.write("\nTest data evaluations:\n {}".format(test_evaluations))
#     info_file.write("\nValidation data evaluations:\n {}".format(validation_evaluations))

In [0]:
killTensorboard() # needs to stop before files can be copied

# making ukpreds if it doesn't yet exist
# os.makedirs also creates MODEL_ZOO itself when it is missing, which a plain
# os.mkdir would fail on.
UKpreds_ZOO_PATH = os.path.join(MODEL_ZOO, 'UKpreds')
os.makedirs(UKpreds_ZOO_PATH, exist_ok=True)

# make path for this version of the model
# The folder is named after DATETIME at minute resolution. os.mkdir is kept
# strict on purpose: it raises if a snapshot with the same timestamp already
# exists, rather than silently mixing two snapshots.
UKpreds_ZOO_PATH_DT = os.path.join(UKpreds_ZOO_PATH, datetime.datetime.strftime(DATETIME, '%Y_%m_%d_%H_%M'))
os.mkdir(UKpreds_ZOO_PATH_DT)

# copy the data over
def copytree(src, dst, symlinks=False, ignore=None):
    """Copy the contents of directory ``src`` into directory ``dst``.

    Unlike ``shutil.copytree``, ``dst`` may already exist; it is created
    (including missing parents) when it does not.

    Args:
        src: Directory whose entries are copied.
        dst: Destination directory.
        symlinks: Forwarded to ``shutil.copytree`` for sub-directories.
        ignore: Optional callable ``ignore(directory, names)`` returning the
            names to skip, as accepted by ``shutil.copytree``. It is applied
            to the top-level entries of ``src`` and forwarded for
            sub-directories.

    Raises:
        FileExistsError: From ``shutil.copytree`` if a sub-directory of
            ``src`` already exists under ``dst``.
    """
    # Without this, copying a plain file into a missing dst fails.
    os.makedirs(dst, exist_ok=True)

    names = os.listdir(src)
    skipped = set(ignore(src, names)) if ignore is not None else set()

    for item in names:
        if item in skipped:
            continue
        s = os.path.join(src, item)
        d = os.path.join(dst, item)
        if os.path.isdir(s):
            shutil.copytree(s, d, symlinks, ignore)
        else:
            shutil.copy2(s, d)

# Copy everything in the trained model directory into the timestamped model-zoo folder.
copytree(MODEL_TYPE_DIR, UKpreds_ZOO_PATH_DT)