<a href="https://colab.research.google.com/github/fellowship/deep-and-wide-bandit/blob/dev/TensorFlow_W_D_W%26D.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Required Installs

# Import necessary packages

In [1]:
import tensorflow as tf
tf.test.gpu_device_name()

'/device:GPU:0'

In [2]:
#Mount Google Drive
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [3]:
from pathlib import Path

import pandas as pd
import numpy as np
from zipfile import ZipFile
import re
import json
import pickle as pkl
import re

%matplotlib inline
import matplotlib.pyplot as plt

from functools import partial
import random
import seaborn as sns
import warnings
warnings.filterwarnings("ignore", category=DeprecationWarning)
from pprint import pprint
from sklearn.metrics import confusion_matrix
from IPython.core.interactiveshell import InteractiveShell  
InteractiveShell.ast_node_interactivity = "all"

#Makes panda and numpy easier to read
pd.set_option('display.float_format', lambda x: '%.3f' % x)
np.set_printoptions(precision=3, suppress=True)

# Get the data ready


We extract the weekly dataset CSVs & shortlisted train+valid index CSVs s.t. we have 1 train and 1 valid index CSV per weekly CSV.



In [4]:
def delete_all(directory):
    for item in sorted(directory.rglob('*')):        
        if item.is_file():
          print(f"[INFO] Deleting file {item.name}")
          item.unlink()
        elif item.is_dir():
          delete_all(item)
          item.rmdir()
          print(f"[INFO] Deleting folder {item.name}")

In [5]:
overwrite_zip_extract = False

#Build the list of train_valid items
data_paths_l = [Path("/content/drive/MyDrive/Bandit_Project/BanditsData/Jul_Dec_2019.zip"),
                 Path("/content/drive/MyDrive/Bandit_Project/BanditsData/Jan_Jun_2020.zip"),
                 Path("/content/drive/MyDrive/Bandit_Project/BanditsData/Jul_Sep_2020.zip"),
                Path("/content/drive/MyDrive/Bandit_Project/BanditsData/Old/Train_5pct_Jul_Dec_2019.zip"),
                Path("/content/drive/MyDrive/Bandit_Project/BanditsData/Old/Train_5pct_Jan_Jun_2020.zip")]

#Extract items to a folder in your GDrive
folder = Path("/content/drive/MyDrive/Bandit_Project/aleksey")

#If overwrite flag is set to True - we delete all the files
if overwrite_zip_extract:
  delete_all(folder)
  for path in data_paths_l:
      with ZipFile(path, 'r') as zip_obj:
        print(f"[INFO] Extracting {path.name}:")
        zip_obj.extractall(folder)

#Create train, valid & test folder
train_folder = (folder/'train')
train_folder.mkdir(exist_ok=True)

valid_folder = (folder/'valid')
valid_folder.mkdir(exist_ok=True)

test_folder = (folder/'test')
test_folder.mkdir(exist_ok=True)

## Subsetting Train & Valid

In [6]:
#Function to export data corresponding to chosen indices to train/valid folder
def export_data_subset(data_l, save_folder, idx_l=None, overwrite_flag = False):

  cnt = 0

  #If the folder is not empty and overwrite_flag is set to True, delete all files in the folder
  if overwrite_flag:
    
    print(f"\n[INFO] Deleting files in {save_folder.name} directory...")
    delete_all(save_folder)
  
  #Check whether idx_l is None
  if not(idx_l):
    
    #Just copy-paste all files from data_l to save_folder
    for data_path in data_l:
      
      destination = save_folder/(data_path.name)
      if not destination.exists():
        data_path.replace(destination)
  
  else:

    #Iterate over the (weekly data path, training set indices path) zipped object
    for (data_path, idx_path) in zip(data_l, idx_l):

      if (cnt + 1) % 5 == 0:
        print(f"[INFO] Building {save_folder.name} file from {data_path.name}")
      
      #Use pandas to read weekly data + corresponding index CSV files
      data = pd.read_csv(data_path)
      idx = pd.read_csv(idx_path, header=None, squeeze=True).tolist()

      #Subset the data and save it to the appropriate csv file
      data_subset = data.iloc[idx, :]

      #Save the data subset
      data_subset.to_csv(save_folder/(data_path.name), index=False, compression="gzip", header=True)

    #Increment Counter
    cnt += 1

In [7]:
def process_input(source, dest, x_cols, y_col, overwrite=True, ctype = "infer"):
  
  filename = source.name + ".csv.gz"
  dest_path = dest/filename
  
  #Check whether files exist in destination + should not overwrite - If they do, print an error message  
  if dest_path.exists():
    if not(overwrite):
      print(f"[ERROR] {dest_path.name} currently exists. Pls set overwrite flag to True!")
      return dest_path
    else:
      #Delete current files in the dest folder
      delete_all(dest)

  cnt = 0

  #Iterate over each file in source
  for file_path in source.iterdir():

    #Print update
    print(f"[INFO] Currently working on {source.name}: {file_path.name}")
    
    #Read in the data
    data = pd.read_csv(file_path, compression=ctype, header=[0])

    #Shortlist columns to get the overall CSV
    cols = x_cols + y_col
    subset = data[cols]     

    #Check whether first CSV file —> include header, otherwise ignore
    header_flag = True if not(cnt) else False

    #Save to dest    
    subset.to_csv(dest_path, mode='a', compression="gzip", header=header_flag, index=False)

    #Increment counter
    cnt += 1
  
  #Return the path to the processed input file
  return dest_path

In [8]:
#Setting overwrite flag
overwrite_dset = False

#We need to connect a file like "sends_2019_wk26.csv" WITH "selected_rows_sends_2019_wk26.csv"
weekly_data_path_l = sorted([i for i in folder.iterdir() if re.search("/sends", str(i), re.I)], key=lambda x: str(x))
valid_indices_path_l = sorted([i for i in folder.iterdir() if re.search("/selected_rows_train", str(i), re.I)], key=lambda x: str(x))
#print(f"[INFO] Displaying indices to build validation data for {str(weekly_data_path_l[0])}: {valid_indices_path_l[0]}")

#train_indices_path_l = sorted([i for i in folder.iterdir() if re.search("/selected_rows_sends", str(i), re.I)], key=lambda x: str(x))
#print(f"[INFO] Displaying indices to build training data for {str(weekly_data_path_l[0])}: {train_indices_path_l[0]}")

#Execute only if valid folder is empty
if not(list(valid_folder.iterdir())) or overwrite_dset:

  #Run the export function
  export_data_subset(data_l = weekly_data_path_l, save_folder = valid_folder, 
                     idx_l = valid_indices_path_l, overwrite_flag = overwrite_dset)

else:
  print("[INFO] Validation Data Already Created...\n")

#Check if training folder is empty
if not(list(train_folder.iterdir())) or overwrite_dset:

  #Run the export function
  export_data_subset(data_l = weekly_data_path_l, save_folder = train_folder, overwrite_flag = overwrite_dset)

else:
  print("\n[INFO] Training Data Already Created...")

train_list = sorted([file_path for file_path in train_folder.iterdir()], key = lambda x: str(x))
valid_list = sorted([file_path for file_path in valid_folder.iterdir()], key = lambda x: str(x))
test_list = sorted([file_path for file_path in test_folder.iterdir()], key = lambda x: str(x))

print("\n[INFO] Displaying the first 5 elements of train_list:")
pprint(train_list[:5])
print("\n[INFO] Displaying the first 5 elements of valid_list:")
pprint(valid_list[:5])
print("\n[INFO] Displaying the first 5 elements of test_list:")
pprint(test_list[:5])

[INFO] Validation Data Already Created...


[INFO] Training Data Already Created...

[INFO] Displaying the first 5 elements of train_list:
[PosixPath('/content/drive/MyDrive/Bandit_Project/aleksey/train/sends_2019_wk26.csv'),
 PosixPath('/content/drive/MyDrive/Bandit_Project/aleksey/train/sends_2019_wk27.csv'),
 PosixPath('/content/drive/MyDrive/Bandit_Project/aleksey/train/sends_2019_wk28.csv'),
 PosixPath('/content/drive/MyDrive/Bandit_Project/aleksey/train/sends_2019_wk29.csv'),
 PosixPath('/content/drive/MyDrive/Bandit_Project/aleksey/train/sends_2019_wk30.csv')]

[INFO] Displaying the first 5 elements of valid_list:
[PosixPath('/content/drive/MyDrive/Bandit_Project/aleksey/valid/sends_2019_wk26.csv'),
 PosixPath('/content/drive/MyDrive/Bandit_Project/aleksey/valid/sends_2019_wk27.csv'),
 PosixPath('/content/drive/MyDrive/Bandit_Project/aleksey/valid/sends_2019_wk28.csv'),
 PosixPath('/content/drive/MyDrive/Bandit_Project/aleksey/valid/sends_2019_wk29.csv'),
 PosixPath('/content/dr

In [9]:
#Display head of first element of both training & validation subset
train_sample = pd.read_csv(train_list[0], header=[0])
print("[INFO] Sample Training Data")
train_sample.head()
valid_sample = pd.read_csv(valid_list[0], compression='gzip', header=[0])
print("[INFO] Sample Validation Data")
valid_sample.head()
test_sample = pd.read_csv(test_list[0], header=[0])
print("[INFO] Sample Test Data")
test_sample.head()

#Get the column names of the data
data_col_names = train_sample.columns.tolist()
print("[INFO] The full list of column names include:")
pprint(data_col_names)

[INFO] Sample Training Data


Unnamed: 0,riid,retention_score,frequency_score,recency_score,sends_since_last_open,times_seen,times_open,days_subscr,aq_year,aq_week,aq_dayofweek,aq_period,campaign_id,campaign_category,campaign_Brand,campaign_Core,campaign_Dedicated,campaign_InnovationSpotlight,campaign_NewArrivals,campaign_ProductSpotlight,campaign_Replen,campaign_Tops,campaign_Trend,campaign_Other,discount,promo,sale,is_one_for_free,free_shipping,is_exclusive,has_urgency,sl_contains_price,is_discount_mentioned,message_size,sent_week,sent_dayofweek,sent_hr,opened,unsub,rev_3dv2,reward,optimal_action
0,194725242,1.474,4,0.451,19,0,0,1352,2015,41,6,Non-Holiday,59090182,Tops,0,0,0,0,0,0,0,1,0,0,40,1,0,0,0,0,0,1,0,157528,26,0,17,1,0,77.32,-10,1
1,232343542,28.0,26,7.87,1,0,0,544,2018,0,2,Non-Holiday,59090182,Tops,0,0,0,0,0,0,0,1,0,0,40,1,0,0,0,0,0,1,0,157792,26,0,17,1,0,17.98,8,1
2,58700702,28.0,39,13.089,0,0,0,1581,2015,8,1,Non-Holiday,59090182,Tops,0,0,0,0,0,0,0,1,0,0,40,1,0,0,0,0,0,1,0,157878,26,0,17,1,0,17.98,9,1
3,263122862,28.0,53,15.349,0,0,0,109,2019,10,3,Non-Holiday,59090182,Tops,0,0,0,0,0,0,0,1,0,0,40,1,0,0,0,0,0,1,0,158325,26,0,17,1,0,35.96,9,1
4,987902,28.0,26,11.127,0,0,0,2372,2013,0,1,Other,59090182,Tops,0,0,0,0,0,0,0,1,0,0,40,1,0,0,0,0,0,1,0,156214,26,0,17,1,0,36.97,9,1


[INFO] Sample Validation Data


Unnamed: 0,riid,retention_score,frequency_score,recency_score,sends_since_last_open,times_seen,times_open,days_subscr,aq_year,aq_week,aq_dayofweek,aq_period,campaign_id,campaign_category,campaign_Brand,campaign_Core,campaign_Dedicated,campaign_InnovationSpotlight,campaign_NewArrivals,campaign_ProductSpotlight,campaign_Replen,campaign_Tops,campaign_Trend,campaign_Other,discount,promo,sale,is_one_for_free,free_shipping,is_exclusive,has_urgency,sl_contains_price,is_discount_mentioned,message_size,sent_week,sent_dayofweek,sent_hr,opened,unsub,rev_3dv2,reward,optimal_action
0,263471642,28.0,8,5.379,1,0,0,92,2019,12,6,Non-Holiday,59090182,Tops,0,0,0,0,0,0,0,1,0,0,40,1,0,0,0,0,0,1,0,156730,26,0,17,1,0,75.0,8,1
1,218682462,28.0,25,3.873,0,0,0,804,2017,15,1,Non-Holiday,59090182,Tops,0,0,0,0,0,0,0,1,0,0,40,1,0,0,0,0,0,1,0,158117,26,0,17,1,0,17.98,9,1
2,232668962,28.0,84,15.843,0,0,0,538,2018,1,1,Non-Holiday,59090182,Tops,0,0,0,0,0,0,0,1,0,0,40,1,0,0,0,0,0,1,0,157875,26,0,17,1,0,42.56,9,1
3,165404162,0.757,4,0.109,37,0,0,1474,2015,24,3,Non-Holiday,59090182,Tops,0,0,0,0,0,0,0,1,0,0,40,1,0,0,0,0,0,1,0,155171,26,0,17,0,0,0.0,-38,0
4,211481602,1.647,6,0.391,17,0,0,960,2016,45,6,Non-Holiday,59090182,Tops,0,0,0,0,0,0,0,1,0,0,40,1,0,0,0,0,0,1,0,157379,26,0,17,0,0,0.0,-18,0


[INFO] Sample Test Data


Unnamed: 0,riid,retention_score,frequency_score,recency_score,sends_since_last_open,times_seen,times_open,days_subscr,aq_year,aq_week,aq_dayofweek,aq_period,campaign_id,campaign_category,campaign_Brand,campaign_Core,campaign_Dedicated,campaign_InnovationSpotlight,campaign_NewArrivals,campaign_ProductSpotlight,campaign_Replen,campaign_Tops,campaign_Trend,campaign_Other,discount,promo,sale,is_one_for_free,free_shipping,is_exclusive,has_urgency,sl_contains_price,is_discount_mentioned,message_size,sent_week,sent_dayofweek,sent_hr,opened,unsub,rev_3dv2,reward,optimal_action
0,6092702,28.0,16,5.86,0,96,16,2732,2013,1,5,Non-Holiday,59445402,Dedicated,0,0,1,0,0,0,0,0,0,0,0,1,1,0,0,0,0,1,0,130782,26,0,15,1,0,99.96,9,1
1,197298302,0.622,1,0.207,45,87,7,1681,2015,47,6,Holiday,59445402,Dedicated,0,0,1,0,0,0,0,0,0,0,0,1,1,0,0,0,0,1,0,130531,26,0,15,1,0,79.92,-36,1
2,276708662,28.0,10,12.0,0,3,3,16,2020,24,5,Non-Holiday,59445402,Dedicated,0,0,1,0,0,0,0,0,0,0,0,1,1,0,0,0,0,1,0,129919,26,0,15,1,0,49.98,9,1
3,277083802,28.0,4,11.429,0,0,0,4,2020,26,3,Non-Holiday,59445402,Dedicated,0,0,1,0,0,0,0,0,0,0,0,1,1,0,0,0,0,1,0,130201,26,0,15,1,0,97.9,9,1
4,253962622,28.0,11,3.434,0,132,18,662,2018,36,3,Non-Holiday,59445402,Dedicated,0,0,1,0,0,0,0,0,0,0,0,1,1,0,0,0,0,1,0,130968,26,0,15,1,0,84.9,9,1


[INFO] The full list of column names include:
['riid',
 'retention_score',
 'frequency_score',
 'recency_score',
 'sends_since_last_open',
 'times_seen',
 'times_open',
 'days_subscr',
 'aq_year',
 'aq_week',
 'aq_dayofweek',
 'aq_period',
 'campaign_id',
 'campaign_category',
 'campaign_Brand',
 'campaign_Core',
 'campaign_Dedicated',
 'campaign_InnovationSpotlight',
 'campaign_NewArrivals',
 'campaign_ProductSpotlight',
 'campaign_Replen',
 'campaign_Tops',
 'campaign_Trend',
 'campaign_Other',
 'discount',
 'promo',
 'sale',
 'is_one_for_free',
 'free_shipping',
 'is_exclusive',
 'has_urgency',
 'sl_contains_price',
 'is_discount_mentioned',
 'message_size',
 'sent_week',
 'sent_dayofweek',
 'sent_hr',
 'opened',
 'unsub',
 'rev_3dv2',
 'reward',
 'optimal_action']


In [10]:
#Setting overwrite flag for Processed Data
overwrite_processed = False

#Create a folders to contain the processed records
dest_folder = Path("/content/drive/MyDrive/Bandit_Project/aleksey/processed")
dest_folder.mkdir(exist_ok=True)

#Build the context that you would like to keep track of
user_context_cols = ['riid', 'retention_score', 'frequency_score', 'recency_score', 'sends_since_last_open'] #Not using any aquisition features

campaign_context_cols = ['campaign_category', 'discount', 'promo', 'sale', 'is_one_for_free','free_shipping','is_exclusive','has_urgency']

email_context_cols = ['sl_contains_price','is_discount_mentioned','sent_week','sent_dayofweek','sent_hr']

context_cols = user_context_cols + campaign_context_cols + email_context_cols
print(f"[INFO] The context is {len(context_cols)} cols long...")

#Not sure how to handle train_y: For now, optimal_action classification 
outcomes_cols = ["opened", "unsub", "rev_3dv2"]
reward_cols = ["reward"]
action_cols = ["optimal_action"]

#Process the files in both train & valid
train_file_path = process_input(train_folder, dest_folder, context_cols, action_cols, overwrite_processed)
val_file_path = process_input(valid_folder, dest_folder, context_cols, action_cols, overwrite_processed, ctype="gzip")
test_file_path = process_input(test_folder, dest_folder, context_cols, action_cols, overwrite_processed)

[INFO] The context is 18 cols long...
[ERROR] train.csv.gz currently exists. Pls set overwrite flag to True!
[ERROR] valid.csv.gz currently exists. Pls set overwrite flag to True!
[ERROR] test.csv.gz currently exists. Pls set overwrite flag to True!


# Statistical Analysis Of Outcomes

[DONE] In this section, we have confirmed 3 things:


1.   How many people opened vs did not open
2.   How many people unsubscribed vs did not unsubscribe
3.   Whether optimal action is 1 for open OR purchase and 0 for not open OR unsub



In [11]:
"""
length_l = []
opens_l = []
unsubs_l = []
unique_users_s = set()
campaign_types_s = set()

#Iterate through the weekly data
for file_path in weekly_data_path_l:

  #Print status update
  print(f"[INFO] Working on {file_path.name}")
  
  #Count the number of elements
  df = pd.read_csv(file_path)
  length = len(df)
  length_l.append(length)

  #Count Opens vs Not Opens
  opens = (df["opened"] == 1).astype(int).sum()
  opens_l.append(opens)

  #Count Unsubs vs Not Unsubs
  unsubs = (df["unsub"] == 1).astype(int).sum()
  unsubs_l.append(unsubs)

  #Assert whether optimal action follows the rule
  #Create a Panda Series that follows this rule and assert
  check_series = pd.Series(np.ones_like(df["optimal_action"].values, dtype=int))

  check_series[df["opened"] == 0] = 0
  check_series[df["unsub"] == 1] = 0
  assert (df["optimal_action"] == check_series).all()

  #Update the unique user set with user IDs from the df
  unique_users = list(df["riid"].unique())
  unique_users_s.update(unique_users)

  #Update the unique campaign type set from the df
  campaign_types = list(df['campaign_category'].unique())
  campaign_types_s.update(campaign_types)

total_length = sum(length_l)
print(f"[INFO] Total number of data - {total_length}")

opened = sum(opens_l)/total_length
unsub = sum(unsubs_l)/total_length

print(f"[INFO] % of opened - {opened}")
print(f"[INFO] % of unsubscribed - {unsub}")
print(f"[INFO] # of unique users - {len(unique_users_s)}")
print(f"[INFO] # of unique campaign types - {len(campaign_types_s)}")

"""

'\nlength_l = []\nopens_l = []\nunsubs_l = []\nunique_users_s = set()\ncampaign_types_s = set()\n\n#Iterate through the weekly data\nfor file_path in weekly_data_path_l:\n\n  #Print status update\n  print(f"[INFO] Working on {file_path.name}")\n  \n  #Count the number of elements\n  df = pd.read_csv(file_path)\n  length = len(df)\n  length_l.append(length)\n\n  #Count Opens vs Not Opens\n  opens = (df["opened"] == 1).astype(int).sum()\n  opens_l.append(opens)\n\n  #Count Unsubs vs Not Unsubs\n  unsubs = (df["unsub"] == 1).astype(int).sum()\n  unsubs_l.append(unsubs)\n\n  #Assert whether optimal action follows the rule\n  #Create a Panda Series that follows this rule and assert\n  check_series = pd.Series(np.ones_like(df["optimal_action"].values, dtype=int))\n\n  check_series[df["opened"] == 0] = 0\n  check_series[df["unsub"] == 1] = 0\n  assert (df["optimal_action"] == check_series).all()\n\n  #Update the unique user set with user IDs from the df\n  unique_users = list(df["riid"].uniqu

# Preparing the Dataset for Tensorflow

## Creating the Dataloader

In [12]:
#Given CSV file patterns, convert them to DL
def csvs_to_dataloader(file_pattern, target = "optimal_action", shuffle=True, 
                       batch_size=1024, ctype=None, n_epochs = None):
  
  return tf.data.experimental.make_csv_dataset(file_pattern = file_pattern, 
                                             batch_size = batch_size,
                                             label_name= target,
                                             header=True, 
                                             num_epochs=n_epochs,
                                             shuffle=True,  
                                             shuffle_seed=42,
                                             compression_type=ctype, 
                                             ignore_errors=True)
  

"""
#Given a dataframe in memory
def df_to_dataloader(dataframe, target, shuffle=True, batch_size=32):
  dataframe = dataframe.copy()
  labels = dataframe.pop(target)
  ds = tf.data.Dataset.from_tensor_slices((dict(dataframe), labels))
  if shuffle:
    ds = ds.shuffle(buffer_size=len(dataframe))
  dl = ds.batch(batch_size)
  return dl
"""  

'\n#Given a dataframe in memory\ndef df_to_dataloader(dataframe, target, shuffle=True, batch_size=32):\n  dataframe = dataframe.copy()\n  labels = dataframe.pop(target)\n  ds = tf.data.Dataset.from_tensor_slices((dict(dataframe), labels))\n  if shuffle:\n    ds = ds.shuffle(buffer_size=len(dataframe))\n  dl = ds.batch(batch_size)\n  return dl\n'

In [13]:
"""
def input_fn(df_data, target, num_epochs = 50, shuffle = True, batch_size = 32):
  
  #df_data = pd.read_csv(data_file, header=[0], skiprows=1)
  # remove NaN elements
  #df_data = df_data.dropna(how="any", axis=0)
  df_data = df_data.copy()
  labels = df_data.pop(target)
  return tf.compat.v1.estimator.inputs.pandas_input_fn(
      x=df_data,
      y=labels,
      batch_size=batch_size,
      num_epochs=num_epochs,
      shuffle=shuffle)
"""

'\ndef input_fn(df_data, target, num_epochs = 50, shuffle = True, batch_size = 32):\n  \n  #df_data = pd.read_csv(data_file, header=[0], skiprows=1)\n  # remove NaN elements\n  #df_data = df_data.dropna(how="any", axis=0)\n  df_data = df_data.copy()\n  labels = df_data.pop(target)\n  return tf.compat.v1.estimator.inputs.pandas_input_fn(\n      x=df_data,\n      y=labels,\n      batch_size=batch_size,\n      num_epochs=num_epochs,\n      shuffle=shuffle)\n'

In [14]:
batch_size=1024
n_epochs=25

train_file_pattern = "/content/drive/MyDrive/Bandit_Project/aleksey/processed/train.csv.gz"
train_dl_train = lambda : csvs_to_dataloader(train_file_pattern, n_epochs = n_epochs, ctype="GZIP")
train_dl_fit = csvs_to_dataloader(train_file_pattern, n_epochs = n_epochs, ctype="GZIP")

val_file_pattern = "/content/drive/MyDrive/Bandit_Project/aleksey/processed/valid.csv.gz"
val_dl_train = lambda : csvs_to_dataloader(val_file_pattern, n_epochs = n_epochs, ctype="GZIP")
val_dl_fit = csvs_to_dataloader(val_file_pattern, n_epochs = n_epochs, ctype="GZIP")

test_file_pattern = "/content/drive/MyDrive/Bandit_Project/aleksey/processed/test.csv.gz"
test_dl_train = lambda : csvs_to_dataloader(test_file_pattern, n_epochs = n_epochs, ctype="GZIP")
test_dl_fit = csvs_to_dataloader(test_file_pattern, n_epochs = n_epochs, ctype="GZIP")

pprint(train_dl_fit)
pprint(val_dl_fit)
pprint(test_dl_fit)

#train_dl = input_fn(train, "optimal_action", batch_size=batch_size)
#val_dl = input_fn(val, "optimal_action", shuffle=False, batch_size=batch_size)
#train_dl = lambda : df_to_dataloader(train, "optimal_action", batch_size=batch_size)
#val_dl = lambda : df_to_dataloader(val, "optimal_action", shuffle=False, batch_size=batch_size)

<PrefetchDataset shapes: (OrderedDict([(riid, (None,)), (retention_score, (None,)), (frequency_score, (None,)), (recency_score, (None,)), (sends_since_last_open, (None,)), (campaign_category, (None,)), (discount, (None,)), (promo, (None,)), (sale, (None,)), (is_one_for_free, (None,)), (free_shipping, (None,)), (is_exclusive, (None,)), (has_urgency, (None,)), (sl_contains_price, (None,)), (is_discount_mentioned, (None,)), (sent_week, (None,)), (sent_dayofweek, (None,)), (sent_hr, (None,))]), (None,)), types: (OrderedDict([(riid, tf.int32), (retention_score, tf.float32), (frequency_score, tf.int32), (recency_score, tf.float32), (sends_since_last_open, tf.int32), (campaign_category, tf.string), (discount, tf.int32), (promo, tf.int32), (sale, tf.int32), (is_one_for_free, tf.int32), (free_shipping, tf.int32), (is_exclusive, tf.int32), (has_urgency, tf.int32), (sl_contains_price, tf.int32), (is_discount_mentioned, tf.int32), (sent_week, tf.int32), (sent_dayofweek, tf.int32), (sent_hr, tf.int

## Base Columns

### Numeric Columns

1. 'retention_score'
2. 'frequency_score'
3. 'recency_score'
4. 'sends_since_last_open'
5. 'discount'
6. 'sent_week'
7. 'sent_dayofweek'
8. 'sent_hr'

To bucketize or not?

In [15]:
#Get the global mean & std statistics
rolling_stats_filepath = Path("/content/drive/MyDrive/Bandit_Project/rolling_statistics.pkl")
with rolling_stats_filepath.open(mode='rb') as rolling_stats_file:
  rolling_stats = pkl.load(rolling_stats_file)

#Create a function that standardizes the column to N(0, 1)
def standardize_column(data, mean, std):                       
  data = (tf.cast(data, dtype=tf.float32) - mean)/std
  return tf.reshape(data, [-1, 1])

In [16]:
#Initialize a list to contain the numeric feature columns
numeric_feature_layer = []
numeric_feature_layer_input = {}

#Create 2 dictionaries with key as numeric feature column name
#and val as the value of the numeric feature column name
numeric_feature_col_names = ['retention_score','recency_score','frequency_score',
                             'sent_week','sent_dayofweek','sent_hr','discount',
                             'sends_since_last_open']
MEANS = {feature: rolling_stats[feature]["mean"] for feature in numeric_feature_col_names}
STDS = {feature: rolling_stats[feature]["std"] for feature in numeric_feature_col_names}

#Generate numeric cols
for feature in numeric_feature_col_names:
  numeric_feature_col = tf.feature_column.numeric_column(feature, 
                                                     normalizer_fn=partial(standardize_column, mean=MEANS[feature], std=STDS[feature]))
  numeric_feature_layer.append(numeric_feature_col)
  numeric_feature_layer_input[feature] = tf.keras.Input(shape=(), name=feature, dtype=tf.float32)

#Display the columns
print("[INFO] The numeric feature columns are:")
pprint(numeric_feature_layer)

print("\n[INFO] The inputs to numeric feature columns are:")
pprint(numeric_feature_layer_input)

[INFO] The numeric feature columns are:
[NumericColumn(key='retention_score', shape=(1,), default_value=None, dtype=tf.float32, normalizer_fn=functools.partial(<function standardize_column at 0x7f0f3ad97b00>, mean=11.467980895825553, std=11.35391986430546)),
 NumericColumn(key='recency_score', shape=(1,), default_value=None, dtype=tf.float32, normalizer_fn=functools.partial(<function standardize_column at 0x7f0f3ad97b00>, mean=1.23904221901564, std=2.216794122042123)),
 NumericColumn(key='frequency_score', shape=(1,), default_value=None, dtype=tf.float32, normalizer_fn=functools.partial(<function standardize_column at 0x7f0f3ad97b00>, mean=14.977138288600283, std=20.754428265423773)),
 NumericColumn(key='sent_week', shape=(1,), default_value=None, dtype=tf.float32, normalizer_fn=functools.partial(<function standardize_column at 0x7f0f3ad97b00>, mean=28.594960628048973, std=14.377041994557581)),
 NumericColumn(key='sent_dayofweek', shape=(1,), default_value=None, dtype=tf.float32, norma

### Categorical Columns

1. 'riid'
2. 'campaign_category'
3. 'promo'
4. 'sale'
5. 'is_one_for_free'
6. 'free_shipping'
7. 'is_exclusive'
8. 'has_urgency'
9. 'sl_contains_price'
10.'is_discount_mentioned'

In [17]:
#Initialize a list to contain the categorical feature columns
categorical_feature_layer = []
categorical_feature_layer_input = {}

CATEGORIES = {
    'promo' : [0, 1],
    'sale' : [0, 1],
    'campaign_category': ['Trend', 'NewArrivals', 'Dedicated', 'InnovationSpotlight', 'Core', 'Replen', 'ProductSpotlight', 'Other', 'Brand', 'Tops'],
    'is_one_for_free': [0, 1],
    'free_shipping': [0, 1],
    'is_exclusive': [0, 1],
    'has_urgency': [0, 1],
    'sl_contains_price': [0, 1],
    'is_discount_mentioned': [0, 1],
}

#Generate categorical cols
for (feature, vocab) in CATEGORIES.items():
  categorical_feature_col = tf.feature_column.indicator_column(tf.feature_column.categorical_column_with_vocabulary_list(feature, vocab))
  #categorical_feature_col = tf.feature_column.categorical_column_with_vocabulary_list(feature, vocab)
  categorical_feature_layer.append(categorical_feature_col)
  if feature == 'campaign_category':
    categorical_feature_layer_input[feature] = tf.keras.Input(shape=(), name=feature, dtype=tf.string)
  else:
    categorical_feature_layer_input[feature] = tf.keras.Input(shape=(), name=feature, dtype=tf.int64)

#Display the columns
print("[INFO] The categorical feature columns are:")
pprint(categorical_feature_layer)

print("\n[INFO] The inputs to categorical columns are:")
pprint(categorical_feature_layer_input)

[INFO] The categorical feature columns are:
[IndicatorColumn(categorical_column=VocabularyListCategoricalColumn(key='promo', vocabulary_list=(0, 1), dtype=tf.int64, default_value=-1, num_oov_buckets=0)),
 IndicatorColumn(categorical_column=VocabularyListCategoricalColumn(key='sale', vocabulary_list=(0, 1), dtype=tf.int64, default_value=-1, num_oov_buckets=0)),
 IndicatorColumn(categorical_column=VocabularyListCategoricalColumn(key='campaign_category', vocabulary_list=('Trend', 'NewArrivals', 'Dedicated', 'InnovationSpotlight', 'Core', 'Replen', 'ProductSpotlight', 'Other', 'Brand', 'Tops'), dtype=tf.string, default_value=-1, num_oov_buckets=0)),
 IndicatorColumn(categorical_column=VocabularyListCategoricalColumn(key='is_one_for_free', vocabulary_list=(0, 1), dtype=tf.int64, default_value=-1, num_oov_buckets=0)),
 IndicatorColumn(categorical_column=VocabularyListCategoricalColumn(key='free_shipping', vocabulary_list=(0, 1), dtype=tf.int64, default_value=-1, num_oov_buckets=0)),
 Indicat

In [18]:
#Create the riid embedding col
embedding_feature_layer = []
embedding_feature_layer_input = {}

riid = tf.feature_column.categorical_column_with_hash_bucket("riid", hash_bucket_size=2000000, dtype=tf.int64)
riid_embedding = tf.feature_column.embedding_column(riid, dimension=32)
embedding_feature_layer.append(riid_embedding)
embedding_feature_layer_input["riid"] = tf.keras.Input(shape=(), name="riid", dtype=tf.int64)

#Display the columns
print("[INFO] The numeric embedding columns are:")
pprint(embedding_feature_layer)

print("\n[INFO] The inputs to embedding feature columns are:")
pprint(embedding_feature_layer_input)

[INFO] The numeric embedding columns are:
[EmbeddingColumn(categorical_column=HashedCategoricalColumn(key='riid', hash_bucket_size=2000000, dtype=tf.int64), dimension=32, combiner='mean', initializer=<tensorflow.python.ops.init_ops.TruncatedNormal object at 0x7f0f3ad2fdd0>, ckpt_to_load_from=None, tensor_name_in_ckpt=None, max_norm=None, trainable=True, use_safe_embedding_lookup=True)]

[INFO] The inputs to embedding feature columns are:
{'riid': <KerasTensor: shape=(None,) dtype=int64 (created by layer 'riid')>}


## For Wide Model

### Crossed Columns

Crossing the following feature-combinations:

1.   'riid' vs. 'campaign_category'
2.   'riid' vs. 'discount'
3.   'riid' vs. 'is_one_for_free'
4.   'riid' vs. 'free_shipping'
5.   'riid' vs. 'is_exclusive'
6.   'riid' vs. 'has_urgency',
7.   'riid' vs. 'sl_contains_price',
8. 'riid' vs. 'is_discount_mentioned',
9. 'riid' vs. 'sent_week',
10. 'riid' vs. 'sent_dayofweek',
11. 'riid' vs. 'sent_hr'



In [19]:
crossed_columns = [
  tf.feature_column.crossed_column(["riid", 'campaign_category'], hash_bucket_size=20000000),
  tf.feature_column.crossed_column(["riid", 'discount'], hash_bucket_size=10000000),
  tf.feature_column.crossed_column(["riid", 'is_one_for_free'], hash_bucket_size=4000000),
  tf.feature_column.crossed_column(["riid", 'free_shipping'], hash_bucket_size=4000000),
  tf.feature_column.crossed_column(["riid", 'is_exclusive'], hash_bucket_size=4000000),
  tf.feature_column.crossed_column(["riid", 'has_urgency'], hash_bucket_size=4000000),
  tf.feature_column.crossed_column(["riid", 'sl_contains_price'], hash_bucket_size=4000000),
  tf.feature_column.crossed_column(["riid", 'is_discount_mentioned'], hash_bucket_size=4000000),
  tf.feature_column.crossed_column(["riid", 'sent_week'], hash_bucket_size=10000000),
  tf.feature_column.crossed_column(["riid", 'sent_dayofweek'], hash_bucket_size=60000000),
  tf.feature_column.crossed_column(["riid", 'sent_hr'], hash_bucket_size=50000000),
]
crossed_columns_names = ["riid_X_campaign_category",
                         "riid_X_discount",
                         "riid_X_is_one_for_free",
                         "riid_X_free_shipping",
                         "riid_X_is_exclusive",
                         "riid_X_has_urgency",
                         "riid_X_sl_contains_price",
                         "riid_X_is_discount_mentioned",
                         "riid_X_sent_week",
                         "riid_X_sent_dayofweek",
                         "riid_X_sent_hr"]
crossed_columns_input = {colname: tf.keras.Input(shape=(), name=colname, dtype=tf.int64)
                                    for colname in crossed_columns_names}
#Display the columns
print("[INFO] The crossed feature columns are:")
pprint(crossed_columns)

print("\n[INFO] The inputs to crossed feature columns are:")
pprint(crossed_columns_input)

[INFO] The crossed feature columns are:
[CrossedColumn(keys=('riid', 'campaign_category'), hash_bucket_size=20000000, hash_key=None),
 CrossedColumn(keys=('riid', 'discount'), hash_bucket_size=10000000, hash_key=None),
 CrossedColumn(keys=('riid', 'is_one_for_free'), hash_bucket_size=4000000, hash_key=None),
 CrossedColumn(keys=('riid', 'free_shipping'), hash_bucket_size=4000000, hash_key=None),
 CrossedColumn(keys=('riid', 'is_exclusive'), hash_bucket_size=4000000, hash_key=None),
 CrossedColumn(keys=('riid', 'has_urgency'), hash_bucket_size=4000000, hash_key=None),
 CrossedColumn(keys=('riid', 'sl_contains_price'), hash_bucket_size=4000000, hash_key=None),
 CrossedColumn(keys=('riid', 'is_discount_mentioned'), hash_bucket_size=4000000, hash_key=None),
 CrossedColumn(keys=('riid', 'sent_week'), hash_bucket_size=10000000, hash_key=None),
 CrossedColumn(keys=('riid', 'sent_dayofweek'), hash_bucket_size=60000000, hash_key=None),
 CrossedColumn(keys=('riid', 'sent_hr'), hash_bucket_size=5

In [20]:
wide_columns = numeric_feature_layer + crossed_columns
wide_columns_input = {**numeric_feature_layer_input, **crossed_columns_input}

#Display the columns
print("[INFO] The wide columns are:")
pprint(wide_columns)

print("\n[INFO] The inputs to wide columns are:")
pprint(wide_columns_input)

[INFO] The wide columns are:
[NumericColumn(key='retention_score', shape=(1,), default_value=None, dtype=tf.float32, normalizer_fn=functools.partial(<function standardize_column at 0x7f0f3ad97b00>, mean=11.467980895825553, std=11.35391986430546)),
 NumericColumn(key='recency_score', shape=(1,), default_value=None, dtype=tf.float32, normalizer_fn=functools.partial(<function standardize_column at 0x7f0f3ad97b00>, mean=1.23904221901564, std=2.216794122042123)),
 NumericColumn(key='frequency_score', shape=(1,), default_value=None, dtype=tf.float32, normalizer_fn=functools.partial(<function standardize_column at 0x7f0f3ad97b00>, mean=14.977138288600283, std=20.754428265423773)),
 NumericColumn(key='sent_week', shape=(1,), default_value=None, dtype=tf.float32, normalizer_fn=functools.partial(<function standardize_column at 0x7f0f3ad97b00>, mean=28.594960628048973, std=14.377041994557581)),
 NumericColumn(key='sent_dayofweek', shape=(1,), default_value=None, dtype=tf.float32, normalizer_fn=fu

## For Deep Model

In [21]:
deep_columns = numeric_feature_layer + categorical_feature_layer + embedding_feature_layer
deep_columns_input = {**numeric_feature_layer_input, **categorical_feature_layer_input, **embedding_feature_layer_input}

#Display the columns
print("[INFO] The deep feature columns are:")
pprint(deep_columns)

print("\n[INFO] The inputs to deep feature columns are:")
pprint(deep_columns_input)

[INFO] The deep feature columns are:
[NumericColumn(key='retention_score', shape=(1,), default_value=None, dtype=tf.float32, normalizer_fn=functools.partial(<function standardize_column at 0x7f0f3ad97b00>, mean=11.467980895825553, std=11.35391986430546)),
 NumericColumn(key='recency_score', shape=(1,), default_value=None, dtype=tf.float32, normalizer_fn=functools.partial(<function standardize_column at 0x7f0f3ad97b00>, mean=1.23904221901564, std=2.216794122042123)),
 NumericColumn(key='frequency_score', shape=(1,), default_value=None, dtype=tf.float32, normalizer_fn=functools.partial(<function standardize_column at 0x7f0f3ad97b00>, mean=14.977138288600283, std=20.754428265423773)),
 NumericColumn(key='sent_week', shape=(1,), default_value=None, dtype=tf.float32, normalizer_fn=functools.partial(<function standardize_column at 0x7f0f3ad97b00>, mean=28.594960628048973, std=14.377041994557581)),
 NumericColumn(key='sent_dayofweek', shape=(1,), default_value=None, dtype=tf.float32, normaliz

# Model Comparison

In [22]:
models_dir = Path("/content/drive/MyDrive/Bandit_Project/models")
wmodel = (models_dir/"Wide").mkdir(exist_ok=True)
dmodel = (models_dir/"Deep").mkdir(exist_ok=True)
wdmodel = (models_dir/"W&D").mkdir(exist_ok=True)
bwdmodel = (models_dir/"Bayesian W&D").mkdir(exist_ok=True)

#Hyperparameters
lr = 1e-3
n_steps = 1500000

## Simple Wide Model

In [62]:
wm = tf.estimator.LinearClassifier(
    model_dir=wmodel, 
    feature_columns=wide_columns,
    n_classes=2,
    optimizer=tf.keras.optimizers.Adam(learning_rate = lr))

INFO:tensorflow:Using default config.
INFO:tensorflow:Using config: {'_model_dir': '/tmp/tmpct35zelt', '_tf_random_seed': None, '_save_summary_steps': 100, '_save_checkpoints_steps': None, '_save_checkpoints_secs': 600, '_session_config': allow_soft_placement: true
graph_options {
  rewrite_options {
    meta_optimizer_iterations: ONE
  }
}
, '_keep_checkpoint_max': 5, '_keep_checkpoint_every_n_hours': 10000, '_log_step_count_steps': 100, '_train_distribute': None, '_device_fn': None, '_protocol': None, '_eval_distribute': None, '_experimental_distribute': None, '_experimental_max_worker_delay_secs': None, '_session_creation_timeout_secs': 7200, '_checkpoint_save_graph_def': True, '_service': None, '_cluster_spec': ClusterSpec({}), '_task_type': 'worker', '_task_id': 0, '_global_id_in_cluster': 0, '_master': '', '_evaluation_master': '', '_is_chief': True, '_num_ps_replicas': 0, '_num_worker_replicas': 1}


In [63]:
wm.train(train_dl_train, steps=n_steps)
wm_results = wm.evaluate(test_dl_train, steps=None)
for key in sorted(wm_results):
  print("%s: %s" % (key, wm_results[key]))

INFO:tensorflow:Calling model_fn.




INFO:tensorflow:Done calling model_fn.
INFO:tensorflow:Create CheckpointSaverHook.
INFO:tensorflow:Graph was finalized.
INFO:tensorflow:Running local_init_op.
INFO:tensorflow:Done running local_init_op.
INFO:tensorflow:Calling checkpoint listeners before saving checkpoint 0...
INFO:tensorflow:Saving checkpoints for 0 into /tmp/tmpct35zelt/model.ckpt.
INFO:tensorflow:Calling checkpoint listeners after saving checkpoint 0...
INFO:tensorflow:loss = 0.6931472, step = 0
INFO:tensorflow:global_step/sec: 9.17988
INFO:tensorflow:loss = 0.56775856, step = 100 (10.896 sec)
INFO:tensorflow:global_step/sec: 9.52557
INFO:tensorflow:loss = 0.4798948, step = 200 (10.500 sec)
INFO:tensorflow:global_step/sec: 9.41919
INFO:tensorflow:loss = 0.43979174, step = 300 (10.617 sec)
INFO:tensorflow:global_step/sec: 9.41907
INFO:tensorflow:loss = 0.47828147, step = 400 (10.615 sec)
INFO:tensorflow:global_step/sec: 9.48341
INFO:tensorflow:loss = 0.4324661, step = 500 (10.547 sec)
INFO:tensorflow:global_step/sec:

<tensorflow_estimator.python.estimator.canned.linear.LinearClassifierV2 at 0x7f1408162410>

INFO:tensorflow:Calling model_fn.
INFO:tensorflow:Done calling model_fn.
INFO:tensorflow:Starting evaluation at 2021-03-02T21:06:56Z
INFO:tensorflow:Graph was finalized.
INFO:tensorflow:Restoring parameters from /tmp/tmpct35zelt/model.ckpt-25972
INFO:tensorflow:Running local_init_op.
INFO:tensorflow:Done running local_init_op.
INFO:tensorflow:Inference Time : 1413.60249s
INFO:tensorflow:Finished evaluation at 2021-03-02-21:30:30
INFO:tensorflow:Saving dict for global step 25972: accuracy = 0.7411525, accuracy_baseline = 0.84903985, auc = 0.61487967, auc_precision_recall = 0.34246373, average_loss = 0.75202763, global_step = 25972, label/mean = 0.15096018, loss = 0.7520281, precision = 0.28304592, prediction/mean = 0.34225115, recall = 0.4662135
INFO:tensorflow:Saving 'checkpoint_path' summary for global step 25972: /tmp/tmpct35zelt/model.ckpt-25972
accuracy: 0.7411525
accuracy_baseline: 0.84903985
auc: 0.61487967
auc_precision_recall: 0.34246373
average_loss: 0.75202763
global_step: 25

## Simple Deep Model

In [64]:
dm = tf.estimator.DNNClassifier(
    hidden_units=[512, 256, 128], 
    feature_columns=deep_columns, 
    model_dir=dmodel, 
    n_classes=2,
    optimizer=tf.keras.optimizers.Adam(learning_rate = lr)
)

INFO:tensorflow:Using default config.
INFO:tensorflow:Using config: {'_model_dir': '/tmp/tmpkbmplq0c', '_tf_random_seed': None, '_save_summary_steps': 100, '_save_checkpoints_steps': None, '_save_checkpoints_secs': 600, '_session_config': allow_soft_placement: true
graph_options {
  rewrite_options {
    meta_optimizer_iterations: ONE
  }
}
, '_keep_checkpoint_max': 5, '_keep_checkpoint_every_n_hours': 10000, '_log_step_count_steps': 100, '_train_distribute': None, '_device_fn': None, '_protocol': None, '_eval_distribute': None, '_experimental_distribute': None, '_experimental_max_worker_delay_secs': None, '_session_creation_timeout_secs': 7200, '_checkpoint_save_graph_def': True, '_service': None, '_cluster_spec': ClusterSpec({}), '_task_type': 'worker', '_task_id': 0, '_global_id_in_cluster': 0, '_master': '', '_evaluation_master': '', '_is_chief': True, '_num_ps_replicas': 0, '_num_worker_replicas': 1}


In [65]:
dm.train(train_dl_train, steps=n_steps)
dm_results = dm.evaluate(test_dl_train, steps=None)
for key in sorted(dm_results):
  print("%s: %s" % (key, dm_results[key]))

INFO:tensorflow:Calling model_fn.
INFO:tensorflow:Done calling model_fn.
INFO:tensorflow:Create CheckpointSaverHook.
INFO:tensorflow:Graph was finalized.
INFO:tensorflow:Running local_init_op.
INFO:tensorflow:Done running local_init_op.
INFO:tensorflow:Calling checkpoint listeners before saving checkpoint 0...
INFO:tensorflow:Saving checkpoints for 0 into /tmp/tmpkbmplq0c/model.ckpt.
INFO:tensorflow:Calling checkpoint listeners after saving checkpoint 0...
INFO:tensorflow:loss = 0.7091495, step = 0
INFO:tensorflow:global_step/sec: 24.3188
INFO:tensorflow:loss = 0.37694913, step = 100 (4.116 sec)
INFO:tensorflow:global_step/sec: 25.0725
INFO:tensorflow:loss = 0.39945105, step = 200 (3.986 sec)
INFO:tensorflow:global_step/sec: 25.0234
INFO:tensorflow:loss = 0.38545108, step = 300 (3.996 sec)
INFO:tensorflow:global_step/sec: 25.1424
INFO:tensorflow:loss = 0.4140769, step = 400 (3.979 sec)
INFO:tensorflow:global_step/sec: 25.0113
INFO:tensorflow:loss = 0.3987895, step = 500 (4.003 sec)
INF

<tensorflow_estimator.python.estimator.canned.dnn.DNNClassifierV2 at 0x7f12f4f4b5d0>

INFO:tensorflow:Calling model_fn.
INFO:tensorflow:Done calling model_fn.
INFO:tensorflow:Starting evaluation at 2021-03-02T21:47:53Z
INFO:tensorflow:Graph was finalized.
INFO:tensorflow:Restoring parameters from /tmp/tmpkbmplq0c/model.ckpt-25972
INFO:tensorflow:Running local_init_op.
INFO:tensorflow:Done running local_init_op.
INFO:tensorflow:Inference Time : 1078.22848s
INFO:tensorflow:Finished evaluation at 2021-03-02-22:05:51
INFO:tensorflow:Saving dict for global step 25972: accuracy = 0.81409216, accuracy_baseline = 0.84903985, auc = 0.69695014, auc_precision_recall = 0.35091102, average_loss = 1.634197, global_step = 25972, label/mean = 0.15096018, loss = 1.6342001, precision = 0.39633992, prediction/mean = 0.1735402, recall = 0.4425631
INFO:tensorflow:Saving 'checkpoint_path' summary for global step 25972: /tmp/tmpkbmplq0c/model.ckpt-25972
accuracy: 0.81409216
accuracy_baseline: 0.84903985
auc: 0.69695014
auc_precision_recall: 0.35091102
average_loss: 1.634197
global_step: 25972

## Simple Wide & Deep Model

In [23]:
wdm = tf.estimator.DNNLinearCombinedClassifier(
    model_dir=wdmodel, 
    linear_feature_columns=crossed_columns,
    dnn_feature_columns=deep_columns,
    dnn_hidden_units=[512, 256, 128],
    n_classes=2)

INFO:tensorflow:Using default config.
INFO:tensorflow:Using config: {'_model_dir': '/tmp/tmpoi4fj9ew', '_tf_random_seed': None, '_save_summary_steps': 100, '_save_checkpoints_steps': None, '_save_checkpoints_secs': 600, '_session_config': allow_soft_placement: true
graph_options {
  rewrite_options {
    meta_optimizer_iterations: ONE
  }
}
, '_keep_checkpoint_max': 5, '_keep_checkpoint_every_n_hours': 10000, '_log_step_count_steps': 100, '_train_distribute': None, '_device_fn': None, '_protocol': None, '_eval_distribute': None, '_experimental_distribute': None, '_experimental_max_worker_delay_secs': None, '_session_creation_timeout_secs': 7200, '_checkpoint_save_graph_def': True, '_service': None, '_cluster_spec': ClusterSpec({}), '_task_type': 'worker', '_task_id': 0, '_global_id_in_cluster': 0, '_master': '', '_evaluation_master': '', '_is_chief': True, '_num_ps_replicas': 0, '_num_worker_replicas': 1}


In [24]:
wdm.train(input_fn=train_dl_train, steps=n_steps)
wdm_results = wdm.evaluate(input_fn=test_dl_train, steps=None)
for key in sorted(wdm_results):
  print("%s: %s" % (key, wdm_results[key]))

Instructions for updating:
Use Variable.read_value. Variables in 2.X are initialized automatically both in eager and graph (inside tf.defun) contexts.
INFO:tensorflow:Calling model_fn.




Instructions for updating:
Call initializer instance with the dtype argument instead of passing it to the constructor
INFO:tensorflow:Done calling model_fn.
INFO:tensorflow:Create CheckpointSaverHook.
INFO:tensorflow:Graph was finalized.
INFO:tensorflow:Running local_init_op.
INFO:tensorflow:Done running local_init_op.
INFO:tensorflow:Calling checkpoint listeners before saving checkpoint 0...
INFO:tensorflow:Saving checkpoints for 0 into /tmp/tmpoi4fj9ew/model.ckpt.
INFO:tensorflow:Calling checkpoint listeners after saving checkpoint 0...
INFO:tensorflow:loss = 0.6777444, step = 0
INFO:tensorflow:global_step/sec: 20.7516
INFO:tensorflow:loss = 0.52892697, step = 100 (4.824 sec)
INFO:tensorflow:global_step/sec: 23.9531
INFO:tensorflow:loss = 0.46981627, step = 200 (4.174 sec)
INFO:tensorflow:global_step/sec: 22.7475
INFO:tensorflow:loss = 0.42621803, step = 300 (4.393 sec)
INFO:tensorflow:global_step/sec: 23.7146
INFO:tensorflow:loss = 0.45286438, step = 400 (4.218 sec)
INFO:tensorflow:

<tensorflow_estimator.python.estimator.canned.dnn_linear_combined.DNNLinearCombinedClassifierV2 at 0x7f0f3c77a750>

INFO:tensorflow:Calling model_fn.
INFO:tensorflow:Done calling model_fn.
INFO:tensorflow:Starting evaluation at 2021-03-03T06:03:32Z
INFO:tensorflow:Graph was finalized.
INFO:tensorflow:Restoring parameters from /tmp/tmpoi4fj9ew/model.ckpt-25972
INFO:tensorflow:Running local_init_op.
INFO:tensorflow:Done running local_init_op.
INFO:tensorflow:Inference Time : 1597.71992s
INFO:tensorflow:Finished evaluation at 2021-03-03-06:30:10
INFO:tensorflow:Saving dict for global step 25972: accuracy = 0.8863095, accuracy_baseline = 0.84903985, auc = 0.8525112, auc_precision_recall = 0.59181666, average_loss = 0.31785402, global_step = 25972, label/mean = 0.15096018, loss = 0.31785432, precision = 0.67167586, prediction/mean = 0.21185075, recall = 0.4829211
INFO:tensorflow:Saving 'checkpoint_path' summary for global step 25972: /tmp/tmpoi4fj9ew/model.ckpt-25972
accuracy: 0.8863095
accuracy_baseline: 0.84903985
auc: 0.8525112
auc_precision_recall: 0.59181666
average_loss: 0.31785402
global_step: 259

## Bayesian Wide & Deep Model

1.   Wide & Deep Models are not connected to common output nodes @ the end.
2.   They are first connected to a common dropout layer which can be turned on during inference time as well.
3.   The dropout layer is then connected to output nodes.



In [59]:
def wide_and_deep_model(wide_inputs, wide_feature_columns, 
                        deep_inputs, dnn_feature_columns, dnn_hidden_units, 
                        multihead_count = 64, p_value=0.5):

    #Build the Deep Network
    #deep = tf.keras.layers.DenseFeatures(dnn_feature_columns, name='deep_inputs')(deep_inputs)
    deep_input_layer = tf.keras.layers.DenseFeatures(dnn_feature_columns, name='deep_inputs')
    deep = deep_input_layer(deep_inputs)

    for layerno, numnodes in enumerate(dnn_feature_columns):
        deep = tf.keras.layers.Dense(numnodes, activation='relu', name='dnn_{}'.format(layerno+1))(deep)        
    
    #Build the Wide Network
    wide_input_layer = tf.keras.layers.DenseFeatures(wide_feature_columns, name='wide_inputs')
    wide = wide_input_layer(wide_inputs)

    #Concatenate the Wide & Deep
    both = tf.keras.layers.concatenate([deep, wide], name='both')

    #Create the multi-head layer
    multihead_pre_dropout = tf.keras.layers.dropout(p_value)(both, training=True)
    multihead = tf.keras.layers.Dense(multihead_count, activation='relu', name='multihead')(multihead_pre_dropout)
    multihead_dropout = tf.keras.layers.dropout(p_value)(multihead, training=True)

    #Create the output layer
    output = tf.keras.layers.Dense(2, activation='softmax', name='optimal_action')(multihead_dropout)
    model = tf.keras.Model(inputs, output)
    model.compile(optimizer='adam',
                  loss='categorical_crossentropy',
                  metrics=['accuracy'])
    return model

bwdmodel = wide_and_deep_model(wide_inputs = crossed_columns_input, wide_feature_columns = crossed_columns, 
                               deep_inputs = deep_columns_input, dnn_feature_columns = deep_columns, 
                               dnn_hidden_units = [512, 256, 128], multihead_count = 64, p_value=0.5)
tf.keras.utils.plot_model(bwdmodel, '/content/drive/MyDrive/Bandit_Project/models/bayesian_w&d.png', show_shapes=False, rankdir='LR')

ValueError: ignored

In [None]:
pprint(deep_columns)

In [25]:
tf.keras.layers.DenseFeatures([deep_columns[10]])(train_dl_fit.take(1)).numpy()

ValueError: ignored