<a href="https://colab.research.google.com/github/fellowship/deep-and-wide-bandit/blob/dev/TensorFlow_W_D_W%26D.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Required Installs

# Import necessary packages

In [1]:
import tensorflow as tf
tf.test.gpu_device_name()

'/device:GPU:0'

In [2]:
#Mount Google Drive
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [3]:
from pathlib import Path

import pandas as pd
import numpy as np
from zipfile import ZipFile
import re
import json
import pickle as pkl
import re

%matplotlib inline
import matplotlib.pyplot as plt

from functools import partial
import random
import seaborn as sns
import warnings
warnings.filterwarnings("ignore", category=DeprecationWarning)
from pprint import pprint
from sklearn.metrics import confusion_matrix
from IPython.core.interactiveshell import InteractiveShell  
InteractiveShell.ast_node_interactivity = "all"

#Makes panda and numpy easier to read
pd.set_option('display.float_format', lambda x: '%.3f' % x)
np.set_printoptions(precision=3, suppress=True)

#import torch
#from torch.utils.data import Dataset

# Get the data ready


We extract the weekly dataset CSVs & shortlisted train+valid index CSVs s.t. we have 1 train and 1 valid index CSV per weekly CSV.



In [4]:
overwrite_zip_extract = False

#Build the list of train_valid items
train_valid_l = [Path("/content/drive/MyDrive/Bandit_Project/BanditsData/Train_Valid_Jul_Dec_2019.zip"),
                 Path("/content/drive/MyDrive/Bandit_Project/BanditsData/Train_Valid_Jan_Jun_2020.zip")]

#Extract items to a folder in your GDrive
folder = Path("/content/drive/MyDrive/Bandit_Project/aleksey")

#Create a train & valid folder
train_folder = (folder/'train')
train_folder.mkdir(exist_ok=True)

valid_folder = (folder/'valid')
valid_folder.mkdir(exist_ok=True)

#Iterate through the list
if not(train_folder.exists() or valid_folder.exists()) or overwrite_zip_extract:
  
  for path in train_valid_l:    
      with ZipFile(path, 'r') as zip_obj:
        zip_obj.extractall(folder)

#TBD: We need to implement code that determines action when overwrite is set to True

## Subsetting Train & Valid

In [5]:
#Function to export data corresponding to chosen indices to train/valid folder
def export_data_subset(data_l, idx_l, save_folder, overwrite_flag = False):

  cnt = 0

  #If the folder is not empty and overwrite_flag is set to True, delete all files in the folder
  if overwrite_flag:
    
    print(f"\n[INFO] Deleting files in {save_folder.name} directory...")
    for file_path in save_folder.iterdir():
      
      print(f"[INFO] Deleting {file_path.name}")
      file_path.unlink()
  
  #Iterate over the (weekly data path, training set indices path) zipped object
  for (data_path, idx_path) in zip(data_l, idx_l):

    if (cnt + 1) % 5 == 0:
      print(f"[INFO] Building {save_folder.name} file from {data_path.name}")
    
    #Use pandas to read weekly data + corresponding index CSV files
    data = pd.read_csv(data_path)
    idx = pd.read_csv(idx_path, header=None, squeeze=True).tolist()

    #Subset the data and save it to the appropriate csv file
    data_subset = data.iloc[idx, :]

    #Save the data subset
    data_subset.to_csv(save_folder/(data_path.name), index=False, compression="gzip", header=True)

    #Increment Counter
    cnt += 1

In [6]:
def process_input(source, dest, x_cols, y_col, overwrite=True):
  
  filename = source.name + ".csv.gz"
  dest_path = dest/filename
  
  #Check whether files exist in destination + should not overwrite - If they do, print an error message  
  if dest_path.exists():
    if not(overwrite):
      print(f"[ERROR] {dest_path.name} currently exists. Pls set overwrite flag to True!")
      return dest_path
    else:
      #Delete current files in the dest folder
      for file_path in dest.iterdir():
        file_path.unlink()

  cnt = 0

  #Iterate over each file in source
  for file_path in source.iterdir():

    #Print update
    print(f"[INFO] Currently working on {source.name}: {file_path.name}")
    
    #Read in the data
    data = pd.read_csv(file_path, compression="gzip", header=[0])

    #Shortlist columns to get the overall CSV
    cols = x_cols + y_col
    subset = data[cols]

    """
    #Process numeric columns
    for col in x_cols:

      if col in stats.keys():

        mean = stats[col]["mean"]
        std = stats[col]["std"]
        subset.loc[:, col] = (subset.loc[:, col] - mean)/std
    """        

    #Check whether first CSV file —> include header, otherwise ignore
    header_flag = True if not(cnt) else False

    #Save to dest    
    subset.to_csv(dest_path, mode='a', compression="gzip", header=header_flag, index=False)

    #Increment counter
    cnt += 1
  
  #Return the path to the processed input file
  return dest_path

In [7]:
#Setting overwrite flag
overwrite_dset = False

#We need to connect a file like "sends_2019_wk26.csv" WITH "selected_rows_sends_2019_wk26.csv"
weekly_data_path_l = sorted([i for i in folder.iterdir() if re.search("/sends", str(i), re.I)], key=lambda x: str(x))
train_indices_path_l = sorted([i for i in folder.iterdir() if re.search("/selected_rows_sends", str(i), re.I)], key=lambda x: str(x))
valid_indices_path_l = sorted([i for i in folder.iterdir() if re.search("/selected_rows_valid", str(i), re.I)], key=lambda x: str(x))

print(f"[INFO] Displaying indices to build training data for {str(weekly_data_path_l[0])}: {train_indices_path_l[0]}")
print(f"[INFO] Displaying indices to build validation data for {str(weekly_data_path_l[0])}: {valid_indices_path_l[0]}")

#Check if training folder is empty
if not(list(train_folder.iterdir())) or overwrite_dset:

  #Run the export function
  export_data_subset(data_l = weekly_data_path_l, idx_l = train_indices_path_l, 
                     save_folder = train_folder, overwrite_flag = False)

else:
  print("\n[INFO] Training Data Already Created...")

#Execute only if valid folder is empty
if not(list(valid_folder.iterdir())) or overwrite_dset:

  #Run the export function
  export_data_subset(data_l = weekly_data_path_l, idx_l = valid_indices_path_l, 
                     save_folder = valid_folder, overwrite_flag = overwrite_dset)

else:
  print("[INFO] Validation Data Already Created...\n")

train_list = sorted([file_path for file_path in train_folder.iterdir()], key = lambda x: str(x))
valid_list = sorted([file_path for file_path in valid_folder.iterdir()], key = lambda x: str(x))

print("\n[INFO] Displaying the first 5 elements of train_list:")
pprint(train_list[:5])
print("\n[INFO] Displaying the first 5 elements of train_list:")
pprint(valid_list[:5])

[INFO] Displaying indices to build training data for /content/drive/MyDrive/Bandit_Project/aleksey/sends_2019_wk26.csv: /content/drive/MyDrive/Bandit_Project/aleksey/selected_rows_sends_2019_wk26.csv
[INFO] Displaying indices to build validation data for /content/drive/MyDrive/Bandit_Project/aleksey/sends_2019_wk26.csv: /content/drive/MyDrive/Bandit_Project/aleksey/selected_rows_valid_sends_2019_wk26.csv

[INFO] Training Data Already Created...
[INFO] Validation Data Already Created...


[INFO] Displaying the first 5 elements of train_list:
[PosixPath('/content/drive/MyDrive/Bandit_Project/aleksey/train/sends_2019_wk26.csv'),
 PosixPath('/content/drive/MyDrive/Bandit_Project/aleksey/train/sends_2019_wk27.csv'),
 PosixPath('/content/drive/MyDrive/Bandit_Project/aleksey/train/sends_2019_wk28.csv'),
 PosixPath('/content/drive/MyDrive/Bandit_Project/aleksey/train/sends_2019_wk29.csv'),
 PosixPath('/content/drive/MyDrive/Bandit_Project/aleksey/train/sends_2019_wk30.csv')]

[INFO] Displaying

In [8]:
#Display head of first element of both training & validation subset
train_sample = pd.read_csv(train_list[0], compression="gzip", header=[0])
print("[INFO] Sample Training Data")
train_sample.head()
valid_sample = pd.read_csv(valid_list[0], compression="gzip", header=[0])
print("[INFO] Sample Validation Data")
valid_sample.head()

#Get the column names of the data
data_col_names = train_sample.columns.tolist()
print("[INFO] The full list of column names include:")
pprint(data_col_names)

[INFO] Sample Training Data


Unnamed: 0,riid,retention_score,frequency_score,recency_score,sends_since_last_open,days_subscr,aq_year,aq_week,aq_dayofweek,aq_period,campaign_id,campaign_category,campaign_Brand,campaign_Core,campaign_Dedicated,campaign_InnovationSpotlight,campaign_NewArrivals,campaign_ProductSpotlight,campaign_Replen,campaign_Tops,campaign_Trend,campaign_Other,discount,promo,sale,is_one_for_free,free_shipping,is_exclusive,has_urgency,sl_contains_price,is_discount_mentioned,message_size,sent_week,sent_dayofweek,sent_hr,opened,unsub,rev_3dv2,reward,optimal_action
0,193066422,28.0,74,9.947,1,1413,2015,32,1,Non-Holiday,59090182,Tops,0,0,0,0,0,0,0,1,0,0,40,1,0,0,0,0,0,1,0,158124,26,0,17,1,0,0.0,-2,1
1,261648242,1.077,14,0.636,26,173,2019,1,2,Non-Holiday,59090182,Tops,0,0,0,0,0,0,0,1,0,0,40,1,0,0,0,0,0,1,0,158358,26,0,17,0,0,0.0,-27,0
2,236488582,4.667,2,0.351,6,487,2018,8,3,Non-Holiday,59090182,Tops,0,0,0,0,0,0,0,1,0,0,40,1,0,0,0,0,0,1,0,157864,26,0,17,0,0,0.0,-7,0
3,2589382,1.867,4,0.255,15,3172,2010,42,6,Non-Holiday,59090182,Tops,0,0,0,0,0,0,0,1,0,0,40,1,0,0,0,0,0,1,0,154873,26,0,17,0,0,0.0,-16,0
4,238665702,28.0,29,7.41,0,468,2018,11,1,Non-Holiday,59090182,Tops,0,0,0,0,0,0,0,1,0,0,40,1,0,0,0,0,0,1,0,156448,26,0,17,1,0,0.0,-1,1


[INFO] Sample Validation Data


Unnamed: 0,riid,retention_score,frequency_score,recency_score,sends_since_last_open,days_subscr,aq_year,aq_week,aq_dayofweek,aq_period,campaign_id,campaign_category,campaign_Brand,campaign_Core,campaign_Dedicated,campaign_InnovationSpotlight,campaign_NewArrivals,campaign_ProductSpotlight,campaign_Replen,campaign_Tops,campaign_Trend,campaign_Other,discount,promo,sale,is_one_for_free,free_shipping,is_exclusive,has_urgency,sl_contains_price,is_discount_mentioned,message_size,sent_week,sent_dayofweek,sent_hr,opened,unsub,rev_3dv2,reward,optimal_action
0,184980282,28.0,36,4.705,0,1474,2015,24,3,Non-Holiday,59090182,Tops,0,0,0,0,0,0,0,1,0,0,40,1,0,0,0,0,0,1,0,155587,26,0,17,0,0,0.0,-1,0
1,156790362,14.0,5,2.982,2,1474,2015,24,3,Non-Holiday,59090182,Tops,0,0,0,0,0,0,0,1,0,0,40,1,0,0,0,0,0,1,0,155511,26,0,17,0,0,0.0,-3,0
2,253723882,28.0,81,12.276,0,301,2018,35,0,Non-Holiday,59090182,Tops,0,0,0,0,0,0,0,1,0,0,40,1,0,0,0,0,0,1,0,156725,26,0,17,1,0,0.0,-1,1
3,8735422,28.0,79,12.276,0,1940,2014,9,6,Non-Holiday,59090182,Tops,0,0,0,0,0,0,0,1,0,0,40,1,0,0,0,0,0,1,0,155137,26,0,17,1,0,0.0,-1,1
4,181205722,14.0,49,5.056,2,1474,2015,24,3,Non-Holiday,59090182,Tops,0,0,0,0,0,0,0,1,0,0,40,1,0,0,0,0,0,1,0,155094,26,0,17,0,0,0.0,-3,0


[INFO] The full list of column names include:
['riid',
 'retention_score',
 'frequency_score',
 'recency_score',
 'sends_since_last_open',
 'days_subscr',
 'aq_year',
 'aq_week',
 'aq_dayofweek',
 'aq_period',
 'campaign_id',
 'campaign_category',
 'campaign_Brand',
 'campaign_Core',
 'campaign_Dedicated',
 'campaign_InnovationSpotlight',
 'campaign_NewArrivals',
 'campaign_ProductSpotlight',
 'campaign_Replen',
 'campaign_Tops',
 'campaign_Trend',
 'campaign_Other',
 'discount',
 'promo',
 'sale',
 'is_one_for_free',
 'free_shipping',
 'is_exclusive',
 'has_urgency',
 'sl_contains_price',
 'is_discount_mentioned',
 'message_size',
 'sent_week',
 'sent_dayofweek',
 'sent_hr',
 'opened',
 'unsub',
 'rev_3dv2',
 'reward',
 'optimal_action']


In [16]:
#Setting overwrite flag for Processed Data
overwrite_processed = False

#Create a folders to contain the processed records
dest_folder = Path("/content/drive/MyDrive/Bandit_Project/aleksey/processed")
dest_folder.mkdir(exist_ok=True)

#Build the context that you would like to keep track of
user_context_cols = ['riid', 'retention_score', 'frequency_score', 'recency_score', 'sends_since_last_open'] #Not using any aquisition features

campaign_context_cols = ['campaign_category', 'discount', 'promo', 'sale', 'is_one_for_free','free_shipping','is_exclusive','has_urgency']

email_context_cols = ['sl_contains_price','is_discount_mentioned','sent_week','sent_dayofweek','sent_hr']

context_cols = user_context_cols + campaign_context_cols + email_context_cols
print(f"[INFO] The context is {len(context_cols)} cols long...")

#Not sure how to handle train_y: For now, optimal_action classification 
outcomes_cols = ["opened", "unsub", "rev_3dv2"]
reward_cols = ["reward"]
action_cols = ["optimal_action"]

#Process the files in both train & valid
train_file_path = process_input(train_folder, dest_folder, context_cols, action_cols, overwrite_processed)
val_file_path = process_input(valid_folder, dest_folder, context_cols, action_cols, overwrite_processed)

#Print the heads of the training & validation data
train = pd.read_csv(train_file_path)
val = pd.read_csv(val_file_path)

print(f"\n[INFO] We have {len(train)} elements in the Training Set")
print("[INFO] Printing the head of Training Set:")
train.head(5)
print(f"\n[INFO] We have {len(val)} elements in the Validation Set")
print("[INFO] Printing the head of Validation Set:")
val.head(5)


[INFO] The context is 18 cols long...
[INFO] Currently working on train: sends_2019_wk27.csv
[INFO] Currently working on train: sends_2019_wk26.csv
[INFO] Currently working on train: sends_2019_wk28.csv
[INFO] Currently working on train: sends_2019_wk29.csv
[INFO] Currently working on train: sends_2019_wk31.csv
[INFO] Currently working on train: sends_2019_wk32.csv
[INFO] Currently working on train: sends_2019_wk33.csv
[INFO] Currently working on train: sends_2019_wk30.csv
[INFO] Currently working on train: sends_2019_wk34.csv
[INFO] Currently working on train: sends_2019_wk35.csv
[INFO] Currently working on train: sends_2019_wk36.csv
[INFO] Currently working on train: sends_2019_wk37.csv
[INFO] Currently working on train: sends_2019_wk38.csv
[INFO] Currently working on train: sends_2019_wk39.csv
[INFO] Currently working on train: sends_2019_wk40.csv
[INFO] Currently working on train: sends_2019_wk41.csv
[INFO] Currently working on train: sends_2019_wk42.csv
[INFO] Currently working on

Unnamed: 0,riid,retention_score,frequency_score,recency_score,sends_since_last_open,campaign_category,discount,promo,sale,is_one_for_free,free_shipping,is_exclusive,has_urgency,sl_contains_price,is_discount_mentioned,sent_week,sent_dayofweek,sent_hr,optimal_action
0,259865082,9.333,18,1.555,3,Trend,50,1,0,0,0,0,0,1,0,27,1,21,1
1,7138122,28.0,25,7.071,0,Trend,50,1,0,0,0,0,0,1,0,27,1,21,1
2,223293342,0.737,1,0.022,38,Trend,50,1,0,0,0,0,0,1,0,27,1,21,0
3,211545802,0.824,1,0.13,34,Trend,50,1,0,0,0,0,0,1,0,27,1,21,0
4,163348802,28.0,31,7.04,1,Trend,50,1,0,0,0,0,0,1,0,27,1,21,0



[INFO] We have 212245 elements in the Validation Set
[INFO] Printing the head of Validation Set:


Unnamed: 0,riid,retention_score,frequency_score,recency_score,sends_since_last_open,campaign_category,discount,promo,sale,is_one_for_free,free_shipping,is_exclusive,has_urgency,sl_contains_price,is_discount_mentioned,sent_week,sent_dayofweek,sent_hr,optimal_action
0,184980282,28.0,36,4.705,0,Tops,40,1,0,0,0,0,0,1,0,26,0,17,0
1,156790362,14.0,5,2.982,2,Tops,40,1,0,0,0,0,0,1,0,26,0,17,0
2,253723882,28.0,81,12.276,0,Tops,40,1,0,0,0,0,0,1,0,26,0,17,1
3,8735422,28.0,79,12.276,0,Tops,40,1,0,0,0,0,0,1,0,26,0,17,1
4,181205722,14.0,49,5.056,2,Tops,40,1,0,0,0,0,0,1,0,26,0,17,0


# Statistical Analysis Of Outcomes

[DONE] In this section, we have confirmed 3 things:


1.   How many people opened vs did not open
2.   How many people unsubscribed vs did not unsubscribe
3.   Whether optimal action is 1 for open OR purchase and 0 for not open OR unsub



In [10]:
"""
length_l = []
opens_l = []
unsubs_l = []
unique_users_s = set()
campaign_types_s = set()

#Iterate through the weekly data
for file_path in weekly_data_path_l:

  #Print status update
  print(f"[INFO] Working on {file_path.name}")
  
  #Count the number of elements
  df = pd.read_csv(file_path)
  length = len(df)
  length_l.append(length)

  #Count Opens vs Not Opens
  opens = (df["opened"] == 1).astype(int).sum()
  opens_l.append(opens)

  #Count Unsubs vs Not Unsubs
  unsubs = (df["unsub"] == 1).astype(int).sum()
  unsubs_l.append(unsubs)

  #Assert whether optimal action follows the rule
  #Create a Panda Series that follows this rule and assert
  check_series = pd.Series(np.ones_like(df["optimal_action"].values, dtype=int))

  check_series[df["opened"] == 0] = 0
  check_series[df["unsub"] == 1] = 0
  assert (df["optimal_action"] == check_series).all()

  #Update the unique user set with user IDs from the df
  unique_users = list(df["riid"].unique())
  unique_users_s.update(unique_users)

  #Update the unique campaign type set from the df
  campaign_types = list(df['campaign_category'].unique())
  campaign_types_s.update(campaign_types)

total_length = sum(length_l)
print(f"[INFO] Total number of data - {total_length}")

opened = sum(opens_l)/total_length
unsub = sum(unsubs_l)/total_length

print(f"[INFO] % of opened - {opened}")
print(f"[INFO] % of unsubscribed - {unsub}")
print(f"[INFO] # of unique users - {len(unique_users_s)}")
print(f"[INFO] # of unique campaign types - {len(campaign_types_s)}")

"""

'\nlength_l = []\nopens_l = []\nunsubs_l = []\nunique_users_s = set()\ncampaign_types_s = set()\n\n#Iterate through the weekly data\nfor file_path in weekly_data_path_l:\n\n  #Print status update\n  print(f"[INFO] Working on {file_path.name}")\n  \n  #Count the number of elements\n  df = pd.read_csv(file_path)\n  length = len(df)\n  length_l.append(length)\n\n  #Count Opens vs Not Opens\n  opens = (df["opened"] == 1).astype(int).sum()\n  opens_l.append(opens)\n\n  #Count Unsubs vs Not Unsubs\n  unsubs = (df["unsub"] == 1).astype(int).sum()\n  unsubs_l.append(unsubs)\n\n  #Assert whether optimal action follows the rule\n  #Create a Panda Series that follows this rule and assert\n  check_series = pd.Series(np.ones_like(df["optimal_action"].values, dtype=int))\n\n  check_series[df["opened"] == 0] = 0\n  check_series[df["unsub"] == 1] = 0\n  assert (df["optimal_action"] == check_series).all()\n\n  #Update the unique user set with user IDs from the df\n  unique_users = list(df["riid"].uniqu

# Preparing the Dataset for Tensorflow

## Creating the Dataloader

In [17]:
#Given a dataframe in memory
def df_to_dataloader(dataframe, target, shuffle=True, batch_size=32):
  dataframe = dataframe.copy()
  labels = dataframe.pop(target)
  ds = tf.data.Dataset.from_tensor_slices((dict(dataframe), labels))
  if shuffle:
    ds = ds.shuffle(buffer_size=len(dataframe))
  ds = ds.batch(batch_size)
  return ds

In [18]:
def set_shape(value):
    value.set_shape((17, ))
    return value

In [13]:
"""
def input_fn(df_data, target, num_epochs = 50, shuffle = True, batch_size = 32):
  
  #df_data = pd.read_csv(data_file, header=[0], skiprows=1)
  # remove NaN elements
  #df_data = df_data.dropna(how="any", axis=0)
  df_data = df_data.copy()
  labels = df_data.pop(target)
  return tf.compat.v1.estimator.inputs.pandas_input_fn(
      x=df_data,
      y=labels,
      batch_size=batch_size,
      num_epochs=num_epochs,
      shuffle=shuffle)
"""

'\ndef input_fn(df_data, target, num_epochs = 50, shuffle = True, batch_size = 32):\n  \n  #df_data = pd.read_csv(data_file, header=[0], skiprows=1)\n  # remove NaN elements\n  #df_data = df_data.dropna(how="any", axis=0)\n  df_data = df_data.copy()\n  labels = df_data.pop(target)\n  return tf.compat.v1.estimator.inputs.pandas_input_fn(\n      x=df_data,\n      y=labels,\n      batch_size=batch_size,\n      num_epochs=num_epochs,\n      shuffle=shuffle)\n'

In [19]:
batch_size=512

#train_dl = input_fn(train, "optimal_action", batch_size=batch_size)
#val_dl = input_fn(val, "optimal_action", shuffle=False, batch_size=batch_size)
train_dl = lambda : df_to_dataloader(train, "optimal_action", batch_size=batch_size)
#train_dl = train_dl.map(set_shape).batch(batch_size)

val_dl = lambda : df_to_dataloader(val, "optimal_action", shuffle=False, batch_size=batch_size)
#val_dl = val_dl.map(set_shape).batch(batch_size)

## Base Columns

### Numeric Columns

1. 'retention_score'
2. 'frequency_score'
3. 'recency_score'
4. 'sends_since_last_open'
5. 'discount'
6. 'sent_week'
7. 'sent_dayofweek'
8. 'sent_hr'

To bucketize or not?

In [20]:
#Get the global mean & std statistics
rolling_stats_filepath = Path("/content/drive/MyDrive/Bandit_Project/rolling_statistics.pkl")
with rolling_stats_filepath.open(mode='rb') as rolling_stats_file:
  rolling_stats = pkl.load(rolling_stats_file)

#Create a function that standardizes the column to N(0, 1)
def standardize_column(data, mean, std):                       
  data = (tf.cast(data, dtype=tf.float32) - mean)/std
  return tf.reshape(data, [-1, 1])

In [21]:
#Initialize a list to contain the numeric feature columns
numeric_feature_layer = []
numeric_feature_layer_input = {}

#Create 2 dictionaries with key as numeric feature column name
#and val as the value of the numeric feature column name
numeric_feature_col_names = ['retention_score','recency_score','frequency_score',
                             'sent_week','sent_dayofweek','sent_hr','discount',
                             'sends_since_last_open']
MEANS = {feature: rolling_stats[feature]["mean"] for feature in numeric_feature_col_names}
STDS = {feature: rolling_stats[feature]["std"] for feature in numeric_feature_col_names}

#Generate numeric cols
for feature in numeric_feature_col_names:
  numeric_feature_col = tf.feature_column.numeric_column(feature, 
                                                     normalizer_fn=partial(standardize_column, mean=MEANS[feature], std=STDS[feature]))
  numeric_feature_layer.append(numeric_feature_col)
  numeric_feature_layer_input[feature] = tf.keras.Input(shape=(1,), name=feature)

#Display the columns
print("[INFO] The numeric feature columns are:")
pprint(numeric_feature_layer)

print("\n[INFO] The inputs to numeric feature columns are:")
pprint(numeric_feature_layer_input)

[INFO] The numeric feature columns are:
[NumericColumn(key='retention_score', shape=(1,), default_value=None, dtype=tf.float32, normalizer_fn=functools.partial(<function standardize_column at 0x7f25d982ab00>, mean=11.467980895825553, std=11.35391986430546)),
 NumericColumn(key='recency_score', shape=(1,), default_value=None, dtype=tf.float32, normalizer_fn=functools.partial(<function standardize_column at 0x7f25d982ab00>, mean=1.23904221901564, std=2.216794122042123)),
 NumericColumn(key='frequency_score', shape=(1,), default_value=None, dtype=tf.float32, normalizer_fn=functools.partial(<function standardize_column at 0x7f25d982ab00>, mean=14.977138288600283, std=20.754428265423773)),
 NumericColumn(key='sent_week', shape=(1,), default_value=None, dtype=tf.float32, normalizer_fn=functools.partial(<function standardize_column at 0x7f25d982ab00>, mean=28.594960628048973, std=14.377041994557581)),
 NumericColumn(key='sent_dayofweek', shape=(1,), default_value=None, dtype=tf.float32, norma

### Categorical Columns

1. 'riid'
2. 'campaign_category'
3. 'promo'
4. 'sale'
5. 'is_one_for_free'
6. 'free_shipping'
7. 'is_exclusive'
8. 'has_urgency'
9. 'sl_contains_price'
10.'is_discount_mentioned'

In [56]:
#Initialize a list to contain the categorical feature columns
categorical_feature_layer = []
categorical_feature_layer_input = {}

CATEGORIES = {
    'promo' : [0, 1],
    'sale' : [0, 1],
    'campaign_category': ['Trend', 'NewArrivals', 'Dedicated', 'InnovationSpotlight', 'Core', 'Replen', 'ProductSpotlight', 'Other', 'Brand', 'Tops'],
    'is_one_for_free': [0, 1],
    'free_shipping': [0, 1],
    'is_exclusive': [0, 1],
    'has_urgency': [0, 1],
    'sl_contains_price': [0, 1],
    'is_discount_mentioned': [0, 1],
}

#Generate categorical cols
for (feature, vocab) in CATEGORIES.items():
  categorical_feature_col = tf.feature_column.indicator_column(tf.feature_column.categorical_column_with_vocabulary_list(feature, vocab))
  categorical_feature_layer.append(categorical_feature_col)
  categorical_feature_layer_input[feature] = tf.keras.Input(shape=(1,), name=feature)

#Create the riid categorical col
riid = tf.feature_column.categorical_column_with_hash_bucket("riid", hash_bucket_size=2000000, dtype=tf.int64)
#categorical_feature_layer.append(riid)
#categorical_feature_layer_input["riid"] = tf.keras.Input(shape=(1,), name="riid")

## For Wide Model

### Crossed Columns

Crossing the following feature-combinations:

1.   'riid' vs. 'campaign_category'
2.   'riid' vs. 'discount'
3.   'riid' vs. 'is_one_for_free'
4.   'riid' vs. 'free_shipping'
5.   'riid' vs. 'is_exclusive'
6.   'riid' vs. 'has_urgency',
7.   'riid' vs. 'sl_contains_price',
8. 'riid' vs. 'is_discount_mentioned',
9. 'riid' vs. 'sent_week',
10. 'riid' vs. 'sent_dayofweek',
11. 'riid' vs. 'sent_hr'



In [50]:
crossed_columns = [
  tf.feature_column.crossed_column(["riid", 'campaign_category'], hash_bucket_size=20000000),
  tf.feature_column.crossed_column(["riid", 'discount'], hash_bucket_size=10000000),
  tf.feature_column.crossed_column(["riid", 'is_one_for_free'], hash_bucket_size=4000000),
  tf.feature_column.crossed_column(["riid", 'free_shipping'], hash_bucket_size=4000000),
  tf.feature_column.crossed_column(["riid", 'is_exclusive'], hash_bucket_size=4000000),
  tf.feature_column.crossed_column(["riid", 'has_urgency'], hash_bucket_size=4000000),
  tf.feature_column.crossed_column(["riid", 'sl_contains_price'], hash_bucket_size=4000000),
  tf.feature_column.crossed_column(["riid", 'is_discount_mentioned'], hash_bucket_size=4000000),
  tf.feature_column.crossed_column(["riid", 'sent_week'], hash_bucket_size=10000000),
  tf.feature_column.crossed_column(["riid", 'sent_dayofweek'], hash_bucket_size=60000000),
  tf.feature_column.crossed_column(["riid", 'sent_hr'], hash_bucket_size=50000000),
]
wide_columns = numeric_feature_layer + crossed_columns

## For Deep Model

In [58]:
deep_columns = numeric_feature_layer + categorical_feature_layer
deep_columns.append(tf.feature_column.embedding_column(riid, dimension=20))

# Model Comparison

In [30]:
models_dir = Path("/content/drive/MyDrive/Bandit_Project/models")
wmodel = (models_dir/"Wide").mkdir(exist_ok=True)
dmodel = (models_dir/"Deep").mkdir(exist_ok=True)
wdmodel = (models_dir/"W&D").mkdir(exist_ok=True)

#Hyperparameters
n_epochs=25
lr = 1e-3
n_steps = int(n_epochs * len(train) / batch_size)

## Wide Model

In [31]:
wm = tf.estimator.LinearClassifier(
    model_dir=wmodel, 
    feature_columns=wide_columns,
    n_classes=2,
    optimizer=tf.keras.optimizers.Adam(learning_rate = lr))

INFO:tensorflow:Using default config.
INFO:tensorflow:Using config: {'_model_dir': '/tmp/tmpmezi3063', '_tf_random_seed': None, '_save_summary_steps': 100, '_save_checkpoints_steps': None, '_save_checkpoints_secs': 600, '_session_config': allow_soft_placement: true
graph_options {
  rewrite_options {
    meta_optimizer_iterations: ONE
  }
}
, '_keep_checkpoint_max': 5, '_keep_checkpoint_every_n_hours': 10000, '_log_step_count_steps': 100, '_train_distribute': None, '_device_fn': None, '_protocol': None, '_eval_distribute': None, '_experimental_distribute': None, '_experimental_max_worker_delay_secs': None, '_session_creation_timeout_secs': 7200, '_checkpoint_save_graph_def': True, '_service': None, '_cluster_spec': ClusterSpec({}), '_task_type': 'worker', '_task_id': 0, '_global_id_in_cluster': 0, '_master': '', '_evaluation_master': '', '_is_chief': True, '_num_ps_replicas': 0, '_num_worker_replicas': 1}


In [33]:
wm.train(train_dl, steps=n_steps)
wm_results = wm.evaluate(val_dl, steps=None)
for key in sorted(wm_results):
  print("%s: %s" % (key, wm_results[key]))

accuracy: 0.84693164
accuracy_baseline: 0.795835
auc: 0.83979183
auc_precision_recall: 0.63070023
average_loss: 0.39133254
global_step: 1377
label/mean: 0.204165
loss: 0.39142895
precision: 0.66245246
prediction/mean: 0.28447124
recall: 0.51028085


## Deep Model

In [59]:
dm = tf.estimator.DNNClassifier(
    hidden_units=[512, 256, 128], 
    feature_columns=deep_columns, 
    model_dir=dmodel, 
    n_classes=2,
    optimizer=tf.keras.optimizers.Adam(learning_rate = lr)
)

INFO:tensorflow:Using default config.
INFO:tensorflow:Using config: {'_model_dir': '/tmp/tmpdp8gqx52', '_tf_random_seed': None, '_save_summary_steps': 100, '_save_checkpoints_steps': None, '_save_checkpoints_secs': 600, '_session_config': allow_soft_placement: true
graph_options {
  rewrite_options {
    meta_optimizer_iterations: ONE
  }
}
, '_keep_checkpoint_max': 5, '_keep_checkpoint_every_n_hours': 10000, '_log_step_count_steps': 100, '_train_distribute': None, '_device_fn': None, '_protocol': None, '_eval_distribute': None, '_experimental_distribute': None, '_experimental_max_worker_delay_secs': None, '_session_creation_timeout_secs': 7200, '_checkpoint_save_graph_def': True, '_service': None, '_cluster_spec': ClusterSpec({}), '_task_type': 'worker', '_task_id': 0, '_global_id_in_cluster': 0, '_master': '', '_evaluation_master': '', '_is_chief': True, '_num_ps_replicas': 0, '_num_worker_replicas': 1}


In [60]:
dm.train(train_dl, steps=n_steps)
dm_results = dm.evaluate(val_dl, steps=None)
for key in sorted(dm_results):
  print("%s: %s" % (key, dm_results[key]))

INFO:tensorflow:Calling model_fn.
INFO:tensorflow:Done calling model_fn.
INFO:tensorflow:Create CheckpointSaverHook.
INFO:tensorflow:Graph was finalized.
INFO:tensorflow:Running local_init_op.
INFO:tensorflow:Done running local_init_op.
INFO:tensorflow:Calling checkpoint listeners before saving checkpoint 0...
INFO:tensorflow:Saving checkpoints for 0 into /tmp/tmpdp8gqx52/model.ckpt.
INFO:tensorflow:Calling checkpoint listeners after saving checkpoint 0...
INFO:tensorflow:loss = 0.6626018, step = 0
INFO:tensorflow:global_step/sec: 34.8699
INFO:tensorflow:loss = 0.3518513, step = 100 (2.870 sec)
INFO:tensorflow:global_step/sec: 36.3829
INFO:tensorflow:loss = 0.32728338, step = 200 (2.745 sec)
INFO:tensorflow:global_step/sec: 35.4342
INFO:tensorflow:loss = 0.3174025, step = 300 (2.825 sec)
INFO:tensorflow:global_step/sec: 36.612
INFO:tensorflow:loss = 0.31972384, step = 400 (2.732 sec)
INFO:tensorflow:global_step/sec: 36.8281
INFO:tensorflow:loss = 0.34913072, step = 500 (2.717 sec)
INFO

<tensorflow_estimator.python.estimator.canned.dnn.DNNClassifierV2 at 0x7f22c2b73c50>

INFO:tensorflow:Calling model_fn.
INFO:tensorflow:Done calling model_fn.
INFO:tensorflow:Starting evaluation at 2021-03-01T17:24:26Z
INFO:tensorflow:Graph was finalized.
INFO:tensorflow:Restoring parameters from /tmp/tmpdp8gqx52/model.ckpt-1377
INFO:tensorflow:Running local_init_op.
INFO:tensorflow:Done running local_init_op.
INFO:tensorflow:Inference Time : 6.52355s
INFO:tensorflow:Finished evaluation at 2021-03-01-17:24:33
INFO:tensorflow:Saving dict for global step 1377: accuracy = 0.8544229, accuracy_baseline = 0.795835, auc = 0.85163444, auc_precision_recall = 0.66007787, average_loss = 0.35322976, global_step = 1377, label/mean = 0.204165, loss = 0.3533473, precision = 0.71287704, prediction/mean = 0.21013147, recall = 0.4804883
INFO:tensorflow:Saving 'checkpoint_path' summary for global step 1377: /tmp/tmpdp8gqx52/model.ckpt-1377
accuracy: 0.8544229
accuracy_baseline: 0.795835
auc: 0.85163444
auc_precision_recall: 0.66007787
average_loss: 0.35322976
global_step: 1377
label/mean:

## Wide & Deep Model

In [62]:
wdm = tf.estimator.DNNLinearCombinedClassifier(
    model_dir=wdmodel, 
    linear_feature_columns=crossed_columns,
    dnn_feature_columns=deep_columns,
    dnn_hidden_units=[512, 256, 128],
    n_classes=2)

INFO:tensorflow:Using default config.
INFO:tensorflow:Using config: {'_model_dir': '/tmp/tmp6e1qqhhw', '_tf_random_seed': None, '_save_summary_steps': 100, '_save_checkpoints_steps': None, '_save_checkpoints_secs': 600, '_session_config': allow_soft_placement: true
graph_options {
  rewrite_options {
    meta_optimizer_iterations: ONE
  }
}
, '_keep_checkpoint_max': 5, '_keep_checkpoint_every_n_hours': 10000, '_log_step_count_steps': 100, '_train_distribute': None, '_device_fn': None, '_protocol': None, '_eval_distribute': None, '_experimental_distribute': None, '_experimental_max_worker_delay_secs': None, '_session_creation_timeout_secs': 7200, '_checkpoint_save_graph_def': True, '_service': None, '_cluster_spec': ClusterSpec({}), '_task_type': 'worker', '_task_id': 0, '_global_id_in_cluster': 0, '_master': '', '_evaluation_master': '', '_is_chief': True, '_num_ps_replicas': 0, '_num_worker_replicas': 1}


In [63]:
wdm.train(input_fn=train_dl, steps=n_steps)
wdm_results = wdm.evaluate(input_fn=val_dl, steps=None)
for key in sorted(wdm_results):
  print("%s: %s" % (key, wdm_results[key]))

INFO:tensorflow:Calling model_fn.




Instructions for updating:
Call initializer instance with the dtype argument instead of passing it to the constructor
INFO:tensorflow:Done calling model_fn.
INFO:tensorflow:Create CheckpointSaverHook.
INFO:tensorflow:Graph was finalized.
INFO:tensorflow:Running local_init_op.
INFO:tensorflow:Done running local_init_op.
INFO:tensorflow:Calling checkpoint listeners before saving checkpoint 0...
INFO:tensorflow:Saving checkpoints for 0 into /tmp/tmp6e1qqhhw/model.ckpt.
INFO:tensorflow:Calling checkpoint listeners after saving checkpoint 0...
INFO:tensorflow:loss = 0.7212872, step = 0
INFO:tensorflow:global_step/sec: 28.1737
INFO:tensorflow:loss = 0.5517755, step = 100 (3.551 sec)
INFO:tensorflow:global_step/sec: 36.4099
INFO:tensorflow:loss = 0.4467112, step = 200 (2.743 sec)
INFO:tensorflow:global_step/sec: 35.6876
INFO:tensorflow:loss = 0.4185835, step = 300 (2.802 sec)
INFO:tensorflow:global_step/sec: 35.0818
INFO:tensorflow:loss = 0.3979456, step = 400 (2.852 sec)
INFO:tensorflow:glob

<tensorflow_estimator.python.estimator.canned.dnn_linear_combined.DNNLinearCombinedClassifierV2 at 0x7f25d86434d0>

INFO:tensorflow:Calling model_fn.
INFO:tensorflow:Done calling model_fn.
INFO:tensorflow:Starting evaluation at 2021-03-01T17:27:25Z
INFO:tensorflow:Graph was finalized.
INFO:tensorflow:Restoring parameters from /tmp/tmp6e1qqhhw/model.ckpt-1377
INFO:tensorflow:Running local_init_op.
INFO:tensorflow:Done running local_init_op.
INFO:tensorflow:Inference Time : 35.72161s
INFO:tensorflow:Finished evaluation at 2021-03-01-17:28:01
INFO:tensorflow:Saving dict for global step 1377: accuracy = 0.85342413, accuracy_baseline = 0.795835, auc = 0.8427185, auc_precision_recall = 0.63138276, average_loss = 0.3625078, global_step = 1377, label/mean = 0.204165, loss = 0.36262825, precision = 0.71225643, prediction/mean = 0.20428027, recall = 0.47326517
INFO:tensorflow:Saving 'checkpoint_path' summary for global step 1377: /tmp/tmp6e1qqhhw/model.ckpt-1377
accuracy: 0.85342413
accuracy_baseline: 0.795835
auc: 0.8427185
auc_precision_recall: 0.63138276
average_loss: 0.3625078
global_step: 1377
label/mean