# DIN (Electronics 10%)

## Import

In [1]:
import os
import sys
import time
import json
import random
import pickle
import argparse
import numpy as np
import pandas as pd
import tensorflow as tf
import tensorflow.keras.layers as nn

## Configuration

In [2]:
def argparser():
    parser = argparse.ArgumentParser()

    parser.add_argument('--lr', default=0.1, help='learning rate', type=float)
    parser.add_argument('--train_batch_size', default=32, help='batch size', type=int)
    parser.add_argument('--test_batch_size', default=512, help='batch size', type=int)
    parser.add_argument('--epochs', default=10, help='number of epochs', type=int)
    parser.add_argument('--print_step', default=1000, help='step size for print log', type=int)

    parser.add_argument('--dataset_dir', default='./data/', help='dataset path')
    parser.add_argument('--model_path', default='./models/', help='model load path', type=str)
    parser.add_argument('--log_path', default='./logs/', help='log path fot tensorboard', type=str)
    parser.add_argument('--is_reuse', default=False)
    parser.add_argument('--multi_gpu', default=False)

    parser.add_argument('--user_count', default=192403, help='number of users', type=int)
    parser.add_argument('--item_count', default=63001, help='number of items', type=int)
    parser.add_argument('--cate_count', default=801, help='number of categories', type=int)

    parser.add_argument('--user_dim', default=128, help='dimension of user', type=int)
    parser.add_argument('--item_dim', default=64, help='dimension of item', type=int)
    parser.add_argument('--cate_dim', default=64, help='dimension of category', type=int)

    parser.add_argument('--dim_layers', default=[80,40,1], type=int)

    args = parser.parse_args()

    return args

In [3]:
sys.argv = [''] # add this to resolve the execution problem, when all arguments have a default value
args = argparser()

## Dataset

### Description of Dataset

This dataset includes reviews (ratings, text, helpfulness votes), product metadata (descriptions, category information, price, brand, and image features), and links (also viewed/also bought graphs).

**Reviews:**
* `asin`: ID of the product
* `reviewerID`: ID of the reviewer
* `unixReviewTime`: time of the review (unix time)

**Metadata:**
* `asin`: ID of the product
* `categories`: list of categories the product belongs to

Metadata includes descriptions, price, sales-rank, brand info, and co-purchasing links

### Process of Data

In [4]:
# read the reviews json file as `reviews_df`
# reviews_Electronics_5
# Magazine_Subscriptions
# Software
with open('./data/raw_data/reviews_Electronics_5.json') as fin:
    df = {}
    for i, line in enumerate(fin):
        df[i] = eval(line)
        # df[i] = json.loads(line)
    reviews_df = pd.DataFrame.from_dict(df, orient='index')

In [5]:
# view the overall reviews dataset
# reviews_Electronics_5: 1689188 rows × 9 columns
reviews_df

Unnamed: 0,reviewerID,asin,reviewerName,helpful,reviewText,overall,summary,unixReviewTime,reviewTime
0,AO94DHGC771SJ,0528881469,amazdnu,"[0, 0]",We got this GPS for my husband who is an (OTR)...,5.0,Gotta have GPS!,1370131200,"06 2, 2013"
1,AMO214LNFCEI4,0528881469,Amazon Customer,"[12, 15]","I'm a professional OTR truck driver, and I bou...",1.0,Very Disappointed,1290643200,"11 25, 2010"
2,A3N7T0DY83Y4IG,0528881469,C. A. Freeman,"[43, 45]","Well, what can I say. I've had this unit in m...",3.0,1st impression,1283990400,"09 9, 2010"
3,A1H8PY3QHMQQA0,0528881469,"Dave M. Shaw ""mack dave""","[9, 10]","Not going to write a long review, even thought...",2.0,"Great grafics, POOR GPS",1290556800,"11 24, 2010"
4,A24EV6RXELQZ63,0528881469,Wayne Smith,"[0, 0]",I've had mine for a year and here's what we go...,1.0,"Major issues, only excuses for support",1317254400,"09 29, 2011"
...,...,...,...,...,...,...,...,...,...
1689183,A34BZM6S9L7QI4,B00LGQ6HL8,"Candy Cane ""Is it just me?""","[1, 1]",Burned these in before listening to them for a...,5.0,Boom -- Pop -- Pow. These deliver.,1405555200,"07 17, 2014"
1689184,A1G650TTTHEAL5,B00LGQ6HL8,"Charles Spanky ""Zumina Reviews""","[0, 0]",Some people like DJ style headphones or earbud...,5.0,"Thin and light, without compromising on sound ...",1405382400,"07 15, 2014"
1689185,A25C2M3QF9G7OQ,B00LGQ6HL8,Comdet,"[0, 0]",I&#8217;m a big fan of the Brainwavz S1 (actua...,5.0,Same form factor and durability as the S1 with...,1405555200,"07 17, 2014"
1689186,A1E1LEVQ9VQNK,B00LGQ6HL8,J. Chambers,"[0, 0]","I've used theBrainwavz S1 In Ear Headphones, a...",5.0,Superb audio quality in a very comfortable set...,1405641600,"07 18, 2014"


In [6]:
# take only the first n/10 groups
reviews_df_groupby = reviews_df.groupby('reviewerID')
grouped = [g[1] for g in list(reviews_df_groupby)[:int(len(reviews_df_groupby)/10)]]
reviews_df = pd.concat(grouped)

In [7]:
reviews_df

Unnamed: 0,reviewerID,asin,reviewerName,helpful,reviewText,overall,summary,unixReviewTime,reviewTime
321546,A000715434M800HLCENK9,B000UYYZ0M,DP,"[0, 0]",So the screen itself is OK. it is an actual sc...,1.0,Spring is not strong,1400457600,"05 19, 2014"
450446,A000715434M800HLCENK9,B001EHAI6Y,DP,"[0, 0]",I had a complicated set up for my screen. I ne...,5.0,Exactly what i wanted,1400457600,"05 19, 2014"
738088,A000715434M800HLCENK9,B003AFONFU,DP,"[1, 1]",The mount is good if you account for the play ...,3.0,beware of the play,1400457600,"05 19, 2014"
766200,A000715434M800HLCENK9,B003ES5ZUU,DP,"[0, 0]",For some reason this product doesnt work that ...,2.0,Not great with Apple TV,1400457600,"05 19, 2014"
1678142,A000715434M800HLCENK9,B00HMZG3YS,DP,"[0, 0]",Great box Exactly what i needed. it isnt water...,5.0,Very good,1400457600,"05 19, 2014"
...,...,...,...,...,...,...,...,...,...
565072,A1DJR7B306SJIY,B0026RHPSU,"James ""Jim""","[8, 13]",In my opinion this is a very poor choice of wa...,1.0,Why buy an expensive dock?,1295308800,"01 18, 2011"
697886,A1DJR7B306SJIY,B0031QNP8O,"James ""Jim""","[1, 3]",I had a Garmin Trek prior to buying this. I wa...,2.0,Difficult to operate for the neophyte,1381622400,"10 13, 2013"
742523,A1DJR7B306SJIY,B003BEDQR6,"James ""Jim""","[0, 0]","For the price, this is a great system. The fac...",4.0,Good sound,1358640000,"01 20, 2013"
1453061,A1DJR7B306SJIY,B0097BEF1S,"James ""Jim""","[7, 27]",The main gripes I have about this product is n...,1.0,Apple is deceitful,1368662400,"05 16, 2013"


In [8]:
# serialize the object `reviews_df` to the open file object `reviews.pkl`
with open('./data/reviews.pkl', 'wb') as f:
    pickle.dump(reviews_df, f, pickle.HIGHEST_PROTOCOL)

In [9]:
# read the product metadata json file as `meta_df`
with open('./data/raw_data/meta_Electronics.json') as fin:
    df = {}
    for i, line in enumerate(fin):
        df[i] = eval(line)
        # df[i] = json.loads(line)
    meta_df = pd.DataFrame.from_dict(df, orient='index')

In [10]:
# view the overall product meta dataset
meta_df

Unnamed: 0,asin,imUrl,description,categories,title,price,salesRank,related,brand
0,0132793040,http://ecx.images-amazon.com/images/I/31JIPhp%...,The Kelby Training DVD Mastering Blend Modes i...,"[[Electronics, Computers & Accessories, Cables...",Kelby Training DVD: Mastering Blend Modes in A...,,,,
1,0321732944,http://ecx.images-amazon.com/images/I/31uogm6Y...,,"[[Electronics, Computers & Accessories, Cables...",Kelby Training DVD: Adobe Photoshop CS5 Crash ...,,,,
2,0439886341,http://ecx.images-amazon.com/images/I/51k0qa8f...,Digital Organizer and Messenger,"[[Electronics, Computers & Accessories, PDAs, ...",Digital Organizer and Messenger,8.15,{'Electronics': 144944},"{'also_viewed': ['0545016266', 'B009ECM8QY', '...",
3,0511189877,http://ecx.images-amazon.com/images/I/41HaAhbv...,The CLIKR-5 UR5U-8780L remote control is desig...,"[[Electronics, Accessories & Supplies, Audio &...",CLIKR-5 Time Warner Cable Remote Control UR5U-...,23.36,,"{'also_viewed': ['B001KC08A4', 'B00KUL8O0W', '...",
4,0528881469,http://ecx.images-amazon.com/images/I/51FnRkJq...,"Like its award-winning predecessor, the Intell...","[[Electronics, GPS & Navigation, Vehicle GPS, ...",Rand McNally 528881469 7-inch Intelliroute TND...,299.99,,"{'also_viewed': ['B006ZOI9OY', 'B00C7FKT2A', '...",
...,...,...,...,...,...,...,...,...,...
498191,BT008V9J9U,http://ecx.images-amazon.com/images/I/313e6SJm...,Vehicle suction cup mount (replacement) NOTICE...,"[[Electronics, GPS & Navigation, GPS System Ac...",Suction Cup Mount,21.99,,{'buy_after_viewing': ['B000EPFCC2']},Garmin
498192,BT008SXQ4C,http://ecx.images-amazon.com/images/I/31oF9oNv...,Quatech - 1 Port PCMCIA to DB-25 Parallel Adap...,"[[Electronics, Computers & Accessories, Cables...",Parallel PCMCIA Card 1PORT Epp,23.99,,"{'also_bought': ['B000SR2H4W', 'B001Q7X0W6'], ...",
498193,BT008G3W52,http://ecx.images-amazon.com/images/I/21WIrX5f...,C2G - 5m Ultma USB 2.0 A Mini B Cble,"[[Electronics, Computers & Accessories, Cables...",C2G / Cables to Go 5M Ultima USB 2.0 Cable,18.91,,"{'bought_together': ['B0002D6QJO'], 'buy_after...",C2G
498194,BT008UKTMW,http://ecx.images-amazon.com/images/I/41TNAVmf...,Keyboard drawer.,"[[Electronics, Computers & Accessories, Cables...",Underdesk Keyboard Drawer,25.54,,"{'also_viewed': ['B0002LD0ZY', 'B0002LCZP0', '...",Fellowes


In [11]:
# filter out product meta dataset
meta_df = meta_df[meta_df['asin'].isin(reviews_df['asin'].unique())]
meta_df = meta_df.reset_index(drop=True)

In [12]:
meta_df

Unnamed: 0,asin,imUrl,description,categories,title,price,salesRank,related,brand
0,0972683275,http://ecx.images-amazon.com/images/I/41hYJ9Mw...,The VideoSecu TV mount is a mounting solution ...,"[[Electronics, Accessories & Supplies, Audio &...",VideoSecu 24&quot; Long Arm TV Wall Mount Low ...,29.99,{},"{'also_bought': ['B000X3KOD2', 'B0074FGR74', '...",VideoSecu
1,1400532620,http://ecx.images-amazon.com/images/I/519ca3cu...,Barnes & Noble Nook eReader - no 3GMeet nook. ...,"[[Electronics, eBook Readers & Accessories]]",Barnes &amp; Noble Nook eReader - no 3G,74.95,{'Electronics': 23071},"{'also_bought': ['B0035CLBT4', 'B004X18N24', '...",Barnes &amp; Noble
2,140053271X,http://ecx.images-amazon.com/images/I/51jat7CV...,Barnes & Noble Nook Simple Touch Wi-Fi ReaderI...,"[[Electronics, eBook Readers & Accessories, eB...",Barnes &amp; Noble Nook Simple Touch eBook Rea...,79.49,,"{'also_bought': ['B007UXNHNM', 'B007UXNHGY', '...",Barnes &amp; Noble
3,1400532736,http://ecx.images-amazon.com/images/I/413fSdlM...,The NOOK Simple Touch eReader allows you to re...,"[[Electronics, eBook Readers & Accessories, eB...",Nook Simple Touch eReader,62.99,{'Electronics': 4945},"{'also_bought': ['B0055ZDRI2', 'B007UXNHGY', '...",Barnes &amp; Noble
4,1400698987,http://ecx.images-amazon.com/images/I/51lnBzuR...,The Nook HD Tablet has an amazing combination ...,"[[Electronics, Computers & Accessories, Tablets]]",Nook HD 7&quot; 8GB Tablet,158.99,,"{'also_bought': ['B00AAKLIIS', 'B00E9IAQ1C', '...",Barnes &amp; Noble
...,...,...,...,...,...,...,...,...,...
44113,B00KSLCU72,http://ecx.images-amazon.com/images/I/41wlybKm...,FosPower FUSE Universal World Travel USB AC Ad...,"[[Electronics, Accessories & Supplies, Batteri...",FosPower FUSE World-Wide Universal AC Adapter ...,7.99,,"{'also_bought': ['B00JJOEV9Y', 'B00L8HA5L8', '...",
44114,B00KVNY2KA,http://ecx.images-amazon.com/images/I/417lJxa9...,The Satechi Spectrum Mouse Wired Optical Mouse...,"[[Electronics, Computers & Accessories, Cables...",Satechi Spectrum Mouse Wired Optical Mouse (Si...,24.99,,"{'also_viewed': ['B00LIBH4YK', 'B00CJKW4WQ', '...",
44115,B00KWHMR6G,http://ecx.images-amazon.com/images/I/41phatTV...,,"[[Electronics, Computers & Accessories, Networ...",NETGEAR AC3200 Nighthawk X6 Tri-Band WiFi Rout...,299.99,{},"{'also_bought': ['B008I64O78', 'B008I64EKA', '...",Netgear
44116,B00KYMCJF8,http://ecx.images-amazon.com/images/I/518oN4Vz...,"Omaker-Open your mind,we are the makerOmaker B...","[[Electronics, Portable Audio & Video, MP3 Pla...",Omaker M3-Outdoor Sport Rugged Square Design S...,29.99,{},"{'also_bought': ['B00J0CVVGQ', 'B00KQCJ0CG', '...",


In [13]:
# serialize the object `meta_df` to the open file object `meta.pkl`
with open('./data/meta.pkl', 'wb') as f:
    pickle.dump(meta_df, f, pickle.HIGHEST_PROTOCOL)

In [14]:
# filter out reviews dataset
reviews_df = reviews_df[reviews_df['asin'].isin(meta_df['asin'].unique())]
reviews_df = reviews_df.reset_index(drop=True)
reviews_df

Unnamed: 0,reviewerID,asin,reviewerName,helpful,reviewText,overall,summary,unixReviewTime,reviewTime
0,A000715434M800HLCENK9,B000UYYZ0M,DP,"[0, 0]",So the screen itself is OK. it is an actual sc...,1.0,Spring is not strong,1400457600,"05 19, 2014"
1,A000715434M800HLCENK9,B001EHAI6Y,DP,"[0, 0]",I had a complicated set up for my screen. I ne...,5.0,Exactly what i wanted,1400457600,"05 19, 2014"
2,A000715434M800HLCENK9,B003AFONFU,DP,"[1, 1]",The mount is good if you account for the play ...,3.0,beware of the play,1400457600,"05 19, 2014"
3,A000715434M800HLCENK9,B003ES5ZUU,DP,"[0, 0]",For some reason this product doesnt work that ...,2.0,Not great with Apple TV,1400457600,"05 19, 2014"
4,A000715434M800HLCENK9,B00HMZG3YS,DP,"[0, 0]",Great box Exactly what i needed. it isnt water...,5.0,Very good,1400457600,"05 19, 2014"
...,...,...,...,...,...,...,...,...,...
168576,A1DJR7B306SJIY,B0026RHPSU,"James ""Jim""","[8, 13]",In my opinion this is a very poor choice of wa...,1.0,Why buy an expensive dock?,1295308800,"01 18, 2011"
168577,A1DJR7B306SJIY,B0031QNP8O,"James ""Jim""","[1, 3]",I had a Garmin Trek prior to buying this. I wa...,2.0,Difficult to operate for the neophyte,1381622400,"10 13, 2013"
168578,A1DJR7B306SJIY,B003BEDQR6,"James ""Jim""","[0, 0]","For the price, this is a great system. The fac...",4.0,Good sound,1358640000,"01 20, 2013"
168579,A1DJR7B306SJIY,B0097BEF1S,"James ""Jim""","[7, 27]",The main gripes I have about this product is n...,1.0,Apple is deceitful,1368662400,"05 16, 2013"


In [15]:
# select useful features
reviews_df = reviews_df[['reviewerID', 'asin', 'unixReviewTime']]
reviews_df

Unnamed: 0,reviewerID,asin,unixReviewTime
0,A000715434M800HLCENK9,B000UYYZ0M,1400457600
1,A000715434M800HLCENK9,B001EHAI6Y,1400457600
2,A000715434M800HLCENK9,B003AFONFU,1400457600
3,A000715434M800HLCENK9,B003ES5ZUU,1400457600
4,A000715434M800HLCENK9,B00HMZG3YS,1400457600
...,...,...,...
168576,A1DJR7B306SJIY,B0026RHPSU,1295308800
168577,A1DJR7B306SJIY,B0031QNP8O,1381622400
168578,A1DJR7B306SJIY,B003BEDQR6,1358640000
168579,A1DJR7B306SJIY,B0097BEF1S,1368662400


In [16]:
# Electronics: 63001 rows × 2 columns
# select useful features
if 'category' in meta_df.columns:
    meta_df = meta_df[['asin', 'category']]
    meta_df['category'] = meta_df['category'].apply(lambda x: ['Magazine Subscriptions'] if len(x)==0 else x)
else:
    # Electronics
    meta_df = meta_df[['asin', 'categories']]

meta_df

Unnamed: 0,asin,categories
0,0972683275,"[[Electronics, Accessories & Supplies, Audio &..."
1,1400532620,"[[Electronics, eBook Readers & Accessories]]"
2,140053271X,"[[Electronics, eBook Readers & Accessories, eB..."
3,1400532736,"[[Electronics, eBook Readers & Accessories, eB..."
4,1400698987,"[[Electronics, Computers & Accessories, Tablets]]"
...,...,...
44113,B00KSLCU72,"[[Electronics, Accessories & Supplies, Batteri..."
44114,B00KVNY2KA,"[[Electronics, Computers & Accessories, Cables..."
44115,B00KWHMR6G,"[[Electronics, Computers & Accessories, Networ..."
44116,B00KYMCJF8,"[[Electronics, Portable Audio & Video, MP3 Pla..."


In [17]:
# Electronics: 63001 rows × 2 columns
# only one category
if 'category' in meta_df.columns:
    meta_df['category'] = meta_df['category'].map(lambda x: x[-1])
else:
    meta_df['categories'] = meta_df['categories'].map(lambda x: x[-1][-1])
meta_df

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  meta_df['categories'] = meta_df['categories'].map(lambda x: x[-1][-1])


Unnamed: 0,asin,categories
0,0972683275,TV Ceiling & Wall Mounts
1,1400532620,eBook Readers & Accessories
2,140053271X,eBook Readers
3,1400532736,eBook Readers
4,1400698987,Tablets
...,...,...
44113,B00KSLCU72,AC Adapters
44114,B00KVNY2KA,Mice
44115,B00KWHMR6G,Wireless Access Points
44116,B00KYMCJF8,MP3 Players


In [18]:
def build_map(df, col_name):
    key = sorted(df[col_name].unique().tolist())
    m = dict(zip(key, range(len(key))))
    df[col_name] = df[col_name].map(lambda x: m[x])
    return m, key

In [19]:
asin_map, asin_key = build_map(meta_df, 'asin')
if 'category' in meta_df.columns:
    cate_map, cate_key = build_map(meta_df, 'category')
else:
    cate_map, cate_key = build_map(meta_df, 'categories')
revi_map, revi_key = build_map(reviews_df, 'reviewerID')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[col_name] = df[col_name].map(lambda x: m[x])


In [20]:
user_count, item_count, cate_count, example_count =\
    len(revi_map), len(asin_map), len(cate_map), reviews_df.shape[0]
print('user_count: %d\titem_count: %d\tcate_count: %d\texample_count: %d' %
      (user_count, item_count, cate_count, example_count))
# Electronics: user_count: 192403	item_count: 63001	cate_count: 801	example_count: 1689188

user_count: 19240	item_count: 44118	cate_count: 766	example_count: 168581


In [21]:
# Electronics: 63001 rows × 2 columns
meta_df = meta_df.sort_values('asin')
meta_df = meta_df.reset_index(drop=True)
meta_df

Unnamed: 0,asin,categories
0,0,675
1,1,682
2,2,682
3,3,763
4,4,764
...,...,...
44113,44113,1
44114,44114,462
44115,44115,757
44116,44116,443


In [22]:
# Electronics: 1689188 rows × 3 columns
reviews_df['asin'] = reviews_df['asin'].map(lambda x: asin_map[x])
reviews_df = reviews_df.sort_values(['reviewerID', 'unixReviewTime'])
reviews_df = reviews_df.reset_index(drop=True)
reviews_df

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  reviews_df['asin'] = reviews_df['asin'].map(lambda x: asin_map[x])


Unnamed: 0,reviewerID,asin,unixReviewTime
0,0,9123,1400457600
1,0,12521,1400457600
2,0,19848,1400457600
3,0,20498,1400457600
4,0,43644,1400457600
...,...,...,...
168576,19239,37249,1368662400
168577,19239,10115,1374796800
168578,19239,12317,1374796800
168579,19239,18643,1381622400


In [23]:
cate_list = [meta_df['categories'][i] for i in range(len(asin_map))]
cate_list = np.array(cate_list, dtype=np.int32)

In [24]:
# serialize the data to the open file object `remap.pkl`
with open('./data/remap.pkl', 'wb') as f:
    pickle.dump(reviews_df, f, pickle.HIGHEST_PROTOCOL) # uid, iid
    pickle.dump(cate_list, f, pickle.HIGHEST_PROTOCOL) # cid of iid line
    pickle.dump((user_count, item_count, cate_count, example_count), f, pickle.HIGHEST_PROTOCOL)
    pickle.dump((asin_key, cate_key, revi_key), f, pickle.HIGHEST_PROTOCOL)

### Split of Data

In [25]:
random.seed(1234)

train_set = []
test_set = []

for reviewerID, hist in reviews_df.groupby('reviewerID'):
    pos_list = hist['asin'].tolist()
    neg_list = []
    for _ in range(len(pos_list)):
        neg = pos_list[0]
        while neg in pos_list + neg_list:
            neg = random.randint(0, item_count - 1)
        neg_list.append(neg)

    for i in range(1, len(pos_list) - 1):
        hist = pos_list[:i]
        train_set.append((reviewerID, hist, pos_list[i], 1))
        train_set.append((reviewerID, hist, neg_list[i], 0))
    
    label = (pos_list[-1], neg_list[-1])
    test_set.append((reviewerID, hist, label))

random.shuffle(train_set)
random.shuffle(test_set)

assert len(test_set) == user_count

In [26]:
train_set[0]

(9184, [9519, 8918, 14514, 21572, 22732], 140, 0)

In [27]:
len(train_set)

260202

In [28]:
test_set[0]

(18882, [12289, 25828, 27089, 27393], (31900, 20416))

In [29]:
len(test_set)

19240

In [30]:
# serialize the data to the open file object `dataset.pkl`
with open('./data/dataset.pkl', 'wb') as f:
    pickle.dump(train_set, f, pickle.HIGHEST_PROTOCOL)
    pickle.dump(test_set, f, pickle.HIGHEST_PROTOCOL)
    pickle.dump(cate_list, f, pickle.HIGHEST_PROTOCOL)
    pickle.dump((user_count, item_count, cate_count), f, pickle.HIGHEST_PROTOCOL)

### Data Loader

In [31]:
with open(args.dataset_dir+'dataset.pkl', 'rb') as f:
    train_set = pickle.load(f, encoding='latin1')
    test_set = pickle.load(f, encoding='latin1')
    cate_list = pickle.load(f, encoding='latin1')
    cate_list = tf.convert_to_tensor(cate_list, dtype=tf.int64)
    user_count, item_count, cate_count = pickle.load(f)

In [32]:
class DataLoader:
    def __init__(self, batch_size, data):
        self.batch_size = batch_size
        self.data = data
        self.epoch_size = len(self.data) // self.batch_size
        if self.epoch_size * self.batch_size < len(self.data):
            self.epoch_size += 1
        self.i = 0

    def __iter__(self):
        self.i = 0
        return self

    def __next__(self):
        if self.i == self.epoch_size:
            raise StopIteration
        ts = self.data[self.i * self.batch_size : min((self.i+1) * self.batch_size,
                                                      len(self.data))]
        self.i += 1

        u, i, y, sl = [], [], [], []
        for t in ts:
            u.append(t[0])
            i.append(t[2])
            y.append(t[3])
            sl.append(len(t[1]))
        max_sl = max(sl)

        hist_i = np.zeros([len(ts), max_sl], np.int64)

        k = 0
        for t in ts:
            for l in range(len(t[1])):
                hist_i[k][l] = t[1][l]
            k += 1

        return tf.convert_to_tensor(u), tf.convert_to_tensor(i), \
               tf.convert_to_tensor(y), tf.convert_to_tensor(hist_i), \
               sl

In [33]:
class DataLoaderTest:
    def __init__(self, batch_size, data):

        self.batch_size = batch_size
        self.data = data
        self.epoch_size = len(self.data) // self.batch_size
        if self.epoch_size * self.batch_size < len(self.data):
            self.epoch_size += 1
        self.i = 0

    def __iter__(self):
        self.i = 0
        return self

    def __next__(self):

        if self.i == self.epoch_size:
            raise StopIteration

        ts = self.data[self.i * self.batch_size : min((self.i+1) * self.batch_size,
                                                      len(self.data))]
        self.i += 1

        u, i, j, sl = [], [], [], []
        for t in ts:
            u.append(t[0])
            i.append(t[2][0])
            j.append(t[2][1])
            sl.append(len(t[1]))
        max_sl = max(sl)

        hist_i = np.zeros([len(ts), max_sl], np.int64)

        k = 0
        for t in ts:
            for l in range(len(t[1])):
                hist_i[k][l] = t[1][l]
            k += 1

        return tf.convert_to_tensor(u), tf.convert_to_tensor(i), \
               tf.convert_to_tensor(j), tf.convert_to_tensor(hist_i), \
               sl

    def __len__(self):
        return len(self.data)

In [34]:
def get_dataloader(train_batch_size, test_batch_size):
    return DataLoader(train_batch_size, train_set), DataLoaderTest(test_batch_size, test_set), \
           user_count, item_count, cate_count, cate_list

## Model

### Attention

In [35]:
class attention(tf.keras.layers.Layer):
    def __init__(self, keys_dim, dim_layers):
        super(attention, self).__init__()
        self.keys_dim = keys_dim

        self.fc = tf.keras.Sequential()
        for dim_layer in dim_layers[:-1]:
            self.fc.add(nn.Dense(dim_layer, activation='sigmoid'))
        self.fc.add(nn.Dense(dim_layers[-1], activation=None))

    def call(self, queries, keys, keys_length):
        queries = tf.tile(tf.expand_dims(queries, 1), [1, tf.shape(keys)[1], 1])
        # outer product ?
        din_all = tf.concat([queries, keys, queries-keys, queries*keys], axis=-1)
        outputs = tf.transpose(self.fc(din_all), [0,2,1])

        # Mask
        key_masks = tf.sequence_mask(keys_length, max(keys_length), dtype=tf.bool)  # [B, T]
        key_masks = tf.expand_dims(key_masks, 1)
        paddings = tf.ones_like(outputs) * (-2 ** 32 + 1)
        outputs = tf.where(key_masks, outputs, paddings)  # [B, 1, T]

        # Scale
        outputs = outputs / (self.keys_dim ** 0.5)

        # Activation
        outputs = tf.keras.activations.softmax(outputs, -1)  # [B, 1, T]

        # Weighted sum
        outputs = tf.squeeze(tf.matmul(outputs, keys))  # [B, H]

        return outputs

### Dice

In [36]:
class dice(tf.keras.layers.Layer):
    def __init__(self, feat_dim):
        super(dice, self).__init__()
        self.feat_dim = feat_dim
        self.alphas= tf.Variable(tf.zeros([feat_dim]), dtype=tf.float32)
        self.beta  = tf.Variable(tf.zeros([feat_dim]), dtype=tf.float32)

        self.bn = tf.keras.layers.BatchNormalization(center=False, scale=False)

    def call(self, _x, axis=-1, epsilon=0.000000001):

        reduction_axes = list(range(len(_x.get_shape())))
        del reduction_axes[axis]
        broadcast_shape = [1] * len(_x.get_shape())
        broadcast_shape[axis] = self.feat_dim

        mean = tf.reduce_mean(_x, axis=reduction_axes)
        brodcast_mean = tf.reshape(mean, broadcast_shape)
        std = tf.reduce_mean(tf.square(_x - brodcast_mean) + epsilon, axis=reduction_axes)
        std = tf.sqrt(std)
        brodcast_std = tf.reshape(std, broadcast_shape)

        x_normed = self.bn(_x)
        x_p = tf.keras.activations.sigmoid(self.beta * x_normed)

        return self.alphas * (1.0 - x_p) * _x + x_p * _x

### PReLU

In [37]:
def parametric_relu(_x):
    with tf.variable_scope(name_or_scope='', reuse=tf.AUTO_REUSE):
        alphas = tf.get_variable('alpha', _x.get_shape()[-1],
                                 initializer=tf.constant_initializer(0.0),
                                 dtype=tf.float32)
    pos = tf.nn.relu(_x)
    neg = alphas * (_x - abs(_x)) * 0.5

    return pos + neg

### Base

In [38]:
class Base(tf.keras.Model):
    def __init__(self, user_count, item_count, cate_count, cate_list,
                       user_dim, item_dim, cate_dim,
                       dim_layers):
        super(Base, self).__init__()
        self.item_dim = item_dim
        self.cate_dim = cate_dim

        self.user_emb = nn.Embedding(user_count, user_dim)
        self.item_emb = nn.Embedding(item_count, item_dim)
        self.cate_emb = nn.Embedding(cate_count, cate_dim)
        self.item_bias= tf.Variable(tf.zeros([item_count]), trainable=True)
        self.cate_list = cate_list

        self.hist_bn = nn.BatchNormalization()
        self.hist_fc = nn.Dense(item_dim+cate_dim)

        self.fc = tf.keras.Sequential()
        self.fc.add(nn.BatchNormalization())
        for dim_layer in dim_layers[:-1]:
            self.fc.add(nn.Dense(dim_layer, activation='sigmoid'))
        self.fc.add(nn.Dense(dim_layers[-1], activation=None))

    def get_emb(self, user, item, history):
        user_emb = self.user_emb(user)

        item_emb = self.item_emb(item)
        item_cate_emb = self.cate_emb(tf.gather(self.cate_list, item))
        item_join_emb = tf.concat([item_emb, item_cate_emb], -1)
        item_bias= tf.gather(self.item_bias, item)

        hist_emb = self.item_emb(history)
        hist_cate_emb = self.cate_emb(tf.gather(self.cate_list, history))
        hist_join_emb = tf.concat([hist_emb, hist_cate_emb], -1)

        return user_emb, item_join_emb, item_bias, hist_join_emb

    def call(self, user, item, history, length):
        user_emb, item_join_emb, item_bias, hist_join_emb = self.get_emb(user, item, history)

        hist_mask = tf.sequence_mask(length, max(length), dtype=tf.float32)
        hist_mask = tf.tile(tf.expand_dims(hist_mask, -1), (1,1,self.item_dim+self.cate_dim))
        hist_join_emb = tf.math.multiply(hist_join_emb, hist_mask)
        hist_join_emb = tf.reduce_sum(hist_join_emb, 1)
        hist_join_emb = tf.math.divide(hist_join_emb, tf.cast(tf.tile(tf.expand_dims(length, -1),
                                                      [1,self.item_dim+self.cate_dim]), tf.float32))

        hist_hid_emb = self.hist_fc(self.hist_bn(hist_join_emb))
        join_emb = tf.concat([user_emb, item_join_emb, hist_hid_emb], -1)

        output = tf.squeeze(self.fc(join_emb)) + item_bias
        logit = tf.keras.activations.sigmoid(output)

        return output, logit

### DIN

In [39]:
class DIN(Base):
    def __init__(self, user_count, item_count, cate_count, cate_list,
                       user_dim, item_dim, cate_dim,
                       dim_layers):
        super(DIN, self).__init__(user_count, item_count, cate_count, cate_list,
                                  user_dim, item_dim, cate_dim,
                                  dim_layers)

        self.hist_at = attention(item_dim+cate_dim, dim_layers)

        self.fc = tf.keras.Sequential()
        self.fc.add(nn.BatchNormalization())
        for dim_layer in dim_layers[:-1]:
            self.fc.add(nn.Dense(dim_layer, activation=None))
            self.fc.add(dice(dim_layer))
        self.fc.add(nn.Dense(dim_layers[-1], activation=None))

    def call(self, user, item, history, length):
        user_emb, item_join_emb, item_bias, hist_join_emb = self.get_emb(user, item, history)

        hist_attn_emb = self.hist_at(item_join_emb, hist_join_emb, length)
        hist_attn_emb = self.hist_fc(self.hist_bn(hist_attn_emb))

        join_emb = tf.concat([user_emb, item_join_emb, hist_attn_emb], -1)

        output = tf.squeeze(self.fc(join_emb)) + item_bias
        logit = tf.keras.activations.sigmoid(output)

        return output, logit

### Metrics

In [40]:
def calc_auc(raw_arr):
    """Summary
    Args:
        raw_arr (TYPE): Description
    Returns:
        TYPE: Description
    """
    # sort by pred value, from small to big
    arr = sorted(raw_arr, key=lambda d:d[2])

    auc = 0.0
    fp1, tp1, fp2, tp2 = 0.0, 0.0, 0.0, 0.0
    for record in arr:
        fp2 += record[0] # noclick
        tp2 += record[1] # click
        auc += (fp2 - fp1) * (tp2 + tp1)
        fp1, tp1 = fp2, tp2

    # if all nonclick or click, disgard
    threshold = len(arr) - 1e-3
    if tp2 > threshold or fp2 > threshold:
        return -0.5

    if tp2 * fp2 > 0.0:  # normal auc
        return (1.0 - auc / (2.0 * tp2 * fp2))
    else:
        return None

In [41]:
def auc_arr(score_p, score_n):
    score_arr = []
    for s in score_p.numpy():
        score_arr.append([0, 1, s])
    for s in score_n.numpy():
        score_arr.append([1, 0, s])
    return score_arr

In [42]:
def eval(model, test_data):
    auc_sum = 0.0
    score_arr = []
    for u, i, j, hist_i, sl in test_data:
        p_out, p_logit = model(u,i,hist_i,sl)
        n_out, n_logit = model(u,j,hist_i,sl)
        mf_auc = tf.reduce_sum(tf.cast(p_out>n_out, dtype=tf.float32))

        score_arr += auc_arr(p_logit, n_logit)
        auc_sum += mf_auc
    test_gauc = auc_sum / len(test_data)
    auc = calc_auc(score_arr)
    return test_gauc, auc

## Train

We have a training data set of 260202 examples, a batch size of 32, and we have specified we want the algorithm to run for 10 epochs.

Therefore, in each epoch, we have $260202/32 = 8132$ batches. Each batch gets passed through the algorithm, therefore we have 8132 iterations per epoch. Since we have specified 10 epochs, we have a total of $8132 \times 10 = 81320$ iterations for the whole training.

In [43]:
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '2'
'''
0 = all messages are logged (default behavior)
1 = INFO messages are not printed
2 = INFO and WARNING messages are not printed
3 = INFO, WARNING, and ERROR messages are not printed
'''
# Environment
print(f"Version of tensorflow: {tf.__version__}")
print("GPU Available: ", tf.config.list_physical_devices('GPU'))

Version of tensorflow: 2.7.0-rc0
GPU Available:  []


### Data Load

In [44]:
# Data Load
train_data, test_data, \
user_count, item_count, cate_count, \
cate_list = get_dataloader(args.train_batch_size, args.test_batch_size)

In [45]:
# Loss, Optim
optimizer = tf.keras.optimizers.SGD(learning_rate=args.lr, momentum=0.0)
loss_metric = tf.keras.metrics.Sum()
auc_metric = tf.keras.metrics.AUC()

In [46]:
# Model
model = Base(user_count, item_count, cate_count, cate_list,
             args.user_dim, args.item_dim, args.cate_dim, args.dim_layers)

# Board
train_summary_writer = tf.summary.create_file_writer(args.log_path)

In [47]:
#@tf.function
def train_one_step(u,i,y,hist_i,sl):
    with tf.GradientTape() as tape:
        output,_ = model(u,i,hist_i,sl)
        loss = tf.reduce_mean(tf.nn.sigmoid_cross_entropy_with_logits(logits=output,
                                                                      labels=tf.cast(y, dtype=tf.float32)))
    gradient = tape.gradient(loss, model.trainable_variables)
    clip_gradient, _ = tf.clip_by_global_norm(gradient, 5.0)
    optimizer.apply_gradients(zip(clip_gradient, model.trainable_variables))

    loss_metric(loss)

In [48]:
# Train
def train(optimizer):
    best_loss= 0.
    best_auc = 0.
    start_time = time.time()
    # train 10 epochs
    for epoch in range(args.epochs):
        for step, (u, i, y, hist_i, sl) in enumerate(train_data, start=1):
            train_one_step(u, i, y, hist_i, sl)

            # print the result every 1000 steps
            if step % args.print_step == 0:
                test_gauc, auc = eval(model, test_data)
                print('Epoch %d Global_step %d\tTrain_loss: %.4f\tEval_GAUC: %.4f\tEval_AUC: %.4f' %
                      (epoch, step, loss_metric.result() / args.print_step, test_gauc, auc))

                # save the best model for now
                if best_auc < test_gauc:
                    best_loss= loss_metric.result() / args.print_step
                    best_auc = test_gauc
                    model.save_weights(args.model_path+'cp-%d.ckpt'%epoch)
                loss_metric.reset_states()

        with train_summary_writer.as_default():
            tf.summary.scalar('loss', best_loss, step=epoch)
            tf.summary.scalar('test_gauc', best_auc, step=epoch)

        loss_metric.reset_states()
        optimizer = tf.keras.optimizers.SGD(learning_rate=0.01, momentum=0.0)

        print('Epoch %d DONE\tCost time: %.2f' % (epoch, time.time()-start_time))
    print('Best test_gauc: ', best_auc)

In [49]:
# Base
train(optimizer)

Epoch 0 Global_step 1000	Train_loss: 0.6952	Eval_GAUC: 0.6283	Eval_AUC: 0.6157
Epoch 0 Global_step 2000	Train_loss: 0.6941	Eval_GAUC: 0.6603	Eval_AUC: 0.6497
Epoch 0 Global_step 3000	Train_loss: 0.6934	Eval_GAUC: 0.6723	Eval_AUC: 0.6661
Epoch 0 Global_step 4000	Train_loss: 0.6927	Eval_GAUC: 0.6781	Eval_AUC: 0.6762
Epoch 0 Global_step 5000	Train_loss: 0.6914	Eval_GAUC: 0.6843	Eval_AUC: 0.6827
Epoch 0 Global_step 6000	Train_loss: 0.6908	Eval_GAUC: 0.6873	Eval_AUC: 0.6856
Epoch 0 Global_step 7000	Train_loss: 0.6900	Eval_GAUC: 0.6876	Eval_AUC: 0.6877
Epoch 0 Global_step 8000	Train_loss: 0.6891	Eval_GAUC: 0.6904	Eval_AUC: 0.6898
Epoch 0 DONE	Cost time: 237.80
Epoch 1 Global_step 1000	Train_loss: 0.6876	Eval_GAUC: 0.6899	Eval_AUC: 0.6897
Epoch 1 Global_step 2000	Train_loss: 0.6864	Eval_GAUC: 0.6913	Eval_AUC: 0.6899
Epoch 1 Global_step 3000	Train_loss: 0.6861	Eval_GAUC: 0.6911	Eval_AUC: 0.6900
Epoch 1 Global_step 4000	Train_loss: 0.6860	Eval_GAUC: 0.6904	Eval_AUC: 0.6902
Epoch 1 Global_step 5

In [50]:
# DIN
model = DIN(user_count, item_count, cate_count, cate_list,
            args.user_dim, args.item_dim, args.cate_dim, args.dim_layers)
train(optimizer)

Epoch 0 Global_step 1000	Train_loss: 0.6931	Eval_GAUC: 0.5871	Eval_AUC: 0.5693
Epoch 0 Global_step 2000	Train_loss: 0.6917	Eval_GAUC: 0.6125	Eval_AUC: 0.6007
Epoch 0 Global_step 3000	Train_loss: 0.6904	Eval_GAUC: 0.6159	Eval_AUC: 0.6065
Epoch 0 Global_step 4000	Train_loss: 0.6882	Eval_GAUC: 0.6075	Eval_AUC: 0.6035
Epoch 0 Global_step 5000	Train_loss: 0.6825	Eval_GAUC: 0.6053	Eval_AUC: 0.6022
Epoch 0 Global_step 6000	Train_loss: 0.6800	Eval_GAUC: 0.6125	Eval_AUC: 0.6107
Epoch 0 Global_step 7000	Train_loss: 0.6787	Eval_GAUC: 0.6173	Eval_AUC: 0.6128
Epoch 0 Global_step 8000	Train_loss: 0.6755	Eval_GAUC: 0.6198	Eval_AUC: 0.6160
Epoch 0 DONE	Cost time: 504.41
Epoch 1 Global_step 1000	Train_loss: 0.6742	Eval_GAUC: 0.6246	Eval_AUC: 0.6228
Epoch 1 Global_step 2000	Train_loss: 0.6690	Eval_GAUC: 0.6267	Eval_AUC: 0.6251
Epoch 1 Global_step 3000	Train_loss: 0.6695	Eval_GAUC: 0.6279	Eval_AUC: 0.6270
Epoch 1 Global_step 4000	Train_loss: 0.6663	Eval_GAUC: 0.6418	Eval_AUC: 0.6394
Epoch 1 Global_step 5