# Behavior Sequence Transformer

This notebook uses a Transformer to capture the sequential signals underlying users' behavior sequences.  

### References
- https://arxiv.org/pdf/1905.06874.pdf  
- https://www.kaggle.com/laowingkin/netflix-movie-recommendation

In [1]:
import os
import math
import numpy as np
import pandas as pd
from tqdm import tqdm
import tensorflow as tf
from tensorflow.keras import layers
from tensorflow.keras.layers.experimental.preprocessing import StringLookup

from sklearn.model_selection import train_test_split
import seaborn as sns
import plotly.express as px
import matplotlib.pyplot as plt

## Load Data

In [2]:
# df1 = pd.read_csv('../input/netflix-prize-data/combined_data_1.txt', header=None, names=['cust_id', 'rating', 'timestamp'])
# df1['rating'] = df1['rating'].astype(float)
# df1.head()

# df2 = pd.read_csv('../input/netflix-prize-data/combined_data_2.txt', 
#                   header=None, names=['cust_id', 'rating', 'timestamp'])
# df2['rating'] = df2['rating'].astype(float)
# df1 = pd.concat([df1, df2])
# print(df1.shape)
# del df2

# df3 = pd.read_csv('../input/netflix-prize-data/combined_data_3.txt', 
#                   header=None, names=['cust_id', 'rating', 'timestamp'])
# df3['rating'] = df3['rating'].astype(float)
# df1 = pd.concat([df1, df3])
# print(df1.shape)
# del df3

# df4 = pd.read_csv('../input/netflix-prize-data/combined_data_4.txt', 
#                   header=None, names=['cust_id', 'rating', 'timestamp'])
# df4['rating'] = df4['rating'].astype(float)
# df1 = pd.concat([df1, df4])
# print(df1.shape)
# del df4

In [3]:
# df1.to_parquet('/kaggle/working/ratings.parquet')
# df1 = pd.read_parquet('../input/netflix-ratings/netflix_ratings.parquet')
# df1.shape

In [4]:
# df_nan = df1.loc[df1['rating'].isna()].reset_index().drop(['rating', 'timestamp'], axis=1)
# df_nan['next'] = df_nan['index'].shift(-1).fillna(df1.index[-1]+1).astype(int)
# df_nan['movie_id'] = df_nan['cust_id'].str[:-1].astype(int)
# df_nan.drop('cust_id', axis=1, inplace=True)

# movie_ids = np.full((1, df1.shape[0]), 0)
# for i, j, k in tqdm(df_nan[['index', 'next', 'movie_id']].values):
#     movie_ids[0, i+1:j] = k
# df1['movie_id'] = movie_ids[0]

# df2 = df1.loc[~df1['rating'].isna()]
# del df1, df_nan

In [5]:
# random_selection = np.random.rand(len(df2.index)) <= 0.5
# df3 = df2[random_selection]
# del df2
# df_movie_summary = df3.groupby('movie_id')[['rating']].count()
# df_user_summary = df3.groupby('cust_id')[['rating']].count()
# drop_movie_list = df_movie_summary.loc[df_movie_summary['rating'] < 100].index
# drop_cust_list = df_user_summary.loc[df_user_summary['rating'] < 100].index

# df3 = df3[~df3['movie_id'].isin(drop_movie_list)]
# df3 = df3[~df3['cust_id'].isin(drop_cust_list)]


# Load the pre-sampled ratings, then rewrite both id columns as prefixed
# string tokens ('cust_<id>' / 'movie_<id>').
df3 = pd.read_parquet('../input/netflixratings/netflix_ratings_sampled01.parquet')
for id_column, prefix in (('cust_id', 'cust'), ('movie_id', 'movie')):
    df3[id_column] = [f'{prefix}_{raw_id}' for raw_id in df3[id_column]]
# print('here')

In [6]:
# Group a timestamp-sorted copy of the ratings by customer. This must happen
# before the dtype cast below so the statement order of the cell is preserved.
ratings_group = df3.sort_values('timestamp').groupby('cust_id')

# Cast the id columns to pandas' string dtype and collect the distinct values
# of each one; these lists are the vocabularies used by the lookup layers.
df3[['cust_id', 'movie_id']] = df3[['cust_id', 'movie_id']].astype('string')
CATEGORICAL_FEATURES_WITH_VOCABULARY = {
    feature_name: list(df3[feature_name].unique())
    for feature_name in ('cust_id', 'movie_id')
}

# The raw frame is no longer referenced directly after this point.
del df3

In [7]:
# One row per customer: the group key plus that customer's movie ids, ratings
# and timestamps, each collected into a Python list.
ratings_data = pd.DataFrame(
    data=dict(
        cust_id=list(ratings_group.groups.keys()),
        movie_ids=list(ratings_group['movie_id'].apply(list)),
        ratings=list(ratings_group['rating'].apply(list)),
        timestamps=list(ratings_group['timestamp'].apply(list)),
    )
)

# Sanity check: count of missing values per column.
ratings_data.isna().sum()

cust_id       0
movie_ids     0
ratings       0
timestamps    0
dtype: int64

In [8]:
sequence_length = 8  # window size passed to create_sequences (items per sequence)
step_size = 1  # stride between the start indices of consecutive windows

def create_sequences(values, window_size, step_size):
    """Split ``values`` into fixed-length sliding windows.

    Args:
        values: Sliceable sequence (e.g. a list) to cut into windows.
        window_size: Number of items in every window; must be positive.
        step_size: Distance between the start indices of consecutive
            windows; must be positive.

    Returns:
        A list of slices of ``values``, each exactly ``window_size`` long, in
        order of increasing start index. The list is empty when ``values`` is
        shorter than ``window_size``.

    Raises:
        ValueError: If ``window_size`` or ``step_size`` is not positive
            (such values previously made the loop run forever).
    """
    if window_size <= 0 or step_size <= 0:
        raise ValueError(
            'window_size and step_size must be positive, '
            f'got window_size={window_size}, step_size={step_size}'
        )

    # Last start index that still leaves room for a complete window. Computing
    # it once avoids re-slicing the tail of `values` on every iteration.
    last_start = len(values) - window_size
    return [
        values[start:start + window_size]
        for start in range(0, last_start + 1, step_size)
    ]

# Replace each customer's full history list with its list of sliding windows.
for history_column in ('movie_ids', 'ratings'):
    ratings_data[history_column] = ratings_data[history_column].apply(
        lambda history: create_sequences(history, sequence_length, step_size)
    )

# The timestamps column is not windowed; it is removed from the frame in place.
ratings_data.drop(columns='timestamps', inplace=True)

In [9]:
# Explode the per-customer lists of windows so that every row holds a single
# window; ignore_index=True gives both results a fresh 0..n-1 index.
ratings_data_movies = ratings_data.loc[:, ['cust_id', 'movie_ids']].explode(
    'movie_ids', ignore_index=True
)
ratings_data_rating = ratings_data.loc[:, ['ratings']].explode(
    'ratings', ignore_index=True
)

In [10]:
# The per-customer frame is released before the exploded frames are combined.
del ratings_data

# Put the movie windows and rating windows side by side, then drop rows with
# missing values (e.g. customers whose history produced no window at all).
ratings_data_transformed = pd.concat(
    [ratings_data_movies, ratings_data_rating], axis='columns'
).dropna()
# del ratings_data_movies, ratings_data_rating

In [11]:
# Serialise every window as one comma-separated string so a row fits in a
# single CSV field; ratings are converted to str before joining.
ratings_data_transformed['movie_ids'] = ratings_data_transformed['movie_ids'].apply(
    lambda window: ','.join(window)
)
ratings_data_transformed['ratings'] = ratings_data_transformed['ratings'].apply(
    lambda window: ','.join(str(rating) for rating in window)
)
ratings_data_transformed = ratings_data_transformed.rename(
    columns={"movie_ids": "sequence_movie_ids", "ratings": "sequence_ratings"},
)
print('here3')

# Random ~80/20 row split. A fixed seed makes the split, and therefore the
# reported test metric, reproducible across Restart & Run All.
# NOTE(review): with step_size=1 neighbouring windows of one customer overlap,
# so a row-level split can place near-duplicate windows in both sets; confirm
# this is acceptable for the evaluation.
SPLIT_SEED = 42
split_rng = np.random.default_rng(SPLIT_SEED)
random_selection = split_rng.random(len(ratings_data_transformed.index)) <= 0.8
train_data = ratings_data_transformed[random_selection]
test_data = ratings_data_transformed[~random_selection]

# Header-less, '|'-delimited files; the column order is cust_id,
# sequence_movie_ids, sequence_ratings.
# train_data.to_parquet('/kaggle/working/train_data.parquet')
# test_data.to_parquet('/kaggle/working/test_data.parquet')
train_data.to_csv("/kaggle/working/train_data.csv", index=False, sep="|", header=False)
test_data.to_csv("/kaggle/working/test_data.csv", index=False, sep="|", header=False)

here3


In [12]:
# CSV_HEADER = list(ratings_data_transformed.columns)
# Column names supplied to the CSV reader below, because it is called with
# header=False.
CSV_HEADER = ['cust_id', 'sequence_movie_ids', 'sequence_ratings']

def get_dataset_from_csv(csv_file_path, shuffle=False, batch_size=128):
    """Build a batched ``tf.data`` dataset of (features, target) from a CSV file.

    The file is read without a header row, using ``CSV_HEADER`` as column names
    and '|' as field delimiter, for a single epoch.

    Args:
        csv_file_path: Path of the CSV file to read.
        shuffle: Forwarded to ``make_csv_dataset``.
        batch_size: Number of rows per batch.

    Returns:
        A dataset whose elements are ``(features, target)``: ``features`` holds
        all but the last entry of each movie-id / rating sequence plus the last
        movie id under 'target_movie_id'; ``target`` is the last rating.
    """

    def _split_history_and_target(batch):
        # Comma-separated strings -> dense tensors, one row per example.
        movie_tokens = tf.strings.split(batch['sequence_movie_ids'], ',').to_tensor()
        rating_values = tf.strings.to_number(
            tf.strings.split(batch['sequence_ratings'], ','), tf.dtypes.float32
        ).to_tensor()

        # The final position is the prediction target; the rest is the history.
        batch['target_movie_id'] = movie_tokens[:, -1]
        batch['sequence_movie_ids'] = movie_tokens[:, :-1]
        batch['sequence_ratings'] = rating_values[:, :-1]
        return batch, rating_values[:, -1]

    raw_batches = tf.data.experimental.make_csv_dataset(
        csv_file_path,
        batch_size=batch_size,
        column_names=CSV_HEADER,
        num_epochs=1,
        header=False,
        field_delim='|',
        shuffle=shuffle,
    )
    return raw_batches.map(_split_history_and_target)

# Model hyperparameters.
# The training dataset is intentionally not built here: it is created in the
# training cell right before model.fit, so building it in this cell as well
# only scanned the CSV file a second time.
sequence_length = 8  # items per window, including the target; must match the window size used to build the CSVs
include_user_id = False  # whether cust_id is embedded as an extra feature
hidden_units = [256, 128]  # widths of the dense layers after the transformer block
dropout_rate = 0.1  # dropout rate used in the attention block and the dense head
num_heads = 3  # number of attention heads

def create_model_inputs():
    """Create the named Keras ``Input`` placeholders consumed by the model.

    Returns:
        Dict keyed by feature name. 'cust_id' and 'target_movie_id' are string
        inputs of shape (1,); 'sequence_movie_ids' (string) and
        'sequence_ratings' (float32) have shape (sequence_length - 1,).
    """
    history_shape = (sequence_length - 1,)

    def _named_input(name, shape, dtype):
        # The layer name is the same as the dictionary key.
        return layers.Input(name=name, shape=shape, dtype=dtype)

    return {
        'cust_id': _named_input('cust_id', (1,), tf.string),
        'sequence_movie_ids': _named_input('sequence_movie_ids', history_shape, tf.string),
        'target_movie_id': _named_input('target_movie_id', (1,), tf.string),
        'sequence_ratings': _named_input('sequence_ratings', history_shape, tf.float32),
    }

# Instantiate the input placeholders once; the cells below pass this dict to
# encode_input_features and to tf.keras.Model.
inputs = create_model_inputs()

2022-01-14 07:52:49.859979: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:937] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2022-01-14 07:52:49.861398: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:937] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2022-01-14 07:52:49.862169: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:937] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2022-01-14 07:52:49.864371: I tensorflow/core/platform/cpu_feature_guard.cc:142] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 AVX512F FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compil

In [13]:
def encode_input_features(inputs, include_user_id=True):
    """Embed the raw model inputs into transformer and non-sequence features.

    The movie history is embedded, offset by a learned position embedding and
    scaled element-wise by the rating at each position. The embedded target
    movie is then appended as the last step of the sequence.

    Relies on the module-level ``CATEGORICAL_FEATURES_WITH_VOCABULARY`` and
    ``sequence_length``.

    Args:
        inputs: Mapping of feature name to Keras input tensor. Reads
            'sequence_movie_ids', 'target_movie_id' and 'sequence_ratings',
            plus 'cust_id' when ``include_user_id`` is true.
        include_user_id: Whether to embed 'cust_id' as a non-sequence feature.

    Returns:
        Tuple ``(encoded_transformer_features, encoded_other_features)``.
        The first is the encoded history concatenated with the encoded target
        movie along axis 1. The second is the embedded non-sequence features
        (concatenated along the last axis when there are several), or ``None``
        when there are none.
    """
    encoded_other_features = []
    other_feature_names = []

    if include_user_id:
        other_feature_names.append('cust_id')

    for feature_name in other_feature_names:
        # String input values -> integer indices (a role similar to a
        # tokenizer in NLP).
        vocabulary = CATEGORICAL_FEATURES_WITH_VOCABULARY[feature_name]
        idx = StringLookup(vocabulary=vocabulary, mask_token=None,
                           num_oov_indices=0)(inputs[feature_name])

        # Embedding width is the square root of the vocabulary size.
        embedding_dims = int(math.sqrt(len(vocabulary)))
        embedding_encoder = layers.Embedding(
            input_dim=len(vocabulary),
            output_dim=embedding_dims,
            name=f'{feature_name}_embedding'
        )

        # Convert the index values to their embedding representation.
        encoded_other_features.append(embedding_encoder(idx))

    # Collapse the list into one tensor. Several features are concatenated
    # instead of being silently discarded.
    if len(encoded_other_features) > 1:
        encoded_other_features = layers.concatenate(encoded_other_features)
    elif len(encoded_other_features) == 1:
        encoded_other_features = encoded_other_features[0]
    else:
        encoded_other_features = None

    ###########################################################################
    # movie_id
    ###########################################################################
    movie_vocabulary = CATEGORICAL_FEATURES_WITH_VOCABULARY['movie_id']
    movie_embedding_dims = int(math.sqrt(len(movie_vocabulary)))
    movie_index_lookup = StringLookup(
        vocabulary=movie_vocabulary,
        mask_token=None,
        num_oov_indices=0,
        name='movie_index_lookup'
    )

    movie_embedding_encoder = layers.Embedding(
        input_dim=len(movie_vocabulary),
        output_dim=movie_embedding_dims,
        name='movie_embedding'
    )

    def encode_movie(movie_id):
        """Map movie-id strings to embeddings via the shared lookup/embedding."""
        movie_idx = movie_index_lookup(movie_id)
        return movie_embedding_encoder(movie_idx)

    # The target movie and the history share the same lookup and embedding.
    encoded_target_movie = encode_movie(inputs['target_movie_id'])
    encoded_sequence_movies = encode_movie(inputs['sequence_movie_ids'])

    ###########################################################################
    # position embedding
    ###########################################################################
    position_embedding_encoder = layers.Embedding(
        input_dim=sequence_length,
        output_dim=movie_embedding_dims,
        name='poisition_embedding'  # layer name kept as-is (sic)
    )
    # One position index per history step (the target has no position here).
    positions = tf.range(start=0, limit=sequence_length - 1, delta=1)
    encoded_positions = position_embedding_encoder(positions)

    ###########################################################################
    # ratings
    ###########################################################################
    # Add a trailing axis so the ratings broadcast over the embedding axis:
    # (batch, steps) -> (batch, steps, 1).
    sequence_ratings = tf.expand_dims(inputs['sequence_ratings'], -1)

    ###########################################################################
    # element-wise product of (movie embedding + position embedding) and rating
    ###########################################################################
    # With steps = sequence_length - 1 and dim = movie_embedding_dims:
    #   encoded_sequence_movies: (batch, steps, dim)
    #   encoded_positions:       (steps, dim)
    #   sequence_ratings:        (batch, steps, 1)
    #   result:                  (batch, steps, dim)
    encoded_sequence_movies_with_position_and_rating = layers.Multiply()(
        [(encoded_sequence_movies + encoded_positions), sequence_ratings]
    )

    # Append the target movie as the final step. Concatenating the whole
    # history tensor at once gives the same result as unstacking it per step,
    # re-expanding each slice and concatenating the pieces.
    encoded_transformer_features = layers.concatenate(
        [encoded_sequence_movies_with_position_and_rating, encoded_target_movie],
        axis=1
    )

    return encoded_transformer_features, encoded_other_features

In [14]:
# Embed the inputs: the sequence features feed the attention block, the
# optional non-sequence features are joined back in later.
transformer_features, other_features = encode_input_features(
    inputs, include_user_id
)

# Multi-headed self-attention: the same tensor is passed as both arguments
# (query, value).
self_attention = layers.MultiHeadAttention(
    num_heads=num_heads,
    key_dim=transformer_features.shape[2],
    dropout=dropout_rate,
)
attention_output = self_attention(transformer_features, transformer_features)

# Transformer block, part 1: dropout -> residual add -> layer norm.
attention_output = layers.Dropout(dropout_rate)(attention_output)
attention_residual = layers.Add()([transformer_features, attention_output])
x1 = layers.LayerNormalization()(attention_residual)

# Part 2: feed-forward -> dropout -> residual add -> layer norm.
x2 = layers.LeakyReLU()(x1)
# Why shape[-1]? The Dense layer keeps the incoming feature size so that the
# residual Add below receives two tensors of equal shape.
feed_forward_units = x2.shape[-1]
x2 = layers.Dense(units=feed_forward_units)(x2)
x2 = layers.Dropout(dropout_rate)(x2)
feed_forward_residual = layers.Add()([x1, x2])
transformer_features = layers.LayerNormalization()(feed_forward_residual)

# Flatten the sequence and feature axes into one vector per example.
features = layers.Flatten()(transformer_features)

In [15]:
# Join the flattened non-sequence features (if any) to the transformer output.
if other_features is not None:
    flat_other_features = layers.Reshape([other_features.shape[-1]])(other_features)
    features = layers.concatenate([features, flat_other_features])

# Dense head: Dense -> BatchNorm -> LeakyReLU -> Dropout for each hidden width.
for num_units in hidden_units:
    head_layers = (
        layers.Dense(num_units),
        layers.BatchNormalization(),
        layers.LeakyReLU(),
        layers.Dropout(dropout_rate),
    )
    for head_layer in head_layers:
        features = head_layer(features)

# A single linear unit produces the predicted rating.
outputs = layers.Dense(units=1)(features)
model = tf.keras.Model(inputs=inputs, outputs=outputs)

# Regression setup: optimise MSE, report MAE.
optimizer = tf.keras.optimizers.Adagrad(learning_rate=0.01)
loss_fn = tf.keras.losses.MeanSquaredError()
mae_metric = tf.keras.metrics.MeanAbsoluteError()
model.compile(optimizer=optimizer, loss=loss_fn, metrics=[mae_metric])

In [16]:
# Read the splits from the exact location the split cell wrote them to
# (/kaggle/working/), rather than relying on the current working directory.
# NOTE(review): batch_size=265 may be a typo for 256 - confirm before changing.
train_dataset = get_dataset_from_csv(
    '/kaggle/working/train_data.csv', shuffle=True, batch_size=265
)
model.fit(train_dataset, epochs=16)

# Evaluate on the held-out rows; evaluate returns [loss, mean_absolute_error]
# in the order configured in model.compile.
test_dataset = get_dataset_from_csv('/kaggle/working/test_data.csv', batch_size=265)
_, mae = model.evaluate(test_dataset, verbose=0)
print(f'Test MAE: {round(mae, 3)}')

2022-01-14 07:52:54.607315: I tensorflow/compiler/mlir/mlir_graph_optimization_pass.cc:185] None of the MLIR Optimization Passes are enabled (registered 2)


Epoch 1/16
      1/Unknown - 3s 3s/step - loss: 14.9279 - mean_absolute_error: 3.5811

2022-01-14 07:52:57.995162: I tensorflow/stream_executor/cuda/cuda_dnn.cc:369] Loaded cuDNN version 8005


Epoch 2/16
Epoch 3/16
Epoch 4/16
Epoch 5/16
Epoch 6/16
Epoch 7/16
Epoch 8/16
Epoch 9/16
Epoch 10/16
Epoch 11/16
Epoch 12/16
Epoch 13/16
Epoch 14/16
Epoch 15/16
Epoch 16/16
Test MAE: 0.715


In [17]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

# import numpy as np # linear algebra
# import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os

# Print the full path of every file found under the Kaggle input directory.
for folder, _subfolders, file_names in os.walk('/kaggle/input'):
    for full_path in (os.path.join(folder, file_name) for file_name in file_names):
        print(full_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/netflix-prize-data/combined_data_3.txt
/kaggle/input/netflix-prize-data/movie_titles.csv
/kaggle/input/netflix-prize-data/combined_data_4.txt
/kaggle/input/netflix-prize-data/combined_data_1.txt
/kaggle/input/netflix-prize-data/README
/kaggle/input/netflix-prize-data/probe.txt
/kaggle/input/netflix-prize-data/combined_data_2.txt
/kaggle/input/netflix-prize-data/qualifying.txt
/kaggle/input/amazon-ratings/ratings_Beauty.csv
/kaggle/input/netflix-ratings/netflix_ratings.parquet
/kaggle/input/netflixratings/netflix_ratings_sampled01.parquet


In [18]:
# Load the Amazon Beauty ratings CSV and preview its first rows.
df = pd.read_csv('/kaggle/input/amazon-ratings/ratings_Beauty.csv')
df.head()

Unnamed: 0,UserId,ProductId,Rating,Timestamp
0,A39HTATAQ9V7YF,205616461,5.0,1369699200
1,A3JM6GV9MNOF9X,558925278,3.0,1355443200
2,A1Z513UWSAAO0F,558925278,5.0,1404691200
3,A1WMRR494NWEWV,733001998,4.0,1382572800
4,A3IAAVS479H7M7,737104473,1.0,1274227200


In [19]:
# Count products per (user, timestamp) pair after removing duplicate rows,
# then show the pairs with at least two products.
tmp = (
    df.drop_duplicates()
    .groupby(['UserId', 'Timestamp'])[['ProductId']]
    .count()
)
tmp[tmp['ProductId'].ge(2)]

Unnamed: 0_level_0,Unnamed: 1_level_0,ProductId
UserId,Timestamp,Unnamed: 2_level_1
A00125322X21CGQBJ30S9,1365379200,2
A00262022JQPXX5SXEVJR,1397520000,4
A00370223FX3K9TUF1QCL,1384300800,2
A0038640S18JE5Y497U6,1376524800,4
A00414041RD0BXM6WK0GX,1405296000,7
...,...,...
AZZVT7PFUPM8D,1402444800,3
AZZWJ3LICUEKJ,1361577600,4
AZZYW4YOE1B6E,1386633600,2
AZZZLM1E5JJ8C,1376697600,4
