### Load Packages

In [107]:
%matplotlib inline
import os
import random

import pandas as pd
import numpy as np
from tqdm import tqdm
import matplotlib.pyplot as plt
import seaborn as sns

import tensorflow as tf
from tensorflow.keras.utils import get_file
from tensorflow.keras import backend as K
from tqdm import tqdm
# Show NumPy floats with 5 digits of precision when arrays are printed.
np.set_printoptions(precision=5)

### Download Data

In [108]:
ROOT_URL = "https://craftsangjae.s3.ap-northeast-2.amazonaws.com/data/"

# Fetch the data: each get_file call yields a local path for the remote CSV,
# stored under the given "100k_*" file name.
ratings_path = get_file(fname="100k_ratings.csv", origin=ROOT_URL + "ratings.csv")
movies_path = get_file(fname="100k_movies.csv", origin=ROOT_URL + "movies.csv")
users_path = get_file(fname="100k_users.csv", origin=ROOT_URL + "users.csv")

# Load the three CSV files into DataFrames, in the same order as above.
ratings_df, movies_df, users_df = map(
    pd.read_csv, (ratings_path, movies_path, users_path))

#### Construct Feature Tables

In [109]:
# Join ratings with movie attributes, then with user attributes. Both merges
# use pandas' defaults (inner join on the columns the two frames share).
feature_tables = ratings_df.merge(movies_df).merge(users_df)
feature_tables.head()

Unnamed: 0,user_id,item_id,rating,title,year,unknown,Action,Adventure,Animation,Children,...,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western,age,gender,occupation
0,196,242,3,Kolya (1996),1997,0,0,0,0,0,...,0,0,0,0,0,0,0,9,M,writer
1,196,257,2,Men in Black (1997),1997,0,1,1,0,0,...,0,0,0,1,0,0,0,9,M,writer
2,196,111,4,"Truth About Cats & Dogs, The (1996)",1996,0,0,0,0,0,...,0,0,1,0,0,0,0,9,M,writer
3,196,25,4,"Birdcage, The (1996)",1996,0,0,0,0,0,...,0,0,0,0,0,0,0,9,M,writer
4,196,382,4,"Adventures of Priscilla, Queen of the Desert, ...",1994,0,0,0,0,0,...,0,0,0,0,0,0,0,9,M,writer


# DeepFM (Deep Factorization Machine)
---

### Build Custom Layer

In [110]:
from tensorflow.keras.layers import Layer

class LinearModel(Layer):
    """
    First-order (linear) logit.

    Sums a list of per-feature tensors element-wise and adds one learned
    scalar bias:

        y = b + x1 + x2 + ...
    """

    def build(self, input_shape):
        """Create the single trainable bias term, initialised to zero."""
        self.b = self.add_weight(
            shape=(1,),
            initializer='zeros',
            trainable=True,
        )
        super().build(input_shape)

    def call(self, inputs, **kwargs):
        """Return the element-wise sum of the tensors in `inputs` plus the bias."""
        # tf.add_n needs every tensor in the list to share one shape.
        return tf.add_n(inputs) + self.b

In [111]:
class FactorizationMachine(Layer):
    """
    Second-order Factorization Machine interaction layer.

    Computes, summed over the embedding axis,

        0.5 * ((sum_i v_i)^2 - sum_i v_i^2)

    where v_i are the per-feature embedding vectors.
    """

    def call(self, inputs, **kwargs):
        """Return one interaction logit per sample, shape (# Batch, 1)."""
        # List of (# Batch, # Embed) -> (# Batch, # Features, # Embed)
        stacked = tf.stack(inputs, axis=1)

        # Both terms reduce over the feature axis -> (# Batch, # Embed)
        square_of_sum = tf.square(tf.reduce_sum(stacked, axis=1))
        sum_of_square = tf.reduce_sum(tf.square(stacked), axis=1)

        # Collapse the embedding axis, keeping a trailing dim of size 1.
        interaction = tf.reduce_sum(
            square_of_sum - sum_of_square, axis=1, keepdims=True)
        return interaction / 2.

### Build Model

In [127]:
from tensorflow.keras.layers import Input, Reshape
from tensorflow.keras.layers import Concatenate
from tensorflow.keras.layers import Dropout, Add
from tensorflow.keras.layers import Embedding, Dense
from tensorflow.keras.models import Model

In [131]:
def deep_factorization_machine(fm_size=8,
                               deep_size=100,
                               num_cols=[], cate_cols=[],
                               feature_tables=None):
    """
    Assemble a DeepFM Keras model.

    One scalar `Input` (shape `()`) is created per column. Each column gets a
    size-1 "linear" embedding and a size-`fm_size` "FM" embedding; the FM
    embeddings are shared between the FM part and the deep (MLP) part.

    Args:
        fm_size: width of each per-feature FM embedding.
        deep_size: number of units in each of the three hidden Dense layers.
        num_cols: names of numerical columns (embedded with bias-free Dense).
        cate_cols: names of categorical columns (embedded with Embedding).
        feature_tables: table indexed by column name; only read for
            `cate_cols`, where `nunique()` of the column sets the number of
            embedding rows.

    Returns:
        A `Model` named 'deepFM' whose output is the sum of the linear, FM
        and deep logits.
    """
    model_inputs = []
    first_order_terms = []
    fm_vectors = []

    # Numerical columns: scalar -> (batch, 1) -> bias-free Dense projections.
    for name in num_cols:
        scalar_in = Input(shape=(), name=name)
        as_column = Reshape((1,))(scalar_in)

        linear_term = Dense(1, use_bias=False,
                            name=f'{name}_lr_embed')(as_column)
        fm_vector = Dense(fm_size, use_bias=False,
                          name=f'{name}_fm_embed')(as_column)

        model_inputs.append(scalar_in)
        first_order_terms.append(linear_term)
        fm_vectors.append(fm_vector)

    # Categorical columns: id -> Embedding lookups.
    # Table size is the column's distinct-value count, so the ids fed to the
    # model are presumably expected to be codes in 0..nunique-1 — verify
    # against the data pipeline.
    for name in cate_cols:
        id_in = Input(shape=(), name=name)
        vocab_size = feature_tables[name].nunique()

        linear_term = Embedding(vocab_size, 1,
                                name=f'{name}_lr_embed')(id_in)
        fm_vector = Embedding(vocab_size, fm_size,
                              name=f'{name}_fm_embed')(id_in)

        model_inputs.append(id_in)
        first_order_terms.append(linear_term)
        fm_vectors.append(fm_vector)

    # Linear (first-order) part
    lr_logits = LinearModel(name='lr')(first_order_terms)

    # FM (second-order interaction) part
    fm_logits = FactorizationMachine(name='fm')(fm_vectors)

    # Deep part: MLP over the concatenated FM embeddings.
    hidden = Concatenate()(fm_vectors)
    for _ in range(3):
        hidden = Dense(deep_size, activation='relu')(hidden)
        hidden = Dropout(rate=0.5)(hidden)
    deep_logits = Dense(1)(hidden)

    prediction = Add()([lr_logits, fm_logits, deep_logits])
    return Model(model_inputs, prediction, name='deepFM')

In [132]:
# Model hyper-parameters
fm_size = 8        # width of each per-feature FM embedding
deep_size = 100    # units per hidden layer in the deep part

# Columns treated as continuous values
num_cols = ['year', 'age']

# Columns treated as categorical ids / binary genre flags.
# NOTE(review): the feature table preview also shows Sci-Fi, Thriller, War,
# Western, gender and occupation columns that are not listed here — confirm
# the omission is intentional.
cate_cols = [
    'user_id', 'item_id',
    'unknown', 'Action', 'Adventure',
    'Animation', 'Children', 'Comedy',
    'Crime', 'Documentary', 'Drama',
    'Fantasy', 'Film-Noir', 'Horror',
    'Musical', 'Mystery', 'Romance',
]

model = deep_factorization_machine(
    fm_size=fm_size,
    deep_size=deep_size,
    num_cols=num_cols,
    cate_cols=cate_cols,
    feature_tables=feature_tables,
)

### Build Data Pipeline

In [133]:
from sklearn.preprocessing import LabelEncoder, MinMaxScaler
from sklearn.model_selection import train_test_split

# Encode on a copy so the raw merged table stays untouched.
prep_tables = feature_tables.copy()

# Id / string columns -> integer codes (one fitted encoder kept per column so
# the mapping can be inverted later).
user_encoder = LabelEncoder()
prep_tables['user_id'] = (
    user_encoder.fit_transform(feature_tables['user_id']))

item_encoder = LabelEncoder()
prep_tables['item_id'] = (
    item_encoder.fit_transform(feature_tables['item_id']))

title_encoder = LabelEncoder()
prep_tables['title'] = (
    title_encoder.fit_transform(feature_tables['title']))

# Release year -> float in [0, 1] (MinMaxScaler's default range).
year_encoder = MinMaxScaler()
prep_tables[['year']] = (
    year_encoder.fit_transform(feature_tables[['year']]))

gender_encoder = LabelEncoder()
prep_tables['gender'] = (
    gender_encoder.fit_transform(feature_tables['gender']))

occupation_encoder = LabelEncoder()
prep_tables['occupation'] = (
    occupation_encoder.fit_transform(feature_tables['occupation']))

# Only the columns the model declares as inputs are fed to it.
# NOTE(review): 'age' is passed through unscaled while 'year' is min-max
# scaled — confirm this asymmetry is intended.
X = prep_tables[num_cols+cate_cols]
y = prep_tables['rating']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1)


def _to_feature_dict(frame):
    """Turn a feature frame into the {input_name: array} dict the model takes.

    Numerical columns are cast to float32 and categorical columns to int32.
    Casting everything to int32 (the previous behaviour) truncated the
    min-max scaled 'year' values in [0, 1] to 0 for almost every row, which
    wiped out that feature.
    """
    numeric = set(num_cols)
    # .items() replaces .iteritems(), which was removed in pandas 2.0.
    return {
        name: column.values.astype(
            np.float32 if name in numeric else np.int32)
        for name, column in frame.items()
    }


trainset = tf.data.Dataset.from_tensor_slices(
    (_to_feature_dict(X_train), y_train.values))

validset = tf.data.Dataset.from_tensor_slices(
    (_to_feature_dict(X_test), y_test.values))

### Compile Model

In [134]:
from tensorflow.keras.optimizers import Adagrad
from tensorflow.keras.losses import MeanSquaredError
from tensorflow.keras.metrics import RootMeanSquaredError

# Train against mean squared error; report root mean squared error per epoch.
model.compile(
    optimizer=Adagrad(learning_rate=0.1),
    loss=MeanSquaredError(),
    metrics=[RootMeanSquaredError()],
)

### Train Model

In [135]:
batch_size = 256
num_epoch = 50

# Training batches are drawn through a 10000-element shuffle buffer;
# validation uses 4x larger batches and is not shuffled.
train_batches = trainset.shuffle(10000).batch(batch_size)
valid_batches = validset.batch(batch_size * 4)

model.fit(train_batches,
          validation_data=valid_batches,
          epochs=num_epoch)

Train for 352 steps, validate for 10 steps
Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50


Epoch 48/50
Epoch 49/50
Epoch 50/50


<tensorflow.python.keras.callbacks.History at 0x7fd16c135d68>