In [1]:
from DataGenerator import DataGenerator

data = DataGenerator()

In [10]:
import tensorflow as tf
train = data.add_negatives(data.train, n_samples=4)

X_train = data.get_features(train)
# X_train = tf.convert_to_tensor(data.get_features(X))
y_train = data.get_target(train)
# y_train = tf.convert_to_tensor(data.get_target(X))

In [3]:
from tensorflow.keras.layers import Input, Dense, Embedding, Flatten, multiply
from tensorflow.keras.regularizers import l2

def GMF(n_users, n_items, latent_dim):
    user_input = Input(shape=(1,), dtype='int32', name='user')
    item_input = Input(shape=(1,), dtype='int32', name='movie')

    user_embedding = Embedding(input_dim = n_users,
                                    output_dim = latent_dim,
                                    name = 'user_embedding',
                                    embeddings_regularizer = l2(0),
                                    input_length=1)
    item_embedding = Embedding(input_dim = n_items,
                                output_dim = latent_dim,
                                name = 'item_embedding',
                                # init = init_normal,
                                embeddings_regularizer = l2(0),
                                input_length=1)
    user_latent = Flatten()(user_embedding(user_input))
    item_latent = Flatten()(item_embedding(item_input))
    predict = multiply([user_latent, item_latent])
    prediction = Dense(1, activation=tf.nn.sigmoid, kernel_initializer='lecun_uniform')(predict)
    return tf.keras.Model(inputs=[user_input, item_input],
                  outputs=prediction)

In [49]:
learning_rate=0.001
gmf = GMF(data.num_users, data.num_movies, latent_dim=8)
gmf.compile(optimizer=tf.keras.optimizers.Adagrad(learning_rate=learning_rate),
            loss='binary_crossentropy',
            metrics=[tf.keras.metrics.TopKCategoricalAccuracy()])

In [50]:
def find_batch_size():
    if len(tf.config.list_physical_devices('GPU'))>0:
        return 1024
    else:
        return 64
batch_size = find_batch_size()

In [51]:
# for e in range(5):
epochs = 3
gmf.fit(X_train, y_train, batch_size=batch_size, epochs=epochs, verbose=1, shuffle=True)


Epoch 1/3
Epoch 2/3
Epoch 3/3


<keras.callbacks.History at 0x12fdbbdf0>

In [52]:
test = data.add_negatives(data.test, n_samples=4)

X_test = data.get_features(test)
y_test = data.get_target(test)

In [None]:
# D = data.get_dataset(X)
# D = D.shuffle(buffer_size=100)
# D = D.batch(batch_size)

In [53]:
gmf.evaluate(X_test, y_test)



[0.8397703170776367, 1.0]

In [17]:
import pandas as pd
# complete = pd.concat([train, test]).sort_values(by=['uid', 'mid'])
# complete.reset_index(drop=True, inplace=True)
# complete.head()


Unnamed: 0,uid,mid,rating
0,0,0,1
1,0,47,1
2,0,149,1
3,0,183,0
4,0,259,1


Unnamed: 0,uid,mid_x,rating_x,mid_y,rating_y
0,0,0,1,47,1
1,0,0,1,1418,0
2,0,0,1,1420,0
3,0,0,1,1437,0
4,0,0,1,2605,0
...,...,...,...,...,...
5242640,6039,3818,1,699,0
5242641,6039,3818,1,1087,0
5242642,6039,3818,1,1220,1
5242643,6039,3818,1,1793,0


In [54]:
test['score'] = gmf.predict(X_test)

In [55]:
test.score.max() - test.score.min()

0.0014717579

In [38]:
test

Unnamed: 0,uid,mid,rating,score
0,0,47,1,0.584681
1,0,70,0,0.584704
2,0,133,0,0.584754
3,0,2808,0,0.584665
4,0,3461,0,0.584952
...,...,...,...,...
30195,6039,320,0,0.585153
30196,6039,556,0,0.585029
30197,6039,1220,1,0.584991
30198,6039,1645,0,0.584639


In [56]:
test_pos = test[test.rating==1]

test_pos.rename(columns={'score':'t_score', 'mid':'t_mid'}, inplace=True)

test_pos

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return super().rename(


Unnamed: 0,uid,t_mid,rating,t_score
0,0,47,1,0.602761
6,1,1686,1,0.602784
11,2,2080,1,0.602730
17,3,2950,1,0.602888
21,4,287,1,0.602350
...,...,...,...,...
30179,6035,2806,1,0.602852
30180,6036,967,1,0.602751
30186,6037,1182,1,0.602674
30191,6038,911,1,0.602895


In [57]:
complete = pd.merge(test, test_pos, on=['uid'], how='left')

In [58]:
complete['rank'] = complete.groupby('uid')['score'].rank(method='first', ascending=False)
complete.sort_values(['uid', 'rank'], inplace=True)
complete

Unnamed: 0,uid,mid,rating_x,score,t_mid,rating_y,t_score,rank
2,0,846,0,0.602926,47,1,0.602761,1.0
3,0,1227,0,0.602906,47,1,0.602761,2.0
0,0,47,1,0.602761,47,1,0.602761,3.0
1,0,330,0,0.602569,47,1,0.602761,4.0
4,0,3877,0,0.602208,47,1,0.602761,5.0
...,...,...,...,...,...,...,...,...
30198,6039,3175,0,0.602815,1220,1,0.602690,1.0
30195,6039,37,0,0.602799,1220,1,0.602690,2.0
30196,6039,1053,0,0.602701,1220,1,0.602690,3.0
30197,6039,1220,1,0.602690,1220,1,0.602690,4.0


In [None]:
# hr, ndcgs = [], []
# hr = calc_hr(test, k=10)

In [59]:
k = 10
# def calc_hr(df, k):
top_k = complete[complete['rank']<=k]
test_in_top_k = top_k[top_k['mid'] == top_k['t_mid']]
len(test_in_top_k)

6040

In [60]:
top_k[top_k.uid == 1]

Unnamed: 0,uid,mid,rating_x,score,t_mid,rating_y,t_score,rank
7,1,1701,0,0.602959,1686,1,0.602784,1.0
6,1,1686,1,0.602784,1686,1,0.602784,2.0
9,1,3059,0,0.602698,1686,1,0.602784,3.0
5,1,859,0,0.602623,1686,1,0.602784,4.0
8,1,2833,0,0.602587,1686,1,0.602784,5.0


In [61]:
from heapq import nlargest
from numpy import full
from math import log

u = test_pos.uid.iloc[0]
movie = test_pos.t_mid.iloc[0]

items = test[test.uid==0].mid.tolist()

map_item_score = {}
users = full(len(items), u, dtype='int32')
predictions = gmf.predict(X_test)

for i in range(len(items)):
    map_item_score[items[i]] = predictions[i]
items.pop()

ranklist= nlargest(10, map_item_score, key=map_item_score.get)

hr, ndcg = 0, 0
for item in ranklist:
    if item == movie:
        hr = 1
        ndcg = log(2) / log(movie+2)

In [62]:
hr

1

In [63]:
ndcg

0.1781035935540111