In [4]:
import pandas as pd
from deepctr.feature_column import SparseFeat, VarLenSparseFeat
from sklearn.preprocessing import LabelEncoder

from tensorflow.python.keras import backend as K
from tensorflow.python.keras.models import Model

In [78]:
import numpy as np
import random
from tqdm import tqdm
def gen_data_set(data, seq_max_len=50, negsample=0):
    data.sort_values("timestamp", inplace=True)
    item_ids = data['movie_id'].unique()
    item_id_genres_map = dict(zip(data['movie_id'].values, data['genres'].values))
    train_set = []
    test_set = []
    for reviewerID, hist in tqdm(data.groupby('user_id')):
        pos_list = hist['movie_id'].tolist()
        genres_list = hist['genres'].tolist()
        rating_list = hist['rating'].tolist()

        if negsample > 0:
            candidate_set = list(set(item_ids) - set(pos_list))
            neg_list = np.random.choice(candidate_set, size=len(pos_list) * negsample, replace=True)
        for i in range(1, len(pos_list)):
            hist = pos_list[:i]
            genres_hist = genres_list[:i]
            seq_len = min(i, seq_max_len)
            if i != len(pos_list) - 1:
                train_set.append((
                    reviewerID, pos_list[i], 1, hist[::-1][:seq_len], seq_len, genres_hist[::-1][:seq_len],
                    genres_list[i],
                    rating_list[i]))
                for negi in range(negsample):
                    train_set.append((reviewerID, neg_list[i * negsample + negi], 0, hist[::-1][:seq_len], seq_len,
                                      genres_hist[::-1][:seq_len], item_id_genres_map[neg_list[i * negsample + negi]]))
            else:
                test_set.append((reviewerID, pos_list[i], 1, hist[::-1][:seq_len], seq_len, genres_hist[::-1][:seq_len],
                                 genres_list[i],
                                 rating_list[i]))

    random.shuffle(train_set)
    random.shuffle(test_set)

    print(len(train_set[0]), len(test_set[0]))

    return train_set, test_set

def gen_model_input(train_set, user_profile, seq_max_len):
    train_uid = np.array([line[0] for line in train_set])
    train_iid = np.array([line[1] for line in train_set])
    train_label = np.array([line[2] for line in train_set])
    train_seq = [line[3] for line in train_set]
    train_hist_len = np.array([line[4] for line in train_set])
    train_seq_genres = [line[5] for line in train_set]
    train_genres = np.array([line[6] for line in train_set])
    train_seq_pad = pad_sequences(train_seq, maxlen=seq_max_len, padding='post', truncating='post', value=0)
    train_seq_genres_pad = pad_sequences(train_seq_genres, maxlen=seq_max_len, padding='post', truncating='post',
                                         value=0)
    train_model_input = {"user_id": train_uid, "movie_id": train_iid, "hist_movie_id": train_seq_pad,
                         "hist_genres": train_seq_genres_pad,
                         "hist_len": train_hist_len, "genres": train_genres}

    for key in ["gender", "age", "occupation", "zip"]:
        train_model_input[key] = user_profile.loc[train_model_input['user_id']][key].values

    return train_model_input, train_label


In [12]:
from deepmatch.models import ComiRec
from deepmatch.utils import sampledsoftmaxloss, NegativeSampler

In [80]:
pad_sequences = tf.keras.utils.pad_sequences

In [23]:
data_path = "./datasets/"

unames = ['user_id','gender','age','occupation','zip']
user = pd.read_csv(data_path+'ml-1m/users.dat',sep='::',header=None,names=unames)
rnames = ['user_id','movie_id','rating','timestamp']
ratings = pd.read_csv(data_path+'ml-1m/ratings.dat',sep='::',header=None,names=rnames)
mnames = ['movie_id','title','genres']
movies = pd.read_csv(data_path+'ml-1m/movies.dat',sep='::',header=None,names=mnames,encoding="unicode_escape")
movies['genres'] = list(map(lambda x: x.split('|')[0], movies['genres'].values))


  user = pd.read_csv(data_path+'ml-1m/users.dat',sep='::',header=None,names=unames)
  ratings = pd.read_csv(data_path+'ml-1m/ratings.dat',sep='::',header=None,names=rnames)
  movies = pd.read_csv(data_path+'ml-1m/movies.dat',sep='::',header=None,names=mnames,encoding="unicode_escape")


In [127]:
data = pd.merge(pd.merge(ratings,movies),user)#.iloc[:10000]

In [128]:
sparse_features = ["movie_id", "user_id",
                    "gender", "age", "occupation", "zip", "genres"]
SEQ_LEN = 50
negsample = 0

# 1.Label Encoding for sparse features,and process sequence features with `gen_date_set` and `gen_model_input`
feature_max_idx = {}

In [129]:
np.array([[1,2], [3,3]])

array([[1, 2],
       [3, 3]])

In [130]:

for feature in sparse_features:
    lbe = LabelEncoder()
    data[feature] = lbe.fit_transform(data[feature]) + 1
    feature_max_idx[feature] = data[feature].max() + 1

user_profile = data[["user_id", "gender", "age", "occupation", "zip"]].drop_duplicates('user_id')

item_profile = data[["movie_id"]].drop_duplicates('movie_id')

user_profile.set_index("user_id", inplace=True)

user_item_list = data.groupby("user_id")['movie_id'].apply(list)

train_set, test_set = gen_data_set(data, SEQ_LEN, negsample)


100%|██████████████████████████████████████████████████████████████████████████████████████████████| 6040/6040 [00:03<00:00, 1514.46it/s]


8 8


In [131]:
embedding_dim = 32

user_feature_columns = [SparseFeat('user_id', feature_max_idx['user_id'], 16),
                        SparseFeat("gender", feature_max_idx['gender'], 16),
                        SparseFeat("age", feature_max_idx['age'], 16),
                        SparseFeat("occupation", feature_max_idx['occupation'], 16),
                        SparseFeat("zip", feature_max_idx['zip'], 16),
                        VarLenSparseFeat(SparseFeat('hist_movie_id', feature_max_idx['movie_id'], embedding_dim,
                                                    embedding_name="movie_id"), SEQ_LEN, 'mean', 'hist_len'),
                        VarLenSparseFeat(SparseFeat('hist_genres', feature_max_idx['genres'], embedding_dim,
                                                   embedding_name="genres"), SEQ_LEN, 'mean', 'hist_len'),
                        ]

item_feature_columns = [SparseFeat('movie_id', feature_max_idx['movie_id'], embedding_dim)]

In [132]:
train_model_input, train_label = gen_model_input(train_set, user_profile, SEQ_LEN)
test_model_input, test_label = gen_model_input(test_set, user_profile, SEQ_LEN)

In [133]:
from collections import Counter
train_counter = Counter(train_model_input['movie_id'])
#train_counter = Counter(data["movie_id"])
item_count = [train_counter.get(i,0) for i in range(item_feature_columns[0].vocabulary_size)]
sampler_config = NegativeSampler('frequency',num_sampled=255,item_name="movie_id",item_count=item_count)

In [134]:
import tensorflow as tf
if tf.__version__ >= '2.0.0':
    tf.compat.v1.disable_eager_execution()
else:
    K.set_learning_phase(True)
    
#model = YoutubeDNN(user_feature_columns, item_feature_columns, user_dnn_hidden_units=(128,64, embedding_dim), sampler_config=sampler_config)
model = ComiRec(user_feature_columns,item_feature_columns,k_max=2, user_dnn_hidden_units=(128,64, embedding_dim), sampler_config=sampler_config)


In [135]:
model.compile(optimizer="adam", loss=sampledsoftmaxloss)


In [136]:
history = model.fit(train_model_input, train_label,  # train_label,
                    batch_size=512, epochs=20, verbose=1, validation_split=0.0, )

Train on 988129 samples
Epoch 1/20


2024-03-24 23:40:22.976522: W tensorflow/c/c_api.cc:291] Operation '{name:'training_2/Adam/user_dnn_3/bias2/v/Assign' id:5606 op device:{requested: '', assigned: ''} def:{{{node training_2/Adam/user_dnn_3/bias2/v/Assign}} = AssignVariableOp[_has_manual_control_dependencies=true, dtype=DT_FLOAT, validate_shape=false](training_2/Adam/user_dnn_3/bias2/v, training_2/Adam/user_dnn_3/bias2/v/Initializer/zeros)}}' was changed by setting attribute after it was run by a session. This mutation will have no effect, and will trigger an error in the future. Either don't modify nodes after running them or create a new session.


Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


In [137]:
test_user_model_input = test_model_input
all_item_model_input = {"movie_id": item_profile['movie_id'].values,}

user_embedding_model = Model(inputs=model.user_input, outputs=model.user_embedding)
item_embedding_model = Model(inputs=model.item_input, outputs=model.item_embedding)

(6040, 2, 32)
(3706, 32)


2024-03-24 23:52:19.366761: W tensorflow/c/c_api.cc:291] Operation '{name:'user_dnn_3/dropout_19/cond/Identity' id:4380 op device:{requested: '', assigned: ''} def:{{{node user_dnn_3/dropout_19/cond/Identity}} = Identity[T=DT_FLOAT, _has_manual_control_dependencies=true](user_dnn_3/dropout_19/cond)}}' was changed by setting attribute after it was run by a session. This mutation will have no effect, and will trigger an error in the future. Either don't modify nodes after running them or create a new session.
2024-03-24 23:52:19.503958: W tensorflow/c/c_api.cc:291] Operation '{name:'lambda_11/Squeeze' id:4562 op device:{requested: '', assigned: ''} def:{{{node lambda_11/Squeeze}} = Squeeze[T=DT_FLOAT, _has_manual_control_dependencies=true, squeeze_dims=[1]](lambda_11/GatherV2)}}' was changed by setting attribute after it was run by a session. This mutation will have no effect, and will trigger an error in the future. Either don't modify nodes after running them or create a new session.


In [141]:
from tensorflow.python.keras.models import  save_model,load_model

In [142]:
from deepmatch.layers import custom_objects

In [147]:
save_model(user_embedding_model, 'user_emb.h5')

In [148]:
user_embedding_model = load_model('user_emb.h5',custom_objects)# load_model,just add a parameter



2024-03-24 23:54:17.997428: W tensorflow/c/c_api.cc:291] Operation '{name:'sparse_seq_emb_hist_genres_4/embeddings/Assign' id:5784 op device:{requested: '', assigned: ''} def:{{{node sparse_seq_emb_hist_genres_4/embeddings/Assign}} = AssignVariableOp[_has_manual_control_dependencies=true, dtype=DT_FLOAT, validate_shape=false](sparse_seq_emb_hist_genres_4/embeddings, sparse_seq_emb_hist_genres_4/embeddings/Initializer/stateless_random_normal)}}' was changed by setting attribute after it was run by a session. This mutation will have no effect, and will trigger an error in the future. Either don't modify nodes after running them or create a new session.


In [149]:
user_embs = user_embedding_model.predict(test_user_model_input, batch_size=2 ** 12)
# user_embs = user_embs[:, i, :]  # i in [0,k_max) if MIND
item_embs = item_embedding_model.predict(all_item_model_input, batch_size=2 ** 12)

print(user_embs.shape)
print(item_embs.shape)

(6040, 2, 32)
(3706, 32)


2024-03-24 23:54:18.724928: W tensorflow/c/c_api.cc:291] Operation '{name:'user_dnn_4/dropout_24/cond/Identity' id:6166 op device:{requested: '', assigned: ''} def:{{{node user_dnn_4/dropout_24/cond/Identity}} = Identity[T=DT_FLOAT, _has_manual_control_dependencies=true](user_dnn_4/dropout_24/cond)}}' was changed by setting attribute after it was run by a session. This mutation will have no effect, and will trigger an error in the future. Either don't modify nodes after running them or create a new session.


In [151]:
import heapq
from collections import defaultdict
from tqdm import tqdm
import numpy as np
import faiss
from deepmatch.utils import recall_N

k_max = 2
topN = 50
test_true_label = {line[0]: [line[1]] for line in test_set}

index = faiss.IndexFlatIP(embedding_dim)
# faiss.normalize_L2(item_embs)
index.add(item_embs)
# faiss.normalize_L2(user_embs)

if len(user_embs.shape) == 2:  # multi interests model's shape = 3 (MIND,ComiRec)
    user_embs = np.expand_dims(user_embs, axis=1)


In [152]:
score_dict = defaultdict(dict)
for k in range(k_max):
    user_emb = user_embs[:, k, :]
    D, I = index.search(np.ascontiguousarray(user_emb), topN)
    for i, uid in tqdm(enumerate(test_user_model_input['user_id']), total=len(test_user_model_input['user_id'])):
        if np.abs(user_emb[i]).max() < 1e-8:
            continue
        for score, itemid in zip(D[i], I[i]):
            score_dict[uid][itemid] = max(score, score_dict[uid].get(itemid, float("-inf")))

s = []
hit = 0
for i, uid in enumerate(test_user_model_input['user_id']):
    pred = [item_profile['movie_id'].values[x[0]] for x in
            heapq.nlargest(topN, score_dict[uid].items(), key=lambda x: x[1])]
    filter_item = None
    recall_score = recall_N(test_true_label[uid], pred, N=topN)
    s.append(recall_score)
    if test_true_label[uid] in pred:
        hit += 1

print("recall", np.mean(s))
print("hr", hit / len(test_user_model_input['user_id']))

100%|█████████████████████████████████████████████████████████████████████████████████████████████| 6040/6040 [00:00<00:00, 21827.49it/s]
100%|█████████████████████████████████████████████████████████████████████████████████████████████| 6040/6040 [00:00<00:00, 17698.07it/s]


recall 0.42549668874172186
hr 0.42549668874172186


In [126]:
user_embs

(2159, 32)

In [119]:
len(test_user_model_input["user_id"])

70

In [123]:
test_user_model_input['hist_movie_id'][0]

(50,)

In [116]:
item_embs.shape

(2159, 32)

In [36]:
train_uid = np.array([line[0] for line in train_set])

In [41]:
s = [line[5] for line in train_set]

In [57]:
len(train_set[4][5])

50

In [47]:
np.array([line[5] for line in train_set[:3]])

ValueError: setting an array element with a sequence. The requested array has an inhomogeneous shape after 1 dimensions. The detected shape was (3,) + inhomogeneous part.

In [40]:
train_seq_genres = np.array([line[5] for line in train_set])

ValueError: setting an array element with a sequence. The requested array has an inhomogeneous shape after 1 dimensions. The detected shape was (9860,) + inhomogeneous part.

In [34]:
train_model_input, train_label = gen_model_input(train_set, user_profile, SEQ_LEN)
test_model_input, test_label = gen_model_input(test_set, user_profile, SEQ_LEN)

ValueError: setting an array element with a sequence. The requested array has an inhomogeneous shape after 1 dimensions. The detected shape was (9860,) + inhomogeneous part.

In [35]:
# 2.count #unique features for each sparse field and generate feature config for sequence feature

embedding_dim = 32

user_feature_columns = [SparseFeat('user_id', feature_max_idx['user_id'], 16),
                        SparseFeat("gender", feature_max_idx['gender'], 16),
                        SparseFeat("age", feature_max_idx['age'], 16),
                        SparseFeat("occupation", feature_max_idx['occupation'], 16),
                        SparseFeat("zip", feature_max_idx['zip'], 16),
                        VarLenSparseFeat(SparseFeat('hist_movie_id', feature_max_idx['movie_id'], embedding_dim,
                                                    embedding_name="movie_id"), SEQ_LEN, 'mean', 'hist_len'),
                        VarLenSparseFeat(SparseFeat('hist_genres', feature_max_idx['genres'], embedding_dim,
                                                   embedding_name="genres"), SEQ_LEN, 'mean', 'hist_len'),
                        ]

item_feature_columns = [SparseFeat('movie_id', feature_max_idx['movie_id'], embedding_dim)]

from collections import Counter
train_counter = Counter(train_model_input['movie_id'])
item_count = [train_counter.get(i,0) for i in range(item_feature_columns[0].vocabulary_size)]
sampler_config = NegativeSampler('frequency',num_sampled=255,item_name="movie_id",item_count=item_count)


NameError: name 'train_model_input' is not defined