In [16]:
import pandas as pd
from deepctr.feature_column import SparseFeat, VarLenSparseFeat
from sklearn.preprocessing import LabelEncoder
from tensorflow.python.keras import backend as K
from tensorflow.python.keras.models import Model
import numpy as np
import random
from tqdm import tqdm
from deepmatch.models import ComiRec, NCF
from deepmatch.utils import sampledsoftmaxloss, NegativeSampler
import tensorflow as tf

from deepctr.feature_column import DenseFeat

pad_sequences = tf.keras.utils.pad_sequences
SEQ_LEN = 50

In [17]:
data_path = "./datasets/"

unames = ['user_id','gender','age','occupation','zip']
user = pd.read_csv(data_path+'ml-1m/users.dat',sep='::',header=None,names=unames)
rnames = ['user_id','movie_id','rating','timestamp']
ratings = pd.read_csv(data_path+'ml-1m/ratings.dat',sep='::',header=None,names=rnames)
mnames = ['movie_id','title','genres']
movies = pd.read_csv(data_path+'ml-1m/movies.dat',sep='::',header=None,names=mnames,encoding="unicode_escape")
movies['genres'] = list(map(lambda x: x.split('|')[0], movies['genres'].values))


  user = pd.read_csv(data_path+'ml-1m/users.dat',sep='::',header=None,names=unames)
  ratings = pd.read_csv(data_path+'ml-1m/ratings.dat',sep='::',header=None,names=rnames)
  movies = pd.read_csv(data_path+'ml-1m/movies.dat',sep='::',header=None,names=mnames,encoding="unicode_escape")


In [18]:
import pandas as pd

In [19]:
data = pd.merge(pd.merge(ratings,movies),user)#.iloc[:10000]
lbe = LabelEncoder()
data["genres"] = lbe.fit_transform(data["genres"]) + 1

def get_train_input_for_user(user_df):
    user_df = user_df.sort_values("timestamp")
    # user_df = user_df[-SEQ_LEN:] 
    user_id = user_df["user_id"].to_list()[0]
    tmp = [user_df[:x+1] for x in range(len(user_df))]
    input_data = []
    for record in tmp:
        sample = {
            "user_id": user_id, # user_id
            "history_movie_id":np.array(record["movie_id"].tolist()[:-1]),
            "history_genre_id": np.array(record["genres"].tolist()[:-1]),
            "movie_id":np.array(record["movie_id"].tolist()[-1]),
            "genre_id":np.array(record["genres"].tolist()[-1]),
            # np.array(record["genres"].tolist()[:-1]) # history genre list
        }
        input_data.append(sample)
    # train, test = input_data[:-1], input_data[-1:]
    result_df = pd.DataFrame(input_data)
    result_df["label"] = 1 # tmp for now
    result_df["hist_len"] = SEQ_LEN
    return result_df

def get_test_input_for_user(user_df):
    user_df = user_df.sort_values("timestamp")
    # user_df = user_df[-SEQ_LEN:] 
    user_id = user_df["user_id"].to_list()[0]
    
    input_data = []
    record = user_df
    sample = {
        "user_id": user_id, # user_id
        "history_movie_id":np.array(record["movie_id"].tolist()[:-1]),
        "history_genre_id": np.array(record["genres"].tolist()[:-1]),
        "movie_id":np.array(record["movie_id"].tolist()[-1]),
        "genre_id":np.array(record["genres"].tolist()[-1]),
        # np.array(record["genres"].tolist()[:-1]) # history genre list
    }
    input_data.append(sample)
    # train, test = input_data[:-1], input_data[-1:]
    result_df = pd.DataFrame(input_data)
    result_df["label"] = 1 # tmp for now
    result_df["hist_len"] = SEQ_LEN
    return result_df

In [20]:
# define the feature columns
user_id_size =  data['user_id'].max()+1
movie_id_size = data['movie_id'].max()+1
genre_id_size = int(data['genres'].max())+1
embedding_dim = 32
user_feature_columns = [
                        # SparseFeat('user_id', user_id_size, 16),
                        VarLenSparseFeat(SparseFeat('hist_movie_id', movie_id_size, embedding_dim,
                                                    embedding_name="movie_id"), SEQ_LEN, 'mean', 'hist_len'),
                        VarLenSparseFeat(SparseFeat('hist_genres', genre_id_size, embedding_dim,
                                                   embedding_name="genres"), SEQ_LEN, 'mean', 'hist_len'),
                        ]
item_feature_columns = [SparseFeat('movie_id', movie_id_size, embedding_dim),]

Instructions for updating:
Call initializer instance with the dtype argument instead of passing it to the constructor


In [21]:
tqdm.pandas()

In [22]:
train_set = data.groupby("user_id").progress_apply(lambda x:get_train_input_for_user(x[:-1]))

100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 6040/6040 [01:09<00:00, 87.32it/s]


In [23]:
test_set = data.groupby("user_id").progress_apply(lambda x:get_test_input_for_user(x))

100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 6040/6040 [00:02<00:00, 2116.19it/s]


In [24]:
test_set["history_movie_id"] = pad_sequences(test_set["history_movie_id"], maxlen=SEQ_LEN, padding='post', truncating='post', value=0).tolist()
test_set["history_genre_id"] = pad_sequences(test_set["history_genre_id"], maxlen=SEQ_LEN, padding='post', truncating='post', value=0).tolist()
test_input = test_set.sample(frac=1)
test_model_input = {
    "user_id":np.array(test_input["user_id"].tolist()),
    "hist_movie_id":np.array(test_input["history_movie_id"].tolist()),
    "hist_genres":np.array(test_input["history_genre_id"].tolist()),
    "movie_id": np.array(test_input["movie_id"].tolist()),
    "genre": np.array(test_input["genre_id"].tolist()),
    "hist_len": np.array(test_input["hist_len"].tolist()),
}
test_label =  test_input["label"].tolist()

train_set["history_movie_id"] = pad_sequences(train_set["history_movie_id"], maxlen=SEQ_LEN, padding='post', truncating='post', value=0).tolist()
train_set["history_genre_id"] = pad_sequences(train_set["history_genre_id"], maxlen=SEQ_LEN, padding='post', truncating='post', value=0).tolist()

train_input = train_set.sample(frac=1)

train_model_input = {
    "user_id":np.array(train_input["user_id"].tolist()),
    "hist_movie_id":np.array(train_input["history_movie_id"].tolist()),
    "hist_genres":np.array(train_input["history_genre_id"].tolist()),
    "movie_id": np.array(train_input["movie_id"].tolist()),
    "genre": np.array(train_input["genre_id"].tolist()),
    "hist_len": np.array(train_input["hist_len"].tolist()),
}
train_label =  train_input["label"].tolist()

import tensorflow as tf
if tf.__version__ >= '2.0.0':
    tf.compat.v1.disable_eager_execution()
else:
    K.set_learning_phase(True)

from collections import Counter
train_counter = Counter(data['movie_id'])
item_count = [train_counter.get(i,0) for i in range(item_feature_columns[0].vocabulary_size)] #todo change this later
sampler_config = NegativeSampler('frequency',num_sampled=255,item_name="movie_id",item_count=item_count)
#model = YoutubeDNN(user_feature_columns, item_feature_columns, user_dnn_hidden_units=(128,64, embedding_dim), sampler_config=sampler_config)
model = ComiRec(user_feature_columns,
                item_feature_columns,
                k_max=2, 
                user_dnn_hidden_units=(128,64, embedding_dim),
                sampler_config=sampler_config)


In [25]:
model.compile(optimizer="adam", loss=sampledsoftmaxloss)
history = model.fit(train_model_input, train_label,  # train_label,
                    batch_size=512, epochs=20, verbose=1, validation_split=0.0, )

Train on 994169 samples
Epoch 1/20


2024-03-29 21:40:40.716032: W tensorflow/c/c_api.cc:291] Operation '{name:'user_dnn_attn_1/kernel0/Assign' id:1522 op device:{requested: '', assigned: ''} def:{{{node user_dnn_attn_1/kernel0/Assign}} = AssignVariableOp[_has_manual_control_dependencies=true, dtype=DT_FLOAT, validate_shape=false](user_dnn_attn_1/kernel0, user_dnn_attn_1/kernel0/Initializer/stateless_truncated_normal)}}' was changed by setting attribute after it was run by a session. This mutation will have no effect, and will trigger an error in the future. Either don't modify nodes after running them or create a new session.


Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


## To Chroma

In [26]:
all_movie_ids = movies["movie_id"]
metadatas = movies.to_dict(orient="records")

all_item_model_input = {"movie_id": all_movie_ids}

user_embedding_model = Model(inputs=model.user_input, outputs=model.user_embedding)
item_embedding_model = Model(inputs=model.item_input, outputs=model.item_embedding)

#user_embs = user_embedding_model.predict(test_user_model_input, batch_size=2 ** 12)
# user_embs = user_embs[:, i, :]  # i in [0,k_max) if MIND
item_embs = item_embedding_model.predict(all_item_model_input, batch_size=2 ** 12)

2024-03-29 21:50:51.831468: W tensorflow/c/c_api.cc:291] Operation '{name:'lambda_7/Squeeze' id:1946 op device:{requested: '', assigned: ''} def:{{{node lambda_7/Squeeze}} = Squeeze[T=DT_FLOAT, _has_manual_control_dependencies=true, squeeze_dims=[1]](lambda_7/GatherV2)}}' was changed by setting attribute after it was run by a session. This mutation will have no effect, and will trigger an error in the future. Either don't modify nodes after running them or create a new session.


In [27]:
user_embs = user_embedding_model.predict(test_model_input, batch_size=2 ** 12)

2024-03-29 21:50:56.425494: W tensorflow/c/c_api.cc:291] Operation '{name:'user_dnn_1/dropout_9/cond/Identity' id:1764 op device:{requested: '', assigned: ''} def:{{{node user_dnn_1/dropout_9/cond/Identity}} = Identity[T=DT_FLOAT, _has_manual_control_dependencies=true](user_dnn_1/dropout_9/cond)}}' was changed by setting attribute after it was run by a session. This mutation will have no effect, and will trigger an error in the future. Either don't modify nodes after running them or create a new session.


In [28]:
import chromadb
chroma_client = chromadb.PersistentClient(path="./chroma_data")
#collection.delete(ids=all_movie_ids.apply(str).tolist())
collection = chroma_client.get_or_create_collection("movie_rec")
collection.add(ids=all_movie_ids.apply(str).tolist(), embeddings=item_embs, metadatas=metadatas)

In [29]:
count = 0
for i in tqdm(range(len(user_embs))):
    user_emb = user_embs[i:i+1]
    mids = [x["movie_id"] for x in collection.query(query_embeddings=user_emb[:, 1, :], n_results=50)["metadatas"][0]] + \
    [x["movie_id"] for x in collection.query(query_embeddings=user_emb[:, 0, :], n_results=50)["metadatas"][0]]
    target = movies[movies["movie_id"]==test_model_input["movie_id"][i]]
    count += target["movie_id"].tolist()[0] in mids

100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 6040/6040 [00:13<00:00, 463.04it/s]


In [30]:
count/6040

0.20231788079470198

In [32]:
from tensorflow.python.keras.models import  save_model,load_model
from deepmatch.layers import custom_objects
save_model(user_embedding_model, 'user_emb.h5')



In [180]:
collection.query(query_embeddings=user_emb[:, 0, :])["metadatas"]

[[{'genres': 'Comedy', 'movie_id': 45, 'title': 'To Die For (1995)'},
  {'genres': 'Comedy',
   'movie_id': 562,
   'title': 'Welcome to the Dollhouse (1995)'},
  {'genres': 'Comedy',
   'movie_id': 1885,
   'title': 'Opposite of Sex, The (1998)'},
  {'genres': 'Drama',
   'movie_id': 538,
   'title': 'Six Degrees of Separation (1993)'},
  {'genres': 'Comedy',
   'movie_id': 708,
   'title': 'Truth About Cats & Dogs, The (1996)'},
  {'genres': 'Drama',
   'movie_id': 1683,
   'title': 'Wings of the Dove, The (1997)'},
  {'genres': 'Drama', 'movie_id': 215, 'title': 'Before Sunrise (1995)'},
  {'genres': 'Comedy', 'movie_id': 3129, 'title': 'Sweet and Lowdown (1999)'},
  {'genres': 'Drama', 'movie_id': 1354, 'title': 'Breaking the Waves (1996)'},
  {'genres': 'Comedy', 'movie_id': 1747, 'title': 'Wag the Dog (1997)'}]]

In [7]:
requests.get("http://127.0.0.1:7861").text

''

In [3]:
import chromadb
from chromadb.config import Settings

client = chromadb.HttpClient(host='localhost', port=7861, settings=Settings(allow_reset=True, anonymized_telemetry=False))

print(client.heartbeat())

ValueError: Could not connect to tenant default_tenant. Are you sure it exists?

In [200]:
def convert_tmdb_to_mvlen(tmdb_title):
    mvlen_result = movies[movies["title"].apply(lambda x:tmdb_title.lower() in x.lower())]
    print(mvlen_result)
    if len(mvlen_result) > 0:
        return mvlen_result.iloc[0]
    else:
        return None
convert_tmdb_to_mvlen("shining")

Empty DataFrame
Columns: [movie_id, title, genres]
Index: []


In [202]:
movies[movies["title"].apply(lambda x:"shining" in x.lower())]

Unnamed: 0,movie_id,title,genres
1238,1258,"Shining, The (1980)",Horror


In [205]:
movies[movies["title"].apply(lambda x:"The Shining" in x)]

Unnamed: 0,movie_id,title,genres


In [204]:
movies

Unnamed: 0,movie_id,title,genres
0,1,Toy Story (1995),Animation
1,2,Jumanji (1995),Adventure
2,3,Grumpier Old Men (1995),Comedy
3,4,Waiting to Exhale (1995),Comedy
4,5,Father of the Bride Part II (1995),Comedy
...,...,...,...
3878,3948,Meet the Parents (2000),Comedy
3879,3949,Requiem for a Dream (2000),Drama
3880,3950,Tigerland (2000),Drama
3881,3951,Two Family House (2000),Drama


In [35]:
movies["title"].tolist()

['Toy Story (1995)',
 'Jumanji (1995)',
 'Grumpier Old Men (1995)',
 'Waiting to Exhale (1995)',
 'Father of the Bride Part II (1995)',
 'Heat (1995)',
 'Sabrina (1995)',
 'Tom and Huck (1995)',
 'Sudden Death (1995)',
 'GoldenEye (1995)',
 'American President, The (1995)',
 'Dracula: Dead and Loving It (1995)',
 'Balto (1995)',
 'Nixon (1995)',
 'Cutthroat Island (1995)',
 'Casino (1995)',
 'Sense and Sensibility (1995)',
 'Four Rooms (1995)',
 'Ace Ventura: When Nature Calls (1995)',
 'Money Train (1995)',
 'Get Shorty (1995)',
 'Copycat (1995)',
 'Assassins (1995)',
 'Powder (1995)',
 'Leaving Las Vegas (1995)',
 'Othello (1995)',
 'Now and Then (1995)',
 'Persuasion (1995)',
 'City of Lost Children, The (1995)',
 'Shanghai Triad (Yao a yao yao dao waipo qiao) (1995)',
 'Dangerous Minds (1995)',
 'Twelve Monkeys (1995)',
 'Wings of Courage (1995)',
 'Babe (1995)',
 'Carrington (1995)',
 'Dead Man Walking (1995)',
 'Across the Sea of Time (1995)',
 'It Takes Two (1995)',
 'Clueless (

In [None]:
movies[movies["title"].apply(lambda x:"Pulp".lower() in x.lower())]