In [87]:
import pandas as pd
from deepctr.feature_column import SparseFeat, VarLenSparseFeat
from sklearn.preprocessing import LabelEncoder
from tensorflow.python.keras import backend as K
from tensorflow.python.keras.models import Model
import numpy as np
import random
from tqdm import tqdm
from deepmatch.models import ComiRec, NCF
from deepmatch.utils import sampledsoftmaxloss, NegativeSampler
import tensorflow as tf
import gc

from deepctr.feature_column import DenseFeat

pad_sequences = tf.keras.utils.pad_sequences
SEQ_LEN = 50

In [88]:
def get_train_input_for_user(user_df):
    #user_df = user_df.sort_values("timestamp")
    # user_df = user_df[-SEQ_LEN:] 
    user_id = user_df["user_id"].to_list()[0]
    tmp = [user_df[:x+1] for x in range(len(user_df))]
    input_data = []
    for record in tmp:
        sample = {
            "history_movie_id":np.array(record["movie_id"].tolist()[:-1]),
            "history_genre_id": np.array(record["genres"].tolist()[:-1]),
            "history_hot":np.array(record["hot"].tolist()[:-1]),
            "history_grade": np.array(record["grade"].tolist()[:-1]),
            "history_year": np.array(record["year"].tolist()[:-1]),
            "movie_id":np.array(record["movie_id"].tolist()[-1]),
            # np.array(record["genres"].tolist()[:-1]) # history genre list
        }
        input_data.append(sample)
    # train, test = input_data[:-1], input_data[-1:]
    result_df = pd.DataFrame(input_data)
    result_df["label"] = 1 # tmp for now
    result_df["hist_len"] = SEQ_LEN
    result_df[["label", "hist_len"]] = result_df[["label", "hist_len"]].astype("int8")
    return result_df

def get_test_input_for_user(user_df):
    #user_df = user_df.sort_values("timestamp")
    # user_df = user_df[-SEQ_LEN:] 
    user_id = user_df["user_id"].to_list()[0]
    
    input_data = []
    record = user_df
    sample = {
        "history_movie_id":np.array(record["movie_id"].tolist()[:-1]),
        "history_genre_id": np.array(record["genres"].tolist()[:-1]),
            "history_hot":np.array(record["hot"].tolist()[:-1]),
            "history_grade": np.array(record["grade"].tolist()[:-1]),
        "history_year": np.array(record["year"].tolist()[:-1]),
        "movie_id":np.array(record["movie_id"].tolist()[-1]),
        # np.array(record["genres"].tolist()[:-1]) # history genre list
    }
    input_data.append(sample)
    # train, test = input_data[:-1], input_data[-1:]
    result_df = pd.DataFrame(input_data)
    result_df["label"] = 1 # tmp for now
    result_df["hist_len"] = SEQ_LEN
    return result_df

In [89]:
df = pd.read_csv("./datasets/ml-25m/ratings.csv")

In [90]:
movies = pd.read_csv("./datasets/ml-25m/movies.csv")

In [91]:
movies

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy
...,...,...,...
62418,209157,We (2018),Drama
62419,209159,Window of the Soul (2001),Documentary
62420,209163,Bad Poems (2018),Comedy|Drama
62421,209169,A Girl Thing (2001),(no genres listed)


In [92]:
links_df = pd.read_csv('./datasets/ml-25m/links.csv')
tags_df = pd.read_csv('./datasets/ml-25m/tags.csv')
genome_scores_df = pd.read_csv('./datasets/ml-25m/genome-scores.csv')
genome_tags_df = pd.read_csv('./datasets/ml-25m/genome-tags.csv')

In [93]:
tags_df.groupby("movieId").apply(lambda x:"  ".join([str(x) for x in x["tag"].drop_duplicates()]))

  tags_df.groupby("movieId").apply(lambda x:"  ".join([str(x) for x in x["tag"].drop_duplicates()]))


movieId
1         Owned  imdb top 250  Pixar  time travel  child...
2         Robin Williams  time travel  fantasy  based on...
3         funny  best friend  duringcreditsstinger  fish...
4         based on novel or book  chick flick  divorce  ...
5         aging  baby  confidence  contraception  daught...
                                ...                        
208813                                           might like
208933                 black and white  deal with the devil
209035    computer animation  Japan  mass behavior  mass...
209037    chameleon  computer animation  gluttony  humor...
209063    black  education  friends schools  independent...
Length: 45251, dtype: object

In [94]:
genome_scores_df[genome_scores_df["relevance"]>0.5]

Unnamed: 0,movieId,tagId,relevance
10,1,11,0.58025
18,1,19,0.66250
28,1,29,0.89375
29,1,30,0.67625
60,1,61,0.61750
...,...,...,...
15584291,206499,972,0.60600
15584311,206499,992,0.51225
15584327,206499,1008,0.52500
15584333,206499,1014,0.59775


In [95]:
xdf = df.groupby("movieId").agg({"rating":"mean", "userId":"count"})

In [96]:
xdf = xdf.reset_index()

In [97]:
xdf["hot"] = 0
xdf["grade"] = 0

In [98]:
# create movie popular feature
xdf.loc[xdf["userId"]>0, "hot"] = 1
xdf.loc[xdf["userId"]>100, "hot"] = 2
xdf.loc[xdf["userId"]>1000, "hot"] = 3
xdf.loc[xdf["userId"]>10000, "hot"] = 4
xdf.loc[xdf["userId"]>30000, "hot"] = 5
xdf.loc[xdf["userId"]>50000, "hot"] = 6

# create movie rating zone feature
xdf.loc[xdf["rating"]>1, "grade"] = 1
xdf.loc[xdf["rating"]>2, "grade"] = 2
xdf.loc[xdf["rating"]>3, "grade"] = 3
xdf.loc[xdf["rating"]>4, "grade"] = 4
xdf.loc[xdf["rating"]>4.5, "grade"] = 5

In [99]:
movies = pd.merge(movies, xdf)
del movies["userId"]
del movies["rating"]

In [100]:
def get_year(title):
    tmp = title.split(")")[0].split("(")[-1]
    try:
        return int(tmp)
    except:
        return 0

In [101]:
movies["year"] = movies["title"].apply(get_year)

In [102]:
movies.to_csv("./datasets/ml-25m/processed_movies.csv", index=False)

In [103]:
movies

Unnamed: 0,movieId,title,genres,hot,grade,year
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,6,3,1995
1,2,Jumanji (1995),Adventure|Children|Fantasy,4,3,1995
2,3,Grumpier Old Men (1995),Comedy|Romance,4,3,1995
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance,3,2,1995
4,5,Father of the Bride Part II (1995),Comedy,4,3,1995
...,...,...,...,...,...,...
59042,209157,We (2018),Drama,1,1,2018
59043,209159,Window of the Soul (2001),Documentary,1,2,2001
59044,209163,Bad Poems (2018),Comedy|Drama,1,4,2018
59045,209169,A Girl Thing (2001),(no genres listed),1,2,2001


In [15]:
#movies["movie_id"] = movies["movieId"]

In [16]:
tqdm.pandas()

In [107]:
xdf

Unnamed: 0,movieId,rating,userId,hot,grade
0,1,3.893708,57309,6,3
1,2,3.251527,24228,4,3
2,3,3.142028,11804,4,3
3,4,2.853547,2523,3,2
4,5,3.058434,11714,4,3
...,...,...,...,...,...
59042,209157,1.500000,1,1,1
59043,209159,3.000000,1,1,2
59044,209163,4.500000,1,1,4
59045,209169,3.000000,1,1,2


In [66]:
data = pd.merge(df,movies, on="movieId")

In [67]:
#data["genres"] = data["genres"].apply(lambda x:x.split("|")[0])

In [68]:
data.columns = ["user_id", "movie_id", "rating", "timestamp", "title", "genres", "hot", "grade", "year"]

In [69]:
lbe = LabelEncoder()
data["genres"] = lbe.fit_transform(data["genres"]) + 1

In [70]:
data = data.sort_values("timestamp")

In [25]:
del data["timestamp"]

In [22]:
del df, movies

In [26]:
del data["title"]

In [71]:
gc.collect()

96096

In [73]:
user_id_size =  data['user_id'].max()+1
movie_id_size = data['movie_id'].max()+1
genre_id_size = int(data['genres'].max())+1
grade_size = int(data['grade'].max()) + 1
hot_size = int(data['hot'].max()) + 1
year = int(data['year'].max()) + 1
embedding_dim = 64

In [74]:
user_feature_columns = [
                        # SparseFeat('user_id', user_id_size, 16),
                        VarLenSparseFeat(SparseFeat('hist_movie_id', movie_id_size, embedding_dim,
                                                    embedding_name="movie_id"), SEQ_LEN, 'mean', 'hist_len'),
                        VarLenSparseFeat(SparseFeat('hist_genres', genre_id_size, embedding_dim,
                                                   embedding_name="genres"), SEQ_LEN, 'mean', 'hist_len'),
                        VarLenSparseFeat(SparseFeat('hist_hot', genre_id_size, embedding_dim,
                                                   embedding_name="genres"), SEQ_LEN, 'mean', 'hist_len'),
                            VarLenSparseFeat(SparseFeat('hist_grade', genre_id_size, embedding_dim,
                                                   embedding_name="genres"), SEQ_LEN, 'mean', 'hist_len'),
                        VarLenSparseFeat(SparseFeat('hist_year', genre_id_size, embedding_dim,
                                                   embedding_name="genres"), SEQ_LEN, 'mean', 'hist_len'),
                        ]
item_feature_columns = [SparseFeat('movie_id', movie_id_size, embedding_dim),]

In [75]:
#tmp_df = data[:1000000].copy()

In [76]:
data[data["year"]>=2000]

Unnamed: 0,user_id,movie_id,rating,timestamp,title,genres,hot,grade,year
11487987,74504,2769,3.0,935027027,"Yards, The (2000)",1301,2,3,2000
18505539,119902,2769,3.0,937997977,"Yards, The (2000)",1301,2,3,2000
8456088,55095,2769,5.0,938893611,"Yards, The (2000)",1301,2,3,2000
1184379,7994,2769,1.0,939145921,"Yards, The (2000)",1301,2,3,2000
8287545,53956,2769,3.0,940972197,"Yards, The (2000)",1301,2,3,2000
...,...,...,...,...,...,...,...,...,...
13207880,85523,168252,4.0,1574327479,Logan (2017),564,3,3,2017
1284508,8642,202101,2.0,1574327512,Stuber (2019),308,1,2,2019
13207893,85523,204704,4.0,1574327533,Ready or Not (2019),1264,1,3,2019
1284464,8642,122914,4.0,1574327549,Avengers: Infinity War - Part II (2019),208,3,3,2019


In [77]:
train_set = data[data["year"]>=2000].groupby("user_id").progress_apply(lambda x:get_train_input_for_user(x[:80]))

100%|██████████████████████████████████| 117328/117328 [08:45<00:00, 223.19it/s]


In [78]:
test_set = data.groupby("user_id").progress_apply(lambda x:get_test_input_for_user(x))

100%|█████████████████████████████████| 162541/162541 [01:02<00:00, 2605.84it/s]


In [None]:
test_set["history_movie_id"] = pad_sequences(test_set["history_movie_id"], maxlen=SEQ_LEN, padding='pre', truncating='post', value=0).tolist()
test_set["history_genre_id"] = pad_sequences(test_set["history_genre_id"], maxlen=SEQ_LEN, padding='pre', truncating='post', value=0).tolist()
test_set["history_hot"] = pad_sequences(test_set["history_hot"], maxlen=SEQ_LEN, padding='pre', truncating='post', value=0).tolist()
test_set["history_grade"] = pad_sequences(test_set["history_grade"], maxlen=SEQ_LEN, padding='pre', truncating='post', value=0).tolist()
test_set["history_year"] = pad_sequences(test_set["history_year"], maxlen=SEQ_LEN, padding='pre', truncating='post', value=0).tolist()


test_input = test_set.sample(frac=1)

test_model_input = {
    "hist_movie_id":np.array(test_input["history_movie_id"].tolist()),
    "hist_genres":np.array(test_input["history_genre_id"].tolist()),
    "hist_hot":np.array(test_input["history_hot"].tolist()),
    "hist_grade":np.array(test_input["history_grade"].tolist()),
    "hist_year":np.array(test_input["history_year"].tolist()),
    "movie_id": np.array(test_input["movie_id"].tolist()),
    "hist_len": np.array(test_input["hist_len"].tolist()),
}
test_label =  test_input["label"].tolist()

In [83]:
1

1

In [None]:
train_set["history_movie_id"] = pad_sequences(train_set["history_movie_id"], maxlen=SEQ_LEN, padding='pre', truncating='post', value=0).tolist()
train_set["history_genre_id"] = pad_sequences(train_set["history_genre_id"], maxlen=SEQ_LEN, padding='pre', truncating='post', value=0).tolist()
train_set["history_hot"] = pad_sequences(train_set["history_hot"], maxlen=SEQ_LEN, padding='pre', truncating='post', value=0).tolist()
train_set["history_grade"] = pad_sequences(train_set["history_grade"], maxlen=SEQ_LEN, padding='pre', truncating='post', value=0).tolist()
train_set["history_year"] = pad_sequences(train_set["history_year"], maxlen=SEQ_LEN, padding='pre', truncating='post', value=0).tolist()


train_input = train_set.sample(frac=1)

train_model_input = {
    "hist_movie_id":np.array(train_input["history_movie_id"].tolist()),
    "hist_genres":np.array(train_input["history_genre_id"].tolist()),
    "hist_hot":np.array(train_input["history_hot"].tolist()),
    "hist_grade":np.array(train_input["history_grade"].tolist()),
    "hist_year":np.array(train_input["history_grade"].tolist()),
    "movie_id": np.array(train_input["movie_id"].tolist()),
    "hist_len": np.array(train_input["hist_len"].tolist()),
}
train_label =  train_input["label"].tolist()





In [104]:
import tensorflow as tf
if tf.__version__ >= '2.0.0':
    tf.compat.v1.disable_eager_execution()
else:
    K.set_learning_phase(True)

In [105]:
from collections import Counter
train_counter = Counter(data['movie_id'])
item_count = [train_counter.get(i,0) for i in range(item_feature_columns[0].vocabulary_size)] #todo change this later
sampler_config = NegativeSampler('frequency',num_sampled=255,item_name="movie_id",item_count=item_count)
#model = YoutubeDNN(user_feature_columns, item_feature_columns, user_dnn_hidden_units=(128,64, embedding_dim), sampler_config=sampler_config)
model = ComiRec(user_feature_columns,
                item_feature_columns,
                k_max=2, 
                user_dnn_hidden_units=(128,64, embedding_dim),
                sampler_config=sampler_config)


In [106]:
model.compile(optimizer="adam", loss=sampledsoftmaxloss)
history = model.fit(train_model_input, train_label,  # train_label,
                    batch_size=512, epochs=20, verbose=1, validation_split=0.0, use_multiprocessing=True)

Train on 4562867 samples
Epoch 1/20


2024-04-04 22:54:15.691019: W tensorflow/c/c_api.cc:291] Operation '{name:'training_2/Adam/beta_1/Assign' id:3073 op device:{requested: '', assigned: ''} def:{{{node training_2/Adam/beta_1/Assign}} = AssignVariableOp[_has_manual_control_dependencies=true, dtype=DT_FLOAT, validate_shape=false](training_2/Adam/beta_1, training_2/Adam/beta_1/Initializer/initial_value)}}' was changed by setting attribute after it was run by a session. This mutation will have no effect, and will trigger an error in the future. Either don't modify nodes after running them or create a new session.


Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 18/20
 470528/4562867 [==>...........................] - ETA: 30:50 - loss: 3.2685

IOPub message rate exceeded.
The Jupyter server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--ServerApp.iopub_msg_rate_limit`.

Current values:
ServerApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
ServerApp.rate_limit_window=3.0 (secs)



Epoch 19/20


In [38]:
model

<tensorflow.python.keras.engine.functional.Functional at 0x7f296c597070>

In [109]:
all_movie_ids = movies["movieId"]
metadatas = movies.to_dict(orient="records")

all_item_model_input = {"movie_id": all_movie_ids}

user_embedding_model = Model(inputs=model.user_input, outputs=model.user_embedding)
item_embedding_model = Model(inputs=model.item_input, outputs=model.item_embedding)

#user_embs = user_embedding_model.predict(test_user_model_input, batch_size=2 ** 12)
# user_embs = user_embs[:, i, :]  # i in [0,k_max) if MIND
item_embs = item_embedding_model.predict(all_item_model_input, batch_size=2 ** 12)

2024-04-05 10:44:47.605127: W tensorflow/c/c_api.cc:291] Operation '{name:'lambda_7/Squeeze' id:2294 op device:{requested: '', assigned: ''} def:{{{node lambda_7/Squeeze}} = Squeeze[T=DT_FLOAT, _has_manual_control_dependencies=true, squeeze_dims=[1]](lambda_7/GatherV2)}}' was changed by setting attribute after it was run by a session. This mutation will have no effect, and will trigger an error in the future. Either don't modify nodes after running them or create a new session.


In [111]:
item_embs.shape

(59047, 64)

In [114]:
user_embs = user_embedding_model.predict(train_model_input, batch_size=2 ** 12)

In [115]:
user_embs.shape

(4562867, 2, 64)

In [41]:
movies["movie_id"] = movies["movieId"]

In [116]:
import pickle

In [117]:
data = {
    "all_movie_ids":all_movie_ids,
    "item_embs":item_embs,
    "metadatas":metadatas
}

In [118]:
with open("embedding.pl", "wb") as f:
    pickle.dump(data, f)

In [123]:
collection = chroma_client.get_or_create_collection("movie_rec_25m_0405")

In [126]:
import chromadb
chroma_client = chromadb.PersistentClient(path="./chroma_data")
collection.delete(ids=all_movie_ids.apply(str).tolist()[:30000])
collection.delete(ids=all_movie_ids.apply(str).tolist()[30000:])
collection = chroma_client.get_or_create_collection("movie_rec_25m_0405")


collection.add(ids=all_movie_ids.apply(str).tolist()[:30000],
               embeddings=item_embs[:30000],
               metadatas=metadatas[:30000])
collection.add(ids=all_movie_ids.apply(str).tolist()[30000:],
               embeddings=item_embs[30000:],
               metadatas=metadatas[30000:])

Delete of nonexisting embedding ID: 160458
Delete of nonexisting embedding ID: 160460
Delete of nonexisting embedding ID: 160462
Delete of nonexisting embedding ID: 160464
Delete of nonexisting embedding ID: 160466
Delete of nonexisting embedding ID: 160468
Delete of nonexisting embedding ID: 160470
Delete of nonexisting embedding ID: 160472
Delete of nonexisting embedding ID: 160474
Delete of nonexisting embedding ID: 160476
Delete of nonexisting embedding ID: 160480
Delete of nonexisting embedding ID: 160484
Delete of nonexisting embedding ID: 160486
Delete of nonexisting embedding ID: 160488
Delete of nonexisting embedding ID: 160492
Delete of nonexisting embedding ID: 160494
Delete of nonexisting embedding ID: 160496
Delete of nonexisting embedding ID: 160498
Delete of nonexisting embedding ID: 160504
Delete of nonexisting embedding ID: 160506
Delete of nonexisting embedding ID: 160508
Delete of nonexisting embedding ID: 160511
Delete of nonexisting embedding ID: 160513
Delete of n

In [130]:
collection.get(ids=all_movie_ids.apply(str)[:10].tolist(), include=["embeddings"])["embeddings"]

In [133]:
np.array(embs).shape

(10, 64)

In [66]:
count = 0
for i in tqdm(range(len(user_embs))):
    user_emb = user_embs[i:i+1]
    mids = [x["movie_id"] for x in collection.query(query_embeddings=user_emb[:, 1, :], n_results=50)["metadatas"][0]] + \
    [x["movie_id"] for x in collection.query(query_embeddings=user_emb[:, 0, :], n_results=50)["metadatas"][0]]
    target = movies[movies["movie_id"]==test_model_input["movie_id"][i]]
    count += target["movie_id"].tolist()[0] in mids

100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 162541/162541 [06:47<00:00, 399.02it/s]


In [134]:
from tensorflow.python.keras.models import  save_model,load_model
from deepmatch.layers import custom_objects
save_model(user_embedding_model, 'user_emb_25m_0405.h5')



In [67]:
count/len(test_model_input["movie_id"])

0.14225949145138764

In [68]:
movies[movies["movie_id"]==133093]["genres"].tolist()

['Drama|Thriller']

In [69]:
movies[movies["title"].str.contains("A Quiet")]

Unnamed: 0,movieId,title,genres,hot,grade,movie_id
28024,133093,A Quiet Place to Kill (1970),Drama|Thriller,1,3,133093
30137,138366,A Quiet Life (2010),Drama,1,3,138366
36742,155425,A Quiet Place in the Country (1968),Drama|Horror,1,2,155425
38319,159710,A Quiet Life (1995),Drama,1,2,159710
39489,162726,A Quiet Passion (2016),Drama,1,3,162726
43835,172869,A Quiet Outpost (2011),Action|Drama|War,1,3,172869
49208,185029,A Quiet Place (2018),Drama|Horror|Thriller,3,3,185029


In [59]:
s_1 = "A Quiet Place(1970)"
s_2 = "A Quiet Place"

In [60]:
import difflib
def get_smilar_score(s_1, s_2):
    return difflib.SequenceMatcher(None, s_1, s_2).quick_ratio()

In [61]:
get_smilar_score(s_1, s_2)

0.8125

In [79]:
movies[movies["genres"].str.startswith("Horror")]

Unnamed: 0,movieId,title,genres,movie_id


In [80]:
movies[movies["title"].str.contains("Shining")]

Unnamed: 0,movieId,title,genres,movie_id
1225,1258,"Shining, The (1980)",Horror,1258
8994,26791,Shining Through (1992),Drama|Romance|Thriller|War,26791
19488,101329,Shining Night: A Portrait of Composer Morten L...,Documentary,101329
24167,120823,One Bright Shining Moment (2005),Documentary,120823
26184,125361,The Shining Hour (1938),Drama|Romance,125361
27846,130328,A Bright Shining Lie (1998),Drama|War,130328
50098,180263,The Shining (1997),Drama|Horror|Thriller,180263
60874,204362,Our Shining Days (2017),Comedy|Drama|Romance,204362


In [64]:
movies[movies["movieId"]==183869]

Unnamed: 0,movieId,title,genres,movie_id
51773,183869,Hereditary (2018),(no genres listed),183869


In [None]:
movies[movies["title"].apply(lambda x:"The Shining" in x)]

In [80]:
movies

Unnamed: 0,movieId,title,genres,movie_id
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,1
1,2,Jumanji (1995),Adventure|Children|Fantasy,2
2,3,Grumpier Old Men (1995),Comedy|Romance,3
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance,4
4,5,Father of the Bride Part II (1995),Comedy,5
...,...,...,...,...
62418,209157,We (2018),Drama,209157
62419,209159,Window of the Soul (2001),Documentary,209159
62420,209163,Bad Poems (2018),Comedy|Drama,209163
62421,209169,A Girl Thing (2001),(no genres listed),209169


In [79]:
set(np.concatenate(movies["genres"].apply(lambda x:x.split("|"))))

{'(no genres listed)',
 'Action',
 'Adventure',
 'Animation',
 'Children',
 'Comedy',
 'Crime',
 'Documentary',
 'Drama',
 'Fantasy',
 'Film-Noir',
 'Horror',
 'IMAX',
 'Musical',
 'Mystery',
 'Romance',
 'Sci-Fi',
 'Thriller',
 'War',
 'Western'}

In [75]:
collection.get(limit=10)

{'ids': ['1',
  '10',
  '100',
  '1000',
  '100001',
  '100003',
  '100008',
  '100015',
  '100017',
  '100032'],
 'embeddings': None,
 'metadatas': [{'genres': 'Adventure|Animation|Children|Comedy|Fantasy',
   'movieId': 1,
   'movie_id': 1,
   'title': 'Toy Story (1995)'},
  {'genres': 'Action|Adventure|Thriller',
   'movieId': 10,
   'movie_id': 10,
   'title': 'GoldenEye (1995)'},
  {'genres': 'Drama|Thriller',
   'movieId': 100,
   'movie_id': 100,
   'title': 'City Hall (1996)'},
  {'genres': 'Crime',
   'movieId': 1000,
   'movie_id': 1000,
   'title': 'Curdled (1996)'},
  {'genres': 'Comedy|Drama',
   'movieId': 100001,
   'movie_id': 100001,
   'title': 'Comic, The (1969)'},
  {'genres': 'Comedy',
   'movieId': 100003,
   'movie_id': 100003,
   'title': 'Up in Smoke (1957)'},
  {'genres': 'Documentary',
   'movieId': 100008,
   'movie_id': 100008,
   'title': 'Flaw, The (2011)'},
  {'genres': 'Crime|Drama|Thriller',
   'movieId': 100015,
   'movie_id': 100015,
   'title': 'Chi