In [36]:
import os

import faiss
import numpy as np
import pandas as pd

In [9]:
# Simulated (in-memory) database

class MemDatabase:
    """In-memory stand-in for a database, backed by the ``ml-1m`` ``.dat`` files.

    Each table is read with pandas from ``<data_path>/ml-1m/``. Nothing is
    read at construction time; call :meth:`load_data` (or one of the
    ``load_*`` methods) to actually touch the disk.
    """

    def __init__(self, data_path="./"):
        """Remember the dataset location and the column names of each table.

        Args:
            data_path: Directory that contains the ``ml-1m`` folder. A
                trailing path separator is optional.
        """
        self.data_path = data_path
        self.user_cols = ['user_id', 'gender', 'age', 'occupation', 'zip']
        self.rating_cols = ['user_id', 'movie_id', 'rating', 'timestamp']
        self.movie_cols = ['movie_id', 'title', 'genres']

    def _dataset_file(self, filename):
        """Return the path of ``filename`` inside the ``ml-1m`` folder."""
        # os.path.join only inserts a separator when one is missing, so both
        # "./data" and "./data/" resolve to the same file.
        return os.path.join(self.data_path, "ml-1m", filename)

    def _read_dat(self, filename, columns, **read_kwargs):
        """Read one header-less, ``::``-delimited file into a DataFrame.

        Args:
            filename: File name inside the ``ml-1m`` folder.
            columns: Column names to assign, in file order.
            **read_kwargs: Extra keyword arguments forwarded to ``pd.read_csv``.
        """
        # A multi-character separator is interpreted as a regular expression
        # by pandas, which is only supported by the Python parsing engine.
        return pd.read_csv(
            self._dataset_file(filename),
            sep='::',
            header=None,
            names=columns,
            engine='python',
            **read_kwargs,
        )

    def load_users(self):
        """Load the user table (columns: ``self.user_cols``)."""
        return self._read_dat("users.dat", self.user_cols)

    def load_ratings(self):
        """Load the rating table (columns: ``self.rating_cols``)."""
        return self._read_dat("ratings.dat", self.rating_cols)

    def load_movies(self):
        """Load the movie table and reduce ``genres`` to its first entry.

        ``genres`` is a ``|``-separated list in the file; only the part before
        the first ``|`` is kept.
        """
        # latin-1 decodes every byte as-is. The previous "unicode_escape"
        # codec additionally interpreted backslash sequences inside the file
        # content, which could alter or reject titles containing a backslash.
        movies = self._read_dat("movies.dat", self.movie_cols, encoding="latin-1")
        movies['genres'] = movies['genres'].str.split('|').str[0]
        return movies

    def load_data(self):
        """Load all three tables and join ratings with movie metadata.

        Sets ``self.users``, ``self.ratings``, ``self.movies`` and
        ``self.user_ratting_movies`` (ratings merged with movies on
        ``movie_id``; pandas' default inner join).
        """
        self.users = self.load_users()
        self.ratings = self.load_ratings()
        self.movies = self.load_movies()
        # Attribute name (including its spelling) is kept as-is because other
        # cells read it.
        self.user_ratting_movies = pd.merge(self.ratings, self.movies, on='movie_id')
        
# Usage example: point the database at the current directory and load every table.
db = MemDatabase(data_path="./")
db.load_data()

In [13]:
import json

tbl_ratting_movies = db.user_ratting_movies

# Upper bound on how many of the user's rated movies to show.
N_MAX = 5

filter_user_id = 1345

# Every rating row (already joined with movie metadata) for this user.
user_rows = tbl_ratting_movies[tbl_ratting_movies['user_id'] == filter_user_id]

# DataFrame.sample raises ValueError when n exceeds the number of rows, so cap
# the sample size at what is available. The fixed random_state keeps the pick
# reproducible across runs.
json_str = user_rows \
            .sample(n=min(N_MAX, len(user_rows)), random_state=42) \
            .to_json(orient='records')

# Round-trip through JSON to get a plain list of dicts.
movies = json.loads(json_str)

print(movies)

[{'user_id': 1345, 'movie_id': 2889, 'rating': 1, 'timestamp': 974775450, 'title': 'Mystery, Alaska (1999)', 'genres': 'Comedy'}, {'user_id': 1345, 'movie_id': 514, 'rating': 2, 'timestamp': 974773583, 'title': 'Ref, The (1994)', 'genres': 'Comedy'}, {'user_id': 1345, 'movie_id': 380, 'rating': 4, 'timestamp': 974774382, 'title': 'True Lies (1994)', 'genres': 'Action'}, {'user_id': 1345, 'movie_id': 2571, 'rating': 5, 'timestamp': 974776880, 'title': 'Matrix, The (1999)', 'genres': 'Action'}, {'user_id': 1345, 'movie_id': 153, 'rating': 5, 'timestamp': 974775947, 'title': 'Batman Forever (1995)', 'genres': 'Action'}]


In [25]:
# Load the four NumPy arrays stored under model/.
# NOTE(review): presumably the *_embs arrays are 2-D (one row per entity) and
# the *_idx arrays give the real user/movie id for each row position — the
# cells below use them that way; confirm against the code that wrote the files.
item_embs = np.load("model/item_embs.npy")
user_embs = np.load("model/user_embs.npy")
user_id_idx = np.load("model/user_id_idx.npy")
movie_id_idx = np.load("model/movie_id_idx.npy")

In [27]:
# Width of each embedding vector, read from the data instead of hard-coded so
# the index always matches the vectors that are added to it.
embedding_dim = item_embs.shape[1]

# zip() would silently stop at the shorter input; fail loudly instead when an
# id has no embedding row (the index-based loop used to raise here as well).
if len(user_id_idx) > len(user_embs):
    raise ValueError(
        f"user_id_idx has {len(user_id_idx)} entries but user_embs has only "
        f"{len(user_embs)} rows"
    )

# Key/value lookup: user_id -> that user's embedding row.
user_id_emb_dict = dict(zip(user_id_idx, user_embs))

# Put every item embedding into an exact inner-product index. Row i of the
# index corresponds to row i of item_embs.
faiss_index = faiss.IndexFlatIP(embedding_dim)
# faiss works on C-contiguous float32 matrices; this is a no-op when
# item_embs already has that layout and dtype.
faiss_index.add(np.ascontiguousarray(item_embs, dtype=np.float32))

In [33]:
# Retrieve the nearest items for one user id

TOP_N = 100

user_id = 1345
# faiss searches a batch of queries, so the single user vector is wrapped in a
# list and turned into a (1, dim) float32 matrix.
user_emb_query = [ user_id_emb_dict[user_id] ]
D, I = faiss_index.search(np.ascontiguousarray(user_emb_query, dtype=np.float32), TOP_N)
search_idx_result = I[0]

# faiss fills missing results with -1 when the index holds fewer than TOP_N
# vectors; indexing movie_id_idx with -1 would silently return its last entry,
# so drop those slots before translating positions into movie ids.
valid_idx = search_idx_result[search_idx_result >= 0]

# .tolist() yields plain Python ints, so the printed list looks the same
# regardless of how the installed NumPy version formats its scalar types.
movie_ids = movie_id_idx[valid_idx].tolist()

print(movie_ids)

[39, 2129, 2167, 1744, 1174, 21, 217, 2959, 1, 1100, 993, 348, 333, 2192, 1706, 2893, 2496, 1505, 1450, 2198, 2203, 1619, 2163, 484, 1415, 347, 615, 575, 1084, 34, 2132, 1088, 1399, 336, 229, 1704, 538, 428, 121, 1735, 1126, 2487, 1928, 45, 2846, 2204, 2899, 2098, 1385, 175, 427, 339, 2401, 2555, 3509, 2962, 2348, 3030, 549, 1585, 2786, 671, 151, 2091, 2658, 1594, 458, 1737, 1343, 2839, 2111, 695, 3567, 2470, 1179, 11, 1691, 1134, 226, 1398, 1313, 2673, 1294, 1152, 2376, 786, 2514, 51, 1108, 1696, 650, 1667, 2309, 1991, 102, 3032, 1989, 978, 2942, 1599]


In [35]:
tbl_movies = db.movies

# Boolean mask: True for every movie whose id was returned by the search.
is_retrieved = tbl_movies['movie_id'].isin(movie_ids)

# Masking keeps the rows in tbl_movies' own order, not in the order of
# movie_ids, so head() shows the first matches by table position.
filtered_df = tbl_movies.loc[is_retrieved]
filtered_df.head()

Unnamed: 0,movie_id,title,genres
0,1,Toy Story (1995),Animation
10,11,"American President, The (1995)",Comedy
20,21,Get Shorty (1995),Action
33,34,Babe (1995),Children's
38,39,Clueless (1995),Comedy
