## 基于Embedding的电影相关推荐

知识：
* word2vec：输入(doc, words)，得到word embedding
* item2vec：输入（userid, itemids），得到item embedding

说明：
* 使用标题/内容的分词embedding作推荐，属于内容相似推荐
* 使用行为列表作embedding作推荐，属于行为相关推荐，效果比内容相似推荐更好

延伸：
* 把word embedding进行加和、平均，就得到了document embedding；
* 把item embedding进行加和、平均，就得到了user embedding；

## 获取数据

In [1]:
import pandas as pd
from gensim.models import Word2Vec
from tqdm import  tqdm

In [2]:
df = pd.read_csv("./ml-latest-small/ratings.csv")
df.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,964982703
1,1,3,4.0,964981247
2,1,6,4.0,964982224
3,1,47,5.0,964983815
4,1,50,5.0,964982931


In [3]:
df["rating"].mean()

3.501556983616962

In [4]:
# 只取平均分以上的数据，作为喜欢的列表
df = df[df["rating"] > df["rating"].mean()].copy()
df.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,964982703
1,1,3,4.0,964981247
2,1,6,4.0,964982224
3,1,47,5.0,964983815
4,1,50,5.0,964982931


In [5]:
# # 聚合得到userId，movieId列表
# df_group = df.groupby(['userId'])['movieId'].apply(lambda x: ' '.join([str(m) for m in x])).reset_index()
# df_group.head()

## Item2Vec

In [6]:
def get_w2v(sentences, f1, f2, values, emb_size): 
    
    model = Word2Vec(sentences, size=emb_size, sg=1, window=10, seed=2020, workers=24, min_count=5, iter=10)
    
    w2v=[]
    for v in values:
        try:
            a = [int(v)]
            a.extend(model[str(v)])
            w2v.append(a)
        except: 
#         except Exception as e: # [wtb]
#             print(e) # [wtb]
            a = [int(v)]
            a.extend([0] * emb_size)
            w2v.append(a)
    
    out_df = []
    for li in w2v:
        out_df.append([li[0],li[1:]])
        
    out_df = pd.DataFrame(out_df)
    out_df.columns = [f2] + ['vector']
    
    return out_df

    
def item2vec(df_, f1, f2, emb_size):
    
    df = df_.copy()
    df = df.sort_values('timestamp', ascending=True)
    # 聚合
#     sentences = df.groupby([f1])[f2].agg({list}).reset_index()['list'].values.tolist()
    sentences = df.groupby([f1])[f2].apply(lambda x:x.tolist()).tolist() # [wtb]
    for i in range(len(sentences)):
        sentences[i] = [str(x) for x in sentences[i]]
    
    values = df[f2].unique()
    out_df = get_w2v(sentences, f1, f2, values, emb_size)
    
    return out_df

In [7]:
emb_df = item2vec(df, 'userId', 'movieId', 16)

  if __name__ == '__main__':


In [96]:
emb_df

Unnamed: 0,movieId,vector
0,595,"[1.1373544, -0.021706775, 0.8192812, -1.206789..."
1,218,"[1.2743996, 0.32304937, 0.31059283, -1.6139778..."
2,165,"[1.567091, 0.50059074, 1.1967809, -1.1232501, ..."
3,161,"[1.7992687, 0.5040063, 0.7863472, -1.358738, -..."
4,222,"[1.0681629, 0.21331586, 0.30408907, -1.3941166..."
...,...,...
6293,193609,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]"
6294,159755,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]"
6295,130444,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]"
6296,166203,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]"


In [100]:
df[df['movieId']==130444]

Unnamed: 0,userId,movieId,rating,timestamp
30157,210,130444,4.0,1537632310


## DeepWalk

In [101]:
def deepwalk_walk(walk_length, start_node):

    walk = [start_node]

    while len(walk) <= walk_length:
        cur = walk[-1]
        try:
            cur_nbrs = item_dict_new[cur]
            walk.append(random.choice(cur_nbrs))
        except:
            break
    return walk

def _simulate_walks(nodes, num_walks, walk_length,):
    walks = []
    for _ in tqdm(range(num_walks)):
        random.shuffle(nodes)
        for v in nodes:           
            walks.append(deepwalk_walk(walk_length=walk_length, start_node=v))
    return walks

def deepwalk(df_, f1, f2, emb_size):
    
    df = df_.copy()
    
    item_dict_new = {}
    all_item = []
    
    # 聚合
    sentences = df.groupby([f1])[f2].agg({list}).reset_index()['list'].values.tolist()
    for i in range(len(sentences)):
        sentences[i] = [str(x) for x in sentences[i]]
    
    # 提取共现商品
    for sentence in sentences:
        length = len(sentence)
        for position, itemId in enumerate(sentence):
            # 提取窗口内可组合商品（窗口可以调整）
            for i in range(position-1, position+2):
                if (i < 0) | (i >= length) | (i == position):
                    continue
                try:
                    item_dict_new[itemId].append(sentence[i]) 
                except:
                    item_dict_new[itemId] = [sentence[i]]
    
    nodes = [k for k in item_dict_new]
    num_walks = 5
    walk_length = 20
    results = _simulate_walks(nodes, num_walks, walk_length)
    len(results)
    
    values = df[f2].unique()
    
    out_df = get_w2v(results, f1, f2, values, emb_size)
    
    return out_df

In [102]:
import random
emb_df = deepwalk(df, 'userId', 'movieId', 16)

100%|██████████| 5/5 [00:00<00:00, 128.68it/s]
  a.extend(model[str(v)])


In [103]:
emb_df

Unnamed: 0,movieId,vector
0,1,"[0.009563226, 0.016655952, -0.00384119, 0.0240..."
1,3,"[-0.017449005, 0.016240655, 0.0057858294, -0.0..."
2,6,"[0.004522709, -0.02596823, -0.028129607, 0.017..."
3,47,"[0.009782434, -0.013077681, 0.02617319, -0.014..."
4,50,"[0.0034451142, 0.018996798, -0.031216076, -0.0..."
...,...,...
6293,146309,"[0.012493482, -0.026021998, -0.0076573594, 0.0..."
6294,147657,"[0.018867752, -0.023672048, 0.024560645, -0.01..."
6295,156726,"[-0.0019405718, -0.012622927, 0.02136726, 0.02..."
6296,160527,"[0.005002662, 0.004477478, 0.0028757523, -0.00..."


## 对于给定电影算出最相似的10个电影

In [104]:
df_movie = pd.read_csv("./datas/ml-latest-small/movies.csv")
df_movie.head()

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


In [105]:
df_merge = pd.merge(df_movie, emb_df, on='movieId', how='left')
df_merge.head()

Unnamed: 0,movieId,title,genres,vector
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,"[0.009563226, 0.016655952, -0.00384119, 0.0240..."
1,2,Jumanji (1995),Adventure|Children|Fantasy,"[0.009672286, -0.0046788617, 0.013131944, -0.0..."
2,3,Grumpier Old Men (1995),Comedy|Romance,"[-0.017449005, 0.016240655, 0.0057858294, -0.0..."
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance,
4,5,Father of the Bride Part II (1995),Comedy,"[-0.026297674, 0.02462507, -0.020405838, -0.01..."


In [106]:
# 随便挑选一个电影：4018	What Women Want (2000)
movie_id = 4018
df_merge.loc[df_merge["movieId"]==movie_id]

Unnamed: 0,movieId,title,genres,vector
3002,4018,What Women Want (2000),Comedy|Romance,"[0.024533588, 0.0046383645, 0.0018839509, -0.0..."


In [107]:
movie_embedding = df_merge.loc[df_merge["movieId"]==movie_id, "vector"].iloc[0]
movie_embedding

[0.024533588,
 0.0046383645,
 0.0018839509,
 -0.0002835982,
 0.024037596,
 0.0024099012,
 0.0026918708,
 -0.0039753052,
 -0.023770606,
 -0.020398686,
 0.010802483,
 0.021639591,
 -0.016242243,
 0.027446344,
 -0.020431234,
 -0.0005121919]

In [108]:
# 余弦相似度
from scipy.spatial import distance
df_merge["sim_value"] = df_merge["vector"].map(lambda x : 1 - distance.cosine(movie_embedding, x))

In [109]:
df_merge[["movieId", "title", "genres", "sim_value"]].head(3)

Unnamed: 0,movieId,title,genres,sim_value
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,-0.006609
1,2,Jumanji (1995),Adventure|Children|Fantasy,0.325056
2,3,Grumpier Old Men (1995),Comedy|Romance,-0.47534


In [110]:
# 按相似度降序排列，查询前10条
df_merge.sort_values(by="sim_value", ascending=False)[["movieId", "title", "genres", "sim_value"]].head(10)

Unnamed: 0,movieId,title,genres,sim_value
3002,4018,What Women Want (2000),Comedy|Romance,1.0
2684,3594,Center Stage (2000),Drama|Musical,0.773671
4908,7360,Dawn of the Dead (2004),Action|Drama|Horror|Thriller,0.749598
7008,68073,Pirate Radio (2009),Comedy|Drama,0.724681
9510,170993,Mini's First Time (2006),Comedy|Crime|Drama,0.711066
858,1129,Escape from New York (1981),Action|Adventure|Sci-Fi|Thriller,0.696947
326,368,Maverick (1994),Adventure|Comedy|Western,0.688153
2635,3526,Parenthood (1989),Comedy|Drama,0.683185
552,662,Fear (1996),Thriller,0.681208
718,937,Love in the Afternoon (1957),Comedy|Romance,0.680697


## Faiss相似检索
- 压缩
- 分群
- index

In [111]:
emb_df.head()

Unnamed: 0,movieId,vector
0,1,"[0.009563226, 0.016655952, -0.00384119, 0.0240..."
1,3,"[-0.017449005, 0.016240655, 0.0057858294, -0.0..."
2,6,"[0.004522709, -0.02596823, -0.028129607, 0.017..."
3,47,"[0.009782434, -0.013077681, 0.02617319, -0.014..."
4,50,"[0.0034451142, 0.018996798, -0.031216076, -0.0..."


In [112]:
emb_df.shape

(6298, 2)

In [113]:
import faiss
index = faiss.IndexFlatIP(16)
index.add(np.array(emb_df['vector'].tolist()))
# faiss.normalize_L2(user_embs)
D, I = index.search(np.array(emb_df['vector'].tolist()), 10)

In [114]:
D

array([[0.00348195, 0.00337485, 0.00328093, ..., 0.00295225, 0.00293264,
        0.00293117],
       [0.00507831, 0.00483656, 0.004785  , ..., 0.00369507, 0.00365712,
        0.00362798],
       [0.00485329, 0.00467913, 0.00447669, ..., 0.00407606, 0.00393377,
        0.00348858],
       ...,
       [0.00553616, 0.00471477, 0.00462394, ..., 0.00395534, 0.0039509 ,
        0.0039442 ],
       [0.00420992, 0.00414417, 0.00380386, ..., 0.0033235 , 0.00326754,
        0.00320414],
       [0.00433764, 0.00391496, 0.00349112, ..., 0.00340394, 0.00338355,
        0.00336748]], dtype=float32)

In [115]:
I

array([[4263, 4515, 3195, ..., 5534,  240, 5070],
       [1075,    1, 3152, ..., 4187, 6170,  613],
       [1136,    2, 4074, ..., 2770, 2474, 5668],
       ...,
       [6295, 3125, 4871, ..., 5211,   67, 5369],
       [3296, 5715, 6296, ..., 1001, 6176, 5113],
       [6297,  292, 3144, ..., 6291, 3179, 3301]])