In [None]:
# Pinned to the release this notebook was originally run with, so a re-run installs the same code.
# %pip (rather than !pip) installs into the environment of the running kernel.
%pip install -q lightfm==1.16

Collecting lightfm
[?25l  Downloading https://files.pythonhosted.org/packages/5e/fe/8864d723daa8e5afc74080ce510c30f7ad52facf6a157d4b42dec83dfab4/lightfm-1.16.tar.gz (310kB)
[K     |█                               | 10kB 26.5MB/s eta 0:00:01[K     |██▏                             | 20kB 33.1MB/s eta 0:00:01[K     |███▏                            | 30kB 24.3MB/s eta 0:00:01[K     |████▎                           | 40kB 22.2MB/s eta 0:00:01[K     |█████▎                          | 51kB 24.6MB/s eta 0:00:01[K     |██████▍                         | 61kB 22.3MB/s eta 0:00:01[K     |███████▍                        | 71kB 21.4MB/s eta 0:00:01[K     |████████▌                       | 81kB 23.0MB/s eta 0:00:01[K     |█████████▌                      | 92kB 22.3MB/s eta 0:00:01[K     |██████████▋                     | 102kB 21.4MB/s eta 0:00:01[K     |███████████▋                    | 112kB 21.4MB/s eta 0:00:01[K     |████████████▊                   | 122kB 21.4MB/s eta 0:

In [None]:
# Fetch MovieLens-1M. -q keeps the transfer log out of the saved notebook.
!wget -q http://files.grouplens.org/datasets/movielens/ml-1m.zip -O data.zip
# -o overwrites an existing ml-1m/ directory without prompting, so the cell can be re-run safely.
!unzip -o -q data.zip

--2020-12-08 20:10:59--  http://files.grouplens.org/datasets/movielens/ml-1m.zip
Resolving files.grouplens.org (files.grouplens.org)... 128.101.65.152
Connecting to files.grouplens.org (files.grouplens.org)|128.101.65.152|:80... connected.
HTTP request sent, awaiting response... 200 OK
Length: 5917549 (5.6M) [application/zip]
Saving to: ‘data.zip’


2020-12-08 20:10:59 (26.5 MB/s) - ‘data.zip’ saved [5917549/5917549]

Archive:  data.zip
   creating: ml-1m/
  inflating: ml-1m/movies.dat        
  inflating: ml-1m/ratings.dat       
  inflating: ml-1m/README            
  inflating: ml-1m/users.dat         


In [30]:
import pandas as pd
import numpy as np
import scipy.sparse as sp
import random

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import DataLoader

from collections import defaultdict
from tqdm.autonotebook import tqdm
from lightfm import LightFM
from sklearn.neighbors import KDTree
from sklearn.metrics import ndcg_score

В качестве датасета возьму MovieLens (ml-1m): его используют практически во всех статьях, поэтому на нём намного удобнее оценивать и сравнивать результаты.

In [13]:
# The .dat files are '::'-separated and have no header row; a multi-character
# separator requires the python parsing engine.
ratings = pd.read_csv('ml-1m/ratings.dat', sep='::', header=None,
        names=['user_id', 'movie_id', 'rating', 'timestamp'],
        engine='python')
# NOTE(review): movies.dat appears to be Latin-1 encoded (accented titles); without an
# explicit encoding recent pandas versions fail with UnicodeDecodeError. latin-1 can
# decode any byte sequence, so this is safe for pure-ASCII files as well.
movie_info = pd.read_csv('ml-1m/movies.dat', sep='::', header=None,
        names=['movie_id', 'name', 'category'], engine='python',
        encoding='latin-1')

# Shift the 1-based ids to 0-based so they can be used directly as matrix /
# embedding indices. Done in the same cell as the read, so re-running is idempotent.
ratings['user_id'] -= 1
ratings['movie_id'] -= 1
movie_info['movie_id'] -= 1

In [14]:
# Leave-one-out split: for every user the chronologically last rating goes to
# `test`, everything before it goes to `train`.
ratings_sorted = ratings.sort_values(['user_id', 'timestamp'])
# Kept for backward compatibility with any later cell that reads it.
unique_users = ratings_sorted.user_id.unique()

# duplicated(keep='last') is False only on the last row of each user, so its
# negation marks exactly the held-out rows. This replaces a per-user Python loop
# (one full boolean scan per user) and does not leak loop variables such as
# `user_id` into the notebook namespace.
is_last_rating = ~ratings_sorted.duplicated(subset='user_id', keep='last')
test = ratings_sorted[is_last_rating]
train = ratings_sorted[~is_last_rating]

assert len(train) + len(test) == len(ratings_sorted)

In [15]:
# Summary statistics of the training split.
train.describe()

Unnamed: 0,user_id,movie_id,rating,timestamp
count,994169.0,994169.0,994169.0,994169.0
mean,3023.536725,1863.12659,3.581411,972211600.0
std,1728.320003,1095.374307,1.116737,12081430.0
min,0.0,0.0,1.0,956703900.0
25%,1505.0,1028.0,3.0,965302100.0
50%,3071.0,1833.0,4.0,972972300.0
75%,4475.0,2769.0,4.0,975218200.0
max,6039.0,3951.0,5.0,1046455000.0


In [16]:
# Summary statistics of the held-out split.
test.describe()

Unnamed: 0,user_id,movie_id,rating,timestamp
count,6040.0,6040.0,6040.0,6040.0
mean,3019.5,2097.166887,3.606788,977521700.0
std,1743.742145,1177.903116,1.175389,20085960.0
min,0.0,0.0,1.0,956712400.0
25%,1509.75,1135.0,3.0,965491200.0
50%,3019.5,2125.0,4.0,974502400.0
75%,4529.25,3146.0,5.0,976426600.0
max,6039.0,3951.0,5.0,1046455000.0


In [17]:
users = train["user_id"]
items = train["movie_id"]

# Size the matrix from both splits. Inferring the shape from `train` alone would
# leave any user/movie that only occurs in `test` out of range, and scoring it
# during evaluation would then index past the end of the model's tables.
matrix_shape = (
    int(max(train["user_id"].max(), test["user_id"].max())) + 1,
    int(max(train["movie_id"].max(), test["movie_id"].max())) + 1,
)

# Binary implicit-feedback matrix: 1 wherever the user rated the movie in train.
user_item = sp.coo_matrix((np.ones_like(users), (users, items)), shape=matrix_shape)
user_item_csr = user_item.tocsr()

In [18]:
def get_user_history(user_id, user_item):
    """Return the column indices of the non-zero entries in row ``user_id``.

    ``user_item`` must support row indexing and ``.nonzero()`` (e.g. a SciPy
    CSR matrix). The result is a plain list.
    """
    _, item_ids = user_item[user_id].nonzero()
    return list(item_ids)

def get_movies(idxs):
    """Look up movie metadata for the given movie ids.

    Ids that are not present in the global ``movie_info`` table are silently
    skipped; the remaining rows are returned in the order given, indexed by
    ``movie_id``.
    """
    # Index the table once; Index membership is a hash lookup, so the filter is
    # linear in len(idxs) instead of rebuilding a set of all ids per element.
    movies = movie_info.set_index('movie_id')
    known = [i for i in idxs if i in movies.index]
    return movies.loc[known]

In [19]:
num_users, num_items = user_item_csr.shape
all_items = set(range(num_items))

# For every user, the item ids with no interaction in the training matrix.
# Built with a comprehension so that no loop variable (in particular `user_id`,
# which later cells use to pick the user to inspect) is left behind in the
# notebook namespace when this cell is re-run.
unknown_items = {
    uid: list(all_items - set(user_item_csr[uid].nonzero()[1]))
    for uid in range(num_users)
}

In [61]:
class BasicRecommender:
    """Shared evaluation and recommendation helpers for implicit-feedback models.

    Subclasses must implement ``score`` and may implement ``get_similar_items``.
    """

    def rmse(self, user_ids, item_ids):
        """Root-mean-square deviation of the predicted scores from 1.

        Every (user, item) pair passed in is treated as an observed interaction,
        i.e. its target value is 1.
        """
        err = (self.score(user_ids, item_ids) - 1) ** 2
        return np.sqrt(err.mean())

    def compute_metrics(self, test, user_item, unknown_items, k=10, n_neg=99):
        """Print train/test RMSE and leave-one-out HR@k / NDCG@k.

        Parameters
        ----------
        test : DataFrame with ``user_id`` and ``movie_id`` columns (held-out interactions).
        user_item : sparse matrix of training interactions; only ``.nonzero()`` is used.
        unknown_items : mapping user_id -> item ids the user has not interacted with in train.
        k : cut-off of the ranking metrics.
        n_neg : number of sampled negatives each user's held-out items are ranked against.

        Raises
        ------
        ValueError
            (from ``np.random.choice``) if a user has fewer than ``n_neg`` candidate negatives.
        """
        print('Train RMSE:', self.rmse(*user_item.nonzero()))
        print('Test RMSE:', self.rmse(test.user_id.values, test.movie_id.values))

        hr = []
        ndcg = []
        # Group once instead of re-scanning `test` with a boolean mask per user.
        held_out_by_user = test.groupby('user_id', sort=False)['movie_id']
        for user_id, held_out in tqdm(held_out_by_user, total=held_out_by_user.ngroups):
            items = held_out.tolist()
            n_pos = len(items)
            # `unknown_items` is derived from train only, so it still contains the
            # held-out items; drop them so a positive is never sampled as a negative.
            candidates = np.setdiff1d(unknown_items[user_id], items)
            neg = list(np.random.choice(candidates, size=n_neg, replace=False))
            items.extend(neg)
            pred = np.asarray(self.score(int(user_id), items))
            # Positives occupy the first n_pos slots of `items`.
            target = np.zeros(len(items))
            target[:n_pos] = 1
            top_k = pred.argsort()[::-1][:k]
            hr.append(int((top_k < n_pos).any()))
            ndcg.append(ndcg_score([target], [pred], k=k))
        print(f"HR@{k} = {np.mean(hr):.4f}")
        print(f"NDCG@{k} = {np.mean(ndcg):.4f}")

    def recommend(self, user_id, unknown_items, k=10):
        """Return the ``k`` highest-scored items among ``unknown_items[user_id]``.

        Ties keep the order in which the items appear in ``unknown_items[user_id]``.
        """
        not_recommended = np.asarray(unknown_items[user_id])
        scores = np.asarray(self.score(user_id, not_recommended))
        # A stable sort on the negated scores reproduces the descending order that
        # Python's stable sorted(..., key=-score) would give.
        top = np.argsort(-scores, kind='stable')[:k]
        return not_recommended[top]

    def get_similar_items(self, item_id, k=10):
        """Return the ids of the ``k`` items most similar to ``item_id`` (subclass hook)."""
        raise NotImplementedError()

    def score(self, user, item):
        """Return one predicted score per (user, item) pair (subclass hook)."""
        raise NotImplementedError()

In [None]:
class WARP(BasicRecommender):
    """Recommender backed by a LightFM model trained with the ``'warp'`` loss."""

    def __init__(self, num_components=10, lr=0.05):
        lightfm_options = dict(
            no_components=num_components,
            learning_rate=lr,
            loss='warp',
        )
        self.model = LightFM(**lightfm_options)

    @property
    def item_embeddings(self):
        """Second element of ``LightFM.get_item_representations()``."""
        representations = self.model.get_item_representations()
        return representations[1]

    @property
    def user_embeddings(self):
        """Second element of ``LightFM.get_user_representations()``."""
        representations = self.model.get_user_representations()
        return representations[1]

    def fit(self, user_item, epochs=100):
        """Fit the underlying LightFM model on the interaction matrix."""
        self.model.fit(user_item, epochs=epochs, verbose=True)

    def get_similar_items(self, item_id, k=10):
        """Return the ``k`` item ids closest to ``item_id`` by Euclidean distance between embeddings."""
        vectors = self.item_embeddings
        distances = np.linalg.norm(vectors - vectors[item_id], axis=1)
        return np.argsort(distances)[:k]

    def score(self, user_ids, item_ids):
        """Delegate scoring to ``LightFM.predict``."""
        predictions = self.model.predict(user_ids, item_ids)
        return predictions

In [None]:
# Train a 10-component WARP model for 30 epochs on the training interactions.
warp = WARP(num_components=10, lr=0.05)
warp.fit(user_item_csr, epochs=30)

Epoch: 100%|██████████| 30/30 [00:30<00:00,  1.02s/it]


In [None]:
# Train/test RMSE plus HR@10 and NDCG@10 for the WARP model.
warp.compute_metrics(test, user_item, unknown_items, 10)

Train RMSE: 7.785028
Test RMSE: 5.3703775


HBox(children=(FloatProgress(value=0.0, max=6040.0), HTML(value='')))


HR@10 = 0.6692
NDCG@10 = 0.3913


Метрики хорошие. В статье NCF лучший результат такой: 

HR@10 = 0.705

NDCG@10 = 0.426

In [None]:
# User whose history and recommendations are inspected in the following cells.
user_id = 0

In [None]:
# First ten movies from the selected user's training history.
get_movies(get_user_history(user_id, user_item_csr)).iloc[:10]

Unnamed: 0_level_0,name,category
movie_id,Unnamed: 1_level_1,Unnamed: 2_level_1
0,Toy Story (1995),Animation|Children's|Comedy
149,Apollo 13 (1995),Drama
259,Star Wars: Episode IV - A New Hope (1977),Action|Adventure|Fantasy|Sci-Fi
526,Schindler's List (1993),Drama|War
530,"Secret Garden, The (1993)",Children's|Drama
587,Aladdin (1992),Animation|Children's|Comedy|Musical
593,Snow White and the Seven Dwarfs (1937),Animation|Children's|Musical
594,Beauty and the Beast (1991),Animation|Children's|Musical
607,Fargo (1996),Crime|Drama|Thriller
660,James and the Giant Peach (1996),Animation|Children's|Musical


In [None]:
# The selected user's held-out movie(s) from the test split.
get_movies(test[test.user_id == user_id].movie_id)

Unnamed: 0_level_0,name,category
movie_id,Unnamed: 1_level_1,Unnamed: 2_level_1
47,Pocahontas (1995),Animation|Children's|Musical|Romance


Рекомендации соответствуют по категориям, это радует.

In [None]:
# Top WARP recommendations among the items the user has no training interaction with.
get_movies(warp.recommend(user_id, unknown_items))

Unnamed: 0_level_0,name,category
movie_id,Unnamed: 1_level_1,Unnamed: 2_level_1
33,Babe (1995),Children's|Comedy|Drama
363,"Lion King, The (1994)",Animation|Children's|Musical
1281,Fantasia (1940),Animation|Children's|Musical
2395,Shakespeare in Love (1998),Comedy|Romance
2857,American Beauty (1999),Comedy|Drama
2079,Lady and the Tramp (1955),Animation|Children's|Comedy|Musical|Romance
1264,Groundhog Day (1993),Comedy|Romance
2080,"Little Mermaid, The (1989)",Animation|Children's|Comedy|Musical|Romance
317,"Shawshank Redemption, The (1994)",Drama
898,Singin' in the Rain (1952),Musical|Romance


Симилары тоже хорошие.

In [None]:
# Nearest neighbours of movie 0 in WARP item-embedding space; the query movie itself has distance 0.
get_movies(warp.get_similar_items(0))

Unnamed: 0_level_0,name,category
movie_id,Unnamed: 1_level_1,Unnamed: 2_level_1
0,Toy Story (1995),Animation|Children's|Comedy
587,Aladdin (1992),Animation|Children's|Comedy|Musical
363,"Lion King, The (1994)",Animation|Children's|Musical
33,Babe (1995),Children's|Comedy|Drama
594,Beauty and the Beast (1991),Animation|Children's|Musical
550,"Nightmare Before Christmas, The (1993)",Children's|Comedy|Musical
2293,Antz (1998),Animation|Children's
1072,Willy Wonka and the Chocolate Factory (1971),Adventure|Children's|Comedy|Fantasy
1264,Groundhog Day (1993),Comedy|Romance
2080,"Little Mermaid, The (1989)",Animation|Children's|Comedy|Musical|Romance


# Neural Collaborative Filtering

In [97]:
class NCF(nn.Module):
    """Two-branch user/item scoring network.

    One branch multiplies user and item embeddings element-wise (``*_gmf``
    tables); the other feeds concatenated user and item embeddings (``*_mlp``
    tables) through a small MLP. Both outputs are concatenated and mapped to a
    single sigmoid-activated score by ``self.nmf``.
    """

    def __init__(self,
                 num_users,
                 num_items,
                 gmf_dim=64,
                 mlp_dim=64,
                 hidden_dim=64,
                 dropout=0.2):
        super().__init__()

        # Each branch owns its own pair of embedding tables.
        self.user_emb_gmf = nn.Embedding(num_users, gmf_dim)
        self.item_emb_gmf = nn.Embedding(num_items, gmf_dim)
        self.user_emb_mlp = nn.Embedding(num_users, mlp_dim)
        self.item_emb_mlp = nn.Embedding(num_items, mlp_dim)

        mlp_layers = [
            nn.Dropout(p=dropout),
            nn.Linear(2 * mlp_dim, hidden_dim),
            nn.ReLU(),
            nn.Dropout(p=dropout),
            nn.Linear(hidden_dim, hidden_dim),
        ]
        self.mlp = nn.Sequential(*mlp_layers)

        # Fusion head over [gmf vector, mlp output].
        fusion_layers = [
            nn.Linear(gmf_dim + hidden_dim, 1),
            nn.Sigmoid(),
        ]
        self.nmf = nn.Sequential(*fusion_layers)

        self._init_weight_()

    def _init_weight_(self):
        """Re-initialise embeddings (normal, std=0.01) and linear layers (Kaiming uniform, zero bias)."""
        embedding_tables = (
            self.user_emb_gmf,
            self.item_emb_gmf,
            self.user_emb_mlp,
            self.item_emb_mlp,
        )
        for table in embedding_tables:
            nn.init.normal_(table.weight, std=0.01)

        linear_layers = [m for m in self.modules() if isinstance(m, nn.Linear)]
        for linear in linear_layers:
            nn.init.kaiming_uniform_(linear.weight)
            if linear.bias is not None:
                nn.init.zeros_(linear.bias)

    def forward(self, user, item):
        """Return a flat tensor with one sigmoid score per (user, item) index pair."""
        gmf_vec = self.user_emb_gmf(user) * self.item_emb_gmf(item)
        mlp_in = torch.cat([self.user_emb_mlp(user), self.item_emb_mlp(item)], dim=-1)
        fused = torch.cat([gmf_vec, self.mlp(mlp_in)], dim=-1)
        scores = self.nmf(fused)
        return scores.view(-1)

In [113]:
class NCFRecommender(BasicRecommender):
    """Trains an ``NCF`` network on implicit feedback with sampled negatives.

    Parameters
    ----------
    neg_size : int
        Negatives drawn per observed interaction each time a training set is built.
    batch_size : int
        Mini-batch size used for both training and scoring.
    lr : float
        Learning rate of the Adam optimizer.
    **kwargs
        Forwarded to ``NCF``; must contain ``num_users`` and ``num_items``.
    """

    def __init__(self, neg_size=5, batch_size=64, lr=1e-4, **kwargs):
        self.neg_size = neg_size
        self.batch_size = batch_size
        assert 'num_users' in kwargs and 'num_items' in kwargs, \
            'num_users and num_items must be passed as keyword arguments'
        self.num_users = kwargs['num_users']
        self.num_items = kwargs['num_items']

        self.positives = defaultdict(list)
        self.negatives = defaultdict(list)

        # Use the GPU when one is present, otherwise fall back to CPU instead of
        # failing on an unconditional .cuda() call.
        self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
        self.model = NCF(**kwargs).to(self.device)
        self.optimizer = torch.optim.Adam(self.model.parameters(), lr=lr)
        self.loss = nn.BCELoss()

    def fit(self, user_item, epochs=10):
        """Train for ``epochs`` passes; negatives are re-sampled at the start of every epoch."""
        self.build_data(user_item)
        self.model.train()

        pbar = tqdm(range(epochs), total=epochs)
        for epoch in pbar:
            losses = self.train_epoch(user_item, epoch)
            pbar.set_postfix_str(f'loss: {np.mean(losses)}')

    def train_epoch(self, user_item, n_epoch):
        """Run one optimisation pass and return the list of per-batch loss values."""
        # score() switches the model to eval mode; make sure dropout is active again.
        self.model.train()

        pbar = tqdm(self.get_dataloader(user_item))
        pbar.set_description(f"Epoch {n_epoch}")

        losses = []
        for batch in pbar:
            # Each batch is (B, 3): columns are user id, item id, 0/1 target.
            users, items, targets = batch.T.to(self.device)
            pred = self.model(users, items)
            loss = self.loss(pred, targets.float())

            self.optimizer.zero_grad()
            loss.backward()
            self.optimizer.step()

            losses.append(loss.item())

        return losses

    def build_data(self, user_item):
        """Collect, per user, the interacted items and the complementary candidate negatives."""
        # Start from scratch so that calling fit() again does not append the same
        # positives a second time.
        self.positives = defaultdict(list)
        self.negatives = defaultdict(list)

        for user_id, pos in zip(*user_item.nonzero()):
            self.positives[int(user_id)].append(int(pos))

        items = set(range(self.num_items))
        for user_id, pos in self.positives.items():
            self.negatives[user_id] = list(items - set(pos))

    def get_dataloader(self, user_item):
        """Build a shuffled DataLoader of (user, item, target) rows with freshly sampled negatives.

        ``user_item`` is unused; the data comes from the tables filled by ``build_data``.
        """
        data = []
        for u_id, pos in self.positives.items():
            not_recommended = self.negatives[u_id]
            neg_size = min(len(pos) * self.neg_size, len(not_recommended))
            if neg_size:
                neg_items = np.random.choice(not_recommended, size=neg_size, replace=True)
            else:
                # Nothing to sample from (or neg_size == 0): positives only.
                neg_items = np.empty(0, dtype=np.int64)

            # Explicit int64 so the rows become LongTensors on every platform.
            positives = np.ones((len(pos), 3), dtype=np.int64)
            negatives = np.zeros((neg_size, 3), dtype=np.int64)
            positives[:, 0], negatives[:, 0] = u_id, u_id
            positives[:, 1], negatives[:, 1] = pos, neg_items

            data.append(np.vstack((positives, negatives)))

        return DataLoader(np.vstack(data), shuffle=True, batch_size=self.batch_size)

    def get_similar_items(self, item_id, k=10, embeddings_type=None):
        """Return the ``k`` item ids nearest to ``item_id`` by Euclidean distance.

        ``embeddings_type`` selects the embedding table: ``'gmf'``, ``'mlp'``, or
        anything else for the concatenation of both.
        """
        if embeddings_type == 'gmf':
            emb = self.model.item_emb_gmf.weight
        elif embeddings_type == 'mlp':
            emb = self.model.item_emb_mlp.weight
        else:
            emb = torch.cat(
                (self.model.item_emb_mlp.weight, self.model.item_emb_gmf.weight),
                dim=-1
            )
        # Detach: the weights are parameters and no gradient is needed here.
        emb = emb.detach().cpu()
        return torch.linalg.norm(emb - emb[item_id], dim=-1).argsort()[:k].numpy()

    def score(self, user, item):
        """Score (user, item) pairs in eval mode and return a 1-D NumPy array.

        ``user`` may be a single integer (broadcast over all of ``item``) or a
        sequence aligned with ``item``.
        """
        self.model.eval()

        item = np.asarray(item, dtype=np.int64)
        # np.integer covers every NumPy integer scalar; the `np.int` alias used
        # previously no longer exists in recent NumPy releases.
        if isinstance(user, (int, np.integer)):
            user = np.full(item.shape[0], user, dtype=np.int64)
        else:
            user = np.asarray(user, dtype=np.int64)

        if item.shape[0] == 0:
            return np.empty(0, dtype=np.float32)

        user_t = torch.as_tensor(user, dtype=torch.long, device=self.device)
        item_t = torch.as_tensor(item, dtype=torch.long, device=self.device)

        scores = []
        with torch.no_grad():
            for users_b, items_b in zip(torch.split(user_t, self.batch_size),
                                        torch.split(item_t, self.batch_size)):
                scores.append(self.model(users_b, items_b).cpu().numpy())

        return np.hstack(scores)

In [123]:
# Train the NCF model for 10 epochs with batch size 1024 and learning rate 1e-3;
# the remaining hyper-parameters keep their defaults.
ncf = NCFRecommender(num_users=num_users, num_items=num_items,
                     batch_size=1024, lr=1e-3)
ncf.fit(user_item_csr, epochs=10)

HBox(children=(FloatProgress(value=0.0, max=10.0), HTML(value='')))

HBox(children=(FloatProgress(value=0.0, max=5559.0), HTML(value='')))






HBox(children=(FloatProgress(value=0.0, max=5559.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=5559.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=5559.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=5559.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=5559.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=5559.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=5559.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=5559.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=5559.0), HTML(value='')))





In [124]:
# Train/test RMSE plus HR@10 and NDCG@10 for the NCF model.
ncf.compute_metrics(test, user_item, unknown_items, 10)

Train RMSE: 0.38120726
Test RMSE: 0.6425026


HBox(children=(FloatProgress(value=0.0, max=6040.0), HTML(value='')))


HR@10 = 0.6993
NDCG@10 = 0.4235


Метрики значительно лучше, чем у WARP, что и ожидалось. Напомню, что было в статье: 

HR@10 = 0.705

NDCG@10 = 0.426

То есть полученный результат почти соответствует тому, что в статье.

In [125]:
# Training history of the currently selected user.
# NOTE(review): the saved output lists different movies than the earlier history cell that
# followed `user_id = 0`, so `user_id` was apparently changed in between (possibly by a
# loop variable leaking from a re-run cell) — re-run the notebook top to bottom to confirm.
get_movies(get_user_history(user_id, user_item_csr))

Unnamed: 0_level_0,name,category
movie_id,Unnamed: 1_level_1,Unnamed: 2_level_1
0,Toy Story (1995),Animation|Children's|Comedy
16,Sense and Sensibility (1995),Drama|Romance
24,Leaving Las Vegas (1995),Drama|Romance
28,"City of Lost Children, The (1995)",Adventure|Sci-Fi
29,Shanghai Triad (Yao a yao yao dao waipo qiao) ...,Drama
...,...,...
3682,Blood Simple (1984),Drama|Film-Noir
3702,Mad Max 2 (a.k.a. The Road Warrior) (1981),Action|Sci-Fi
3734,Serpico (1973),Crime|Drama
3750,Chicken Run (2000),Animation|Children's|Comedy


In [126]:
# The selected user's held-out movie(s) from the test split.
get_movies(test[test.user_id == user_id].movie_id)

Unnamed: 0_level_0,name,category
movie_id,Unnamed: 1_level_1,Unnamed: 2_level_1
1220,"Godfather: Part II, The (1974)",Action|Crime|Drama


С рекомендациями что-то не очень :(

In [127]:
# Top NCF recommendations among the items the user has no training interaction with.
get_movies(ncf.recommend(user_id, unknown_items))

Unnamed: 0_level_0,name,category
movie_id,Unnamed: 1_level_1,Unnamed: 2_level_1
1220,"Godfather: Part II, The (1974)",Action|Crime|Drama
1956,Chariots of Fire (1981),Drama
1185,"Sex, Lies, and Videotape (1989)",Drama
2242,Broadcast News (1987),Comedy|Drama|Romance
1967,"Breakfast Club, The (1985)",Comedy|Drama
1999,Lethal Weapon (1987),Action|Comedy|Crime|Drama
2351,"Big Chill, The (1983)",Comedy|Drama
2063,Roger & Me (1989),Comedy|Documentary
2730,"400 Blows, The (Les Quatre cents coups) (1959)",Drama
2449,Howard the Duck (1986),Adventure|Children's|Sci-Fi


Симилары отличные!

In [128]:
# Nearest neighbours of movie 0 using the concatenated MLP + GMF item embeddings.
get_movies(ncf.get_similar_items(0))

Unnamed: 0_level_0,name,category
movie_id,Unnamed: 1_level_1,Unnamed: 2_level_1
0,Toy Story (1995),Animation|Children's|Comedy
3113,Toy Story 2 (1999),Animation|Children's|Comedy
2354,"Bug's Life, A (1998)",Animation|Children's|Comedy
1264,Groundhog Day (1993),Comedy|Romance
33,Babe (1995),Children's|Comedy|Drama
3174,Galaxy Quest (1999),Adventure|Comedy|Sci-Fi
2996,Being John Malkovich (1999),Comedy
2320,Pleasantville (1998),Comedy
1196,"Princess Bride, The (1987)",Action|Adventure|Comedy|Romance
2395,Shakespeare in Love (1998),Comedy|Romance


In [129]:
# Nearest neighbours of movie 0 using only the GMF item embeddings.
get_movies(ncf.get_similar_items(0, embeddings_type='gmf'))

Unnamed: 0_level_0,name,category
movie_id,Unnamed: 1_level_1,Unnamed: 2_level_1
0,Toy Story (1995),Animation|Children's|Comedy
3113,Toy Story 2 (1999),Animation|Children's|Comedy
2354,"Bug's Life, A (1998)",Animation|Children's|Comedy
33,Babe (1995),Children's|Comedy|Drama
2320,Pleasantville (1998),Comedy
1264,Groundhog Day (1993),Comedy|Romance
1922,There's Something About Mary (1998),Comedy
587,Aladdin (1992),Animation|Children's|Comedy|Musical
1022,Winnie the Pooh and the Blustery Day (1968),Animation|Children's
782,"Hunchback of Notre Dame, The (1996)",Animation|Children's|Musical


In [130]:
# Nearest neighbours of movie 0 using only the MLP item embeddings.
get_movies(ncf.get_similar_items(0, embeddings_type='mlp'))

Unnamed: 0_level_0,name,category
movie_id,Unnamed: 1_level_1,Unnamed: 2_level_1
0,Toy Story (1995),Animation|Children's|Comedy
1258,Stand by Me (1986),Adventure|Comedy|Drama
3174,Galaxy Quest (1999),Adventure|Comedy|Sci-Fi
2996,Being John Malkovich (1999),Comedy
1264,Groundhog Day (1993),Comedy|Romance
2354,"Bug's Life, A (1998)",Animation|Children's|Comedy
1229,Annie Hall (1977),Comedy|Romance
2761,"Sixth Sense, The (1999)",Thriller
2570,"Matrix, The (1999)",Action|Sci-Fi|Thriller
588,Terminator 2: Judgment Day (1991),Action|Sci-Fi|Thriller
