In [None]:
!wget http://files.grouplens.org/datasets/movielens/ml-1m.zip

--2020-12-04 16:10:52--  http://files.grouplens.org/datasets/movielens/ml-1m.zip
Resolving files.grouplens.org (files.grouplens.org)... 128.101.65.152
Connecting to files.grouplens.org (files.grouplens.org)|128.101.65.152|:80... connected.
HTTP request sent, awaiting response... 200 OK
Length: 5917549 (5.6M) [application/zip]
Saving to: ‘ml-1m.zip’


2020-12-04 16:10:55 (3.69 MB/s) - ‘ml-1m.zip’ saved [5917549/5917549]



In [None]:
!unzip -q ml-1m.zip -d .

In [None]:
!pip install implicit

Collecting implicit
[?25l  Downloading https://files.pythonhosted.org/packages/bc/07/c0121884722d16e2c5beeb815f6b84b41cbf22e738e4075f1475be2791bc/implicit-0.4.4.tar.gz (1.1MB)
[K     |▎                               | 10kB 17.6MB/s eta 0:00:01[K     |▋                               | 20kB 12.0MB/s eta 0:00:01[K     |▉                               | 30kB 7.6MB/s eta 0:00:01[K     |█▏                              | 40kB 7.2MB/s eta 0:00:01[K     |█▌                              | 51kB 4.1MB/s eta 0:00:01[K     |█▊                              | 61kB 4.3MB/s eta 0:00:01[K     |██                              | 71kB 4.5MB/s eta 0:00:01[K     |██▍                             | 81kB 4.8MB/s eta 0:00:01[K     |██▋                             | 92kB 5.1MB/s eta 0:00:01[K     |███                             | 102kB 4.1MB/s eta 0:00:01[K     |███▎                            | 112kB 4.1MB/s eta 0:00:01[K     |███▌                            | 122kB 4.1MB/s eta 0:00:01

#Imports

In [None]:
import pandas as pd
import numpy as np
import scipy.sparse as sp
import implicit
from sklearn.model_selection import train_test_split
from tqdm.auto import tqdm, trange
from functools import lru_cache

# Datasets

Выбрал MovieLens, потому что его часто используют в статьях и будет с чем сравнить результаты.

In [None]:
datapath = 'ml-1m/'
ratings = pd.read_csv(datapath + 'ratings.dat', delimiter='::', header=None, 
        names=['user_id', 'movie_id', 'rating', 'timestamp'], 
        usecols=['user_id', 'movie_id', 'rating', 'timestamp'], engine='python')
movie_info = pd.read_csv(datapath + 'movies.dat', delimiter='::', header=None, 
        names=['movie_id', 'name', 'category'], engine='python')
ratings['user_id'] -= 1
ratings['movie_id'] -= 1
movie_info['movie_id'] -= 1

In [None]:
ratings

Unnamed: 0,user_id,movie_id,rating,timestamp
0,0,1192,5,978300760
1,0,660,3,978302109
2,0,913,3,978301968
3,0,3407,4,978300275
4,0,2354,5,978824291
...,...,...,...,...
1000204,6039,1090,1,956716541
1000205,6039,1093,5,956704887
1000206,6039,561,5,956704746
1000207,6039,1095,4,956715648


In [None]:
s_rs = ratings.sort_values(['user_id', 'timestamp'])
train, test = [], []
for user_id in s_rs['user_id'].unique():
    urs = s_rs.loc[s_rs['user_id'] == user_id]
    # Использую медианное значение для выделения implicit данных. 
    # Если брать среднее, то о некоторых юзерах вообще не будем ничего знать. 
    urs = urs.loc[urs.rating >= urs.rating.median()]
    if len(urs) > 1:
        tr, tst = train_test_split(urs, shuffle=False, test_size=0.1)
        train.append(tr)
        test.append(tst)
    else:
        print(user_id)
train = pd.concat(train)
test = pd.concat(test)

In [None]:
train.rating.describe()

count    607458.000000
mean          4.164902
std           0.713723
min           1.000000
25%           4.000000
50%           4.000000
75%           5.000000
max           5.000000
Name: rating, dtype: float64

In [None]:
test.rating.describe()

count    70509.000000
mean         4.073395
std          0.707339
min          1.000000
25%          4.000000
50%          4.000000
75%          5.000000
max          5.000000
Name: rating, dtype: float64

In [None]:
users = train["user_id"]
movies = train["movie_id"]
user_item = sp.coo_matrix((np.ones_like(users), (users, movies)))
item_user_csr = user_item.T.tocsr()
user_item_csr = user_item.tocsr()

In [None]:
user_item_csr

<6040x3952 sparse matrix of type '<class 'numpy.longlong'>'
	with 607458 stored elements in Compressed Sparse Row format>

#BPR

In [None]:
class BaseModelWithMetrics:
    def rmse(self, user_item, test=None):
        if test is None:
            i, j = user_item.nonzero()
        else:
            i, j = test.user_id, test.movie_id
        error = (self.score(i, j) - 1) ** 2
        return np.sqrt(error.mean())

    def hr(self, user_item, test, k=10):
        hits = 0
        for user_id in tqdm(test.user_id.unique()):
            rec_idxs = set(self.recommend(user_id, user_item, k))
            for item_id in test[test.user_id == user_id].movie_id:
                if item_id in rec_idxs:
                    hits += 1
        return hits / len(test)

    def ndcg(self, user_item, test, k=10):
        ndcg = []
        for user_id in tqdm(test.user_id.unique()):
            rec_idxs = self.recommend(user_id, user_item, k)
            test_items = set(test[test.user_id == user_id].movie_id)
            dcg = 0
            for i, item_id in enumerate(rec_idxs):
                if item_id in test_items:
                    dcg += 1 / np.log2(i + 2)
            ndcg.append(dcg / self.idcg(min(len(test_items), k)))
        return np.mean(ndcg)

    @staticmethod
    @lru_cache(maxsize=128)
    def idcg(l):
        return sum(1 / np.log2(i + 2) for i in range(l))

    def recommend(self, user_id, user_item, top_n=10):
        raise NotImplementedError()

    def similar_items(self, item_id, top_n=10):
        raise NotImplementedError()

    @staticmethod
    def user_history(user_id, user_item):
        return [i for i in user_item[user_id].nonzero()[1]]

    def score(self, i, j):
        raise NotImplementedError()


class BPR(BaseModelWithMetrics):
    def __init__(self, factors=63, iters=1000, reg=0.01, lr=1e-3):
        self.model = implicit.bpr.BayesianPersonalizedRanking(
            factors=factors,
            use_gpu=False,
            learning_rate=lr,
            regularization=reg,
            verify_negative_samples=True,
            random_state=42,
            iterations=iters
        )
    
    def fit(self, item_user):
        self.model.fit(item_user)
            
    def recommend(self, user_id, user_item, top_n=10):
        recommended_items = set(user_item[user_id].nonzero()[1])
        not_recommended = np.array(list(set(range(self.item_factors.shape[0])) - recommended_items))
        score = self.score(user_id, not_recommended)
        return not_recommended[sorted(np.arange(len(not_recommended)), key=lambda x: -score[x])[:top_n]]

    def similar_items(self, item_id, top_n=10):
        return np.argsort(np.linalg.norm(self.item_factors - self.item_factors[item_id], axis=1))[:top_n]

    def score(self, i, j):
        return np.sum(self.user_factors[i] * self.item_factors[j], axis=1)

    @property
    def item_factors(self):
        return self.model.item_factors
    
    @property
    def user_factors(self):
        return self.model.user_factors

In [None]:
def get_movies(idxs):
    return movie_info.set_index('movie_id').loc[[i for i in idxs if i in set(movie_info.movie_id)]]

In [None]:
bpr = BPR(factors=63, iters=300, reg=0.01, lr=1e-2)
bpr.fit(item_user_csr)

HBox(children=(FloatProgress(value=0.0, max=300.0), HTML(value='')))




In [None]:
print('RMSE:', bpr.rmse(user_item_csr, test))

RMSE: 1.3282939


In [None]:
print('Hit Ratio:', bpr.hr(user_item_csr, test, 50))

HBox(children=(FloatProgress(value=0.0, max=6040.0), HTML(value='')))


Hit Ratio: 0.12656540299819882


In [None]:
print('NDCG:', bpr.ndcg(user_item_csr, test, 50))

HBox(children=(FloatProgress(value=0.0, max=6040.0), HTML(value='')))


NDCG: 0.10759585998824346


In [None]:
user_id = 0

In [None]:
get_movies(bpr.user_history(user_id, user_item_csr))

Unnamed: 0_level_0,name,category
movie_id,Unnamed: 1_level_1,Unnamed: 2_level_1
0,Toy Story (1995),Animation|Children's|Comedy
149,Apollo 13 (1995),Drama
259,Star Wars: Episode IV - A New Hope (1977),Action|Adventure|Fantasy|Sci-Fi
526,Schindler's List (1993),Drama|War
530,"Secret Garden, The (1993)",Children's|Drama
587,Aladdin (1992),Animation|Children's|Comedy|Musical
593,Snow White and the Seven Dwarfs (1937),Animation|Children's|Musical
594,Beauty and the Beast (1991),Animation|Children's|Musical
607,Fargo (1996),Crime|Drama|Thriller
918,"Wizard of Oz, The (1939)",Adventure|Children's|Drama|Musical


In [None]:
get_movies(test.loc[test.user_id == user_id].movie_id)

Unnamed: 0_level_0,name,category
movie_id,Unnamed: 1_level_1,Unnamed: 2_level_1
2293,Antz (1998),Animation|Children's
782,"Hunchback of Notre Dame, The (1996)",Animation|Children's|Musical
1565,Hercules (1997),Adventure|Animation|Children's|Comedy|Musical
1906,Mulan (1998),Animation|Children's
47,Pocahontas (1995),Animation|Children's|Musical|Romance


In [None]:
get_movies(bpr.recommend(user_id, user_item_csr))

Unnamed: 0_level_0,name,category
movie_id,Unnamed: 1_level_1,Unnamed: 2_level_1
952,It's a Wonderful Life (1946),Drama
363,"Lion King, The (1994)",Animation|Children's|Musical
2086,Peter Pan (1953),Animation|Children's|Fantasy|Musical
2058,"Parent Trap, The (1998)",Children's|Drama
1946,West Side Story (1961),Musical|Romance
2084,101 Dalmatians (1961),Animation|Children's
595,Pinocchio (1940),Animation|Children's
2095,Sleeping Beauty (1959),Animation|Children's|Musical
913,My Fair Lady (1964),Musical|Romance
3249,Alive (1993),Drama


In [None]:
get_movies(bpr.similar_items(0))

Unnamed: 0_level_0,name,category
movie_id,Unnamed: 1_level_1,Unnamed: 2_level_1
0,Toy Story (1995),Animation|Children's|Comedy
3113,Toy Story 2 (1999),Animation|Children's|Comedy
2354,"Bug's Life, A (1998)",Animation|Children's|Comedy
2250,"Cabinet of Dr. Ramirez, The (1991)",Comedy
33,Babe (1995),Children's|Comedy|Drama
587,Aladdin (1992),Animation|Children's|Comedy|Musical
1565,Hercules (1997),Adventure|Animation|Children's|Comedy|Musical
1906,Mulan (1998),Animation|Children's
3163,"Alley Cats, The (1968)",Drama
1842,Slappy and the Stinkers (1998),Children's|Comedy


#NCF

In [None]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.utils.data as data

In [181]:
DEVICE = torch.device('cpu')


class NCF(nn.Module):
    def __init__(self, num_users, num_items, 
                 GMF_factors=64, 
                 MLP_factors=128, 
                 layers_dim=[128, 64], 
                 dropout=0.2):
        super(NCF, self).__init__()
        self.dropout = dropout

        self.u_emb_GMF = nn.Embedding(num_users, GMF_factors)
        self.i_emb_GMF = nn.Embedding(num_items, GMF_factors)
        self.u_emb_MLP = nn.Embedding(num_users, MLP_factors)
        self.i_emb_MLP = nn.Embedding(num_items, MLP_factors)

        MLP_modules = [nn.Dropout(p=self.dropout), nn.Linear(MLP_factors * 2, layers_dim[0])]
        for i in range(len(layers_dim) - 1):
            MLP_modules.extend([nn.ReLU(), 
                                nn.Dropout(p=self.dropout),
                                nn.Linear(layers_dim[i], layers_dim[i + 1])])
        self.MLP_layers = nn.Sequential(*MLP_modules)
        self.NMF_layer = nn.Linear(GMF_factors + layers_dim[-1], 2)

        self._init_weight_()

    def _init_weight_(self):
        nn.init.normal_(self.u_emb_GMF.weight, std=0.01)
        nn.init.normal_(self.u_emb_MLP.weight, std=0.01)
        nn.init.normal_(self.i_emb_GMF.weight, std=0.01)
        nn.init.normal_(self.i_emb_MLP.weight, std=0.01)

        for m in self.MLP_layers:
            if isinstance(m, nn.Linear):
                nn.init.kaiming_uniform_(m.weight)
        nn.init.kaiming_uniform_(self.NMF_layer.weight)

        for m in self.modules():
            if isinstance(m, nn.Linear) and m.bias is not None:
                m.bias.data.zero_()

    def forward(self, user, item):
        u_emb_GMF = self.u_emb_GMF(user)
        i_emb_GMF = self.i_emb_GMF(item)
        GMF_out = u_emb_GMF * i_emb_GMF

        u_emb_MLP = self.u_emb_MLP(user)
        i_emb_MLP = self.i_emb_MLP(item)
        interaction = torch.cat((u_emb_MLP, i_emb_MLP), -1)
        MLP_out = self.MLP_layers(interaction)

        NMF_in = torch.cat((GMF_out, MLP_out), -1)

        prediction = self.NMF_layer(NMF_in)
        return prediction


class NCFModel(BaseModelWithMetrics):
    def __init__(self, *args, neg_size=5, batch_size=64, lr=1e-4, **kargs):
        self.batch_size = batch_size
        self.num_users = kargs.get("num_users")
        self.num_items = kargs.get("num_items")
        self.neg_size = neg_size
        self.model = NCF(*args, **kargs).to(DEVICE)
        self.positives = None
        self._all_items = set(range(self.num_items))
        self.loss = nn.CrossEntropyLoss()
        self.optimizer = torch.optim.Adam(self.model.parameters(), lr=lr)

    def fit(self, user_item, test, iters=10):
        self.model.train()
        t = trange(iters)
        t.set_description("Epoch")
        t.set_postfix(rmse=self.rmse(user_item, test))
        for _ in t:
            self.train_epoch(user_item)
            t.set_postfix(rmse=self.rmse(user_item, test))

    def train_epoch(self, user_item):
        dataloader = self.build_dataloader(user_item)
        t = tqdm(dataloader)
        t.set_description("Batch")
        for batch in t:
            self.optimizer.zero_grad()
            users, items, targets = batch.T.to(DEVICE)
            preds = self.model(users, items)
            loss = self.loss(preds, targets)
            loss.backward()
            self.optimizer.step()
            t.set_postfix(loss=loss.detach().cpu().item())

    def score(self, i, j):
        with torch.no_grad():
            if isinstance(i, (np.int64, np.int, int)):
                i = [i] * j.shape[0]
            elif isinstance(i, pd.Series):
                i = i.to_numpy()
            if isinstance(j, pd.Series):
                j = j.to_numpy()
            i, j = torch.tensor(i), torch.tensor(j)
            scores = []
            for i_batch, j_batch in zip(torch.split(i, self.batch_size), torch.split(j, self.batch_size)):
                i_batch.to(DEVICE)
                j_batch.to(DEVICE)
                scores.append(F.softmax(self.model(i_batch, j_batch), dim=-1).cpu().numpy()[:, 1])
            return np.hstack(scores)

    def build_dataloader(self, user_item):
        if self.positives is None:
            nonzeros = user_item.nonzero()
            nonzeros += ([1] * len(nonzeros[0]),)
            self.positives = np.array(nonzeros).T
        u_negs, i_negs = [], []
        for user_id in range(user_item.shape[0]):
            pos_items = set(self.positives[self.positives[:, 0] == user_id][1])
            neg_items = np.random.choice(list(self._all_items - pos_items),
                                         size=len(pos_items) * self.neg_size,
                                         replace=False)
            i_negs.extend(neg_items)
            u_negs.extend([user_id] * len(neg_items))
        negatives = np.array([u_negs, i_negs, [0] * len(i_negs)]).T
        dataset = np.vstack((self.positives, negatives))
        return data.DataLoader(dataset, shuffle=True, batch_size=self.batch_size)

    def recommend(self, user_id, user_item, top_n=10, random_samples=None):
        recommended_items = set(user_item[user_id].nonzero()[1])
        not_recommended = np.array(list(self._all_items - recommended_items))
        if random_samples is not None:
            not_recommended = np.random.choice(not_recommended,
                                            size=random_samples,
                                            replace=False)
        score = self.score(user_id, not_recommended)
        return not_recommended[sorted(np.arange(len(not_recommended)), key=lambda x: -score[x])[:top_n]]

    def similar_items(self, item_id, top_n=10, emb_type=None):
        if emb_type == 'MLP':
            emb = self.model.i_emb_MLP.weight.cpu()
        elif emb_type == 'GMF':
            emb = self.model.i_emb_GMF.weight.cpu()
        else:
            emb = self.model.i_emb_MLP.weight.cpu(), self.model.i_emb_GMF.weight.cpu()
            emb = torch.cat(emb, dim=-1)
        return torch.linalg.norm(emb - emb[item_id], dim=-1).argsort()[:top_n].numpy()

In [190]:
ncf = NCFModel(num_users=user_item.shape[0], num_items=user_item.shape[1],
               batch_size=4096, lr=1e-4)
ncf.fit(user_item_csr, test, iters=3)

HBox(children=(FloatProgress(value=0.0, max=3.0), HTML(value='')))

HBox(children=(FloatProgress(value=0.0, max=171.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=171.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=171.0), HTML(value='')))





In [191]:
print('RMSE:', ncf.rmse(user_item_csr, test))

RMSE: 0.16780075


In [194]:
print('Hit Ratio:', ncf.hr(user_item_csr, test, 10))

HBox(children=(FloatProgress(value=0.0, max=6040.0), HTML(value='')))


Hit Ratio: 0.029216128437504433


In [193]:
print('NDCG:', ncf.ndcg(user_item_csr, test, 50))

HBox(children=(FloatProgress(value=0.0, max=6040.0), HTML(value='')))


NDCG: 0.07593849628291534


In [195]:
user_id = 0

In [196]:
get_movies(ncf.user_history(user_id, user_item_csr))

Unnamed: 0_level_0,name,category
movie_id,Unnamed: 1_level_1,Unnamed: 2_level_1
0,Toy Story (1995),Animation|Children's|Comedy
149,Apollo 13 (1995),Drama
259,Star Wars: Episode IV - A New Hope (1977),Action|Adventure|Fantasy|Sci-Fi
526,Schindler's List (1993),Drama|War
530,"Secret Garden, The (1993)",Children's|Drama
587,Aladdin (1992),Animation|Children's|Comedy|Musical
593,Snow White and the Seven Dwarfs (1937),Animation|Children's|Musical
594,Beauty and the Beast (1991),Animation|Children's|Musical
607,Fargo (1996),Crime|Drama|Thriller
918,"Wizard of Oz, The (1939)",Adventure|Children's|Drama|Musical


In [197]:
get_movies(test.loc[test.user_id == user_id].movie_id)

Unnamed: 0_level_0,name,category
movie_id,Unnamed: 1_level_1,Unnamed: 2_level_1
2293,Antz (1998),Animation|Children's
782,"Hunchback of Notre Dame, The (1996)",Animation|Children's|Musical
1565,Hercules (1997),Adventure|Animation|Children's|Comedy|Musical
1906,Mulan (1998),Animation|Children's
47,Pocahontas (1995),Animation|Children's|Musical|Romance


In [198]:
get_movies(ncf.recommend(user_id, user_item_csr))

Unnamed: 0_level_0,name,category
movie_id,Unnamed: 1_level_1,Unnamed: 2_level_1
2857,American Beauty (1999),Comedy|Drama
1197,Raiders of the Lost Ark (1981),Action|Adventure
1616,L.A. Confidential (1997),Crime|Film-Noir|Mystery|Thriller
2570,"Matrix, The (1999)",Action|Sci-Fi|Thriller
588,Terminator 2: Judgment Day (1991),Action|Sci-Fi|Thriller
1195,Star Wars: Episode V - The Empire Strikes Back...,Action|Adventure|Drama|Sci-Fi|War
1209,Star Wars: Episode VI - Return of the Jedi (1983),Action|Adventure|Romance|Sci-Fi|War
2996,Being John Malkovich (1999),Comedy
2395,Shakespeare in Love (1998),Comedy|Romance
857,"Godfather, The (1972)",Action|Crime|Drama


In [202]:
get_movies(ncf.similar_items(0))

Unnamed: 0_level_0,name,category
movie_id,Unnamed: 1_level_1,Unnamed: 2_level_1
0,Toy Story (1995),Animation|Children's|Comedy
1096,E.T. the Extra-Terrestrial (1982),Children's|Drama|Fantasy|Sci-Fi
1220,"Godfather: Part II, The (1974)",Action|Crime|Drama
2627,Star Wars: Episode I - The Phantom Menace (1999),Action|Adventure|Fantasy|Sci-Fi
1783,As Good As It Gets (1997),Comedy|Drama
1290,Indiana Jones and the Last Crusade (1989),Action|Adventure
3362,American Graffiti (1973),Comedy|Drama
1078,"Fish Called Wanda, A (1988)",Comedy
1967,"Breakfast Club, The (1985)",Comedy|Drama
1258,Stand by Me (1986),Adventure|Comedy|Drama


In [204]:
get_movies(ncf.similar_items(0, emb_type='GMF'))

Unnamed: 0_level_0,name,category
movie_id,Unnamed: 1_level_1,Unnamed: 2_level_1
0,Toy Story (1995),Animation|Children's|Comedy
3628,"Gold Rush, The (1925)",Comedy
3379,Railroaded! (1947),Film-Noir
379,True Lies (1994),Action|Adventure|Comedy|Romance
3495,Madame Sousatzka (1988),Drama
932,To Catch a Thief (1955),Comedy|Romance|Thriller
1108,Charm's Incidents (1996),Drama
3206,"Snows of Kilimanjaro, The (1952)",Adventure
2593,Open Your Eyes (Abre los ojos) (1997),Drama|Romance|Sci-Fi
2477,Three Amigos! (1986),Comedy|Western


In [203]:
get_movies(ncf.similar_items(0, emb_type='MLP'))

Unnamed: 0_level_0,name,category
movie_id,Unnamed: 1_level_1,Unnamed: 2_level_1
0,Toy Story (1995),Animation|Children's|Comedy
1196,"Princess Bride, The (1987)",Action|Adventure|Comedy|Romance
1290,Indiana Jones and the Last Crusade (1989),Action|Adventure
1386,Jaws (1975),Action|Horror
1220,"Godfather: Part II, The (1974)",Action|Crime|Drama
2027,Saving Private Ryan (1998),Action|Drama|War
1096,E.T. the Extra-Terrestrial (1982),Children's|Drama|Fantasy|Sci-Fi
911,Casablanca (1942),Drama|Romance|War
1078,"Fish Called Wanda, A (1988)",Comedy
526,Schindler's List (1993),Drama|War


Чет странное выходит :(