In [77]:

import torch
import pandas as pd
import numpy as np

from torch.utils.data import Dataset
from tqdm import tqdm_notebook,tqdm
from torch.nn.utils.rnn import pad_sequence
import torch.nn as nn

def emb_sz_rule(n_cat:int)->int: return min(600, round(1.6 * n_cat**0.56))

import torch.optim as optim
from tqdm import tqdm
from validation import Validator

from implicit.als import AlternatingLeastSquares as als

from scipy.sparse import csr_matrix
import scipy.sparse as sp


def dcg_at_k(r, k, method=0):
    r = np.asfarray(r)[:k]
    if r.size:
        if method == 0:
            return r[0] + np.sum(r[1:] / np.log2(np.arange(2, r.size + 1)))
        elif method == 1:
            return np.sum(r / np.log2(np.arange(2, r.size + 2)))
        else:
            raise ValueError('method must be 0 or 1.')
    return 0.


def ndcg_at_k(r, k, method=0):
    dcg_max = dcg_at_k(sorted(r, reverse=True), k, method)
    if not dcg_max:
        return 0.
    return dcg_at_k(r, k, method) / dcg_max


In [2]:
train_df = pd.read_csv("train_df.csv")
test_df = pd.read_csv('test_df.csv')
validator = Validator('test_df.csv')

In [78]:
users = np.unique(test_df.userId.values)

In [79]:
import torch
import pandas as pd
import numpy as np

from torch.utils.data import Dataset
from tqdm import tqdm_notebook,tqdm
from torch.nn.utils.rnn import pad_sequence
import torch.nn as nn

import torch.optim as optim

In [85]:
class My_MF(nn.Module):
    
    def __init__(self,user_num,movie_num,movies):
        torch.manual_seed(0)
        super(My_MF, self).__init__()
        
        self.av_rating = -1
        
        self.user_num = user_num
        self.movie_num = movie_num       
        
        self.user_factors = nn.Embedding(int(user_num)+1,20)
        self.movie_factors = nn.Embedding(int(movie_num)+1,20)
        
        self.user_bias = nn.Embedding(int(user_num)+1,1)
        self.movie_bias = nn.Embedding(int(movie_num)+1,1)
        
        self.movies = movies.long()
        
        self.soft = nn.Softmax(-1)

        
    def forward(self, user):
        
        user = user.long()
        
        user_embedded = self.user_factors(user).repeat(self.movies.shape[0],1)
        movie_embedded = self.movie_factors(self.movies)
        
        products = (user_embedded*movie_embedded).sum(1)+self.movie_bias(self.movies).squeeze(1)

        top10 = torch.argsort(products,descending=True).detach().numpy().tolist()    
        consumed = list(train_df.loc[train_df.userId==user.item()].movieId.values)
        
        recom = []
        for el in top10:
            if el not in consumed:
                recom.append(el)
            if len(recom) == 10:
                break
                
        return recom
   

In [112]:
movie = np.unique(train_df.movieId.values)

mdl = My_MF(71693, 6049,torch.Tensor(movie))

mdl.load_state_dict(torch.load('models/my_als1.0.pkl',map_location='cpu'))
users = np.unique(test_df.userId.values)

In [103]:
ndc1 = []
ndc10 = []

for user in tqdm(users):
    rec_list = mdl(torch.Tensor([user]))
    a,b = validator.valid(user,rec_list)
    ndc1.append(a)
    ndc10.append(b)
   # if a> 0 or b>0:
    #    print('yra')


100%|██████████| 4211/4211 [00:49<00:00, 85.88it/s]


In [104]:
np.mean(ndc1),np.mean(ndc10)
#0.006979035080396459, 0.03618470745334547)

(0.023137151879917765, 0.10094452413768583)

In [105]:
mdl.load_state_dict(torch.load('models/my_als3.0.pkl',map_location='cpu'))
ndc1 = []
ndc10 = []

for user in tqdm(users):
    rec_list = mdl(torch.Tensor([user]))
    a,b = validator.valid(user,rec_list)
    ndc1.append(a)
    ndc10.append(b)
    
np.mean(ndc1),np.mean(ndc10)

100%|██████████| 4211/4211 [00:48<00:00, 88.64it/s]


(0.005755660629605138, 0.030977492136280562)

## Почему так мало?

Проверим, насколько модель хорошо выучила смещения рейтингов фильмов

In [116]:
mdl.load_state_dict(torch.load('models/my_als1.0.pkl',map_location='cpu'))

diff = []
for mov in movie:
    bias = mdl.movie_bias(torch.Tensor([mov]).long()).item()
    bias_real = av_rating-train_df.loc[train_df.movieId==mov,'rating'].mean()
    
    diff.append(np.abs(bias-bias_real))
print(np.mean(diff))

0.9515196308442085


In [117]:
mdl.load_state_dict(torch.load('models/my_als4.0.pkl',map_location='cpu'))

diff = []
for mov in movie:
    bias = mdl.movie_bias(torch.Tensor([mov]).long()).item()
    bias_real = av_rating-train_df.loc[train_df.movieId==mov,'rating'].mean()
    
    diff.append(np.abs(bias-bias_real))
print(np.mean(diff))

0.918402356761014


И аналогично для пользователей:

In [114]:
mdl.load_state_dict(torch.load('models/my_als1.0.pkl',map_location='cpu'))

av_rating = train_df.rating.mean()

diff = []
for user in users:
    bias = mdl.user_bias(torch.Tensor([user]).long()).item()
    bias_real = av_rating-train_df.loc[train_df.userId==user,'rating'].mean()
    
    diff.append(np.abs(bias-bias_real))
print(np.mean(diff))

0.5737534782133606


А если загрузим модель со средним лоссом 4, то будет явно хуже

In [115]:
mdl.load_state_dict(torch.load('models/my_als4.0.pkl',map_location='cpu'))

av_rating = train_df.rating.mean()

diff = []
for user in users:
    bias = mdl.user_bias(torch.Tensor([user]).long()).item()
    bias_real = av_rating-train_df.loc[train_df.userId==user,'rating'].mean()
    
    diff.append(np.abs(bias-bias_real))
print(np.mean(diff))

0.712200143826478
