# Обучение модели

## Подготовка данных

In [21]:
import torch
import pandas as pd
import numpy as np

from torch.utils.data import Dataset
from tqdm import tqdm_notebook,tqdm
from torch.nn.utils.rnn import pad_sequence
import torch.nn as nn

import torch.optim as optim
from gensim.models import Word2Vec
import os

# Pick the compute device once: the GPU when CUDA is usable, otherwise the CPU.
if torch.cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'

Сформируем датасет и разобьем его на трейн/тест. Отсортировав данные по таймстемпу, я решил отсечь под тест все, что идет после первых 40 процентов рейтингов. Такое решение принято из следующих соображений:

In [5]:
# Load all ratings in chronological order so that a positional cut is a
# time-based train/test split.
df = pd.read_csv('ratings.csv').sort_values(by='timestamp')

# For every candidate train fraction report:
#   (usable test rows / train rows, the fraction, distinct users left in test).
for train in (0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8):
    sep_index = int(train * df.shape[0])

    train_df, test_df = df.iloc[:sep_index], df.iloc[sep_index:]

    # Keep only test rows whose user AND movie both occur in the train part.
    seen_user = test_df.userId.isin(train_df.userId)
    seen_movie = test_df.movieId.isin(train_df.movieId)
    test_df = test_df.loc[seen_user & seen_movie]

    print(test_df.shape[0] / train_df.shape[0], train, test_df.userId.nunique())

0.016396786841771056 0.1 1402
0.07983296217149177 0.2 3113
0.07543601933174869 0.3 3899
0.07476501870912944 0.4 4211
0.04923675499850952 0.5 4355
0.04363576243210818 0.6 5140
0.03541803450583221 0.7 5173
0.030677347359815903 0.8 5534


Я хочу убрать из тестовой части пользователей, которых не было в трейне (для них все равно нельзя построить предсказания). Также для простоты я хочу убрать из теста фильмы, которых не было в трейне. Простота заключается в том, что преобразование идентификаторов в уникальные порядковые номера можно сформировать только по трейну. Возможно, это можно сделать и без удаления лишних пользователей из теста, но не сейчас. В любом случае отсутствие "новых" пользователей в тесте метрику не изменит, а отсутствие "новых" фильмов сместит ее в большую сторону. Однако, так как задание состоит в сравнении моделей, обе модели будут в одинаковых условиях.
В общем, выбран порог 0.4. Ниже создаются два файла: train_df.csv и test_df.csv

In [6]:
import warnings

from preprocess import Preprocessor

# Silence every Python warning for the rest of the session.
warnings.filterwarnings("ignore")

# Run the project preprocessor on the raw ratings with the 0.4 threshold
# chosen above; according to the notebook text this step writes
# train_df.csv and test_df.csv.
prep = Preprocessor('ratings.csv')
prep.process(0.4)

saving files


0

## W2V part

In [7]:
# Read back the two splits from the CSV files in the working directory.
train_df, test_df = pd.read_csv('train_df.csv'), pd.read_csv('test_df.csv')

# Largest user / movie id present in the training split; used further down
# to size the embedding tables (which allocate max id + 1 rows).
user_num, movie_num = train_df.userId.max(), train_df.movieId.max()


Создадим корпус для обучения w2v

In [9]:
def sort_by_timestamp(x):
    """Return the ids from the first column of *x*, ordered by the second.

    Args:
        x: two-column DataFrame whose first column holds the movie ids and
            whose second column holds the rating timestamps (this is how the
            groupby below passes ``[['movieId', 'timestamp']]``).

    Returns:
        A plain Python list with the first-column values in ascending
        timestamp order. Rows with equal timestamps keep their incoming
        order. An empty frame yields an empty list.
    """
    import numpy as np  # local import keeps the function self-contained

    values = x.to_numpy()
    # The previous implementation called ``a.sort(1)``, which sorts the two
    # numbers *inside each row* and never reorders the rows themselves, so
    # the sequence was returned unsorted. Order the rows by the timestamp
    # column instead; a stable sort keeps ties deterministic, and indexing
    # with the permutation leaves the caller's data untouched.
    order = np.argsort(values[:, 1], kind='stable')
    return values[order, 0].tolist()

# One row per user: 'userId' plus a 'sequences' column holding the list of
# movie ids that sort_by_timestamp returns for that user's ratings.
q = (
    train_df
    .groupby(['userId'])[['movieId', 'timestamp']]
    .apply(sort_by_timestamp)
    .reset_index()
)
# The applied result lands in a column labelled 0; give it a readable name.
q = q.rename(columns={0: 'sequences'})

In [10]:
# Word2Vec consumes string tokens, so every per-user sequence of movie ids
# becomes a "sentence" of stringified ids.
corpus = [[str(movie) for movie in sequence] for sequence in q.sequences.values]

In [11]:
# Keyword arguments that configure Word2Vec as skip-gram with negative
# sampling (SGNS).
w2v_params = dict(
    sg=1,         # use the skip-gram scheme
    negative=10,  # number of negative samples
    # The article uses size 100, but for a vocabulary of ~1M items. Here
    # there are roughly 12k movies, so the size is taken as the 4th root of
    # the vocabulary and then doubled to be on the safe side.
    size=20,
    window=5,     # as in the article
    min_count=1,  # keep every movie, nothing is ignored
)

In [12]:
# Fit the movie-embedding model on the per-user sequences.
song2vec = Word2Vec(sentences=corpus, **w2v_params)

# Run five more passes over the same corpus.
# NOTE(review): gensim's constructor appears to train already when it is
# given sentences, so this call adds epochs on top of that - confirm this
# is intended.
song2vec.train(
    sentences=corpus,
    epochs=5,
    total_examples=song2vec.corpus_count,
)

(39389408, 40000525)

In [None]:
# Persist the fitted Word2Vec model to 'w2v.model' in the working directory.
song2vec.save('w2v.model')

## MF

In [13]:
class Dataset_(Dataset):
    """Map-style dataset over a ratings DataFrame.

    Each sample combines one rating row with the five items that the
    module-level ``song2vec`` Word2Vec model considers most similar to the
    rated movie.
    """

    def __init__(self, x):
        # x: DataFrame providing at least userId, movieId and rating columns
        # (these are the attributes read in __getitem__).
        self.x = x

    def __len__(self):
        """Number of rating rows."""
        return len(self.x)

    def __getitem__(self, idx):
        """Return ``(user, movie, similar_ids, similar_scores, rating)``.

        All five parts are float32: three numpy scalars and two arrays with
        one entry per neighbour returned by ``most_similar`` (at most 5).
        Depends on the global ``song2vec`` and will fail if the movie id is
        missing from its vocabulary.
        """
        record = self.x.iloc[idx]

        user = record.userId.astype(np.float32)
        mov = record.movieId.astype(np.float32)

        # Vocabulary tokens are stringified integer movie ids.
        token = str(int(mov))
        neighbours = song2vec.wv.most_similar([token], topn=5)

        most_similar_films = np.array(
            [int(name) for name, _ in neighbours], dtype=np.float32
        )
        most_similar_values = np.array(
            [score for _, score in neighbours], dtype=np.float32
        )

        rating = record.rating.astype(np.float32)

        return user, mov, most_similar_films, most_similar_values, rating


In [14]:
# Wrap the training split and serve it in shuffled mini-batches of 64,
# loaded by two worker processes.
train_dataset = Dataset_(train_df)
train_loader = torch.utils.data.DataLoader(
    dataset=train_dataset,
    batch_size=64,
    shuffle=True,
    num_workers=2,
)

# Loaders keyed by phase; only the training phase exists here.
dloaders = dict(train=train_loader)

In [15]:
class My_MF(nn.Module):
    """Biased matrix factorisation for rating prediction.

    The predicted rating is
    ``av_rating + <user_vec, movie_vec> + user_bias + movie_bias``.

    Args:
        user_num: largest user id; the user tables get ``user_num + 1`` rows
            so ids can be used directly as indices.
        movie_num: largest movie id; tables get ``movie_num + 1`` rows.
        av_rating: global offset added to every prediction (the notebook
            passes the rounded mean training rating).
        n_factors: size of the latent vectors (default 20, the previously
            hard-coded value).
    """

    def __init__(self, user_num, movie_num, av_rating, n_factors=20):
        # Seed the global torch RNG so the embedding initialisation is
        # reproducible. Note that this affects the whole process.
        torch.manual_seed(0)
        super().__init__()

        self.av_rating = av_rating

        self.user_num = user_num
        self.movie_num = movie_num

        n_users = int(user_num) + 1
        n_movies = int(movie_num) + 1

        # Creation order is kept as before so that, with the fixed seed,
        # the initial weights are unchanged.
        self.user_factors = nn.Embedding(n_users, n_factors)
        self.movie_factors = nn.Embedding(n_movies, n_factors)

        self.user_bias = nn.Embedding(n_users, 1)
        self.movie_bias = nn.Embedding(n_movies, 1)

        # Not used by forward(); kept so existing attribute access still works.
        self.soft = nn.Softmax(-1)
        self.relu = nn.ReLU()

    def sgns_penalty(self, movie, ms_films, ms_values):
        """Squared gap between Word2Vec similarities and embedding dot products.

        Args:
            movie: tensor of movie ids, one per sample.
            ms_films: tensor of neighbour movie ids, shape (batch, k) for
                any k.
            ms_values: tensor of the corresponding similarity scores, same
                shape as ``ms_films``.

        Returns:
            Tensor with one value per sample: the sum over the k neighbours
            of ``(similarity - <movie_vec, neighbour_vec>) ** 2``.
        """
        movie_embedded = self.movie_factors(movie.long())
        ms_embedded = self.movie_factors(ms_films.long())
        # Broadcasting over the neighbour axis replaces the former
        # ``repeat(1, 5, 1)``, which only worked for exactly five neighbours.
        dot_product = (ms_embedded * movie_embedded.unsqueeze(1)).sum(2)
        return ((ms_values - dot_product) ** 2).sum(1)

    def forward(self, user, movie, ms_films, ms_values):
        """Predict ratings and return the per-sample L2 penalty.

        Args:
            user: tensor of user ids, one per sample (cast to long here).
            movie: tensor of movie ids, one per sample (cast to long here).
            ms_films: neighbour movie ids; accepted for interface
                compatibility, not used for the returned values.
            ms_values: neighbour similarity scores; likewise unused here.

        Returns:
            ``(ratings, reg_part)``, each with one value per sample.
            ``reg_part`` is the unweighted sum of squared factors and biases;
            the caller applies the regularisation weight.
        """
        user = user.long()
        movie = movie.long()

        movie_embedded = self.movie_factors(movie)
        user_embedded = self.user_factors(user)

        dot_product = (user_embedded * movie_embedded).sum(1)

        # Bias tables have one column; squeeze it to get one value per sample.
        user_bias = self.user_bias(user).squeeze(1)
        movie_bias = self.movie_bias(movie).squeeze(1)

        ratings = self.av_rating + dot_product + user_bias + movie_bias

        reg_part = (
            (movie_embedded ** 2).sum(1)
            + (user_embedded ** 2).sum(1)
            + user_bias ** 2
            + movie_bias ** 2
        )

        # The SGNS term is intentionally not part of the output. It used to
        # be computed here on every batch and then discarded; call
        # self.sgns_penalty(movie, ms_films, ms_values) when it is needed.
        return ratings, reg_part

True

In [24]:
# Build the model; the global offset is the mean training rating rounded to
# two decimals.
mdl = My_MF(user_num, movie_num, round(train_df.rating.mean(), 2))
# To continue from a saved checkpoint instead of starting from scratch:
# mdl.load_state_dict(torch.load('models/my_als2.0.pkl', map_location='cpu'))
mdl = mdl.to(device)
torch.set_grad_enabled(True)
optimizer = optim.Adam(mdl.parameters(), lr=1e-3)

# Checkpoints are written to models/. The previous `os.makedir` call does
# not exist in the os module and raised AttributeError whenever the
# directory was missing.
os.makedirs('models', exist_ok=True)

lam = 0.025   # weight of the L2 penalty returned by the model
alpha = 0.1   # weight reserved for the SGNS term (currently not in the loss)

for epoch in range(10):

    optimizer.zero_grad()
    loss = 0
    rem = []        # optimised loss value of every step in this epoch
    true_loss = []  # plain mean squared error of every batch in this epoch
    for i, (user, movie, ms_films, ms_values, ratings) in tqdm_notebook(enumerate(dloaders['train'])):

        user = user.to(device)
        movie = movie.to(device)
        ms_films = ms_films.to(device)
        ms_values = ms_values.to(device)
        ratings = ratings.to(device)

        ratings_pred, reg_part = mdl(user, movie, ms_films, ms_values)

        squared_error = (ratings.float() - ratings_pred) ** 2
        loss += (squared_error + lam * reg_part).mean()

        # Keep one scalar per batch rather than the full per-sample array:
        # memory stays flat and np.mean below never sees a ragged list when
        # the last batch is smaller than the others.
        true_loss.append(squared_error.mean().item())

        # Batches 0-3 only accumulate, so the first step (at i == 4) uses the
        # summed loss of batches 0-4; after that every batch gets its own step.
        if i > 3:

            loss.backward()
            optimizer.step()
            rem.append(loss.item())
            optimizer.zero_grad()
            loss = 0

        # Every 500 batches: report the running averages over the last 100
        # entries and save a checkpoint named after the rounded running loss.
        if i % 500 == 499:
            print(np.mean(rem[-100:]), np.mean(true_loss[-100:]))
            torch.save(mdl.state_dict(), 'models/my_als{}.pkl'.format(round(np.mean(rem[-100:]), 0)))

HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))

KeyboardInterrupt: 

In [25]:
# Display the largest user id and movie id taken from the training split.
user_num,movie_num

(71693, 6049)