In [745]:
import torch
import pandas as pd
import numpy as np

from torch.utils.data import Dataset
from tqdm import tqdm_notebook,tqdm
from torch.nn.utils.rnn import pad_sequence
import torch.nn as nn

import torch.optim as optim

In [982]:
class Preprocessor:
    
    def __init__(self,filename):
        
        self.file = filename

        
    def process(self,train=0.4):
        """
        the main method to process file with ratings
        """
        print('im begining to process a file with ratings')
        df = pd.read_csv('ratings.csv').sort_values(by='timestamp')
        
        # as df is sorted by timstamp 
        sep_index = int(train * df.shape[0])
        
        train_df = df[:sep_index]
        test_df = df[sep_index:]
        
        test_df = test_df.loc[test_df.userId.isin(train_df.userId)]
        test_df = test_df.loc[test_df.movieId.isin(train_df.movieId)]
        
        users = set(train_df.userId.values)
        movies = set(train_df.movieId.values)
        
        print('indexing')
        movie_index = pd.DataFrame(movies,columns=['movieId']).reset_index()
        movie_coder = {value: key for (key,value) in movie_index.values}
        movie_decoder = {key: value for (key,value) in movie_index.values}

        user_index = pd.DataFrame(users,columns=['userId']).reset_index()
        user_coder = {value: key for (key,value) in user_index.values}
        user_decoder = {key: value for (key,value) in user_index.values}
        
        # transform to appropriate form for als and my method
        print('transforming')
        train_df.userId = train_df.userId.apply(lambda x: user_coder[x])
        train_df.movieId = train_df.movieId.apply(lambda x: movie_coder[x])

        test_df.loc[test_df.movieId.isin(movies)]
        test_df.userId = test_df.userId.apply(lambda x: user_coder[x])
        test_df.movieId = test_df.movieId.apply(lambda x: movie_coder[x])
        
        print('saving')
        train_df.to_csv('train_df.csv',index=None)
        test_df.to_csv('test_df.csv',index=None)
        movie_index.to_csv('movieId_index.csv',index=None)
        user_index.to_csv('userId_index.csv',index=None)
        
        return 0
        
prep = Preprocessor('ratings.csv')
prep.process()

im begining to process a file with ratings
indexing
transforming


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self[name] = value


saving


0

In [981]:
set([1,2,3]).difference([1,2,3])

set()

In [34]:
WINDOW = 3

## W2V

In [747]:
from gensim.models import Word2Vec

model = Word2Vec(common_texts, size=100, window=5, min_count=1, workers=4)

NameError: name 'common_texts' is not defined

In [750]:
train_df.movieId.values

array([ 1174,  1077,    46, ...,  1344,   839, 11455])

In [773]:
def sort_by_timestamp(x):
    a = x.values
    a.sort(1)
    a = a[:,0]
    return a.tolist()

q = train_df.groupby(['userId'])[['movieId','timestamp']].apply(lambda x: sort_by_timestamp(x)).reset_index()
q.rename({0:'sequences'},axis=1,inplace=True)

In [779]:
corpus = []
for elem in q.sequences.values:
    corpus.append(list(map(str,elem)))

In [802]:
(np.unique(train_df.movieId.values).shape[0]**0.5)**0.5

10.549947517585679

In [803]:
# w2v params that configure w2v model for sngs
w2v_params = { 'sg':1, # w2v is configured for skip-gram scheme
              'negative':10,# negative sampling is set to 10
              #in the article the authors mentioned size equals 100
              #but they have vocabulary with ~1M. So I have
              #approximately 12k movies thus I set 'size' to the 4th root
              #and just in case I'll multiply it by 2
              'size': 20,
              'window':5, # according to the article
              'min_count': 1 #nothing ignored
}


In [804]:
song2vec = Word2Vec(corpus,**w2v_params)
# So now w2v is ready to learn dependencies by skip-gram negative sampling

In [805]:
song2vec.train(corpus,total_examples=song2vec.corpus_count,epochs=5)



(79746823, 80001050)

In [806]:
song2vec.save('pretrained.model')

In [808]:
song2vec.most_similar(['1'])

  """Entry point for launching an IPython kernel.


[('441', 0.8406383395195007),
 ('316', 0.761974036693573),
 ('10', 0.7475438714027405),
 ('507', 0.7425953149795532),
 ('508', 0.7387514114379883),
 ('251', 0.7305671572685242),
 ('349', 0.7206587791442871),
 ('203', 0.7132896184921265),
 ('172', 0.7085014581680298),
 ('336', 0.7067444324493408)]

In [971]:
train_df.rating = train_df.rating.apply(lambda x: (x)/5.)

In [642]:
train_df['order'] = train_df.groupby(by=['userId'])['timestamp'].transform(lambda x: x.rank(method='min'))


## Custom ALS

In [973]:
class Dataset_(Dataset):
    def __init__(self, x):
        self.x = x
        
    def __len__(self):
        return self.x.shape[0]

    def __getitem__(self, idx):

        row = self.x.loc[idx]
        
        user = row.userId.astype(np.float32)
        mov = row.movieId.astype(np.float32)
        
        most_similar = song2vec.wv.most_similar([str(int(mov))],topn=5)
        most_similar_films = np.array([int(el[0]) for el in most_similar],dtype=np.float32)
        most_similar_values = np.array([el[1] for el in most_similar],dtype=np.float32)

        rating = row.rating.astype(np.float32)
        
      
        return user,mov,most_similar_films,most_similar_values,rating
    
    
def collate_fn(data):
    """
       data: is a list of tuples with (example, label, length)
             where 'example' is a tensor of arbitrary shape
             and label/length are scalars
    """
    inputs = list(zip(data))
   # print(inputs)
    movies_win = pad_sequence([torch.from_numpy(x[0][0]) for x in inputs],batch_first=True)
    movies_nwin = pad_sequence([torch.from_numpy(x[0][1]) for x in inputs],batch_first=True)
    movie = pad_sequence([torch.from_numpy(x[0][2]) for x in inputs],batch_first=True)

    return movies_win,movies_nwin,movie


In [922]:
train_df.head(1)

Unnamed: 0,userId,movieId,rating,timestamp,rank,order
0,23061,1174,4.0,789652004,1,1


In [974]:
train_dataset = Dataset_(train_df)
train_loader = torch.utils.data.DataLoader(train_dataset,
                                             batch_size=256,
                                           num_workers=1,shuffle=True,
                                           #collate_fn=collate_fn
                                          )

dloaders = {'train' : train_loader}

In [975]:
class My_MF(nn.Module):
    
    def __init__(self,user_num,movie_num,av_rating):
        torch.manual_seed(0)
        super(My_MF, self).__init__()
        
        self.av_rating = av_rating
        
        self.user_num = user_num
        self.movie_num = movie_num       
        
        self.user_factors = nn.Embedding(int(user_num)+1,20)
        self.movie_factors = nn.Embedding(int(movie_num)+1,20)
        
        self.user_bias = nn.Embedding(int(user_num)+1,1)
        self.movie_bias = nn.Embedding(int(movie_num)+1,1)
        
        self.soft = nn.Softmax(-1)
        self.relu = nn.ReLU()

        
    def forward(self, user, movie,ms_films,ms_values):
        
        user = user.long()
        movie = movie.long()
        ms_films = ms_films.long()
        
      #  print(user.shape, ' is a user.shape')
      #  print(movie.shape, ' is a movie.shape')
        movie_embedded = self.movie_factors(movie)
        user_embedded = self.user_factors(user)
      #  print(movie_embedded.shape, ' is a movie_embded.shape')
      #  print(user_embedded.shape, ' is a user_embeded.shape')
        
        dot_product = (user_embedded*movie_embedded).sum(1)
        
        user_bias = self.user_bias(user).squeeze(1)
        movie_bias = self.movie_bias(movie).squeeze(1)
      #  print(user_bias.shape, ' is a shape of user_bias')
      #  print(movie_bias.shape, ' is a shape of movie_bias')
      #  print(dot_product.shape, ' is a shape of dot_product')

        ratings = self.relu(self.av_rating + dot_product + user_bias + movie_bias)
        
        

        reg_part = (movie_embedded**2).sum(1)+(user_embedded**2).sum(1)+user_bias**2+movie_bias**2
       # print(reg_part.shape,' is a shape of reg_part')
        
        ms_embedded = self.movie_factors(ms_films)
        #print(ms_embedded.shape,' is a shape of sim_embedded')
        movie_repeated = movie_embedded.unsqueeze(1).repeat(1,5,1)
       # print(movie_repeated.shape, ' is a shape of movie_repeated')
        
        dot_product = (ms_embedded *movie_repeated).sum(2)
        #print(dot_product.shape, ' is a shape of dot_product')
        
        sgns_together = ((ms_values - dot_product)**2).sum(1)
        
        #print(sgns_together.shape, 'is a shape of sgns_together')
        
        
        return ratings,reg_part,sgns_together
   



In [954]:
for i,(user,movie,ms_films,ms_values,ratings) in enumerate(dloaders['train']):
    break
#user_num = train_df.userId.max()
#movie_num = train_df.movieId.max()
mdl = My_MF(user_num,movie_num,round(train_df.rating.mean(),2))
mdl(user,movie,ms_films,ms_values)

(tensor([ 6.6948,  5.4440,  6.0956, 10.4667,  3.0703,  1.4207,  7.1854,  1.6279,
          6.1839,  1.0732,  4.5362,  0.0000,  8.8983,  0.0728,  0.0000,  2.5164,
          3.1798,  0.0000,  1.5259,  7.2059,  9.0423,  5.0519,  7.2067,  0.0000,
          0.0542, 10.5857,  0.0000,  3.4066,  3.2686,  3.8484, 12.3777,  4.7843],
        grad_fn=<ReluBackward0>),
 tensor([37.1897, 30.1479, 40.1374, 44.2055, 42.5742, 41.3942, 46.5888, 57.4594,
         33.2418, 38.9044, 45.0800, 46.1588, 48.4257, 41.8547, 44.4153, 25.9088,
         28.1288, 40.2833, 34.9534, 47.2979, 44.5763, 34.4927, 44.4972, 58.1054,
         50.5207, 48.6263, 36.1050, 35.6610, 41.0621, 59.6684, 49.0143, 56.8772],
        grad_fn=<AddBackward0>),
 tensor([223.2477, 201.0573,   8.1630,  20.9485,  52.3032,  52.4362,  62.7913,
         211.8257,  67.0896, 165.0832, 167.2347,  54.8258, 159.0081, 136.1177,
         104.9368,  13.9133,  41.9442,  40.6457, 212.6850, 203.3205,  59.8770,
          34.0348,  73.9590,  48.3511,  71.924

In [943]:
user_num,movie_num


(112465, 12387)

In [976]:
mdl = My_MF(user_num,movie_num,round(train_df.rating.mean(),2))


torch.set_grad_enabled(True)
optimizer = optim.SGD(mdl.parameters(), lr=1e-3)

lam = 0.025
alpha = 0.1

for epoch in range(1):

    optimizer.zero_grad()
    loss = 0
    rem = []

    for i,(user,movie,ms_films,ms_values,ratings) in enumerate(dloaders['train']):

        ratings_pred,reg_part,sgns = mdl(user,movie,ms_films,ms_values)

        loss += (((ratings.float() - ratings_pred)**2) +lam*reg_part+\
                alpha*sgns).mean()
        

        if i % 1 == 0 and i > 3:

            loss.backward()
            optimizer.step()
            rem.append(loss.item())
            optimizer.zero_grad()
            loss = 0
            
        if i % 20 == 19:
            print(np.mean(rem[-100:]))
            torch.save(mdl.state_dict(), 'models/my_als{}.pkl'.format(round(np.mean(rem[-100:]),0)))
           # print(mdl.movie_factors(torch.Tensor([295]).long()))

28.356381058692932
25.171711762746174
24.264972686767578
23.92687571676154
23.47640671332677
22.407146015167235
22.357571353912352
22.22559862136841
22.100231170654297
22.240929164886474
22.321729888916014
22.35167272567749
22.332426223754883
22.296225662231446
22.270177173614503
22.240743579864503
22.108447456359862
22.1380598449707
22.14838483810425
22.192457695007324
22.17739845275879
22.417572498321533
22.462370471954344
22.52979118347168
22.496917781829833
22.497417907714844
22.34098424911499
22.280719375610353
22.214986419677736
22.072260875701904
22.120595111846924
22.065850563049317
22.197732238769532
22.075315284729005
22.226713199615478
22.12259038925171
22.29818407058716
22.30522174835205
22.282417945861816
22.24903854370117
22.31714672088623
22.192077236175535
22.071703567504883
22.271631240844727
22.312079257965088
22.314600811004638
22.209635162353514
22.084919891357423
22.025267066955568
21.91879705429077
21.92141839981079
21.974021396636964
22.07798957824707
22.16113616

KeyboardInterrupt: 

In [956]:
mdl.user_factors(torch.Tensor([0]).long())

tensor([[-1.1258, -1.1524, -0.2506, -0.4339,  0.8487,  0.6920, -0.3160, -2.1152,
          0.3223, -1.2633,  0.3500,  0.3081,  0.1198,  1.2377,  1.1168, -0.2473,
         -1.3527, -1.6959,  0.5667,  0.7935]], grad_fn=<EmbeddingBackward>)

In [729]:
train_df.userId.max()+1,train_df.movieId.max()+1

(112466, 12388)

In [978]:
16000210/256/30/60

34.72267795138889