In [6]:
import pandas as pd
import numpy as np
import scipy.sparse as sparse
import datetime

from implicit.als import AlternatingLeastSquares
from implicit.bpr import BayesianPersonalizedRanking
from implicit.lmf import LogisticMatrixFactorization
from implicit.approximate_als import NMSLibAlternatingLeastSquares
from implicit.nearest_neighbours import CosineRecommender
from implicit.nearest_neighbours import bm25_weight, tfidf_weight, normalize

from code.utils import make_matrix, apply_weights, precision_at_k

np.random.seed(42)
torch.manual_seed(42)

<torch._C.Generator at 0x11bcc6b50>

In [93]:
df_train = pd.read_csv('prepared_data/ml-100k/train.csv')
df_test = pd.read_csv('prepared_data/ml-100k/test.csv')
df_test_warm = pd.read_csv('prepared_data/ml-100k/test_warm.csv')

df_train.head()

Unnamed: 0,user_id,item_id,feedback
0,259,255,4
1,259,286,4
2,259,298,4
3,259,185,4
4,259,173,4


In [96]:
matrix = make_matrix(df_train, count=False)
matrix = apply_weights(matrix, weight='bm25')

## 1. Baselines

In [150]:
model_als = AlternatingLeastSquares(factors=50, regularization=0.00001, 
                                    iterations=50, num_threads=16,
                                    calculate_training_loss=True)

model_bpr = BayesianPersonalizedRanking(factors=50, regularization=0.00001, 
                                    iterations=200, num_threads=16)

model_lmf = LogisticMatrixFactorization(factors=50, regularization=0.00001, 
                                    iterations=200, neg_prop=30)

model_bm25 = CosineRecommender()


model_list = [model_als, model_bpr, model_lmf]
model_names = ['ALS', 'BPR', 'LMF']


result = pd.DataFrame(columns=['model', 'precision@1', 'precision@5', 'precision@10',
                               'precision@20', 'precision@50',
                               'fit_time', 'predict_time'])
result['model'] = model_names

for name, model in zip(model_names, model_list):
    
    t0 = datetime.datetime.now()
    
    model.fit(sparse.csr_matrix(matrix).T.tocsr(), show_progress=True)
    t1 = datetime.datetime.now()
    
    preds = model.recommend_all(sparse.csr_matrix(matrix).tocsr(), show_progress=False,
                               filter_already_liked_items=False, N=200)
    t2 = datetime.datetime.now()
    
    fitted_in = np.round((t1 - t0).total_seconds())
    predicted_in = np.round((t2 - t1).total_seconds())
    result.loc[result['model'] == name, 'fit_time'] = fitted_in
    result.loc[result['model'] == name, 'predict_time'] = predicted_in
    
    for k in [1, 5, 10, 20, 50]:
        precision = precision_at_k(preds, df_test_warm, matrix, k=k, warm=True)
        result.loc[result['model'] == name, 'precision@' + str(k)] = precision

HBox(children=(IntProgress(value=0, max=50), HTML(value='')))




TypeError: precision_at_k() got an unexpected keyword argument 'warm'

In [101]:
result.head(10)

Unnamed: 0,model,precision@1,precision@5,precision@10,precision@20,precision@50,fit_time,predict_time
0,ALS,6.74,4.49,4.94,4.72,4.31,3,0
1,BPR,6.74,5.39,4.27,4.66,4.22,5,0
2,LMF,3.37,5.17,4.49,4.27,4.54,10,0


In [102]:
result.to_csv('output/score_ml_100k.csv')

## 2. LightFM

In [106]:
from lightfm import LightFM
from lightfm.evaluation import precision_at_k

### 2.1 No cold start

In [206]:
# train matrix
df_test_warm_c = df_test_warm.copy()
df_test_warm_c['feedback'] = 0

df_train_warm = df_train.append(df_test_warm_c)
df_train_warm = df_train_warm.groupby(['user_id', 'item_id']).max().reset_index()

matrix = make_matrix(df_train_warm, count=False)
matrix = apply_weights(matrix, weight='bm25')

# test matrix
df_train_c = df_train.copy()
df_train_c['feedback'] = 0
df_train_c = df_train_c.append(df_test_warm)
df_train_c = df_train_c.groupby(['user_id', 'item_id']).max().reset_index()

matrix_test_warm = make_matrix(df_train_c, count=False)

# fitting

model_lightfm_warp = LightFM(no_components=30, learning_rate=0.001, loss='warp')
model_lightfm_bpr = LightFM(no_components=30, learning_rate=0.001, loss='bpr')

model_list = [model_lightfm_bpr, model_lightfm_warp]
model_names = ['LightFM (bpr loss), no features', 'LightFM (warp loss), no features',]

result = pd.DataFrame(columns=['model', 'precision@1', 'precision@5', 'precision@10',
                               'precision@20', 'precision@50',
                               'fit_time', 'predict_time'])
result['model'] = model_names

for name, model in zip(model_names, model_list):
    
    t0 = datetime.datetime.now()
    model.fit(sparse.csr_matrix(matrix).tocsr(), 
    #           user_features=user_features,
    #           item_features=item_features,
             epochs=20,
             num_threads=16)
    
    t1 = datetime.datetime.now()
    
    preds = model.predict_rank(sparse.csr_matrix(matrix_test_warm).tocsr())
    t2 = datetime.datetime.now()
    
    fitted_in = np.round((t1 - t0).total_seconds())
    predicted_in = np.round((t2 - t1).total_seconds())
    result.loc[result['model'] == name, 'fit_time'] = fitted_in
    result.loc[result['model'] == name, 'predict_time'] = predicted_in
    
    for k in [1, 5, 10, 20, 50]:
        precision = precision_at_k(model, 
                                test_interactions=sparse.csr_matrix(matrix_test_warm).tocsr(),
#                                 train_interactions=sparse.csr_matrix(matrix).tocsr(),
                                k=5).mean() * 100
        
        result.loc[result['model'] == name, 'precision@' + str(k)] = precision

In [207]:
result.head()

Unnamed: 0,model,precision@1,precision@5,precision@10,precision@20,precision@50,fit_time,predict_time
0,"LightFM (bpr loss), no features",14.382,14.382,14.382,14.382,14.382,2,0
1,"LightFM (warp loss), no features",16.1798,16.1798,16.1798,16.1798,16.1798,2,0


In [208]:
result.to_csv('output/score_lightfm_ml_100k.csv')

In [1]:
import pandas as pd

In [16]:
d1 = pd.read_csv('output/score_ml_1m.csv')
d1= d1[['model', 'precision@1', 'precision@5', 'fit_time', 'predict_time']]
# d2 = pd.read_csv('output/score_lightfm_ml_1m.csv')
# d1 = d1.append(d2)

d1 = d1.append(pd.DataFrame([['LightFM', 0.54, 1.02, 126.0, 2.0]], columns = ['model', 'precision@1', 'precision@5', 'fit_time', 'predict_time']))


d1.head(10)

Unnamed: 0,model,precision@1,precision@5,fit_time,predict_time
0,ALS,0.55,0.44,27.0,1.0
1,BPR,0.0,0.33,63.0,1.0
2,LMF,0.0,0.22,75.0,1.0
0,LightFM,0.54,1.02,126.0,2.0


## 3. GCN

https://pytorch-geometric.readthedocs.io/en/latest/notes/introduction.html  
https://pytorch-geometric.readthedocs.io/en/latest/modules/nn.html

## 4. HGE (Hierarchical Graph Embeddings)

In [None]:
# PArtially used code from https://github.com/HarshdeepGupta/recommender_pytorch/blob/master/MLP.py 

import torch
import torch.nn.functional as F
from torch.autograd import Variable
from torch import nn
from torch.utils.data import Dataset, DataLoader
torch.manual_seed(0)

# Workspace imports
from evaluate import evaluate_model
from Dataset import MovieLensDataset
from utils import train_one_epoch, test, plot_statistics

# Python imports
import argparse
from time import time
import numpy as np
import pickle

# CUDA for PyTorch
use_cuda = torch.cuda.is_available()
device = torch.device("cuda:0" if use_cuda else "cpu")
# cudnn.benchmark = True

path = 'raw_data/'
dataset = 'movielens'
EPOCHS = 30
BATCH_SIZE=256
LAYERS = [16,32,16,8]
WEIGHT_DECAY = 1e-5
NUM_NEG_TRAIN = 4  # Number of negative instances to pair with a positive instance while training
NUM_NEG_TEST = 100
LR = 1e-3
DROPOUT = 0.
LEARNER = 'adam' 
VERBOSE = 1
OUT = 1  # Whether to save a trained model


class HGE(nn.Module):

    def __init__(self, n_users, n_items, embedding_dim = 100, dropout=False):
        """
        Simple Feedforward network with Embeddings for users and items
        """
        super().__init__()

        # user and item embedding layers
        self.user_embedding = torch.nn.Embedding(n_users, embedding_dim)
        self.item_embedding = torch.nn.Embedding(n_items, embedding_dim)

    def forward(self, user_item_matrix):
        
        # Мб надо конертить user_item matrix в torch.tensor ..
        
        user_embedding = self.user_embedding(user_item_matrix)
        item_embedding = self.item_embedding(user_item_matrix.t())
        
        output = torch.matmul(user_embedding, item_embedding.t())
        output = torch.sigmoid(output)
        
        return output

    def predict(self, user_item_matrix):
        # return the score, inputs and outputs are numpy arrays
        
        output_scores = self.forward(user_item_matrix)
        return output_scores.cpu().detach().numpy()


def main():

    args = parse_args()
    path = args.path
    dataset = args.dataset
    layers = eval(args.layers)
    weight_decay = args.weight_decay
    num_negatives_train = args.num_neg_train
    num_negatives_test = args.num_neg_test
    dropout = args.dropout
    learner = args.learner
    learning_rate = args.lr
    batch_size = args.batch_size
    epochs = args.epochs
    verbose = args.verbose

    topK = 10
    print("MLP arguments: %s " % (args))
    # model_out_file = 'Pretrain/%s_MLP_%s_%d.h5' %(args.dataset, args.layers, time())

    # Load data

    t1 = time()
    full_dataset = MovieLensDataset(
        path + dataset, num_negatives_train=num_negatives_train, num_negatives_test=num_negatives_test)
    train, testRatings, testNegatives = full_dataset.trainMatrix, full_dataset.testRatings, full_dataset.testNegatives
    num_users, num_items = train.shape
    print("Load data done [%.1f s]. #user=%d, #item=%d, #train=%d, #test=%d"
          % (time()-t1, num_users, num_items, train.nnz, len(testRatings)))

    training_data_generator = DataLoader(
        full_dataset, batch_size=batch_size, shuffle=True, num_workers=0)

    # Build model
    model = MLP(num_users, num_items, layers=layers, dropout=dropout)
    # Transfer the model to GPU, if one is available
    model.to(device)
    if verbose:
        print(model)

    loss_fn = torch.nn.BCELoss()
    # Use Adam optimizer
    optimizer = torch.optim.Adam(model.parameters(), weight_decay=weight_decay)

    # Record performance
    hr_list = []
    ndcg_list = []
    BCE_loss_list = []

    # Check Init performance
    hr, ndcg = test(model, full_dataset, topK)
    hr_list.append(hr)
    ndcg_list.append(ndcg)
    BCE_loss_list.append(1)
    # do the epochs now

    for epoch in range(epochs):
        epoch_loss = train_one_epoch( model, training_data_generator, loss_fn, optimizer, epoch, device)

        if epoch % verbose == 0:
            hr, ndcg = test(model, full_dataset, topK)
            hr_list.append(hr)
            ndcg_list.append(ndcg)
            BCE_loss_list.append(epoch_loss)
            # if hr > best_hr:
            #     best_hr, best_ndcg, best_iter = hr, ndcg, epoch
            #     if args.out > 0:
            #         model.save(model_out_file, overwrite=True)
    print("hr for epochs: ", hr_list)
    print("ndcg for epochs: ", ndcg_list)
    print("loss for epochs: ", BCE_loss_list)
    # plot_statistics(hr_list, ndcg_list, BCE_loss_list,model.get_alias(), "./figs")
    # with open("metrics", 'wb') as fp:
    #     pickle.dump(hr_list, fp)
    #     pickle.dump(ndcg_list, fp)

    best_iter = np.argmax(np.array(hr_list))
    best_hr = hr_list[best_iter]
    best_ndcg = ndcg_list[best_iter]
    print("End. Best Iteration %d:  HR = %.4f, NDCG = %.4f. " %
          (best_iter, best_hr, best_ndcg))
    # if args.out > 0:
    #     print("The best MLP model is saved to %s" %(model_out_file))


if __name__ == "__main__":
    print("Device available: {}".format(device))
    main()