-
Notifications
You must be signed in to change notification settings - Fork 31
/
topN.R
482 lines (453 loc) · 23.3 KB
/
topN.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
process.inputs.topN <- function(model, obj, user=NULL, a_vec=NULL, a_bias=NULL,
n=10L, include=NULL, exclude=NULL, output_score=FALSE) {
if (!is.null(include) && !is.null(exclude))
stop("Can only pass one of 'include' or 'exclude'.")
output_score <- check.bool(output_score, "output_score")
n <- check.pos.int(n, "n", TRUE)
if (NROW(obj$info$item_mapping)) {
if (!is.null(include))
include <- as.integer(factor(include, obj$info$item_mapping))
if (!is.null(exclude))
exclude <- as.integer(factor(exclude, obj$info$item_mapping))
if (!is.null(user))
user <- as.integer(factor(user, obj$info$user_mapping))
}
if (is.null(a_bias))
a_bias <- 0.
if (!is.null(user)) {
if (("numeric" %in% class(user)) || ("character" %in% class(user)))
user <- as.integer(user)
user <- check.pos.int(user, "user", TRUE)
if (model != "MostPopular") {
m_max <- ifelse(model %in% c("OMF_explicit", "OMF_explicit", "ContentBased"),
NCOL(obj$matrices$Am), NCOL(obj$matrices$A))
if (user > m_max)
stop("'user' is outside the range of data passed to 'fit'.")
} else {
if (NROW(obj$matrices$user_bias)) {
if (user > NROW(obj$matrices$user_bias))
stop("'user' is outside the range of data passed to 'fit'.")
}
}
if (!(model %in% c("ContentBased", "MostPopular", "OMF_explicit", "OMF_implicit"))) {
a_vec <- obj$matrices$A[, user, drop = TRUE]
} else if (model != "MostPopular") {
a_vec <- obj$matrices$Am[, user, drop = TRUE]
}
if (NROW(obj$matrices$user_bias))
a_bias <- obj$matrices$user_bias[user]
}
if (!is.null(include)) {
if (NROW(intersect(class(include), c("numeric", "character", "matrix"))))
include <- as.integer(include)
if (!("integer" %in% class(include)))
stop("Invalid data type for 'include'.")
if (model != "MostPopular") {
if (max(include) > NCOL(obj$matrices$B))
stop("'include' contains element that were not passed to 'fit'.")
} else {
if (max(include) > NROW(obj$matrices$item_bias))
stop("'include' contains element that were not passed to 'fit'.")
}
include <- include - 1L
if (any(include < 0L) || anyNA(include))
stop("'include' contains invalid entries.")
if (NROW(include) < n)
stop("'n' is greater than the number of entries in 'include'.")
}
if (!is.null(exclude)) {
if (NROW(intersect(class(exclude), c("numeric", "character", "matrix"))))
exclude <- as.integer(exclude)
if (!("integer" %in% class(exclude)))
stop("Invalid data type for 'exclude'.")
if (model != "MostPopular") {
if (max(exclude) > NCOL(obj$matrices$B))
stop("'exclude' contains element that were not passed to 'fit'.")
} else {
if (max(exclude) > NROW(obj$matrices$item_bias))
stop("'exclude' contains element that were not passed to 'fit'.")
}
exclude <- exclude - 1L
if (any(exclude < 0L) || anyNA(exclude))
stop("'exclude' contains invalid entries.")
}
return(list(
n = n,
include = include,
exclude = exclude,
output_score = output_score,
a_vec = a_vec,
a_bias = a_bias
))
}
.topN <- function(model, obj, a_vec, a_bias=0.,
n=10L, include=NULL, exclude=NULL,
output_score=FALSE, reindex=TRUE) {
outp_ix <- integer(length = n)
outp_score <- numeric(length = ifelse(output_score, n, 0L))
if (model == "CMF") {
ret_code <- .Call("call_topN_old_collective_explicit",
a_vec, a_bias,
obj$matrices$B,
obj$matrices$item_bias,
obj$matrices$glob_mean,
obj$info$k, obj$info$k_user, obj$info$k_item, obj$info$k_main,
include,
exclude,
outp_ix, outp_score,
obj$info$n_orig, NCOL(obj$matrices$B),
obj$info$include_all_X, obj$info$nthreads)
} else if (model == "CMF_implicit") {
ret_code <- .Call("call_topN_old_collective_implicit",
a_vec,
obj$matrices$B,
obj$info$k, obj$info$k_user, obj$info$k_item, obj$info$k_main,
include,
exclude,
outp_ix, outp_score,
NCOL(obj$matrices$B), obj$info$nthreads)
} else if (model == "MostPopular") {
ret_code <- .Call("call_topN_old_most_popular",
as.logical(NROW(obj$matrices$user_bias)),
a_bias,
obj$matrices$item_bias,
obj$matrices$glob_mean,
include,
exclude,
outp_ix, outp_score,
NROW(obj$matrices$item_bias))
} else if (model == "ContentBased") {
ret_code <- .Call("call_topN_old_content_based",
a_vec, a_bias,
obj$matrices$Bm,
obj$matrices$item_bias,
obj$matrices$glob_mean,
obj$info$k,
include,
exclude,
outp_ix, outp_score,
NCOL(obj$matrices$Bm), obj$info$nthreads)
} else if (model == "OMF_explicit") {
ret_code <- .Call("call_topN_old_offsets_explicit",
a_vec, a_bias,
obj$matrices$Bm,
obj$matrices$item_bias,
obj$matrices$glob_mean,
obj$info$k, obj$info$k_sec, obj$info$k_main,
include,
exclude,
outp_ix, outp_score,
NCOL(obj$matrices$Bm), obj$info$nthreads)
} else if (model == "OMF_implicit") {
ret_code <- .Call("call_topN_old_offsets_implicit",
a_vec,
obj$matrices$Bm,
obj$info$k,
include,
exclude,
outp_ix, outp_score,
NCOL(obj$matrices$Bm), obj$info$nthreads)
} else {
stop("Unexpected error.")
}
outp_ix <- outp_ix + 1L
if (reindex && NROW(obj$info$item_mapping))
outp_ix <- obj$info$item_mapping[outp_ix]
if (output_score) {
return(list(item=outp_ix, score=outp_score))
} else {
return(outp_ix)
}
}
#' @export
#' @title Calulate top-N predictions for a new or existing user
#' @rdname topN
#' @description Determine top-ranked items for a user according to their predicted
#' values, among the items to which the model was fit.
#'
#' Can produce rankings for existing users (which where in the `X` data to which
#' the model was fit) through function `topN`, or for new users (which were not
#' in the `X` data to which the model was fit, but for which there is now new
#' data) through function `topN_new`, assuming there is either `X` data, `U` data,
#' or both (i.e. can do cold-start and warm-start rankings).
#'
#' For the \link{CMF} model, depending on parameter `include_all_X`, might recommend
#' items which had only side information if their predictions are high enough.
#'
#' For the \link{ContentBased} model, might be used to rank new items (not present
#' in the `X` or `I` data to which the model was fit) given their
#' `I` data, for new users given their `U` data. For the other models, will only
#' rank existing items (columns of the `X` to which the model was fit) - see
#' \link{predict_new_items} for an alternative for the other models.
#'
#' \bold{Important:} the model does not keep any copies of the original data, and
#' as such, it might recommend items that were already seen/rated/consumed by the
#' user. In order to avoid this, must manually pass the seen/rated/consumed entries
#' to the argument `exclude` (see details below).
#' @details Be aware that this function is multi-threaded. As such, if a large batch
#' of top-N predictions is to be calculated in parallel for different users
#' (through e.g. `mclapply` or similar), it's recommended to decrease the number
#' of threads in the model to 1 (e.g. `model$info$nthreads <- 1L`) and to set the
#' number of BLAS threads to 1 (through e.g. `RhpcBLASctl` or environment variables).
#' @param model A collective matrix factorization model from this package - see
#' \link{fit_models} for details.
#' @param user User (row of `X`) for which to rank items. If `X` to which the model
#' was fit was a `data.frame`, should pass an ID matching to the first column of `X`
#' (the user indices), otherwise should pass a row number for `X`, with numeration
#' starting at 1.
#'
#' This is optional for the \link{MostPopular} model, but must be passed for all others.
#'
#' For making recommendations about new users (that were not present in the `X` to
#' which the model was fit), should use `topN_new` and pass either `X` or `U` data.
#'
#' For example usage, see the main section \link{fit_models}.
#' @param n Number of top-predicted items to output.
#' @param include If passing this, will only make a ranking among the item IDs
#' provided here. See the documentation for `user` for how the IDs should be passed.
#' This should be an integer or character vector. Cannot be used together with `exclude`.
#' @param exclude If passing this, will exclude from the ranking all the item IDs
#' provided here. See the documentation for `user` for how the IDs should be passed.
#' This should be an integer or character vector. Cannot be used together with `exclude`.
#' @param output_score Whether to also output the predicted values, in addition
#' to the indices of the top-predicted items.
#' @param X `X` data for a new user for which to make recommendations,
#' either as a numeric vector (class `numeric`), or as
#' a sparse vector from package `Matrix` (class `dsparseVector`). If the `X` to
#' which the model was fit was a `data.frame`, the column/item indices will have
#' been reindexed internally, and the numeration can be found under
#' `model$info$item_mapping`. Alternatively, can instead pass the column indices
#' and values and let the model reindex them (see `X_col` and `X_val`).
#' Should pass at most one of `X` or `X_col`+`X_val`.
#'
#' Dense `X` data is not supported for `CMF_implicit` or `OMF_implicit`.
#' @param X_col `X` data for a new user for which to make recommendations,
#' in sparse vector format, with `X_col` denoting the
#' items/columns which are not missing. If the `X` to which the model was fit was
#' a `data.frame`, here should pass IDs matching to the second column of that `X`,
#' which will be reindexed internally. Otherwise, should have column indices with
#' numeration starting at 1 (passed as an integer vector).
#' Should pass at most one of `X` or `X_col`+`X_val`.
#' @param X_val `X` data for a new user for which to make recommendations,
#' in sparse vector format, with `X_val` denoting the
#' associated values to each entry in `X_col`
#' (should be a numeric vector of the same length as `X_col`).
#' Should pass at most one of `X` or `X_col`+`X_val`.
#' @param weight (Only for the explicit-feedback models)
#' Associated weight to each non-missing observation in `X`. Must have the same
#' number of entries as `X` - that is, if passing a dense vector of length `n`,
#' `weight` should be a numeric vector of length `n` too, if passing a sparse
#' vector, should have a lenght corresponding to the number of non-missing elements.
#' @param U `U` data for a new user for which to make recommendations,
#' either as a numeric vector (class `numeric`), or as a
#' sparse vector from package `Matrix` (class `dsparseVector`). Alternatively,
#' if `U` is sparse, can instead pass the indices of the non-missing columns
#' and their values separately (see `U_col`).
#' Should pass at most one of `U` or `U_col`+`U_val`.
#' @param U_col `U` data for a new user for which to make recommendations,
#' in sparse vector format, with `U_col` denoting the
#' attributes/columns which are not missing. Should have numeration starting at 1
#' (should be an integer vector).
#' Should pass at most one of `U` or `U_col`+`U_val`.
#' @param U_val `U` data for a new user for which to make recommendations,
#' in sparse vector format, with `U_val` denoting the
#' associated values to each entry in `U_col`
#' (should be a numeric vector of the same length as `U_col`).
#' Should pass at most one of `U` or `U_col`+`U_val`.
#' @param U_bin Binary columns of `U` for a new user for which to make recommendations,
#' on which a sigmoid transformation will be
#' applied. Should be passed as a numeric vector. Note that `U` and `U_bin` are
#' not mutually exclusive.
#' @param I (Only for the `ContentBased` model)
#' New `I` data to rank for the given user, with rows denoting new columns of the `X` matrix.
#' Can be passed in the following formats:\itemize{
#' \item A sparse COO/triplets matrix, either from package
#' `Matrix` (class `dgTMatrix`), or from package `SparseM` (class `matrix.coo`).
#' \item A sparse matrix in CSR format, either from package
#' `Matrix` (class `dgRMatrix`), or from package `SparseM` (class `matrix.csr`).
#' Passing the input as CSR is faster than COO as it will be converted internally.
#' \item A sparse row vector from package `Matrix` (class `dsparseVector`).
#' \item A dense matrix from base R (class `matrix`), with missing entries set as NA.
#' \item A dense vector from base R (class `numeric`).
#' \item A `data.frame`.
#' }
#' When passing `I`, the item indices in `include`, `exclude`, and in the resulting
#' output refer to rows of `I`, and the ranking will be made only among the
#' rows of `I` (that is, they will not be compared against the old `X` data).
#' @param exact (In the `OMF_explicit` model)
#' Whether to calculate `A` and `Am` with the regularization applied
#' to `A` instead of to `Am` (if using the L-BFGS method, this is how the model
#' was fit). This is usually a slower procedure.
#' @param ... Not used.
#' @return If passing `output_score=FALSE` (the default), will output the
#' indices of the top-predicted elements. If passing `output_score=TRUE`,
#' will pass a list with two elements:\itemize{
#' \item `item`: The indices of the top-predicted elements.
#' \item `score`: The predicted value for each corresponding element in `item`.
#' }
#' If the `X` to which the model was fit was a `data.frame` (and unless passing `I`),
#' the item indices will be taken from the same IDs in `X` (its second column) - but
#' be aware that in this case they will usually be returned as `character`.
#' Otherwise, will return the indices of the top-predicted columns of `X`
#' (or rows of `I` if passing it) with numeration starting at 1.
#' @seealso \link{factors_single} \link{predict.cmfrec} \link{predict_new}
topN <- function(model, user=NULL, n=10L, include=NULL, exclude=NULL, output_score=FALSE) {
supported_models <- c("CMF", "CMF_implicit",
"MostPopular", "ContentBased",
"OMF_implicit", "OMF_explicit")
if (!NROW(intersect(class(model), supported_models)))
stop("Invalid model object - supported classes: ", paste(supported_models, collapse=", "))
if (is.null(user) && !("MostPopular" %in% class(model)))
stop("'user' cannot be empty for this model.")
inputs <- process.inputs.topN(class(model)[1L], model,
user = user, n = n,
include = include, exclude = exclude,
output_score = output_score)
return(.topN(class(model)[1L], model,
a_vec = inputs$a_vec, a_bias = inputs$a_bias,
n = inputs$n, include = inputs$include, exclude = inputs$exclude,
output_score = inputs$output_score))
}
#' @export
#' @rdname topN
topN_new <- function(model, ...) {
UseMethod("topN_new")
}
#' @export
#' @rdname topN
topN_new.CMF <- function(model, X=NULL, X_col=NULL, X_val=NULL,
U=NULL, U_col=NULL, U_val=NULL, U_bin=NULL, weight=NULL,
n=10L, include=NULL, exclude=NULL,
output_score=FALSE, ...) {
inputs <- process.inputs.topN(class(model)[1L], model,
n = n,
include = include, exclude = exclude,
output_score = output_score)
factors <- factors_single.CMF(model = model, X = X, X_col = X_col, X_val = X_val,
weight = weight,
U = U, U_col = U_col, U_val = U_val, U_bin = U_bin,
output_bias = as.logical(NROW(model$matrices$user_bias)))
if (class(factors) == "list") {
a_vec <- factors$factors
a_bias <- factors$bias
} else {
a_vec <- factors
a_bias <- NULL
}
return(.topN(class(model)[1L], model,
a_vec = a_vec, a_bias = a_bias,
n = inputs$n, include = inputs$include, exclude = inputs$exclude,
output_score = inputs$output_score))
}
#' @export
#' @rdname topN
topN_new.CMF_implicit <- function(model, X=NULL, X_col=NULL, X_val=NULL,
U=NULL, U_col=NULL, U_val=NULL,
n=10L, include=NULL, exclude=NULL,
output_score=FALSE, ...) {
inputs <- process.inputs.topN(class(model)[1L], model,
n = n,
include = include, exclude = exclude,
output_score = output_score)
factors <- factors_single.CMF_implicit(model = model, X = X, X_col = X_col, X_val = X_val,
U = U, U_col = U_col, U_val = U_val)
return(.topN(class(model)[1L], model,
a_vec = factors,
n = inputs$n, include = inputs$include, exclude = inputs$exclude,
output_score = inputs$output_score))
}
#' @export
#' @rdname topN
topN_new.ContentBased <- function(model, U=NULL, U_col=NULL, U_val=NULL, I=NULL,
n=10L, include=NULL, exclude=NULL,
output_score=FALSE, ...) {
if (!is.null(I) && (!is.null(include) || !is.null(exclude)))
stop("Cannot pass 'include' or 'exclude' when passing 'I' data.")
if (is.null(I)) {
inputs <- process.inputs.topN(class(model)[1L], model,
n = n,
include = include, exclude = exclude,
output_score = output_score)
factors <- factors_single.ContentBased(model = model, U = U, U_col = U_col, U_val = U_val)
return(.topN(class(model)[1L], model,
a_vec = factors,
n = inputs$n, include = inputs$include, exclude = inputs$exclude,
output_score = inputs$output_score))
} else {
processed_U <- process.new.U.single(U, U_col, U_val, "U",
model$info$user_mapping, NCOL(model$matrices$C),
model$info$U_cols)
processed_I <- process.new.U(I, model$info$I_cols, NCOL(model$matrices$D), "I",
allow_sparse=TRUE, allow_null=FALSE,
allow_na=FALSE, exact_shapes=TRUE)
outp_ix <- integer(length = n)
outp_score <- numeric(length = ifelse(output_score, n, 0L))
ret_code <- .Call("call_topN_new_content_based",
model$info$k, processed_I$m,
processed_U$U, processed_U$p,
processed_U$U_val, processed_U$U_col,
processed_I$Uarr, processed_I$p,
processed_I$Urow, processed_I$Ucol, processed_I$Uval,
processed_I$Ucsr_p, processed_I$Ucsr_i, processed_I$Ucsr,
model$matrices$C, model$matrices$C_bias,
model$matrices$D, model$matrices$D_bias,
model$matrices$glob_mean,
outp_ix, outp_score,
model$info$nthreads)
check.ret.code(ret_code)
outp_ix <- outp_ix + 1L
if (!is.null(row.names(I))) {
outp_ix <- row.names(I)[outp_ix]
}
if (output_score) {
return(list(item=outp_ix, score=outp_score))
} else {
return(outp_ix)
}
}
}
#' @export
#' @rdname topN
topN_new.OMF_explicit <- function(model, X=NULL, X_col=NULL, X_val=NULL,
U=NULL, U_col=NULL, U_val=NULL, weight=NULL, exact=FALSE,
n=10L, include=NULL, exclude=NULL,
output_score=FALSE, ...) {
inputs <- process.inputs.topN(class(model)[1L], model,
n = n,
include = include, exclude = exclude,
output_score = output_score)
factors <- factors_single.OMF_explicit(model = model, X = X, X_col = X_col, X_val = X_val,
weight = weight,
U = U, U_col = U_col, U_val = U_val,
output_bias = as.logical(NROW(model$matrices$user_bias)),
output_A = FALSE,
exact = exact)
if (class(factors) == "list") {
a_vec <- factors$factors
a_bias <- factors$bias
} else {
a_vec <- factors
a_bias <- NULL
}
return(.topN(class(model)[1L], model,
a_vec = a_vec, a_bias = a_bias,
n = inputs$n, include = inputs$include, exclude = inputs$exclude,
output_score = inputs$output_score))
}
#' @export
#' @rdname topN
topN_new.OMF_implicit <- function(model, X=NULL, X_col=NULL, X_val=NULL,
U=NULL, U_col=NULL, U_val=NULL,
n=10L, include=NULL, exclude=NULL,
output_score=FALSE, ...) {
inputs <- process.inputs.topN(class(model)[1L], model,
n = n,
include = include, exclude = exclude,
output_score = output_score)
a_vec <- factors_single.CMF_implicit(model = model, X = X, X_col = X_col, X_val = X_val,
U = U, U_col = U_col, U_val = U_val)
return(.topN(class(model)[1L], model,
a_vec = a_vec,
n = inputs$n, include = inputs$include, exclude = inputs$exclude,
output_score = inputs$output_score))
}