diff --git a/.devcontainer.json b/.devcontainer.json new file mode 100644 index 0000000..38513c5 --- /dev/null +++ b/.devcontainer.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ecb0091094ee4b48df877086e6398aadb578135cb33ed1a5fcdb65ba1a5c4f13 +size 521 diff --git a/.gitattributes b/.gitattributes index 2dc2f6b..824e01d 100644 --- a/.gitattributes +++ b/.gitattributes @@ -1,5 +1,2 @@ -*.pt filter=lfs diff=lfs merge=lfs -text -*.pickle filter=lfs diff=lfs merge=lfs -text -*.gz filter=lfs diff=lfs merge=lfs -text -*.json filter=lfs diff=lfs merge=lfs -text -*.npz filter=lfs diff=lfs merge=lfs -text +**/*.ipynb filter=clean-nbs +**/*.ipynb diff=ipynb \ No newline at end of file diff --git a/.gitconfig b/.gitconfig new file mode 100644 index 0000000..4f7288d --- /dev/null +++ b/.gitconfig @@ -0,0 +1,16 @@ +# Generated by nbdev_install_git_hooks +# +# If you need to disable this instrumentation do: +# git config --local --unset include.path +# +# To restore the filter +# git config --local include.path .gitconfig +# +# If you see notebooks not stripped, check that the filters are applied in .gitattributes +# +[filter "clean-nbs"] + clean = nbdev_clean_nbs --read_input_stream True + smudge = cat + required = true +[diff "ipynb"] + textconv = nbdev_clean_nbs --disp True --fname diff --git a/.github/workflows/main.yml b/.github/workflows/main.yml new file mode 100644 index 0000000..c2b264d --- /dev/null +++ b/.github/workflows/main.yml @@ -0,0 +1,33 @@ +name: CI +on: [push, pull_request] +jobs: + build: + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v1 + - uses: actions/setup-python@v1 + with: + python-version: '3.6' + architecture: 'x64' + - name: Install the library + run: | + pip install nbdev jupyter + pip install -e . + - name: Read all notebooks + run: | + nbdev_read_nbs + - name: Check if all notebooks are cleaned + run: | + echo "Check we are starting with clean git checkout" + if [ -n "$(git status -uno -s)" ]; then echo "git status is not clean"; false; fi + echo "Trying to strip out notebooks" + nbdev_clean_nbs + echo "Check that strip out was unnecessary" + git status -s # display the status to see which nbs need cleaning up + if [ -n "$(git status -uno -s)" ]; then echo -e "!!! Detected unstripped out notebooks\n!!!Remember to run nbdev_install_git_hooks"; false; fi + - name: Check if there is no diff library/notebooks + run: | + if [ -n "$(nbdev_diff_nbs)" ]; then echo -e "!!! Detected difference between the notebooks and the library"; false; fi + - name: Run tests + run: | + nbdev_test_nbs diff --git a/.gitignore b/.gitignore index b6e4761..1156813 100644 --- a/.gitignore +++ b/.gitignore @@ -1,3 +1,5 @@ +# Data files in default data dir +dat/ # Byte-compiled / optimized / DLL files __pycache__/ *.py[cod] diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md new file mode 100644 index 0000000..e6709be --- /dev/null +++ b/CONTRIBUTING.md @@ -0,0 +1,36 @@ +# How to contribute +We welcome all contributions, such as wrappers for new frameworks (e.g. TensorFlow), better docs, and new datasets that could fit this framework. + +## How to get started +This repository uses the [nbdev framework](https://github.com/fastai/nbdev/) to build and maintain the codebase; a fairly quick intro can be found at [https://github.com/fastai/nbdev/](https://github.com/fastai/nbdev/). +The main idea is that you develop the code in Jupyter notebooks, and that everything in the subdirectory `recsys_slates_dataset` is generated from them.
+Installation of the nbdev package is needed to generate these files: `pip install nbdev` or `conda install -c fastai nbdev`. +Before anything else, please install the git hooks that run automatic scripts during each commit and merge to strip the notebooks of superfluous metadata (and avoid merge conflicts). After cloning the repository, run the following command inside it: +``` +nbdev_install_git_hooks +``` + +## Did you find a bug? + +* Ensure the bug was not already reported by searching on GitHub under Issues. +* If you're unable to find an open issue addressing the problem, open a new one. Be sure to include a title and clear description, as much relevant information as possible, and a code sample or an executable test case demonstrating the expected behavior that is not occurring. +* Be sure to add the complete error messages. + +#### Did you write a patch that fixes a bug? + +* Open a new GitHub pull request with the patch. +* Ensure that your PR includes a test that fails without your patch, and passes with it. +* Ensure the PR description clearly describes the problem and solution. Include the relevant issue number if applicable. + +## PR submission guidelines + +* Keep each PR focused. While it's more convenient, do not combine several unrelated fixes together. Create as many branches as needed to keep each PR focused. +* Do not mix style changes/fixes with "functional" changes. Such PRs are very difficult to review and will most likely get rejected. +* Do not add/remove vertical whitespace. Preserve the original style of the file you edit as much as you can. +* Do not turn an already submitted PR into your development playground. If, after you have submitted a PR, you discover that more work is needed, close the PR, do the required work and then submit a new PR. Otherwise each of your commits requires attention from the maintainers of the project. +* If, however, you submitted a PR and received a request for changes, you should proceed with commits inside that PR, so that the maintainer can see the incremental fixes and won't need to review the whole PR again. In the exceptional case where you realize it will take many commits to complete the requests, it's probably best to close the PR, do the work and then submit it again. Use common sense where you'd choose one way over another. + +## Do you want to contribute to the documentation? + +* Docs are automatically created from the notebooks in the repository. + diff --git a/LICENSE b/LICENSE new file mode 100644 index 0000000..261eeb9 --- /dev/null +++ b/LICENSE @@ -0,0 +1,201 @@ + Apache License + Version 2.0, January 2004 + http://www.apache.org/licenses/ + + TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION + + 1. Definitions. + + "License" shall mean the terms and conditions for use, reproduction, + and distribution as defined by Sections 1 through 9 of this document. + + "Licensor" shall mean the copyright owner or entity authorized by + the copyright owner that is granting the License. + + "Legal Entity" shall mean the union of the acting entity and all + other entities that control, are controlled by, or are under common + control with that entity. For the purposes of this definition, + "control" means (i) the power, direct or indirect, to cause the + direction or management of such entity, whether by contract or + otherwise, or (ii) ownership of fifty percent (50%) or more of the + outstanding shares, or (iii) beneficial ownership of such entity.
+ + "You" (or "Your") shall mean an individual or Legal Entity + exercising permissions granted by this License. + + "Source" form shall mean the preferred form for making modifications, + including but not limited to software source code, documentation + source, and configuration files. + + "Object" form shall mean any form resulting from mechanical + transformation or translation of a Source form, including but + not limited to compiled object code, generated documentation, + and conversions to other media types. + + "Work" shall mean the work of authorship, whether in Source or + Object form, made available under the License, as indicated by a + copyright notice that is included in or attached to the work + (an example is provided in the Appendix below). + + "Derivative Works" shall mean any work, whether in Source or Object + form, that is based on (or derived from) the Work and for which the + editorial revisions, annotations, elaborations, or other modifications + represent, as a whole, an original work of authorship. For the purposes + of this License, Derivative Works shall not include works that remain + separable from, or merely link (or bind by name) to the interfaces of, + the Work and Derivative Works thereof. + + "Contribution" shall mean any work of authorship, including + the original version of the Work and any modifications or additions + to that Work or Derivative Works thereof, that is intentionally + submitted to Licensor for inclusion in the Work by the copyright owner + or by an individual or Legal Entity authorized to submit on behalf of + the copyright owner. For the purposes of this definition, "submitted" + means any form of electronic, verbal, or written communication sent + to the Licensor or its representatives, including but not limited to + communication on electronic mailing lists, source code control systems, + and issue tracking systems that are managed by, or on behalf of, the + Licensor for the purpose of discussing and improving the Work, but + excluding communication that is conspicuously marked or otherwise + designated in writing by the copyright owner as "Not a Contribution." + + "Contributor" shall mean Licensor and any individual or Legal Entity + on behalf of whom a Contribution has been received by Licensor and + subsequently incorporated within the Work. + + 2. Grant of Copyright License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + copyright license to reproduce, prepare Derivative Works of, + publicly display, publicly perform, sublicense, and distribute the + Work and such Derivative Works in Source or Object form. + + 3. Grant of Patent License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + (except as stated in this section) patent license to make, have made, + use, offer to sell, sell, import, and otherwise transfer the Work, + where such license applies only to those patent claims licensable + by such Contributor that are necessarily infringed by their + Contribution(s) alone or by combination of their Contribution(s) + with the Work to which such Contribution(s) was submitted. 
If You + institute patent litigation against any entity (including a + cross-claim or counterclaim in a lawsuit) alleging that the Work + or a Contribution incorporated within the Work constitutes direct + or contributory patent infringement, then any patent licenses + granted to You under this License for that Work shall terminate + as of the date such litigation is filed. + + 4. Redistribution. You may reproduce and distribute copies of the + Work or Derivative Works thereof in any medium, with or without + modifications, and in Source or Object form, provided that You + meet the following conditions: + + (a) You must give any other recipients of the Work or + Derivative Works a copy of this License; and + + (b) You must cause any modified files to carry prominent notices + stating that You changed the files; and + + (c) You must retain, in the Source form of any Derivative Works + that You distribute, all copyright, patent, trademark, and + attribution notices from the Source form of the Work, + excluding those notices that do not pertain to any part of + the Derivative Works; and + + (d) If the Work includes a "NOTICE" text file as part of its + distribution, then any Derivative Works that You distribute must + include a readable copy of the attribution notices contained + within such NOTICE file, excluding those notices that do not + pertain to any part of the Derivative Works, in at least one + of the following places: within a NOTICE text file distributed + as part of the Derivative Works; within the Source form or + documentation, if provided along with the Derivative Works; or, + within a display generated by the Derivative Works, if and + wherever such third-party notices normally appear. The contents + of the NOTICE file are for informational purposes only and + do not modify the License. You may add Your own attribution + notices within Derivative Works that You distribute, alongside + or as an addendum to the NOTICE text from the Work, provided + that such additional attribution notices cannot be construed + as modifying the License. + + You may add Your own copyright statement to Your modifications and + may provide additional or different license terms and conditions + for use, reproduction, or distribution of Your modifications, or + for any such Derivative Works as a whole, provided Your use, + reproduction, and distribution of the Work otherwise complies with + the conditions stated in this License. + + 5. Submission of Contributions. Unless You explicitly state otherwise, + any Contribution intentionally submitted for inclusion in the Work + by You to the Licensor shall be under the terms and conditions of + this License, without any additional terms or conditions. + Notwithstanding the above, nothing herein shall supersede or modify + the terms of any separate license agreement you may have executed + with Licensor regarding such Contributions. + + 6. Trademarks. This License does not grant permission to use the trade + names, trademarks, service marks, or product names of the Licensor, + except as required for reasonable and customary use in describing the + origin of the Work and reproducing the content of the NOTICE file. + + 7. Disclaimer of Warranty. 
Unless required by applicable law or + agreed to in writing, Licensor provides the Work (and each + Contributor provides its Contributions) on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or + implied, including, without limitation, any warranties or conditions + of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A + PARTICULAR PURPOSE. You are solely responsible for determining the + appropriateness of using or redistributing the Work and assume any + risks associated with Your exercise of permissions under this License. + + 8. Limitation of Liability. In no event and under no legal theory, + whether in tort (including negligence), contract, or otherwise, + unless required by applicable law (such as deliberate and grossly + negligent acts) or agreed to in writing, shall any Contributor be + liable to You for damages, including any direct, indirect, special, + incidental, or consequential damages of any character arising as a + result of this License or out of the use or inability to use the + Work (including but not limited to damages for loss of goodwill, + work stoppage, computer failure or malfunction, or any and all + other commercial damages or losses), even if such Contributor + has been advised of the possibility of such damages. + + 9. Accepting Warranty or Additional Liability. While redistributing + the Work or Derivative Works thereof, You may choose to offer, + and charge a fee for, acceptance of support, warranty, indemnity, + or other liability obligations and/or rights consistent with this + License. However, in accepting such obligations, You may act only + on Your own behalf and on Your sole responsibility, not on behalf + of any other Contributor, and only if You agree to indemnify, + defend, and hold each Contributor harmless for any liability + incurred by, or claims asserted against, such Contributor by reason + of your accepting any such warranty or additional liability. + + END OF TERMS AND CONDITIONS + + APPENDIX: How to apply the Apache License to your work. + + To apply the Apache License to your work, attach the following + boilerplate notice, with the fields enclosed by brackets "[]" + replaced with your own identifying information. (Don't include + the brackets!) The text should be enclosed in the appropriate + comment syntax for the file format. We also recommend that a + file or class name and description of purpose be included on the + same "printed page" as the copyright notice for easier + identification within third-party archives. + + Copyright [yyyy] [name of copyright owner] + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. 
diff --git a/MANIFEST.in b/MANIFEST.in new file mode 100644 index 0000000..5c0e7ce --- /dev/null +++ b/MANIFEST.in @@ -0,0 +1,5 @@ +include settings.ini +include LICENSE +include CONTRIBUTING.md +include README.md +recursive-exclude * __pycache__ diff --git a/Makefile b/Makefile new file mode 100644 index 0000000..d15af43 --- /dev/null +++ b/Makefile @@ -0,0 +1,37 @@ +.ONESHELL: +SHELL := /bin/bash +SRC = $(wildcard ./*.ipynb) + +all: recsys_slates_dataset docs + +recsys_slates_dataset: $(SRC) + nbdev_build_lib + touch recsys_slates_dataset + +sync: + nbdev_update_lib + +docs_serve: docs + cd docs && bundle exec jekyll serve + +docs: $(SRC) + nbdev_build_docs + touch docs + +test: + nbdev_test_nbs + +release: pypi conda_release + nbdev_bump_version + +conda_release: + fastrelease_conda_package + +pypi: dist + twine upload --repository pypi dist/* + +dist: clean + python setup.py sdist bdist_wheel + +clean: + rm -rf dist \ No newline at end of file diff --git a/README.md b/README.md index 5dcbf56..0ad70bf 100644 --- a/README.md +++ b/README.md @@ -1,5 +1,8 @@ -# FINN.no Recommender Systems Slate Dataset -We release the *FINN.no recommender systems slate dataset* to improve recommender systems research. +# FINN.no Slate Dataset for Recommender Systems +> Data and helper functions for the FINN.no slate dataset, containing both viewed items and clicks from the FINN.no second-hand marketplace. + + +We release the *FINN.no slate dataset* to improve recommender systems research. The dataset includes both search and recommendation interactions between users and the platform over a 30-day period. The dataset has logged both exposures and clicks, *including interactions where the user did not click on any of the items in the slate*. To our knowledge, no such large-scale dataset exists, and we hope this contribution can help researchers construct improved models and better offline evaluation metrics. @@ -14,20 +17,34 @@ The dataset consists of 37.4 million interactions, |U| ≈ 2.3 million users and |I| ≈ 1.3 million items that belong to one of G = 290 item groups. For a detailed description of the data, please see the paper. FINN.no is the leading marketplace in the Norwegian classifieds market and provides users with a platform to buy and sell general merchandise, cars, real estate, as well as house rentals and job offerings. For questions, email simen.eide@finn.no or file an issue. +## Install + +`pip install recsys_slates_dataset` + +## How to use + +To download the generic numpy data files: + +``` +from recsys_slates_dataset import datahelper +datahelper.download_data_files(data_dir="data") +``` + +Download and prepare data into ready-to-use pytorch dataloaders: + +``` python +from recsys_slates_dataset import dataset_torch +ind2val, itemattr, dataloaders = dataset_torch.load_dataloaders(data_dir="data") +``` + ## Organization The repository is organized as follows: -- The dataset is placed in (`data/`). -- The code open sourced from the article ["Dynamic Slate Recommendation with Gated Recurrent Units and Thompson Sampling"](https://arxiv.org/abs/2104.15046) is found in (`code/`). However, we are in the process of making the data more generally available which makes the code incompatible with the current (newer) version of the data. Please use [the v1.0 release of the repository](https://github.com/finn-no/recsys-slates-dataset/tree/v1.0) for a compatible version of the code and dataset.
- -## Download and prepare dataset -The data files can either be obtained by cloning this repository with git lfs, or (preferably) use the [datahelper.download_data_files()](https://github.com/finn-no/recsys-slates-dataset/blame/transform-to-numpy-arrays/datahelper.py#L3) function which downloads the same dataset from google drive. -For pytorch users, they can directly use the `dataset_torch.load_dataloaders()` to get ready-to-use dataloaders for training, validation and test datasets. +- The dataset is placed in `data/` and stored using git-lfs. We also provide an automatic download function in the pip package (the preferred usage). +- The code open-sourced with the article ["Dynamic Slate Recommendation with Gated Recurrent Units and Thompson Sampling"](https://arxiv.org/abs/2104.15046) is found in `code_eide_et_al21/`. However, we are in the process of making the data more generally available, which makes that code incompatible with the current (newer) version of the data. Please use [the v1.0 release of the repository](https://github.com/finn-no/recsys-slates-dataset/tree/v1.0) for a compatible version of the code and dataset. -## Quickstart dataset [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/finn-no/recsys-slates-dataset/blob/master/quickstart-finn-recsys-slate-data.ipynb) +## Quickstart dataset [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/finn-no/recsys-slates-dataset/blob/master/examples/quickstart-finn-recsys-slate-data.ipynb) We provide a quickstart jupyter notebook that runs on Google Colab (quickstart-finn-recsys-slate-data.ipynb) which includes all necessary steps above. - -NB: This quickstart notebook is currently incompatible with the main branch. -We will update the notebook as soon as we have published a pip-package. In the meantime, please use [the v1.0 release of the repository](https://github.com/finn-no/recsys-slates-dataset/tree/v1.0) +It gives a quick introduction to how to use the dataset. ## Citations This repository accompanies the paper ["Dynamic Slate Recommendation with Gated Recurrent Units and Thompson Sampling"](https://arxiv.org/abs/2104.15046) by Simen Eide, David S. Leslie and Arnoldo Frigessi. @@ -46,14 +63,15 @@ If you use either the code, data or paper, please consider citing the paper. } ``` -# Todo +## Todo This repository is currently *work in progress*, and we will provide descriptions and tutorials. Suggestions and contributions to make the material more available are welcome. There are some features of the repository that we are working on: - [x] Release the dataset as numpy objects instead of pytorch arrays. This will help non-pytorch users to more easily utilize the data - [x] Maintain a pytorch dataset for easy usage -- [ ] Create a pip package for easier installation and usage. the package should download the dataset using a function. -- [ ] Make the quickstart guide compatible with the pip package and numpy format. +- [x] Create a pip package for easier installation and usage. The package should download the dataset using a function. +- [x] Make the quickstart guide compatible with the pip package and numpy format. +- [ ] Git LFS is currently broken: some lines in .gitattributes that conflicted with nbdev were removed. The dataset is still usable via the built-in download functions, as they use a different source. However, we should fix this.
An issue is [posted on nbdev](https://github.com/fastai/nbdev/issues/506). - [ ] Add easily useable functions that compute relevant metrics such as hitrate, log-likelihood etc. - [ ] Distribute the data on other platforms such as kaggle. - [ ] Add a short description of the data in the readme.md directly. diff --git a/code/README.md b/code_eide_et_al21/README.md similarity index 100% rename from code/README.md rename to code_eide_et_al21/README.md diff --git a/code/agents.py b/code_eide_et_al21/agents.py similarity index 100% rename from code/agents.py rename to code_eide_et_al21/agents.py diff --git a/code/analysis.py b/code_eide_et_al21/analysis.py similarity index 100% rename from code/analysis.py rename to code_eide_et_al21/analysis.py diff --git a/code/ax_client_gru-flat.json b/code_eide_et_al21/ax_client_gru-flat.json similarity index 100% rename from code/ax_client_gru-flat.json rename to code_eide_et_al21/ax_client_gru-flat.json diff --git a/code/ax_client_gru-hier.json b/code_eide_et_al21/ax_client_gru-hier.json similarity index 100% rename from code/ax_client_gru-hier.json rename to code_eide_et_al21/ax_client_gru-hier.json diff --git a/code/ax_client_gru-unicand.json b/code_eide_et_al21/ax_client_gru-unicand.json similarity index 100% rename from code/ax_client_gru-unicand.json rename to code_eide_et_al21/ax_client_gru-unicand.json diff --git a/code/ax_client_linear-hier.json b/code_eide_et_al21/ax_client_linear-hier.json similarity index 100% rename from code/ax_client_linear-hier.json rename to code_eide_et_al21/ax_client_linear-hier.json diff --git a/code/config.yml b/code_eide_et_al21/config.yml similarity index 100% rename from code/config.yml rename to code_eide_et_al21/config.yml diff --git a/code/config_default.yml b/code_eide_et_al21/config_default.yml similarity index 100% rename from code/config_default.yml rename to code_eide_et_al21/config_default.yml diff --git a/code/dataset.py b/code_eide_et_al21/dataset.py similarity index 100% rename from code/dataset.py rename to code_eide_et_al21/dataset.py diff --git a/code/environment.yml b/code_eide_et_al21/environment.yml similarity index 100% rename from code/environment.yml rename to code_eide_et_al21/environment.yml diff --git a/code/hypertune.py b/code_eide_et_al21/hypertune.py similarity index 100% rename from code/hypertune.py rename to code_eide_et_al21/hypertune.py diff --git a/code/models.py b/code_eide_et_al21/models.py similarity index 100% rename from code/models.py rename to code_eide_et_al21/models.py diff --git a/code/pyrotrainer.py b/code_eide_et_al21/pyrotrainer.py similarity index 100% rename from code/pyrotrainer.py rename to code_eide_et_al21/pyrotrainer.py diff --git a/code/run_hypertune.sh b/code_eide_et_al21/run_hypertune.sh similarity index 100% rename from code/run_hypertune.sh rename to code_eide_et_al21/run_hypertune.sh diff --git a/code/train.py b/code_eide_et_al21/train.py similarity index 100% rename from code/train.py rename to code_eide_et_al21/train.py diff --git a/code/utils.py b/code_eide_et_al21/utils.py similarity index 100% rename from code/utils.py rename to code_eide_et_al21/utils.py diff --git a/conda/recsys_slates_dataset/meta.yaml b/conda/recsys_slates_dataset/meta.yaml new file mode 100644 index 0000000..1d24b66 --- /dev/null +++ b/conda/recsys_slates_dataset/meta.yaml @@ -0,0 +1,45 @@ +package: + name: recsys_slates_dataset + version: 0.0.2 +source: + sha256: 3c76c2f266cd816f9954a7b3e6dcc060278123caf079061086b53c05c35e2e12 + url: 
https://files.pythonhosted.org/packages/1d/09/19861b503280f7936990b39c27354445639d65d3a108eaba867003cf1021/recsys_slates_dataset-0.0.2.tar.gz +about: + dev_url: http://opensource.finn.no + doc_url: http://opensource.finn.no + home: http://opensource.finn.no + license: Apache Software + license_family: APACHE + summary: Recommender Systems Dataset from FINN.no containing the presented items + and whether and what the user clicked on. +build: + noarch: python + number: '0' + script: '{{ PYTHON }} -m pip install . -vv' +extra: + recipe-maintainers: + - finn-no +requirements: + host: + - pip + - python + - packaging + - requests>=2.25.1 + - numpy>=1.19.5 + - pandas>=1.0.5 + - torch>=1.6.0 + - PyYAML==5.4.1 + - googledrivedownloader==0.4 + run: + - pip + - python + - packaging + - requests>=2.25.1 + - numpy>=1.19.5 + - pandas>=1.0.5 + - torch>=1.6.0 + - PyYAML==5.4.1 + - googledrivedownloader==0.4 +test: + imports: + - recsys_slates_dataset diff --git a/datahelper.ipynb b/datahelper.ipynb new file mode 100644 index 0000000..2a1d6fc --- /dev/null +++ b/datahelper.ipynb @@ -0,0 +1,76 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "#default_exp datahelper" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# datahelper\n", + "\n", + "> Generic helper functions that are independent of the specific machine learning library used" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "#export\n", + "import logging\n", + "from google_drive_downloader import GoogleDriveDownloader as gdd\n", + "def download_data_files(data_dir : str = \"data\", overwrite=False, progbar=True, use_int32=True):\n", + " \"\"\"\n", + " Downloads the data from google drive.\n", + "\n", + " - data_dir: relative path to where data is downloaded.\n", + " - overwrite: If files exist they will not be downloaded again. NB/todo: the function does not check if there is a complete download of the file.\n", + " - progbar: simple progress bar that shows how much data has been downloaded for each file.\n", + " - use_int32: The interaction data is a very large file and may not fit into memory in some cases (e.g. Google Colab).
Therefore, we recommend using the int32 data type when loading the data.\n", + " \"\"\"\n", + "\n", + " if use_int32:\n", + " data_fileid = '1XHqyk01qi9qnvBTfWWwqgDzrdjv1eBVV'\n", + " else:\n", + " data_fileid = '1VXKXIvPCJ7z4BCa4G_5-Q2XMAD7nXOc7'\n", + " \n", + " gdrive_file_ids = {\n", + " 'data.npz' : data_fileid,\n", + " 'ind2val.json' : '1WOCKfuttMacCb84yQYcRjxjEtgPp6F4N',\n", + " 'itemattr.npz' : '1rKKyMQZqWp8vQ-Pl1SeHrQxzc5dXldnR'\n", + " }\n", + " \n", + " for filename, gdrive_id in gdrive_file_ids.items():\n", + " logging.info(\"Downloading {}\".format(filename))\n", + " gdd.download_file_from_google_drive(file_id=gdrive_id,\n", + " dest_path=\"{}/{}\".format(data_dir, filename),\n", + " overwrite=overwrite, showsize=progbar)\n", + " logging.info(\"Done downloading all files.\")\n", + " return True\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3.8.8 64-bit ('base': conda)", + "name": "python3" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/datahelper.py b/datahelper.py deleted file mode 100644 index 60d7a62..0000000 --- a/datahelper.py +++ /dev/null @@ -1,19 +0,0 @@ -import logging -from google_drive_downloader import GoogleDriveDownloader as gdd -def download_data_files(data_dir : str = "data", overwrite=False): - """ - Downloads the data from google drive. - If files exist they will not be downloaded again unless overwrite=True - """ - gdrive_file_ids = { - 'data.npz' : '1VXKXIvPCJ7z4BCa4G_5-Q2XMAD7nXOc7', - 'ind2val.json' : '1WOCKfuttMacCb84yQYcRjxjEtgPp6F4N', - 'itemattr.npz' : '1rKKyMQZqWp8vQ-Pl1SeHrQxzc5dXldnR' - } - for filename, gdrive_id in gdrive_file_ids.items(): - logging.info("Downloading {}".format(filename)) - gdd.download_file_from_google_drive(file_id=gdrive_id, - dest_path="{}/{}".format(data_dir, filename), - overwrite=overwrite) - logging.info("Done downloading all files.") - return True diff --git a/dataset_torch.ipynb b/dataset_torch.ipynb new file mode 100644 index 0000000..5e0d33c --- /dev/null +++ b/dataset_torch.ipynb @@ -0,0 +1,163 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "#default_exp dataset_torch" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# dataset_torch\n", + "\n", + "> Module to load the slates dataset into a Pytorch Dataset and Dataloaders with default train/valid test splits." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "#export\n", + "import torch\n", + "import recsys_slates_dataset.datahelper as datahelper\n", + "from torch.utils.data import Dataset, DataLoader\n", + "import torch\n", + "import json\n", + "import numpy as np\n", + "import logging\n", + "logging.basicConfig(format='%(asctime)s %(message)s', level='INFO')\n", + "\n", + "class SequentialDataset(Dataset):\n", + " '''\n", + " Note: displayType has been uncommented for future easy implementation.\n", + " '''\n", + " def __init__(self, data, sample_uniform_slate=False):\n", + "\n", + " self.data = data\n", + " self.num_items = self.data['slate'].max()+1\n", + " self.sample_uniform_slate = sample_uniform_slate\n", + " logging.info(\n", + " \"Loading dataset with slate size={} and uniform candidate sampling={}\"\n", + " .format(self.data['slate'].size(), self.sample_uniform_slate))\n", + "\n", + " def __getitem__(self, idx):\n", + " batch = {key: val[idx] for key, val in self.data.items()}\n", + "\n", + " if self.sample_uniform_slate:\n", + " # Sample actions uniformly:\n", + " action = torch.randint_like(batch['slate'], low=3, high=self.num_items)\n", + " \n", + " # Add noclick action at pos0 \n", + " # and the actual click action at pos 1 (unless noclick):\n", + " action[:,0] = 1\n", + " clicked = batch['click']!=1\n", + " action[:,1][clicked] = batch['click'][clicked]\n", + " batch['slate'] = action\n", + " # Set click idx to 0 if noclick, and 1 otherwise:\n", + " batch['click_idx'] = clicked.long()\n", + " \n", + " return batch\n", + "\n", + " def __len__(self):\n", + " return len(self.data['click'])" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "#export\n", + "def load_dataloaders(data_dir= \"dat\",\n", + " batch_size=1024,\n", + " num_workers= 0,\n", + " sample_uniform_slate=False,\n", + " valid_pct= 0.05,\n", + " test_pct= 0.05,\n", + " t_testsplit= 5):\n", + " \"\"\"\n", + " Loads pytorch dataloaders to be used in training. If used with standard settings, the train/val/test split is equivalent to Eide et. al. 
2021 \n", + " \"\"\"\n", + " \n", + " logging.info(\"Download data if not in data folder..\")\n", + " datahelper.download_data_files(data_dir=data_dir)\n", + "\n", + " logging.info('Load data..')\n", + " with np.load(\"{}/data.npz\".format(data_dir)) as data_np:\n", + " data = {key: torch.tensor(val) for key, val in data_np.items()}\n", + " dataset = SequentialDataset(data, sample_uniform_slate)\n", + " \n", + " with open('{}/ind2val.json'.format(data_dir), 'rb') as handle:\n", + " # Use string2int object_hook found here: https://stackoverflow.com/a/54112705\n", + " ind2val = json.load(\n", + " handle, \n", + " object_hook=lambda d: {\n", + " int(k) if k.lstrip('-').isdigit() else k: v \n", + " for k, v in d.items()\n", + " }\n", + " )\n", + "\n", + " # Split dataset into train, validation and test:\n", + " num_validusers = int(len(dataset) * valid_pct)\n", + " num_testusers = int(len(dataset) * test_pct)\n", + " torch.manual_seed(0)\n", + " num_users = len(dataset)\n", + " perm_user = torch.randperm(num_users)\n", + " valid_user_idx = perm_user[:num_validusers]\n", + " test_user_idx = perm_user[num_validusers:(num_validusers+num_testusers)]\n", + " train_user_idx = perm_user[(num_validusers+num_testusers):]\n", + " # Mask type: 1: train, 2: valid, 3: test\n", + " dataset.data['mask_type'] = torch.ones_like(dataset.data['click'])\n", + " dataset.data['mask_type'][valid_user_idx, t_testsplit:] = 2\n", + " dataset.data['mask_type'][test_user_idx, t_testsplit:] = 3\n", + "\n", + " subsets = {\n", + " 'train': dataset, \n", + " 'valid': torch.utils.data.Subset(dataset, valid_user_idx),\n", + " 'test': torch.utils.data.Subset(dataset, test_user_idx)\n", + " }\n", + "\n", + " # Build dataloaders for each data subset:\n", + " dataloaders = {\n", + " phase: DataLoader(ds, batch_size=batch_size, shuffle=(phase==\"train\"), num_workers=num_workers)\n", + " for phase, ds in subsets.items()\n", + " }\n", + " for key, dl in dataloaders.items():\n", + " logging.info(\n", + " \"In {}: num_users: {}, num_batches: {}\".format(key, len(dl.dataset), len(dl))\n", + " )\n", + " \n", + " # Load item attributes:\n", + " with np.load('{}/itemattr.npz'.format(data_dir), mmap_mode=None) as itemattr_file:\n", + " itemattr = {key : val for key, val in itemattr_file.items()}\n", + "\n", + " return ind2val, itemattr, dataloaders" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# ind2val, itemattr, dataloaders = load_dataloaders()" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3.8.8 64-bit ('anaconda3': virtualenv)", + "name": "python3" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/docker-compose.yml b/docker-compose.yml new file mode 100644 index 0000000..55371f7 --- /dev/null +++ b/docker-compose.yml @@ -0,0 +1,35 @@ +version: "3" +services: + fastai: &fastai + restart: unless-stopped + working_dir: /data + image: fastai/codespaces + logging: + driver: json-file + options: + max-size: 50m + stdin_open: true + tty: true + volumes: + - .:/data/ + + notebook: + <<: *fastai + command: bash -c "pip install -e . 
&& jupyter notebook --allow-root --no-browser --ip=0.0.0.0 --port=8080 --NotebookApp.token='' --NotebookApp.password=''" + ports: + - "8080:8080" + + watcher: + <<: *fastai + command: watchmedo shell-command --command nbdev_build_docs --pattern *.ipynb --recursive --drop + network_mode: host # for GitHub Codespaces https://github.com/features/codespaces/ + + jekyll: + <<: *fastai + ports: + - "4000:4000" + command: > + bash -c "pip install . + && nbdev_build_docs && cd docs + && bundle i + && chmod -R u+rwx . && bundle exec jekyll serve --host 0.0.0.0" diff --git a/docs/.gitignore b/docs/.gitignore new file mode 100644 index 0000000..57510a2 --- /dev/null +++ b/docs/.gitignore @@ -0,0 +1 @@ +_site/ diff --git a/docs/Gemfile b/docs/Gemfile new file mode 100644 index 0000000..f2509a4 --- /dev/null +++ b/docs/Gemfile @@ -0,0 +1,9 @@ +source "https://rubygems.org" + +gem 'github-pages', group: :jekyll_plugins + +# Added at 2019-11-25 10:11:40 -0800 by jhoward: +gem "nokogiri", "< 1.11.1" +gem "jekyll", ">= 3.7" +gem "kramdown", ">= 2.3.0" +gem "jekyll-remote-theme" diff --git a/docs/Gemfile.lock b/docs/Gemfile.lock new file mode 100644 index 0000000..271bfeb --- /dev/null +++ b/docs/Gemfile.lock @@ -0,0 +1,269 @@ +GEM + remote: https://rubygems.org/ + specs: + activesupport (6.0.3.4) + concurrent-ruby (~> 1.0, >= 1.0.2) + i18n (>= 0.7, < 2) + minitest (~> 5.1) + tzinfo (~> 1.1) + zeitwerk (~> 2.2, >= 2.2.2) + addressable (2.7.0) + public_suffix (>= 2.0.2, < 5.0) + coffee-script (2.4.1) + coffee-script-source + execjs + coffee-script-source (1.11.1) + colorator (1.1.0) + commonmarker (0.17.13) + ruby-enum (~> 0.5) + concurrent-ruby (1.1.7) + dnsruby (1.61.5) + simpleidn (~> 0.1) + em-websocket (0.5.2) + eventmachine (>= 0.12.9) + http_parser.rb (~> 0.6.0) + ethon (0.12.0) + ffi (>= 1.3.0) + eventmachine (1.2.7) + execjs (2.7.0) + faraday (1.3.0) + faraday-net_http (~> 1.0) + multipart-post (>= 1.2, < 3) + ruby2_keywords + faraday-net_http (1.0.1) + ffi (1.14.2) + forwardable-extended (2.6.0) + gemoji (3.0.1) + github-pages (209) + github-pages-health-check (= 1.16.1) + jekyll (= 3.9.0) + jekyll-avatar (= 0.7.0) + jekyll-coffeescript (= 1.1.1) + jekyll-commonmark-ghpages (= 0.1.6) + jekyll-default-layout (= 0.1.4) + jekyll-feed (= 0.15.1) + jekyll-gist (= 1.5.0) + jekyll-github-metadata (= 2.13.0) + jekyll-mentions (= 1.6.0) + jekyll-optional-front-matter (= 0.3.2) + jekyll-paginate (= 1.1.0) + jekyll-readme-index (= 0.3.0) + jekyll-redirect-from (= 0.16.0) + jekyll-relative-links (= 0.6.1) + jekyll-remote-theme (= 0.4.2) + jekyll-sass-converter (= 1.5.2) + jekyll-seo-tag (= 2.6.1) + jekyll-sitemap (= 1.4.0) + jekyll-swiss (= 1.0.0) + jekyll-theme-architect (= 0.1.1) + jekyll-theme-cayman (= 0.1.1) + jekyll-theme-dinky (= 0.1.1) + jekyll-theme-hacker (= 0.1.2) + jekyll-theme-leap-day (= 0.1.1) + jekyll-theme-merlot (= 0.1.1) + jekyll-theme-midnight (= 0.1.1) + jekyll-theme-minimal (= 0.1.1) + jekyll-theme-modernist (= 0.1.1) + jekyll-theme-primer (= 0.5.4) + jekyll-theme-slate (= 0.1.1) + jekyll-theme-tactile (= 0.1.1) + jekyll-theme-time-machine (= 0.1.1) + jekyll-titles-from-headings (= 0.5.3) + jemoji (= 0.12.0) + kramdown (= 2.3.0) + kramdown-parser-gfm (= 1.1.0) + liquid (= 4.0.3) + mercenary (~> 0.3) + minima (= 2.5.1) + nokogiri (>= 1.10.4, < 2.0) + rouge (= 3.23.0) + terminal-table (~> 1.4) + github-pages-health-check (1.16.1) + addressable (~> 2.3) + dnsruby (~> 1.60) + octokit (~> 4.0) + public_suffix (~> 3.0) + typhoeus (~> 1.3) + html-pipeline (2.14.0) + activesupport (>= 2) + 
nokogiri (>= 1.4) + http_parser.rb (0.6.0) + i18n (0.9.5) + concurrent-ruby (~> 1.0) + jekyll (3.9.0) + addressable (~> 2.4) + colorator (~> 1.0) + em-websocket (~> 0.5) + i18n (~> 0.7) + jekyll-sass-converter (~> 1.0) + jekyll-watch (~> 2.0) + kramdown (>= 1.17, < 3) + liquid (~> 4.0) + mercenary (~> 0.3.3) + pathutil (~> 0.9) + rouge (>= 1.7, < 4) + safe_yaml (~> 1.0) + jekyll-avatar (0.7.0) + jekyll (>= 3.0, < 5.0) + jekyll-coffeescript (1.1.1) + coffee-script (~> 2.2) + coffee-script-source (~> 1.11.1) + jekyll-commonmark (1.3.1) + commonmarker (~> 0.14) + jekyll (>= 3.7, < 5.0) + jekyll-commonmark-ghpages (0.1.6) + commonmarker (~> 0.17.6) + jekyll-commonmark (~> 1.2) + rouge (>= 2.0, < 4.0) + jekyll-default-layout (0.1.4) + jekyll (~> 3.0) + jekyll-feed (0.15.1) + jekyll (>= 3.7, < 5.0) + jekyll-gist (1.5.0) + octokit (~> 4.2) + jekyll-github-metadata (2.13.0) + jekyll (>= 3.4, < 5.0) + octokit (~> 4.0, != 4.4.0) + jekyll-mentions (1.6.0) + html-pipeline (~> 2.3) + jekyll (>= 3.7, < 5.0) + jekyll-optional-front-matter (0.3.2) + jekyll (>= 3.0, < 5.0) + jekyll-paginate (1.1.0) + jekyll-readme-index (0.3.0) + jekyll (>= 3.0, < 5.0) + jekyll-redirect-from (0.16.0) + jekyll (>= 3.3, < 5.0) + jekyll-relative-links (0.6.1) + jekyll (>= 3.3, < 5.0) + jekyll-remote-theme (0.4.2) + addressable (~> 2.0) + jekyll (>= 3.5, < 5.0) + jekyll-sass-converter (>= 1.0, <= 3.0.0, != 2.0.0) + rubyzip (>= 1.3.0, < 3.0) + jekyll-sass-converter (1.5.2) + sass (~> 3.4) + jekyll-seo-tag (2.6.1) + jekyll (>= 3.3, < 5.0) + jekyll-sitemap (1.4.0) + jekyll (>= 3.7, < 5.0) + jekyll-swiss (1.0.0) + jekyll-theme-architect (0.1.1) + jekyll (~> 3.5) + jekyll-seo-tag (~> 2.0) + jekyll-theme-cayman (0.1.1) + jekyll (~> 3.5) + jekyll-seo-tag (~> 2.0) + jekyll-theme-dinky (0.1.1) + jekyll (~> 3.5) + jekyll-seo-tag (~> 2.0) + jekyll-theme-hacker (0.1.2) + jekyll (> 3.5, < 5.0) + jekyll-seo-tag (~> 2.0) + jekyll-theme-leap-day (0.1.1) + jekyll (~> 3.5) + jekyll-seo-tag (~> 2.0) + jekyll-theme-merlot (0.1.1) + jekyll (~> 3.5) + jekyll-seo-tag (~> 2.0) + jekyll-theme-midnight (0.1.1) + jekyll (~> 3.5) + jekyll-seo-tag (~> 2.0) + jekyll-theme-minimal (0.1.1) + jekyll (~> 3.5) + jekyll-seo-tag (~> 2.0) + jekyll-theme-modernist (0.1.1) + jekyll (~> 3.5) + jekyll-seo-tag (~> 2.0) + jekyll-theme-primer (0.5.4) + jekyll (> 3.5, < 5.0) + jekyll-github-metadata (~> 2.9) + jekyll-seo-tag (~> 2.0) + jekyll-theme-slate (0.1.1) + jekyll (~> 3.5) + jekyll-seo-tag (~> 2.0) + jekyll-theme-tactile (0.1.1) + jekyll (~> 3.5) + jekyll-seo-tag (~> 2.0) + jekyll-theme-time-machine (0.1.1) + jekyll (~> 3.5) + jekyll-seo-tag (~> 2.0) + jekyll-titles-from-headings (0.5.3) + jekyll (>= 3.3, < 5.0) + jekyll-watch (2.2.1) + listen (~> 3.0) + jemoji (0.12.0) + gemoji (~> 3.0) + html-pipeline (~> 2.2) + jekyll (>= 3.0, < 5.0) + kramdown (2.3.0) + rexml + kramdown-parser-gfm (1.1.0) + kramdown (~> 2.0) + liquid (4.0.3) + listen (3.4.0) + rb-fsevent (~> 0.10, >= 0.10.3) + rb-inotify (~> 0.9, >= 0.9.10) + mercenary (0.3.6) + mini_portile2 (2.5.0) + minima (2.5.1) + jekyll (>= 3.5, < 5.0) + jekyll-feed (~> 0.9) + jekyll-seo-tag (~> 2.1) + minitest (5.14.3) + multipart-post (2.1.1) + nokogiri (1.11.0) + mini_portile2 (~> 2.5.0) + racc (~> 1.4) + octokit (4.20.0) + faraday (>= 0.9) + sawyer (~> 0.8.0, >= 0.5.3) + pathutil (0.16.2) + forwardable-extended (~> 2.6) + public_suffix (3.1.1) + racc (1.5.2) + rb-fsevent (0.10.4) + rb-inotify (0.10.1) + ffi (~> 1.0) + rexml (3.2.4) + rouge (3.23.0) + ruby-enum (0.8.0) + i18n + ruby2_keywords (0.0.2) + rubyzip (2.3.0) 
+ safe_yaml (1.0.5) + sass (3.7.4) + sass-listen (~> 4.0.0) + sass-listen (4.0.0) + rb-fsevent (~> 0.9, >= 0.9.4) + rb-inotify (~> 0.9, >= 0.9.7) + sawyer (0.8.2) + addressable (>= 2.3.5) + faraday (> 0.8, < 2.0) + simpleidn (0.1.1) + unf (~> 0.1.4) + terminal-table (1.8.0) + unicode-display_width (~> 1.1, >= 1.1.1) + thread_safe (0.3.6) + typhoeus (1.4.0) + ethon (>= 0.9.0) + tzinfo (1.2.9) + thread_safe (~> 0.1) + unf (0.1.4) + unf_ext + unf_ext (0.0.7.7) + unicode-display_width (1.7.0) + zeitwerk (2.4.2) + +PLATFORMS + ruby + +DEPENDENCIES + github-pages + jekyll (>= 3.7) + jekyll-remote-theme + kramdown (>= 2.3.0) + nokogiri (< 1.11.1) + +BUNDLED WITH + 2.1.4 diff --git a/docs/_config.yml b/docs/_config.yml new file mode 100644 index 0000000..5fd2b94 --- /dev/null +++ b/docs/_config.yml @@ -0,0 +1,64 @@ +repository: finn-no/recsys_slates_dataset +output: web +topnav_title: recsys_slates_dataset +site_title: recsys_slates_dataset +company_name: Used with attribution +description: Contains the presented items and whether and what the user clicked on. +# Set to false to disable KaTeX math +use_math: true +# Add Google analytics id if you have one and want to use it here +google_analytics: +# See http://nbdev.fast.ai/search for help with adding Search +google_search: + +host: 127.0.0.1 +# the preview server used. Leave as is. +port: 4000 +# the port where the preview is rendered. + +exclude: + - .idea/ + - .gitignore + - vendor + +highlighter: rouge +markdown: kramdown +kramdown: + input: GFM + auto_ids: true + hard_wrap: false + syntax_highlighter: rouge + +collections: + tooltips: + output: false + +defaults: + - + scope: + path: "" + type: "pages" + values: + layout: "page" + comments: true + search: true + sidebar: home_sidebar + topnav: topnav + - + scope: + path: "" + type: "tooltips" + values: + layout: "page" + comments: true + search: true + tooltip: true + +sidebars: +- home_sidebar + +plugins: + - jekyll-remote-theme + +remote_theme: fastai/nbdev-jekyll-theme +baseurl: /recsys_slates_dataset/ \ No newline at end of file diff --git a/docs/_data/sidebars/home_sidebar.yml b/docs/_data/sidebars/home_sidebar.yml new file mode 100644 index 0000000..3108947 --- /dev/null +++ b/docs/_data/sidebars/home_sidebar.yml @@ -0,0 +1,24 @@ + +################################################# +### THIS FILE WAS AUTOGENERATED! DO NOT EDIT!
### +################################################# +# Instead edit ../../sidebar.json +entries: +- folders: + - folderitems: + - output: web,pdf + title: Overview + url: / + - output: web,pdf + title: datahelper + url: datahelper.html + - output: web,pdf + title: dataset_torch + url: dataset_torch.html + - output: web,pdf + title: Quick start with the FINN.no recsys slate dataset [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/finn-no/recsys-slates-dataset/blob/master/quickstart-finn-recsys-slate-data.ipynb) + url: quickstart-finn-recsys-slate-data.html + output: web + title: recsys_slates_dataset + output: web + title: Sidebar diff --git a/docs/_data/topnav.yml b/docs/_data/topnav.yml new file mode 100644 index 0000000..7f0d872 --- /dev/null +++ b/docs/_data/topnav.yml @@ -0,0 +1,10 @@ +topnav: +- title: Topnav + items: + - title: github + external_url: https://github.com/finn-no/recsys_slates_dataset/tree/nbdev/ + +#Topnav dropdowns +topnav_dropdowns: +- title: Topnav dropdowns + folders: \ No newline at end of file diff --git a/docs/core.html b/docs/core.html new file mode 100644 index 0000000..94c4d40 --- /dev/null +++ b/docs/core.html @@ -0,0 +1,149 @@ +--- + +title: module name here + + +keywords: fastai +sidebar: home_sidebar + +summary: "API details." +description: "API details." +nb_path: "00_core.ipynb" +--- + + +
[docs/core.html: autogenerated nbdev example page ("module name here", built from 00_core.ipynb). It documents the template function `say_hello(to)` ("Say hello to somebody"), with demo cells `say_hello("Sylvain")` returning 'Hello Sylvain!', an SVG display cell, and `assert say_hello("Jeremy")=="Hello Jeremy!"`.]

diff --git a/docs/datahelper.html b/docs/datahelper.html new file mode 100644 index 0000000..a25acdd --- /dev/null +++ b/docs/datahelper.html @@ -0,0 +1,72 @@ +--- + +title: datahelper + +keywords: fastai +sidebar: home_sidebar + +summary: "Generic helper functions that are independent of the specific machine learning library used" +description: "Generic helper functions that are independent of the specific machine learning library used" +nb_path: "datahelper.ipynb" +---
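A minimal usage sketch of the function this page documents; it assumes the pip package is installed and the Google Drive files are still reachable:

``` python
# Sketch: fetch data.npz, ind2val.json and itemattr.npz into ./data/.
# Assumes `pip install recsys_slates_dataset` and network access to Google Drive.
import logging
from recsys_slates_dataset import datahelper

logging.basicConfig(level=logging.INFO)  # make the per-file download messages visible

# Existing files are kept unless overwrite=True; use_int32=True selects the
# smaller int32 build of data.npz, the safer choice on e.g. Google Colab.
datahelper.download_data_files(data_dir="data", overwrite=False, progbar=True, use_int32=True)
```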
[docs/datahelper.html body: documents `download_data_files(data_dir:str='data', overwrite=False, progbar=True, use_int32=True)`, "Downloads the data from google drive.", with the same argument descriptions as in datahelper.ipynb above.]

diff --git a/docs/dataset_torch.html b/docs/dataset_torch.html new file mode 100644 index 0000000..001d7bd --- /dev/null +++ b/docs/dataset_torch.html @@ -0,0 +1,115 @@ +--- + +title: dataset_torch + +keywords: fastai +sidebar: home_sidebar + +summary: "Module to load the slates dataset into a Pytorch Dataset and Dataloaders with default train/valid/test splits." +description: "Module to load the slates dataset into a Pytorch Dataset and Dataloaders with default train/valid/test splits." +nb_path: "dataset_torch.ipynb" +---
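A minimal usage sketch for this module; it assumes the data files can be downloaded as above, and the argument names follow the `load_dataloaders` signature documented below:

``` python
# Sketch: build dataloaders with smaller validation/test shares than the 5%/5% defaults.
import torch
from recsys_slates_dataset import dataset_torch

ind2val, itemattr, dataloaders = dataset_torch.load_dataloaders(
    data_dir="dat",
    batch_size=512,
    valid_pct=0.01,
    test_pct=0.01,
    sample_uniform_slate=False,
)

# Each batch is a dict of per-user tensors, e.g. 'slate' and 'click'.
batch = next(iter(dataloaders["train"]))
print({key: tuple(val.shape) for key, val in batch.items()})
```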
[docs/dataset_torch.html body: documents `class SequentialDataset(*args, **kwds) :: Dataset` ("Note: displayType has been uncommented for future easy implementation.") and `load_dataloaders(data_dir='dat', batch_size=1024, num_workers=0, sample_uniform_slate=False, valid_pct=0.05, test_pct=0.05, t_testsplit=5)` ("Loads pytorch dataloaders to be used in training. If used with standard settings, the train/val/test split is equivalent to Eide et al. 2021").]
+ + diff --git a/docs/feed.xml b/docs/feed.xml new file mode 100644 index 0000000..d8d6ac9 --- /dev/null +++ b/docs/feed.xml @@ -0,0 +1,32 @@ +--- +search: exclude +layout: none +--- + + + + + {{ site.title | xml_escape }} + {{ site.description | xml_escape }} + {{ site.url }}/ + + {{ site.time | date_to_rfc822 }} + {{ site.time | date_to_rfc822 }} + Jekyll v{{ jekyll.version }} + {% for post in site.posts limit:10 %} + + {{ post.title | xml_escape }} + {{ post.content | xml_escape }} + {{ post.date | date_to_rfc822 }} + {{ post.url | prepend: site.url }} + {{ post.url | prepend: site.url }} + {% for tag in post.tags %} + {{ tag | xml_escape }} + {% endfor %} + {% for tag in page.tags %} + {{ cat | xml_escape }} + {% endfor %} + + {% endfor %} + + diff --git a/docs/finn-frontpage.png b/docs/finn-frontpage.png new file mode 100644 index 0000000..d3e3d15 Binary files /dev/null and b/docs/finn-frontpage.png differ diff --git a/docs/index.html b/docs/index.html new file mode 100644 index 0000000..3bcfdb8 --- /dev/null +++ b/docs/index.html @@ -0,0 +1,144 @@ +--- + +title: FINN.no Slate Dataset for Recommender Systems + + +keywords: fastai +sidebar: home_sidebar + +summary: "Data and helper functions for FINN.no slate dataset containing both viewed items and clicks from the FINN.no second hand marketplace." +description: "Data and helper functions for FINN.no slate dataset containing both viewed items and clicks from the FINN.no second hand marketplace." +nb_path: "index.ipynb" +--- + + +
+ + {% raw %} + +
+ +
+ {% endraw %} + +
+
+

We release the FINN.no slate dataset to improve recommender systems research. +The dataset includes both search and recommendation interactions between users and the platform over a 30 day period. +The dataset has logged both exposures and clicks, including interactions where the user did not click on any of the items in the slate. +To our knowledge there exist no such large-scale dataset, and we hope this contribution can help researchers constructing improved models and improve offline evaluation metrics.

+

A visualization of a presented slate to the user on the frontpage of FINN.no

+

For each user u and interaction step t we recorded all items in the visible slate equ ) (up to the scroll length equ), and the user's click response equ. +The dataset consists of 37.4 million interactions, |U| ≈ 2.3) million users and |I| ≈ 1.3 million items that belong to one of G = 290 item groups. For a detailed description of the data please see the paper.

+

A visualization of a presented slate to the user on the frontpage of FINN.no

+

FINN.no is the leading marketplace in the Norwegian classifieds market and provides users with a platform to buy and sell general merchandise, cars, real estate, as well as house rentals and job offerings. +For questions, email simen.eide@finn.no or file an issue.

+ +
+
+
+
+
+

Install

+
+
+
+
+
+

pip install recsys_slates_dataset

+ +
+
+
+
+
+

How to use

To download the generic numpy data files:

+ +
+
+
+ {% raw %} + +
+
+ +
+
+
from recsys_slates_dataset import datahelper
+datahelper.download_data_files(data_dir="data")
+
+ +
+
+
+ +
+ {% endraw %} + +
+
+

Download and prepare data into ready-to-use pytorch dataloaders:

+ +
+
+
+
+
+
from recsys_slates_dataset import dataset_torch
+ind2val, itemattr, dataloaders = dataset_torch.load_dataloaders(data_dir="data")
+
+ +
+
+
+
+
+

Organization

The repository is organized as follows:

+ +

Quickstart dataset Open In Colab

+

We provide a quickstart jupyter notebook that runs on Google Colab (quickstart-finn-recsys-slate-data.ipynb) which includes all necessary steps above. +It gives a quick introduction to how to use the dataset.

+

Citations

This repository accompany the paper "Dynamic Slate Recommendation with Gated Recurrent Units and Thompson Sampling" by Simen Eide, David S. Leslie and Arnoldo Frigessi. +The article is under review, and the pre-print can be obtained here.

+

If you use either the code, data or paper, please consider citing the paper.

+ +
@article{eide2021dynamic,
+      title={Dynamic Slate Recommendation with Gated Recurrent Units and Thompson Sampling}, 
+      author={Simen Eide and David S. Leslie and Arnoldo Frigessi},
+      year={2021},
+      eprint={2104.15046},
+      archivePrefix={arXiv},
+      primaryClass={stat.ML}
+}
+

Todo

This repository is currently work in progress, and we will provide descriptions and tutorials. Suggestions and contributions to make the material more available is welcome. +There are some features of the repository that we are working on:

  • [x] Release the dataset as numpy objects instead of pytorch arrays. This will help non-pytorch users utilize the data more easily.
  • [x] Maintain a pytorch dataset for easy usage.
  • [x] Create a pip package for easier installation and usage. The package should download the dataset using a function.
  • [x] Make the quickstart guide compatible with the pip package and numpy format.
  • [ ] Git LFS is currently broken because some lines in .gitattributes conflicted with nbdev and were removed. The dataset is still usable through the built-in download functions, as they use a different source. However, we should fix this. An issue is posted on nbdev.
  • [ ] Add easy-to-use functions that compute relevant metrics such as hitrate and log-likelihood; a rough sketch of a hitrate helper is given after this list.
  • [ ] Distribute the data on other platforms such as Kaggle.
  • [ ] Add a short description of the data directly in the readme.md.
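
As a rough illustration of the kind of metrics helper we have in mind, a hitrate computation could look like the sketch below. This is only a sketch under stated assumptions: it relies on the dataset conventions documented in the quickstart (pad index 0, no-click index 1) and on a hypothetical scores tensor holding one model score per presented slate item; no such helper exists in the package yet.

import torch

def hitrate_at_k(scores, batch, k=5):
    # scores: hypothetical model scores of shape [batch, time, slate_pos].
    topk_idx = scores.topk(k, dim=-1).indices           # [batch, time, k]
    # Look up the itemIds at the top-k scored slate positions:
    topk_items = batch['slate'].gather(-1, topk_idx)    # [batch, time, k]
    # A hit means the clicked item is among the top-k items:
    hits = (topk_items == batch['click'].unsqueeze(-1)).any(-1).float()
    # Only count real clicks, i.e. exclude padding (0) and no-clicks (1):
    real_click = (batch['click'] > 1).float()
    return (hits * real_click).sum() / real_click.sum()

Here scores would come from whatever model is being evaluated; a log-likelihood helper would follow the same masking pattern.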

As the repository is still at an early stage, it makes sense to allow the above changes to be non-backward compatible. However, they should be completed within the next couple of months.

+ + diff --git a/docs/interaction_illustration.png b/docs/interaction_illustration.png new file mode 100644 index 0000000..1460d6b Binary files /dev/null and b/docs/interaction_illustration.png differ diff --git a/docs/quickstart-finn-recsys-slate-data.html b/docs/quickstart-finn-recsys-slate-data.html new file mode 100644 index 0000000..1a24407 --- /dev/null +++ b/docs/quickstart-finn-recsys-slate-data.html @@ -0,0 +1,752 @@ +--- + +title: Quick start with the FINN.no recsys slate dataset [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/finn-no/recsys-slates-dataset/blob/master/quickstart-finn-recsys-slate-data.ipynb) + + +keywords: fastai +sidebar: home_sidebar + + + +nb_path: "examples/quickstart-finn-recsys-slate-data.ipynb" +--- + + +

Install the recsys_slates_dataset pip package

!pip install recsys_slates_dataset -q
+

Download and load dataloaders that are ready to use

It is possible to directly load the dataset as pytorch dataloaders with the same dataset splits etc. as in the original paper. Use the load_dataloaders function in the dataset_torch module. It has the following options:

| Argument | Description |
| --- | --- |
| batch_size | Number of unique users sampled in each batch |
| split_trainvalid | Ratio of the full dataset dedicated to train (val/test is split evenly among the rest) |
| t_testsplit | For users in valid and test, how many interactions should belong to the training set |
| sample_uniform_action | If this is True, the exposures in the dataset are sampled as in the all-item likelihood (see paper) |

The outputs of the function are ind2val, itemattr and a dictionary with pytorch dataloaders for training, validation and test.

import torch
+from recsys_slates_dataset import dataset_torch
+ind2val, itemattr, dataloaders = dataset_torch.load_dataloaders(data_dir="dat")
+
+print("Dictionary containing the dataloaders:")
+print(dataloaders)
+
2021-07-03 21:13:13,672 Download data if not in data folder..
+2021-07-03 21:13:13,676 Downloading data.npz
+2021-07-03 21:13:13,691 Downloading ind2val.json
+2021-07-03 21:13:13,694 Downloading itemattr.npz
+2021-07-03 21:13:13,698 Done downloading all files.
+2021-07-03 21:13:13,707 Load data..
+2021-07-03 21:13:53,963 Loading dataset with slate size=torch.Size([2277645, 20, 25]) and uniform candidate sampling=False
+2021-07-03 21:13:54,179 In train: num_users: 2277645, num_batches: 2225
+2021-07-03 21:13:54,187 In valid: num_users: 113882, num_batches: 112
+2021-07-03 21:13:54,192 In test: num_users: 113882, num_batches: 112
+
Dictionary containing the dataloaders:
+{'train': <torch.utils.data.dataloader.DataLoader object at 0x7fcd88da59a0>, 'valid': <torch.utils.data.dataloader.DataLoader object at 0x7fcd88dc6e80>, 'test': <torch.utils.data.dataloader.DataLoader object at 0x7fcd88dc6550>}

Batches

The batches are split by userId and provide the necessary information for training. We will explain each element below:

batch = next(iter(dataloaders['train']))
+for key, val in batch.items():
+    print(key, val.size())
+
userId torch.Size([1024])
+click torch.Size([1024, 20])
+click_idx torch.Size([1024, 20])
+slate_lengths torch.Size([1024, 20])
+slate torch.Size([1024, 20, 25])
+interaction_type torch.Size([1024, 20])
+mask_type torch.Size([1024, 20])
+

Interaction data (data.npz)

The dataset consists of 2.2M unique users that have interacted up to 20 times with the platform, and have been exposed to up to 25 items at each interaction. data.npz contains all the slate and click data, and the two main arrays are click and slate. The convention for the array dimensions is that the first dimension is per user, the second is time, and the third is the position in the presented slate. The full description of all arrays is as follows:

| Name | Dimension | Description |
| --- | --- | --- |
| slate | [userId, interaction num, slate pos] | The presented slates to the users |
| click | [userId, interaction num] | Items clicked by the users in each slate |
| interaction_type | [userId, interaction num] | Type of interaction the user had with the platform (search or recommendation) |
| click_idx | [userId, interaction num] | Auxiliary data: the position of the click in the slate array (integer from 0-24). Useful for e.g. categorical likelihoods |
| slate_lengths | [userId, interaction num] | Auxiliary data: the actual length of the slate. Same as 25 minus the number of pad indices in the slate |
dat = dataloaders['train'].dataset.data
+
+# Print dimensions of all arrays:
+for key, val in dat.items():
+  print(f"{key} : \t {val.size()}")
+
userId : 	 torch.Size([2277645])
+click : 	 torch.Size([2277645, 20])
+click_idx : 	 torch.Size([2277645, 20])
+slate_lengths : 	 torch.Size([2277645, 20])
+slate : 	 torch.Size([2277645, 20, 25])
+interaction_type : 	 torch.Size([2277645, 20])
+mask_type : 	 torch.Size([2277645, 20])

Example: Get one interaction

Get the presented slate + click for user 5 at interaction number 3

print("Slate:")
+print(dat['slate'][5,3])
+print(" ")
+print("Click:")
+print(dat['click'][5,3])
+print("Type of interaction: (1 implies search, see ind2val file)")
+print(dat['interaction_type'][5,3])
+
Slate:
+tensor([     1, 638995, 638947, 638711, 637590, 637930, 638894,      0,      0,
+             0,      0,      0,      0,      0,      0,      0,      0,      0,
+             0,      0,      0,      0,      0,      0,      0])
+ 
+Click:
+tensor(637590)
+Type of interaction: (1 implies search, see ind2val file)
+tensor(1)

From the above extraction we can see that user 5 at interaction number 3 was presented with a total of 7 items: 6 "real" items and the "no-click" item, which has index 1. The remaining positions in the array are padded with the index 0. The "no-click" item is always present in the slates, as the user always has the option not to click on any of the presented items in the slate. Further, we see that the user clicked on the 4th item in the slate. The slate length and the click position can be found in the following auxiliary arrays:

print("Click_idx:")
+print(dat['click_idx'][5,3])
+print("Slate lengths:")
+print(dat['slate_lengths'][5,3])
+
Click_idx:
+tensor(4)
+Slate lengths:
+tensor(7)
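
As a small sanity check of this relationship (assuming, as stated above, that 0 is the pad index), the slate length can be recomputed by counting the non-pad entries of the slate:

# The 7 non-pad entries (incl. the no-click item) match slate_lengths:
print((dat['slate'][5,3] != 0).sum())    # tensor(7)
print(dat['slate_lengths'][5,3])         # tensor(7)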

Index to item (ind2val.json)

This file contains mappings from indices to values for the attributes category and interaction_type.

| Name | Length | Description |
| --- | --- | --- |
| category | 290 | Mapping from the category index to a text string that describes the category and location of the group |
| interaction_type | 3 | Indices of whether the presented slate originated from search or recommendations |

Example ind2val

We print out the first elements of each index. +For example, we see that category 3 is "BAP,antiques,Trøndelag" which implies the category contains antiques sold in the county of Trøndelag.

for key, val in ind2val.items():
+  print(" ")
+  print(f"{key} first entries:")
+  for idx, name in val.items():
+    print(f"{idx}: {val[idx]}")
+    if idx >3:
+      break
+
 
+category first entries:
+0: PAD
+1: noClick
+2: <UNK>
+3: BAP,antiques,Trøndelag
+4: MOTOR,,Sogn og Fjordane
+ 
+interaction_type first entries:
+1: search
+2: rec
+0: <UNK>

Item attributes (itemattr.npz)

A numpy array that encodes the category of each item.

| Name | Dimension | Description |
| --- | --- | --- |
| category | [itemId] | The group that each item belongs to |
for key, val in itemattr.items():
+  print(f"{key} : {val.shape}")
+
+print("\nThe full dictionary:")
+itemattr
+
category : (1311775,)
+
+The full dictionary:
+
{'category': array([  0.,   1.,   2., ..., 289., 289., 289.])}

Example itemattr

Get the category of the clicked item above (from user 5, interaction number 3)

+ +
+
+
+ {% raw %} + +
+
+ +
+
+
print("Find the itemId that were click by user 5 in interaction 3:")
+itemId = [dat['click'][5,3]]
+print(f"itemId: {itemId}")
+
+print("\nFind the category index of that item in itemattr:")
+cat_idx = itemattr['category'][itemId]
+print(f"Category index: {cat_idx}")
+
+print("\nFinally, find the category name by using ind2val:")
+cat_name = ind2val['category'][cat_idx.item()]
+print(f"Category name: {cat_name}")
+
Find the itemId that was clicked by user 5 in interaction 3:
+itemId: [tensor(637590)]
+
+Find the category index of that item in itemattr:
+Category index: [135.]
+
+Finally, find the category name by using ind2val:
+Category name: REAL_ESTATE,,Oppland

Print some statistics about the dataset

print(f"Ratio of no clicks: {(dat['click']==1).sum() / (dat['click']!=0).sum():.2f}")
+print(f"Average slate length: {(dat['slate_lengths'][dat['slate_lengths']!=0]).float().mean():.2f}")
+print(f"Ratio of slates that are recommendations: {(dat['interaction_type']==2).sum() / (dat['interaction_type']!=0).sum():.3f}")
+print(f"Average number of interactions per user: {(dat['click']!=0).sum(-1).float().mean():.2f}")
+
Ratio of no clicks: 0.24
+Average slate length: 11.14
+Ratio of slates that are recommendations: 0.303
+Average number of interactions per user: 16.43

Masking of train/test/val

Each batch returns a dictionary of pytorch tensors with data, and contains the usual data fields described above. +In addition, it contains a mask_type tensor which explains whether each click belongs to train, valid or test. +It is of the same dimensionality as the click tensor (num users * num interactions). +This is because we want to return the full sequence of interactions so that e.g. the test set can use the first clicks of the user (which belongs to the training set) to build a user profile. +The mask is defined in the following way:

+ +
mask2split = {
+    0 : 'PAD',
+    1 : 'train',
+    2 : 'valid',
+    3 : 'test'
+}
+

If the mask equals zero, it means that the user's sequence was shorter than this index. The modeler has to take care not to train on elements in the validation or test dataset. Typically this can be done by masking all losses that do not originate from the training dataset:

train_mask = (batch['mask_type']==1)
+train_mask
+
tensor([[True, True, True,  ..., True, True, True],
+        [True, True, True,  ..., True, True, True],
+        [True, True, True,  ..., True, True, True],
+        ...,
+        [True, True, True,  ..., True, True, True],
+        [True, True, True,  ..., True, True, True],
+        [True, True, True,  ..., True, True, True]])
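
To make the masking step concrete, a per-interaction loss can be zeroed out as in the sketch below. The per_element_loss tensor is only a random placeholder for whatever loss a model would produce per (user, interaction); the point is that contributions outside the training mask are removed before averaging:

# Placeholder loss with shape [num users, num interactions]:
per_element_loss = torch.rand_like(batch['click'], dtype=torch.float)
# Zero out everything that is not in the training split, then average:
masked_loss = (per_element_loss * train_mask.float()).sum() / train_mask.float().sum()
print(masked_loss)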

For example, for user number 1 in this batch, the first five interactions belong to the training set, and the remaining belong to the validation set. We can extract the clicks that belong to the training set by using mask_type:

print("Mask of user 2:")
+print(batch['mask_type'][1,])
+print(" ")
+print("Clicks belonging to the training set:")
+print(train_mask[1,])
+print(" ")
+print("Select only the clicks in training dataset:")
+batch['click'][1,][train_mask[1,]]
+
Mask of user 2:
+tensor([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1])
+ 
+Clicks belonging to the training set:
+tensor([True, True, True, True, True, True, True, True, True, True, True, True,
+        True, True, True, True, True, True, True, True])
+ 
+Select only the clicks in training dataset:
tensor([ 246058,  522114,  688321,       1,       1,  492102,  342033, 1050842,
+              1,       1,  878114, 1104893,  581533,       1, 1114863,  191381,
+         493192,  736750,  693049,  493709])
+ + diff --git a/docs/sidebar.json b/docs/sidebar.json new file mode 100644 index 0000000..a1b39ca --- /dev/null +++ b/docs/sidebar.json @@ -0,0 +1,8 @@ +{ + "recsys_slates_dataset": { + "Overview": "/", + "datahelper": "datahelper.html", + "dataset_torch": "dataset_torch.html", + "Quick start with the FINN.no recsys slate dataset [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/finn-no/recsys-slates-dataset/blob/master/quickstart-finn-recsys-slate-data.ipynb)": "quickstart-finn-recsys-slate-data.html" + } +} \ No newline at end of file diff --git a/docs/sitemap.xml b/docs/sitemap.xml new file mode 100644 index 0000000..38a04d6 --- /dev/null +++ b/docs/sitemap.xml @@ -0,0 +1,24 @@ +--- +layout: none +search: exclude +--- + + + + {% for post in site.posts %} + {% unless post.search == "exclude" %} + + {{site.url}}{{post.url}} + + {% endunless %} + {% endfor %} + + + {% for page in site.pages %} + {% unless page.search == "exclude" %} + + {{site.url}}{{ page.url}} + + {% endunless %} + {% endfor %} + \ No newline at end of file diff --git a/examples/quickstart-finn-recsys-slate-data.ipynb b/examples/quickstart-finn-recsys-slate-data.ipynb new file mode 100644 index 0000000..b5de0eb --- /dev/null +++ b/examples/quickstart-finn-recsys-slate-data.ipynb @@ -0,0 +1,532 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Quick start with the FINN.no recsys slate dataset [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/finn-no/recsys-slates-dataset/blob/master/quickstart-finn-recsys-slate-data.ipynb)\n", + "\n", + "This notebook gives an introduction to the dataset released with the paper [Dynamic Slate Recommendation with Gated Recurrent Units and Thompson Sampling](https://arxiv.org/abs/2104.15046). \n", + "It is compatible with google colab, and can be run interactive by using the \"Open in Colab\"-button." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Install the recsys_slates_dataset pip package\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "!pip install recsys_slates_dataset -q" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Download and load dataloaders that are ready to use\n", + "It is possible to directly load the dataset as a pytorch dataloader which includes the same dataset splits etc as in the original paper.\n", + "Use the `load_dataloaders` function in the `dataset_torch` module. It has the following options:\n", + "\n", + "| Argument | Description |\n", + "| ------------- |-----:|\n", + "| batch_size | Number of unique users sampled in each batch |\n", + "| split_trainvalid | Ratio of full dataset dedicated to train
(val/test is split evenly among the rest) |\n", + "| t_testsplit | For users in valid and test,
how many interactions should belong to training set |\n", + "| sample_uniform_action | If this is True, the exposures in the dataset
are sampled as in the `all-item likelihood` (see paper) |\n", + "\n", + "The outputs of the function are `ind2val`, `itemattr` and a dictionary with pytorch dataloaders for training, validation and test." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "2021-07-03 21:13:13,672 Download data if not in data folder..\n", + "2021-07-03 21:13:13,676 Downloading data.npz\n", + "2021-07-03 21:13:13,691 Downloading ind2val.json\n", + "2021-07-03 21:13:13,694 Downloading itemattr.npz\n", + "2021-07-03 21:13:13,698 Done downloading all files.\n", + "2021-07-03 21:13:13,707 Load data..\n", + "2021-07-03 21:13:53,963 Loading dataset with slate size=torch.Size([2277645, 20, 25]) and uniform candidate sampling=False\n", + "2021-07-03 21:13:54,179 In train: num_users: 2277645, num_batches: 2225\n", + "2021-07-03 21:13:54,187 In valid: num_users: 113882, num_batches: 112\n", + "2021-07-03 21:13:54,192 In test: num_users: 113882, num_batches: 112\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Dictionary containing the dataloaders:\n", + "{'train': , 'valid': , 'test': }\n" + ] + } + ], + "source": [ + "import torch\n", + "from recsys_slates_dataset import dataset_torch\n", + "ind2val, itemattr, dataloaders = dataset_torch.load_dataloaders(data_dir=\"dat\")\n", + "\n", + "print(\"Dictionary containing the dataloaders:\")\n", + "print(dataloaders)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Batches\n", + "The batches are split by userId and provides the necessary information for training. We will explain each element below:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "userId torch.Size([1024])\n", + "click torch.Size([1024, 20])\n", + "click_idx torch.Size([1024, 20])\n", + "slate_lengths torch.Size([1024, 20])\n", + "slate torch.Size([1024, 20, 25])\n", + "interaction_type torch.Size([1024, 20])\n", + "mask_type torch.Size([1024, 20])\n" + ] + } + ], + "source": [ + "batch = next(iter(dataloaders['train']))\n", + "for key, val in batch.items():\n", + " print(key, val.size())" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Interaction data (`data.npz`)\n", + "The dataset consist of 2.2M unique users that have interacted up to 20 times with the internet platform platform, and has been exposed to up to 25 items at each interaction.\n", + "`data.npz` contains all the slate and click data, and the two main arrays are `click` and `slate`. \n", + "The convention of the dimension of the arrays are that the first dimension is per user, second dimension is time and third dimension is the presented slate.\n", + "The full description of all array are as follows:\n", + "\n", + "| Name | Dimension | Description |\n", + "| ------------- |:-------------:| -----:|\n", + "| slate | [userId, interaction num, slate pos]| the presented slates to the users; |\n", + "| click | [userId, interaction num] | items clicked by the users in each slate |\n", + "| interaction_type | [userId, interaction num] | type of interaction the user had with the platform (search or recommendation) |\n", + "| click_idx | [userId, interaction num] | Auxillary data: The position of the click in the `slate` dataframe (integer from 0-24).
Useful for e.g. categorical likelihoods |\n", + "| slate_lengths | [userId, interaction num] | Auxillary data: the actual length of the slate.
Same as 25-`\"number of pad index in action\"` |\n", + "\n", + "\n", + "\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "userId : \t torch.Size([2277645])\n", + "click : \t torch.Size([2277645, 20])\n", + "click_idx : \t torch.Size([2277645, 20])\n", + "slate_lengths : \t torch.Size([2277645, 20])\n", + "slate : \t torch.Size([2277645, 20, 25])\n", + "interaction_type : \t torch.Size([2277645, 20])\n", + "mask_type : \t torch.Size([2277645, 20])\n" + ] + } + ], + "source": [ + "# Load interaction data\n", + "dat = dataloaders['train'].dataset.data\n", + "\n", + "# Print dimensions of all arrays:\n", + "for key, val in dat.items():\n", + " print(f\"{key} : \\t {val.size()}\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### Example: Get one interaction\n", + "Get the presented slate + click for user 5 at interaction number 3" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Slate:\n", + "tensor([ 1, 638995, 638947, 638711, 637590, 637930, 638894, 0, 0,\n", + " 0, 0, 0, 0, 0, 0, 0, 0, 0,\n", + " 0, 0, 0, 0, 0, 0, 0])\n", + " \n", + "Click:\n", + "tensor(637590)\n", + "Type of interaction: (1 implies search, see ind2val file)\n", + "tensor(1)\n" + ] + } + ], + "source": [ + "print(\"Slate:\")\n", + "print(dat['slate'][5,3])\n", + "print(\" \")\n", + "print(\"Click:\")\n", + "print(dat['click'][5,3])\n", + "print(\"Type of interaction: (1 implies search, see ind2val file)\")\n", + "print(dat['interaction_type'][5,3])" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "From the above extraction we can see that user 5 at interaction number 3 was presented with a total of 7 items: 6 \"real\" items and the \"no-click\" item that has index 1. The remaining positions in the array is padded with the index 0.\n", + "The \"no-click\" item is always present in the slates, as the user has the alternative not to click on any of the presented items in the slate.\n", + "Further, we see that the user clicked on the 4'th item in the slate.\n", + "The slate length and the click position can be found by the following auxillary arrays:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Click_idx:\n", + "tensor(4)\n", + "Slate lengths:\n", + "tensor(7)\n" + ] + } + ], + "source": [ + "print(\"Click_idx:\")\n", + "print(dat['click_idx'][5,3])\n", + "print(\"Slate lengths:\")\n", + "print(dat['slate_lengths'][5,3])" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Index to item (`ind2val.json`)\n", + "This files contains mapping from indices to values for the attributes category and interaction_type.\n", + "\n", + "| Name | Length | Description |\n", + "| -------------|:----:| -----:|\n", + "| category | 290 | Mapping from the category index to a text string that describes the category.
The category value is a text string that describes the category and location of the group |\n", + "| interaction_type | 3 | Indices of whether the presented slate originated from search or recommendations|" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### Example `ind2val`\n", + "We print out the first elements of each index.\n", + "For example, we see that category 3 is \"BAP,antiques,Trøndelag\" which implies the category contains antiques sold in the county of Trøndelag." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + " \n", + "category first entries:\n", + "0: PAD\n", + "1: noClick\n", + "2: \n", + "3: BAP,antiques,Trøndelag\n", + "4: MOTOR,,Sogn og Fjordane\n", + " \n", + "interaction_type first entries:\n", + "1: search\n", + "2: rec\n", + "0: \n" + ] + } + ], + "source": [ + "for key, val in ind2val.items():\n", + " print(\" \")\n", + " print(f\"{key} first entries:\")\n", + " for idx, name in val.items():\n", + " print(f\"{idx}: {val[idx]}\")\n", + " if idx >3:\n", + " break" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Item attributes (`itemattr.npz`)\n", + "A numpy array that encodes the category of each item.\n", + "\n", + "| Name | Dimension | Description |\n", + "| ------------- |:-------------:| -----:|\n", + "| category | [itemId] | The group that each item belong to |\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "category : (1311775,)\n", + "\n", + "The full dictionary:\n" + ] + }, + { + "data": { + "text/plain": [ + "{'category': array([ 0., 1., 2., ..., 289., 289., 289.])}" + ] + }, + "execution_count": null, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "for key, val in itemattr.items():\n", + " print(f\"{key} : {val.shape}\")\n", + "\n", + "print(\"\\nThe full dictionary:\")\n", + "itemattr" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### Example `itemattr`\n", + "Get the category of the clicked item above (from user 5, interaction number 3)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Find the itemId that were click by user 5 in interaction 3:\n", + "itemId: [tensor(637590)]\n", + "\n", + "Find the category index of that item in itemattr:\n", + "Category index: [135.]\n", + "\n", + "Finally, find the category name by using ind2val:\n", + "Category name: REAL_ESTATE,,Oppland\n" + ] + } + ], + "source": [ + "print(\"Find the itemId that were click by user 5 in interaction 3:\")\n", + "itemId = [dat['click'][5,3]]\n", + "print(f\"itemId: {itemId}\")\n", + "\n", + "print(\"\\nFind the category index of that item in itemattr:\")\n", + "cat_idx = itemattr['category'][itemId]\n", + "print(f\"Category index: {cat_idx}\")\n", + "\n", + "print(\"\\nFinally, find the category name by using ind2val:\")\n", + "cat_name = ind2val['category'][cat_idx.item()]\n", + "print(f\"Category name: {cat_name}\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Print some statistics about the dataset" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Ratio of no clicks: 0.24\n", + 
"Average slate length: 11.14\n", + "Ratio of slates that are recommendations: 0.303\n", + "Average number of interactions per user: 16.43\n" + ] + } + ], + "source": [ + "print(f\"Ratio of no clicks: {(dat['click']==1).sum() / (dat['click']!=0).sum():.2f}\")\n", + "print(f\"Average slate length: {(dat['slate_lengths'][dat['slate_lengths']!=0]).float().mean():.2f}\")\n", + "print(f\"Ratio of slates that are recommendations: {(dat['interaction_type']==2).sum() / (dat['interaction_type']!=0).sum():.3f}\")\n", + "print(f\"Average number of interactions per user: {(dat['click']!=0).sum(-1).float().mean():.2f}\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Masking of train/test/val\n", + "Each batch returns a dictionary of pytorch tensors with data, and contains the usual data fields described above.\n", + "In addition, it contains a `mask_type` tensor which explains whether each click belongs to _train_, _valid_ or _test_.\n", + "It is of the same dimensionality as the click tensor (`num users * num interactions`).\n", + "This is because we want to return the full sequence of interactions so that e.g. the test set can use the first clicks of the user (which belongs to the training set) to build a user profile.\n", + "The mask is defined in the following way:\n", + "\n", + "```\n", + "mask2split = {\n", + " 0 : 'PAD',\n", + " 1 : 'train',\n", + " 2 : 'valid',\n", + " 3 : 'test'\n", + "}\n", + "```\n", + "If the mask equals zero it means that the length of the user sequence was shorter than this index.\n", + "The modeler has to take care to not train on elements in the validation or test dataset.\n", + "Typically this can be done by masking all losses that does not originate from the training dataset:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "tensor([[True, True, True, ..., True, True, True],\n", + " [True, True, True, ..., True, True, True],\n", + " [True, True, True, ..., True, True, True],\n", + " ...,\n", + " [True, True, True, ..., True, True, True],\n", + " [True, True, True, ..., True, True, True],\n", + " [True, True, True, ..., True, True, True]])" + ] + }, + "execution_count": null, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "train_mask = (batch['mask_type']==1)\n", + "train_mask" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "For example, for user number 1 in this batch, the first five interactions belong to the training set, and the remaining belongs to the validation set.\n", + "We can extract the clicks that belong to the training set by using `mask_type`:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Mask of user 2:\n", + "tensor([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1])\n", + " \n", + "Clicks belonging to the training set:\n", + "tensor([True, True, True, True, True, True, True, True, True, True, True, True,\n", + " True, True, True, True, True, True, True, True])\n", + " \n", + "Select only the clicks in training dataset:\n" + ] + }, + { + "data": { + "text/plain": [ + "tensor([ 246058, 522114, 688321, 1, 1, 492102, 342033, 1050842,\n", + " 1, 1, 878114, 1104893, 581533, 1, 1114863, 191381,\n", + " 493192, 736750, 693049, 493709])" + ] + }, + "execution_count": null, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "print(\"Mask of 
user 2:\")\n", + "print(batch['mask_type'][1,])\n", + "print(\" \")\n", + "print(\"Clicks belonging to the training set:\")\n", + "print(train_mask[1,])\n", + "print(\" \")\n", + "print(\"Select only the clicks in training dataset:\")\n", + "batch['click'][1,][train_mask[1,]]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3.8.8 64-bit ('base': conda)", + "name": "python3" + } + }, + "nbformat": 4, + "nbformat_minor": 0 +} diff --git a/index.ipynb b/index.ipynb new file mode 100644 index 0000000..ab717e4 --- /dev/null +++ b/index.ipynb @@ -0,0 +1,152 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "#hide\n", + "from recsys_slates_dataset.core import *" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# FINN.no Slate Dataset for Recommender Systems\n", + "> Data and helper functions for FINN.no slate dataset containing both viewed items and clicks from the FINN.no second hand marketplace.\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "We release the *FINN.no slate dataset* to improve recommender systems research.\n", + "The dataset includes both search and recommendation interactions between users and the platform over a 30 day period.\n", + "The dataset has logged both exposures and clicks, *including interactions where the user did not click on any of the items in the slate*.\n", + "To our knowledge there exist no such large-scale dataset, and we hope this contribution can help researchers constructing improved models and improve offline evaluation metrics.\n", + "\n", + "![A visualization of a presented slate to the user on the frontpage of FINN.no](finn-frontpage.png)\n", + "\n", + "For each user u and interaction step t we recorded all items in the visible slate ![equ](https://latex.codecogs.com/gif.latex?a_t^u(s_t^u) ) (up to the scroll length ![equ](https://latex.codecogs.com/gif.latex?s_t^u)), and the user's click response ![equ](https://latex.codecogs.com/gif.latex?c_t^u).\n", + "The dataset consists of 37.4 million interactions, |U| ≈ 2.3) million users and |I| ≈ 1.3 million items that belong to one of G = 290 item groups. For a detailed description of the data please see the [paper](https://arxiv.org/abs/2104.15046).\n", + "\n", + "![A visualization of a presented slate to the user on the frontpage of FINN.no](interaction_illustration.png)\n", + "\n", + "FINN.no is the leading marketplace in the Norwegian classifieds market and provides users with a platform to buy and sell general merchandise, cars, real estate, as well as house rentals and job offerings.\n", + "For questions, email simen.eide@finn.no or file an issue." 
+ ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Install" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "`pip install recsys_slates_dataset`" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## How to use\n", + "\n", + "To download the generic numpy data files:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from recsys_slates_dataset import datahelper\n", + "datahelper.download_data_files(data_dir=\"data\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Download and prepare data into ready-to-use pytorch dataloaders:" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "``` python\n", + "from recsys_slates_dataset import dataset_torch\n", + "ind2val, itemattr, dataloaders = dataset_torch.load_dataloaders(data_dir=\"data\")\n", + "```" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Organization\n", + "The repository is organized as follows:\n", + "- The dataset is placed in `data/` and stored using git-lfs. We also provide an automatic download function in the pip package (preferred usage).\n", + "- The code open sourced from the article [\"Dynamic Slate Recommendation with Gated Recurrent Units and Thompson Sampling\"](https://arxiv.org/abs/2104.15046) is found in (`code_eide_et_al21/`). However, we are in the process of making the data more generally available which makes the code incompatible with the current (newer) version of the data. Please use [the v1.0 release of the repository](https://github.com/finn-no/recsys-slates-dataset/tree/v1.0) for a compatible version of the code and dataset.\n", + "\n", + "## Quickstart dataset [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/finn-no/recsys-slates-dataset/blob/master/examples/quickstart-finn-recsys-slate-data.ipynb)\n", + "We provide a quickstart jupyter notebook that runs on Google Colab (quickstart-finn-recsys-slate-data.ipynb) which includes all necessary steps above.\n", + "It gives a quick introduction to how to use the dataset.\n", + "\n", + "## Citations\n", + "This repository accompany the paper [\"Dynamic Slate Recommendation with Gated Recurrent Units and Thompson Sampling\"](https://arxiv.org/abs/2104.15046) by Simen Eide, David S. Leslie and Arnoldo Frigessi.\n", + "The article is under review, and the pre-print can be obtained [here](https://arxiv.org/abs/2104.15046).\n", + "\n", + "If you use either the code, data or paper, please consider citing the paper.\n", + "\n", + "```\n", + "@article{eide2021dynamic,\n", + " title={Dynamic Slate Recommendation with Gated Recurrent Units and Thompson Sampling}, \n", + " author={Simen Eide and David S. Leslie and Arnoldo Frigessi},\n", + " year={2021},\n", + " eprint={2104.15046},\n", + " archivePrefix={arXiv},\n", + " primaryClass={stat.ML}\n", + "}\n", + "```\n", + "\n", + "## Todo\n", + "This repository is currently *work in progress*, and we will provide descriptions and tutorials. Suggestions and contributions to make the material more available is welcome.\n", + "There are some features of the repository that we are working on:\n", + "\n", + "- [x] Release the dataset as numpy objects instead of pytorch arrays. 
This will help non-pytorch users to more easily utilize the data\n", + "- [x] Maintain a pytorch dataset for easy usage\n", + "- [x] Create a pip package for easier installation and usage. the package should download the dataset using a function.\n", + "- [x] Make the quickstart guide compatible with the pip package and numpy format.\n", + "- [ ] The git lfs is currently broken by removing some lines in .gitattributes that is in conflict with nbdev. The dataset is still usable using the building download functions as they use a different source. However, we should fix this. An issue is [posted on nbdev](https://github.com/fastai/nbdev/issues/506).\n", + "- [ ] Add easily useable functions that compute relevant metrics such as hitrate, log-likelihood etc.\n", + "- [ ] Distribute the data on other platforms such as kaggle.\n", + "- [ ] Add a short description of the data in the readme.md directly.\n", + "\n", + "As the current state is in early stage, it makes sense to allow the above changes non-backward compatible. \n", + "However, this should be completed within the next couple of months." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3.8.8 64-bit ('base': conda)", + "name": "python3" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/quickstart-finn-recsys-slate-data.ipynb b/quickstart-finn-recsys-slate-data.ipynb deleted file mode 100644 index be771f9..0000000 --- a/quickstart-finn-recsys-slate-data.ipynb +++ /dev/null @@ -1 +0,0 @@ -{"nbformat":4,"nbformat_minor":0,"metadata":{"colab":{"name":"recsys-slates-dataset.ipynb","provenance":[],"collapsed_sections":["u6290SzC3Omz"],"toc_visible":true,"authorship_tag":"ABX9TyM/t3Kkk8sk4L0WO2ksrvGS"},"kernelspec":{"name":"python388jvsc74a57bd0b64057e63add2b45b1ffc7eab9b09c8889b419c878e2fdf0d08f837f0fc857a7","display_name":"Python 3.8.8 64-bit ('base': conda)"},"language_info":{"name":"python","version":"3.8.8"},"metadata":{"interpreter":{"hash":"b64057e63add2b45b1ffc7eab9b09c8889b419c878e2fdf0d08f837f0fc857a7"}}},"cells":[{"cell_type":"markdown","metadata":{"id":"e7_gJYqJMheQ"},"source":["# Quick start with the FINN.no recsys slate dataset [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/finn-no/recsys-slates-dataset/blob/master/quickstart-finn-recsys-slate-data.ipynb)\n","\n","This notebook gives an introduction to the dataset released with the paper [Dynamic Slate Recommendation with Gated Recurrent Units and Thompson Sampling](https://arxiv.org/abs/2104.15046). 
\n","It is compatible with google colab, and can be run interactive by using the \"Open in Colab\"-button."]},{"cell_type":"code","execution_count":null,"metadata":{},"outputs":[],"source":[]},{"cell_type":"markdown","metadata":{"id":"u6290SzC3Omz"},"source":["### Install dependencies, download and unzip data\n","This step is necessary for google colab, not if you have manually downloaded the repo."]},{"cell_type":"code","metadata":{"colab":{"base_uri":"https://localhost:8080/"},"id":"Wvt8VGGUJ3JE","executionInfo":{"status":"ok","timestamp":1618316794045,"user_tz":-120,"elapsed":214740,"user":{"displayName":"Simen Eide","photoUrl":"https://lh3.googleusercontent.com/a-/AOh14GhBrR64PgLfCXJBwq9GqVl3bvi57_j2i62P5Z8VDA=s64","userId":"07498942408416640037"}},"outputId":"9724bfbd-8ed5-4425-cd30-b745ec8d31a4"},"source":["!sudo apt-get install git-lfs -q\n","!git lfs install\n","!echo Clone data repository..:\n","!git clone https://github.com/finn-no/recsys-slates-dataset.git\n","!echo Unzip datafile..:\n","!gunzip -c recsys-slates-dataset/data/data.pt.gz >recsys-slates-dataset/data/data.pt"],"execution_count":1,"outputs":[{"output_type":"stream","name":"stdout","text":["Clone data repository..:\n","Cloning into 'recsys-slates-dataset'...\n","remote: Enumerating objects: 121, done.\u001b[K\n","remote: Counting objects: 100% (121/121), done.\u001b[K\n","remote: Compressing objects: 100% (84/84), done.\u001b[K\n","remote: Total 121 (delta 57), reused 90 (delta 34), pack-reused 0\u001b[K\n","Receiving objects: 100% (121/121), 844.46 KiB | 7.68 MiB/s, done.\n","Resolving deltas: 100% (57/57), done.\n","Filtering content: 100% (3/3), 1.30 GiB | 35.09 MiB/s, done.\n","Unzip datafile..:\n"]}]},{"cell_type":"code","metadata":{"id":"46TmJdoP3XHc","executionInfo":{"status":"ok","timestamp":1618316796739,"user_tz":-120,"elapsed":217416,"user":{"displayName":"Simen Eide","photoUrl":"https://lh3.googleusercontent.com/a-/AOh14GhBrR64PgLfCXJBwq9GqVl3bvi57_j2i62P5Z8VDA=s64","userId":"07498942408416640037"}}},"source":["import torch\n","import pickle"],"execution_count":2,"outputs":[]},{"cell_type":"markdown","metadata":{"id":"VHgw9fK04xfM"},"source":["### Main dataset file `data.pt`\n","The dataset consist of 2.2M unique users that have interacted up to 20 times with the internet platform platform, and has been exposed to up to 25 items at each interaction.\n","`data.pt` contains all the slate and click data, and the two main arrays are `click` and `action`. \n","The convention of the dimension of the arrays are that the first dimension is per user, second dimension is time and third dimension is the presented slate.\n","The full description of all array are as follows:\n","\n","| Name | Dimension | Description |\n","| ------------- |:-------------:| -----:|\n","| action | [userId, interaction num, slate pos]| the presented slates to the users; |\n","| click | [userId, interaction num] | items clicked by the users in each slate |\n","| displayType | [userId, interaction num] | type of interaction the user had with the platform (search or recommendation) |\n","| click_idx | [userId, interaction num] | Auxillary data: The position of the click in the `action` dataframe (integer from 0-24).
Useful for e.g. categorical likelihoods |\n","| lengths | [userId, interaction num] | Auxillary data: the actual length of the slate.
Same as 25-`\"number of pad index in action\"` |\n","\n","\n","\n"]},{"cell_type":"code","metadata":{"id":"f9DfbqCiyLoa","executionInfo":{"status":"ok","timestamp":1618316992003,"user_tz":-120,"elapsed":412666,"user":{"displayName":"Simen Eide","photoUrl":"https://lh3.googleusercontent.com/a-/AOh14GhBrR64PgLfCXJBwq9GqVl3bvi57_j2i62P5Z8VDA=s64","userId":"07498942408416640037"}}},"source":["# Load dataset\n","dat = torch.load(\"recsys-slates-dataset/data/data.pt\")"],"execution_count":3,"outputs":[]},{"cell_type":"code","metadata":{"colab":{"base_uri":"https://localhost:8080/"},"id":"HzNLLqJh7wn4","executionInfo":{"status":"ok","timestamp":1618316992005,"user_tz":-120,"elapsed":412656,"user":{"displayName":"Simen Eide","photoUrl":"https://lh3.googleusercontent.com/a-/AOh14GhBrR64PgLfCXJBwq9GqVl3bvi57_j2i62P5Z8VDA=s64","userId":"07498942408416640037"}},"outputId":"fddffba4-cec6-4419-d3e6-09682c3e789c"},"source":["# Print dimensions of all arrays:\n","for key, val in dat.items():\n"," print(f\"{key} : \\t {val.size()}\")"],"execution_count":4,"outputs":[{"output_type":"stream","name":"stdout","text":["userId : \t torch.Size([2277645])\nlengths : \t torch.Size([2277645, 20])\ndisplayType : \t torch.Size([2277645, 20])\naction : \t torch.Size([2277645, 20, 25])\nclick : \t torch.Size([2277645, 20])\nclick_idx : \t torch.Size([2277645, 20])\n"]}]},{"cell_type":"markdown","metadata":{"id":"30Foho6fE9HN"},"source":["#### Example: Get one interaction\n","Get the presented slate + click for user 5 at interaction number 3"]},{"cell_type":"code","metadata":{"colab":{"base_uri":"https://localhost:8080/"},"id":"bqgFFWT4DKWR","executionInfo":{"status":"ok","timestamp":1618316992005,"user_tz":-120,"elapsed":412651,"user":{"displayName":"Simen Eide","photoUrl":"https://lh3.googleusercontent.com/a-/AOh14GhBrR64PgLfCXJBwq9GqVl3bvi57_j2i62P5Z8VDA=s64","userId":"07498942408416640037"}},"outputId":"e8b182af-2ba3-4a08-afc5-651bee1df7af"},"source":["print(\"Slate:\")\n","print(dat['action'][5,3])\n","print(\" \")\n","print(\"Click:\")\n","print(dat['click'][5,3])\n","print(\"Type of interaction: (1 implies search, see ind2val file)\")\n","print(dat['displayType'][5,3])"],"execution_count":5,"outputs":[{"output_type":"stream","name":"stdout","text":["Slate:\ntensor([ 1, 638995, 638947, 638711, 637590, 637930, 638894, 0, 0,\n 0, 0, 0, 0, 0, 0, 0, 0, 0,\n 0, 0, 0, 0, 0, 0, 0])\n \nClick:\ntensor(637590)\nType of interaction: (1 implies search, see ind2val file)\ntensor(1)\n"]}]},{"cell_type":"markdown","metadata":{"id":"xN_7WBrUFcCq"},"source":["From the above extraction we can see that user 5 at interaction number 3 was presented with a total of 7 items: 6 \"real\" items and the \"no-click\" item that has index 1. 
The remaining positions in the array is padded with the index 0.\n","The \"no-click\" item is always present in the slates, as the user has the alternative not to click on any of the presented items in the slate.\n","Further, we see that the user clicked on the 4'th item in the slate.\n","The slate length and the click position can be found by the following auxillary arrays:"]},{"cell_type":"code","metadata":{"colab":{"base_uri":"https://localhost:8080/"},"id":"9HoOdTDfGAlx","executionInfo":{"status":"ok","timestamp":1618316992006,"user_tz":-120,"elapsed":412647,"user":{"displayName":"Simen Eide","photoUrl":"https://lh3.googleusercontent.com/a-/AOh14GhBrR64PgLfCXJBwq9GqVl3bvi57_j2i62P5Z8VDA=s64","userId":"07498942408416640037"}},"outputId":"b7cb51f8-b8ef-4b2c-daac-ba2c8e8fd289"},"source":["print(\"Click_idx:\")\n","print(dat['click_idx'][5,3])\n","print(\"lengths:\")\n","print(dat['lengths'][5,3])"],"execution_count":6,"outputs":[{"output_type":"stream","name":"stdout","text":["Click_idx:\ntensor(4)\nlengths:\ntensor(7)\n"]}]},{"cell_type":"markdown","metadata":{"id":"8orXI4uBOlgY"},"source":["### Index to item file `ind2val.pickle`\n","This files contains mapping from indices to values for the attributes userId, itemId, category and displayType.\n","\n","| Name | Length | Description |\n","| -------------|:----:| -----:|\n","| userId | 1.3M | Scrambled id of users |\n","| itemId | 2.3M | Scrambled id of items.
First indicies disclose pad, noclick and unk items. |\n","| category | 290 | Mapping from the category index to a text string that describes the category.
The category value is a text string that describes the category and location of the group |\n","| displayType | 3 | Indices of whether the presented slate originated from search or recommendations|"]},{"cell_type":"markdown","metadata":{"id":"dOHC_oHMP6xH"},"source":["#### Example `ind2val`\n","We print out the first elements of each index.\n","For example, we see that category 3 is \"BAP,antiques,Trøndelag\" which implies the category contains antiques sold in the county of Trøndelag."]},{"cell_type":"code","metadata":{"colab":{"base_uri":"https://localhost:8080/"},"id":"BVHX7G3fOnxy","executionInfo":{"status":"ok","timestamp":1618317602126,"user_tz":-120,"elapsed":1788,"user":{"displayName":"Simen Eide","photoUrl":"https://lh3.googleusercontent.com/a-/AOh14GhBrR64PgLfCXJBwq9GqVl3bvi57_j2i62P5Z8VDA=s64","userId":"07498942408416640037"}},"outputId":"a531b92a-b96a-4904-c058-f3fadf98058d"},"source":["ind2val = pickle.load(open(\"recsys-slates-dataset/data/ind2val.pickle\", \"rb\"))\n","for key, val in ind2val.items():\n"," print(\" \")\n"," print(f\"{key} first entries:\")\n"," for idx, name in val.items():\n"," print(f\"{idx}: {val[idx]}\")\n"," if idx >3:\n"," break"],"execution_count":7,"outputs":[{"output_type":"stream","name":"stdout","text":[" \nitemId first entries:\n0: PAD\n1: noClick\n2: \n3: item_3\n4: item_4\n \ncategory first entries:\n0: PAD\n1: noClick\n2: \n3: BAP,antiques,Trøndelag\n4: MOTOR,,Sogn og Fjordane\n \ndisplayType first entries:\n1: search\n2: rec\n0: \n \nuserId first entries:\n1: user_1\n2: user_2\n3: user_3\n4: user_4\n"]}]},{"cell_type":"markdown","metadata":{"id":"1BO23cqGGjBE"},"source":["### Item attributes file `itemattr.pickle`\n","A small attribute file that provides two pieces of information on the items. These are stored as numpy arrays.\n","\n","| Name | Dimension | Description |\n","| ------------- |:-------------:| -----:|\n","| category | [itemId] | The group that each item belong to |\n","| actions | [itemId] | Auxillary data: count of the number of total exposures per item.
`-1` is used to pad special items (unk, pad,noclick) |\n"]},{"cell_type":"code","metadata":{"colab":{"base_uri":"https://localhost:8080/"},"id":"J_J_Q30wGiV4","executionInfo":{"status":"ok","timestamp":1618318248940,"user_tz":-120,"elapsed":457,"user":{"displayName":"Simen Eide","photoUrl":"https://lh3.googleusercontent.com/a-/AOh14GhBrR64PgLfCXJBwq9GqVl3bvi57_j2i62P5Z8VDA=s64","userId":"07498942408416640037"}},"outputId":"491f619c-7dfa-41d3-ffd9-2576b82784c2"},"source":["itemattr = pickle.load(open(\"recsys-slates-dataset/data/itemattr.pickle\", \"rb\"))\n","\n","for key, val in itemattr.items():\n"," print(f\"{key} : {val.shape}\")\n","\n","print(\"\\nThe full dictionary:\")\n","itemattr"],"execution_count":8,"outputs":[{"output_type":"stream","name":"stdout","text":["actions : (1311775,)\ncategory : (1311775,)\n\nThe full dictionary:\n"]},{"output_type":"execute_result","data":{"text/plain":["{'actions': array([-1., -1., -1., ..., 39., 14., 4.]),\n"," 'category': array([ 0., 1., 2., ..., 289., 289., 289.])}"]},"metadata":{},"execution_count":8}]},{"cell_type":"markdown","metadata":{"id":"q2gyukQYIdTt"},"source":["#### Example `itemattr`\n","Get the category of the clicked item above (from user 5, interaction number 3)"]},{"cell_type":"code","metadata":{"colab":{"base_uri":"https://localhost:8080/"},"id":"bS4765g8IDn4","executionInfo":{"status":"ok","timestamp":1618318138850,"user_tz":-120,"elapsed":845,"user":{"displayName":"Simen Eide","photoUrl":"https://lh3.googleusercontent.com/a-/AOh14GhBrR64PgLfCXJBwq9GqVl3bvi57_j2i62P5Z8VDA=s64","userId":"07498942408416640037"}},"outputId":"92af5abf-4560-49d7-a0d3-9c0e0f96f582"},"source":["print(\"Find the itemId that were click by user 5 in interaction 3:\")\n","itemId = [dat['click'][5,3]]\n","print(f\"itemId: {itemId}\")\n","\n","print(\"\\nFind the category index of that item in itemattr:\")\n","cat_idx = itemattr['category'][itemId]\n","print(f\"Category index: {cat_idx}\")\n","\n","print(\"\\nFinally, find the category name by using ind2val:\")\n","cat_name = ind2val['category'][cat_idx.item()]\n","print(f\"Category name: {cat_name}\")"],"execution_count":9,"outputs":[{"output_type":"stream","name":"stdout","text":["Find the itemId that were click by user 5 in interaction 3:\nitemId: [tensor(637590)]\n\nFind the category index of that item in itemattr:\nCategory index: [135.]\n\nFinally, find the category name by using ind2val:\nCategory name: REAL_ESTATE,,Oppland\n"]}]},{"source":["## Print some statistics about the dataset"],"cell_type":"markdown","metadata":{}},{"cell_type":"code","execution_count":11,"metadata":{},"outputs":[{"output_type":"stream","name":"stdout","text":["Ratio of no clicks: 0.24\n","Average slate length: 11.14\n","Ratio of slates that are recommendations: 0.303\n","Average number of interactions per user: 16.43\n"]}],"source":["print(f\"Ratio of no clicks: {(dat['click']==1).sum() / (dat['click']!=0).sum():.2f}\")\n","print(f\"Average slate length: {(dat['lengths'][dat['lengths']!=0]).float().mean():.2f}\")\n","print(f\"Ratio of slates that are recommendations: {(dat['displayType']==2).sum() / (dat['displayType']!=0).sum():.3f}\")\n","print(f\"Average number of interactions per user: {(dat['click']!=0).sum(-1).float().mean():.2f}\")"]},{"source":["# Directly load Pytorch Dataloaders with train/valid/test split\n","It is possible to directly load the dataset as a pytorch dataloader which includes the same dataset splits etc as in the original paper.\n","Use the `load_dataloaders` function in the `dataset.py` file. 
It has the following options:\n","\n","| Argument | Description |\n","| ------------- |-----:|\n","| batch_size | Number of unique users sampled in each batch |\n","| split_trainvalid | Ratio of full dataset dedicated to train
(val/test is split evenly among the rest) |\n","| t_testsplit | For users in valid and test,
how many interactions should belong to training set |\n","| sample_uniform_action | If this is True, the exposures in the dataset
are sampled as in the `all-item likelihood` (see paper) |\n","\n","The outputs of the function is the same `ind2val` and `itemattr` as above.\n","It also returns a dictionary with all the dataloaders."],"cell_type":"markdown","metadata":{"id":"jV2Kbm79DGBP","executionInfo":{"status":"ok","timestamp":1618316994532,"user_tz":-120,"elapsed":415150,"user":{"displayName":"Simen Eide","photoUrl":"https://lh3.googleusercontent.com/a-/AOh14GhBrR64PgLfCXJBwq9GqVl3bvi57_j2i62P5Z8VDA=s64","userId":"07498942408416640037"}}}},{"cell_type":"code","execution_count":12,"metadata":{},"outputs":[{"output_type":"stream","name":"stderr","text":["2021-05-27 18:25:55,179 Load data..\n","2021-05-27 18:26:01,887 Loading dataset with action size=torch.Size([2277645, 20, 25]) and uniform candidate sampling=False\n","2021-05-27 18:26:02,651 In train: num_users: 2277645, num_batches: 76\n","2021-05-27 18:26:02,651 In valid: num_users: 113882, num_batches: 4\n","2021-05-27 18:26:02,651 In test: num_users: 113882, num_batches: 4\n"," \n","Dictionary containing the dataloaders:\n","{'train': , 'valid': , 'test': }\n"]}],"source":["import dataset\n","ind2val, itemattr, dataloaders = dataset.load_dataloaders(\"recsys-slates-dataset/data\",\n"," batch_size=30000,\n"," split_trainvalid=0.9,\n"," t_testsplit = 5,\n"," sample_uniform_action=False)\n","print(\" \")\n","print(\"Dictionary containing the dataloaders:\")\n","print(dataloaders)"]},{"cell_type":"code","execution_count":13,"metadata":{},"outputs":[{"output_type":"stream","name":"stdout","text":["userId torch.Size([30000])\nlengths torch.Size([30000, 20])\ndisplayType torch.Size([30000, 20])\naction torch.Size([30000, 20, 25])\nclick torch.Size([30000, 20])\nclick_idx torch.Size([30000, 20])\nmask_type torch.Size([30000, 20])\n"]}],"source":["batch = next(iter(dataloaders['train']))\n","for key, val in batch.items():\n"," print(key, val.size())"]},{"source":["### Masking of train/test/val\n","Each batch returns a dictionary of pytorch tensors with data, and contains the usual data fields described above.\n","In addition, it contains a `mask_type` tensor which explains whether each click belongs to _train_, _valid_ or _test_.\n","It is of the same dimensionality as the click tensor (`num users * num interactions`).\n","This is because we want to return the full sequence of interactions so that e.g. 
the test set can use the first clicks of the user (which belongs to the training set) to build a user profile.\n","The mask is defined in the following way:\n","\n","```\n","mask2split = {\n"," 0 : 'PAD',\n"," 1 : 'train',\n"," 2 : 'valid',\n"," 3 : 'test'\n","}\n","```\n","If the mask equals zero it means that the length of the user sequence was shorter than this index.\n","The modeler has to take care to not train on elements in the validation or test dataset.\n","Typically this can be done by masking all losses that does not originate from the training dataset:"],"cell_type":"markdown","metadata":{}},{"cell_type":"code","execution_count":14,"metadata":{},"outputs":[{"output_type":"execute_result","data":{"text/plain":["tensor([[True, True, True, ..., True, True, True],\n"," [True, True, True, ..., True, True, True],\n"," [True, True, True, ..., True, True, True],\n"," ...,\n"," [True, True, True, ..., True, True, True],\n"," [True, True, True, ..., True, True, True],\n"," [True, True, True, ..., True, True, True]])"]},"metadata":{},"execution_count":14}],"source":["train_mask = (batch['mask_type']==1)\n","train_mask"]},{"source":["For example, for user number 1 in this batch, the first five interactions belong to the training set, and the remaining belongs to the validation set.\n","We can extract the clicks that belong to the training set by using `mask_type`:"],"cell_type":"markdown","metadata":{}},{"cell_type":"code","execution_count":15,"metadata":{},"outputs":[{"output_type":"stream","name":"stdout","text":["Mask of user 2:\ntensor([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1])\n \nClicks belonging to the training set:\ntensor([True, True, True, True, True, True, True, True, True, True, True, True,\n True, True, True, True, True, True, True, True])\n \nSelect only the clicks in training dataset:\n"]},{"output_type":"execute_result","data":{"text/plain":["tensor([ 73296, 66666, 1154594, 613719, 642978, 1231978, 1231727, 1,\n"," 56397, 0, 0, 0, 0, 0, 0, 0,\n"," 0, 0, 0, 0])"]},"metadata":{},"execution_count":15}],"source":["print(\"Mask of user 2:\")\n","print(batch['mask_type'][1,])\n","print(\" \")\n","print(\"Clicks belonging to the training set:\")\n","print(train_mask[1,])\n","print(\" \")\n","print(\"Select only the clicks in training dataset:\")\n","batch['click'][1,][train_mask[1,]]"]},{"cell_type":"code","execution_count":null,"metadata":{},"outputs":[],"source":[]}]} \ No newline at end of file diff --git a/recsys_slates_dataset/__init__.py b/recsys_slates_dataset/__init__.py new file mode 100644 index 0000000..3b93d0b --- /dev/null +++ b/recsys_slates_dataset/__init__.py @@ -0,0 +1 @@ +__version__ = "0.0.2" diff --git a/recsys_slates_dataset/_nbdev.py b/recsys_slates_dataset/_nbdev.py new file mode 100644 index 0000000..0344ff0 --- /dev/null +++ b/recsys_slates_dataset/_nbdev.py @@ -0,0 +1,16 @@ +# AUTOGENERATED BY NBDEV! DO NOT EDIT! 
+ +__all__ = ["index", "modules", "custom_doc_links", "git_url"] + +index = {"download_data_files": "datahelper.ipynb", + "SequentialDataset": "dataset_torch.ipynb", + "load_dataloaders": "dataset_torch.ipynb"} + +modules = ["datahelper.py", + "dataset_torch.py"] + +doc_url = "http://opensource.finn.no/recsys_slates_dataset/" + +git_url = "https://github.com/finn-no/recsys_slates_dataset/tree/nbdev/" + +def custom_doc_links(name): return None diff --git a/recsys_slates_dataset/core.py b/recsys_slates_dataset/core.py new file mode 100644 index 0000000..85a7bd8 --- /dev/null +++ b/recsys_slates_dataset/core.py @@ -0,0 +1,8 @@ +# AUTOGENERATED! DO NOT EDIT! File to edit: 00_core.ipynb (unless otherwise specified). + +__all__ = ['say_hello'] + +# Cell +def say_hello(to): + "Say hello to somebody" + return f'Hello {to}!' \ No newline at end of file diff --git a/recsys_slates_dataset/datahelper.py b/recsys_slates_dataset/datahelper.py new file mode 100644 index 0000000..d1a1884 --- /dev/null +++ b/recsys_slates_dataset/datahelper.py @@ -0,0 +1,35 @@ +# AUTOGENERATED! DO NOT EDIT! File to edit: datahelper.ipynb (unless otherwise specified). + +__all__ = ['download_data_files'] + +# Cell +import logging +from google_drive_downloader import GoogleDriveDownloader as gdd +def download_data_files(data_dir : str = "data", overwrite=False, progbar=True, use_int32=True): + """ + Downloads the data from google drive. + + - data_dir: relative path to where data is downloaded. + - overwrite: if False (default), files that already exist are not downloaded again. NB/todo: the function does not check whether an existing file is a complete download. + - progbar: show a simple progress bar indicating how much data has been downloaded for each file. + - use_int32: the interaction data is a very large file and cannot be loaded into memory in some environments (e.g. google colab). We therefore recommend the int32 version of the data. + """ + + if use_int32: + data_fileid = '1XHqyk01qi9qnvBTfWWwqgDzrdjv1eBVV' + else: + data_fileid = '1VXKXIvPCJ7z4BCa4G_5-Q2XMAD7nXOc7' + + gdrive_file_ids = { + 'data.npz' : data_fileid, + 'ind2val.json' : '1WOCKfuttMacCb84yQYcRjxjEtgPp6F4N', + 'itemattr.npz' : '1rKKyMQZqWp8vQ-Pl1SeHrQxzc5dXldnR' + } + + for filename, gdrive_id in gdrive_file_ids.items(): + logging.info("Downloading {}".format(filename)) + gdd.download_file_from_google_drive(file_id=gdrive_id, + dest_path="{}/{}".format(data_dir, filename), + overwrite=overwrite, showsize=progbar) + logging.info("Done downloading all files.") + return True diff --git a/dataset_torch.py b/recsys_slates_dataset/dataset_torch.py similarity index 85% rename from dataset_torch.py rename to recsys_slates_dataset/dataset_torch.py index a899833..8a39f29 100644 --- a/dataset_torch.py +++ b/recsys_slates_dataset/dataset_torch.py @@ -1,6 +1,10 @@ -#%% Imports +# AUTOGENERATED! DO NOT EDIT! File to edit: dataset_torch.ipynb (unless otherwise specified). + +__all__ = ['SequentialDataset', 'load_dataloaders'] + +# Cell import torch -import datahelper +import recsys_slates_dataset.datahelper as datahelper from torch.utils.data import Dataset, DataLoader import torch import json @@ -8,7 +12,6 @@ import logging logging.basicConfig(format='%(asctime)s %(message)s', level='INFO') -#%% DATALOADERS class SequentialDataset(Dataset): ''' Note: displayType has been uncommented for future easy implementation. 
@@ -28,8 +31,8 @@ def __getitem__(self, idx): if self.sample_uniform_slate: # Sample actions uniformly: action = torch.randint_like(batch['slate'], low=3, high=self.num_items) - - # Add noclick action at pos0 + + # Add noclick action at pos0 # and the actual click action at pos 1 (unless noclick): action[:,0] = 1 clicked = batch['click']!=1 @@ -37,21 +40,24 @@ def __getitem__(self, idx): batch['slate'] = action # Set click idx to 0 if noclick, and 1 otherwise: batch['click_idx'] = clicked.long() - + return batch def __len__(self): return len(self.data['click']) -#%% PREPARE DATA IN TRAINING -def load_dataloaders(data_dir, +# Cell +def load_dataloaders(data_dir= "dat", batch_size=1024, num_workers= 0, sample_uniform_slate=False, valid_pct= 0.05, test_pct= 0.05, t_testsplit= 5): - + """ + Loads pytorch dataloaders to be used in training. With default settings, the train/valid/test split is equivalent to the one used in Eide et al. (2021). + """ + logging.info("Download data if not in data folder..") datahelper.download_data_files(data_dir=data_dir) @@ -59,13 +65,13 @@ def load_dataloaders(data_dir, with np.load("{}/data.npz".format(data_dir)) as data_np: data = {key: torch.tensor(val) for key, val in data_np.items()} dataset = SequentialDataset(data, sample_uniform_slate) - + with open('{}/ind2val.json'.format(data_dir), 'rb') as handle: # Use string2int object_hook found here: https://stackoverflow.com/a/54112705 ind2val = json.load( - handle, + handle, object_hook=lambda d: { - int(k) if k.lstrip('-').isdigit() else k: v + int(k) if k.lstrip('-').isdigit() else k: v for k, v in d.items() } ) @@ -85,7 +91,7 @@ def load_dataloaders(data_dir, dataset.data['mask_type'][test_user_idx, t_testsplit:] = 3 subsets = { - 'train': dataset, + 'train': dataset, 'valid': torch.utils.data.Subset(dataset, valid_user_idx), 'test': torch.utils.data.Subset(dataset, test_user_idx) } @@ -99,12 +105,9 @@ def load_dataloaders(data_dir, logging.info( "In {}: num_users: {}, num_batches: {}".format(key, len(dl.dataset), len(dl)) ) - + # Load item attributes: with np.load('{}/itemattr.npz'.format(data_dir), mmap_mode=None) as itemattr_file: itemattr = {key : val for key, val in itemattr_file.items()} - return ind2val, itemattr, dataloaders - -if __name__ == "__main__": - load_dataloaders() \ No newline at end of file + return ind2val, itemattr, dataloaders \ No newline at end of file diff --git a/requirements.txt b/requirements.txt deleted file mode 100644 index f6bb465..0000000 --- a/requirements.txt +++ /dev/null @@ -1,14 +0,0 @@ -pyro_ppl==1.3.1 -requests==2.25.1 -numpy==1.19.5 -matplotlib==3.3.4 -gpustat==0.6.0 -names==0.3.0 -pandas==1.0.5 -torch==1.6.0 -seaborn==0.11.0 -ax==0.52.0 -plotly==4.14.3 -pyro==3.16 -PyYAML==5.4.1 -googledrivedownloader==0.4 \ No newline at end of file diff --git a/settings.ini b/settings.ini new file mode 100644 index 0000000..af17ab9 --- /dev/null +++ b/settings.ini @@ -0,0 +1,75 @@ +[DEFAULT] +# All sections below are required unless otherwise specified +host = github +lib_name = recsys_slates_dataset +# For Enterprise Git add variable repo_name and company name +# repo_name = analytics +# company_name = nike + +user = finn-no +description = Recommender Systems Dataset from FINN.no containing the slates of items presented to each user and which item, if any, the user clicked on. 
+keywords = recommender systems, dataset, slates, recsys +author = simen eide +author_email = simeneide@gmail.com +copyright = Use with attribution +branch = nbdev +version = 0.0.2 +min_python = 3.6 +audience = Developers +language = English +# Set to True if you want to create a fancier sidebar.json than the default +custom_sidebar = False +# Add licenses and see current list in `setup.py` +license = tbd +# From 1-7: Planning Pre-Alpha Alpha Beta Production Mature Inactive +status = 2 + +# Optional. Same format as setuptools requirements +requirements = requests>=2.25.1 numpy>=1.19.5 pandas>=1.0.5 torch>=1.6.0 PyYAML==5.4.1 googledrivedownloader==0.4 +# Optional. Same format as setuptools console_scripts +# console_scripts = +# Optional. Same format as setuptools dependency-links +# dep_links = + +### +# You probably won't need to change anything under here, +# unless you have some special requirements +### + +# Change to, e.g. "nbs", to put your notebooks in nbs dir instead of repo root +nbs_path = . +doc_path = docs + +# Whether to look for library notebooks recursively in the `nbs_path` dir +recursive = False + +# Anything shown as '%(...)s' is substituted with that setting automatically +doc_host = http://opensource.finn.no +#For Enterprise Git pages use: +#doc_host = https://pages.github.%(company_name)s.com. + + +doc_baseurl = /%(lib_name)s/ +# For Enterprise Github pages docs use: +# doc_baseurl = /%(repo_name)s/%(lib_name)s/ + +git_url = https://github.com/%(user)s/%(lib_name)s/tree/%(branch)s/ +# For Enterprise Github use: +#git_url = https://github.%(company_name)s.com/%(repo_name)s/%(lib_name)s/tree/%(branch)s/ + + + +lib_path = %(lib_name)s +title = %(lib_name)s + +#Optional advanced parameters +#Monospace docstrings: adds
 <pre> tags around the doc strings, preserving newlines/indentation.
+#monospace_docstrings = False
+#Test flags: introduce here the test flags you want to use separated by |
+#tst_flags = 
+#Custom sidebar: customize sidebar.json yourself for advanced sidebars (False/True)
+#custom_sidebar = 
+#Cell spacing: if you want cell blocks in code separated by more than one new line
+#cell_spacing = 
+#Custom jekyll styles: if you want more jekyll styles than tip/important/warning, set them here
+#jekyll_styles = note,warning,tip,important
diff --git a/setup.py b/setup.py
new file mode 100644
index 0000000..6de732f
--- /dev/null
+++ b/setup.py
@@ -0,0 +1,52 @@
+from pkg_resources import parse_version
+from configparser import ConfigParser
+import setuptools
+assert parse_version(setuptools.__version__)>=parse_version('36.2')
+
+# note: all settings are in settings.ini; edit there, not here
+config = ConfigParser(delimiters=['='])
+config.read('settings.ini')
+cfg = config['DEFAULT']
+
+cfg_keys = 'version description keywords author author_email'.split()
+expected = cfg_keys + "lib_name user branch license status min_python audience language".split()
+for o in expected: assert o in cfg, "missing expected setting: {}".format(o)
+setup_cfg = {o:cfg[o] for o in cfg_keys}
+
+licenses = {
+    'apache2': ('Apache Software License 2.0','OSI Approved :: Apache Software License'),
+    'mit': ('MIT License', 'OSI Approved :: MIT License'),
+    'gpl2': ('GNU General Public License v2', 'OSI Approved :: GNU General Public License v2 (GPLv2)'),
+    'gpl3': ('GNU General Public License v3', 'OSI Approved :: GNU General Public License v3 (GPLv3)'),
+    'bsd3': ('BSD License', 'OSI Approved :: BSD License'),
+    'ccby4' : ("CC BY 4.0" , 'Attribution 4.0 International'),
+    'tbd' : ("License to be decided", "")
+}
+statuses = [ '1 - Planning', '2 - Pre-Alpha', '3 - Alpha',
+    '4 - Beta', '5 - Production/Stable', '6 - Mature', '7 - Inactive' ]
+py_versions = '2.0 2.1 2.2 2.3 2.4 2.5 2.6 2.7 3.0 3.1 3.2 3.3 3.4 3.5 3.6 3.7 3.8'.split()
+
+requirements = cfg.get('requirements','').split()
+min_python = cfg['min_python']
+lic = licenses.get(cfg['license'].lower(), (cfg['license'], None))
+
+setuptools.setup(
+    name = cfg['lib_name'],
+    license = lic[0],
+    classifiers = [
+        'Development Status :: ' + statuses[int(cfg['status'])-1],  # status in settings.ini is 1-7; the list is 0-indexed
+        'Intended Audience :: ' + cfg['audience'].title(),
+        'Natural Language :: ' + cfg['language'].title(),
+    ] + ['Programming Language :: Python :: '+o for o in py_versions[py_versions.index(min_python):]] + (['License :: ' + lic[1] ] if lic[1] else []),
+    url = cfg['git_url'],
+    packages = setuptools.find_packages(),
+    include_package_data = True,
+    install_requires = requirements,
+    dependency_links = cfg.get('dep_links','').split(),
+    python_requires  = '>=' + cfg['min_python'],
+    long_description = open('README.md').read(),
+    long_description_content_type = 'text/markdown',
+    zip_safe = False,
+    entry_points = { 'console_scripts': cfg.get('console_scripts','').split() },
+    **setup_cfg)
+