Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
Showing
26 changed files
with
44,474 additions
and
129 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,129 +1,8 @@ | ||
# Byte-compiled / optimized / DLL files | ||
__pycache__/ | ||
*.py[cod] | ||
*$py.class | ||
data/raw/* | ||
tmp/* | ||
.idea/ | ||
.vscode/ | ||
**/.DS_Store | ||
**/__pycache__ | ||
**/.ipynb_checkpoints | ||
|
||
# C extensions | ||
*.so | ||
|
||
# Distribution / packaging | ||
.Python | ||
build/ | ||
develop-eggs/ | ||
dist/ | ||
downloads/ | ||
eggs/ | ||
.eggs/ | ||
lib/ | ||
lib64/ | ||
parts/ | ||
sdist/ | ||
var/ | ||
wheels/ | ||
pip-wheel-metadata/ | ||
share/python-wheels/ | ||
*.egg-info/ | ||
.installed.cfg | ||
*.egg | ||
MANIFEST | ||
|
||
# PyInstaller | ||
# Usually these files are written by a python script from a template | ||
# before PyInstaller builds the exe, so as to inject date/other infos into it. | ||
*.manifest | ||
*.spec | ||
|
||
# Installer logs | ||
pip-log.txt | ||
pip-delete-this-directory.txt | ||
|
||
# Unit test / coverage reports | ||
htmlcov/ | ||
.tox/ | ||
.nox/ | ||
.coverage | ||
.coverage.* | ||
.cache | ||
nosetests.xml | ||
coverage.xml | ||
*.cover | ||
*.py,cover | ||
.hypothesis/ | ||
.pytest_cache/ | ||
|
||
# Translations | ||
*.mo | ||
*.pot | ||
|
||
# Django stuff: | ||
*.log | ||
local_settings.py | ||
db.sqlite3 | ||
db.sqlite3-journal | ||
|
||
# Flask stuff: | ||
instance/ | ||
.webassets-cache | ||
|
||
# Scrapy stuff: | ||
.scrapy | ||
|
||
# Sphinx documentation | ||
docs/_build/ | ||
|
||
# PyBuilder | ||
target/ | ||
|
||
# Jupyter Notebook | ||
.ipynb_checkpoints | ||
|
||
# IPython | ||
profile_default/ | ||
ipython_config.py | ||
|
||
# pyenv | ||
.python-version | ||
|
||
# pipenv | ||
# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. | ||
# However, in case of collaboration, if having platform-specific dependencies or dependencies | ||
# having no cross-platform support, pipenv may install dependencies that don't work, or not | ||
# install all needed dependencies. | ||
#Pipfile.lock | ||
|
||
# PEP 582; used by e.g. github.com/David-OConnor/pyflow | ||
__pypackages__/ | ||
|
||
# Celery stuff | ||
celerybeat-schedule | ||
celerybeat.pid | ||
|
||
# SageMath parsed files | ||
*.sage.py | ||
|
||
# Environments | ||
.env | ||
.venv | ||
env/ | ||
venv/ | ||
ENV/ | ||
env.bak/ | ||
venv.bak/ | ||
|
||
# Spyder project settings | ||
.spyderproject | ||
.spyproject | ||
|
||
# Rope project settings | ||
.ropeproject | ||
|
||
# mkdocs documentation | ||
/site | ||
|
||
# mypy | ||
.mypy_cache/ | ||
.dmypy.json | ||
dmypy.json | ||
|
||
# Pyre type checker | ||
.pyre/ |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1 +1,62 @@ | ||
# retailhero-recomender-baseline | ||
# Бэйслайн к задаче [RetailHero.ai/#2](https://retailhero.ai/c/recommender_system/overview) от [@geffy](https://github.com/geffy) | ||
|
||
Репозиторий содержит: | ||
* item-to-item модель (NMAP 0.1137, top5 на 09/01/2020) | ||
* распиливание исходных данных на шарды | ||
* вспомогательный переиспользуемый код | ||
* скрипты для обучения кастомных эмбеддингов на pytorch | ||
* быстрый поиск соседей в связке с faiss | ||
* кастомный docker-образ с поддержкой pytorch 1.3 и faiss | ||
|
||
Код написан так, что вполне успешно отрабатывает на машине с 8gb ram. | ||
|
||
## Шаги по подготовке: | ||
|
||
0. Скопировать данные в data/raw | ||
``` | ||
cd {REPO_ROOT} | ||
mkdir -p data/raw | ||
cp /path/to/unpacked/data/*.csv ./data/raw | ||
cd src | ||
``` | ||
|
||
|
||
1. Разделить исходные данные о покупках на 16 частей | ||
```bash | ||
python3 purchases_to_jrows.py | ||
``` | ||
|
||
|
||
2. Подготовить train/valid данные в формате, максимально близком к формату `check_queries.tsv` | ||
```bash | ||
python3 train_valid_split.py | ||
``` | ||
|
||
3. Обучить item-2-item модель: | ||
```bash | ||
python3 train_i2i_model.py | ||
``` | ||
|
||
4. Скопировать артефакты в сабмит | ||
```bash | ||
cd {REPO_ROOT} | ||
mkdir -p submit/solution/assets | ||
cp ./data/raw/products.csv submit/solution/assets | ||
cp ./tmp/implicit_cosine1/model.pkl submit/solution/assets | ||
``` | ||
|
||
5. Упаковать сабмит | ||
```bash | ||
cd submit | ||
zip -r submit_title.zip solution/* | ||
``` | ||
|
||
6. Profit! | ||
|
||
## Результаты: | ||
``` | ||
Check: 0,1113 | ||
Public: 0,1137 | ||
``` | ||
|
||
Обучение кастомных эмбеддингов в текущем решении фактически не используется, их код оставлен для экспериментов. |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,94 @@ | ||
{ | ||
"cells": [ | ||
{ | ||
"cell_type": "code", | ||
"execution_count": null, | ||
"metadata": {}, | ||
"outputs": [], | ||
"source": [ | ||
"import pandas as pd\n", | ||
"import numpy as np\n", | ||
"import json\n", | ||
"from scipy import sparse as sp\n", | ||
"from tqdm.notebook import tqdm\n", | ||
"from collections import defaultdict" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": null, | ||
"metadata": {}, | ||
"outputs": [], | ||
"source": [ | ||
"import sys\n", | ||
"sys.path.append('../src')\n", | ||
"\n", | ||
"from utils import get_shard_path, normalized_average_precision" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": null, | ||
"metadata": {}, | ||
"outputs": [], | ||
"source": [ | ||
"cnt = defaultdict(int)\n", | ||
"\n", | ||
"def read_part(path):\n", | ||
" for js in tqdm((json.loads(s) for s in open(path))):\n", | ||
" for trans in js['transaction_history']:\n", | ||
" for product in trans[\"products\"]:\n", | ||
" cnt[product[\"product_id\"]] += 1\n", | ||
"\n", | ||
"for i in range(4):\n", | ||
" read_part(get_shard_path(i))\n", | ||
"\n", | ||
"_tmp = list(cnt.keys())\n", | ||
"top_products = sorted(_tmp, key=lambda x: -cnt[x])" | ||
] | ||
}, | ||
{ | ||
"cell_type": "markdown", | ||
"metadata": {}, | ||
"source": [ | ||
"# Check MAP@30" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": null, | ||
"metadata": {}, | ||
"outputs": [], | ||
"source": [ | ||
"scores = []\n", | ||
"for js in tqdm((json.loads(s) for s in open(get_shard_path(15)))):\n", | ||
" recommended_items = top_products[:30]\n", | ||
" gt_items = js[\"target\"][0][\"product_ids\"]\n", | ||
" ap = normalized_average_precision(gt_items, recommended_items)\n", | ||
" scores.append(ap)\n", | ||
"np.mean(scores)" | ||
] | ||
} | ||
], | ||
"metadata": { | ||
"kernelspec": { | ||
"display_name": "Python 3", | ||
"language": "python", | ||
"name": "python3" | ||
}, | ||
"language_info": { | ||
"codemirror_mode": { | ||
"name": "ipython", | ||
"version": 3 | ||
}, | ||
"file_extension": ".py", | ||
"mimetype": "text/x-python", | ||
"name": "python", | ||
"nbconvert_exporter": "python", | ||
"pygments_lexer": "ipython3", | ||
"version": "3.7.5" | ||
} | ||
}, | ||
"nbformat": 4, | ||
"nbformat_minor": 2 | ||
} |
Oops, something went wrong.