Skip to content

Commit

Permalink
add baseline code
Browse files Browse the repository at this point in the history
  • Loading branch information
geffy committed Jan 9, 2020
1 parent dbf9191 commit f331923
Show file tree
Hide file tree
Showing 26 changed files with 44,474 additions and 129 deletions.
135 changes: 7 additions & 128 deletions .gitignore
@@ -1,129 +1,8 @@
# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
*$py.class
data/raw/*
tmp/*
.idea/
.vscode/
**/.DS_Store
**/__pycache__
**/.ipynb_checkpoints

# C extensions
*.so

# Distribution / packaging
.Python
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
wheels/
pip-wheel-metadata/
share/python-wheels/
*.egg-info/
.installed.cfg
*.egg
MANIFEST

# PyInstaller
# Usually these files are written by a python script from a template
# before PyInstaller builds the exe, so as to inject date/other infos into it.
*.manifest
*.spec

# Installer logs
pip-log.txt
pip-delete-this-directory.txt

# Unit test / coverage reports
htmlcov/
.tox/
.nox/
.coverage
.coverage.*
.cache
nosetests.xml
coverage.xml
*.cover
*.py,cover
.hypothesis/
.pytest_cache/

# Translations
*.mo
*.pot

# Django stuff:
*.log
local_settings.py
db.sqlite3
db.sqlite3-journal

# Flask stuff:
instance/
.webassets-cache

# Scrapy stuff:
.scrapy

# Sphinx documentation
docs/_build/

# PyBuilder
target/

# Jupyter Notebook
.ipynb_checkpoints

# IPython
profile_default/
ipython_config.py

# pyenv
.python-version

# pipenv
# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
# However, in case of collaboration, if having platform-specific dependencies or dependencies
# having no cross-platform support, pipenv may install dependencies that don't work, or not
# install all needed dependencies.
#Pipfile.lock

# PEP 582; used by e.g. github.com/David-OConnor/pyflow
__pypackages__/

# Celery stuff
celerybeat-schedule
celerybeat.pid

# SageMath parsed files
*.sage.py

# Environments
.env
.venv
env/
venv/
ENV/
env.bak/
venv.bak/

# Spyder project settings
.spyderproject
.spyproject

# Rope project settings
.ropeproject

# mkdocs documentation
/site

# mypy
.mypy_cache/
.dmypy.json
dmypy.json

# Pyre type checker
.pyre/
63 changes: 62 additions & 1 deletion README.md
@@ -1 +1,62 @@
# retailhero-recomender-baseline
# Бэйслайн к задаче [RetailHero.ai/#2](https://retailhero.ai/c/recommender_system/overview) от [@geffy](https://github.com/geffy)

Репозиторий содержит:
* item-to-item модель (NMAP 0.1137, top5 на 09/01/2020)
* распиливание исходных данных на шарды
* вспомогательный переиспользуемый код
* скрипты для обучения кастомных эмбеддингов на pytorch
* быстрый поиск соседей в связке с faiss
* кастомный docker-образ с поддержкой pytorch 1.3 и faiss

Код написан так, что вполне успешно отрабатывает на машине с 8gb ram.

## Шаги по подготовке:

0. Скопировать данные в data/raw
```
cd {REPO_ROOT}
mkdir -p data/raw
cp /path/to/unpacked/data/*.csv ./data/raw
cd src
```


1. Разделить исходные данные о покупках на 16 частей
```bash
python3 purchases_to_jrows.py
```


2. Подготовить train/valid данные в формате, максимально близком к формату `check_queries.tsv`
```bash
python3 train_valid_split.py
```

3. Обучить item-2-item модель:
```bash
python3 train_i2i_model.py
```

4. Скопировать артефакты в сабмит
```bash
cd {REPO_ROOT}
mkdir -p submit/solution/assets
cp ./data/raw/products.csv submit/solution/assets
cp ./tmp/implicit_cosine1/model.pkl submit/solution/assets
```

5. Упаковать сабмит
```bash
cd submit
zip -r submit_title.zip solution/*
```

6. Profit!

## Результаты:
```
Check: 0,1113
Public: 0,1137
```

Обучение кастомных эмбеддингов в текущем решении фактически не используется, их код оставлен для экспериментов.
94 changes: 94 additions & 0 deletions sandbox/dummy_baseline.ipynb
@@ -0,0 +1,94 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"import pandas as pd\n",
"import numpy as np\n",
"import json\n",
"from scipy import sparse as sp\n",
"from tqdm.notebook import tqdm\n",
"from collections import defaultdict"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"import sys\n",
"sys.path.append('../src')\n",
"\n",
"from utils import get_shard_path, normalized_average_precision"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"cnt = defaultdict(int)\n",
"\n",
"def read_part(path):\n",
" for js in tqdm((json.loads(s) for s in open(path))):\n",
" for trans in js['transaction_history']:\n",
" for product in trans[\"products\"]:\n",
" cnt[product[\"product_id\"]] += 1\n",
"\n",
"for i in range(4):\n",
" read_part(get_shard_path(i))\n",
"\n",
"_tmp = list(cnt.keys())\n",
"top_products = sorted(_tmp, key=lambda x: -cnt[x])"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Check MAP@30"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"scores = []\n",
"for js in tqdm((json.loads(s) for s in open(get_shard_path(15)))):\n",
" recommended_items = top_products[:30]\n",
" gt_items = js[\"target\"][0][\"product_ids\"]\n",
" ap = normalized_average_precision(gt_items, recommended_items)\n",
" scores.append(ap)\n",
"np.mean(scores)"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.7.5"
}
},
"nbformat": 4,
"nbformat_minor": 2
}

0 comments on commit f331923

Please sign in to comment.