Merge pull request #21 from etlundquist/docs
adding sphinx docs
Eric Lundquist committed Jun 15, 2020
2 parents af3a621 + 5e77bb3 commit 194f994
Showing 7 changed files with 398 additions and 0 deletions.
19 changes: 19 additions & 0 deletions docs/Makefile
@@ -0,0 +1,19 @@
# Minimal makefile for Sphinx documentation
#

# You can set these variables from the command line.
SPHINXOPTS =
SPHINXBUILD = sphinx-build
SOURCEDIR = source
BUILDDIR = build

# Put it first so that "make" without argument is like "make help".
help:
	@$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)

.PHONY: help Makefile

# Catch-all target: route all unknown targets to Sphinx using the new
# "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS).
%: Makefile
	@$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)
179 changes: 179 additions & 0 deletions docs/source/conf.py
@@ -0,0 +1,179 @@
# -*- coding: utf-8 -*-
#
# Configuration file for the Sphinx documentation builder.
#
# This file does only contain a selection of the most common options. For a
# full list see the documentation:
# http://www.sphinx-doc.org/en/master/config

# -- Path setup --------------------------------------------------------------

# If extensions (or modules to document with autodoc) are in another directory,
# add these directories to sys.path here. If the directory is relative to the
# documentation root, use os.path.abspath to make it absolute, like shown here.
#
# import os
# import sys
# sys.path.insert(0, os.path.abspath('.'))


# -- Project information -----------------------------------------------------

project = 'rankfm'
copyright = '2020, Eric Lundquist'
author = 'Eric Lundquist'

# The short X.Y version
version = ''
# The full version, including alpha/beta/rc tags
release = '0.2.5'


# -- General configuration ---------------------------------------------------

# If your documentation needs a minimal Sphinx version, state it here.
#
# needs_sphinx = '1.0'

# Add any Sphinx extension module names here, as strings. They can be
# extensions coming with Sphinx (named 'sphinx.ext.*') or your custom
# ones.
extensions = [
    'sphinx.ext.autodoc',
    'sphinx.ext.mathjax',
    'sphinx.ext.viewcode',
]

# Add any paths that contain templates here, relative to this directory.
templates_path = ['_templates']

# The suffix(es) of source filenames.
# You can specify multiple suffix as a list of string:
#
# source_suffix = ['.rst', '.md']
source_suffix = '.rst'

# The master toctree document.
master_doc = 'index'

# The language for content autogenerated by Sphinx. Refer to documentation
# for a list of supported languages.
#
# This is also used if you do content translation via gettext catalogs.
# Usually you set "language" from the command line for these cases.
language = None

# List of patterns, relative to source directory, that match files and
# directories to ignore when looking for source files.
# This pattern also affects html_static_path and html_extra_path.
exclude_patterns = []

# The name of the Pygments (syntax highlighting) style to use.
pygments_style = None


# -- Options for HTML output -------------------------------------------------

# The theme to use for HTML and HTML Help pages. See the documentation for
# a list of builtin themes.
#
html_theme = 'alabaster'

# Theme options are theme-specific and customize the look and feel of a theme
# further. For a list of options available for each theme, see the
# documentation.
#
# html_theme_options = {}

# Add any paths that contain custom static files (such as style sheets) here,
# relative to this directory. They are copied after the builtin static files,
# so a file named "default.css" will overwrite the builtin "default.css".
html_static_path = ['_static']

# Custom sidebar templates, must be a dictionary that maps document names
# to template names.
#
# The default sidebars (for documents that don't match any pattern) are
# defined by theme itself. Builtin themes are using these templates by
# default: ``['localtoc.html', 'relations.html', 'sourcelink.html',
# 'searchbox.html']``.
#
# html_sidebars = {}


# -- Options for HTMLHelp output ---------------------------------------------

# Output file base name for HTML help builder.
htmlhelp_basename = 'rankfmdoc'


# -- Options for LaTeX output ------------------------------------------------

latex_elements = {
    # The paper size ('letterpaper' or 'a4paper').
    #
    # 'papersize': 'letterpaper',

    # The font size ('10pt', '11pt' or '12pt').
    #
    # 'pointsize': '10pt',

    # Additional stuff for the LaTeX preamble.
    #
    # 'preamble': '',

    # Latex figure (float) alignment
    #
    # 'figure_align': 'htbp',
}

# Grouping the document tree into LaTeX files. List of tuples
# (source start file, target name, title,
# author, documentclass [howto, manual, or own class]).
latex_documents = [
    (master_doc, 'rankfm.tex', 'rankfm Documentation',
     'Eric Lundquist', 'manual'),
]


# -- Options for manual page output ------------------------------------------

# One entry per manual page. List of tuples
# (source start file, name, description, authors, manual section).
man_pages = [
    (master_doc, 'rankfm', 'rankfm Documentation',
     [author], 1)
]


# -- Options for Texinfo output ----------------------------------------------

# Grouping the document tree into Texinfo files. List of tuples
# (source start file, target name, title, author,
# dir menu entry, description, category)
texinfo_documents = [
    (master_doc, 'rankfm', 'rankfm Documentation',
     author, 'rankfm', 'One line description of project.',
     'Miscellaneous'),
]


# -- Options for Epub output -------------------------------------------------

# Bibliographic Dublin Core info.
epub_title = project

# The unique identifier of the text. This can be a ISBN number
# or the project homepage.
#
# epub_identifier = ''

# A unique identification for the text.
#
# epub_uid = ''

# A list of files that should not be packed into the epub file.
epub_exclude_files = ['search.html']


# -- Extension configuration -------------------------------------------------
7 changes: 7 additions & 0 deletions docs/source/evaluation.rst
@@ -0,0 +1,7 @@
Model Evaluation
================

.. automodule:: rankfm.evaluation
:members:
:undoc-members:
:show-inheritance:
57 changes: 57 additions & 0 deletions docs/source/home.rst
@@ -0,0 +1,57 @@
Welcome to RankFM's Documentation!
==================================

RankFM is a python implementation of the general Factorization Machines model class described in `Rendle 2010 <https://www.csie.ntu.edu.tw/~b97053/paper/Rendle2010FM.pdf>`_ adapted for collaborative filtering recommendation/ranking problems with implicit feedback user-item interaction data. It uses `Bayesian Personalized Ranking (BPR) <https://arxiv.org/pdf/1205.2618.pdf>`_ and a variant of `Weighted Approximate-Rank Pairwise (WARP) <http://citeseerx.ist.psu.edu/viewdoc/download?doi=10.1.1.587.3946&rep=rep1&type=pdf>`_ loss to learn model weights via Stochastic Gradient Descent (SGD). It can (optionally) incorporate individual training sample weights and/or user/item auxiliary features to augment the main interaction data for model training.

The core training/prediction/recommendation methods are written in `Cython <https://cython.org/>`_. This makes it possible to scale to millions of users, items, and interactions. Designed for ease-of-use, RankFM accepts both `pd.DataFrame` and `np.ndarray` inputs. You do not have to convert your data to `scipy.sparse` matrices or re-map user/item identifiers to matrix indexes prior to use: RankFM internally maps all user/item identifiers to zero-based integer indexes, but always converts its outputs back to the original user/item identifiers from your data, which can be arbitrary (non-zero-based, non-consecutive) integers or even strings.
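The identifier re-mapping described above can be pictured with a small sketch. This is not RankFM's actual internals, just an illustration of the idea using `pd.factorize` on hypothetical data:

```python
import pandas as pd

# interaction data with arbitrary (string) user/item identifiers
interactions = pd.DataFrame({
    'user_id': ['alice', 'bob', 'alice', 'carol'],
    'item_id': ['iron_man', 'up', 'up', 'coco']
})

# map each identifier to a zero-based integer index, keeping the
# original labels around to translate model outputs back afterwards
user_idx, user_labels = pd.factorize(interactions['user_id'])
item_idx, item_labels = pd.factorize(interactions['item_id'])

print(list(user_idx))     # [0, 1, 0, 2]
print(list(user_labels))  # ['alice', 'bob', 'carol']
```

The inverse mapping (`user_labels[index]`) is what lets a library like this return predictions and recommendations in terms of your original identifiers.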

In addition to the familiar `fit()`, `predict()`, `recommend()` methods, RankFM includes additional utilities `similar_users()` and `similar_items()` to find the most similar users/items to a given user/item based on latent factor space embeddings. A number of popular recommendation/ranking evaluation metric functions have been included in the separate `evaluation` module to streamline model tuning and validation.

Dependencies
------------

* Python 3.6+
* numpy >= 1.15
* pandas >= 0.24

Installation
------------

Prerequisites
^^^^^^^^^^^^^

To install RankFM's C extensions you will need the `GNU Compiler Collection (GCC) <https://gcc.gnu.org/>`_. Check to see whether you already have it installed:

.. code:: bash

    gcc --version

If you don't have it already you can easily install it using `Homebrew <https://brew.sh/>`_ on OSX or your default Linux package manager:

.. code:: bash

    # OSX
    brew install gcc

    # Linux
    sudo yum install gcc

    # ensure [gcc] has been installed correctly and is on the system PATH
    gcc --version

Package Installation
^^^^^^^^^^^^^^^^^^^^

You can install the latest published version from PyPI using `pip`:

.. code:: bash

    pip install rankfm

Or alternatively install the current development build directly from GitHub:

.. code:: bash

    pip install git+https://github.com/etlundquist/rankfm.git#egg=rankfm

14 changes: 14 additions & 0 deletions docs/source/index.rst
@@ -0,0 +1,14 @@
.. include:: home.rst


Contents
========

.. toctree::
:maxdepth: 2

Home <home>
Quickstart <quickstart>
RankFM Model <rankfm>
Model Evaluation <evaluation>

115 changes: 115 additions & 0 deletions docs/source/quickstart.rst
@@ -0,0 +1,115 @@
Quickstart
==========

Let's work through a simple example of fitting a model, generating recommendations, evaluating performance, and assessing some item-item similarities. The data we'll be using here may already be somewhat familiar: you know it, you love it, it's the `MovieLens 1M <https://grouplens.org/datasets/movielens/1m/>`_!

Let's first look at the required shape of the interaction data:

======= =======
user_id item_id
======= =======
3 233
5 377
8 610
======= =======

It has just two columns: a `user_id` and an `item_id` (you can name these fields whatever you want or use a numpy array instead). Notice that there is no `rating` column - this library is for **implicit feedback** data (e.g. watches, page views, purchases, clicks) as opposed to **explicit feedback** data (e.g. 1-5 ratings, thumbs up/down). Implicit feedback is far more common in real-world recommendation contexts and doesn't suffer from the `missing-not-at-random problem <https://resources.bibblio.org/hubfs/share/2018-01-24-RecSysLDN-Ravelin.pdf>`_ of pure explicit feedback approaches.
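Under these assumptions (a hypothetical raw event log of implicit-feedback signals), a minimal sketch of shaping events into this two-column format might look like:

```python
import pandas as pd

# hypothetical raw implicit-feedback events (e.g. watches or clicks)
events = pd.DataFrame({
    'user_id':   [3, 3, 5, 8, 8],
    'item_id':   [233, 233, 377, 610, 610],
    'timestamp': ['2020-01-01', '2020-01-02', '2020-01-03',
                  '2020-01-04', '2020-01-05']
})

# collapse repeated events into unique (user, item) interaction pairs
interactions = events[['user_id', 'item_id']].drop_duplicates().reset_index(drop=True)
print(interactions)
```

Whether to deduplicate, threshold, or weight repeated events is an application-level choice; the model itself only needs the (user, item) pairs.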

Now let's import the library, initialize our model, and fit on the training data:

.. code:: python

    from rankfm.rankfm import RankFM

    model = RankFM(factors=20, loss='warp', max_samples=20, learning_rate=0.1, learning_schedule='invscaling')
    model.fit(interactions_train, epochs=20, verbose=True)

If you set `verbose=True` the model will print the current epoch number as well as the epoch's log-likelihood during training. This can be useful to gauge both computational speed and training gains by epoch. If the log-likelihood is not increasing, try raising the `learning_rate` or lowering the (`alpha`, `beta`) regularization strength terms. If the log-likelihood is bouncing up and down, try lowering the `learning_rate` or using `learning_schedule='invscaling'` to decrease the learning rate over time. If you run into overflow errors, decrease the feature and/or sample-weight magnitudes and try raising `beta`, especially if you have a small number of dense user-features and/or item-features. Selecting `BPR` loss will lead to faster training times, but `WARP` loss typically yields superior model performance.
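To make the schedule concrete, here is one common definition of an inverse-scaling decay. This is an illustration of the general idea, not necessarily RankFM's exact formula (the decay exponent here is an assumption):

```python
def invscaling_lr(initial_lr, epoch, power=0.5):
    """Decay the learning rate as the epoch number grows."""
    return initial_lr / (epoch + 1) ** power

# the step size shrinks over time, damping the bouncing described above
rates = [round(invscaling_lr(0.1, e), 4) for e in range(5)]
print(rates)  # [0.1, 0.0707, 0.0577, 0.05, 0.0447]
```

Smaller later steps let SGD settle near a good solution instead of oscillating around it.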

Now let's generate some user-item model scores from the validation data:

.. code:: python

    valid_scores = model.predict(interactions_valid, cold_start='nan')

This will produce an array of real-valued model scores generated using the Factorization Machines model equation. You can interpret each score as a measure of the predicted utility of item (i) for user (u). The `cold_start='nan'` option sets scores to `np.nan` for user/item pairs not found in the training data; alternatively, `cold_start='drop'` drops those pairs so the results contain no missing values.
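The cold-start distinction is easy to picture with plain pandas. This sketch (hypothetical data, not the library's implementation) mirrors what a drop-style option conceptually does:

```python
import pandas as pd

# hypothetical training and validation interaction pairs
train = pd.DataFrame({'user_id': [1, 2, 3], 'item_id': [10, 20, 30]})
valid = pd.DataFrame({'user_id': [1, 2, 9], 'item_id': [20, 30, 10]})

# keep only pairs whose user AND item both appeared in training;
# user 9 was never seen, so its row is a cold-start pair
warm = valid[valid['user_id'].isin(train['user_id']) &
             valid['item_id'].isin(train['item_id'])].reset_index(drop=True)
print(warm)
```

A model has no learned embedding for an unseen user or item, which is why such pairs must either be flagged (`nan`) or removed (`drop`).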

Now let's generate our topN recommended movies for each user:

.. code:: python

    valid_recs = model.recommend(valid_users, n_items=10, filter_previous=True, cold_start='drop')

The input should be a `pd.Series`, `np.ndarray` or `list` of `user_id` values. You can use `filter_previous=True` to prevent generating recommendations that include any items observed by the user in the training data, which could be useful depending on your application context. The result will be a `pd.DataFrame` with `user_id` values as the index and each user's top recommended items as the rows, in descending order (the best item is in column 0):

======= ==== ==== ==== ==== ==== ==== ==== ==== ==== ====
user_id 0 1 2 3 4 5 6 7 8 9
======= ==== ==== ==== ==== ==== ==== ==== ==== ==== ====
3 2396 1265 357 34 2858 3175 1 2028 17 356
5 608 1617 1610 3418 590 474 858 377 924 1036
8 589 1036 2571 2028 2000 1220 1197 110 780 1954
======= ==== ==== ==== ==== ==== ==== ==== ==== ==== ====
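If your serving layer prefers one row per recommendation rather than this wide layout, a standard pandas reshape works. This sketch uses a small hypothetical frame in the same shape as the table above:

```python
import pandas as pd

# hypothetical wide-format recommendations (users as index, ranks as columns)
recs = pd.DataFrame({0: [2396, 608], 1: [1265, 1617]}, index=[3, 5])
recs.index.name = 'user_id'

# melt to one row per (user, rank, item), often handier for joins/serving
long = (recs.reset_index()
            .melt(id_vars='user_id', var_name='rank', value_name='item_id')
            .sort_values(['user_id', 'rank'])
            .reset_index(drop=True))
print(long)
```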

Now let's see how the model is performing with respect to the included validation metrics evaluated on the hold-out data:

.. code:: python

    from rankfm.evaluation import hit_rate, reciprocal_rank, discounted_cumulative_gain, precision, recall

    valid_hit_rate = hit_rate(model, interactions_valid, k=10)
    valid_reciprocal_rank = reciprocal_rank(model, interactions_valid, k=10)
    valid_dcg = discounted_cumulative_gain(model, interactions_valid, k=10)
    valid_precision = precision(model, interactions_valid, k=10)
    valid_recall = recall(model, interactions_valid, k=10)

.. parsed-literal::

    hit_rate: 0.796
    reciprocal_rank: 0.339
    dcg: 0.734
    precision: 0.159
    recall: 0.077

`That's a Bingo! <https://www.youtube.com/watch?v=q5pESPQpXxE>`_
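As a reference point for the first metric: hit rate at k is commonly defined as the share of users with at least one held-out item in their top-k list. A self-contained sketch of that definition (not the `rankfm.evaluation` implementation):

```python
def hit_rate_at_k(recommended, held_out):
    """Share of users whose top-k recommendations contain >= 1 held-out item.

    recommended: dict mapping user -> list of top-k recommended items
    held_out:    dict mapping user -> set of true validation items
    """
    hits = sum(1 for u in held_out if set(recommended.get(u, [])) & held_out[u])
    return hits / len(held_out)

recommended = {1: [10, 20, 30], 2: [40, 50, 60]}
held_out = {1: {20}, 2: {99}}
print(hit_rate_at_k(recommended, held_out))  # user 1 is a hit, user 2 is not -> 0.5
```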

Now let's find the most similar other movies for a few movies based on their embedding representations in latent factor space:

.. code:: python

    # Terminator 2: Judgment Day (1991)
    model.similar_items(589, n_items=10)

.. parsed-literal::

    2571    Matrix, The (1999)
    1527    Fifth Element, The (1997)
    2916    Total Recall (1990)
    3527    Predator (1987)
    780     Independence Day (ID4) (1996)
    1909    X-Files: Fight the Future, The (1998)
    733     Rock, The (1996)
    1376    Star Trek IV: The Voyage Home (1986)
    480     Jurassic Park (1993)
    1200    Aliens (1986)

`I hope you like explosions... <https://www.youtube.com/watch?v=uENYMZNzg9w>`_

.. code:: python

    # Being John Malkovich (1999)
    model.similar_items(2997, n_items=10)

.. parsed-literal::

    2599    Election (1999)
    3174    Man on the Moon (1999)
    2858    American Beauty (1999)
    3317    Wonder Boys (2000)
    223     Clerks (1994)
    3897    Almost Famous (2000)
    2395    Rushmore (1998)
    2502    Office Space (1999)
    2908    Boys Don't Cry (1999)
    3481    High Fidelity (2000)

`Let's get weird... <https://www.youtube.com/watch?v=lIpev8JXJHQ&t=5s>`_
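The general mechanism behind item-item similarity is ranking items by the closeness of their latent-factor embeddings. A minimal numpy sketch of that idea, using a tiny made-up embedding matrix (cosine similarity is one common choice; the library's exact similarity measure may differ):

```python
import numpy as np

# hypothetical item embedding matrix: one latent-factor row per item
item_ids = np.array([589, 2571, 2997])
factors = np.array([[1.0, 0.0],
                    [0.9, 0.1],
                    [0.0, 1.0]])

def most_similar(item_id, n_items=2):
    """Rank other items by cosine similarity in latent factor space."""
    idx = int(np.where(item_ids == item_id)[0][0])
    normed = factors / np.linalg.norm(factors, axis=1, keepdims=True)
    sims = normed @ normed[idx]
    order = np.argsort(-sims)
    order = order[order != idx][:n_items]  # drop the query item itself
    return item_ids[order]

print(most_similar(589))  # the near-parallel embedding (2571) ranks first
```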
