### Setup

In [1]:
import numpy as np
import pandas as pd

from os import path
import pickle
import re

import pdb

import matplotlib.pyplot as plt
import seaborn as sns

import itertools
from collections import Counter
import random
import time

from dask.distributed import Client, progress
import dask.delayed as delayed
import dask.dataframe as dd

In [2]:
import sys
sys.path.append('..')
import src.my_helper as my_helper

In [3]:
from imp import reload
reload(my_helper)

<module 'src.my_helper' from '..\\src\\my_helper.py'>

### Import Data

In [4]:
fd = ['..','data','processed']

# Train set
fp = path.join(*fd, 'train.p')
with open(file=fp, mode='rb') as file:
    train = pickle.load(file)

In [5]:
fd = ['..','data','processed']

# Train set
fp = path.join(*fd, 'val.p')
with open(file=fp, mode='rb') as file:
    val = pickle.load(file)

### Global Popularity Recommender

In [6]:
class RecommenderSystem():
    '''
    Base class for recommender systems
    '''
    MODEL_NAME = 'Base'
    
    def __init__(self, n_rec=10, rank=False, verbose=False, decay=False, decay_method='linear', decay_constant=None):
        '''
        Parameters
        ----------
        n_rec: int, optional (default: 10)
            Number of recommendations to return. Value of -1 returns full list.
        rank: bool, optional (default: True)
            Whether to include rank in index of returned recommendations. Note that results will typically be
            sorted regardless of the inclusion of this rank index.
        verbose: bool, optional (default: False)
            Whether to print status/progress during processing. Implementation may vary depending on model complexity. 
        '''
        self.n_rec = n_rec
        self.verbose = verbose
        self.rank = rank
        
        self.fitted = False
    
    def fit(data=None):
        '''
        Dummy base class fit method.
        '''
        raise NotImplementedError()
    
    def get_model_name(self):
        '''
        Get model name.
        '''
        return self.MODEL_NAME
    
    def get_params(self):
        '''
        Get model parameters.
        Returns: dict
        '''
        d = {'n_rec': self.n_rec,
             'rank': self.rank,
             'verbose': self.verbose}
        
        return d
    
    def set_params(self, **kwargs):
        '''
        Set model parameters.
        
        Parameters
        ----------
        n_rec: int, optional
            Provides option to overwrite existing n_rec parameter for the number of recommendations to return per user.
            A value of -1 returns the full list. 
        rank: bool, optional
            Provides option to overwrite existing rank parameter to include ranking index values of recommendations.
            Note that recommendations will be returned sorted by rank regardless of the inclusion of this index.
        verbose: bool, optional
            Whether to print status/progress during processing. Implementation may vary depending on model complexity.            
        ----------
        '''
        self.n_rec = kwargs.pop('n_rec', self.n_rec)
        self.verbose = kwargs.pop('verbose', self.verbose)
        self.rank = kwargs.pop('rank', self.rank)
        
        if len(kwargs) > 0:
            warnings.warn('Parameters {} not found and have been ignored.'.format(list(kwargs.keys())))
            
    def recommend(self, df_rec=None):
        '''
        Base recommend method. Not implemented for use outside of child recommenders.
        '''
        # Check if called without provides recommendation results (i.e. from actual recommender)
        if df_rec is None:
            raise NotImplementedError()
        
        if self.rank:
            ranking = []
            for user, recs in df_rec.groupby(level=0):
                # Rank recs per user from 1:n_rec and append to overarching list
                ranking += [i + 1 for i, rec in enumerate(recs)]
            if type(df_rec) is pd.Series:
                df_rec = df_rec.to_frame()
            # Set generated list to new rank column (aligns with data)
            df_rec['rank'] = ranking
            # Set user id and rank as index
            df_rec = df_rec.reset_index().set_index(['user_id', 'rank'])
        
        return df_rec

In [7]:
class GlobalPopularityRecommender(RecommenderSystem):
    '''
    Global Popularity Recommender system: produces recommendations based off most popular (i.e. frequently ordered)
    items across all users.
    
    Parameters
    ----------
    n_rec: int (default: -1)
        Number of product recommendations to return. Setting to -1 will return all products ranked.
    ----------
    '''
    MODEL_NAME = 'Global Popularity'
    
    def fit(self, data):
        '''
        Fit recommender using prior order product data.
        
        Parameters
        ----------
        data: pandas.DataFrame
            Dataframe containing history of order products. Assumes format of one row per product ordered.
            Must contain column of 'product_id' on which to sum frequencies.
        ----------
        '''
        # Produce sorted list of products IDs according to purchase frequency
        self.sorted_product_ids = data['product_id'].value_counts().sort_values(ascending=False).index.values
        self.fitted = True
    
    def recommend(self, user_id=0, **kwargs):
        '''
        Recommend products for a given user. Note that given the nature of this global popularity recommender,
        recommendations are the same for all users, but user_id can still be provided for consistency amongst
        recommender system parameters.
        
        Parameters
        ----------
        user_id: int or array-like, optional (default: 0)
            User ID to produce recommendations for. Note that for this particular recommender system, recommendations
            are the same for all users and this parameter is merely included for consistency amongst recommenders.
        ----------
        Returns: pandas.DataFrame, index = ['user_id', 'rank'], cols = ['product_id]
        '''
        # Check if fit was performed
        if not self.fitted:
            raise RuntimeError('cannot recommend without fitting!')
        # Overwrite params if provided
        self.set_params(**kwargs)
        # Check user_id
        if isinstance(user_id, (int, np.integer)):
            user_id = [user_id]
        if type(user_id) not in (list, np.ndarray):
            raise ValueError('user_id not of accepted type. Must be int or array-like.')
        
        if self.n_rec == -1:
            # get all products
            df_rec = pd.Series(data=np.tile(self.sorted_product_ids,len(user_id)), index=np.repeat(user_id,self.sorted_product_ids.shape[0]), name='product_id')
        else:
            # get top n products
            df_rec = pd.Series(data=np.tile(self.sorted_product_ids[:self.n_rec],len(user_id)), index=np.repeat(user_id,self.n_rec), name='product_id')
        
        df_rec.index.name = 'user_id'
        
        return super(GlobalPopularityRecommender, self).recommend(df_rec=df_rec)

In [8]:
gpop_rec = GlobalPopularityRecommender(n_rec=10)

In [9]:
gpop_rec.fit(train)

In [8]:
def get_product_name(product_id, fp=path.join('..','data','raw','products.csv'), reload=False):
    '''
    Get the product name for a given product_id. Function caches product data for successive calls, but can be forced
    to reload data via 'reload' parameter.
    
    Parameters
    ----------
    product_id: int or array-like
        Product ID to retrieve name for.
    fp: string, (default: path.join('..','data','raw','products.csv'))
        Filepath from which to retrieve product csv data.
    reload: bool (default: False)
        Reload the product csv data.
    ----------
    Returns: product name (string) if ID found, else np.nan
    '''
    # shorthand for "self"
    f = get_product_name
    try:
        f.product_data
        if reload:
            raise          
    except:
        # Load product data
        with open(fp, 'rb') as file:
            f.product_data = pd.read_csv(file, encoding='utf8').set_index('product_id')
    
    product_name = f.product_data.loc[product_id,'product_name'].values

    return product_name

In [11]:
gpop_rec.recommend(user_id=7, rank=False)

user_id
7    24852
7    13176
7    21903
7    21137
7    47766
7    47209
7    47626
7    16797
7    26209
7    27845
Name: product_id, dtype: int64

In [12]:
get_product_name(gpop_rec.recommend(user_id=7, n_rec=10).values)

array(['Banana', 'Bag of Organic Bananas', 'Organic Baby Spinach',
       'Organic Strawberries', 'Organic Avocado', 'Organic Hass Avocado',
       'Large Lemon', 'Strawberries', 'Limes', 'Organic Whole Milk'],
      dtype=object)

### Performance Evaluation

##### Full Product List

In [13]:
fp = path.join('..','data','processed','products.p')
with open(fp, 'rb') as file:
    products = pickle.load(file=file)

In [14]:
products.head()

Unnamed: 0,product_id,product_name,aisle_id,department_id
0,1,Chocolate Sandwich Cookies,61,19
1,2,All-Seasons Salt,104,13
2,3,Robust Golden Unsweetened Oolong Tea,94,7
3,4,Smart Ones Classic Favorites Mini Rigatoni Wit...,38,1
4,5,Green Chile Anytime Sauce,5,13


##### Precision@K

Precision @ K is probably the most basic performance evaluation method, but is a good place to start before building out other methods. We simply look at the set of products in each order, and calculate the proportion of products correctly recommended:

In [21]:
def order_pk(order_products, rec_products, k=10):
    '''
    Calculate precision@k for a set of ordered products vs. recommended products.
    
    Parameters
    ----------
    order_products: array-like
        List of product IDs for a given order
    rec_products: array-like
        Sorted list of recommended product IDs (highest to lowest)
    k: int, array-like (default = 10)
        K threshold at which to evaluate precision
    ----------
    Returns: precision score (float)
    '''
    if isinstance(k, (int,np.integer)):
        k = [k]
    precisions = {}
    for k_ in k:
        # Get top K recommendations
        top_n_recs = rec_products[:k_]

        # Number of recommendations in order
        n_hit = len(set(top_n_recs).intersection(order_products))

        # Discount precision denominator for imbalances between # of recommendations and # of ordered products
        m = min(len(order_products), len(top_n_recs))
        # Proportion of correct recommendations
        precisions['p@{:02d}'.format(k_)] = (n_hit / m)

    return precisions

Note that the calculation of precision is discounted for potential imbalances between the number of products in an order and the number of recommendations. For example, if 5 products are in an order and we have 10 recommended products which capture all 5 products in the order (extreme example), we do not want to punish scoring by saying only 5/10 were correct. Similarly, if we have an order of 10 products but only 5 recommendations, we again would not want to punish our scoring for only being able to recommend 5/10 products correctly. In both these scenarios, we would want to record 100% precision, since our recommendations have performend to the best of their ability.

We will set up a local cluster via Dask to process our validation set (20k orders):

In [22]:
c = Client()

In [17]:
c

0,1
Client  Scheduler: tcp://127.0.0.1:64120  Dashboard: http://127.0.0.1:8787,Cluster  Workers: 8  Cores: 8  Memory: 34.31 GB


In [18]:
# Convert validation set to dask dataframe
dd_val = dd.from_pandas(val.set_index('user_id'), npartitions=8)

In [20]:
unique_users = dd_val.index.unique()
user_recs = delayed(gpop_rec.recommend)(user_id=unique_users.values, n_rec=10)
user_recs = user_recs.compute()

In [21]:
dd_eval = dd_val.groupby(['user_id', 'order_id'])['product_id'].apply(lambda x: order_pk(x, user_recs.loc[x.name[0]], [5,10]), meta=('precision@k', float))

In [22]:
fut = c.compute(dd_eval)

In [23]:
progress(fut, multi=False)

2.1 seconds to process our 20k orders should be acceptable for repeated evaluations.

In [24]:
pk_results = fut.result()

In [25]:
pk_results.head()

user_id  order_id      
7        2452257   p@05    0.2
                   p@10    0.1
13       1789302   p@05    0.0
                   p@10    0.0
14       3394109   p@05    0.0
Name: precision@k, dtype: float64

In [26]:
pk_results.unstack().head()

Unnamed: 0_level_0,Unnamed: 1_level_0,p@05,p@10
user_id,order_id,Unnamed: 2_level_1,Unnamed: 3_level_1
7,2452257,0.2,0.1
13,1789302,0.0,0.0
14,3394109,0.0,0.0
65,864720,0.2,0.1
70,29902,0.0,0.0


In [27]:
pk_results.unstack().mean()

p@05    0.093233
p@10    0.086623
dtype: float64

Our precision at 5 and 10 both hover around 0.08-0.09. This is far from what we would hope to achieve in our actual models, and we should probably even consider expanding upon this baseline for a higher standard of comparison.

##### MAP (Mean Average Precision)

Another evaluation metric is Mean Average Precision, which is akin to our Precision@K approach but also gives weight to the rank/position of the accurate recommendations. 

In [20]:
def order_map(order_products, rec_products, n_range=10):
    '''
    Calculate the MAP score for a given set of orderered products and recommended products.
    
    Paramters
    ---------
    order_products: array-like (int)
        List of product IDs in a given order
    rec_products: array-like (int)
        List of recommended product IDs
    n_range: int, array-like
        Value or list of values at which to evaluate MAP. Scoring is calculated relative to the first n recommendations.
    ---------
    Returns: dict(string: float)
    '''
    if isinstance(n_range, (int, np.integer)):
        n_range = [n_range]
    order_product_set = set(order_products)
    order_eval = {}
    for n in n_range:
        top_n_recs = rec_products[:n]
        # Minimum of number of products in order or rec; avoids unfair scoring for length mismatches
        m = min(len(order_product_set), n)
        n_hit = 0
        cum_prec = 0
        for i, rec in enumerate(top_n_recs):
            # Check if recommendation is contained in order
            hit = int(rec in order_product_set)
            # Increment hit count
            n_hit += hit
            # Cumulative precision of hits, dicounted for rank
            cum_prec += hit*n_hit/(i+1)

        # Mean precision according to number of products/recs
        ap = cum_prec/m
        order_eval['map@{:02d}'.format(n)] = ap
    
    return order_eval

In [29]:
# Convert validation set to dask dataframe
dd_val = dd.from_pandas(val.set_index('user_id'), npartitions=8)

In [379]:
unique_users = dd_val.index.unique()
user_recs = delayed(pop_rec.recommend)(user_id=unique_users.values, n_rec=10)
user_recs = user_recs.compute()

In [452]:
dd_eval = dd_val.groupby(['user_id', 'order_id'])['product_id'].apply(lambda x: order_map(x, user_recs.loc[x.name[0]], [5,10,20]), meta=('map@k', float))

In [453]:
fut = c.compute(dd_eval)

In [454]:
progress(fut, multi=False)

In [455]:
order_map_results = fut.result().unstack()

In [456]:
order_map_results.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,map@05,map@10,map@20
user_id,order_id,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
7,2452257,0.05,0.025,0.020833
13,1789302,0.0,0.0,0.0
14,3394109,0.0,0.0,0.0
65,864720,0.05,0.025,0.014706
70,29902,0.0,0.0,0.0


In [465]:
order_map_results.mean()

map@05    0.057445
map@10    0.041477
map@20    0.032851
dtype: float64

#### Top-N Accuracy

In [19]:
# Convert validation set to dask dataframe
dd_users = dd.from_array(val['user_id'].unique())

In [20]:
dd_val = dd.from_pandas(val.set_index('user_id'), npartitions=8)

In [23]:
#step 0: get full data
train_full = pd.concat([train,val])

In [24]:
#step 1: get product list
full_product_list = products['product_id']

In [25]:
#step 2: get user ordered products
dd_full = dd.from_pandas(train_full, npartitions=8)
dd_ord_prod = dd_full.groupby('user_id')['product_id'].apply(lambda x: x.unique(), meta=object)

In [26]:
dd_ord_prod = dd.from_pandas(train_full[['user_id','product_id']].drop_duplicates().set_index('user_id'), npartitions=8)
ord_prod = dd_ord_prod.compute()

In [9]:
#step 3: get non ordered products
def get_non_ordered_products(user_id):
    ordered_products = ord_prod.loc[user_id]
    non_ordered_product_set = set(full_product_list).difference(ordered_products)
    return non_ordered_product_set

In [10]:
def order_tna(user_id, order_products, rec_products, sample_size=100, n_range=10):
    '''
    Calculate Top-N Accuracy for a given user and set of order products and recommended products.
    
    Parameters
    ----------
    user_id: int
        User ID for which order is being evaluated. Required to pull list of non-ordered products for said user.
    order_products: array-like (int)
        List of products in a given order
    rec_products: array-like (int)
        List of recommended products (must be sorted by rank)
    sample_size: int (default: 100)
        Number of random non-purchased products to combine with each order product 
        when evaluating recommendation relevancy/effectiveness.
    n_range: int, array-like (default: 10)
        N value(s) at which to evaluate Top-N Accuracy.
    ----------
    Returns: dict(string: float)
    '''
    if isinstance(n_range, (int, np.integer)):
        n_range = [n_range]
        
    non_ordered_products_set = get_non_ordered_products(user_id)
    
    order_eval = {}
    hits_at_n = Counter()
    for product_id in set(order_products):
        # get 100 non purchased products
        non_ordered_sample = set(random.sample(non_ordered_products_set, k=100))
        # combine with product of interest
        tna_sample = non_ordered_sample.union([product_id])
        # rank products according to recs
        ranked_products = rec_products[rec_products.isin(tna_sample)]
        # top_n eval
        for n in n_range:
            if ranked_product_id in ranked_products.values[:n]:
                hits_at_n[n] += 1
    
    n_products = len(order_products)
    for n in n_range:
        order_eval['tna@{:02d}'.format(n)] = hits_at_n[n]/n_products
    
    return order_eval

In [30]:
order_tna(user_id=7, order_products=val.set_index(['user_id', 'order_id']).sort_index().loc[7, 2452257]['product_id'].values, rec_products=pop_rec.recommend(7, n_rec=-1))

{'tna@10': 0.8333333333333334}

In [247]:
# Convert validation set to dask dataframe
dd_val = dd.from_pandas(val.set_index('user_id'), npartitions=8)

In [248]:
dd_eval = dd_val.groupby(['user_id', 'order_id'])['product_id'].apply(lambda x: order_tna(user_id=x.name, order_products=x, rec_products=gpop_rec.recommend(user_id=x.name, n_rec=-1), n_range=[5,10]), meta=('tna@k', float))

In [249]:
fut = c.compute(dd_eval)

In [250]:
progress(fut, multi=False)

In [242]:
tna_results = fut.result().unstack()

In [243]:
tna_results.mean()

tna@05    0.686929
tna@10    0.795457
dtype: float64

These scorings are clearly at a significantly higher magnitude than our previous Precision@K and MAP methods. This indicates that whilst our basic global recommender is rarely able to capture a signficant portion of ordered products in a given order, it does typically rank said products in the top 5/10 over samplings of products of (assumed) non-interest.

However, these high scores do raise some concerns as to the effectiveness of this measure. It is possible that given the large set of products (close to 50k), there is an overwhelming proportion of completely irrelevant products for most customers. Consequently, when a sampling of 100 of these random products is combined with a given products of interest, things like popularity recommenders will fair disproportionally well due to a general bias toward the core set of ordered products. Our scores above consequently become something along the lines of 'our recommender ranks ordered products better than useless products 60-70% of the time', which is hardly a convincing measure.

This issue may be alleviated by selecting a larger product sample (i.e. 1000 versus 100 non-purchased products), but this likely cause an additional to hit to performance. With processing times of our validation set of 20000 orders already taking close to 12 minutes, this evaluation method simply is not feasible for repeated use. Consequently, we should likely continue with MAP as a compromise between performance and robustness.

##### Model Evaluator

In [23]:
class ModelEvaluator():
    N_CORES = 8
    
    def __init__(self, eval_data, method='precision@k', n_range=[5,10]):
        '''
        Model evaluator class to perform scoring on recommender systems.
        Utilizes dask distributed processing with the assumption a default client has already been configured.
        
        Parameters
        ----------
        method: string (default: 'precision@k')
            Scoring method to perform. Options are 'precision@k' (default), or 'map'
        eval_data: pandas.DataFrame
            Data on which to perform evaluation. Assumes columns of ['user_id', 'order_id', 'product_id],
            with a row instance for each product ordered in a given order & user.
        n_range: int, array-like (default: [5,10])
            N value(s) at which to perform evaluation (i.e precision@10)
        ----------
        '''
        # Set n_range
        if isinstance(n_range, (int, np.integer)):
            n_range = [n_range]
        self.n_range = n_range
        # Set list of unique products
        self.product_list = None
        # Set method
        self.method = method
        # Set eval data
        self.eval_data = None
        self.update_data(eval_data=eval_data)
    
    def update_data(self, eval_data):
        '''
        Update the data on which to perform evaluation
        
        Parameters
        ----------
        eval_data: pandas.DataFrame
            Order-product data consisting columns for 'user_id' (int), 'order_id' (int), and 'product_id' (int).
        ----------
        '''
        # Store eval data with User ID as index
        self.eval_data = eval_data[['user_id', 'order_id', 'product_id']].copy().set_index('user_id').sort_index()
        # List of unique users in evaluation dataset
        self.unique_users = np.array(self.eval_data.index.unique())
        # Number of unique users in evaluation dataset
        self.nunique_users = self.unique_users.shape[0]
    
    def evaluate_model(self, model, method=None, eval_data=None, return_full=False):
        '''
        Evaluate a given recommender system. Evaluation data can be provided again to overwrite existing dataset.
        
        Parameters
        ----------
        model: RecommenderSystem object (custom class)
            Recommender system model to be evaluated. Assume model to have been fitted prior to evaluation.
        eval_data: pandas.DataFrame, optional
            Evaluation data to overwrite existing dataset. See update_data for details.
        method: string, optional
            Method with which to evaluate model performance. Will overwrite existing method.
            Available options are 'precision@k' or 'map'
        return_full: bool, optional (default: False)
            Whether to include the full user-order scoring results versus just the aggreggated scores.
        ----------
        Returns: dict (return_full = False), or tuple(dict, pandas.DataFrame) (return_full = True)
        '''
        # Update data if provided
        if eval_data is not None:
            self.update_data(eval_data=eval_data)
        # Update method if provided
        if method is not None:
            self.method = method
        # Unset verbose in model
        model.set_params(verbose=False)    
        if self.method == 'precision@k':
            eval_results = self.eval_pk(model)
        elif self.method == 'map':
            eval_results = self.eval_map(model)
        else:
            raise ValueError('method {} not recognized.'.format(self.method))
        
        if return_full:
            return eval_results
        else:
            return eval_results[0]
    
    def eval_pk(self, model):
        '''
        Evaluate Precision@K for a given model
        '''
        dd_test = dd.from_pandas(self.eval_data, npartitions=self.N_CORES, sort=True)
        
        # Precompute user recommendations. Assumes n_range is within reasonable bounds.
        user_recs = model.recommend(user_id=self.unique_users, n_rec=max(self.n_range))
        
        # Apply order_pk function to each order per user
        dd_order_pk = dd_test.groupby(['user_id', 'order_id'])['product_id']\
                                     .apply(lambda x: order_pk(x, user_recs.loc[x.name[0]], k=self.n_range),\
                                            meta=('precison@k', float))
        dd_order_pk = dd_order_pk.compute()
        # Get results
        order_pk_results = dd_order_pk.unstack()
        order_pk_mean = order_pk_results.mean()
        
        # Convert to summary dict
        global_metrics = {'model_name': model.MODEL_NAME, **order_pk_mean.to_dict()}
        
        return global_metrics, order_pk_results
    
    def eval_map(self, model):
        '''
        Evaluate MAP for a given model
        '''
        dd_test = dd.from_pandas(self.eval_data, npartitions=self.N_CORES, sort=True)
        
        # Precompute user recommendations. Assumes n_range is within reasonable bounds.
        user_recs = model.recommend(user_id=self.unique_users, n_rec=max(self.n_range))
        
        # Apply order_pk function to each order per user
        dd_order_map = dd_test.groupby(['user_id', 'order_id'])['product_id']\
                                     .apply(lambda x: order_map(x, user_recs.loc[x.name[0]], self.n_range),\
                                            meta=('precison@k', float))
        dd_order_map = dd_order_map.compute()
        # Get results
        order_map_results = dd_order_map.unstack()
        order_map_mean = order_map_results.mean()
        
        # Convert to summary dict
        global_metrics = {'model_name': model.MODEL_NAME, **order_map_mean.to_dict()}
        
        return global_metrics, order_map_results

In [24]:
mev = ModelEvaluator(eval_data=val)

In [34]:
pk_results = mev.evaluate_model(gpop_rec, method='precision@k')

In [35]:
pk_results

{'model_name': 'Global Popularity',
 'p@05': 0.09323250000000875,
 'p@10': 0.0866226984126957}

In [36]:
map_results = mev.evaluate_model(gpop_rec, method='map')

In [37]:
map_results

{'map@05': 0.057444861111113014,
 'map@10': 0.04147721793902748,
 'model_name': 'Global Popularity'}

### User Popularity Recommender

In [15]:
c = Client()

In [39]:
c

0,1
Client  Scheduler: tcp://127.0.0.1:64245  Dashboard: http://127.0.0.1:64246,Cluster  Workers: 8  Cores: 8  Memory: 34.31 GB


In [48]:
class UserPopularityRecommender(RecommenderSystem):
    '''
    Global Popularity Recommender system: produces recommendations based off most popular (i.e. frequently ordered)
    items across all users.
    
    Parameters
    ----------
    n_rec: int (default: 10)
        Number of product recommendations to return. Setting to -1 will return all products ranked.
    ----------
    '''
    MODEL_NAME = 'User Popularity'
    
    def fit(self, data):
        '''
        Fit recommender using prior order product data.
        
        Parameters
        ----------
        data: pandas.DataFrame
            Dataframe containing history of order products. Assumes format of one row per product ordered.
            Must contain column of 'product_id' on which to sum frequencies.
        ----------
        '''
        t_start = time.time()
        if self.verbose: print('[{:.0f}s] Fit starting...'.format(time.time()-t_start))
        # convert data to dask dataframe
        dd_data = dd.from_pandas(data[['user_id', 'order_id', 'product_id']], npartitions=8)
        # get global product counts
        if self.verbose: print('[{:.0f}s] Getting global product counts'.format(time.time()-t_start))
        self.glob_product_counts = dd_data['product_id'].value_counts().to_frame(name='glob_purchase_cnt').compute()
        
        # get user product counts
        if self.verbose: print('[{:.0f}s] Getting user product counts'.format(time.time()-t_start))
        self.user_product_counts = dd_data.groupby('user_id')['product_id'].apply(lambda x: x.value_counts(), meta=('user_purchase_cnt', int))\
                                                                            .to_frame()\
                                                                            .compute()\
                                                                            .reset_index(1).rename(columns={'level_1':'product_id'}) # extract product id from index
        
        if self.verbose: print('[{:.0f}s] Fit complete'.format(time.time()-t_start))
        self.fitted = True
    
    def recommend(self, user_id, **kwargs):
        '''
        Recommend products for a given user. Note that given the nature of this global popularity recommender,
        recommendations are the same for all users, but user_id can still be provided for consistency amongst
        recommender system parameters.
        
        Parameters
        ----------
        user_id: int or array-like, optional (default: 0)
            User ID to produce recommendations for. Note that for this particular recommender system, recommendations
            are the same for all users and this parameter is merely included for consistency amongst recommenders.
        ----------
        Returns: pandas.DataFrame, index = ['user_id', 'rank'], cols = ['product_id]
        '''
        # Check if fit was performed
        if not self.fitted:
            raise RuntimeError('cannot recommend without fitting!')
        # Overwrite params if provided
        self.set_params(**kwargs)
        
        t_start = time.time()
        if self.verbose: print('[{:.0f}s] Rec starting...'.format(time.time()-t_start))
        
        # Limit to top n_rec results per user
        if self.verbose: print('[{:.0f}s] Getting top n user products'.format(time.time()-t_start))
        user_data = self.user_product_counts.loc[user_id].reset_index()
        top_user_product_counts =  dd.from_pandas(user_data, npartitions=8)\
                                     .groupby('user_id')[['product_id','user_purchase_cnt']]\
                                     .apply(lambda x: x.nlargest(self.n_rec, columns=['user_purchase_cnt']),\
                                            meta={'product_id': int, 'user_purchase_cnt': int})\
                                     .compute()\
                                     .reset_index(1, drop=True).reset_index() # clean up grouped indexing
        
        # create dataframe of user purchases with user & global purchase counts
        if self.verbose: print('[{:.0f}s] Combining user and global purchase counts'.format(time.time()-t_start))
        user_recs_df = top_user_product_counts.join(self.glob_product_counts, on='product_id')
        # get additional recommendations for users with insufficient purchase histories
        if self.verbose: print('[{:.0f}s] Getting additionl global recommendations per user'.format(time.time()-t_start))
        def get_extra_glob_recs(user_recs, n_rec, global_product_counts):
            '''
            Helper function for getting additional recommendations to pad results up to n_rec
            '''
            n = user_recs.shape[0]
            # Initialize dict of additional recs
            new_data = {'product_id': [],
                        'glob_purchase_cnt': [],
                        'user_purchase_cnt': 0}
            # If missing recs
            if n < n_rec:
                # n missing recs
                n_delta = n_rec-n
                # non-recommended products from global list
                new_prods_mask = ~global_product_counts.index.isin(user_recs['product_id'].values)
                # get top n recs to add
                add_rec = global_product_counts[new_prods_mask][:n_delta]
                # populate dict lists
                new_data['product_id'] += add_rec.index.tolist()
                new_data['glob_purchase_cnt'] += add_rec['glob_purchase_cnt'].values.tolist()
            return pd.DataFrame(new_data)
        
        add_rec = dd.from_pandas(user_recs_df, npartitions=8)
        add_rec = add_rec.groupby('user_id').apply(lambda x: get_extra_glob_recs(x, self.n_rec, self.glob_product_counts),
                                                   meta={'product_id': int, 'glob_purchase_cnt': int, 'user_purchase_cnt': int})\
                                            .compute()
        
        # if any additional results
        if add_rec.shape[0] > 0:
            add_rec = add_rec.reset_index(1, drop=True).reset_index() # clean up grouped indexing
            # combine user recs with additional recs
            if self.verbose: print('[{:.0f}s] Combining original and additional recommendation results'.format(time.time()-t_start))
            user_recs_df = pd.concat([user_recs_df, add_rec], ignore_index=True)
        
        # sort and index
        self.user_recs_df = user_recs_df.sort_values(by=['user_id', 'user_purchase_cnt', 'glob_purchase_cnt'], ascending=[True, False, False]).set_index('user_id')
        
        self.fitted = True
        if self.verbose: print('[{:.0f}s] Rec complete'.format(time.time()-t_start))

        df_rec = self.user_recs_df.loc[user_id]['product_id']
        
        return super(UserPopularityRecommender, self).recommend(df_rec=df_rec)

In [49]:
upop_rec = UserPopularityRecommender(verbose=True)

In [50]:
upop_rec.fit(train)

[0s] Fit starting...
[0s] Getting global product counts
[1s] Getting user product counts
[5s] Fit complete


In [51]:
upop_rec.recommend(user_id=7)

[0s] Rec starting...
[0s] Getting top n user products
[4s] Combining user and global purchase counts
[4s] Getting additionl global recommendations per user
[6s] Rec complete


user_id
7    40852
7    17638
7    37602
7    21137
7     4920
7    31683
7    13198
7    42803
7    30391
7    43967
Name: product_id, dtype: int64

In [113]:
pk_results = mev.evaluate_model(upop_rec, method='precision@k')

In [114]:
pk_results

{'model_name': 'User Popularity',
 'p@05': 0.4033566666666485,
 'p@10': 0.39397716269841737}

In [115]:
map_results = mev.evaluate_model(upop_rec, method='map')

In [116]:
map_results

{'map@05': 0.3180649583333421,
 'map@10': 0.2706378651423558,
 'model_name': 'User Popularity'}

### User Popularity w/ Time Decay

In [16]:
class UserPopularityRecommender(RecommenderSystem):
    '''
    Global Popularity Recommender system: produces recommendations based off most popular (i.e. frequently ordered)
    items across all users.
    
    Parameters
    ----------
    n_rec: int (default: 10)
        Number of product recommendations to return. Setting to -1 will return all products ranked.
    decay: bool (default: True)
        Whether to implement time decay to product recommendation ranking
    decay_method: str (default: 'linear')
        Method with which to decay product purchase values according to time of purchase.
        Choice between 'linear' or 'exponential' decay methods.
        Linear decay reduces products weight linearly toward 0 relative to the maximum order age present.
        Exponential decay redced products weight according to calculated/provided half-life of the product.
    decay_constant: float (default: None)
        Decay parameter determining the strength of decay. Is automatically calculated if not provided
        For linear decay, default value is 1. Values less than 1 will decrease decay rate, values above will increase decay rate.
        For exponential decay, default value is median order age. Defining this parameter manually sets the half-life interval (scale dependent)
        at which products weights decrease by half.
        
    ----------
    '''
    MODEL_NAME = 'User Popularity'
    
    def __init__(self, decay=True, decay_method='linear', decay_constant=None, **kwargs):
        super(UserPopularityRecommender, self).__init__(**kwargs)
        self.decay = decay
        self.decay_method = decay_method
        self.decay_constant = decay_constant
    
    def set_params(self, **kwargs):
        self.decay = kwargs.pop('decay', self.decay)
        self.decay_method = kwargs.pop('decay_method', self.decay_method)
        self.decay_constant = kwargs.pop('decay_constant', self.decay_constant)
        super(UserPopularityRecommender, self).set_params(**kwargs)
    
    def get_params(self):
        d = super(UserPopularityRecommender, self).get_params()
        d['decay'] = self.decay
        d['decay_method'] = self.decay_method
        d['decay_constant'] = self.decay_constant
        return d
    
    def fit(self, data):
        '''
        Fit recommender using prior order product data.
        
        Parameters
        ----------
        data: pandas.DataFrame
            Dataframe containing history of order products. Assumes format of one row per product ordered.
            Must contain column of 'product_id' on which to sum frequencies.
        ----------
        '''
        t_start = time.time()
        if self.verbose: print('[{:.0f}s] Fit starting...'.format(time.time()-t_start))
        data = data.copy()
        
        if self.decay:
            if self.verbose:
                print('[{:.0f}s] Adjusting purchase values for time decay'.format(time.time()-t_start))
            # Fill NaN (i.e. the first orders) with 0
            data['days_since_prior_order'].fillna(0, inplace=True)
            
            # Cumulative sum of days passed with progressing order numbers
            order_times = data.groupby(['user_id', 'order_number'])[['days_since_prior_order']].mean().groupby(level=[0]).cumsum()
            
            # Rename columns
            order_times.rename(columns={'days_since_prior_order': 'days_since_first_order'}, inplace=True)
            
            # Invert time scale (days since first order -> days since last order)
            order_times['days_since_last_order'] = dd.from_pandas(order_times.reset_index(1), npartitions=8)\
                                                     .groupby('user_id')['days_since_first_order']\
                                                     .apply(lambda x: x.max()-x, meta=int)
            self.order_times = order_times
            # Merge new time values into original dataframe
            data = data.merge(order_times, left_on = ['user_id', 'order_number'], right_index=True)
            
            if self.decay_method == 'linear':
                if self.decay_constant is None:
                    self.decay_constant = 1
                t_max = data['days_since_last_order'].max()
                t_min = 0
                data['weighted_product_value'] = data['days_since_last_order'].apply(lambda x: max(0, 1 - self.decay_constant*(x/(t_max - t_min))))
            
            elif self.decay_method == 'exponential':
                if self.decay_constant is None:
                    # Default to half life being the median age of an order
                    self.decay_constant = data['days_since_last_order'].median()*np.log(2)
                data['weighted_product_value'] = np.exp(-1/self.decay_constant * data['days_since_last_order'])
            
            else:
                raise ValueError("decay method not recognized.")
                
        else:
            # All products get value of 1 without decay
            data['weighted_product_value'] = 1
            
        # convert data to dask dataframe
        dd_data = dd.from_pandas(data[['user_id', 'order_id', 'product_id', 'weighted_product_value']], npartitions=8)
        # get global product counts
        if self.verbose:
            print('[{:.0f}s] Getting global product values'.format(time.time()-t_start))
        self.glob_product_val = dd_data.groupby('product_id')['weighted_product_value'].sum()\
                                                                                       .to_frame(name='global_purchase_value')\
                                                                                       .compute()\
                                                                                       .sort_values(by='global_purchase_value')
        
        # get user product counts
        if self.verbose:
            print('[{:.0f}s] Getting user product values'.format(time.time()-t_start))
        self.user_product_val = dd_data.groupby(['user_id','product_id'])['weighted_product_value'].sum()\
                                                                            .to_frame(name = 'user_purchase_value')\
                                                                            .compute()\
                                                                            .reset_index(1) # extract product id from index
        
        if self.verbose: print('[{:.0f}s] Fit complete'.format(time.time()-t_start))
        self.fitted = True
    
    def recommend(self, user_id, **kwargs):
        '''
        Recommend products for a given user. Note that given the nature of this global popularity recommender,
        recommendations are the same for all users, but user_id can still be provided for consistency amongst
        recommender system parameters.
        
        Parameters
        ----------
        user_id: int or array-like, optional (default: 0)
            User ID to produce recommendations for. Note that for this particular recommender system, recommendations
            are the same for all users and this parameter is merely included for consistency amongst recommenders.
        ----------
        Returns: pandas.DataFrame, index = ['user_id'], cols = ['product_id]
        '''
        # Check if fit was performed
        if not self.fitted:
            raise RuntimeError('cannot recommend without fitting!')
        # Overwrite params if provided
        self.set_params(**kwargs)
        
        t_start = time.time()
        if self.verbose: print('[{:.0f}s] Rec starting...'.format(time.time()-t_start))
        
        # Limit to top n_rec results per user
        if self.verbose: print('[{:.0f}s] Getting top n user products'.format(time.time()-t_start))
        user_data = self.user_product_val.loc[user_id].reset_index()
        top_user_products =  dd.from_pandas(user_data, npartitions=8)\
                                     .groupby('user_id')[['product_id', 'user_purchase_value']]\
                                     .apply(lambda x: x.nlargest(self.n_rec, columns=['user_purchase_value']),\
                                            meta={'product_id': int, 'user_purchase_value': int})\
                                     .compute()\
                                     .reset_index(1, drop=True).reset_index() # clean up grouped indexing
        
        # create dataframe of user purchases with user & global purchase counts
        if self.verbose: print('[{:.0f}s] Combining user and global purchase counts'.format(time.time()-t_start))
        user_recs_df = top_user_products.join(self.glob_product_val, on='product_id')
        # get additional recommendations for users with insufficient purchase histories
        if self.verbose: print('[{:.0f}s] Getting additionl global recommendations per user'.format(time.time()-t_start))
        def get_extra_glob_recs(user_recs, n_rec, global_product_counts):
            '''
            Helper function for getting additional recommendations to pad results up to n_rec
            '''
            n = user_recs.shape[0]
            # Initialize dict of additional recs
            new_data = {'product_id': [],
                        'global_purchase_value': [],
                        'user_purchase_value': 0}
            # If missing recs
            if n < n_rec:
                # n missing recs
                n_delta = n_rec-n
                # non-recommended products from global list
                new_prods_mask = ~global_product_counts.index.isin(user_recs['product_id'].values)
                # get top n recs to add
                add_rec = global_product_counts[new_prods_mask][:n_delta]
                # populate dict lists
                new_data['product_id'] += add_rec.index.tolist()
                new_data['global_purchase_value'] += add_rec['global_purchase_value'].values.tolist()
            return pd.DataFrame(new_data)
        
        add_rec = dd.from_pandas(user_recs_df, npartitions=8)
        add_rec = add_rec.groupby('user_id').apply(lambda x: get_extra_glob_recs(x, self.n_rec, self.glob_product_val),
                                                   meta={'product_id': int, 'global_purchase_value': int, 'user_purchase_value': int})\
                                            .compute()
        
        # if any additional results
        if add_rec.shape[0] > 0:
            add_rec = add_rec.reset_index(1, drop=True).reset_index() # clean up grouped indexing
            add_rec = add_rec[['user_id', 'product_id', 'user_purchase_value', 'global_purchase_value']]
            # combine user recs with additional recs
            if self.verbose: print('[{:.0f}s] Combining original and additional recommendation results'.format(time.time()-t_start))
            user_recs_df = pd.concat([user_recs_df, add_rec], ignore_index=True)
        
        # sort and index
        self.user_recs_df = user_recs_df.sort_values(by=['user_id', 'user_purchase_value', 'global_purchase_value'], ascending=[True, False, False]).set_index('user_id')
        
        if self.verbose: print('[{:.0f}s] Rec complete'.format(time.time()-t_start))
        df_rec = self.user_recs_df.loc[user_id]['product_id']
        return super(UserPopularityRecommender, self).recommend(df_rec=df_rec)

In [17]:
upop_rec = UserPopularityRecommender(n_rec=10, decay=True, verbose=True)

In [18]:
upop_rec.fit(train)

[0s] Fit starting...
[0s] Adjusting purchase values for time decay
[19s] Getting global product values
[19s] Getting user product values
[19s] Fit complete


In [19]:
upop_rec.recommend(user_id = 88)

[0s] Rec starting...
[0s] Getting top n user products
[1s] Combining user and global purchase counts
[1s] Getting additionl global recommendations per user
[1s] Combining original and additional recommendation results
[1s] Rec complete


user_id
88    35384
88    35921
88     6104
88    26856
88    31513
88    42150
88    14245
88    41275
88    22138
88    12327
Name: product_id, dtype: int64

In [25]:
pk_results = mev.evaluate_model(upop_rec, method='precision@k')

In [26]:
pk_results

{'model_name': 'User Popularity',
 'p@05': 0.41637166666665215,
 'p@10': 0.41233724206349714}

In [27]:
map_results = mev.evaluate_model(upop_rec, method='map')

In [28]:
map_results

{'model_name': 'User Popularity',
 'map@05': 0.3304408611111218,
 'map@10': 0.2850165834356901}

### Model Optimization via "Grid Search"

In [29]:
def recommender_gs(recommender, train_data, test_data, param_dict, scoring = 'map', scoring_n_range = [5,10], verbose = False):
    
    t_start = time.time()
    
    from sklearn.model_selection import ParameterGrid
    param_grid = ParameterGrid(param_dict)
    
    if type(scoring) is not list:
        scoring = [scoring]
    
    mev = ModelEvaluator(eval_data=test_data, n_range=scoring_n_range)
    
    gs_results = []
    
    for param_set in param_grid:
        if verbose:
            print('Performing evaluation for {}'.format(param_set))
        
        if verbose:
            print('\t{:30}'.format('Fitting...'), end='\r')
        rec = recommender
        rec.set_params(**param_set, verbose=False)
        rec.fit(train_data)
        
        scores = {}
        for scoring_method in scoring:
            if verbose:
                print('\t{:30}'.format("Evaluating '{}'...".format(scoring_method)), end='\r')
            scores.update(mev.evaluate_model(rec, method = scoring_method))
        
        summary = {'params': param_set, **{**scores}}
        try:
            del summary['model_name']
        except:
            return summary
        gs_results.append(summary)
        
        if verbose:
            print('\t{:30}'.format("Complete!".format(scoring_method)))
    
    gs_results_df = pd.DataFrame(gs_results)
    return gs_results_df

In [30]:
param_sets = [{'decay': [False]},
               {'decay': [True], 'decay_method': ['linear'], 'decay_constant': [0.5, 1, 1.5, 2.0]},
               {'decay': [True], 'decay_method': ['exponential'], 'decay_constant': [None, 10, 20, 50, 100, 150, 200]}]

In [20]:
gs_results = recommender_gs(recommender = upop_rec,
                            train_data = train, test_data = val,
                            param_dict = param_sets,
                            scoring = 'map',
                            verbose = True)

Performing evaluation for {'decay': False}
	Complete!                     
Performing evaluation for {'decay': True, 'decay_constant': 0.5, 'decay_method': 'linear'}
	Complete!                     
Performing evaluation for {'decay': True, 'decay_constant': 1, 'decay_method': 'linear'}
	Complete!                     
Performing evaluation for {'decay': True, 'decay_constant': 1.5, 'decay_method': 'linear'}
	Complete!                     
Performing evaluation for {'decay': True, 'decay_constant': 2.0, 'decay_method': 'linear'}
	Complete!                     
Performing evaluation for {'decay': True, 'decay_constant': None, 'decay_method': 'exponential'}
	Complete!                     
Performing evaluation for {'decay': True, 'decay_constant': 10, 'decay_method': 'exponential'}
	Complete!                     
Performing evaluation for {'decay': True, 'decay_constant': 20, 'decay_method': 'exponential'}
	Complete!                     
Performing evaluation for {'decay': True, 'decay_con

In [21]:
gs_results.sort_values(by = "map@10", ascending=False, inplace=True)

In [22]:
gs_results

Unnamed: 0,map@05,map@10,params
8,0.340463,0.293842,"{'decay': True, 'decay_constant': 50, 'decay_m..."
9,0.339426,0.292888,"{'decay': True, 'decay_constant': 100, 'decay_..."
5,0.337759,0.291956,"{'decay': True, 'decay_constant': None, 'decay..."
4,0.33772,0.291143,"{'decay': True, 'decay_constant': 2.0, 'decay_..."
10,0.335601,0.289088,"{'decay': True, 'decay_constant': 150, 'decay_..."
3,0.334709,0.288556,"{'decay': True, 'decay_constant': 1.5, 'decay_..."
11,0.332193,0.286475,"{'decay': True, 'decay_constant': 200, 'decay_..."
2,0.330441,0.285017,"{'decay': True, 'decay_constant': 1, 'decay_me..."
7,0.327054,0.283086,"{'decay': True, 'decay_constant': 20, 'decay_m..."
1,0.327086,0.281837,"{'decay': True, 'decay_constant': 0.5, 'decay_..."


In [23]:
gs_results.loc[8]['params']

{'decay': True, 'decay_constant': 50, 'decay_method': 'exponential'}

In [31]:
param_sets = [{'decay': [True], 'decay_method': ['exponential'], 'decay_constant': np.arange(30,61,5)}]

In [33]:
gs_results_2 = recommender_gs(recommender = upop_rec,
                            train_data = train, test_data = val,
                            param_dict = param_sets,
                            scoring = 'map',
                            verbose = True)

Performing evaluation for {'decay': True, 'decay_constant': 30, 'decay_method': 'exponential'}
	Complete!                     
Performing evaluation for {'decay': True, 'decay_constant': 35, 'decay_method': 'exponential'}
	Complete!                     
Performing evaluation for {'decay': True, 'decay_constant': 40, 'decay_method': 'exponential'}
	Complete!                     
Performing evaluation for {'decay': True, 'decay_constant': 45, 'decay_method': 'exponential'}
	Complete!                     
Performing evaluation for {'decay': True, 'decay_constant': 50, 'decay_method': 'exponential'}
	Complete!                     
Performing evaluation for {'decay': True, 'decay_constant': 55, 'decay_method': 'exponential'}
	Complete!                     
Performing evaluation for {'decay': True, 'decay_constant': 60, 'decay_method': 'exponential'}
	Complete!                     


In [34]:
gs_results_2.sort_values(by = "map@10", ascending=False, inplace=True)

In [35]:
gs_results_2.head()

Unnamed: 0,map@05,map@10,params
6,0.341276,0.294563,"{'decay': True, 'decay_constant': 60, 'decay_m..."
5,0.341145,0.294323,"{'decay': True, 'decay_constant': 55, 'decay_m..."
4,0.340463,0.293842,"{'decay': True, 'decay_constant': 50, 'decay_m..."
3,0.339234,0.29294,"{'decay': True, 'decay_constant': 45, 'decay_m..."
2,0.337409,0.291745,"{'decay': True, 'decay_constant': 40, 'decay_m..."


In [37]:
gs_results_2.loc[6]['params']

{'decay': True, 'decay_constant': 60, 'decay_method': 'exponential'}