In [1]:
import numpy as np
import pandas as pd

from os import path
import pickle
import re

import pdb

from itertools import combinations
from collections import Counter
import random
import time

import gc
from dask.distributed import Client, progress
import dask.delayed as delayed
import dask.dataframe as dd

In [2]:
import sys
sys.path.append('..')
import src.my_helper as my_helper
import src.my_eval as my_eval
import src.my_rec as my_rec

In [84]:
from imp import reload
reload(my_helper);
reload(my_eval);
reload(my_rec);

In [4]:
c = Client(silence_logs=40) # Hide warnings

In [5]:
pd.set_option('float_format', '{:.6f}'.format)

### Import Data

In [6]:
fd = ['..','data','processed']

# Train set
fp = path.join(*fd, 'train.p')
with open(file=fp, mode='rb') as file:
    train = pickle.load(file)

In [7]:
fd = ['..','data','processed']

# Train set
fp = path.join(*fd, 'val.p')
with open(file=fp, mode='rb') as file:
    val = pickle.load(file)

### Support

First we will take a look at the distribution of product purchase counts:

In [7]:
train.product_id.value_counts().describe()

count    39214.000000
mean        46.458918
std        280.919448
min          1.000000
25%          2.000000
50%          7.000000
75%         24.000000
max      25728.000000
Name: product_id, dtype: float64

50% of our product set has not been purchased more than 7 times! For the sake of building strong association rules, we realistically need to ignore products with insufficient purchase history. A low purchase frequency directly translates to a low support for the individual product, so we can establish an approximate cutoff for evaluation:

In [8]:
product_support = train['product_id'].value_counts() / train['order_id'].nunique()

In [9]:
product_support.describe()

count    39214.000000
mean         0.000255
std          0.001542
min          0.000005
25%          0.000011
50%          0.000038
75%          0.000132
max          0.141181
Name: product_id, dtype: float64

Considering the product counts these support value translate to above, we probably want to aim for a minimum purchase count in the 10-20 range. It will likely be worth exploring various support thresholds during model evaluation:

In [50]:
product_counts = train['product_id'].value_counts()
n_products = train['product_id'].nunique()
n_orders = train['order_id'].nunique()

for n in [10,15,20,25,50,100,200,500,1000]:
    reduced = product_counts[product_counts >= n]
    n_reduced = reduced.shape[0]
    min_support = (reduced / n_orders).min()
    print('Min. {:4} purchases: {:5} products ({:04.1f}%), min support = {:.6f}'.format(n, n_reduced, 100*n_reduced/n_products, min_support))

Min.   10 purchases: 16638 products (42.4%), min support = 0.000055
Min.   15 purchases: 13337 products (34.0%), min support = 0.000082
Min.   20 purchases: 11141 products (28.4%), min support = 0.000110
Min.   25 purchases:  9703 products (24.7%), min support = 0.000137
Min.   50 purchases:  5872 products (15.0%), min support = 0.000274
Min.  100 purchases:  3263 products (08.3%), min support = 0.000549
Min.  200 purchases:  1642 products (04.2%), min support = 0.001097
Min.  500 purchases:   534 products (01.4%), min support = 0.002744
Min. 1000 purchases:   213 products (00.5%), min support = 0.005515


### Generating Association Rules (Apriori)

In [8]:
# Define the quantile for min support threshold
q = 0.9

In [9]:
# Wprk with subsample of data during design/exploration
sample_order_ids = np.random.choice(train['order_id'].unique(), 10000)
train_sample = train[train['order_id'].isin(sample_order_ids)]

In [12]:
# Get order data and basic counts
order_data = train_sample[['order_id', 'product_id']].set_index('order_id').sort_index().copy()

n_orders = order_data.index.nunique()
n_products = order_data['product_id'].nunique()

In [13]:
# Calculate base level counts and support of individual products

temp_itemset_stats = order_data['product_id'].apply(lambda x: frozenset([x])).value_counts().to_frame(name='freq')
temp_itemset_stats['support'] = temp_itemset_stats['freq'] / n_orders
temp_itemset_stats.index.names = ['itemset']

# Store in dict for future use (itemset size n=1)
itemset_stats = {}
itemset_stats[1] = temp_itemset_stats

In [14]:
min_supp = itemset_stats[1]['support'].quantile(q)
min_supp

0.0011312217194570137

In [15]:
# Immitate loop, initialize with itemset size = 2
n = 2

In [14]:
# Get itemset stats from previous eval
current_itemset_stats = itemset_stats[n-1]
# Get list products above min support
frequent_itemsets = current_itemset_stats[current_itemset_stats['support'] >= min_supp]
frequent_products = np.unique([item for itemset in frequent_itemsets.index for item in itemset])
# Filter order data
order_data_reduced = order_data[order_data['product_id'].isin(frequent_products)]

In [15]:
%%time
# Dask dataframe
dd_order_data = dd.from_pandas(order_data_reduced, npartitions=8)

# Get size n combinations of products per order and count occurances of each set
item_combos = dd_order_data.groupby('order_id')['product_id']
                           .apply(lambda x: pd.Series(list(combinations(x, n))), meta = ('itemset', object))\
                           .map(frozenset)\
                           .reset_index(1)\
                           .value_counts()\
                           .compute()\
                           .reset_index()

Wall time: 35.3 s


In [16]:
item_combos.head()

Unnamed: 0,index,itemset
0,"(24852, 47766)",3071
1,"(24852, 21903)",2521
2,"(13176, 47209)",2507
3,"(21137, 24852)",2463
4,"(13176, 21137)",2452


In [19]:
# Rename and calculate support
item_combos.rename(columns={'index':'itemsetAB', 'itemset':'freqAB'}, inplace=True)
item_combos['supportAB'] = item_combos['freqAB'] / n_orders

In [21]:
# Filter for min support
df_results = item_combos[item_combos['supportAB'] >= min_supp]

In [26]:
# Save filtered results for future use (akin to initial loop steps above)
itemset_stats[n] = df_results.rename(columns={'itemsetAB': 'itemset', 'freqAB': 'freq', 'supportAB': 'support'}).set_index('itemset')

In [28]:
# Helper function for generating (x1, x2, ..., xn) -> (y) combinations per itemset
def antecedent_consequent_combos(x):
    combo_list = []
    for item in x:
        item = frozenset([item])
        consequent = item
        antecedent = x.difference(item)
        combo_list.append((antecedent, consequent))
    return combo_list

In [29]:
# Generate all rule permutations for itemsets
temp = dd.from_pandas(df_results.set_index('itemsetAB', drop=False), npartitions=N_CORES)['itemsetAB']\
                 .apply(lambda x: antecedent_consequent_combos(x), meta = list)\
                 .apply(lambda x: pd.Series(x), meta = {i: object for i in range(n)})\
                 .compute().stack().reset_index(1, drop=True)
temp = pd.DataFrame(temp.values.tolist(), index = temp.index, columns=['A', 'B'])

In [30]:
# Merge freq and support with antecedents and consequents using previously generated itemset_stats
# Note consequent is always 1 product (limited for sake of simplicity)
ab_results = temp.merge(itemset_stats[n-1], left_on='A', right_index=True).rename(columns={'freq': 'freqA', 'support': 'supportA'})\
                         .merge(itemset_stats[1], left_on='B', right_index=True).rename(columns={'freq': 'freqB', 'support': 'supportB'})

In [31]:
# Merge antecedent-consequent results above with original itemsets
df_results = df_results.merge(ab_results, left_on='itemsetAB', right_index=True)

In [32]:
# Create list of results per "loop"
results = []
results.append(df_results)

In [34]:
# Proceed to itemset size n = 3 and repeat above process
n = 3

In [39]:
current_itemset_stats = itemset_stats[n-1]
frequent_products = np.unique([item for itemset in current_itemset_stats.index for item in itemset])
order_data_reduced = order_data[order_data['product_id'].isin(frequent_products)]

In [45]:
%%time
dd_order_data = dd.from_pandas(order_data_reduced, npartitions=N_CORES)

item_combos = dd_order_data.groupby('order_id')['product_id']\
                           .apply(lambda x: pd.Series(list(combinations(x, n))), meta = ('itemset', object))\
                           .map(frozenset)\
                           .compute().reset_index(1, drop=True)



Wall time: 39.3 s


In [46]:
item_combos = item_combos.value_counts().reset_index()
item_combos.rename(columns={'index':'itemsetAB', 'itemset':'freqAB'}, inplace=True)
item_combos['supportAB'] = item_combos['freqAB'] / n_orders

In [47]:
df_results = item_combos[item_combos['supportAB'] >= min_supp]

In [50]:
df_results_reduced = df_results[df_results['supportAB'] >= min_supp]

In [51]:
itemset_stats[n] = df_results_reduced.rename(columns={'itemsetAB': 'itemset', 'freqAB': 'freq', 'supportAB': 'support'}).set_index('itemset')

In [52]:
temp = dd.from_pandas(df_results_reduced.set_index('itemsetAB', drop=False), npartitions=N_CORES)['itemsetAB']\
                 .apply(lambda x: antecedent_consequent_combos(x), meta = list)\
                 .apply(lambda x: pd.Series(x), meta = {i: object for i in range(n)})\
                 .compute().stack().reset_index(1, drop=True)
temp = pd.DataFrame(temp.values.tolist(), index = temp.index, columns=['A', 'B'])

In [54]:
ab_results = temp.merge(itemset_stats[n-1], left_on='A', right_index=True).rename(columns={'freq': 'freqA', 'support': 'supportA'})\
                         .merge(itemset_stats[1], left_on='B', right_index=True).rename(columns={'freq': 'freqB', 'support': 'supportB'})

In [55]:
df_results = df_results_reduced.merge(ab_results, left_on='itemsetAB', right_index=True)

In [56]:
results.append(df_results)

In [59]:
df_final = pd.concat(results, ignore_index=True)
df_final['confidenceAB'] = df_final['supportAB'] / df_final['supportA']
df_final['liftAB'] = df_final['supportAB'] / (df_final['supportA'] * df_final['supportB'])
df_final = df_final.sort_values(by='liftAB', ascending=False)

In [60]:
df_final

Unnamed: 0,itemsetAB,freqAB,supportAB,A,B,freqA,supportA,freqB,supportB,confidenceAB,liftAB
9221,"(44786, 13269)",95,0.000521,(44786),(13269),148,0.000812,170,0.000933,0.641892,688.085453
9220,"(44786, 13269)",95,0.000521,(13269),(44786),170,0.000933,148,0.000812,0.558824,688.085453
15228,"(38312, 15984, 48220)",85,0.000466,"(15984, 48220)",(38312),117,0.000642,272,0.001493,0.726496,486.736111
15229,"(38312, 15984, 48220)",85,0.000466,"(38312, 48220)",(15984),128,0.000702,258,0.001416,0.664062,469.049479
15230,"(38312, 15984, 48220)",85,0.000466,"(38312, 15984)",(48220),134,0.000735,273,0.001498,0.634328,423.429282
5255,"(15984, 38312)",134,0.000735,(15984),(38312),258,0.001416,272,0.001493,0.519380,347.973039
5254,"(15984, 38312)",134,0.000735,(38312),(15984),272,0.001493,258,0.001416,0.492647,347.973039
4606,"(38544, 4962)",144,0.000790,(4962),(38544),261,0.001432,293,0.001608,0.551724,343.149818
4607,"(38544, 4962)",144,0.000790,(38544),(4962),293,0.001608,261,0.001432,0.491468,343.149818
5692,"(38312, 48220)",128,0.000702,(48220),(38312),273,0.001498,272,0.001493,0.468864,314.128852


In [17]:
def apriori(order_data, min_support_q = 0.9, min_support = None, depth = 2, verbose=False):
    '''
    Generate association rules for a set of order data (apriori approach)
    
    Params
    ----------
    order_data: pandas.DataFrame
        Data to evaluate. Expects columns ['order_id', 'product_id'].
    min_support_q: float (default = 0.9)
        Quantile at which to calculate min support threshold. If min_support (below) is provided, this value is ignored.
    min_support: float, optional (default = None)
        Define concrete min_support threshold, below which products and itemsets will be eliminated.
        If not provided (default = None), min_support_q is utilized.
    depth: int (default = 2)
        Maximum itemset size to evaluate up to. Generated association rules will have maximum {n-1 antecedents} -> {1 consequent}
    '''
    # Time display and line width for verbose displays
    t_start = time.time()
    def td(): return '[{:02d}:{:02d}]'.format(*(int(t) for t in divmod(time.time() - t_start, 60)))
    lw = 20
    
    if verbose: print(td(),'START\n','-'*lw)
    
    # Number of cores/partitions for dask
    N_CORES = 8
    
    if verbose: print(td(), 'Preparing data.')
    
    # Prepare basic dataset
    order_data = order_data[['order_id', 'product_id']].set_index('order_id').sort_index().copy()
    
    # Reference values
    n_orders = order_data.index.nunique()
    #n_products = order_data['product_id'].unique()
    
    # Count and support of single products
    itemset_stats = {}
    itemset_stats[1] = order_data['product_id'].apply(lambda x: frozenset([x])).value_counts().to_frame(name='freq')
    itemset_stats[1]['support'] = itemset_stats[1]['freq'] / n_orders
    
    # Get minimum support threshold according to parameters (defined min_support overrides quantile calculation)
    min_supp = min_support if min_support is not None else itemset_stats[1]['support'].quantile(min_support_q)
    # Eliminate single items below support threshold
    itemset_stats[1] = itemset_stats[1][itemset_stats[1]['support'] >= min_supp]
    
    # Initialize list of results per loop
    results = []
    
    def antecedent_consequent_combos(x):
        '''
        Helper function to get all (X1, X2, ..., Xn) -> Y rules from a given set of products (x)
        
        Returns a list of tuples in the format of ({antecedent frozenset}, {consequent frozenset}),
        with the consequent frozenset always being limited to a single item for the sake of simplicity.
        '''
        # Initialize list of combinations for product set x
        combo_list = []
        for item in x:
            # Consequent = single item set for type consistency
            consequent = frozenset([item])
            # Antecedent = all other products
            antecedent = x.difference([item])
            combo_list.append((antecedent, consequent))
        return combo_list
    
    for n in range(2, depth+1):
        if verbose: print('-'*lw, '\n'+td(), 'Performing itemset evaluation for n = {}.'.format(n))
        
        # Reduce order data according to min_supp
        if verbose: print(td(),'Reducing order data w.r.t. minimum support threshold ({:.6f}).'.format(min_supp))
        # Get itemsets from previous runthrough (n-1) and convert to list of unique product ids
        frequent_products = np.unique([item for itemset in itemset_stats[n-1].index for item in itemset])
        # Reduce order data to contain only remaining frequent products
        order_data_reduced = order_data[order_data['product_id'].isin(frequent_products)]
        del frequent_products

        # Convert to dask dataframe
        dd_order_data = dd.from_pandas(order_data_reduced, npartitions=N_CORES)
        
        # Get all n combinations of products per order
        if verbose: print(td(), 'Gathering itemset combinations.')
        item_combos = dd_order_data.groupby('order_id')['product_id']\
                                   .apply(lambda x: pd.Series(list(combinations(x, n))), meta = ('itemset', object))\
                                   .map(frozenset)\
                                   .compute()\
                                   .reset_index(1, drop=True)
        del dd_order_data, order_data_reduced
        
        # Count of item combinations
        if verbose: print(td(), 'Counting itemset frequencies')
        df_item_combos = item_combos.value_counts().reset_index().rename(columns={'index':'itemsetAB', 'itemset':'freqAB'})
        df_item_combos['supportAB'] = df_item_combos['freqAB'] / n_orders
        #del item_combos
        
        # Filter out itemsets below min_supp
        df_item_combos_reduced = df_item_combos[df_item_combos['supportAB'] >= min_supp]
        
        # Copy of current n itemset stats for future use (same naming format as itemset_stats[1] above)
        itemset_stats[n] = df_item_combos_reduced.rename(columns={'itemsetAB': 'itemset',
                                                                  'freqAB': 'freq',
                                                                  'supportAB': 'support'})\
                                                 .copy().set_index('itemset')      
        
        # Get all antecedent-conseqient combinations (limited to single item consequents)
        if verbose: print(td(), 'Splitting into antecedent and consequent pairs.')
        dd_temp = dd.from_pandas(df_item_combos_reduced.set_index('itemsetAB', drop=False), npartitions=N_CORES)
        temp_results = dd_temp['itemsetAB'].apply(lambda x: antecedent_consequent_combos(x), meta = list)\
                                           .apply(lambda x: pd.Series(x), meta = {i: object for i in range(n)})\
                                           .compute()\
                                           .stack().reset_index(1, drop=True)
        ab_combos = pd.DataFrame(temp_results.values.tolist(), index = temp_results.index, columns=['A', 'B'])
        #del dd_temp, temp_results
        
        if verbose: print(td(), 'Merging itemsets and statistics.')
        # Merge antecedent-consequent data with previously calulated support statistics
        ab_results = ab_combos.merge(itemset_stats[n-1], left_on='A', right_index=True).rename(columns={'freq': 'freqA', 'support': 'supportA'})\
                              .merge(itemset_stats[1], left_on='B', right_index=True).rename(columns={'freq': 'freqB', 'support': 'supportB'})
        del ab_combos
        
        # Merge antecedent-consequent data with original results
        df_results = df_item_combos_reduced.merge(ab_results, left_on='itemsetAB', right_index=True)
        results.append(df_results)
        del ab_results, df_results
    
    # Combine all results and calulcate confidence and lift
    if verbose: print('-'*lw, '\n'+td(), 'Concatenating rules results into single dataframe')
    df_final = pd.concat(results, ignore_index=True)
    if verbose: print(td(), 'Calculating confidence and lift, and sorting rules by lift')
    df_final['confidenceAB'] = df_final['supportAB'] / df_final['supportA']
    df_final['liftAB'] = df_final['supportAB'] / (df_final['supportA'] * df_final['supportB'])
    df_final = df_final.sort_values(by='liftAB', ascending=False)
    
    if verbose: print('-'*lw, '\n'+td(), 'COMPLETE')
    return df_final

In [57]:
r = apriori(train, depth=3, min_support_q=.9, verbose = True)

[00:00] START
 --------------------
[00:00] Preparing data.
-------------------- 
[00:03] Performing itemset evaluation for n = 2.
[00:03] Reducing order data w.r.t. minimum support threshold (0.000439).
[00:03] Gathering itemset combinations.
[00:41] Counting itemset frequencies
[00:50] Splitting into antecedent and consequent pairs.
[00:50] Merging itemsets and statistics.
-------------------- 
[00:50] Performing itemset evaluation for n = 3.
[00:50] Reducing order data w.r.t. minimum support threshold (0.000439).
[00:51] Gathering itemset combinations.
[01:50] Counting itemset frequencies
[02:10] Splitting into antecedent and consequent pairs.
[02:10] Merging itemsets and statistics.
-------------------- 
[02:10] Concatenating rules results into single dataframe
[02:10] Calculating confidence and lift, and sorting rules by lift
-------------------- 
[02:10] COMPLETE


In [58]:
r.head()

Unnamed: 0,itemsetAB,freqAB,supportAB,A,B,freqA,supportA,freqB,supportB,confidenceAB,liftAB
9231,"(44786, 13269)",95,0.000521,(44786),(13269),148,0.000812,170,0.000933,0.641892,688.085453
9230,"(44786, 13269)",95,0.000521,(13269),(44786),170,0.000933,148,0.000812,0.558824,688.085453
15228,"(38312, 15984, 48220)",85,0.000466,"(15984, 48220)",(38312),117,0.000642,272,0.001493,0.726496,486.736111
15229,"(38312, 15984, 48220)",85,0.000466,"(38312, 48220)",(15984),128,0.000702,258,0.001416,0.664062,469.049479
15230,"(38312, 15984, 48220)",85,0.000466,"(38312, 15984)",(48220),134,0.000735,273,0.001498,0.634328,423.429282


### Association Rules Recommender

In [11]:
# First aim for simple 1:1 join of singular past purchases (A) and associated products (B)

In [14]:
ar_results = apriori(train, depth=3, min_support_q=.9, verbose = False)

In [50]:
user_products = train[['user_id','product_id']].drop_duplicates().sort_values(by=['user_id','product_id'])
user_products['product_id'] = user_products['product_id'].apply(lambda x: frozenset([x]))

In [51]:
user_products.head()

Unnamed: 0,user_id,product_id
5281249,7,(274)
5135268,7,(519)
5147325,7,(4920)
5704903,7,(4945)
5097913,7,(6361)


In [52]:
user_recs = user_products.merge(ar_results[['A', 'B', 'supportAB', 'confidenceAB', 'liftAB']], left_on='product_id', right_on='A')

In [53]:
user_recs = user_recs.drop(['product_id', 'A'], axis=1).rename(columns={'B': 'product_id',
                                                                        'supportAB': 'supp',
                                                                        'confidenceAB': 'conf',
                                                                        'liftAB': 'lift'})

In [55]:
user_recs = user_recs.set_index('user_id')

In [56]:
user_recs.head()

Unnamed: 0_level_0,product_id,supp,conf,lift
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
7,(41787),0.000845,0.035623,3.543557
7,(5212),0.000461,0.019431,3.324864
7,(45066),0.001663,0.07009,3.104721
7,(13870),0.000565,0.023826,2.973915
7,(46906),0.00079,0.03331,2.936745


In [61]:
user_recs.loc[7]['product_id'][:10]

user_id
7    (41787)
7     (5212)
7    (45066)
7    (13870)
7    (46906)
7    (42265)
7    (11777)
7    (16797)
7    (28204)
7     (8174)
Name: product_id, dtype: object

In [82]:
d = {'a': 1, 'b': 2, 'c': 3, 'd': 4}

In [86]:
d.pop('a', 100)

100

In [22]:
class AprioriRecommender(my_rec.RecommenderSystem):
    MODEL_NAME = 'Apriori (Singular)'
    
    def __init__(self, min_support_q = 0.9, min_support = None, **kwargs):
        super(AprioriRecommender, self).__init__(**kwargs)
        self.min_support_q = min_support_q
        self.min_support = min_support
    
    def set_params(self, **kwargs):
        self.min_support_q = kwargs.pop('min_support_q', self.min_support_q)
        self.min_support = kwargs.pop('min_support', self.min_support)
        super(AprioriRecommender, self).set_params(**kwargs)

    def fit(self, data):
        '''
        Fit recommender using prior order product data.
        
        Parameters
        ----------
        data: pandas.DataFrame
            Dataframe containing history of user-order products. Assumes format of one row per product ordered.
            Must contain columns ['user_id', 'order_id', product_id'].
        ----------
        '''
        # Produce apriori rules
        
        self.data = data[['user_id', 'order_id', 'product_id']].set_index('user_id').copy()
        if self.verbose: print('Generating association rules...')
        self.apriori_results = apriori(self.data, depth=2,
                                       min_support_q=self.min_support_q, min_support=self.min_support,
                                       verbose = self.verbose)
        
        if self.verbose: print('Association rules generation completed.')
        
    def recommend(self, user_id, n_rec=None):
        if n_rec is not None:
            self.set_params(n_rec=n_rec)
        
        user_products = self.data['product_id'].loc[user_id]
        user_products = user_products.drop_duplicates()\
                                     .apply(lambda x: frozenset([x]))\
                                     .to_frame(name='product_id').reset_index()
        
        user_recs = user_products.merge(temp.apriori_results[['A', 'B', 'liftAB']], left_on='product_id', right_on='A')\
                                 .drop(['product_id', 'A'], axis=1)\
                                 .rename(columns={'B': 'product_id', 'liftAB': 'lift'})
        
        # Extract from frozenset
        user_recs['product_id'] = user_recs['product_id'].apply(lambda x: list(x)[0])
        
        # Drop duplicate recommended products (consequents)
        user_recs = user_recs.drop_duplicates(subset=['user_id', 'product_id'])
        
        # Get top n recommended products
        top_user_recs = dd.from_pandas(user_recs, npartitions=8)\
                          .groupby('user_id')[['product_id', 'lift']]\
                          .apply(lambda x: x.nlargest(10, columns=['lift']),
                                 meta = {'product_id': int, 'lift': int})\
                          .compute()\
                          .reset_index(1, drop=True).reset_index()
        
        df_rec = top_user_recs

In [23]:
temp = AprioriRecommender(n_rec = 10, min_support_q = 0.8)

In [24]:
temp.fit(train)

In [25]:
temp.apriori_results.head()

Unnamed: 0,itemsetAB,freqAB,supportAB,A,B,freqA,supportA,freqB,supportB,confidenceAB,liftAB
40317,"(3858, 15692)",36,0.000198,(3858),(15692),57,0.000313,42,0.00023,0.631579,2740.360902
40316,"(3858, 15692)",36,0.000198,(15692),(3858),42,0.00023,57,0.000313,0.857143,2740.360902
39041,"(11224, 39739)",37,0.000203,(11224),(39739),61,0.000335,68,0.000373,0.606557,1625.520251
39040,"(11224, 39739)",37,0.000203,(39739),(11224),68,0.000373,61,0.000335,0.544118,1625.520251
36098,"(1347, 23427)",39,0.000214,(23427),(1347),57,0.000313,79,0.000434,0.684211,1578.309127


In [130]:
user_products = temp.data['product_id']

In [131]:
user_products = user_products.drop_duplicates().apply(lambda x: frozenset([x])).to_frame(name='product_id').reset_index()

In [132]:
user_products.head()

Unnamed: 0,user_id,product_id
0,109,(196)
1,476,(14084)
2,3546,(12427)
3,1317,(26088)
4,8853,(26405)


In [133]:
user_recs = user_products.merge(temp.apriori_results[['A', 'B', 'liftAB']], left_on='product_id', right_on='A')\
                   .drop(['product_id', 'A'], axis=1)\
                   .rename(columns={'B': 'product_id', 'liftAB': 'lift'})

In [134]:
user_recs['product_id'] = user_recs['product_id'].apply(lambda x: list(x)[0])

In [135]:
user_recs.head()

Unnamed: 0,user_id,product_id,lift
0,109,46149,33.766922
1,109,46562,28.389579
2,109,45051,24.110767
3,109,42500,23.247098
4,109,36472,18.879461


In [138]:
top_user_recs =  dd.from_pandas(user_recs, npartitions=8).groupby('user_id')[['product_id', 'lift']]\
                                     .apply(lambda x: x.nlargest(10, columns=['lift']),
                                            meta = {'product_id': int, 'lift': int})\
                                     .compute()\
                                     .reset_index(1, drop=True).reset_index()

In [139]:
top_user_recs.head()

Unnamed: 0,user_id,product_id,lift
0,88,38689,8.396441
1,88,27845,7.841136
2,88,17948,6.17942
3,88,25659,6.068906
4,88,33731,4.672147


In [143]:
top_user_recs = top_user_recs.drop_duplicates(subset=['user_id', 'product_id'])

In [144]:
top_user_recs.groupby('user_id')['product_id'].nunique()

user_id
7        10
13        9
14       10
65       10
70       10
80        9
84       10
88       10
96        9
97       10
103       9
108      10
109      10
124      10
131      10
145       5
161       9
163      10
167      10
176       6
193      10
197      10
209      10
215      10
261       9
284       8
294      10
319       9
330      10
340      10
         ..
9015      1
9156      1
9162      1
9187      1
9227      1
9273      1
9323      1
9428      1
9527      1
9765      1
10196     1
10206     1
10469     1
10701     1
10914     1
11087     1
11121     3
11293     1
11675     1
11789     2
12051     1
13338     3
13824     1
13831     1
14537     1
15058     1
16369     2
16441     1
21329     1
25426     1
Name: product_id, Length: 467, dtype: int64

In [68]:
user_purchase_lists = user_products.drop_duplicates().groupby('user_id').apply(lambda x: frozenset(x)).to_frame(name='purchased_products')

In [73]:
user_rec_lists = top_user_products.groupby('user_id')['product_id'].unique().map(frozenset).to_frame(name='recommended_products')

In [74]:
user_rec_v_purch = pd.merge(user_purchase_lists, user_rec_lists, left_index=True, right_index=True)

In [79]:
user_rec_v_purch['n_new_prod'] = user_rec_v_purch.apply(lambda x: len(x['recommended_products'] - x['purchased_products']), axis=1)

In [80]:
user_rec_v_purch

Unnamed: 0_level_0,purchased_products,recommended_products,n_new_prod
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
7,"(11520, 35333, 519, 10504, 47623, 45066, 13198...","(30561, 16262, 38313, 18441, 49131, 14764, 358...",10
88,"(26856, 35921, 6104, 35384, 31513)","(38689, 33731, 27845, 19057, 22035, 5077, 3992...",10
