# Algorithm: Product Transition Probability

As a product list was not available, the set of products bought by all users is treated as the complete product list. If a product list is available, use it for the analysis instead (e.g. in the Instacart data).

In [1]:
import numpy as np
import pandas as pd
from pandas import isnull

In [3]:
# Load the training transactions from the CSV file in the working directory.
data = pd.read_csv(filepath_or_buffer="final_train.csv")

In [4]:
# Remove the 'Unnamed: 0' column (presumably an index that was saved along with
# the CSV). errors='ignore' keeps this cell re-runnable: once the column is gone
# a second execution is a no-op instead of raising KeyError.
data = data.drop(columns='Unnamed: 0', errors='ignore')
data.head()

Unnamed: 0,SHOP_WEEK_x,PROD_CODE,CUST_CODE,BASKET_ID,BASKET_NUM
0,200608,PRD0900121,CUST0000000001,994100200154444,1
1,200608,PRD0900186,CUST0000000001,994100200154444,1
2,200608,PRD0900398,CUST0000000001,994100200154444,1
3,200608,PRD0900424,CUST0000000001,994100200154444,1
4,200608,PRD0900440,CUST0000000001,994100200154444,1


In [5]:
# Give every product a unique integer ID: the bin counts for N1 and N11 can only
# be calculated for integer values, so string IDs must always be converted first.
prod_list = data["PROD_CODE"].unique()
prod_code = pd.DataFrame({'PROD_CODE': prod_list})
# IDs start at 1 (row position + 1).
prod_code = prod_code.assign(PROD_NO=prod_code.index + 1)
prod_code.head()

Unnamed: 0,PROD_CODE,PROD_NO
0,PRD0900121,1
1,PRD0900186,2
2,PRD0900398,3
3,PRD0900424,4
4,PRD0900440,5


In [6]:
# Attach the new integer product ID (PROD_NO) to every training row by
# left-joining on PROD_CODE.
data = data.merge(prod_code, how='left', on='PROD_CODE')
data.head()

Unnamed: 0,SHOP_WEEK_x,PROD_CODE,CUST_CODE,BASKET_ID,BASKET_NUM,PROD_NO
0,200608,PRD0900121,CUST0000000001,994100200154444,1,1
1,200608,PRD0900186,CUST0000000001,994100200154444,1,2
2,200608,PRD0900398,CUST0000000001,994100200154444,1,3
3,200608,PRD0900424,CUST0000000001,994100200154444,1,4
4,200608,PRD0900440,CUST0000000001,994100200154444,1,5


In [7]:
# One row per distinct (customer, basket, basket number) combination,
# re-indexed from 0.
train_bas = (
    data.loc[:, ["CUST_CODE", "BASKET_ID", "BASKET_NUM"]]
    .drop_duplicates()
    .reset_index(drop=True)
)

In [8]:
# PREV_BASKET_ID holds, for each basket, the BASKET_ID of the same customer's
# preceding basket (ordered by BASKET_NUM); 0 marks a customer's first basket.
# shift(fill_value=0) is used instead of shift().fillna(0) so the IDs never pass
# through float64, which would silently round integer IDs larger than 2**53.
train_bas['PREV_BASKET_ID'] = (
    train_bas.sort_values(['CUST_CODE', 'BASKET_NUM'])
    .groupby('CUST_CODE')['BASKET_ID']
    .shift(fill_value=0)
    .astype(np.uint64)
)
train_bas.head()

Unnamed: 0,CUST_CODE,BASKET_ID,BASKET_NUM,PREV_BASKET_ID
0,CUST0000000001,994100200154444,1,0
1,CUST0000000001,994100300159616,2,994100200154444
2,CUST0000000001,994100400162198,3,994100300159616
3,CUST0000000001,994100500160568,4,994100400162198
4,CUST0000000001,994100500160569,5,994100500160568


In [9]:
# Index the frame by BASKET_ID so that later cells can look baskets up by ID.
train_bas = train_bas.set_index(keys='BASKET_ID')

In [11]:
# Build the set of product numbers bought in each basket and store it in a new
# PROD_LIST column (the assignment aligns on the BASKET_ID index).
train_bas['PROD_LIST'] = (
    data.groupby('BASKET_ID')['PROD_NO']
    .aggregate(lambda prod_nos: set(prod_nos))
)

train_bas.head()

Unnamed: 0_level_0,CUST_CODE,BASKET_NUM,PREV_BASKET_ID,PROD_LIST
BASKET_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
994100200154444,CUST0000000001,1,0,"{1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14..."
994100300159616,CUST0000000001,2,994100200154444,"{1, 5, 7, 12, 14, 20, 21, 23, 25, 26, 27, 28, ..."
994100400162198,CUST0000000001,3,994100300159616,"{33, 4, 5, 6, 36, 40, 41, 10, 42, 43, 44, 45, ..."
994100500160568,CUST0000000001,4,994100400162198,"{2, 3, 5, 6, 11, 12, 13, 16, 20, 21, 23, 33, 3..."
994100500160569,CUST0000000001,5,994100500160568,"{64, 65, 66, 13, 56, 63}"


In [12]:
# Keep every basket except those with BASKET_NUM 1: a customer's first basket
# has no preceding basket to be compared with. BASKET_ID becomes a regular
# column again.
ords = train_bas.loc[train_bas['BASKET_NUM'] > 1].reset_index()

In [13]:
# Look up each row's BASKET_ID in train_bas (which was indexed by BASKET_ID
# previously) to get the set of products in the current basket.
ords['PROD_LIST'] = ords['BASKET_ID'].map(train_bas['PROD_LIST'])

In [14]:
# Look up each row's PREV_BASKET_ID in train_bas (indexed by BASKET_ID) to get
# the set of products in the previous basket.
ords['PREV_PROD_LIST'] = ords['PREV_BASKET_ID'].map(train_bas['PROD_LIST'])

In [15]:
# Replace missing values in the two product-set columns with an empty set so
# the set operations below work on every row.
# Series.map is applied per column instead of DataFrame.applymap, which is
# deprecated in recent pandas releases; the result is the same.
for _set_col in ('PROD_LIST', 'PREV_PROD_LIST'):
    ords[_set_col] = ords[_set_col].map(
        lambda cell: set() if isnull(cell) else cell
    )

In [17]:
# T11: the products that appear in both the current and the previous basket.
def _repeat_purchases(row):
    """Return the intersection of a row's PROD_LIST and PREV_PROD_LIST sets."""
    return row['PROD_LIST'].intersection(row['PREV_PROD_LIST'])


ords['T11'] = ords.apply(_repeat_purchases, axis=1)

ords.head()

Unnamed: 0,BASKET_ID,CUST_CODE,BASKET_NUM,PREV_BASKET_ID,PROD_LIST,PREV_PROD_LIST,T11
0,994100300159616,CUST0000000001,2,994100200154444,"{1, 5, 7, 12, 14, 20, 21, 23, 25, 26, 27, 28, ...","{1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14...","{1, 5, 7, 12, 14, 20, 21, 23, 25, 26}"
1,994100400162198,CUST0000000001,3,994100300159616,"{33, 4, 5, 6, 36, 40, 41, 10, 42, 43, 44, 45, ...","{1, 5, 7, 12, 14, 20, 21, 23, 25, 26, 27, 28, ...","{33, 36, 5, 20, 21, 23}"
2,994100500160568,CUST0000000001,4,994100400162198,"{2, 3, 5, 6, 11, 12, 13, 16, 20, 21, 23, 33, 3...","{33, 4, 5, 6, 36, 40, 41, 10, 42, 43, 44, 45, ...","{33, 36, 5, 6, 16, 20, 21, 23}"
3,994100500160569,CUST0000000001,5,994100500160568,"{64, 65, 66, 13, 56, 63}","{2, 3, 5, 6, 11, 12, 13, 16, 20, 21, 23, 33, 3...","{56, 13}"
4,994100600158884,CUST0000000001,6,994100500160569,"{1, 5, 6, 7, 11, 14, 15, 16, 19, 20, 21, 23, 2...","{64, 65, 66, 13, 56, 63}",{}


In [18]:
# Number of distinct products; this sets how many bins N1 and N11 need.
n_products = data['PROD_NO'].nunique(dropna=True)

In [19]:
# N1 ----------------------------
# Flatten the sets of the PREV_PROD_LIST column into one list --> f1
f1 = [prod_no for basket in ords['PREV_PROD_LIST'] for prod_no in basket]

# N1[p]: number of times product p occurs in the PREV_PROD_LIST column.
# minlength = n_products + 1 because product numbers start at 1 (bin 0 is unused).
N1 = np.bincount(f1, minlength=n_products + 1)

# N11 ----------------------------
# Flatten the sets of the T11 column into one list --> f11
f11 = [prod_no for basket in ords['T11'] for prod_no in basket]

# N11[p]: number of times product p occurs in the T11 column.
N11 = np.bincount(f11, minlength=n_products + 1)

In [21]:
# Calculate P11: the probability that a product will be purchased in the next
# order given that it was purchased in the past order.
#
#         No. of times product was present in both current and past order (N11)
#   P11 = ---------------------------------------------------------------------
#              No. of times product was present in past order (N1)
#
# N1 is 0 for the unused bin 0 and for any product that never occurs in a
# previous basket. A plain N11 / N1 would compute 0/0 there and emit a
# RuntimeWarning; the guarded divide leaves those entries as NaN (probability
# undefined) without the warning.
product_probs = pd.DataFrame(
    data={
        'PROD_NO': np.array(range(0, n_products + 1)),
        'P11': np.divide(
            N11,
            N1,
            out=np.full(N1.shape, np.nan),
            where=N1 > 0,
        ),
    }
)
# Drop row 0: product numbers start at 1, so bin 0 does not stand for a product.
product_probs = product_probs[1:]
product_probs.head()

  after removing the cwd from sys.path.


Unnamed: 0,PROD_NO,P11
1,1,0.370709
2,2,0.103944
3,3,0.105095
4,4,0.021074
5,5,0.157596


In [22]:
# Map the integer product IDs back to the original product codes.
prod_prob = product_probs.merge(prod_code, how='left', on='PROD_NO')
prod_prob.head()

Unnamed: 0,PROD_NO,P11,PROD_CODE
0,1,0.370709,PRD0900121
1,2,0.103944,PRD0900186
2,3,0.105095,PRD0900398
3,4,0.021074,PRD0900424
4,5,0.157596,PRD0900440


In [23]:
# Export the product transition probabilities to CSV (row index not written).
prod_prob.to_csv(path_or_buf="prod_prob.csv", index=False)