# Algorithm: Weighted Basket Method

In [0]:
import pandas as pd
import numpy as np
import collections
import glob

In [4]:
# Train data: read the training transactions and discard the 'Unnamed: 0' column
# (presumably a row index saved with the CSV -- confirm against the file).
data = pd.read_csv("final_train.csv").drop(columns=['Unnamed: 0'])
data.head()

Unnamed: 0,SHOP_WEEK_x,PROD_CODE,CUST_CODE,BASKET_ID,BASKET_NUM
0,200608,PRD0900121,CUST0000000001,994100200154444,1
1,200608,PRD0900186,CUST0000000001,994100200154444,1
2,200608,PRD0900398,CUST0000000001,994100200154444,1
3,200608,PRD0900424,CUST0000000001,994100200154444,1
4,200608,PRD0900440,CUST0000000001,994100200154444,1


In [0]:
# Keep only the columns used by the weighting steps.
df = data.loc[:, ["CUST_CODE", "PROD_CODE", "BASKET_NUM"]]

In [6]:
# Weight of each product for each customer.
# A purchase is weighted by the BASKET_NUM of the basket it appears in, so the oldest
# order contributes the least and the most recent order contributes the most.
# Summing BASKET_NUM over every purchase of a product gives that product's weight
# for the customer.
sum_df = (
    df.groupby(['CUST_CODE', 'PROD_CODE'])['BASKET_NUM']
      .sum()
      .reset_index()
)
sum_df.head()

Unnamed: 0,CUST_CODE,PROD_CODE,BASKET_NUM
0,CUST0000000001,PRD0900046,36
1,CUST0000000001,PRD0900053,44
2,CUST0000000001,PRD0900060,72
3,CUST0000000001,PRD0900070,51
4,CUST0000000001,PRD0900118,13


In [7]:
# Highest basket number per customer, used as that customer's number of orders n.
# Total weight of a customer = 1 + 2 + ... + n = (n * (n + 1)) / 2
max_basket = (
    df.groupby('CUST_CODE')[['BASKET_NUM']]
      .max()
      .assign(weight=lambda g: (g['BASKET_NUM'] * (g['BASKET_NUM'] + 1)) / 2)
      .reset_index()
)
max_basket.head()

Unnamed: 0,CUST_CODE,BASKET_NUM,weight
0,CUST0000000001,56,1596.0
1,CUST0000000009,19,190.0
2,CUST0000000010,64,2080.0
3,CUST0000000012,3,6.0
4,CUST0000000020,35,630.0


In [8]:
# Attach each customer's total weight to every (customer, product) weight row.
# Both frames carry a BASKET_NUM column, so the merge suffixes them _x (product
# weight from sum_df) and _y (max basket number from max_basket).
weight_data = sum_df.merge(max_basket, how='outer', on='CUST_CODE')
weight_data.head()

Unnamed: 0,CUST_CODE,PROD_CODE,BASKET_NUM_x,BASKET_NUM_y,weight
0,CUST0000000001,PRD0900046,36,56,1596.0
1,CUST0000000001,PRD0900053,44,56,1596.0
2,CUST0000000001,PRD0900060,72,56,1596.0
3,CUST0000000001,PRD0900070,51,56,1596.0
4,CUST0000000001,PRD0900118,13,56,1596.0


In [0]:
# BASKET_NUM_y (max basket number per customer) is not used by the scoring steps below.
weight_data = weight_data.drop(columns=['BASKET_NUM_y'])

In [10]:
# Weighted (importance) score of each product a customer has purchased.
# Formula: product weightage / total weightage of the customer
weight_data['imp_score'] = weight_data['BASKET_NUM_x'].div(weight_data['weight'])
weight_data.head()

Unnamed: 0,CUST_CODE,PROD_CODE,BASKET_NUM_x,weight,imp_score
0,CUST0000000001,PRD0900046,36,1596.0,0.022556
1,CUST0000000001,PRD0900053,44,1596.0,0.027569
2,CUST0000000001,PRD0900060,72,1596.0,0.045113
3,CUST0000000001,PRD0900070,51,1596.0,0.031955
4,CUST0000000001,PRD0900118,13,1596.0,0.008145


The product transition probability (`P11`) is calculated in a separate file and loaded below from `prod_prob.csv`.

In [14]:
# Product transition probability: keep only the product code and its P11 value.
prod_prob = pd.read_csv("prod_prob.csv").loc[:, ['PROD_CODE', 'P11']]
prod_prob.head()

Unnamed: 0,PROD_CODE,P11
0,PRD0900121,0.370709
1,PRD0900186,0.103944
2,PRD0900398,0.105095
3,PRD0900424,0.021074
4,PRD0900440,0.157596


In [16]:
# Left-join the transition probability onto every (customer, product) row;
# products missing from prod_prob get NaN for P11.
score_data = weight_data.merge(prod_prob, how='left', on='PROD_CODE')
score_data.head()

Unnamed: 0,CUST_CODE,PROD_CODE,BASKET_NUM_x,weight,imp_score,P11
0,CUST0000000001,PRD0900046,36,1596.0,0.022556,0.007699
1,CUST0000000001,PRD0900053,44,1596.0,0.027569,0.101455
2,CUST0000000001,PRD0900060,72,1596.0,0.045113,0.032002
3,CUST0000000001,PRD0900070,51,1596.0,0.031955,0.018101
4,CUST0000000001,PRD0900118,13,1596.0,0.008145,0.097539


In [17]:
# Final score = importance score * product transition probability (P11).
score_data['SCORE'] = score_data['imp_score'].mul(score_data['P11'])
score_data.head()

Unnamed: 0,CUST_CODE,PROD_CODE,BASKET_NUM_x,weight,imp_score,P11,SCORE
0,CUST0000000001,PRD0900046,36,1596.0,0.022556,0.007699,0.000174
1,CUST0000000001,PRD0900053,44,1596.0,0.027569,0.101455,0.002797
2,CUST0000000001,PRD0900060,72,1596.0,0.045113,0.032002,0.001444
3,CUST0000000001,PRD0900070,51,1596.0,0.031955,0.018101,0.000578
4,CUST0000000001,PRD0900118,13,1596.0,0.008145,0.097539,0.000794


In [0]:
# Order rows by customer, then by descending SCORE within each customer.
score_data = score_data.sort_values(by=["CUST_CODE", "SCORE"], ascending=[True, False])

In [21]:
# Top-N recommendations per customer: the N products with the highest SCORE.
# One global sort followed by groupby(...).head(N) yields the same rows in the same
# order as sorting inside groupby(...).apply(...), but it does not re-sort every group
# and it keeps CUST_CODE as an ordinary column. Recent pandas versions deprecate
# apply() functions that operate on the grouping columns.
TOP_N = 10  # number of products recommended to each customer
rec = (
    score_data
    .sort_values(['CUST_CODE', 'SCORE'], ascending=[True, False])
    .groupby('CUST_CODE')
    .head(TOP_N)
    .reset_index(drop=True)
)
rec.head(20)

Unnamed: 0,CUST_CODE,PROD_CODE,BASKET_NUM_x,weight,imp_score,P11,SCORE
0,CUST0000000001,PRD0904358,799,1596.0,0.500627,0.393904,0.197199
1,CUST0000000001,PRD0903678,656,1596.0,0.411028,0.473889,0.194781
2,CUST0000000001,PRD0904757,1261,1596.0,0.7901,0.204243,0.161372
3,CUST0000000001,PRD0902560,1108,1596.0,0.694236,0.185534,0.128804
4,CUST0000000001,PRD0900830,988,1596.0,0.619048,0.182323,0.112867
5,CUST0000000001,PRD0903052,383,1596.0,0.239975,0.363066,0.087127
6,CUST0000000001,PRD0900440,880,1596.0,0.551378,0.157596,0.086895
7,CUST0000000001,PRD0900121,342,1596.0,0.214286,0.370709,0.079438
8,CUST0000000001,PRD0903246,722,1596.0,0.452381,0.163489,0.073959
9,CUST0000000001,PRD0902505,908,1596.0,0.568922,0.117372,0.066775


# Validation 

In [22]:
# Test data, read from final_test.csv; used by the validation cells below.
test = pd.read_csv("final_test.csv")
test.head()

Unnamed: 0,SHOP_WEEK,PROD_CODE,CUST_CODE,BASKET_ID,BASKET_NUM_x,reorder,BASKET_NUM_y,FLAG
0,200706,PRD0900290,CUST0000000001,994105200166936,15,1,15,1.0
1,200706,PRD0900330,CUST0000000001,994105200166936,15,1,15,1.0
2,200706,PRD0900440,CUST0000000001,994105200166936,15,1,15,1.0
3,200706,PRD0900890,CUST0000000001,994105200166936,15,1,15,1.0
4,200706,PRD0901871,CUST0000000001,994105200166936,15,1,15,1.0


In [0]:
# data1 --> recommended (customer, product) pairs
data1 = rec.loc[:, ['CUST_CODE', 'PROD_CODE']]
# data2 --> (customer, product) pairs actually purchased in the test order
data2 = test.loc[:, ['CUST_CODE', 'PROD_CODE']]

In [0]:
# Row-level comparison of two DataFrames. With the default which='both' it returns the
# rows present in both frames (e.g. products that were recommended and also purchased).
def dataframe_difference(df1, df2, which='both'):
    """Compare two DataFrames row by row and return the selected subset.

    The frames are outer-merged on all the columns they share, with the pandas
    merge indicator enabled, and the result is filtered on the ``_merge`` column.

    Parameters
    ----------
    df1, df2 : pandas.DataFrame
        Left and right frames to compare.
    which : {'both', 'left_only', 'right_only'} or None, default 'both'
        Which rows to keep: rows found in both frames, only in ``df1``, or only
        in ``df2``. ``None`` keeps every row that is not in both frames.

    Returns
    -------
    pandas.DataFrame
        The selected rows of the merged frame, including the ``_merge`` column.

    Raises
    ------
    ValueError
        If ``which`` is not one of the accepted values.
    """
    valid_choices = ('both', 'left_only', 'right_only')
    if which is not None and which not in valid_choices:
        raise ValueError(
            "which must be one of %r or None, got %r" % (valid_choices, which)
        )

    comparison_df = df1.merge(df2,
                              indicator=True,
                              how='outer')
    if which is None:
        selected = comparison_df['_merge'] != 'both'
    else:
        selected = comparison_df['_merge'] == which
    # The original called diff_df.to_csv() here with no path, which only builds the CSV
    # text in memory and throws it away; it has been removed.
    return comparison_df[selected]

In [26]:
# Rows present in both data2 (test purchases) and data1 (recommendations).
diff_df = dataframe_difference(data2, data1).reset_index(drop=True)
diff_df.head(10)

Unnamed: 0,CUST_CODE,PROD_CODE,_merge
0,CUST0000000001,PRD0900440,both
1,CUST0000000001,PRD0902505,both
2,CUST0000000001,PRD0902560,both
3,CUST0000000001,PRD0904358,both
4,CUST0000000001,PRD0904757,both
5,CUST0000000001,PRD0903678,both
6,CUST0000000009,PRD0900121,both
7,CUST0000000009,PRD0900819,both
8,CUST0000000009,PRD0901878,both
9,CUST0000000009,PRD0903052,both


In [30]:
# Number of recommended products per customer --> mostly 10 for all users,
# but fewer if the user purchased fewer than 10 products in all baskets combined.
# Stored as rec_counts rather than rec, so the recommendation table `rec` is not
# overwritten and the cell that builds data1 from it can still be re-run.
rec_counts = data1.groupby('CUST_CODE')['PROD_CODE'].count().to_frame('rec_count')

# Number of relevant products --> products actually purchased in the test order
rel = data2.groupby('CUST_CODE')['PROD_CODE'].count().to_frame('rel_count')

# Count of the common products in both datasets
common = diff_df.groupby('CUST_CODE')['PROD_CODE'].count().to_frame('true_count')

# Dataset to calculate the accuracy ---> align the 3 count frames on customer.
# A customer missing from one of them gets NaN there, which is replaced by 0.
accuracy = pd.concat([rec_counts, rel, common], axis = 1)
accuracy = accuracy.fillna(0)
accuracy = accuracy.reset_index()
accuracy.head(10)

Unnamed: 0,index,rec_count,rel_count,true_count
0,CUST0000000001,10.0,22,6.0
1,CUST0000000009,10.0,28,6.0
2,CUST0000000010,10.0,1,0.0
3,CUST0000000012,6.0,1,1.0
4,CUST0000000020,10.0,2,2.0
5,CUST0000000022,10.0,3,2.0
6,CUST0000000028,10.0,3,1.0
7,CUST0000000031,10.0,3,2.0
8,CUST0000000033,10.0,9,4.0
9,CUST0000000035,10.0,1,1.0


In [31]:
# Calculate Precision and Recall per customer.
#   Precision = true_count / rec_count
#   Recall    = true_count / rel_count
# Customers with no recommendations or no relevant products get 0 for both metrics.
# The division is vectorized; rows with a zero denominator are masked to 0 afterwards,
# which replaces the row-by-row Python loop and its chained indexing.
has_both = accuracy["rec_count"].ne(0) & accuracy["rel_count"].ne(0)
accuracy["Precision"] = accuracy["true_count"].div(accuracy["rec_count"]).where(has_both, 0.0)
accuracy["Recall"] = accuracy["true_count"].div(accuracy["rel_count"]).where(has_both, 0.0)
accuracy.head(20)

Unnamed: 0,index,rec_count,rel_count,true_count,Precision,Recall
0,CUST0000000001,10.0,22,6.0,0.6,0.272727
1,CUST0000000009,10.0,28,6.0,0.6,0.214286
2,CUST0000000010,10.0,1,0.0,0.0,0.0
3,CUST0000000012,6.0,1,1.0,0.166667,1.0
4,CUST0000000020,10.0,2,2.0,0.2,1.0
5,CUST0000000022,10.0,3,2.0,0.2,0.666667
6,CUST0000000028,10.0,3,1.0,0.1,0.333333
7,CUST0000000031,10.0,3,2.0,0.2,0.666667
8,CUST0000000033,10.0,9,4.0,0.4,0.444444
9,CUST0000000035,10.0,1,1.0,0.1,1.0


In [32]:
# Precision: unweighted mean of the per-customer Precision column
accuracy["Precision"].mean()

0.20837751940004925

In [33]:
# Recall: unweighted mean of the per-customer Recall column
accuracy["Recall"].mean()

0.5789625967015751