This code creates an implicit rating score for each product a customer has interacted with. The implicit rating score is created using as reference the following academic journal papers:
    - Nguyen, H. T., Almenningen, T., Havig, M., Schistad, H., Kofod-Petersen, A., Langseth, H., &amp; Ramampiaro, H. (2014). Learning to rank for personalised fashion recommender systems via implicit feedback. Mining Intelligence and Knowledge Exploration, 51-61. doi:10.1007/978-3-319-13817-6_6
    - Schoinas, Ι, &amp; Tjortjis, C. (2019). MuSIF: A product recommendation system based on Multi-source implicit feedback. IFIP Advances in Information and Communication Technology, 660-672. doi:10.1007/978-3-030-19823-7_55
    - Wang, B., Ye, F., &amp; Xu, J. (2018). A personalized recommendation algorithm based on the user’s implicit feedback in e-commerce. Future Internet, 10(12), 117. doi:10.3390/fi10120117

#### Note:

This is the same exact code as the zip file added into this Git repository with the exception that the visualization code cells have been marked down to avoid exceeding the memory limit of Git.

In [1]:
# Loading basic needed libraries
import pandas as pd
import gc
import numpy as np
from scipy import stats
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

from plotly.offline import init_notebook_mode, plot, iplot
import plotly.graph_objs as go
init_notebook_mode(connected=True)

# Loading libraries for S3 bucket connection
import boto3
import io
from io import StringIO,BytesIO, TextIOWrapper
import gzip

client = boto3.client('s3') 
resource = boto3.resource('s3') 

In [2]:
# Reading customers who have purchased and their category affinity profile from notebook:Customer_Affinity_profiles.ipynb
affinity_df = pd.read_csv('s3://myaws-capstone-bucket/data/category_affinity_profile.csv')
affinity_df.nunique()

user_id                     1817173
category                        959
total_view                     1198
total_cart_add                  369
total_purchases                 280
total_sessions                  240
total_spent                  320214
min_category_spent            51735
max_category_spent            59649
cust_retailer_age               214
days_since_last_activity        214
first_view_age                  214
days_since_last_view            214
first_cart_age                  214
days_since_last_cart            214
first_purchase_age              212
days_since_last_purchase        212
view_rank                         8
cart_rank                         8
purchases_rank                    8
spent_rank                        8
dtype: int64

In [3]:
# Reading purchase dataset
purchase_df = pd.read_csv('s3://myaws-capstone-bucket/eCommerce_purchase_data.csv')
purchase_df = purchase_df.drop_duplicates(subset=['category', 'category_id'])

# Only keeping purchased category records in affinity_df
affinity_df = pd.merge(affinity_df, purchase_df[['category','category_id']], on=["category"], how='inner')
affinity_df.nunique()

user_id                     1817173
category                        930
total_view                     1198
total_cart_add                  369
total_purchases                 280
total_sessions                  240
total_spent                  320214
min_category_spent            51735
max_category_spent            59649
cust_retailer_age               214
days_since_last_activity        214
first_view_age                  214
days_since_last_view            214
first_cart_age                  214
days_since_last_cart            214
first_purchase_age              212
days_since_last_purchase        212
view_rank                         8
cart_rank                         8
purchases_rank                    8
spent_rank                        8
category_id                     930
dtype: int64

In [4]:
# Reading the overall customer profile from the notebook: Customer_Behavior_Profile_Analysis.ipynb
cust_profile = pd.read_csv('s3://myaws-capstone-bucket/data/customers_of_focus.csv')
cust_profile.columns = ['user_id', 'overall_total_view', 'overall_total_cart_add', 'overall_total_purchases',
       'overall_total_sessions', 'overall_total_spent', 'overall_min_spent', 'overall_max_spent',
       'overall_cust_retailer_age', 'overall_days_since_last_activity', 'overall_first_view_age',
       'overall_days_since_last_view', 'overall_first_cart_age', 'overall_days_since_last_cart',
       'overall_first_purchase_age', 'overall_days_since_last_purchase']
cust_profile.nunique()

user_id                             1817173
overall_total_view                     2424
overall_total_cart_add                  484
overall_total_purchases                 366
overall_total_sessions                  306
overall_total_spent                  387698
overall_min_spent                     39053
overall_max_spent                     57825
overall_cust_retailer_age               213
overall_days_since_last_activity        213
overall_first_view_age                  214
overall_days_since_last_view            214
overall_first_cart_age                  214
overall_days_since_last_cart            214
overall_first_purchase_age              211
overall_days_since_last_purchase        211
dtype: int64

In [5]:
# Analyzing count of views per user
data = cust_profile.loc[cust_profile.overall_total_view > 0]
data = data.groupby('user_id')['overall_total_view'].sum()
data.describe()

count    1.814252e+06
mean     7.045757e+01
std      1.421332e+02
min      1.000000e+00
25%      1.000000e+01
50%      3.100000e+01
75%      7.900000e+01
max      5.734900e+04
Name: overall_total_view, dtype: float64

# Visualizing distribution of views per user

trace = go.Histogram(x = data.values,name = 'views', xbins = dict(start = 0, end = 100))

layout = go.Layout(title = 'Visualizing Distribution of views per Users' , xaxis = dict(title = 'Number of views Per User'), yaxis = dict(title = 'Occurrence Count'), bargap = 0.2)

fig = go.Figure(data=[trace], layout=layout)
fig.update_traces(marker_color='#19D3F3')

iplot(fig)

In [7]:
# Analyzing count of views per user
data = cust_profile.loc[cust_profile.overall_total_cart_add > 0]
data = data.groupby('user_id')['overall_total_cart_add'].sum()
data.describe()

count    1.753742e+06
mean     6.998836e+00
std      1.295174e+01
min      1.000000e+00
25%      2.000000e+00
50%      4.000000e+00
75%      8.000000e+00
max      2.186000e+03
Name: overall_total_cart_add, dtype: float64

# Visualizing distribution of carts per user

trace = go.Histogram(x = data.values,name = 'carts', xbins = dict(start = 0, end = 100))

layout = go.Layout(title = 'Visualizing Distribution of "add to carts" per Users' , xaxis = dict(title = 'Number of "add to carts" Per User'), yaxis = dict(title = 'Occurrence Count'), bargap = 0.2)

fig = go.Figure(data=[trace], layout=layout)
fig.update_traces(marker_color='#19D3F3')

iplot(fig)

In [9]:
# Analyzing count of purchases per user
pd.options.display.float_format = '{:.2f}'.format
data = cust_profile.loc[cust_profile.overall_total_purchases > 0]
data = data.groupby('user_id')['overall_total_purchases'].sum()
data.describe()

count   1817173.00
mean          3.14
std           7.81
min           1.00
25%           1.00
50%           2.00
75%           3.00
max        1975.00
Name: overall_total_purchases, dtype: float64

# Visualizing distribution of purchases per user

trace = go.Histogram(x = data.values,name = 'purchases', xbins = dict(start = 0, end = 100))

layout = go.Layout(title = 'Visualizing Distribution of purchases per Users' , xaxis = dict(title = 'Number of purchases Per User'), yaxis = dict(title = 'Number of Users'), bargap = 0.2)

fig = go.Figure(data=[trace], layout=layout)
fig.update_traces(marker_color='#19D3F3')

iplot(fig)

Given the behavior in the distribution of purchases we see that approximately half of our dataset has purchased only once. Since this data is meant to help us create a reliable recommendation model where we need as much purchase history as possible for our users we will focus on those with more than one purchase. This way our models will have at least two purchases from each person to train themselves. In an ideal model we would have at least 5 for each customer, but given that our users tend to purchase so little we will do the cutoff at less than 1.  

In [11]:
# Analyzing total spent per user
pd.options.display.float_format = '{:.2f}'.format
data = cust_profile.loc[cust_profile.overall_total_purchases > 0]
data = data.groupby('user_id')['overall_total_spent'].sum()
data.describe()

count   1817173.00
mean       1047.44
std        3794.79
min           0.42
25%         158.94
50%         348.92
75%         900.92
max      790098.29
Name: overall_total_spent, dtype: float64

# Visualizing distribution of spent per user

trace = go.Histogram(x = data.values,name = 'total_spent', xbins = dict(start = 0, end = 1000))

layout = go.Layout(title = 'Visualizing Distribution of Total Spent per Users' , xaxis = dict(title = 'Total Spent Per User'), yaxis = dict(title = 'Occurrence Count'), bargap = 0.2)

fig = go.Figure(data=[trace], layout=layout)
fig.update_traces(marker_color='#19D3F3')

iplot(fig)

In [13]:
# Only keeping customers with more than 1 purchase
cust_profile = cust_profile.loc[cust_profile.overall_total_purchases > 1]
cust_profile.nunique()

user_id                             914574
overall_total_view                    2338
overall_total_cart_add                 483
overall_total_purchases                365
overall_total_sessions                 284
overall_total_spent                 383165
overall_min_spent                    28158
overall_max_spent                    49804
overall_cust_retailer_age              213
overall_days_since_last_activity       213
overall_first_view_age                 214
overall_days_since_last_view           214
overall_first_cart_age                 214
overall_days_since_last_cart           214
overall_first_purchase_age             211
overall_days_since_last_purchase       211
dtype: int64

In [14]:
# Analyzing spent for this focus user group
pd.options.display.float_format = '{:.2f}'.format
data = cust_profile.loc[cust_profile.overall_total_purchases > 0]
data = data.groupby('user_id')['overall_total_spent'].sum()
data.describe()

count   914574.00
mean      1793.00
std       5234.08
min          1.17
25%        353.68
50%        726.05
75%       1706.55
max     790098.29
Name: overall_total_spent, dtype: float64

# Visualizing distribution of spent per user

trace = go.Histogram(x = data.values,name = 'total_spent', xbins = dict(start = 0, end = 2000))

layout = go.Layout(title = 'Visualizing Distribution of Total Spent per Users' , xaxis = dict(title = 'Total Spent Per User'), yaxis = dict(title = 'Occurrence Count'), bargap = 0.2)

fig = go.Figure(data=[trace], layout=layout)
fig.update_traces(marker_color='#19D3F3')

iplot(fig)

#### Preference Score - Implicit Scoring and rating

The equation will contain recency, spent and frequency behavior such as total views, total purchase and total added to cart.

The recency on the equation will be reflected by a linear penalisation weight for each event/category combination.
- Recency value of event/category = days since latest type of event for that category / date of first type of that event the customer has
- If there is not a specific type of event performed by that customer for that category or at all then the value will be 0


Each event will also have its own weight depending on its type, out of three possible events - view, add to cart and purchase.
- Purchase will have the highest weight and will be given the value of 3/3 = 1
- Add to cart will have the second highest weight and will be given the value of 2/3 = .67
- View will have the lowest weight because it indicates the least strongest sense of affinity and will be given the value 1/3 = .33
- Each event weight will be applied to the frequency behavior columns - total views, total purchase and total added to cart

category spent will be represented on the equation depending on how much the spent on that category represents from the total spent of the customer. 

- Spent value = spent on category by customer / customer total spent 

#### Creating Preference Score for each user/category pairing
- Step#1: Sum of all the event scores where the event scores are calculated by - Recency value for event + Frequency value of the event. 
- Setp#2: Sum of all the event scores + Spent value = Preference Score for user/category pairing

Recency value of event/category

In [16]:
# Creating recency_df 
recency_df = pd.merge(affinity_df[['user_id', 'category', 'category_id',
       'min_category_spent', 'max_category_spent', 'cust_retailer_age',
       'days_since_last_activity', 'first_view_age', 'days_since_last_view',
       'first_cart_age', 'days_since_last_cart', 'first_purchase_age',
       'days_since_last_purchase']], cust_profile[['user_id',
       'overall_cust_retailer_age', 'overall_days_since_last_activity',
       'overall_first_view_age', 'overall_days_since_last_view',
       'overall_first_cart_age', 'overall_days_since_last_cart',
       'overall_first_purchase_age', 'overall_days_since_last_purchase']], on=["user_id"], how='inner')
recency_df.nunique()

user_id                             914574
category                               927
category_id                            927
min_category_spent                   44703
max_category_spent                   54510
cust_retailer_age                      214
days_since_last_activity               214
first_view_age                         214
days_since_last_view                   214
first_cart_age                         214
days_since_last_cart                   214
first_purchase_age                     212
days_since_last_purchase               212
overall_cust_retailer_age              213
overall_days_since_last_activity       213
overall_first_view_age                 214
overall_days_since_last_view           214
overall_first_cart_age                 214
overall_days_since_last_cart           214
overall_first_purchase_age             211
overall_days_since_last_purchase       211
dtype: int64

In [17]:
# Verifying the distribution of the numeric columns
pd.options.display.float_format = '{:.2f}'.format
recency_df.describe()

Unnamed: 0,user_id,category_id,min_category_spent,max_category_spent,cust_retailer_age,days_since_last_activity,first_view_age,days_since_last_view,first_cart_age,days_since_last_cart,first_purchase_age,days_since_last_purchase,overall_cust_retailer_age,overall_days_since_last_activity,overall_first_view_age,overall_days_since_last_view,overall_first_cart_age,overall_days_since_last_cart,overall_first_purchase_age,overall_days_since_last_purchase
count,2620267.0,2620267.0,2620267.0,2620267.0,2620267.0,2620267.0,2620267.0,2620267.0,2620267.0,2620267.0,2620267.0,2620267.0,2620267.0,2620267.0,2620267.0,2620267.0,2620267.0,2620267.0,2620267.0,2620267.0
mean,557123092.03,2.1495819183831012e+18,94.16,193.57,6560.03,6528.23,6561.99,6530.27,6594.45,6577.4,6549.56,6535.93,154.71,46.31,157.08,48.98,218.25,160.38,119.83,74.89
std,37733781.84,8.899340865285515e+16,210.7,327.21,4696.94,4740.29,4696.31,4739.54,4695.5,4718.98,4710.89,4729.48,54.68,49.94,162.36,162.49,962.66,968.12,58.91,53.93
min,128968633.0,2.053013551857009e+18,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,520023203.0,2.0530135556318828e+18,0.0,0.0,194.0,77.0,194.0,78.0,165.0,100.0,165.0,112.0,124.0,5.0,124.0,5.0,81.0,19.0,74.0,28.0
50%,551618573.0,2.2327320797060792e+18,0.0,41.77,9999.0,9999.0,9999.0,9999.0,9999.0,9999.0,9999.0,9999.0,171.0,26.0,171.0,26.0,135.0,57.0,129.0,68.0
75%,585410439.0,2.232732093077521e+18,110.66,234.15,9999.0,9999.0,9999.0,9999.0,9999.0,9999.0,9999.0,9999.0,200.0,75.0,200.0,75.0,167.0,102.0,166.0,118.0
max,649770848.0,2.2920440759829133e+18,2574.07,2574.07,9999.0,9999.0,9999.0,9999.0,9999.0,9999.0,9999.0,9999.0,212.0,212.0,9999.0,9999.0,9999.0,9999.0,212.0,212.0


In [18]:
recency_df.head(10)

Unnamed: 0,user_id,category,category_id,min_category_spent,max_category_spent,cust_retailer_age,days_since_last_activity,first_view_age,days_since_last_view,first_cart_age,...,first_purchase_age,days_since_last_purchase,overall_cust_retailer_age,overall_days_since_last_activity,overall_first_view_age,overall_days_since_last_view,overall_first_cart_age,overall_days_since_last_cart,overall_first_purchase_age,overall_days_since_last_purchase
0,128968633,2232732093077520756_construction.tools.light,2232732093077520756,0.0,0.0,9999.0,0.0,9999.0,0.0,9999.0,...,9999.0,9999.0,174,0,174.0,0.0,160.0,0.0,121.0,121.0
1,128968633,2053013552351936731_appliances.environment.air...,2053013552351936731,0.0,0.0,174.0,9999.0,174.0,9999.0,9999.0,...,9999.0,9999.0,174,0,174.0,0.0,160.0,0.0,121.0,121.0
2,128968633,2053013555631882655_electronics.smartphone,2053013555631882655,0.0,0.0,9999.0,9999.0,9999.0,9999.0,160.0,...,9999.0,9999.0,174,0,174.0,0.0,160.0,0.0,121.0,121.0
3,128968633,2232732102103663163_furniture.bedroom.blanket,2232732102103663163,56.6,56.6,9999.0,9999.0,9999.0,9999.0,9999.0,...,9999.0,121.0,174,0,174.0,0.0,160.0,0.0,121.0,121.0
4,128968633,2232732108613223108_sport.trainer,2232732108613223108,0.0,157.02,9999.0,9999.0,9999.0,9999.0,9999.0,...,121.0,9999.0,174,0,174.0,0.0,160.0,0.0,121.0,121.0
5,192078182,2232732093077520756_construction.tools.light,2232732093077520756,110.72,110.72,9999.0,9999.0,9999.0,9999.0,9999.0,...,9999.0,50.0,155,50,155.0,50.0,155.0,50.0,104.0,50.0
6,192078182,2053013565983425517_appliances.environment.vacuum,2053013565983425517,0.0,0.0,155.0,9999.0,155.0,9999.0,155.0,...,9999.0,9999.0,155,50,155.0,50.0,155.0,50.0,104.0,50.0
7,192078182,2232732101063475749_appliances.environment.vacuum,2232732101063475749,0.0,308.86,9999.0,50.0,9999.0,50.0,9999.0,...,104.0,9999.0,155,50,155.0,50.0,155.0,50.0,104.0,50.0
8,200985178,2232732093077520756_construction.tools.light,2232732093077520756,0.0,262.3,9999.0,9999.0,9999.0,9999.0,131.0,...,123.0,123.0,174,5,174.0,5.0,131.0,123.0,123.0,123.0
9,200985178,2053013555631882655_electronics.smartphone,2053013555631882655,0.0,0.0,174.0,9999.0,174.0,9999.0,9999.0,...,9999.0,9999.0,174,5,174.0,5.0,131.0,123.0,123.0,123.0


In [19]:
# Replacing 0 with 0.0001 in all the days related columns - to not loose cases where the days since last purchase is 0
col_list = ['cust_retailer_age', 'days_since_last_activity', 'first_view_age',
       'days_since_last_view', 'first_cart_age', 'days_since_last_cart',
       'first_purchase_age', 'days_since_last_purchase',
       'overall_cust_retailer_age', 'overall_days_since_last_activity',
       'overall_first_view_age', 'overall_days_since_last_view',
       'overall_first_cart_age', 'overall_days_since_last_cart',
       'overall_first_purchase_age', 'overall_days_since_last_purchase']
recency_df.loc[:, col_list] = recency_df.loc[:, col_list].replace(0,0.0001)

In [20]:
# Creating recency value columns for each event type
recency_df.loc[(recency_df['days_since_last_view'] == 9999),'recency_value_view'] = 0
recency_df.loc[(recency_df['days_since_last_view'] != 9999),'recency_value_view'] = recency_df['days_since_last_view']/recency_df['overall_first_view_age']

recency_df.loc[(recency_df['days_since_last_cart'] == 9999),'recency_value_cart'] = 0
recency_df.loc[(recency_df['days_since_last_cart'] != 9999),'recency_value_cart'] = recency_df['days_since_last_cart']/recency_df['overall_first_cart_age']

recency_df.loc[(recency_df['days_since_last_purchase'] == 9999),'recency_value_purchase'] = 0
recency_df.loc[(recency_df['days_since_last_purchase'] != 9999),'recency_value_purchase'] = recency_df['days_since_last_purchase']/recency_df['overall_first_purchase_age']

In [21]:
# Reversing recency value columns so that the more recent the higher the value 
recency_df['recency_value_view'] = 1/recency_df['recency_value_view'] 
recency_df['recency_value_cart'] = 1/recency_df['recency_value_cart'] 
recency_df['recency_value_purchase'] = 1/recency_df['recency_value_purchase'] 

# Replacing cases where infinite of nan values occurred do to the division, with 0
recency_df = recency_df.replace([np.inf, -np.inf], 0)
recency_df[['recency_value_view','recency_value_cart','recency_value_purchase']] = recency_df[['recency_value_view','recency_value_cart','recency_value_purchase']].replace(np.nan, 0)

recency_df.head(10)

Unnamed: 0,user_id,category,category_id,min_category_spent,max_category_spent,cust_retailer_age,days_since_last_activity,first_view_age,days_since_last_view,first_cart_age,...,overall_days_since_last_activity,overall_first_view_age,overall_days_since_last_view,overall_first_cart_age,overall_days_since_last_cart,overall_first_purchase_age,overall_days_since_last_purchase,recency_value_view,recency_value_cart,recency_value_purchase
0,128968633,2232732093077520756_construction.tools.light,2232732093077520756,0.0,0.0,9999.0,0.0,9999.0,0.0,9999.0,...,0.0,174.0,0.0,160.0,0.0,121.0,121.0,1740000.0,1600000.0,0.0
1,128968633,2053013552351936731_appliances.environment.air...,2053013552351936731,0.0,0.0,174.0,9999.0,174.0,9999.0,9999.0,...,0.0,174.0,0.0,160.0,0.0,121.0,121.0,0.0,0.0,0.0
2,128968633,2053013555631882655_electronics.smartphone,2053013555631882655,0.0,0.0,9999.0,9999.0,9999.0,9999.0,160.0,...,0.0,174.0,0.0,160.0,0.0,121.0,121.0,0.0,0.0,0.0
3,128968633,2232732102103663163_furniture.bedroom.blanket,2232732102103663163,56.6,56.6,9999.0,9999.0,9999.0,9999.0,9999.0,...,0.0,174.0,0.0,160.0,0.0,121.0,121.0,0.0,0.0,1.0
4,128968633,2232732108613223108_sport.trainer,2232732108613223108,0.0,157.02,9999.0,9999.0,9999.0,9999.0,9999.0,...,0.0,174.0,0.0,160.0,0.0,121.0,121.0,0.0,0.0,0.0
5,192078182,2232732093077520756_construction.tools.light,2232732093077520756,110.72,110.72,9999.0,9999.0,9999.0,9999.0,9999.0,...,50.0,155.0,50.0,155.0,50.0,104.0,50.0,0.0,0.0,2.08
6,192078182,2053013565983425517_appliances.environment.vacuum,2053013565983425517,0.0,0.0,155.0,9999.0,155.0,9999.0,155.0,...,50.0,155.0,50.0,155.0,50.0,104.0,50.0,0.0,0.0,0.0
7,192078182,2232732101063475749_appliances.environment.vacuum,2232732101063475749,0.0,308.86,9999.0,50.0,9999.0,50.0,9999.0,...,50.0,155.0,50.0,155.0,50.0,104.0,50.0,3.1,3.1,0.0
8,200985178,2232732093077520756_construction.tools.light,2232732093077520756,0.0,262.3,9999.0,9999.0,9999.0,9999.0,131.0,...,5.0,174.0,5.0,131.0,123.0,123.0,123.0,0.0,1.07,1.0
9,200985178,2053013555631882655_electronics.smartphone,2053013555631882655,0.0,0.0,174.0,9999.0,174.0,9999.0,9999.0,...,5.0,174.0,5.0,131.0,123.0,123.0,123.0,0.0,0.0,0.0


In [22]:
recency_df.describe()

Unnamed: 0,user_id,category_id,min_category_spent,max_category_spent,cust_retailer_age,days_since_last_activity,first_view_age,days_since_last_view,first_cart_age,days_since_last_cart,...,overall_days_since_last_activity,overall_first_view_age,overall_days_since_last_view,overall_first_cart_age,overall_days_since_last_cart,overall_first_purchase_age,overall_days_since_last_purchase,recency_value_view,recency_value_cart,recency_value_purchase
count,2620267.0,2620267.0,2620267.0,2620267.0,2620267.0,2620267.0,2620267.0,2620267.0,2620267.0,2620267.0,...,2620267.0,2620267.0,2620267.0,2620267.0,2620267.0,2620267.0,2620267.0,2620267.0,2620267.0,2620267.0
mean,557123092.03,2.1495819183831012e+18,94.16,193.57,6560.03,6528.23,6561.99,6530.27,6594.45,6577.4,...,46.31,157.08,48.98,218.25,160.38,119.83,74.89,27944.0,5694.31,3417.8
std,37733781.84,8.899340865285515e+16,210.7,327.21,4696.94,4740.29,4696.31,4739.54,4695.5,4718.98,...,49.94,162.36,162.49,962.66,968.12,58.91,53.93,217689.29,87829.72,67064.91
min,128968633.0,2.053013551857009e+18,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,520023203.0,2.0530135556318828e+18,0.0,0.0,194.0,77.0,194.0,78.0,165.0,100.0,...,5.0,124.0,5.0,81.0,19.0,74.0,28.0,0.0,0.0,0.0
50%,551618573.0,2.2327320797060792e+18,0.0,41.77,9999.0,9999.0,9999.0,9999.0,9999.0,9999.0,...,26.0,171.0,26.0,135.0,57.0,129.0,68.0,0.0,0.0,0.0
75%,585410439.0,2.232732093077521e+18,110.66,234.15,9999.0,9999.0,9999.0,9999.0,9999.0,9999.0,...,75.0,200.0,75.0,167.0,102.0,166.0,118.0,1.47,1.05,1.01
max,649770848.0,2.2920440759829133e+18,2574.07,2574.07,9999.0,9999.0,9999.0,9999.0,9999.0,9999.0,...,212.0,9999.0,9999.0,9999.0,9999.0,212.0,212.0,2120000.0,2120000.0,2120000.0


In [23]:
# Checking how the value columns for customers with highest recency_value_purchase look
recency_df.loc[recency_df['recency_value_purchase'] == 212]

Unnamed: 0,user_id,category,category_id,min_category_spent,max_category_spent,cust_retailer_age,days_since_last_activity,first_view_age,days_since_last_view,first_cart_age,...,overall_days_since_last_activity,overall_first_view_age,overall_days_since_last_view,overall_first_cart_age,overall_days_since_last_cart,overall_first_purchase_age,overall_days_since_last_purchase,recency_value_view,recency_value_cart,recency_value_purchase
48648,512919823,2232732093077520756_construction.tools.light,2232732093077520756,278.21,460.5,9999.0,1.0,9999.0,1.0,146.0,...,1.0,212.0,1.0,146.0,1.0,212.0,1.0,212.0,146.0,212.0
88531,513508022,2232732093077520756_construction.tools.light,2232732093077520756,0.0,694.48,9999.0,1.0,9999.0,1.0,9999.0,...,1.0,212.0,1.0,208.0,1.0,212.0,1.0,212.0,208.0,212.0
113724,513992195,2232732093077520756_construction.tools.light,2232732093077520756,168.06,290.51,9999.0,1.0,9999.0,1.0,9999.0,...,1.0,212.0,1.0,191.0,1.0,212.0,1.0,212.0,191.0,212.0
124271,514189719,2232732093077520756_construction.tools.light,2232732093077520756,148.68,1415.45,9999.0,1.0,9999.0,1.0,9999.0,...,1.0,212.0,1.0,196.0,1.0,212.0,1.0,212.0,196.0,212.0
127489,514257075,2232732097842250207_apparel.shoes.keds,2232732097842250207,107.85,107.85,9999.0,9999.0,9999.0,0.0,9999.0,...,0.0,212.0,0.0,199.0,1.0,212.0,1.0,2120000.0,199.0,212.0
185653,515406322,2232732093077520756_construction.tools.light,2232732093077520756,178.12,948.02,9999.0,1.0,9999.0,1.0,9999.0,...,1.0,212.0,1.0,209.0,1.0,212.0,1.0,212.0,209.0,212.0
185933,515411256,2232732079706079299_sport.bicycle,2232732079706079299,163.18,163.18,9999.0,9999.0,9999.0,9999.0,9999.0,...,1.0,212.0,1.0,47.0,1.0,212.0,1.0,0.0,47.0,212.0
189690,515494405,2232732093077520756_construction.tools.light,2232732093077520756,532.83,1309.61,9999.0,1.0,9999.0,1.0,9999.0,...,1.0,212.0,1.0,212.0,1.0,212.0,1.0,212.0,212.0,212.0
194737,515598234,2232732093077520756_construction.tools.light,2232732093077520756,417.0,1479.36,9999.0,1.0,9999.0,1.0,9999.0,...,1.0,212.0,1.0,212.0,1.0,212.0,1.0,212.0,212.0,212.0
381619,523282406,2110937219442148235_computers.components.cdrw,2110937219442148235,28.06,28.06,9999.0,9999.0,9999.0,9999.0,9999.0,...,0.0,212.0,0.0,148.0,1.0,212.0,1.0,0.0,148.0,212.0


In [24]:
# Scailing the recency value columns so that they range from 0 to 1
pd.options.display.float_format = '{:.10f}'.format
scaler = MinMaxScaler()

recency_df['recency_value_view'] = scaler.fit_transform(recency_df['recency_value_view'].values.reshape(-1,1))
recency_df['recency_value_cart'] = scaler.fit_transform(recency_df['recency_value_cart'].values.reshape(-1,1))
recency_df['recency_value_purchase'] = scaler.fit_transform(recency_df['recency_value_purchase'].values.reshape(-1,1))

recency_df.describe()

Unnamed: 0,user_id,category_id,min_category_spent,max_category_spent,cust_retailer_age,days_since_last_activity,first_view_age,days_since_last_view,first_cart_age,days_since_last_cart,...,overall_days_since_last_activity,overall_first_view_age,overall_days_since_last_view,overall_first_cart_age,overall_days_since_last_cart,overall_first_purchase_age,overall_days_since_last_purchase,recency_value_view,recency_value_cart,recency_value_purchase
count,2620267.0,2620267.0,2620267.0,2620267.0,2620267.0,2620267.0,2620267.0,2620267.0,2620267.0,2620267.0,...,2620267.0,2620267.0,2620267.0,2620267.0,2620267.0,2620267.0,2620267.0,2620267.0,2620267.0,2620267.0
mean,557123092.0283353,2.1495819183831012e+18,94.1615346222,193.5672463912,6560.0345323854,6528.2300686695,6561.9885290585,6530.2749751607,6594.4460691128,6577.4018239184,...,46.309160449,157.0805666403,48.982620441,218.2494991416,160.3849255889,119.8320535948,74.8918088658,0.0131811305,0.0026859936,0.0016121703
std,37733781.842346005,8.899340865285515e+16,210.6986910269,327.2106516592,4696.9369089544,4740.2945749688,4696.3111844045,4739.5443553048,4695.496663917,4718.9835663629,...,49.9443509518,162.3604643228,162.4921223237,962.6594543153,968.1198537848,58.9063318926,53.9268570003,0.1026836278,0.0414291145,0.0316343932
min,128968633.0,2.053013551857009e+18,0.0,0.0,0.0001,0.0001,0.0001,0.0001,0.0001,0.0001,...,0.0001,0.0001,0.0001,0.0001,0.0001,0.0001,0.0001,0.0,0.0,0.0
25%,520023203.0,2.0530135556318828e+18,0.0,0.0,194.0,77.0,194.0,78.0,165.0,100.0,...,5.0,124.0,5.0,81.0,19.0,74.0,28.0,0.0,0.0,0.0
50%,551618573.0,2.2327320797060792e+18,0.0,41.77,9999.0,9999.0,9999.0,9999.0,9999.0,9999.0,...,26.0,171.0,26.0,135.0,57.0,129.0,68.0,0.0,0.0,0.0
75%,585410439.0,2.232732093077521e+18,110.66,234.15,9999.0,9999.0,9999.0,9999.0,9999.0,9999.0,...,75.0,200.0,75.0,167.0,102.0,166.0,118.0,6.918e-07,4.943e-07,4.777e-07
max,649770848.0,2.2920440759829133e+18,2574.07,2574.07,9999.0,9999.0,9999.0,9999.0,9999.0,9999.0,...,212.0,9999.0,9999.0,9999.0,9999.0,212.0,212.0,1.0,1.0,1.0


In [25]:
# Checking how the value columns for customers with highest recency_value_purchase look after scailing them
recency_df.loc[recency_df['recency_value_purchase'] == 1]

Unnamed: 0,user_id,category,category_id,min_category_spent,max_category_spent,cust_retailer_age,days_since_last_activity,first_view_age,days_since_last_view,first_cart_age,...,overall_days_since_last_activity,overall_first_view_age,overall_days_since_last_view,overall_first_cart_age,overall_days_since_last_cart,overall_first_purchase_age,overall_days_since_last_purchase,recency_value_view,recency_value_cart,recency_value_purchase
42109,512845211,2232732093077520756_construction.tools.light,2232732093077520756,111.38,978.15,9999.0,0.0001,9999.0,0.0001,9999.0,...,0.0001,212.0,0.0001,212.0,0.0001,212.0,0.0001,1.0,1.0,1.0
52926,512973929,2232732093077520756_construction.tools.light,2232732093077520756,0.0,115.04,9999.0,0.0001,9999.0,0.0001,9999.0,...,0.0001,212.0,0.0001,212.0,0.0001,212.0,0.0001,1.0,1.0,1.0
77958,513338994,2232732093077520756_construction.tools.light,2232732093077520756,171.69,951.09,9999.0,0.0001,9999.0,0.0001,9999.0,...,0.0001,212.0,0.0001,199.0,0.0001,212.0,0.0001,1.0,0.9386792453,1.0
146116,514587815,2232732093077520756_construction.tools.light,2232732093077520756,0.0,227.44,9999.0,0.0001,9999.0,0.0001,9999.0,...,0.0001,212.0,0.0001,209.0,0.0001,212.0,0.0001,1.0,0.9858490566,1.0
146761,514598342,2232732093077520756_construction.tools.light,2232732093077520756,0.0,930.88,9999.0,0.0001,9999.0,0.0001,9999.0,...,0.0001,212.0,0.0001,212.0,0.0001,212.0,0.0001,1.0,1.0,1.0
159065,514835303,2232732093077520756_construction.tools.light,2232732093077520756,228.57,806.61,9999.0,0.0001,9999.0,0.0001,9999.0,...,0.0001,212.0,0.0001,206.0,0.0001,212.0,0.0001,1.0,0.9716981132,1.0
207850,515908047,2232732093077520756_construction.tools.light,2232732093077520756,84.69,84.69,9999.0,0.0001,9999.0,0.0001,9999.0,...,0.0001,212.0,0.0001,212.0,0.0001,212.0,0.0001,1.0,1.0,1.0
295492,518746541,2232732093077520756_construction.tools.light,2232732093077520756,64.07,1614.97,9999.0,0.0001,9999.0,0.0001,9999.0,...,0.0001,212.0,0.0001,212.0,0.0001,212.0,0.0001,1.0,1.0,1.0
303878,518941128,2232732093077520756_construction.tools.light,2232732093077520756,298.57,298.57,9999.0,0.0001,9999.0,0.0001,0.0001,...,0.0001,212.0,0.0001,0.0001,0.0001,212.0,0.0001,1.0,4.717e-07,1.0
349785,521095556,2232732093077520756_construction.tools.light,2232732093077520756,108.09,923.81,9999.0,0.0001,9999.0,0.0001,9999.0,...,0.0001,212.0,0.0001,212.0,0.0001,212.0,0.0001,1.0,1.0,1.0


In [26]:
# Keeping only needed columns for recency df
recency_df = recency_df[['user_id', 'category', 'category_id','recency_value_view', 'recency_value_cart', 'recency_value_purchase']]
recency_df.head()

Unnamed: 0,user_id,category,category_id,recency_value_view,recency_value_cart,recency_value_purchase
0,128968633,2232732093077520756_construction.tools.light,2232732093077520756,0.820754717,0.7547169811,0.0
1,128968633,2053013552351936731_appliances.environment.air...,2053013552351936731,0.0,0.0,0.0
2,128968633,2053013555631882655_electronics.smartphone,2053013555631882655,0.0,0.0,0.0
3,128968633,2232732102103663163_furniture.bedroom.blanket,2232732102103663163,0.0,0.0,4.717e-07
4,128968633,2232732108613223108_sport.trainer,2232732108613223108,0.0,0.0,0.0


In [27]:
recency_df.nunique()

user_id                   914574
category                     927
category_id                  927
recency_value_view         13926
recency_value_cart         13932
recency_value_purchase     13445
dtype: int64

Frequency value of event/product

In [28]:
# Creating event weight variables
purchase_weight = 1
cart_weight = .67
view_weight = .33

In [29]:
# Creating Frequency value columns
affinity_df['frequency_value_purchase'] = affinity_df['total_purchases']*purchase_weight
affinity_df['frequency_value_cart'] = affinity_df['total_cart_add']*cart_weight
affinity_df['frequency_value_view'] = affinity_df['total_view'] *view_weight

# Creating Frequency df with wanted columns
frequency_df = affinity_df[['user_id', 'category', 'category_id', 'frequency_value_purchase', 'frequency_value_cart', 'frequency_value_view']]
frequency_df.head()

Unnamed: 0,user_id,category,category_id,frequency_value_purchase,frequency_value_cart,frequency_value_view
0,101875240,2232732093077520756_construction.tools.light,2232732093077520756,1.0,2.01,4.29
1,128968633,2232732093077520756_construction.tools.light,2232732093077520756,0.0,0.67,25.08
2,136662675,2232732093077520756_construction.tools.light,2232732093077520756,1.0,1.34,0.99
3,192078182,2232732093077520756_construction.tools.light,2232732093077520756,1.0,3.35,2.31
4,200985178,2232732093077520756_construction.tools.light,2232732093077520756,1.0,5.36,28.71


Spent value of product

In [30]:
# Creating spent_df
spent_df = pd.merge(affinity_df[['user_id', 'category', 'category_id', 'total_spent']], cust_profile[['user_id','overall_total_spent']], on=["user_id"], how='inner')
spent_df.nunique()

user_id                914574
category                  927
category_id               927
total_spent            318111
overall_total_spent    383165
dtype: int64

In [31]:
# Calculating spent value by dividing customer total spent in product by customer total spent overall
spent_df['spent_value'] = spent_df['total_spent']/spent_df['overall_total_spent']
# Keeping only wanted columns
spent_df = spent_df[['user_id', 'category', 'category_id', 'spent_value']]
spent_df.head()

Unnamed: 0,user_id,category,category_id,spent_value
0,128968633,2232732093077520756_construction.tools.light,2232732093077520756,0.0
1,128968633,2053013552351936731_appliances.environment.air...,2053013552351936731,0.0
2,128968633,2053013555631882655_electronics.smartphone,2053013555631882655,0.0
3,128968633,2232732102103663163_furniture.bedroom.blanket,2232732102103663163,0.1577524457
4,128968633,2232732108613223108_sport.trainer,2232732108613223108,0.8422475543


Calculating the preference score 

In [32]:
# Joining recency,frequency and spent dfs together
cust_implicit_df = pd.merge(recency_df, frequency_df, on=["user_id",'category', 'category_id'], how='inner')
cust_implicit_df = pd.merge(cust_implicit_df, spent_df, on=["user_id",'category', 'category_id'], how='inner')
cust_implicit_df.nunique()# Making sure all customers are kept

user_id                      914574
category                        927
category_id                     927
recency_value_view            13926
recency_value_cart            13932
recency_value_purchase        13445
frequency_value_purchase        280
frequency_value_cart            368
frequency_value_view           1127
spent_value                 1086656
dtype: int64

In [33]:
cust_implicit_df.head()

Unnamed: 0,user_id,category,category_id,recency_value_view,recency_value_cart,recency_value_purchase,frequency_value_purchase,frequency_value_cart,frequency_value_view,spent_value
0,128968633,2232732093077520756_construction.tools.light,2232732093077520756,0.820754717,0.7547169811,0.0,0.0,0.67,25.08,0.0
1,128968633,2053013552351936731_appliances.environment.air...,2053013552351936731,0.0,0.0,0.0,0.0,0.0,1.98,0.0
2,128968633,2053013555631882655_electronics.smartphone,2053013555631882655,0.0,0.0,0.0,0.0,0.67,5.28,0.0
3,128968633,2232732102103663163_furniture.bedroom.blanket,2232732102103663163,0.0,0.0,4.717e-07,1.0,1.34,5.61,0.1577524457
4,128968633,2232732108613223108_sport.trainer,2232732108613223108,0.0,0.0,0.0,2.0,1.34,6.6,0.8422475543


In [34]:
# Calculating event scores for each event/product of each customer
cust_implicit_df['view_event_score'] = cust_implicit_df['recency_value_view']+cust_implicit_df['frequency_value_view']
cust_implicit_df['cart_event_score'] = cust_implicit_df['recency_value_cart']+cust_implicit_df['frequency_value_cart']
cust_implicit_df['purchase_event_score'] = cust_implicit_df['recency_value_purchase']+cust_implicit_df['frequency_value_purchase']
cust_implicit_df.head()

Unnamed: 0,user_id,category,category_id,recency_value_view,recency_value_cart,recency_value_purchase,frequency_value_purchase,frequency_value_cart,frequency_value_view,spent_value,view_event_score,cart_event_score,purchase_event_score
0,128968633,2232732093077520756_construction.tools.light,2232732093077520756,0.820754717,0.7547169811,0.0,0.0,0.67,25.08,0.0,25.900754717,1.4247169811,0.0
1,128968633,2053013552351936731_appliances.environment.air...,2053013552351936731,0.0,0.0,0.0,0.0,0.0,1.98,0.0,1.98,0.0,0.0
2,128968633,2053013555631882655_electronics.smartphone,2053013555631882655,0.0,0.0,0.0,0.0,0.67,5.28,0.0,5.28,0.67,0.0
3,128968633,2232732102103663163_furniture.bedroom.blanket,2232732102103663163,0.0,0.0,4.717e-07,1.0,1.34,5.61,0.1577524457,5.61,1.34,1.0000004717
4,128968633,2232732108613223108_sport.trainer,2232732108613223108,0.0,0.0,0.0,2.0,1.34,6.6,0.8422475543,6.6,1.34,2.0


In [35]:
# Summing all the event scores together
cust_implicit_df['agg_event_scores'] = cust_implicit_df['view_event_score'] + cust_implicit_df['cart_event_score'] + cust_implicit_df['purchase_event_score']
cust_implicit_df.head()

Unnamed: 0,user_id,category,category_id,recency_value_view,recency_value_cart,recency_value_purchase,frequency_value_purchase,frequency_value_cart,frequency_value_view,spent_value,view_event_score,cart_event_score,purchase_event_score,agg_event_scores
0,128968633,2232732093077520756_construction.tools.light,2232732093077520756,0.820754717,0.7547169811,0.0,0.0,0.67,25.08,0.0,25.900754717,1.4247169811,0.0,27.3254716981
1,128968633,2053013552351936731_appliances.environment.air...,2053013552351936731,0.0,0.0,0.0,0.0,0.0,1.98,0.0,1.98,0.0,0.0,1.98
2,128968633,2053013555631882655_electronics.smartphone,2053013555631882655,0.0,0.0,0.0,0.0,0.67,5.28,0.0,5.28,0.67,0.0,5.95
3,128968633,2232732102103663163_furniture.bedroom.blanket,2232732102103663163,0.0,0.0,4.717e-07,1.0,1.34,5.61,0.1577524457,5.61,1.34,1.0000004717,7.9500004717
4,128968633,2232732108613223108_sport.trainer,2232732108613223108,0.0,0.0,0.0,2.0,1.34,6.6,0.8422475543,6.6,1.34,2.0,9.94


In [36]:
# Preference score columns creation
cust_implicit_df['implicit_preference_score'] = cust_implicit_df['agg_event_scores']+cust_implicit_df['spent_value']
cust_implicit_df.head()

Unnamed: 0,user_id,category,category_id,recency_value_view,recency_value_cart,recency_value_purchase,frequency_value_purchase,frequency_value_cart,frequency_value_view,spent_value,view_event_score,cart_event_score,purchase_event_score,agg_event_scores,implicit_preference_score
0,128968633,2232732093077520756_construction.tools.light,2232732093077520756,0.820754717,0.7547169811,0.0,0.0,0.67,25.08,0.0,25.900754717,1.4247169811,0.0,27.3254716981,27.3254716981
1,128968633,2053013552351936731_appliances.environment.air...,2053013552351936731,0.0,0.0,0.0,0.0,0.0,1.98,0.0,1.98,0.0,0.0,1.98,1.98
2,128968633,2053013555631882655_electronics.smartphone,2053013555631882655,0.0,0.0,0.0,0.0,0.67,5.28,0.0,5.28,0.67,0.0,5.95,5.95
3,128968633,2232732102103663163_furniture.bedroom.blanket,2232732102103663163,0.0,0.0,4.717e-07,1.0,1.34,5.61,0.1577524457,5.61,1.34,1.0000004717,7.9500004717,8.1077529174
4,128968633,2232732108613223108_sport.trainer,2232732108613223108,0.0,0.0,0.0,2.0,1.34,6.6,0.8422475543,6.6,1.34,2.0,9.94,10.7822475543


In [37]:
pd.options.display.float_format = '{:.15f}'.format
# Viewing distribution of the implicit preference score
cust_implicit_df['implicit_preference_score'].describe()

count   2620267.000000000000000
mean         10.033447771957125
std          21.183431028562293
min           0.330000000000000
25%           1.650000000000000
50%           5.106648017198954
75%          11.814575994793909
max       12886.017174747899844
Name: implicit_preference_score, dtype: float64

In [38]:
cust_implicit_df.nunique()

user_id                       914574
category                         927
category_id                      927
recency_value_view             13926
recency_value_cart             13932
recency_value_purchase         13445
frequency_value_purchase         280
frequency_value_cart             368
frequency_value_view            1127
spent_value                  1086656
view_event_score              341989
cart_event_score              168124
purchase_event_score          107576
agg_event_scores              995566
implicit_preference_score    1596984
dtype: int64

In [39]:
# Keeping only wanted columns
cust_implicit_df = cust_implicit_df[['user_id','category', 'category_id','implicit_preference_score']]

In [40]:
cust_implicit_df['implicit_preference_score'].describe()

count   2620267.000000000000000
mean         10.033447771957125
std          21.183431028562293
min           0.330000000000000
25%           1.650000000000000
50%           5.106648017198954
75%          11.814575994793909
max       12886.017174747899844
Name: implicit_preference_score, dtype: float64

In [41]:
# Scailing score from 1 to 5 to create implicit rating column

cust_implicit_df['implicit_rating'] = cust_implicit_df['implicit_preference_score']

# Handling the skew distribution of the implicit_rating column with boxcox transformation
cust_implicit_df['implicit_rating'] = stats.boxcox(cust_implicit_df['implicit_rating'])[0] 

# Scailing transformed column to 1 to 5
cust_implicit_df['implicit_rating'] -= cust_implicit_df['implicit_rating'].min()

cust_implicit_df['implicit_rating'] /= cust_implicit_df['implicit_rating'].max()

cust_implicit_df['implicit_rating']=cust_implicit_df['implicit_rating']*5+1

# Cleaning ratings
cust_implicit_df.loc[cust_implicit_df['implicit_rating'] > 5,'implicit_rating'] = 5 
# rounding rating column to make sure all ratings are int
cust_implicit_df['implicit_rating'] = cust_implicit_df['implicit_rating'].round().astype(int)

In [42]:
cust_implicit_df['implicit_rating'].describe()#Checking distribution of values

count   2620267.000000000000000
mean          1.869310646586779
std           0.588452329550929
min           1.000000000000000
25%           2.000000000000000
50%           2.000000000000000
75%           2.000000000000000
max           5.000000000000000
Name: implicit_rating, dtype: float64

In [43]:
cust_implicit_df.nunique()

user_id                       914574
category                         927
category_id                      927
implicit_preference_score    1596984
implicit_rating                    5
dtype: int64

In [44]:
cust_implicit_df.head()

Unnamed: 0,user_id,category,category_id,implicit_preference_score,implicit_rating
0,128968633,2232732093077520756_construction.tools.light,2232732093077520756,27.32547169811321,3
1,128968633,2053013552351936731_appliances.environment.air...,2053013552351936731,1.98,2
2,128968633,2053013555631882655_electronics.smartphone,2053013555631882655,5.95,2
3,128968633,2232732102103663163_furniture.bedroom.blanket,2232732102103663163,8.107752917418452,2
4,128968633,2232732108613223108_sport.trainer,2232732108613223108,10.782247554279664,2


In [45]:
cust_implicit_df.loc[cust_implicit_df['implicit_rating'] == 1]

Unnamed: 0,user_id,category,category_id,implicit_preference_score,implicit_rating
6,192078182,2053013565983425517_appliances.environment.vacuum,2053013565983425517,1.000000000000000,1
9,200985178,2053013555631882655_electronics.smartphone,2053013555631882655,0.330000000000000,1
10,200985178,2053013555573162395_electronics.telephone,2053013555573162395,0.660016415094340,1
12,221480173,2053013553140465927_kids.toys,2053013553140465927,0.330000000000000,1
15,237470903,2053013558920217191_computers.notebook,2053013558920217191,0.660000000000000,1
...,...,...,...,...,...
2620199,568957427,2232732097993245155_apparel.shoes.sandals,2232732097993245155,0.330003443396226,1
2620225,641380645,2232732104569913971_appliances.kitchen.grill,2232732104569913971,0.330000000000000,1
2620230,648254901,2102305809651204547_appliances.kitchen.dishwasher,2102305809651204547,0.330000000000000,1
2620247,558928662,2053013559289315959_apparel.shoes,2053013559289315959,0.330000486914181,1


In [46]:
cust_implicit_df.describe()

Unnamed: 0,user_id,category_id,implicit_preference_score,implicit_rating
count,2620267.0,2620267.0,2620267.0,2620267.0
mean,557123092.0283353,2.1495819183831012e+18,10.033447771957125,1.869310646586779
std,37733781.842346005,8.899340865285515e+16,21.183431028562293,0.588452329550929
min,128968633.0,2.053013551857009e+18,0.33,1.0
25%,520023203.0,2.0530135556318828e+18,1.65,2.0
50%,551618573.0,2.2327320797060792e+18,5.106648017198954,2.0
75%,585410439.0,2.232732093077521e+18,11.814575994793907,2.0
max,649770848.0,2.2920440759829133e+18,12886.0171747479,5.0


In [47]:
cust_implicit_df.shape

(2620267, 5)

In [48]:
# Saving df in S3 - This df can be used to train models like CF, BPR, among others that just require a rating
cust_implicit_df = cust_implicit_df[['user_id','category', 'category_id','implicit_rating']]

# Creating short catID for models like PySpark CF ALS that expect a smaller range id
# instantiating labelencoder object
le = LabelEncoder()

cust_implicit_df['catID'] = le.fit_transform(cust_implicit_df['category'])
cust_implicit_df.head()

Unnamed: 0,user_id,category,category_id,implicit_rating,catID
0,128968633,2232732093077520756_construction.tools.light,2232732093077520756,3,668
1,128968633,2053013552351936731_appliances.environment.air...,2053013552351936731,2,16
2,128968633,2053013555631882655_electronics.smartphone,2053013555631882655,2,107
3,128968633,2232732102103663163_furniture.bedroom.blanket,2232732102103663163,2,734
4,128968633,2232732108613223108_sport.trainer,2232732108613223108,2,788


In [49]:
# Making sure we only have on user/product per row
cust_implicit_df = cust_implicit_df.drop_duplicates(subset=['user_id','category', 'category_id'])
cust_implicit_df.to_csv('s3://myaws-capstone-bucket/data/implicit_rating_category.csv',index=False)

In [50]:
cust_implicit_df.shape

(2620267, 5)

In [51]:
cust_implicit_df.nunique()

user_id            914574
category              927
category_id           927
implicit_rating         5
catID                 927
dtype: int64

In [52]:
# Preparing input data for modeling

# Creating train, validation and test datasets for implicit_cat rating data

# Trains and Test split will be 70/30
train_df, test_df = train_test_split(cust_implicit_df, test_size=0.30,random_state=42)

# Splitting the training set for validation data
val_train_df, val_test_df = train_test_split(train_df, test_size=0.30,random_state=42)

# Saving this df in S3
train_df.to_csv('s3://myaws-capstone-bucket/data/modeling/input/implicit_cat_rating_train.csv',index=False)
test_df.to_csv('s3://myaws-capstone-bucket/data/modeling/input/implicit_cat_rating_test.csv',index=False)
val_train_df.to_csv('s3://myaws-capstone-bucket/data/modeling/input/implicit_cat_rating_val_train.csv',index=False)
val_test_df.to_csv('s3://myaws-capstone-bucket/data/modeling/input/implicit_cat_rating_val_test.csv',index=False)

In [53]:
print(train_df.shape[0])
print(test_df.shape[0])
print(val_train_df.shape[0])
print(val_test_df.shape[0])

1834186
786081
1283930
550256


#### Reading categories purchased by each user with the timestamp
- These records were created on the notebook - purchase_dataset_gathering.ipynb
- These records will be used to train and test models like CF with Neural Networks (hybrid recommendation model), by combining the event time with the implicit rating df we just created.
- These records can also be used for RNN, CNN or Basket Market Analysis (Association Rules) recommendation models


In [54]:
# Reading purchase dataset
purchase_df = pd.read_csv('s3://myaws-capstone-bucket/eCommerce_purchase_data.csv')
purchase_df.head()

Unnamed: 0,user_id,user_session,event_time,category_code,category_id,brand,product_id,category
0,543272936,8187d148-3c41-46d4-b0c0-9c08cd9dc564,2019-10-01 00:02:14 UTC,electronics.smartphone,2053013555631882655,samsung,1004856,2053013555631882655_electronics.smartphone
1,551377651,3c80f0d6-e9ec-4181-8c5c-837a30be2d68,2019-10-01 00:04:37 UTC,electronics.smartphone,2053013555631882655,apple,1002532,2053013555631882655_electronics.smartphone
2,555332717,1dea3ee2-2ded-42e8-8e7a-4e2ad6ae942f,2019-10-01 00:07:07 UTC,furniture.bathroom.toilet,2053013557418656265,santeri,13800054,2053013557418656265_furniture.bathroom.toilet
3,524601178,2af9b570-0942-4dcd-8f25-4d84fba82553,2019-10-01 00:09:26 UTC,electronics.audio.headphone,2053013554658804075,apple,4804055,2053013554658804075_electronics.audio.headphone
4,551377651,3c80f0d6-e9ec-4181-8c5c-837a30be2d68,2019-10-01 00:09:54 UTC,electronics.audio.headphone,2053013554658804075,apple,4804056,2053013554658804075_electronics.audio.headphone


In [55]:
# Combining purchase event_time with cust_implicit_df for training and testing models that require both rating and timestamp
cust_implicit_df = pd.merge(cust_implicit_df, purchase_df[['user_id','category','category_id','event_time']], on=["user_id",'category','category_id'], how='inner')
cust_implicit_df.head()

Unnamed: 0,user_id,category,category_id,implicit_rating,catID,event_time
0,128968633,2232732102103663163_furniture.bedroom.blanket,2232732102103663163,2,734,2019-12-31 10:09:41 UTC
1,128968633,2232732108613223108_sport.trainer,2232732108613223108,2,788,2019-12-31 11:30:56 UTC
2,128968633,2232732108613223108_sport.trainer,2232732108613223108,2,788,2019-12-31 15:30:09 UTC
3,192078182,2232732093077520756_construction.tools.light,2232732093077520756,2,668,2020-03-11 05:47:37 UTC
4,192078182,2232732101063475749_appliances.environment.vacuum,2232732101063475749,2,725,2020-01-17 12:51:40 UTC


In [58]:
cust_implicit_df.nunique()

user_id             914574
category               910
category_id            910
implicit_rating          5
catID                  910
event_time         3273883
dtype: int64

In [57]:
# Saving this df in S3
cust_implicit_df.to_csv('s3://myaws-capstone-bucket/data/implicit_cat_rating_timestamp.csv',index=False)

To have the exact same trains, test and validation users across all the modesl doing merge with original train,test and validation dfs

In [60]:
# Reading Training,validation and testing dfs - Using the created implicit_cat data with timestamp
cust_implicit_df = pd.read_csv('s3://myaws-capstone-bucket/data/implicit_cat_rating_timestamp.csv')
train_df = pd.read_csv('s3://myaws-capstone-bucket/data/modeling/input/implicit_cat_rating_train.csv')
val_train_df = pd.read_csv('s3://myaws-capstone-bucket/data/modeling/input/implicit_cat_rating_val_train.csv')
test_df = pd.read_csv('s3://myaws-capstone-bucket/data/modeling/input/implicit_cat_rating_test.csv')
val_test_df = pd.read_csv('s3://myaws-capstone-bucket/data/modeling/input/implicit_cat_rating_val_test.csv')

T_train_df = pd.merge(cust_implicit_df, train_df[['user_id']], on=["user_id"], how='inner')
T_val_train_df = pd.merge(cust_implicit_df, val_train_df[['user_id']], on=["user_id"], how='inner')
T_test_df = pd.merge(cust_implicit_df, test_df[['user_id']], on=["user_id"], how='inner')
T_val_test_df = pd.merge(cust_implicit_df, val_test_df[['user_id']], on=["user_id"], how='inner')

In [61]:
# Keeping only one cateory user combination per event time
T_train_df = T_train_df.drop_duplicates(subset=['user_id','category', 'category_id','event_time'])
T_val_train_df = T_val_train_df.drop_duplicates(subset=['user_id','category', 'category_id','event_time'])
T_test_df = T_test_df.drop_duplicates(subset=['user_id','category', 'category_id','event_time'])
T_val_test_df = T_val_test_df.drop_duplicates(subset=['user_id','category', 'category_id','event_time'])

In [62]:
print(T_train_df.nunique())
print(T_test_df.nunique())
print(T_val_train_df.nunique())
print(T_val_test_df.nunique())

user_id             839183
category               910
category_id            910
implicit_rating          5
catID                  910
event_time         3047065
dtype: int64
user_id             548860
category               903
category_id            903
implicit_rating          5
catID                  903
event_time         2089766
dtype: int64
user_id             727477
category               910
category_id            910
implicit_rating          5
catID                  910
event_time         2693000
dtype: int64
user_id             426789
category               890
category_id            890
implicit_rating          5
catID                  890
event_time         1661444
dtype: int64


In [63]:
print(train_df.nunique())
print(test_df.nunique())
print(val_train_df.nunique())
print(val_test_df.nunique())

user_id            839183
category              924
category_id           924
implicit_rating         5
catID                 924
dtype: int64
user_id            548860
category              916
category_id           916
implicit_rating         5
catID                 916
dtype: int64
user_id            727477
category              920
category_id           920
implicit_rating         5
catID                 920
dtype: int64
user_id            426789
category              908
category_id           908
implicit_rating         5
catID                 908
dtype: int64


In [64]:
print(T_train_df.shape[0])
print(T_test_df.shape[0])
print(T_val_train_df.shape[0])
print(T_val_test_df.shape[0])

3559313
2322074
3087616
1806424


In [65]:
# Saving this df in S3
T_train_df.to_csv('s3://myaws-capstone-bucket/data/modeling/input/T_implicit_cat_rating_train.csv',index=False)
T_test_df.to_csv('s3://myaws-capstone-bucket/data/modeling/input/T_implicit_cat_rating_test.csv',index=False)
T_val_train_df.to_csv('s3://myaws-capstone-bucket/data/modeling/input/T_implicit_cat_rating_val_train.csv',index=False)
T_val_test_df.to_csv('s3://myaws-capstone-bucket/data/modeling/input/T_implicit_cat_rating_val_test.csv',index=False)

In [None]:
T_test_df.head(30)