In this notebook I created dataframes that contain most popular products based on views, add-to-cart and purchases. These dataframes are used for evaluating collaborative recommender system in the first notebook. Also simple recommender that recomends most purchased products for unknows customer is build in this notebook.

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [2]:
events = pd.read_csv("dataset_events.csv")
print(events.shape)
events.head()

(653419, 4)


Unnamed: 0,customer_id,product_id,type,timestamp
0,1,19685,view_product,1527812004
1,1,19685,view_product,1527812041
2,1,19685,add_to_cart,1527812046
3,1,19685,view_product,1527812048
4,1,19685,view_product,1527812050


In [3]:

events.loc[pd.to_datetime(events['timestamp']).index, 'date'] = pd.to_datetime(events['timestamp']).values
events['date'] = events['date'].apply(lambda x: str(x).split()[0])
events['date'].value_counts(), events.shape

(1970-01-01    653419
 Name: date, dtype: int64, (653419, 5))

In [4]:
purchases = events[events.type == 'purchase_item'].drop(columns=['timestamp', 'date'])
carts = events[events.type == 'add_to_cart'].drop(columns=['timestamp', 'date'])
views = events[events.type == 'view_product'].drop(columns=['timestamp', 'date'])

purchases.head()

Unnamed: 0,customer_id,product_id,type
161,44,24129,purchase_item
162,44,3814,purchase_item
360,121,16959,purchase_item
396,132,3536,purchase_item
649,234,26045,purchase_item


In [5]:
purchases_with_no_duplicates = purchases.drop_duplicates(['product_id', 'customer_id'])
add_cart_with_no_duplicates = carts.drop_duplicates(['product_id', 'customer_id'])
views_with_no_duplicates = views.drop_duplicates(['product_id', 'customer_id'])

purchases_with_no_duplicates.head()

Unnamed: 0,customer_id,product_id,type
161,44,24129,purchase_item
162,44,3814,purchase_item
360,121,16959,purchase_item
396,132,3536,purchase_item
649,234,26045,purchase_item


In [6]:
grouped_purchases_count_by_product_id = purchases_with_no_duplicates.groupby(['product_id']).customer_id.agg('count').to_frame('n_unique_purchases').reset_index()
sorted_popular_items_by_purchase = grouped_purchases_count_by_product_id.sort_values('n_unique_purchases', ascending=False)

grouped_carts_count_by_product_id = add_cart_with_no_duplicates.groupby(['product_id']).customer_id.agg('count').to_frame('n_unique_cart_adds').reset_index()
sorted_popular_items_by_cart_add = grouped_carts_count_by_product_id.sort_values('n_unique_cart_adds', ascending=False)

grouped_views_count_by_product_id = views_with_no_duplicates.groupby(['product_id']).customer_id.agg('count').to_frame('n_unique_views').reset_index()
sorted_popular_items_by_views = grouped_views_count_by_product_id.sort_values('n_unique_views', ascending=False)

sorted_popular_items_by_views.head()

Unnamed: 0,product_id,n_unique_views
22024,22031,2052
11215,11219,1884
25009,25016,1832
21115,21122,1621
20584,20591,1473


In [7]:
sorted_popular_items_by_views.product_id.values

array([22031, 11219, 25016, ...,  5599,  5600, 28369], dtype=int64)

These outputs are imported in the first notebooks.

In [14]:
np.save("sorted_popular_items_by_purchase", sorted_popular_items_by_purchase.product_id.values)
np.save("sorted_popular_items_by_cart_add", sorted_popular_items_by_cart_add.product_id.values)
np.save("sorted_popular_items_by_views", sorted_popular_items_by_views.product_id.values)

<h2>Popular item recommender</h2>

In [11]:

from abc import ABC, abstractmethod

class RecommenderABC(ABC):
    
    @abstractmethod
    def train(self):
        pass
    
    @abstractmethod
    def can_recommend(self):
        pass
    
    @abstractmethod
    def recommend(self):
        pass

import pandas as pd
import numpy as np
import os

class PopRecommender(RecommenderABC):

    def __init__(self):
        self.sorted_popular_item_ids = None

    def train(self):
        events = pd.read_csv("dataset_events.csv")

        purchases = events[events.type == 'purchase_item'].drop(columns=['timestamp'])
        purchases_with_no_duplicates = purchases.drop_duplicates(['product_id', 'customer_id'])

        grouped_purchases_count_by_product_id = purchases_with_no_duplicates.groupby(['product_id']).customer_id.agg('count').to_frame('n_unique_purchases').reset_index()

        sorted_popular_items = grouped_purchases_count_by_product_id.sort_values('n_unique_purchases', ascending=False)
        
        self.sorted_popular_item_ids = sorted_popular_items.product_id.values
        print('pop recommender trained')
    
    def can_recommend(self):
        return True

    def recommend(self, n):
        if self.sorted_popular_item_ids is None: self.train()
        return self.sorted_popular_item_ids[:n].tolist()

In [13]:
popRecommender = PopRecommender()
popRecommender.train()
popRecommender.recommend(50)

pop recommender trained


[11219,
 22031,
 16959,
 24848,
 24851,
 3526,
 20585,
 24849,
 24846,
 4232,
 20591,
 3617,
 23886,
 23889,
 3525,
 3524,
 24850,
 26739,
 4231,
 8785,
 23872,
 22033,
 22030,
 19191,
 24845,
 20586,
 24847,
 26741,
 4130,
 22032,
 26719,
 22188,
 26721,
 20590,
 26720,
 24457,
 23566,
 4230,
 8783,
 20592,
 24464,
 26738,
 24052,
 19190,
 24137,
 23450,
 21122,
 20584,
 3477,
 21413]