In [1]:
import pandas as pd
import numpy as np
import os
import csv
from datetime import datetime
import pytz
import json

In [2]:
from matplotlib import pyplot as plt
%matplotlib inline

source: https://www.kaggle.com/mkechinov/ecommerce-behavior-data-from-multi-category-store?select=2019-Nov.csv

In [3]:
# Directory (relative path) that receives every CSV exported by this notebook;
# it is joined with the individual file names via os.path.join below.
data_folder = "../data/input_data/"

In [4]:
def info_df(df):
    """Print summary statistics of an event DataFrame.

    The frame must provide the columns ``user_id``, ``product_id`` and
    ``category_id``. Printed, in order: the number of rows, the number of
    distinct users / products / categories, the median number of events per
    user and the median number of distinct categories per user.

    Nothing is returned; the report goes to stdout.
    """
    print(f"Nr. of events: {len(df)}")

    # One line per id column: how many distinct values it holds.
    for label, column in (
        ("users", "user_id"),
        ("products", "product_id"),
        ("categories", "category_id"),
    ):
        distinct_values = df[column].unique()
        print(f"Nr. {label}: {len(distinct_values)}")

    # Per-user activity: non-null product_id entries counted as events.
    events_per_user = df.groupby("user_id")["product_id"].count()
    print(f"Median events by user: {np.median(events_per_user)}")

    # Per-user breadth: number of distinct categories touched.
    categories_per_user = df.groupby("user_id")["category_id"].nunique()
    print(f"Median categories by user: {np.median(categories_per_user)}")

In [5]:
# Load the raw event log (file 2019-Nov.csv). Later cells rely on its columns
# event_type, product_id, user_id, category_id and event_time.
df = pd.read_csv("../datasets/2019-Nov.csv")

In [6]:
# Keep one row per (event_type, product_id, user_id) combination; the first
# occurrence in row order is the one that survives.
dedup_keys = ["event_type", "product_id", "user_id"]
df = df.drop_duplicates(subset=dedup_keys, keep="first")

In [7]:
# Number of distinct products purchased by each user.
purchase_events = df.loc[df["event_type"] == "purchase"]
df_agg = (
    purchase_events
    .groupby("user_id")["product_id"]
    .nunique()
    .reset_index(name="nr_products")
)
# Keep users with more than 3 (i.e. at least 4) distinct purchased products.
df_agg = df_agg.loc[df_agg["nr_products"] > 3]
# Restrict the full event log (all event types) to those users.
df_sample = pd.merge(df, df_agg["user_id"], how="inner", on="user_id")

In [8]:
# Take a sample of 10,000 users: the first 10000 distinct user ids in row
# order (a deterministic head, not a random draw).
sampled_user_ids = df_sample["user_id"].drop_duplicates().head(10000)
df_sample = pd.merge(df_sample, sampled_user_ids, how="inner", on="user_id")

In [9]:
# Summary statistics of the sampled event log (all event types).
info_df(df_sample)

Nr. of events: 759113
Nr. users: 10000
Nr. products: 94698
Nr. categories: 652
Median events by user: 51.0
Median categories by user: 9.0


## Create a test set: hold out one purchase per user (the first one)

In [10]:
# Hold out one purchase per user as the test set: the user's first purchase
# row, in df_sample row order.
#
# GroupBy.first() is deliberately not used here. It returns the first
# *non-null* value of every column independently, so if a user's first
# purchase row has a missing value in some column (e.g. brand), that cell
# would be filled from a different purchase of the same user and the held-out
# row would no longer describe a single real event.
test = (
    df_sample[df_sample["event_type"] == "purchase"]
    .drop_duplicates(subset="user_id", keep="first")  # whole first row per user
    .sort_values("user_id")                           # same ordering as a groupby result
    .reset_index(drop=True)
)
# Same column layout as before: user_id first, remaining columns in their
# original order.
test = test[["user_id"] + [col for col in test.columns if col != "user_id"]]

In [11]:
# Summary statistics of the held-out test purchases.
print("Info test set:\n")
info_df(test)

Info test set:

Nr. of events: 10000
Nr. users: 10000
Nr. products: 4103
Nr. categories: 370
Median events by user: 1.0
Median categories by user: 1.0


In [78]:
# Preview the first rows of the test set.
test.head()

Unnamed: 0,user_id,event_time,event_type,product_id,category_id,category_code,brand,price,user_session
0,512363650,2019-11-17 13:06:16 UTC,purchase,11200419,2053013562946749253,appliances.personal.scales,maxwell,12.36,593714cc-a8b0-4d65-afab-f6e29305424d
1,512363895,2019-11-02 14:38:14 UTC,purchase,21405841,2053013561579406073,electronics.clocks,casio,216.48,28f5f12c-4b0b-4306-ba5a-3d22c1dc5709
2,512363921,2019-11-16 10:40:33 UTC,purchase,1004856,2053013555631882655,electronics.smartphone,samsung,128.42,60189f57-f6a1-4cb7-ae49-46bc1cfec632
3,512363973,2019-11-01 12:06:42 UTC,purchase,1004838,2053013555631882655,electronics.smartphone,oppo,178.87,f760187d-7715-42e7-b387-78e7c5ddd7d5
4,512364466,2019-11-16 13:49:58 UTC,purchase,20100169,2053013559473865347,furniture.bedroom.blanket,nika,82.37,dc257874-388c-4dc7-b6ed-efca7e7d9557


### Save to csv

In [12]:
# Columns written to every exported CSV, in this order (also used by the
# later export cells).
cols = [
    "event_time", "event_type", "product_id", "category_id",
    "category_code", "brand", "price", "user_id",
]

# Test purchases. No header row is written; the DataFrame index is emitted as
# the leading field of each line (to_csv default).
test_purch_path = os.path.join(data_folder, "2019-Nov-sample-test-purch.csv")
test[cols].to_csv(test_purch_path, header=False)

# User ids of the test rows, one per line, preceded by a fresh 0..n-1 index.
test_userid_path = os.path.join(data_folder, "2019-Nov-sample-test-userid.csv")
test["user_id"].reset_index(drop=True).to_csv(test_userid_path, header=False)

### For the first recommender (purchase): consider only purchase events

In [13]:
# Purchase events of the sampled users.
is_purchase = df_sample["event_type"] == "purchase"
df_p = df_sample.loc[is_purchase]

### Split train/test: create a train set by excluding the user-item pairs that are in the test set above

In [14]:
merge_on = ["product_id", "user_id", "event_type"]
# Anti-join against the test set: a left merge with an indicator column marks
# every purchase row whose (product_id, user_id, event_type) key also occurs
# in `test`; only rows found on the left side alone are kept for training.
purchases_flagged = pd.merge(
    df_p, test[merge_on], how="left", on=merge_on, indicator=True
)
only_in_train = purchases_flagged["_merge"] == "left_only"
train_p = purchases_flagged.loc[only_in_train].drop(columns="_merge")

In [15]:
# Summary statistics of the purchase-only training set.
print("Info train set: \n")
info_df(train_p)

Info train set: 

Nr. of events: 55214
Nr. users: 10000
Nr. products: 14981
Nr. categories: 529
Median events by user: 4.0
Median categories by user: 3.0


### Save to csv

In [16]:
# Parse event_time into timezone-aware UTC timestamps before exporting.
# Plain column assignment replaces the column (and its dtype). The previous
# `train_p.loc[:, "event_time"] = ...` form writes into the existing object
# column in place on pandas 2.x, which leaves the dtype as `object`.
# Note: train_p is reused by later cells, which therefore see timestamps here.
train_p["event_time"] = pd.to_datetime(train_p["event_time"], utc=True)

# No header row; the DataFrame index is written as the leading field.
train_p[cols].to_csv(os.path.join(data_folder, "2019-Nov-sample-train-purch.csv"), header=False)


### For the second recommender (purchase, view category): consider purchases and category views

In [17]:
# One view event per (user, category) pair: the first such row in df_sample.
# NOTE(review): unlike the cart events further down, views are not filtered
# against the (user, product) pairs held out in `test` - confirm this is
# intended.
view_events = df_sample.loc[df_sample["event_type"] == "view"]
df_view_cat = view_events.drop_duplicates(subset=["user_id", "category_id"])

# Training set for the second recommender: purchases stacked on top of the
# category views (original row indices of both frames are kept).
train_pv = pd.concat([train_p, df_view_cat], axis=0)

In [18]:
# Summary statistics of the purchase + category-view training set.
print("Info train set: \n")
info_df(train_pv)

Info train set: 

Nr. of events: 184225
Nr. users: 10000
Nr. products: 39267
Nr. categories: 652
Median events by user: 14.0
Median categories by user: 9.0


### Save to csv

In [19]:
# train_pv stacks train_p (event_time already converted to timestamps when it
# was exported) on top of raw view rows (event_time still the original
# "... UTC" strings). Written as-is, the column would contain two different
# text formats in one file. Normalise to UTC timestamps on a copy so the
# export is uniform and train_pv itself is left untouched.
train_pv_export = train_pv.assign(
    event_time=pd.to_datetime(train_pv["event_time"], utc=True)
)
# No header row; the DataFrame index is written as the leading field.
train_pv_export[cols].to_csv(
    os.path.join(data_folder, "2019-Nov-sample-train-purch-view.csv"), header=False
)


### For third recommender (purchase, cart, view category): consider purchases, view and cart

In [20]:
# Cart events, minus every (product_id, user_id) pair whose purchase is held
# out in the test set, so the training data does not reveal the test item.
merge_on = ["product_id", "user_id"]
cart_events = df_sample.loc[df_sample["event_type"] == "cart"]
cart_flagged = pd.merge(
    cart_events, test[merge_on], how="left", on=merge_on, indicator=True
)
# Rows present on the left side only have no matching test pair.
df_c = cart_flagged.loc[cart_flagged["_merge"] == "left_only"].drop(columns="_merge")

In [21]:
# Training set for the third recommender: purchases, category views and cart
# events stacked row-wise, in that order.
train_parts = [train_p, df_view_cat, df_c]
train_pvc = pd.concat(train_parts, axis=0)

In [22]:
# Summary statistics of the purchase + category-view + cart training set.
print("Info train set: \n")
info_df(train_pvc)

Info train set: 

Nr. of events: 255972
Nr. users: 10000
Nr. products: 42096
Nr. categories: 652
Median events by user: 20.0
Median categories by user: 9.0


### Save to csv

In [23]:
# train_pvc mixes train_p rows (event_time converted to timestamps when
# train_p was exported) with raw view and cart rows (event_time still the
# original "... UTC" strings). Normalise the column to UTC timestamps on a
# copy so the file uses a single timestamp format.
train_pvc_export = train_pvc.assign(
    event_time=pd.to_datetime(train_pvc["event_time"], utc=True)
)
# No header row; the DataFrame index is written as the leading field.
train_pvc_export[cols].to_csv(
    os.path.join(data_folder, "2019-Nov-sample-train-purch-view-cart.csv"), header=False
)
