In [152]:
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.preprocessing import StandardScaler
from annoy import AnnoyIndex
from sklearn.decomposition import PCA, IncrementalPCA

In [98]:
# Raw October 2019 event log from the multi-category store dataset.
# Source: https://www.kaggle.com/mkechinov/ecommerce-behavior-data-from-multi-category-store?select=2019-Oct.csv
df = pd.read_csv(filepath_or_buffer='2019-Oct.csv')

In [9]:
# Keep a random 1% of the rows to limit the amount of data used in training.
# A fixed random_state makes the subset (and everything derived from it)
# reproducible across kernel restarts.
# NOTE: this cell overwrites `df`, so re-running it without reloading the
# CSV samples 1% of the already-sampled frame.
df = df.sample(frac=0.01, random_state=42)
# Non-null count per column, as a quick sanity check of the sample.
df.count()

event_time       424488
event_type       424488
product_id       424488
category_id      424488
category_code    289385
brand            363443
price            424488
user_id          424488
user_session     424488
dtype: int64

In [10]:
# Persist the sampled events so later sessions can skip the CSV load.
df.to_pickle(path="purchase_data.pkl")

# To resume from the saved sample, uncomment:
# df = pd.read_pickle("purchase_data.pkl")

In [24]:
# Count events per (user, product, event type) combination; event types in
# this dataset are view, cart, and purchase. The number of non-null
# event_time values in each group becomes the 'count' column.
user = (
    df.groupby(['user_id', 'product_id', 'event_type'])['event_time']
    .count()
    .reset_index(name='count')
)

In [25]:
# Cap each per-event-type count at 2: a value of 1 means the event happened
# once, while 2 stands for "more than once" (greater interest).
user['count'] = user['count'].clip(upper=2)

In [27]:
# Group by user_id and product_id (drop event_type) and add up the capped
# counts. For a user who viewed, added to cart, and purchased a product
# more than once, the value will be 6. A customer who only viewed a
# product once will have a value of 1 for the product.
# Only 'count' is summed: selecting it explicitly keeps the string column
# event_type out of the aggregation, so the result has the same columns
# (user_id, product_id, count) regardless of the pandas version's
# numeric_only default.
user = user.groupby(['user_id', 'product_id'])['count'].sum().reset_index()

In [None]:
# Create 'document' of user-product relationship by repeating the 
# product_id by the value created above. For example, a user who
# had a value of 6 will have the product id repeated 6 times in the
# item column.
def id_repeater(row):
    """Return the row's product id repeated ``count`` times, space-separated.

    Parameters
    ----------
    row : mapping-like (e.g. a DataFrame row)
        Must provide ``row['product_id']`` and ``row['count']``.

    Returns
    -------
    str
        ``str(product_id)`` repeated ``count`` times and joined with single
        spaces; an empty string when ``count`` is 0.
    """
    token = str(row['product_id'])
    # Row-wise apply can hand back the count as a NumPy or float scalar;
    # coerce it to a plain int so list repetition always works.
    repeats = int(row['count'])
    return ' '.join([token] * repeats)

# Run id_repeater on every row to build that row's 'item' string.
user['item'] = user.apply(id_repeater, axis='columns')

In [30]:
# Concatenate the per-product 'item' strings of each user into a single
# space-separated string: the final item 'document', one row per user_id.
items_by_user = user.groupby(['user_id'])['item']
user = items_by_user.apply(' '.join).reset_index()

In [None]:
# Keep a tokenised copy of each document (split on single spaces) in
# 'list_item' for later retrieval.
user['list_item'] = user['item'].map(lambda item_doc: item_doc.split(" "))

In [None]:
# Reindex: remove a leftover 'index' column if one exists and renumber the
# rows 0..n-1. errors='ignore' keeps this cell from raising KeyError when
# the frame has no 'index' column, and rebinding `user` (instead of
# inplace=True) makes the cell safe to re-run.
user = user.drop(columns=['index'], errors='ignore').reset_index(drop=True)

In [31]:
# Write the full user frame to disk before it is downsampled.
user.to_pickle(path="user_dataframe.pkl")

In [32]:
# Downsample user to allow PCA to run: keep a random 2% of the users.
# A fixed random_state makes the reduced set reproducible across runs.
user = user.sample(frac=0.02, random_state=42)
print(user.shape)

# Save data
user.to_pickle("user_dataframe_reduced.pkl")

# To resume from the saved reduced frame, uncomment:
# user = pd.read_pickle("user_dataframe_reduced.pkl")

(6411, 2)