In [22]:
import torch
import pandas as pd
import numpy as np
import torch.nn.functional as F
import torch.nn as nn
import random
import time

from torch.utils.data import TensorDataset, DataLoader
from utils import *
from model import Recommendation

DEVICE = 'cuda' if torch.cuda.is_available() else 'cpu'


def preprocessing(data):
    print('Number of rows removed:')
    print('Quantity:\t{:>6} row(s)'.format((data.Quantity<0).sum()))
    data = data[filter_negative(data.Quantity)]
    
    print('Unit Price:\t{:>6} row(s)'.format((data.UnitPrice<0).sum()))
    data = data[filter_negative(data.UnitPrice)]
    
    print('Customer ID:\t{:>6} row(s)'.format(data.CustomerID.isnull().sum()))
    data = data[data.CustomerID.notnull()]
    
    print('\nData final shape: ', data.shape)
    return data
    

def prepare_dataset(data, embedidx_item):
    train = set()
    invoices = set(data.InvoiceNo)
    for invoice in invoices:
        data_ = data[data.InvoiceNo == invoice]
        items_ = list(set(data_.EmbeddingID))
        if len(items_) > 1:
            for i in range(len(items_)-1):
                for j in range(i+1, len(items_)):
                    if (items_[i], items_[j]) not in train:
                        train.add((items_[i], items_[j], 1))
    
    # Create negative training data
    for _ in range(len(train)):
        x, y = random.sample(range(len(embedidx_item)), 2)
        if (x, y, 1) not in train:
            train.add((x, y, -1))
            
    return torch.tensor(list(train))


def generate_recommendation(code, matrix):
    item = item_embedidx[code]  # Take item embedding matrix
    
    #Calculate score
    score = torch.mm(matrix, matrix[item].unsqueeze(1)).view(-1)
    idx_sort = torch.argsort(score, descending=True)
    
    print('Top 5 frequently bought together with "{}": '.format(items[code]))
    for i in idx_sort[1:6].numpy():
        item_code = embedidx_item[i]
        print('Item code: {}\t Similarity: {:4f}\t Name: {}'.format(item_code,
                                                                    score[i].numpy(),
                                                                    items[item_code]
                                                                   ))



data = read_data('data.csv')
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 541909 entries, 0 to 541908
Data columns (total 7 columns):
 #   Column       Non-Null Count   Dtype  
---  ------       --------------   -----  
 0   InvoiceNo    541909 non-null  object 
 1   StockCode    541909 non-null  object 
 2   Description  540455 non-null  object 
 3   Quantity     541909 non-null  int64  
 4   UnitPrice    541909 non-null  float64
 5   CustomerID   406829 non-null  float64
 6   Country      541909 non-null  object 
dtypes: float64(2), int64(1), object(4)
memory usage: 28.9+ MB


# 1. PREPROCESSING DATA
Based on information generated from `DataFrame.info()`:
* `InvoiceNo`: no filter applied</i>
* `StockCode`: no filter applied
* `Description`: no filter applied even when there are lots of null
* `Quantity`: filter negative value
* `UnitPrice`: filter negative value
* `CustomerID`: filter null value
* `Country`: no filter applied
* Add a new column `Value` = `Quantity` * `UnitPrice`

In [7]:
data_filtered = preprocessing(data)   
data_filtered['Value'] = data_filtered.Quantity * data_filtered.UnitPrice
print(data_filtered.describe())
print('\n\nNew DataFrame:\n ',data_filtered.head())

Number of rows removed:
Quantity:	 10624 row(s)
Unit Price:	     2 row(s)
Customer ID:	132220 row(s)

Data final shape:  (397884, 7)
            Quantity      UnitPrice     CustomerID          Value
count  397884.000000  397884.000000  397884.000000  397884.000000
mean       12.988238       3.116488   15294.423453      22.397000
std       179.331775      22.097877    1713.141560     309.071041
min         1.000000       0.001000   12346.000000       0.001000
25%         2.000000       1.250000   13969.000000       4.680000
50%         6.000000       1.950000   15159.000000      11.800000
75%        12.000000       3.750000   16795.000000      19.800000
max     80995.000000    8142.750000   18287.000000  168469.600000


New DataFrame:
    InvoiceNo StockCode                          Description  Quantity  \
0    536365    85123A   WHITE HANGING HEART T-LIGHT HOLDER         6   
1    536365     71053                  WHITE METAL LANTERN         6   
2    536365    84406B       CREAM CUPI

In [8]:
items = dict(data_filtered[['StockCode', 'Description']].values)
item_indexes = dict(data_filtered[['Description', 'StockCode']].values)
len(items), len(item_indexes)

(3665, 3877)

As we can see, the `items` and `item_indexes` have different shape so an item code can have multiple descriptions. Assume that the data file was sorted by order date, the higher number of row the updated desciption of item code. We gonna use `items` dictionary for references

> For visualization. I don't recommend using Python. Other BI tools such as Tableau, Qlik or PowerBI are much more better.

# 2. RECOMMENDATION SYSTEM
The training dataset is created base on which items usually be bought with an item. I create an embedding matrix, as the result, the closer item vectors are the similar they are. First thing is creating some `dict` to map between embedding matrix index, item code & item name

In [9]:
embedidx_item = dict(enumerate(items.keys())) #Embedding reference when put it into model
item_embedidx = dict(list(zip(embedidx_item.values(), embedidx_item.keys())))

# Add Embedding Id for item
data_filtered['EmbeddingID'] = data_filtered.StockCode.apply(lambda x: item_embedidx[x])

A training data looks like `(x1, x2, y)` where:
* `x1` is an embedding index of an item
* `x2` is another embedding index of an time that being bought in the same `InvoiceNo`
* y = 1 when `x1` is bought with `x2`, otherwise y = -1 with random 2 items 

In [10]:
# Generate training dataset
tensors = prepare_dataset(data_filtered.loc[:50000], embedidx_item)
data_loader = DataLoader(TensorDataset(tensors[:,:2], tensors[:,2].type(torch.float).unsqueeze(1)), 
                         batch_size=10000,
                         shuffle=True
                        )


# Load model and train
model = Recommendation(nums=len(embedidx_item), dim=50) # Please refer to model.py for details
model.to(DEVICE)
model.train()
optimizer = torch.optim.Adam(model.parameters())
for epoch in range(10):
    total_loss = 0
    s = time.time()
    for x, y in data_loader:
        x, y = x.to(DEVICE), y.to(DEVICE)
        optimizer.zero_grad()
        loss = model(x,y)
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
    print('Epoch = {:<2}, loss = {:<6}, time = {:<5}'.format(epoch, total_loss, time.time()-s))


# Export embedding layers from model
item_embedding = model.embed.weight.data.detach().cpu()
item_embedding = F.normalize(item_embedding, dim=1)

Epoch = 0 , loss = 79.55053985118866, time = 8.842999935150146
Epoch = 1 , loss = 79.39837300777435, time = 8.833001852035522
Epoch = 2 , loss = 79.24727785587311, time = 8.861000299453735
Epoch = 3 , loss = 79.07064807415009, time = 8.61099362373352
Epoch = 4 , loss = 78.90417742729187, time = 8.803002119064331
Epoch = 5 , loss = 78.69434082508087, time = 8.807997465133667
Epoch = 6 , loss = 78.46229928731918, time = 8.936001300811768
Epoch = 7 , loss = 78.19204539060593, time = 8.698003768920898
Epoch = 8 , loss = 77.87197816371918, time = 8.739996433258057
Epoch = 9 , loss = 77.48854184150696, time = 8.927002191543579


In [24]:
ITEM_CODE = '84795C'
generate_recommendation(ITEM_CODE, item_embedding)

Top 5 frequently bought together with "OCEAN STRIPE HAMMOCK ": 
Item code: 22301	 Similarity: 0.494373	 Name: COFFEE MUG CAT + BIRD DESIGN
Item code: 21286	 Similarity: 0.479872	 Name: RETROSPOT CANDLE  LARGE
Item code: 85123A	 Similarity: 0.463026	 Name: CREAM HANGING HEART T-LIGHT HOLDER
Item code: 72816	 Similarity: 0.440908	 Name: SET/3 CHRISTMAS DECOUPAGE CANDLES
Item code: 84877B	 Similarity: 0.428294	 Name: GREEN ROUND COMPACT MIRROR


# 3. ML Pipeline
At the end of day `t`, the database will generate a data file of that day with format `data_yyyymmdd.csv` to keep track. Then the `model` at date `t` take preprocessed data from csv file to update parameters, also save model weights as `model_yyyymmdd.h5`. It is necessary to store model weights after each update to make backups if there is any thing wrong.
Pipeline: <img src="./pipline.png">