In [None]:
import pandas as pd
import numpy as np
import random
from tqdm import tqdm
from gensim.models import Word2Vec 
import matplotlib.pyplot as plt
%matplotlib inline


In [None]:
from zipfile import ZipFile
file_name = "Online Retail.zip"

# Unpack the dataset archive into the current working directory.
# The handle is named `archive` rather than `zip`: in a notebook the `as`
# target stays bound at global scope, so `zip` would shadow the builtin
# zip() for every later cell.
with ZipFile(file_name, 'r') as archive:
  archive.extractall()
  print('done')

done


In [None]:
# Read the extracted workbook into a DataFrame, then preview its first rows.
workbook_path = '/content/Online Retail.xlsx'
df = pd.read_excel(workbook_path)
df.head()

Unnamed: 0,InvoiceNo,StockCode,Description,Quantity,InvoiceDate,UnitPrice,CustomerID,Country
0,536365,85123A,WHITE HANGING HEART T-LIGHT HOLDER,6,2010-12-01 08:26:00,2.55,17850.0,United Kingdom
1,536365,71053,WHITE METAL LANTERN,6,2010-12-01 08:26:00,3.39,17850.0,United Kingdom
2,536365,84406B,CREAM CUPID HEARTS COAT HANGER,8,2010-12-01 08:26:00,2.75,17850.0,United Kingdom
3,536365,84029G,KNITTED UNION FLAG HOT WATER BOTTLE,6,2010-12-01 08:26:00,3.39,17850.0,United Kingdom
4,536365,84029E,RED WOOLLY HOTTIE WHITE HEART.,6,2010-12-01 08:26:00,3.39,17850.0,United Kingdom


In [None]:
# Tally missing values per column (the result is not displayed because this
# is not the final expression of the cell).
df.isnull().sum()

# Discard every row that has at least one missing field.
df.dropna(inplace=True)

# Stock codes mix digits and letters, so hold them uniformly as strings.
stock_codes_as_text = df['StockCode'].astype(str)
df['StockCode'] = stock_codes_as_text


406829

In [None]:
# Distinct customer IDs, shuffled so the train/validation split taken from
# this list is random with respect to customer.
customers = df['CustomerID'].unique().tolist()

# Shuffle with a dedicated, seeded generator. An unseeded random.shuffle
# yields a different split on every run, so positional lookups into the
# validation purchases and the trained model are not reproducible.
# 14 is the same seed value that is passed to Word2Vec in this notebook.
random.Random(14).shuffle(customers)

#customers

In [None]:
# The first 90% of the shuffled customer IDs form the training customers.
n_train_customers = round(0.9 * len(customers))
customers_train = customers[:n_train_customers]

# Rows of training customers go to train_df, all remaining rows to
# validation_df; the membership mask is computed once and negated.
is_train_row = df['CustomerID'].isin(customers_train)
train_df = df[is_train_row]
validation_df = df[~is_train_row]


Unnamed: 0,InvoiceNo,StockCode,Description,Quantity,InvoiceDate,UnitPrice,CustomerID,Country
0,536365,85123A,WHITE HANGING HEART T-LIGHT HOLDER,6,2010-12-01 08:26:00,2.55,17850.0,United Kingdom
1,536365,71053,WHITE METAL LANTERN,6,2010-12-01 08:26:00,3.39,17850.0,United Kingdom
2,536365,84406B,CREAM CUPID HEARTS COAT HANGER,8,2010-12-01 08:26:00,2.75,17850.0,United Kingdom
3,536365,84029G,KNITTED UNION FLAG HOT WATER BOTTLE,6,2010-12-01 08:26:00,3.39,17850.0,United Kingdom
4,536365,84029E,RED WOOLLY HOTTIE WHITE HEART.,6,2010-12-01 08:26:00,3.39,17850.0,United Kingdom


In [None]:
# Group the training rows once by customer instead of re-filtering the whole
# frame for every customer ID (the per-customer filter rescans train_df
# len(customers_train) times). Row order inside each group is the frame's
# row order, so each list matches what the boolean filter produced.
codes_by_customer = {
    customer_id: codes.tolist()
    for customer_id, codes in tqdm(train_df.groupby("CustomerID")["StockCode"])
}

# One list of purchased stock codes per customer, in customers_train order.
purchases_train = [codes_by_customer[customer_id] for customer_id in customers_train]

100%|██████████| 3935/3935 [00:06<00:00, 631.34it/s]


In [None]:
# Single pass over the validation rows, grouped by customer, instead of
# re-filtering validation_df once per customer. sort=False keeps the groups
# in order of first appearance, i.e. the same order as
# validation_df['CustomerID'].unique(), so positional lookups such as
# purchases_val[8] refer to the same customer as with the per-customer filter.
purchases_val = [
    codes.tolist()
    for customer_id, codes in tqdm(
        validation_df.groupby("CustomerID", sort=False)["StockCode"]
    )
]

100%|██████████| 437/437 [00:00<00:00, 890.93it/s]


In [None]:
from gensim.models.callbacks import CallbackAny2Vec
from gensim.models import Word2Vec

# init callback class
class callback(CallbackAny2Vec):
    """
    Callback that prints a loss figure at the end of every training epoch.

    The value returned by ``model.get_latest_training_loss()`` is read at the
    end of each epoch and the value read at the previous epoch is subtracted,
    so every printed figure is the change in the reported loss over that
    epoch alone (for the first epoch this is the reported loss itself).
    """
    def __init__(self):
        # Number of epochs reported so far; also used as the printed index.
        self.epoch = 0
        # Loss read at the end of the previous epoch. Initialised here rather
        # than first assigned inside on_epoch_end so the attribute always
        # exists; 0.0 makes the first epoch print the raw reported loss.
        self.loss_previous_step = 0.0

    def on_epoch_end(self, model):
        """Print the loss accumulated during the epoch that just finished.

        model: the model being trained; only get_latest_training_loss() is
        called on it.
        """
        loss = model.get_latest_training_loss()

        # A single subtraction covers every epoch: the former
        # `epoch == 0` / `epoch % 1 == 0` branches differed only in whether
        # the previous value was subtracted, and `% 1 == 0` is always true.
        print('Loss after epoch {}: {}'.format(self.epoch, loss - self.loss_previous_step))
        print("")
        print("")

        self.epoch += 1
        self.loss_previous_step = loss

In [None]:
# Configure the Word2Vec model. Each customer's purchase list in
# purchases_train is fed in as one "sentence" whose tokens are stock codes.
# Parameter meanings below follow the gensim Word2Vec documentation:
#   window    - maximum distance between the current and predicted token
#   sg = 1    - skip-gram training (instead of CBOW)
#   negative  - number of noise samples drawn for negative sampling
#   alpha / min_alpha - initial learning rate and the value it decays to
# NOTE(review): gensim documents that `seed` alone does not make runs fully
# deterministic unless training is single-threaded (workers=1) -- confirm
# whether reproducible embeddings are required here.
model = Word2Vec(window = 10, sg = 1, 
                 negative = 10, # for negative sampling
                 alpha=0.03, min_alpha=0.0007,
                 seed = 14)

# Build the vocabulary of stock codes from the training purchase lists.
model.build_vocab(purchases_train, progress_per=200)

# Train for 10 epochs; compute_loss=True is what lets the callback read the
# loss via get_latest_training_loss() at the end of each epoch.
model.train(purchases_train, total_examples = model.corpus_count, 
            epochs=10, report_delay=1,compute_loss = True, callbacks=[callback()])

Loss after epoch 0: 3616698.75


Loss after epoch 1: 2953461.25


Loss after epoch 2: 2764569.0


Loss after epoch 3: 2623681.0


Loss after epoch 4: 2577583.0


Loss after epoch 5: 2604135.0


Loss after epoch 6: 2530336.0


Loss after epoch 7: 2506150.0


Loss after epoch 8: 2550830.0


Loss after epoch 9: 2488240.0




(3540892, 3578250)

In [None]:
# Precompute L2-normalised vectors; per the gensim 3.x documentation,
# replace=True keeps only the normalised vectors and discards the originals.
model.init_sims(replace=True)
# Print the model's summary representation.
print(model)

Word2Vec(vocab=3146, size=100, alpha=0.03)


In [None]:
# Stack the embedding of every vocabulary entry into one 2-D array.
# Vectors are read through model.wv: indexing the Word2Vec object itself
# (model[...]) is a deprecated gensim 3.x alias for model.wv[...].
X = model.wv[model.wv.vocab]

X.shape

(3146, 100)

In [None]:
import warnings
# Silences every warning raised from this point on.
warnings.filterwarnings('ignore')

# Keep one row per stock code (its last occurrence in train_df). Chaining
# drop_duplicates returns a new frame; calling it with inplace=True on a
# column slice of train_df is what makes pandas emit SettingWithCopyWarning.
products = train_df[["StockCode", "Description"]].drop_duplicates(
    subset='StockCode', keep="last"
)

# Map each stock code to a list holding its description. The values stay
# lists because similar_products reads products_dict[code][0].
products_dict = products.groupby('StockCode')['Description'].apply(list).to_dict()

In [None]:
def similar_products(v, n = 6):
    """Return descriptions and similarity scores of products near a vector.

    v: query vector in the model's embedding space.
    n: number of (description, similarity) pairs to return.

    The model is asked for n + 1 neighbours and the first one is dropped.
    NOTE(review): dropping the first hit assumes it is the query product
    itself, which holds when v is a product's own vector; for an averaged
    vector the dropped hit may be a genuine recommendation -- confirm intent.
    """
    # Query through model.wv: Word2Vec.similar_by_vector is a deprecated
    # gensim 3.x alias for the KeyedVectors method of the same name.
    neighbours = model.wv.similar_by_vector(v, topn=n + 1)[1:]

    # products_dict maps stock code -> [description]; keep the description
    # together with the similarity score.
    return [(products_dict[code][0], score) for code, score in neighbours]

In [None]:
# Look the vector up through model.wv (model[...] is deprecated in gensim 3.x).
similar_products(model.wv['16218']) #"CARTOON PENCIL SHARPENER"

[('POPART WOODEN PENCILS ASST', 0.6942727565765381),
 ('HOUSE SHAPE PENCIL SHARPENER', 0.6938731074333191),
 ('PIECE OF CAMO STATIONERY SET', 0.6916038990020752),
 ('LETTER SHAPE PENCIL SHARPENER', 0.6747919321060181),
 ('TEATIME ROUND PENCIL SHARPENER ', 0.6340060234069824),
 ('ASSORTED TUTTI FRUTTI KEYRING BALL', 0.621924877166748)]

In [None]:
def aggregate_vectors(products, vectors=None):
    """Average the embedding vectors of a collection of products.

    products: iterable of product keys (stock codes).
    vectors: optional mapping from product key to vector that raises
        KeyError for unknown keys; defaults to the trained model's
        ``model.wv``.

    Returns the element-wise mean of the vectors that were found. Products
    missing from the mapping are skipped.

    Raises ValueError when none of the products has a vector, instead of
    returning the NaN that np.mean produces for an empty list.
    """
    if vectors is None:
        # model.wv[...] replaces the deprecated model[...] lookup.
        vectors = model.wv

    product_vec = []
    for i in products:
        try:
            product_vec.append(vectors[i])
        except KeyError:
            # No vector is available for this product; leave it out.
            continue

    if not product_vec:
        raise ValueError("none of the given products has a vector")

    return np.mean(product_vec, axis=0)

In [None]:
# Recommend products for the validation customer at index 8, using the mean
# vector of that customer's purchased stock codes as the query.
similar_products(aggregate_vectors(purchases_val[8]))

[('SWALLOW WOODEN CHRISTMAS DECORATION', 0.6913470029830933),
 ('HEART WOODEN CHRISTMAS DECORATION', 0.6905133724212646),
 ('ROCKING HORSE GREEN CHRISTMAS ', 0.6867961883544922),
 ('CHRISTMAS TREE DECORATION WITH BELL', 0.6772191524505615),
 ('STAR WOODEN CHRISTMAS DECORATION', 0.6741601824760437),
 ('CHRISTMAS TREE STAR DECORATION', 0.6720138192176819)]

In [None]:
# Same recommendation query for the validation customer at index 23.
similar_products(aggregate_vectors(purchases_val[23]))

[("IF YOU CAN'T STAND THE HEAT MUG", 0.6660944223403931),
 ('SAVE THE PLANET MUG', 0.6273823976516724),
 ('HOME SWEET HOME MUG', 0.6246086359024048),
 ('PINK SPOTS CHOCOLATE NESTING BOXES ', 0.6047117710113525),
 ('POLKADOT MUG PINK ', 0.602050244808197),
 ('GIN AND TONIC MUG', 0.5945363640785217)]