-------------------------------------------------------------------------------
-- Federal University of Santa Maria
-- Technology Center
-- Computer Engineering Course
--
-- Author : Luis Felipe de Deus
--
-- Created: 19 Jun 2020
-- Update : 20 Jun 2020
-------------------------------------------------------------------------------

####### JUST A SMALL IMPLEMENTATION OF WORD2VEC

####### MAKING SHOPPING PREDICTIONS

In [1]:
#Download the dataset
!gdown https://drive.google.com/uc?id=1NK-2z0l-qTplDJJ2SHpTVBGRP3zWAK-n

Downloading...
From: https://drive.google.com/uc?id=1NK-2z0l-qTplDJJ2SHpTVBGRP3zWAK-n
To: /content/retail.xlsx
23.7MB [00:00, 38.5MB/s]


In [2]:
import pandas as pd
# Read the data: load the downloaded spreadsheet into a DataFrame
df = pd.read_excel('retail.xlsx')
# Preview the first rows to check how the columns were parsed
df.head()

Unnamed: 0,InvoiceNo,StockCode,Description,Quantity,InvoiceDate,UnitPrice,CustomerID,Country
0,536365,85123A,WHITE HANGING HEART T-LIGHT HOLDER,6,2010-12-01 08:26:00,2.55,17850.0,United Kingdom
1,536365,71053,WHITE METAL LANTERN,6,2010-12-01 08:26:00,3.39,17850.0,United Kingdom
2,536365,84406B,CREAM CUPID HEARTS COAT HANGER,8,2010-12-01 08:26:00,2.75,17850.0,United Kingdom
3,536365,84029G,KNITTED UNION FLAG HOT WATER BOTTLE,6,2010-12-01 08:26:00,3.39,17850.0,United Kingdom
4,536365,84029E,RED WOOLLY HOTTIE WHITE HEART.,6,2010-12-01 08:26:00,3.39,17850.0,United Kingdom



- **InvoiceNo**: Purchase ID 
- **StockCode**: Product ID
- **Description**: Description.
- **Quantity**: Quantity purchased.
- **InvoiceDate**: Time
- **CustomerID**: Client ID

In [3]:
# Dimensions of the raw dataset as (rows, columns)
df.shape

(541909, 8)

In [4]:
#Count the missing (NaN) values in each column
df.isnull().sum()

InvoiceNo           0
StockCode           0
Description      1454
Quantity            0
InvoiceDate         0
UnitPrice           0
CustomerID     135080
Country             0
dtype: int64

In [5]:
# Drop every row that has a missing value in any column, modifying df in place
df.dropna(inplace=True)
# Re-count the missing values to confirm none are left
df.isnull().sum()

InvoiceNo      0
StockCode      0
Description    0
Quantity       0
InvoiceDate    0
UnitPrice      0
CustomerID     0
Country        0
dtype: int64

In [6]:
#Get just Product ID and Description, keeping one row per StockCode (the last
#description seen wins). drop_duplicates() is chained on the selection so it
#returns a new DataFrame; calling it with inplace=True on a column slice of df
#is what triggered pandas' SettingWithCopyWarning.
products = df[["StockCode", "Description"]].drop_duplicates(subset='StockCode', keep="last")

# create product-ID and product-description dictionary; every value is a
# one-element list holding the description kept for that StockCode
products_dict = products.groupby('StockCode')['Description'].apply(list).to_dict()

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """


In [7]:
#Change to string domain: cast every StockCode to str so that all product
#codes in df share a single type
df['StockCode'] = df['StockCode'].astype(str)

In [8]:
#Create a list with every distinct customer ID found in df
customers = df["CustomerID"].unique().tolist()
#Number of distinct customers
len(customers)

4372

In [9]:
import random
# Shuffle the customer IDs in place so the train/validation split is random.
# NOTE(review): no seed is set, so every run produces a different split
random.shuffle(customers)

In [10]:
# Train on the first 90% of the customer list and validate on the remaining 10%
train_size = int(len(customers) * 0.9)

customers_train, customers_val = customers[:train_size], customers[train_size:]

# Split the rows by customer, so each customer's purchases stay on one side
df_train = df.loc[df['CustomerID'].isin(customers_train)]
df_val = df.loc[df['CustomerID'].isin(customers_val)]

In [11]:
#@brief: Build one order (the list of purchased StockCodes) per customer
#@param: customers is the list of customer IDs, it defines the output order
#@param: df is a DataFrame with the columns 'CustomerID' and 'StockCode'
#@return: a list with one list of StockCodes per entry of customers, in the
#row order of df; a customer without rows in df gets an empty list
def compile_orders(customers, df):
  # Group the rows once, instead of re-scanning the whole DataFrame for every
  # customer (which costs customers x rows comparisons)
  codes_by_customer = {
    customer: codes.tolist()
    for customer, codes in df.groupby('CustomerID', sort=False)['StockCode']
  }
  # list(...) gives every order its own list, even when a customer ID repeats
  return [list(codes_by_customer.get(customer, [])) for customer in customers]

In [12]:
#Create the train and validation lists: one list of StockCodes per customer,
#in the same order as customers_train / customers_val
orders_train = compile_orders(customers_train, df_train)
orders_val = compile_orders(customers_val, df_val)

In [13]:
#Load GENSIM
import gensim 
from gensim.models import Word2Vec 

In [14]:
#@brief: This class is a search engine for a single product:
#receiving one product ID as input, the class predicts the most similar
#products according to a Word2Vec model trained on the shopping lists
class searchEngine_v1():
  #@brief: Keep a private copy of the product catalogue
  #@param: products_dict maps a product code (int or str) to its description
  def __init__(self, products_dict):
    self.products = dict(products_dict)

  #@brief: This method computes the Word2Vec model and stores it in self.model1
  #@param: dataset is the train data, one list of product codes per shopping list
  def compute(self, dataset):
    print("Creating model Word2Vec")
    # gensim 4 renamed the `size` keyword to `vector_size`; use the name that
    # the installed version understands
    size_kw = "vector_size" if int(gensim.__version__.split(".")[0]) >= 4 else "size"
    # Create CBOW model (gensim's default; passing sg = 1 would select Skip Gram)
    self.model1 = gensim.models.Word2Vec(dataset, min_count = 1, window = 5, **{size_kw: 100})
    print("Model created!")

  #@brief: This method receives a product ID and prints the most similar
  #products related to this one, with their similarity and description
  #@param: IDprod is a product code as used in the train data ex: '22345'
  def predict(self, IDprod):
    #Get the most similar products
    try:
      mostSimilar = self.model1.wv.most_similar(IDprod)
    except KeyError:
      # The product never appeared in the train data, so it has no vector
      print("Product", IDprod, "is not in the model vocabulary")
      return

    #Walk through the most similar, find and show the description
    print("---- Predictions ----")
    for rank, (code, similarity) in enumerate(mostSimilar):
      desc = self.findProduct(code)
      print("[%d]"%(rank),"Similarity:", round(similarity,3), "| Product Code: ", code, " | Desc: ", desc )

  #@brief: Auxiliary method that finds under which key a code is stored
  #@param: code is a product code given as int or str
  #@return: the matching key of self.products (the int form is tried first,
  #then the str form) or None when the code is unknown
  def _resolveKey(self, code):
    candidates = []
    try:
      candidates.append(int(code))
    except (TypeError, ValueError, OverflowError):
      # Alphanumeric codes such as '85099F' only exist in str form
      pass
    candidates.append(str(code))

    for key in candidates:
      if key in self.products:
        return key
    return None

  #@brief: Auxiliary method used to find the product description
  #@param: prod is a single code from a product a.k.a '232132'
  #@return: the description stored for the code, or "Not found"
  def findProduct(self, prod):
    key = self._resolveKey(prod)
    if key is None:
      return "Not found"
    return self.products[key]

  #@brief: Auxiliary method used to show the product description
  #@param: IDproduct is a product ID ex. 223131
  def showProduct(self, IDproduct):
    key = self._resolveKey(IDproduct)
    if key is None:
      print("Invalid Product Code ")
    else:
      print("Product:", self.products[key])

In [15]:
import numpy as np
#@brief: This class is a search engine for shopping lists:
#receiving a shopping list as input, the class predicts the products most
#similar to the average Word2Vec vector of the products on that list
class searchEngine_v2():
  #@brief: Keep a private copy of the product catalogue
  #@param: products_dict maps a product code (int or str) to its description
  def __init__(self, products_dict):
    self.products = dict(products_dict)
    # Shopping list of the last getVectors() call; predict() reads it, so it
    # must exist even when predict() is called first
    self.IDprods = []

  #@brief: This method computes the Word2Vec model and stores it in self.model1
  #@param: dataset is the train data, one list of product codes per shopping list
  def compute(self, dataset):
    print("Creating model Word2Vec")
    # gensim 4 renamed the `size` keyword to `vector_size`; use the name that
    # the installed version understands
    size_kw = "vector_size" if int(gensim.__version__.split(".")[0]) >= 4 else "size"
    # Create CBOW model (gensim's default; passing sg = 1 would select Skip Gram)
    self.model1 = gensim.models.Word2Vec(dataset, min_count = 1, window = 5, **{size_kw: 100})
    print("Model created!")

  #@brief: This method receives the product IDs, gets the vector of each one
  #and computes the resultant (mean) vector of all of them
  #@param: IDprods is a list with x ID products
  #@return: the mean vector, also stored in self.rVector
  #@raise: ValueError when no product of the list is in the model vocabulary
  def getVectors(self, IDprods):
    self.IDprods = IDprods
    vectors = []
    #Walk through all the products
    for ID in self.IDprods:
      try:
        vectors.append(self.model1.wv[ID])
      except KeyError:
        # Products that never appeared in the train data have no vector
        print("Product", ID, "is not in the model vocabulary - skipped")

    if not vectors:
      raise ValueError("None of the given products is in the model vocabulary")

    #Get the resultant vector; taking the mean of the stacked vectors works for
    #any vector size, and float64 matches the former np.zeros accumulator
    self.rVector = np.mean(vectors, axis=0, dtype=np.float64)
    return self.rVector

  #@brief: This method receives a vector and prints the most similar products
  #@param: vector is a vector in the model space, usually the resultant of
  #getVectors
  def predict(self, vector):
    #Get the most similar products based on the resultant vector. The lookup
    #lives on model.wv: calling it on the model is deprecated in gensim 3 and
    #removed in gensim 4
    mostSimilar = self.model1.wv.similar_by_vector(vector)
    #The model returns the codes exactly as they were given in the shopping
    #list, so they are compared directly
    shoppingList = set(self.IDprods)

    #Walk through the most similar products and show the code, description
    #and similarity
    print("---- Predictions ----")
    for rank, (code, similarity) in enumerate(mostSimilar):
      if code in shoppingList:
        print("[%d]"%(rank),"Code:",code,"Already on the original list")
      else:
        desc = self.findProduct(code)
        print("[%d]"%(rank),"Similarity:", round(similarity,3), "| Product Code: ", code, " | Desc: ", desc )

  #@brief: Auxiliary method that finds under which key a code is stored
  #@param: code is a product code given as int or str
  #@return: the matching key of self.products (the int form is tried first,
  #then the str form) or None when the code is unknown
  def _resolveKey(self, code):
    candidates = []
    try:
      candidates.append(int(code))
    except (TypeError, ValueError, OverflowError):
      # Alphanumeric codes such as '85099F' only exist in str form
      pass
    candidates.append(str(code))

    for key in candidates:
      if key in self.products:
        return key
    return None

  #@brief: Auxiliary method used to find the product description
  #@param: prod is a single code from a product a.k.a '232132'
  #@return: the description stored for the code, or "Not found"
  def findProduct(self, prod):
    key = self._resolveKey(prod)
    if key is None:
      return "Not found"
    return self.products[key]

  #@brief: Auxiliary method used to show the product descriptions
  #@param: IDproducts is a list with x products ID
  def showProduct(self, IDproducts):
    print("--- Shopping list ---- ")

    for p in IDproducts:
      key = self._resolveKey(p)
      if key is None:
        print("Invalid Product Code ")
      else:
        print("Product:", self.products[key], "Code:",key)

In [16]:
#Create the object
s = searchEngine_v1(products_dict)
#Look up an alphanumeric code directly in the dictionary...
print(products_dict['84898F'])
#...and a numeric code through the engine, which converts the key type itself
s.showProduct('23055')

['YELLOW FLOWERS FELT HANDBAG KIT']
Product: ['IVORY CHANDELIER T-LIGHT HOLDER']


###### ---- EXPERIMENT 1 --- ########
##### USING SEARCH ENGINE V1 WITH ONLY ONE PRODUCT

In [17]:
#Create the object
s = searchEngine_v1(products_dict)
#Add the product to test. The code is kept as it is: showProduct() does its own
#int/str conversion, and calling int() here crashes whenever the (randomly
#shuffled) first shopping list holds an alphanumeric code such as '85099F'
prod = orders_train[0][1]
#Show the product added with the description
s.showProduct(prod)
#Create the W2V model
s.compute(orders_train)

Product: ['BOTTLE BAG RETROSPOT ']
Creating model Word2Vec
Model created!


In [18]:
#Makes a prediction for the second product of the first train shopping list
prod = orders_train[0][1]
s.predict(prod)

---- Predictions ----
[0] Similarity: 0.964 | Product Code:  21033  | Desc:  ['JUMBO BAG CHARLIE AND LOLA TOYS']
[1] Similarity: 0.951 | Product Code:  22381  | Desc:  ['TOY TIDY PINK POLKADOT']
[2] Similarity: 0.942 | Product Code:  85099F  | Desc:  ['JUMBO BAG STRAWBERRY']
[3] Similarity: 0.94 | Product Code:  20711  | Desc:  ['JUMBO BAG TOYS ']
[4] Similarity: 0.937 | Product Code:  22379  | Desc:  ['RECYCLING BAG RETROSPOT ']
[5] Similarity: 0.937 | Product Code:  20717  | Desc:  ['STRAWBERRY SHOPPER BAG']
[6] Similarity: 0.922 | Product Code:  22378  | Desc:  ['WALL TIDY RETROSPOT ']
[7] Similarity: 0.92 | Product Code:  20716  | Desc:  ['PARTY FOOD SHOPPER BAG']
[8] Similarity: 0.92 | Product Code:  22663  | Desc:  ['JUMBO BAG DOLLY GIRL DESIGN']
[9] Similarity: 0.916 | Product Code:  20731  | Desc:  ['POSY CANDY BAG']


  if np.issubdtype(vec.dtype, np.int):


###### ---- EXPERIMENT 2 --- ########
##### USING SEARCH ENGINE V2 WITH A SHOPPING LIST

In [19]:
#Create the object
s2 = searchEngine_v2(products_dict)
#Add the products to test: the first five of the first train shopping list
prod = orders_train[0][0:5]
#Show the products added with the description
s2.showProduct(prod)
#Create the W2V model
s2.compute(orders_train)

--- Shopping list ---- 
Product: ['HIPPY CHIC DECORATIVE PARASOL'] Code: 21631
Product: ['BOTTLE BAG RETROSPOT '] Code: 22377
Product: ['TV DINNER TRAY VINTAGE PAISLEY'] Code: 22473
Product: ['BLACK AND WHITE CAT BOWL'] Code: 22702
Product: ['BLACK AND WHITE DOG BOWL'] Code: 22700
Creating model Word2Vec
Model created!


In [20]:
#Get the resulting (mean) vector of the five products of this test
prod = orders_train[0][0:5]
vector = s2.getVectors(prod)

In [21]:
#Makes a prediction from the resulting vector of the shopping list
s2.predict(vector)

---- Predictions ----
[0] Similarity: 0.955 | Product Code:  84725  | Desc:  ['FREESTYLE CANVAS ART PICTURE']
[1] Similarity: 0.951 | Product Code:  84592  | Desc:  ['CROCHET ROSE PURSE WITH SUEDE BACK']
[2] Similarity: 0.949 | Product Code:  84313B  | Desc:  ['BLUE TV TRAY TABLE ']
[3] Similarity: 0.948 | Product Code:  78124  | Desc:  ['MAGAZINE RACK GEBRA ASSORTED ']
[4] Similarity: 0.947 | Product Code:  84563B  | Desc:  ['BLUE & WHITE BREAKFAST TRAY']
[5] Similarity: 0.947 | Product Code:  79336  | Desc:  ['LIGHT PINK FLOCK GLASS CANDLEHOLDER']
[6] Code: 21631 Already on the original list
[7] Similarity: 0.944 | Product Code:  84912A  | Desc:  ['PINK ROSE WASHBAG']
[8] Similarity: 0.942 | Product Code:  79164  | Desc:  ['BLACK WINE GLASS']
[9] Similarity: 0.942 | Product Code:  46118  | Desc:  ['FUNKY MONKEY CUSHION COVER']


  if np.issubdtype(vec.dtype, np.int):


###### ---- EXPERIMENT 3 --- ########
##### USING SEARCH ENGINE V1 WITH VALIDATION DATA - ONLY ONE PRODUCT

In [22]:
#Create the object
s = searchEngine_v1(products_dict)
#Create the W2V model from the train shopping lists only
s.compute(orders_train)

Creating model Word2Vec
Model created!


In [23]:
#Makes a prediction
#Pick a random validation shopping list and a random product inside it.
#randint() without `size` already returns a scalar; calling int() on the
#1-element array produced by size=1 is deprecated in recent NumPy releases
n1 = int(np.random.randint(len(orders_val)))
n2 = int(np.random.randint(len(orders_val[n1])))
print("from Shopping List: %d - Prod: %d"%(n1,n2))
#Get the related product based on the random index
prod = orders_val[n1][n2]
s.showProduct(prod)
#Get the most similar products
print("\n\n")
s.predict(prod)

from Shopping List: 225 - Prod: 3
Product: ['PACK 3 BOXES BIRD PANETTONE ']



---- Predictions ----
[0] Similarity: 0.921 | Product Code:  22651  | Desc:  ['GENTLEMAN SHIRT REPAIR KIT ']
[1] Similarity: 0.921 | Product Code:  85178  | Desc:  ['VICTORIAN SEWING KIT']
[2] Similarity: 0.92 | Product Code:  85176  | Desc:  ['SEWING SUSAN 21 NEEDLE SET']
[3] Similarity: 0.912 | Product Code:  23559  | Desc:  ['WOODLAND BUNNIES LOLLY MAKERS']
[4] Similarity: 0.906 | Product Code:  22584  | Desc:  ['PACK OF 6 PANETTONE GIFT BOXES']
[5] Similarity: 0.905 | Product Code:  23392  | Desc:  ['SPACEBOY ROCKET LOLLY MAKERS']
[6] Similarity: 0.901 | Product Code:  22453  | Desc:  ['MEASURING TAPE BABUSHKA BLUE']
[7] Similarity: 0.886 | Product Code:  22493  | Desc:  ['PAINT YOUR OWN CANVAS SET']
[8] Similarity: 0.883 | Product Code:  22582  | Desc:  ['PACK OF 6 SWEETIE GIFT BOXES']
[9] Similarity: 0.881 | Product Code:  10135  | Desc:  ['COLOURING PENCILS BROWN TUBE']


  if np.issubdtype(vec.dtype, np.int):


###### ---- EXPERIMENT 4 --- ########
##### USING SEARCH ENGINE V2 WITH VALIDATION DATA - SHOPPING LIST

In [24]:
#Create the object
s2 = searchEngine_v2(products_dict)
#Create the W2V model from the train shopping lists only
s2.compute(orders_train)


Creating model Word2Vec
Model created!


In [25]:
#Add the shopping list
#Pick a random validation shopping list. randint() without `size` already
#returns a scalar; calling int() on the 1-element array produced by size=1 is
#deprecated in recent NumPy releases
n1 = int(np.random.randint(len(orders_val)))
print("Using Shopping List: %d"%(n1))
prod = orders_val[n1]
vector = s2.getVectors(prod)
#show the shopping list
s2.showProduct(prod)
print("\n")
#Make a prediction based on the resulting vector
s2.predict(vector)

Using Shopping List: 383
--- Shopping list ---- 
Product: ['EGG FRYING PAN RED '] Code: 23338
Product: ['FAMILY ALBUM WHITE PICTURE FRAME'] Code: 22169
Product: ['SMALL PARISIENNE HEART PHOTO FRAME '] Code: 23093
Product: ['HEART CALCULATOR'] Code: 20985
Product: ['WOODEN FRAME ANTIQUE WHITE '] Code: 82494L
Product: ['WOODEN PICTURE FRAME WHITE FINISH'] Code: 82482
Product: ['MINIATURE ANTIQUE ROSE HOOK IVORY'] Code: 23148
Product: ['EDWARDIAN HEART PHOTO FRAME'] Code: 23070
Product: ['AGED GLASS SILVER T-LIGHT HOLDER'] Code: 21326
Product: ['VICTORIAN GLASS HANGING T-LIGHT'] Code: 22178
Product: ['FRENCH LAVENDER SCENT HEART'] Code: 21391
Product: ['LARGE WHITE HEART OF WICKER'] Code: 23322
Product: ['SMALL WHITE HEART OF WICKER'] Code: 23321
Product: ['EMBOSSED GLASS TEALIGHT HOLDER'] Code: 22460
Product: ['T-LIGHT HOLDER SWEETHEART HANGING'] Code: 22789
Product: ['EMBOSSED GLASS TEALIGHT HOLDER'] Code: 22460
Product: ['GLASS BON BON JAR'] Code: 23089
Product: ['SET/5 RED RETROSPOT L

  if np.issubdtype(vec.dtype, np.int):


###### ---- EXPERIMENT 5 --- ########
##### USING SEARCH ENGINE V2 WITH VALIDATION DATA 
##### FOR X SHOPPING LISTs

In [26]:
#Create the object
s2 = searchEngine_v2(products_dict)
#Create the W2V model
s2.compute(orders_train)
#Number of random validation shopping lists to try
size = 5
for i in range(size):
  print("\n\n ########### RUNNING EXPERIMENT %d of %d ###################"%(i+1,size))
  #Add the shopping list
  #Pick a random validation shopping list. randint() without `size` already
  #returns a scalar; calling int() on the 1-element array produced by size=1
  #is deprecated in recent NumPy releases
  n1 = int(np.random.randint(len(orders_val)))
  print("Using Shopping List: %d"%(n1))
  prod = orders_val[n1]
  vector = s2.getVectors(prod)
  #show the shopping list
  s2.showProduct(prod)
  print("\n")
  #Make a prediction based on the resulting vector
  s2.predict(vector)

Creating model Word2Vec
Model created!


 ########### RUNNING EXPERIMENT 1 of 5 ###################
Using Shopping List: 221
--- Shopping list ---- 
Product: ['GARDENERS KNEELING PAD KEEP CALM '] Code: 23301
Product: ['VICTORIAN GLASS HANGING T-LIGHT'] Code: 22178
Product: ['CREAM HEART CARD HOLDER'] Code: 22189
Product: ['NATURAL SLATE HEART CHALKBOARD '] Code: 22457
Product: ['BLACK HEART CARD HOLDER'] Code: 22188


---- Predictions ----
[0] Code: 22189 Already on the original list
[1] Code: 22457 Already on the original list
[2] Similarity: 0.87 | Product Code:  85062  | Desc:  ['PEARL CRYSTAL PUMPKIN T-LIGHT HLDR']
[3] Similarity: 0.86 | Product Code:  21313  | Desc:  ['GLASS HEART T-LIGHT HOLDER ']
[4] Similarity: 0.857 | Product Code:  84978  | Desc:  ['HANGING HEART JAR T-LIGHT HOLDER']
[5] Similarity: 0.857 | Product Code:  71477  | Desc:  ['COLOURED GLASS STAR T-LIGHT HOLDER']
[6] Similarity: 0.855 | Product Code:  23132  | Desc:  ['SMALL IVORY HEART WALL ORGANISER']
[7] Code:

  if np.issubdtype(vec.dtype, np.int):
