## Implementing a Recommendation System from Ordinary Transaction/Sales Data

In [33]:
# Notebook authorship metadata, stored as module-level dunder attributes.
__author__ = 'Eduardo Gil González-Madroño'
__email__ = 'eggom.profesional@gmail.com'
__website__ = 'www.profesordata.com'

__copyright__ = 'Copyright 2010'

### Coding

In [1]:
# Load the raw sales/transaction CSV into a pandas DataFrame.

import numpy as np
import pandas as pd

# 'StockCode' mixes purely numeric codes ("21704") with alphanumeric ones ("85099B", "POST").
# Forcing it to str stops pandas from inferring a mixed int/str column, so every later
# look-up by StockCode can rely on the codes being strings.
sales_transact_data_df = pd.read_csv("./data/retail.csv", dtype={"StockCode": str})

sales_transact_data_df.head(10)

Unnamed: 0,InvoiceNo,StockCode,Description,Quantity,InvoiceDate,UnitPrice,CustomerID,Country
0,573744,21314,SMALL GLASS HEART TRINKET POT,8,2011-11-01 08:16:00,2.1,17733.0,United Kingdom
1,573744,21704,BAG 250g SWIRLY MARBLES,12,2011-11-01 08:16:00,0.85,17733.0,United Kingdom
2,573744,21791,VINTAGE HEADS AND TAILS CARD GAME,12,2011-11-01 08:16:00,1.25,17733.0,United Kingdom
3,573744,21892,TRADITIONAL WOODEN CATCH CUP GAME,12,2011-11-01 08:16:00,1.25,17733.0,United Kingdom
4,573744,21915,RED HARMONICA IN BOX,12,2011-11-01 08:16:00,1.25,17733.0,United Kingdom
5,573744,22065,CHRISTMAS PUDDING TRINKET POT,48,2011-11-01 08:16:00,0.39,17733.0,United Kingdom
6,573744,22340,NOEL GARLAND PAINTED ZINC,24,2011-11-01 08:16:00,0.39,17733.0,United Kingdom
7,573744,22577,WOODEN HEART CHRISTMAS SCANDINAVIAN,24,2011-11-01 08:16:00,0.29,17733.0,United Kingdom
8,573744,22578,WOODEN STAR CHRISTMAS SCANDINAVIAN,24,2011-11-01 08:16:00,0.29,17733.0,United Kingdom
9,573744,22579,WOODEN TREE CHRISTMAS SCANDINAVIAN,24,2011-11-01 08:16:00,0.29,17733.0,United Kingdom


In [2]:
# Look-up table from product code to product description:
#
#   key   -> StockCode
#   value -> Description
#
# It is used later on to translate recommended StockCodes into readable product names.
# When a StockCode appears on several rows, the Description of the last of those rows is kept.
#
# (A toy example of building a dict out of two sequences is shown in the auxiliary cells at the end.)

prod_dict = sales_transact_data_df.set_index("StockCode")["Description"].to_dict()
prod_dict["21704"]

'BAG 250g SWIRLY MARBLES'

In [3]:
# Customer x product matrix:
#
#   rows    -> one per unique CustomerID
#   columns -> one per unique StockCode
#   cells   -> total Quantity of that product bought by that customer (sum over all matching rows);
#              customer/product pairs with no transactions are left as NaN
#
# pd.pivot_table does the grouping and the reshaping in a single call.

cust_prod_mtx_df = pd.pivot_table(
    sales_transact_data_df,
    values="Quantity",
    index="CustomerID",
    columns="StockCode",
    aggfunc="sum",
)
print("Matrix Shape: ",cust_prod_mtx_df.shape)
cust_prod_mtx_df

Matrix Shape:  (1711, 2704)


StockCode,10080,10120,10124A,10124G,10125,10135,11001,15030,15034,15036,...,90214M,90214N,90214S,BANK CHARGES,C2,CRUK,D,DOT,M,POST
CustomerID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
12349.0,,,,,,,,,,,...,,,,,,,,,,1.0
12352.0,,,,,,,,,,,...,,,,,,,,,,2.0
12356.0,,,,,,,,,,,...,,,,,,,,,,
12357.0,,,,,,,,,,,...,,,,,,,,,,
12362.0,,,,,,,,,,,...,,,,,,,,,,4.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
18263.0,,,,,,20.0,,,,,...,,,,,,,,,,
18270.0,,,,,,,,,,,...,,,,,,,,,,
18274.0,,,,,,,,,,,...,,,,,,,,,,
18276.0,,,,,,,,,,,...,,,,,,,,,,


In [4]:
# Keep the axis labels of the matrix as NumPy arrays, in the same order in which the matrix shows them:
#
#   CustomerID_arr -> row labels (CustomerIDs)
#   StockCode_arr  -> column labels (StockCodes)

CustomerID_arr = np.array(list(cust_prod_mtx_df.index))
StockCode_arr = np.array(list(cust_prod_mtx_df.columns))

In [5]:
# Build a sparse matrix out of the customer-product matrix 'cust_prod_mtx_df':
#
# 1. Replace the NaN cells with zeros. A NaN here means "this customer never bought this product", so 0 is the
#    coherent value for it. It is also required for sparsity: coo_matrix only leaves out entries that are exactly 0,
#    so NaN cells would all be stored explicitly.

cust_prod_mtx_df = cust_prod_mtx_df.fillna(0)  # fillna already returns a new DataFrame; no extra .copy() needed

# 2. Count the zeros and their share of all cells to confirm that the matrix really is sparse.
#    np.count_nonzero on the boolean mask counts them in one vectorised pass; the count is computed once and reused.

cust_prod_mtx = cust_prod_mtx_df.values
zero_count = np.count_nonzero(cust_prod_mtx == 0)
print("# of 0s present:   ",zero_count)
print("% of 0s on matrix: ",round((zero_count/cust_prod_mtx.size)*100, 2),"%")

# 3. Store the matrix in SciPy's COOrdinate sparse format

from scipy.sparse import coo_matrix

cust_prod_mtx_sparse = coo_matrix(cust_prod_mtx)
cust_prod_mtx_sparse

# of 0s present:    4571015
% of 0s on matrix:  98.8 %


<1711x2704 sparse matrix of type '<class 'numpy.float64'>'
	with 55529 stored elements in COOrdinate format>

In [6]:
# Truncated SVD of the customer-product matrix, keeping k = 10 singular values/vectors.
#
# The sparse matrix built above is what gets factorised: the dense array would give the same factorisation (up to
# floating-point rounding), but every matrix-vector product inside the solver would then also process all the zeros.
# It is converted to CSR first, the sparse format meant for repeated matrix-vector products.

from scipy.sparse.linalg import svds

U, s, V_t = svds(cust_prod_mtx_sparse.tocsr(), k=10)

U.shape, s.shape, V_t.shape

((1711, 10), (10,), (10, 2704))

In [7]:
# svds hands back the singular values as the 1-D array 's'. To rebuild the matrix as
#
#   A ~ U x S x V_t
#
# 's' first has to be laid out on the diagonal of a square matrix S:

s_diag_mtx = np.diag(s)
print("S Diag Shape: ",(s_diag_mtx.shape))

# Low-rank approximation of 'A' (here, of cust_prod_mtx), multiplying from left to right: (U x S) x V_t.
#
# The values of this rebuilt matrix are what the recommender ranks the products by.

cust_prod_mtx_svd = np.matmul(np.matmul(U, s_diag_mtx), V_t)
cust_prod_mtx_svd

S Diag Shape:  (10, 10)


array([[ 5.35933696e-04,  1.28052995e-03,  2.01128431e-05, ...,
         5.90257550e-04, -1.52337657e-02,  5.37978794e-02],
       [ 4.85998891e-05,  1.20374209e-04,  1.71779851e-06, ...,
         5.25221195e-05,  1.44783840e-03,  4.46877775e-03],
       [ 6.73127138e-06,  2.04804778e-05,  1.96671604e-07, ...,
         7.62166096e-06, -5.78393382e-05,  1.90390914e-04],
       ...,
       [ 9.24355853e-21,  9.45968109e-19,  9.96428365e-21, ...,
        -9.97833217e-19,  2.42406078e-14, -7.54275701e-17],
       [-6.49041982e-08, -1.86346004e-07, -1.42547025e-09, ...,
        -7.07688783e-08,  6.88413887e-07, -1.37401051e-07],
       [ 1.48087850e-03,  4.31613157e-03,  4.29277782e-05, ...,
         1.75324423e-03,  3.43191936e-01,  4.95828338e-02]])

In [8]:
# Sanity check: the rebuilt matrix must have exactly the same dimensions as the original customer-product matrix
cust_prod_mtx_df.shape == cust_prod_mtx_svd.shape

True

In [9]:
# Recommender.
#
# Inputs:
#
#   · CustomerID of the customer we want to recommend products to
#   · Number of products to recommend (by default: 5)
#
# Approach: take the customer's row in 'cust_prod_mtx_svd', rank the StockCodes by the value they have in that row
# (highest first) and return the best ranked ones that the customer has not bought before.


def rank_products_for_customer(customer_id):
    """Locate a customer on the matrices and rank every product for them.

    Parameters
    ----------
    customer_id : int or float
        CustomerID to look for in ``CustomerID_arr``.

    Returns
    -------
    tuple
        ``(row, ranked)``. ``row`` is the position of the customer in ``CustomerID_arr``, i.e. their row on both
        ``cust_prod_mtx`` and ``cust_prod_mtx_svd``. ``ranked`` holds the column positions of that row of
        ``cust_prod_mtx_svd`` ordered from the highest value to the lowest.

    Raises
    ------
    KeyError
        If ``customer_id`` is not present in ``CustomerID_arr``.
    """
    matches = np.where(CustomerID_arr == customer_id)[0]
    if matches.size == 0:
        # Without this guard the look-up fails with an IndexError that says nothing about the customer.
        raise KeyError("CustomerID {!r} is not in the customer-product matrix".format(customer_id))
    row = matches[0]
    # argsort sorts ascending, so the result is reversed to put the highest values first.
    ranked = cust_prod_mtx_svd[row, :].argsort()[::-1]
    return row, ranked


def recommend_products(customer_id, n_products=5):
    """Return the StockCodes recommended to a customer.

    The products are ranked by the customer's row of ``cust_prod_mtx_svd`` (used as a purchase-affinity score).
    Products the customer already bought, i.e. with a non-zero quantity on their row of ``cust_prod_mtx``, are
    left out, because recommending something already purchased is not the goal.

    Parameters
    ----------
    customer_id : int or float
        CustomerID to build the recommendation for.
    n_products : int, optional
        Maximum number of StockCodes to return (default: 5).

    Returns
    -------
    numpy.ndarray
        Up to ``n_products`` StockCodes taken from ``StockCode_arr``, best ranked first.

    Raises
    ------
    KeyError
        If ``customer_id`` is not present in ``CustomerID_arr``.
    ValueError
        If ``n_products`` is negative.
    """
    if n_products < 0:
        raise ValueError("n_products must be zero or a positive integer")
    row, ranked = rank_products_for_customer(customer_id)
    # Boolean mask, aligned with 'ranked', that is True for the products NOT purchased by the customer.
    not_purchased = cust_prod_mtx[row, ranked] == 0
    return StockCode_arr[ranked[not_purchased]][:n_products]


# Customer we want to recommend products to:

target_cust = 12352

print("ID of Target Customer: ",target_cust)

# Position of 'target_cust' in 'CustomerID_arr' and ranking of all the products for that customer.
# Both are kept as notebook-level variables because the auxiliary cells at the end inspect them.

cust_index, cust_purch_chance_index_sort = rank_products_for_customer(target_cust)
print("Cust_Index on the prod-cust matrix: ", cust_index,"\n")

# StockCodes actually recommended (default length: 5)
rec_prod_stockcodes_short = recommend_products(target_cust)
print("Recommended StockCodes for Customer ",target_cust)
print(rec_prod_stockcodes_short,"\n")

# Translate the recommended StockCodes into product descriptions:

print("Description of Products:")
for stockcode in rec_prod_stockcodes_short:
    print(prod_dict[stockcode])

ID of Target Customer:  12352
Cust_Index on the prod-cust matrix:  1 

Recommended StockCodes for Customer  12352
['22492' '85099B' '22152' '21810' '22356'] 

Description of Products:
MINI PAINT SET VINTAGE 
JUMBO BAG RED RETROSPOT
PLACE SETTING WHITE STAR
CHRISTMAS HANGING STAR WITH BELL
CHARLOTTE BAG PINK POLKADOT


### Auxiliary code to understand some of the concepts used above

In [11]:
# Column positions of the target customer's row of 'cust_prod_mtx_svd', from the highest value to the lowest
cust_prod_mtx_svd[cust_index,:].argsort()[::-1]

array([1065, 2495,  795, ...,  275, 1047, 1046], dtype=int64)

In [12]:
# StockCodes picked in the order given by the positions stored in 'cust_purch_chance_index_sort'
StockCode_arr[cust_purch_chance_index_sort]

array(['22492', '85099B', '22152', ..., '21175', '22470', '22469'],
      dtype='<U12')

In [13]:
# StockCodes that have a non-zero quantity on the target customer's row of 'cust_prod_mtx'
StockCode_arr[cust_prod_mtx[cust_index,:]!=0]

array(['21669', '22178', '22624', '22627', '22635', '22668', '22978',
       '22982', '23088', '23089', '23096', '23367', '23368', '23559',
       'POST'], dtype='<U12')

In [14]:
# Toy data: three parallel lists (ids, first names, surnames)
lista1 = [10, 20, 30, 40]
lista2 = ["Luis", "Pedro", "Juan", "Alberto"]
lista3 = ["García", "Montalvo", "Ararate", "Crespo"]

# Pair every id with the first name at the same position and build a dict out of the pairs
dicc = {key: name for key, name in zip(lista1, lista2)}

dicc

{10: 'Luis', 20: 'Pedro', 30: 'Juan', 40: 'Alberto'}

In [15]:
dicc[20]

'Pedro'

In [16]:
# dict.get looks the key up like dicc[10] does, but returns None instead of raising KeyError when the key is missing
a = dicc.get(10)
a

'Luis'

In [17]:
list(dicc.values())

['Luis', 'Pedro', 'Juan', 'Alberto']

In [18]:
# Join each first name with the surname at the same position, separated by a single space
res = [" ".join((first_name, surname)) for first_name, surname in zip(lista2, lista3)]
res

['Luis García', 'Pedro Montalvo', 'Juan Ararate', 'Alberto Crespo']

In [19]:
def area_triangulo(base, altura):
    """Return the area of a triangle.

    Parameters
    ----------
    base : number
        Length of the base of the triangle.
    altura : number
        Height of the triangle measured from that base.

    Returns
    -------
    float
        ``base * altura / 2``.
    """
    rectangle_area = base * altura
    return rectangle_area / 2

In [20]:
area_triangulo(3,4)

6.0

In [21]:
# Triangle area (base times height, halved) written as a lambda expression bound to a name
area_triangulo_lb = lambda base, altura: base * altura / 2

In [22]:
area_triangulo_lb(3,4)

6.0

In [23]:
a = np.array([3,17,4,100,90,7,45,4500])

In [24]:
a

array([   3,   17,    4,  100,   90,    7,   45, 4500])

In [25]:
# np.sort returns a sorted copy; 'a' itself is not modified
np.sort(a)

array([   3,    4,    7,   17,   45,   90,  100, 4500])

In [26]:
a

array([   3,   17,    4,  100,   90,    7,   45, 4500])

In [27]:
# ndarray.sort sorts 'a' in place and returns None, which is why this cell shows no output
a.sort()

In [28]:
a

array([   3,    4,    7,   17,   45,   90,  100, 4500])

In [29]:
a = np.array([3,17,4,100,90,7,45,4500])

In [30]:
a

array([   3,   17,    4,  100,   90,    7,   45, 4500])

In [31]:
# np.argsort returns the positions that would sort 'a' in ascending order (it does not return the values)
np.argsort(a)

array([0, 2, 5, 1, 6, 4, 3, 7], dtype=int64)

In [32]:
# Reversing the argsort positions and indexing 'a' with them yields its values in descending order
a[a.argsort()[::-1]]

array([4500,  100,   90,   45,   17,    7,    4,    3])