In [2]:
# Installing the implicit package 
!pip install implicit



distributed 1.21.8 requires msgpack, which is not installed.
You are using pip version 10.0.1, however version 19.2.3 is available.
You should consider upgrading via the 'python -m pip install --upgrade pip' command.


In [28]:
import pandas as pd
import numpy as np
import scipy.sparse as sparse
import implicit
import random
import warnings
warnings.filterwarnings('ignore')

#### Loading Train and test sets

In [4]:
df_train = pd.read_csv('train_5UKooLv.csv')
df_train.head()

Unnamed: 0,CustomerID,InvoiceNo,Quantity,InvoiceDate,UnitPrice,Country,StockCode
0,27270,27270,7,01/12/10 8:26,2.55,PX,85123AY
1,27270,27270,7,01/12/10 8:26,3.39,PX,71053R
2,27270,27270,9,01/12/10 8:26,2.75,PX,84406BH
3,27270,27270,7,01/12/10 8:26,3.39,PX,84029GV
4,27270,27270,7,01/12/10 8:26,3.39,PX,84029EX


In [5]:
df_test = pd.read_csv('test_J1hm2KQ.csv', usecols = df_train.columns)
df_test.head()

Unnamed: 0,Country,CustomerID,InvoiceDate,InvoiceNo,Quantity,StockCode,UnitPrice
0,PX,127269,01/12/10 8:28,127269,7,22633V,1.85
1,PX,227268,01/12/10 8:34,227268,38,84879M,1.69
2,PX,227268,01/12/10 8:34,227268,7,22748P,2.1
3,PX,227268,01/12/10 8:34,227268,9,22749K,3.75
4,PX,227268,01/12/10 8:34,227268,2,22622G,9.95


#### EDA train and test data

In [6]:
# Number of customers and their purchases in train set
print('No of unique customers in train_set - '+ format(df_train['CustomerID'].nunique()))
print('No of unique items in train_set - '+ format(df_train['StockCode'].nunique()))

No of unique customers in train_set - 972
No of unique items in train_set - 3810


In [7]:
# Number of customers and their purchases in test set
print('No of unique customers in test_set - '+ format(df_test['CustomerID'].nunique()))
print('No of unique items in test_set - '+ format(df_test['StockCode'].nunique()))

No of unique customers in test_set - 628
No of unique items in test_set - 3522


In [8]:
# shape of train and test sets
print('Train_shape  - ' + format(df_train.shape))
print('Test_shape  - ' + format(df_test.shape))

Train_shape  - (330575, 7)
Test_shape  - (103097, 7)


In [9]:
# Split the train-set stock codes by whether they also occur in the test set
train_code_in_test = df_train['StockCode'].isin(df_test['StockCode'])
n_train_codes_shared = df_train.loc[train_code_in_test, 'StockCode'].nunique()
n_train_codes_only = df_train.loc[~train_code_in_test, 'StockCode'].nunique()
print('No of items in train set that are in test set - ' + format(n_train_codes_shared))
print('No of items in train set that are not in test set - ' + format(n_train_codes_only))

No of items in train set that are in test set - 3472
No of items in train set that are not in test set - 338


In [10]:
# Split the test-set stock codes by whether they also occur in the train set
test_code_in_train = df_test['StockCode'].isin(df_train['StockCode'])
n_test_codes_shared = df_test.loc[test_code_in_train, 'StockCode'].nunique()
n_test_codes_only = df_test.loc[~test_code_in_train, 'StockCode'].nunique()
print('No of items in test set that are in train set - ' + format(n_test_codes_shared))
print('No of items in test set that are not in train set - ' + format(n_test_codes_only))

No of items in test set that are in train set - 3472
No of items in test set that are not in train set - 50


There are 338 stock codes that appear only in the train set, whereas 50 appear only in the test set.

In [11]:
# No of items purchased by each customer in train set
pd.DataFrame(df_train.groupby('CustomerID')['StockCode'].count()).head(10)

Unnamed: 0_level_0,StockCode
CustomerID,Unnamed: 1_level_1
0,390
900,826
1800,736
2790,393
4590,252
5400,399
6300,240
6390,638
7200,263
7290,1020


In [12]:
# No of items purchased by each customer in test set
pd.DataFrame(df_test.groupby('CustomerID')['StockCode'].count()).head(10)

Unnamed: 0_level_0,StockCode
CustomerID,Unnamed: 1_level_1
1890,8
2700,534
3600,223
3690,269
4500,224
5490,207
8190,350
9090,443
13680,142
14490,147


It is given that the test set contains only 50% of each customer's transactions, and we need to recommend the remaining 50%.

In [13]:
# Now let us look at the quantity of the items in train set
df_train[df_train.Quantity <= 0].count()

CustomerID     5588
InvoiceNo      5588
Quantity       5588
InvoiceDate    5588
UnitPrice      5588
Country        5588
StockCode      5588
dtype: int64

In [14]:
# Now let us look at the quantity of the items in test set
df_test[df_test.Quantity <= 0].count()

Country        1762
CustomerID     1762
InvoiceDate    1762
InvoiceNo      1762
Quantity       1762
StockCode      1762
UnitPrice      1762
dtype: int64

It seems that 5588 rows in the train set and 1762 rows in the test set have a non-positive quantity, i.e. items that were returned to the store or received free.

We have seen that 338 stock codes appear only in the train set and 50 only in the test set, so let us append the train rows for those 338 additional stock codes to the test set so that the user-item matrix built from the test set also covers them.

In [15]:
df_train_additional = df_train[~df_train['StockCode'].isin(df_test['StockCode'])]

In [16]:
test_train_merge = pd.concat([df_test,df_train_additional])

#### Pre-processing data

In [17]:
# Keep only the columns needed for the customer x item matrix; .copy() makes the frame
# independent so the dtype conversion below is a plain column assignment, not a write into a slice.
test_train_merge = test_train_merge[['StockCode', 'Quantity', 'CustomerID']].copy() # Get rid of unnecessary info
test_train_merge['CustomerID'] = test_train_merge['CustomerID'].astype(int) # Convert to int for customer ID
# One row per (customer, item) pair holding the summed quantity
merged = test_train_merge.groupby(['CustomerID', 'StockCode']).sum().reset_index() # Group together
# Replace a sum of zero purchases with a one to indicate purchased.
# A single .loc[rows, column] assignment is used because the chained form
# `merged.Quantity.loc[...] = 1` writes to a temporary Series and leaves `merged`
# unchanged when pandas Copy-on-Write is active.
merged.loc[merged['Quantity'] == 0, 'Quantity'] = 1
# Only keep pairs whose purchase total is positive; .copy() so that later cells can
# add columns to grouped_purchased without touching (or warning about) `merged`.
grouped_purchased = merged.query('Quantity > 0').copy()

In [18]:
# Map every stock code and customer id to a contiguous integer index (and back).
# Indices follow the order in which each value first appears in grouped_purchased.
idx_to_STOCKCODE = dict(enumerate(grouped_purchased.StockCode.unique().tolist()))
STOCKCODE_to_idx = {stock_code: position for position, stock_code in idx_to_STOCKCODE.items()}

# The spelling "CUDTOMER" is kept as-is because this name is referenced by a later cell.
idx_to_CUSTOMER_ID = dict(enumerate(grouped_purchased.CustomerID.unique().tolist()))
CUDTOMER_ID_to_idx = {customer: position for position, customer in idx_to_CUSTOMER_ID.items()}

In [19]:
grouped_purchased['HASH_CUSTOMER_ID'] = grouped_purchased['CustomerID'].map(CUDTOMER_ID_to_idx)
grouped_purchased['HASH_STOCK_ID'] = grouped_purchased['StockCode'].map(STOCKCODE_to_idx)
display(grouped_purchased.head(5))

Unnamed: 0,CustomerID,StockCode,Quantity,HASH_CUSTOMER_ID,HASH_STOCK_ID
0,0,90146B,1,0,0
2,1800,84985AB,1,1,1
3,1800,85018DI,1,1,2
4,1800,85231bJ,1,1,3
5,1890,21080R,4,2,4


In [20]:
# A dataframe with one row per distinct (StockCode, HASH_STOCK_ID) pair
item_lookup = (
    grouped_purchased[['StockCode', 'HASH_STOCK_ID']]
    .drop_duplicates()
    .assign(StockCode=lambda frame: frame.StockCode.astype(str))  # Encode as strings for future lookup ease
)

In [21]:
# Calculating number of unique customers and items for building a ratings matrix
n_customers = grouped_purchased.HASH_CUSTOMER_ID.nunique()
n_items = grouped_purchased.HASH_STOCK_ID.nunique()

In [22]:
# Dense customer x item matrix of purchased quantities (zero where no purchase is recorded)
data_matrix = np.zeros((n_customers, n_items))

# Populate the matrix from the dataset in one vectorised assignment.
# Columns are addressed by name, so the result no longer depends on the positional
# column order of grouped_purchased (the row-by-row loop relied on tuple positions 3, 4, 5).
customer_rows = grouped_purchased['HASH_CUSTOMER_ID'].values
item_cols = grouped_purchased['HASH_STOCK_ID'].values
data_matrix[customer_rows, item_cols] = grouped_purchased['Quantity'].values


In [23]:
customers = list(np.sort(grouped_purchased.HASH_CUSTOMER_ID.unique())) # Get our unique customers
products = list(grouped_purchased.HASH_STOCK_ID.unique()) # Get our unique products that were purchased
quantity = list(grouped_purchased.Quantity) # All of our purchases

#### Sparsity of the matrix

In [24]:
n_rows, n_cols = data_matrix.shape[0], data_matrix.shape[1]
matrix_size = n_rows*n_cols # Every customer/item cell that could hold an interaction
num_purchases = len(data_matrix.nonzero()[0]) # Cells that actually hold one
filled_share = num_purchases/matrix_size
sparsity = 100*(1 - (filled_share)) # Percentage of empty cells
print('Sparsity of the matrix - '+ format(sparsity))

Sparsity of the matrix - 97.8670149966088


To validate the recommendation model, let us mask some of each customer's values in the data matrix and, after generating the recommendations, check how many of the recommended items were actually bought by the customer.

In [25]:
def make_train(ratings, pct_test = 0.2):
    """
    Hide a random share of the observed interactions to obtain a train/validation split.

    Parameters:
        ratings  - 2-D user x item array; every non-zero entry counts as an interaction
        pct_test - fraction of the non-zero entries to blank out in the training copy

    Returns:
        training_set - CSR matrix equal to `ratings` with the sampled entries set to zero
        test_set     - CSR matrix with 1 wherever `ratings` is non-zero (nothing is masked here)
        altered rows - list of the distinct row (user) indices that lost at least one entry

    Note: the global `random` generator is re-seeded with 0 on every call.
    """
    # Binary copy of the complete data
    full_binary = ratings.copy()
    full_binary[full_binary != 0] = 1
    test_set = sparse.csr_matrix(full_binary)

    # Every observed (row, column) pair is a candidate for masking
    masked = ratings.copy()
    observed_pairs = list(zip(*masked.nonzero()))

    # Fixed seed so the same pairs are hidden on each run; the sample count is rounded up
    random.seed(0)
    n_hidden = int(np.ceil(pct_test*len(observed_pairs)))
    hidden_pairs = random.sample(observed_pairs, n_hidden)

    hidden_rows = [row for row, _ in hidden_pairs]
    hidden_cols = [col for _, col in hidden_pairs]
    masked[hidden_rows, hidden_cols] = 0

    training_set = sparse.csr_matrix(masked)
    training_set.eliminate_zeros() # Drop any explicitly stored zeros from the sparse storage
    return training_set, test_set, list(set(hidden_rows))

In [26]:
product_train, product_test, product_users_altered = make_train(data_matrix, pct_test = 0.2)

I am using the ALS model of the implicit library with 20 latent factors, a regularization of 0.1 and 10 iterations, as set in the cell below.

In [29]:
#Building the model
# ALS matrix factorisation with 20 latent factors, regularization 0.1 and 10 training iterations
model = implicit.als.AlternatingLeastSquares(factors=20, regularization=0.1, iterations=10)
# Multiplier applied to the purchase quantities before fitting
# (presumably the confidence weight alpha of implicit-feedback ALS -- TODO confirm)
alpha_val = 15
# product_train is customers x items, so its transpose is items x customers; cast to float64 for fitting.
# NOTE(review): whether fit() expects item-user or user-item orientation depends on the
# installed implicit version -- confirm against the version in use.
data_conf = (product_train.T * alpha_val).astype('double')
model.fit(data_conf)


100%|████████████████████████████████████████████████████████████████████████████████| 10.0/10 [00:00<00:00, 10.59it/s]


In [30]:
user_vecs = model.user_factors
item_vecs = model.item_factors

In [31]:
# Dot product of the user and item vectors gives the predictions 
predictions = (user_vecs).dot(item_vecs.T)

#### Evaluating the recommendation system

In [32]:
from sklearn import metrics

def auc_score(predictions, test):
    """
    Area under the ROC curve of the scores in `predictions` measured against the labels in `test`.

    Note the argument order: scores first, labels second (roc_curve takes them the other way round).
    """
    roc_points = metrics.roc_curve(test, predictions)
    false_positive_rate, true_positive_rate = roc_points[0], roc_points[1]
    return metrics.auc(false_positive_rate, true_positive_rate)

In [33]:
def calc_mean_auc(training_set, altered_users, predictions, test_set):
    """
    Mean per-user AUC of the factor model alongside a popularity baseline.

    Parameters:
        training_set  - sparse user x item matrix in which some interactions were hidden
        altered_users - iterable of user row indices to evaluate
        predictions   - two-element sequence [user factors, transposed item factors], both sparse,
                        so that predictions[0][u, :].dot(predictions[1]) scores every item for user u
        test_set      - sparse user x item matrix holding the complete (binarized) interactions

    Returns:
        (mean model AUC, mean popularity AUC), each rounded to three decimal places
    """
    # Column sums of the complete data act as the popularity score of each item
    item_popularity = np.asarray(test_set.sum(axis = 0)).ravel()
    item_factors_t = predictions[1]

    model_aucs = []     # AUC of the factor model, one entry per evaluated user
    baseline_aucs = []  # AUC of the popularity ranking, one entry per evaluated user
    for user in altered_users:
        # Restrict the evaluation to items with no interaction in the training data
        seen_in_training = training_set[user,:].toarray().reshape(-1)
        candidates = np.where(seen_in_training == 0)

        # Model scores for those items: user factor row times the transposed item factors
        user_factors_row = predictions[0][user,:]
        all_scores = user_factors_row.dot(item_factors_t).toarray()
        pred = all_scores[0,candidates].reshape(-1)

        # Ground-truth labels and popularity scores for the same items
        actual = test_set[user,:].toarray()[0,candidates].reshape(-1)
        pop = item_popularity[candidates]

        model_aucs.append(auc_score(pred, actual))
        baseline_aucs.append(auc_score(pop, actual))

    mean_model_auc = float('%.3f'%np.mean(model_aucs))
    mean_baseline_auc = float('%.3f'%np.mean(baseline_aucs))
    return mean_model_auc, mean_baseline_auc

In [34]:
calc_mean_auc(product_train, product_users_altered, 
              [sparse.csr_matrix(user_vecs), sparse.csr_matrix(item_vecs.T)], product_test)
# AUC for our recommender system

(0.729, 0.713)

The above results show the mean AUC score of the model on the masked items (0.729) compared with the benchmark AUC score obtained by ranking items by popularity (0.713).

In [35]:
customers_arr = np.array(customers) # Array of customer IDs from the ratings matrix
products_arr = np.array(products) # Array of product IDs from the ratings matrix

In [36]:
from sklearn.preprocessing import MinMaxScaler

In [37]:
def rec_items(customer_id, mf_train, user_vecs, item_vecs, customer_list, item_list, item_lookup, num_items = 10,
              exclude_purchased = False):
    """
    Rank all items for one customer and return the `num_items` best as a dataframe.

    Parameters:
        customer_id       - value looked up in `customer_list` to find the customer's row
        mf_train          - sparse customer x item matrix of training quantities
        user_vecs         - array of user factors, one row per customer
        item_vecs         - array of item factors, one row per item
        customer_list     - array of customer ids aligned with the rows of `mf_train`
        item_list         - array of item ids aligned with the columns of `mf_train`
        item_lookup       - dataframe with columns 'HASH_STOCK_ID' and 'StockCode'
        num_items         - number of recommendations to return
        exclude_purchased - when True, items with a training quantity above zero are given
                            weight 0 so that only items not yet purchased are ranked on score

    Returns:
        DataFrame with columns ['HASH_STOCK_ID', 'StockCode'], best recommendation first.

    Raises:
        IndexError - if `customer_id` does not occur in `customer_list`.

    With the default `exclude_purchased=False`, each min-max-scaled score is multiplied by
    (training quantity + 1), so already purchased items are boosted, NOT removed.
    """
    matches = np.where(customer_list == customer_id)[0]
    if len(matches) == 0:
        # Same exception type as the bare [0][0] lookup, but with a readable message
        raise IndexError('customer_id {!r} not found in customer_list'.format(customer_id))
    cust_ind = matches[0] # Row index of our customer id

    purchased = mf_train[cust_ind,:].toarray().reshape(-1) # Training quantities of this customer
    weights = purchased + 1 # Items not purchased yet get weight 1, purchased ones quantity + 1
    if exclude_purchased:
        weights[weights > 1] = 0 # Make everything already purchased zero

    rec_vector = user_vecs[cust_ind,:].dot(item_vecs.T) # Dot product of user vector and all item vectors
    # Scale this recommendation vector between 0 and 1
    rec_vector_scaled = MinMaxScaler().fit_transform(rec_vector.reshape(-1,1))[:,0]
    recommend_vector = weights*rec_vector_scaled

    # Column indices sorted from best to worst, truncated to the requested length
    product_idx = np.argsort(recommend_vector)[::-1][:num_items]

    codes = [item_list[index] for index in product_idx]
    stock = [item_lookup.StockCode.loc[item_lookup.HASH_STOCK_ID == code].iloc[0] for code in codes]

    final_frame = pd.DataFrame({'HASH_STOCK_ID': codes,'StockCode': stock})
    return final_frame[['HASH_STOCK_ID','StockCode']] # Fix the column order explicitly

In [38]:
product_users_altered[:10]

[1, 2, 3, 5, 6, 7, 9, 11, 12, 13]

In [39]:
rec_items(1, product_train, user_vecs, item_vecs, customers_arr, products_arr, item_lookup,
                       num_items = 10)

Unnamed: 0,HASH_STOCK_ID,StockCode
0,1,84985AB
1,2,85018DI
2,3052,84925FR
3,1651,85167BE
4,2941,72801GY
5,1444,62096AY
6,2430,84691N
7,831,84534BQ
8,1560,23071Z
9,2875,79149BC


#### Precision of the model

In [41]:
def precision_at_k(ratings,products_altered_list, k=5, user_factors=None, item_factors=None):
    """
    Mean precision@k of the factor model over a set of users.

    Parameters:
        ratings               - sparse user x item matrix; the stored column indices of a
                                user's row are treated as that user's relevant items
        products_altered_list - iterable of user row indices to evaluate (despite the name,
                                these are users, not products)
        k                     - number of top-scored items considered per user
        user_factors          - optional user factor array; defaults to the notebook-level `user_vecs`
        item_factors          - optional item factor array; defaults to the notebook-level `item_vecs`

    Returns:
        Mean over the given users of |top-k items intersected with relevant items| / k.
    """
    if user_factors is None:
        user_factors = user_vecs
    if item_factors is None:
        item_factors = item_vecs

    ratings = ratings.tocsr()
    # The score matrix is identical for every user, so compute it once instead of per iteration
    predictions = (user_factors).dot(item_factors.T)

    precisions = []
    for user in products_altered_list:
        top_k = np.argsort(-predictions[user, :])[:k] # Indices of the k highest scores
        labels = ratings.getrow(user).indices
        precision = float(len(set(top_k) & set(labels))) / float(k)
        precisions.append(precision)
    return np.mean(precisions)

In [42]:
precision_at_k(product_test,product_users_altered, k=5)

0.12015706806282722