In [4]:
# Installing the implicit package 
# !pip install implicit

In [9]:
import pandas as pd
import numpy as np
import scipy.sparse as sparse
# import implicit
import random
import warnings
warnings.filterwarnings('ignore')

#### Loading Train and test sets

In [16]:
# Training transactions; the CSV is read relative to the notebook's working directory
df_train = pd.read_csv(filepath_or_buffer='train_5UKooLv.csv')
df_train.head(n=5)

Unnamed: 0,CustomerID,InvoiceNo,Quantity,InvoiceDate,UnitPrice,Country,StockCode
0,27270,27270,7,01/12/10 8:26,2.55,PX,85123AY
1,27270,27270,7,01/12/10 8:26,3.39,PX,71053R
2,27270,27270,9,01/12/10 8:26,2.75,PX,84406BH
3,27270,27270,7,01/12/10 8:26,3.39,PX,84029GV
4,27270,27270,7,01/12/10 8:26,3.39,PX,84029EX


In [17]:
# Test transactions, restricted to the columns that the training frame has
df_test = pd.read_csv(
    filepath_or_buffer='test_J1hm2KQ.csv',
    usecols=df_train.columns,
)
df_test.head(n=5)

Unnamed: 0,Country,CustomerID,InvoiceDate,InvoiceNo,Quantity,StockCode,UnitPrice
0,PX,127269,01/12/10 8:28,127269,7,22633V,1.85
1,PX,227268,01/12/10 8:34,227268,38,84879M,1.69
2,PX,227268,01/12/10 8:34,227268,7,22748P,2.1
3,PX,227268,01/12/10 8:34,227268,9,22749K,3.75
4,PX,227268,01/12/10 8:34,227268,2,22622G,9.95


#### EDA train and test data

In [22]:
# Distinct customers and distinct stock codes in the train set
print(f"No of unique customers in train_set - {df_train['CustomerID'].nunique()}")
print(f"No of unique items in train_set - {df_train['StockCode'].nunique()}")

No of unique customers in train_set - 972
No of unique items in train_set - 3810


In [23]:
# Distinct customers and distinct stock codes in the test set
print(f"No of unique customers in test_set - {df_test['CustomerID'].nunique()}")
print(f"No of unique items in test_set - {df_test['StockCode'].nunique()}")

No of unique customers in test_set - 628
No of unique items in test_set - 3522


In [34]:
# (rows, columns) of the train and test frames
print(f'Train_shape  - {df_train.shape}')
print(f'Test_shape  - {df_test.shape}')

Train_shape  - (330575, 7)
Test_shape  - (103097, 7)


In [45]:
# Split the train-set stock codes by whether the same code also occurs in the test set
train_code_in_test = df_train['StockCode'].isin(df_test['StockCode'])
print(f"No of items in train set that are in test set - {df_train.loc[train_code_in_test, 'StockCode'].nunique()}")
print(f"No of items in train set that are not in test set - {df_train.loc[~train_code_in_test, 'StockCode'].nunique()}")

No of items in train set that are in test set - 3472
No of items in train set that are not in test set - 338


In [46]:
# Split the test-set stock codes by whether the same code also occurs in the train set
test_code_in_train = df_test['StockCode'].isin(df_train['StockCode'])
print(f"No of items in test set that are in train set - {df_test.loc[test_code_in_train, 'StockCode'].nunique()}")
print(f"No of items in test set that are not in train set - {df_test.loc[~test_code_in_train, 'StockCode'].nunique()}")

No of items in test set that are in train set - 3472
No of items in test set that are not in train set - 50


There are 338 stock codes that appear only in the train set, whereas 50 stock codes appear only in the test set.

In [48]:
# Number of StockCode entries recorded per customer in the train set (first 10 customers)
df_train.groupby('CustomerID')['StockCode'].count().to_frame().head(10)

Unnamed: 0_level_0,StockCode
CustomerID,Unnamed: 1_level_1
0,390
900,826
1800,736
2790,393
4590,252
5400,399
6300,240
6390,638
7200,263
7290,1020


In [49]:
# Number of StockCode entries recorded per customer in the test set (first 10 customers)
df_test.groupby('CustomerID')['StockCode'].count().to_frame().head(10)

Unnamed: 0_level_0,StockCode
CustomerID,Unnamed: 1_level_1
1890,8
2700,534
3600,223
3690,269
4500,224
5490,207
8190,350
9090,443
13680,142
14490,147


It is given that the test set contains only 50% of each customer's transactions, and we need to recommend the remaining 50% of the transactions.

In [55]:
# Per-column count of train rows whose Quantity is zero or negative
df_train.loc[df_train['Quantity'] <= 0].count()

CustomerID     5588
InvoiceNo      5588
Quantity       5588
InvoiceDate    5588
UnitPrice      5588
Country        5588
StockCode      5588
dtype: int64

In [56]:
# Per-column count of test rows whose Quantity is zero or negative
df_test.loc[df_test['Quantity'] <= 0].count()

Country        1762
CustomerID     1762
InvoiceDate    1762
InvoiceNo      1762
Quantity       1762
StockCode      1762
UnitPrice      1762
dtype: int64

It seems that 5588 rows in the train set and 1762 rows in the test set have a non-positive quantity, i.e. items that were returned to the store or received for free.

We have seen that 338 stock codes appear only in the train set and 50 only in the test set. Let us append the train rows for those 338 additional stock codes to the test set, so that we can form a user-item matrix from the test set.

In [57]:
# Train rows whose stock code never occurs in the test set
df_train_additional = df_train.loc[~df_train['StockCode'].isin(df_test['StockCode'])]

In [61]:
# Stack the test rows on top of the train-only-stock-code rows; original row labels are kept
test_train_merge = pd.concat(objs=[df_test, df_train_additional], axis=0)

#### Pre-processing data

In [74]:
# Keep only the columns needed for the interaction matrix, with CustomerID cast to int
test_train_merge = (
    test_train_merge[['StockCode', 'Quantity', 'CustomerID']]
    .astype({'CustomerID': int})
)

# Total quantity per (customer, stock code) pair
merged = test_train_merge.groupby(['CustomerID', 'StockCode']).sum().reset_index()

# A net total of zero is replaced with one to indicate that the item was purchased.
# This is done with a single .loc call: the chained form `merged.Quantity.loc[...] = 1`
# can end up writing to a temporary object and leave `merged` unchanged.
merged.loc[merged['Quantity'] == 0, 'Quantity'] = 1

# Keep only pairs with a positive total; .copy() gives an independent frame so that
# columns can be added to it later without touching (or being tied to) `merged`.
grouped_purchased = merged.query('Quantity > 0').copy()

In [75]:
# Give every distinct stock code and customer id a dense 0-based integer index
# (in the order returned by .unique()) and keep both directions of each mapping.
unique_stock_codes = grouped_purchased['StockCode'].unique().tolist()
STOCKCODE_to_idx = {code: position for position, code in enumerate(unique_stock_codes)}
idx_to_STOCKCODE = dict(enumerate(unique_stock_codes))

unique_customer_ids = grouped_purchased['CustomerID'].unique().tolist()
# NOTE: the "CUDTOMER" spelling is kept on purpose; the name is referenced elsewhere in the notebook.
CUDTOMER_ID_to_idx = {customer: position for position, customer in enumerate(unique_customer_ids)}
idx_to_CUSTOMER_ID = dict(enumerate(unique_customer_ids))

In [76]:
# Attach the integer indices for customer and stock code to every interaction row
grouped_purchased = grouped_purchased.assign(
    HASH_CUSTOMER_ID=grouped_purchased['CustomerID'].map(CUDTOMER_ID_to_idx),
    HASH_STOCK_ID=grouped_purchased['StockCode'].map(STOCKCODE_to_idx),
)
display(grouped_purchased.head(5))

Unnamed: 0,CustomerID,StockCode,Quantity,HASH_CUSTOMER_ID,HASH_STOCK_ID
0,0,90146B,1,0,0
2,1800,84985AB,1,1,1
3,1800,85018DI,1,1,2
4,1800,85231bJ,1,1,3
5,1890,21080R,4,2,4


In [77]:
# One row per distinct stock code together with its integer index
item_lookup = (
    grouped_purchased[['StockCode', 'HASH_STOCK_ID']]
    .drop_duplicates()
    # Store the codes as strings for easier lookups later
    .assign(StockCode=lambda frame: frame['StockCode'].astype(str))
)

In [78]:
# Dimensions of the ratings matrix: distinct customer indices and distinct item indices
n_customers = grouped_purchased['HASH_CUSTOMER_ID'].nunique()
n_items = grouped_purchased['HASH_STOCK_ID'].nunique()

In [81]:
# Dense (customer x item) matrix; cells without a recorded interaction stay 0
data_matrix = np.zeros((n_customers, n_items))

# Populate the matrix in one vectorised assignment. Columns are selected by name,
# so the result no longer depends on the positional layout of grouped_purchased
# (the previous itertuples loop read line[3], line[4], line[5]).
customer_rows = grouped_purchased['HASH_CUSTOMER_ID'].values
item_cols = grouped_purchased['HASH_STOCK_ID'].values
data_matrix[customer_rows, item_cols] = grouped_purchased['Quantity'].values


In [85]:
# Distinct customer indices in ascending order
customers = sorted(grouped_purchased['HASH_CUSTOMER_ID'].unique())
# Distinct product indices, in the order returned by .unique()
products = list(grouped_purchased['HASH_STOCK_ID'].unique())
# Quantity of every (customer, item) row
quantity = list(grouped_purchased['Quantity'])

#### Sparsity of the matrix

In [87]:
# Total number of cells, i.e. every possible customer-item interaction
matrix_size = data_matrix.size
# Cells that actually hold an interaction
num_purchases = int(np.count_nonzero(data_matrix))
# Percentage of cells that are empty
sparsity = 100*(1 - (num_purchases/matrix_size))
print(f'Sparsity of the matrix - {sparsity}')

Sparsity of the matrix - 97.8670149966088


To validate the recommendation model, let us mask some of a customer's values in the data matrix. After generating the recommendations, we check how many of the recommended items were actually bought by the customer.

In [88]:
def make_train(ratings, pct_test = 0.2, seed = 0):
    """Mask a random share of the observed interactions to build a validation split.

    Parameters
    ----------
    ratings : user x item interaction matrix (the notebook passes a dense numpy array).
        It is copied, never modified.
    pct_test : float in [0, 1]
        Fraction of the non-zero user-item pairs to hide from the training set.
        The number of hidden pairs is rounded UP (ceil).
    seed : int
        Seed of the private random generator used for sampling. The default of 0
        selects the same pairs as the earlier `random.seed(0)` based version.

    Returns
    -------
    training_set : scipy.sparse.csr_matrix
        Copy of `ratings` with the sampled pairs set to zero and removed from storage.
    test_set : scipy.sparse.csr_matrix
        Binary matrix (1 wherever `ratings` is non-zero) of ALL interactions,
        including the ones hidden from the training set.
    altered_users : list
        Unique row indices of the users that had at least one pair hidden.

    Raises
    ------
    ValueError
        If `pct_test` is not within [0, 1].
    """
    if not 0 <= pct_test <= 1:
        raise ValueError('pct_test must be between 0 and 1, got {}'.format(pct_test))

    # The test set is the complete interaction matrix stored as binary preferences
    test_set = ratings.copy()
    test_set[test_set != 0] = 1
    test_set = sparse.csr_matrix(test_set)

    # The training set starts as a copy that we are free to alter
    training_set = ratings.copy()
    nonzero_inds = training_set.nonzero()  # (row indices, column indices) of existing interactions
    nonzero_pairs = list(zip(nonzero_inds[0], nonzero_inds[1]))

    # A private generator keeps the split reproducible without reseeding the
    # module-level `random` state that other code may rely on.
    rng = random.Random(seed)
    num_samples = int(np.ceil(pct_test*len(nonzero_pairs)))  # rounded up, not to nearest
    samples = rng.sample(nonzero_pairs, num_samples)  # sampled without replacement

    user_inds = [pair[0] for pair in samples]
    item_inds = [pair[1] for pair in samples]
    training_set[user_inds, item_inds] = 0  # hide the sampled pairs
    training_set = sparse.csr_matrix(training_set)
    training_set.eliminate_zeros()  # drop explicitly stored zeros to save space
    return training_set, test_set, list(set(user_inds))

In [89]:
# Hide 20% of the observed interactions; keep the masked training matrix, the binary
# test matrix and the rows of the users that had interactions hidden.
product_train, product_test, product_users_altered = make_train(ratings=data_matrix, pct_test=0.2)