In [1]:
# using Implicit feedback for RecSys
import pandas as pd
import scipy.sparse as sparse
import numpy as np
from scipy.sparse.linalg import spsolve
from __future__ import division

In [2]:
data_dir = "/home/padam/Documents/modi/Online Retail.xlsx"
df = pd.read_excel(data_dir)

In [3]:
df.head()

Unnamed: 0,InvoiceNo,StockCode,Description,Quantity,InvoiceDate,UnitPrice,CustomerID,Country
0,536365,85123A,WHITE HANGING HEART T-LIGHT HOLDER,6,2010-12-01 08:26:00,2.55,17850.0,United Kingdom
1,536365,71053,WHITE METAL LANTERN,6,2010-12-01 08:26:00,3.39,17850.0,United Kingdom
2,536365,84406B,CREAM CUPID HEARTS COAT HANGER,8,2010-12-01 08:26:00,2.75,17850.0,United Kingdom
3,536365,84029G,KNITTED UNION FLAG HOT WATER BOTTLE,6,2010-12-01 08:26:00,3.39,17850.0,United Kingdom
4,536365,84029E,RED WOOLLY HOTTIE WHITE HEART.,6,2010-12-01 08:26:00,3.39,17850.0,United Kingdom


In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 541909 entries, 0 to 541908
Data columns (total 8 columns):
InvoiceNo      541909 non-null object
StockCode      541909 non-null object
Description    540455 non-null object
Quantity       541909 non-null int64
InvoiceDate    541909 non-null datetime64[ns]
UnitPrice      541909 non-null float64
CustomerID     406829 non-null float64
Country        541909 non-null object
dtypes: datetime64[ns](1), float64(2), int64(1), object(4)
memory usage: 33.1+ MB


In [5]:
clean_df = df.loc[pd.isnull(df.CustomerID) == False]

In [6]:
clean_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 406829 entries, 0 to 541908
Data columns (total 8 columns):
InvoiceNo      406829 non-null object
StockCode      406829 non-null object
Description    406829 non-null object
Quantity       406829 non-null int64
InvoiceDate    406829 non-null datetime64[ns]
UnitPrice      406829 non-null float64
CustomerID     406829 non-null float64
Country        406829 non-null object
dtypes: datetime64[ns](1), float64(2), int64(1), object(4)
memory usage: 27.9+ MB


In [7]:
# Item lookup

item_lookup = clean_df[['StockCode' , 'Description']].drop_duplicates()

In [8]:
item_lookup['StockCode'] = item_lookup.StockCode.astype(str)

In [9]:
# String lookup done
item_lookup.shape

(3916, 2)

In [10]:
clean_df['CustomerID'] = clean_df.CustomerID.astype(int)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.


In [11]:
clean_df.head()

Unnamed: 0,InvoiceNo,StockCode,Description,Quantity,InvoiceDate,UnitPrice,CustomerID,Country
0,536365,85123A,WHITE HANGING HEART T-LIGHT HOLDER,6,2010-12-01 08:26:00,2.55,17850,United Kingdom
1,536365,71053,WHITE METAL LANTERN,6,2010-12-01 08:26:00,3.39,17850,United Kingdom
2,536365,84406B,CREAM CUPID HEARTS COAT HANGER,8,2010-12-01 08:26:00,2.75,17850,United Kingdom
3,536365,84029G,KNITTED UNION FLAG HOT WATER BOTTLE,6,2010-12-01 08:26:00,3.39,17850,United Kingdom
4,536365,84029E,RED WOOLLY HOTTIE WHITE HEART.,6,2010-12-01 08:26:00,3.39,17850,United Kingdom


In [12]:
clean_df = clean_df[['StockCode' , 'Quantity' , 'CustomerID']]


In [13]:
clean_df.head()

Unnamed: 0,StockCode,Quantity,CustomerID
0,85123A,6,17850
1,71053,6,17850
2,84406B,8,17850
3,84029G,6,17850
4,84029E,6,17850


In [14]:
clean_df.shape

(406829, 3)

In [15]:
group_clean = clean_df.groupby(['CustomerID', 'StockCode']).sum().reset_index() # Group together
group_clean.Quantity.loc[group_clean.Quantity == 0] = 1 # Indicate purchased
grouped_purchased = group_clean.query('Quantity > 0') 
grouped_purchased.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 266723 entries, 0 to 267614
Data columns (total 3 columns):
CustomerID    266723 non-null int64
StockCode     266723 non-null object
Quantity      266723 non-null int64
dtypes: int64(2), object(1)
memory usage: 8.1+ MB


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self._setitem_with_indexer(indexer, value)


In [16]:
grouped_purchased.shape

(266723, 3)

In [17]:
# Create sparse matrix
customers = list(np.sort(grouped_purchased.CustomerID.unique())) # Get our unique customers
products = list(grouped_purchased.StockCode.unique()) # Get our unique products that were purchased
quantity = list(grouped_purchased.Quantity) # All of our purchases

rows = grouped_purchased.CustomerID.astype('category', categories = customers).cat.codes 
# Get the associated row indices
cols = grouped_purchased.StockCode.astype('category', categories = products).cat.codes 
# Get the associated column indices
purchases_sparse = sparse.csr_matrix((quantity, (rows, cols)), shape=(len(customers), len(products)))


In [30]:
purchases_sparse

<4338x3664 sparse matrix of type '<type 'numpy.int64'>'
	with 266723 stored elements in Compressed Sparse Row format>

In [19]:
matrix_size = purchases_sparse.shape[0]*purchases_sparse.shape[1] # Number of possible interactions in the matrix
num_purchases = len(purchases_sparse.nonzero()[0]) # Number of items interacted with
sparsity = 100*(1-(float(num_purchases)/float(matrix_size)))

print(sparsity) 

98.3219092069


In [22]:
import random


In [37]:
# def make_train(ratings , pct = 0.2):
#     # Train test split with 20%
#     random.seed(0)
#     test_set = ratings.copy()
#     test_set[test_set !=0] =1 #Binary Preference
#     training_set= ratings.copy()
#     nonzero_i = training_Set.nonzero( # Indices with interaction
#     nonzero_pairs = list(zip(nonzero_i[0] , nonzero_i[1]))
#     num_samples = int(np.ceil(pct*len(nonzero_pairs)))
#     samples = random.sample(nonzero_pairs , num_samples)
#     user_i = [index[0] for index in samples]
#     item_i = [index[1] for index in samples]
#     training_set[user_i , item_i] = 0
#     training_set.eliminate_zeros()
#     return training_set , test_set , list(set(user_i))

SyntaxError: invalid syntax (<ipython-input-37-5c3143fe82ea>, line 9)

In [42]:
def make_train(ratings , pct = 0.2):
    # Train test split with 20%
    random.seed(0)
    test_set = ratings.copy()
    test_set[test_set !=0] =1 #Binary Preference
    training_set= ratings.copy()
    nonzero_i = training_set.nonzero() # Indices with interaction
    nonzero_pairs = list(zip(nonzero_i[0] , nonzero_i[1]))
    num_samples = int(np.ceil(pct*len(nonzero_pairs)))
    samples = random.sample(nonzero_pairs , num_samples)
    user_i = [index[0] for index in samples]
    item_i = [index[1] for index in samples]
    training_set[user_i , item_i] = 0
    training_set.eliminate_zeros()
    return training_set , test_set , list(set(user_i))

In [43]:
p_train , p_test , p_user_alt = make_train(purchases_sparse , pct=0.2)

In [44]:
p_train

<4338x3664 sparse matrix of type '<type 'numpy.int64'>'
	with 213378 stored elements in Compressed Sparse Row format>

In [50]:
len(p_user_alt)

4054

In [56]:
# Implementing ALS uisng Implicit

import implicit


In [62]:
alpha = 15
user_vec , item_vec= implicit.alternating_least_squares((p_train*alpha).astype('double') ,
                                                        factors = 20,
                                                        regularization = 0.1,
                                                        iterations = 50
                                                         )



In [71]:
user_vec.shape

(4338, 20)

In [75]:
from sklearn import metrics

def auc_score(predictions , test):
    fpr, tpr, thresholds = metrics.roc_curve(test, predictions)
    return metrics.auc(fpr , tpr)

def cal_mean_auc(training_set , alt_users , predictions , test_set):
    store_auc = [] # Buffer for any alt user
    pop_auc = [] # Popular AUC scores
    pop_items = np.array(test_set.sum(axis = 0)).reshape(-1) # Sum of item iteractions
    item_vec = predictions[1]
    for user in alt_users:
        training_row = training_set[user,:].toarray().reshape(-1)
        zero_i = np.where(training_row==0) # NO iteraction
        user_vec = predictions[0][user , :]
        pred = user_vec.dot(item_vec).toarray()[0 , zero_i].reshape(-1)
        actual = test_set[user , :].toarray()[0 , zero_i].reshape(-1)
        pop = pop_items[zero_i]
        store_auc.append(auc_score(pop , actual))
        pop_auc.append(auc_score(pop , actual))
    return float('%.3f'%np.mean(store_auc)) , float('%.3f'%np.mean(pop_auc))

In [76]:
cal_mean_auc(p_train ,p_user_alt , [sparse.csr_matrix(user_vec),sparse.csr_matrix(item_vec.T)] , p_test)

(0.812, 0.812)

In [84]:
from sklearn.preprocessing import MinMaxScaler
customers_arr = np.array(customers) # Array of customer IDs from the ratings matrix
products_arr = np.array(products) # Array of product IDs from the ratings matrix

In [89]:
def rec_items(customer_id, mf_train, user_vecs, item_vecs, customer_list, item_list, item_lookup, num_items = 10):
    cust_ind = np.where(customer_list == customer_id)[0][0] # Returns the index row of our customer id
    pref_vec = mf_train[cust_ind,:].toarray() # Get the ratings from the training set ratings matrix
    pref_vec = pref_vec.reshape(-1) + 1 # Add 1 to everything, so that items not purchased yet become equal to 1
    pref_vec[pref_vec > 1] = 0 # Make everything already purchased zero
    rec_vector = user_vecs[cust_ind,:].dot(item_vecs.T) # Get dot product of user vector and all item vectors
    # Scale this recommendation vector between 0 and 1
    min_max = MinMaxScaler()
    rec_vector_scaled = min_max.fit_transform(rec_vector.reshape(-1,1))[:,0] 
    recommend_vector = pref_vec*rec_vector_scaled 
    # Items already purchased have their recommendation multiplied by zero
    product_idx = np.argsort(recommend_vector)[::-1][:num_items] # Sort the indices of the items into order 
    # of best recommendations
    rec_list = [] # start empty list to store items
    for index in product_idx:
        code = item_list[index]
        rec_list.append([code, item_lookup.Description.loc[item_lookup.StockCode == code].iloc[0]]) 
        # Append our descriptions to the list
    codes = [item[0] for item in rec_list]
    descriptions = [item[1] for item in rec_list]
    final_frame = pd.DataFrame({'StockCode': codes, 'Description': descriptions}) # Create a dataframe 
    return final_frame[['StockCode', 'Description']] # Switch order of columns around

In [100]:
rec_items(12355, p_train, user_vec, item_vec, customers_arr, products_arr, item_lookup,
                       num_items = 10)

#Demo

Unnamed: 0,StockCode,Description
0,72741,GRAND CHOCOLATECANDLE
1,23078,ICE CREAM PEN LIP GLOSS
2,22646,CERAMIC STRAWBERRY CAKE MONEY BANK
3,22644,CERAMIC CHERRY CAKE MONEY BANK
4,22645,CERAMIC HEART FAIRY CAKE MONEY BANK
5,21108,FAIRY CAKE FLANNEL ASSORTED COLOUR
6,22699,ROSES REGENCY TEACUP AND SAUCER
7,21232,STRAWBERRY CERAMIC TRINKET BOX
8,37446,MINI CAKE STAND WITH HANGING CAKES
9,37449,CERAMIC CAKE STAND + HANGING CAKES
