In [2]:
# using Implicit feedback for RecSys
import pandas as pd
import scipy.sparse as sparse
import numpy as np
from scipy.sparse.linalg import spsolve
from __future__ import division

In [5]:
data_dir = "/home/padam/Desktop/data1.csv"
df = pd.read_csv(data_dir)

In [6]:
df.head()

Unnamed: 0,id_plants,hits,botanical_name,userid
0,1,6,Abelmoschus crinitus,53
1,2,45,Abelmoschus ficulneus,371
2,3,32,Abelmoschus moschatus,129
3,4,49,Abies pindrow,81
4,5,19,Abies spectabilis,244


In [7]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1109 entries, 0 to 1108
Data columns (total 4 columns):
id_plants         1109 non-null int64
hits              1109 non-null int64
botanical_name    1109 non-null object
userid            1109 non-null int64
dtypes: int64(3), object(1)
memory usage: 34.7+ KB


In [8]:
clean_df = df.loc[pd.isnull(df.userid) == False]

In [9]:
clean_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1109 entries, 0 to 1108
Data columns (total 4 columns):
id_plants         1109 non-null int64
hits              1109 non-null int64
botanical_name    1109 non-null object
userid            1109 non-null int64
dtypes: int64(3), object(1)
memory usage: 43.3+ KB


In [11]:
# Item lookup

item_lookup = clean_df[['id_plants' , 'botanical_name']].drop_duplicates()

In [12]:
item_lookup['id_plants'] = item_lookup.id_plants.astype(str)

In [14]:
# String lookup done
item_lookup.shape

(1109, 2)

In [15]:
clean_df['userid'] = clean_df.userid.astype(int)

In [16]:
clean_df.head()

Unnamed: 0,id_plants,hits,botanical_name,userid
0,1,6,Abelmoschus crinitus,53
1,2,45,Abelmoschus ficulneus,371
2,3,32,Abelmoschus moschatus,129
3,4,49,Abies pindrow,81
4,5,19,Abies spectabilis,244


In [18]:
clean_df = clean_df[['id_plants' , 'hits' , 'userid']]


In [19]:
clean_df.head()

Unnamed: 0,id_plants,hits,userid
0,1,6,53
1,2,45,371
2,3,32,129
3,4,49,81
4,5,19,244


In [20]:
clean_df.shape

(1109, 3)

In [22]:
group_clean = clean_df.groupby(['userid', 'id_plants']).sum().reset_index() # Group together
group_clean.hits.loc[group_clean.hits == 0] = 1 # Indicate purchased
grouped_purchased = group_clean.query('hits > 0') 
grouped_purchased.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1109 entries, 0 to 1108
Data columns (total 3 columns):
userid       1109 non-null int64
id_plants    1109 non-null int64
hits         1109 non-null int64
dtypes: int64(3)
memory usage: 34.7 KB


In [23]:
grouped_purchased.shape

(1109, 3)

In [27]:
# Create sparse matrix
customers = list(np.sort(grouped_purchased.userid.unique())) # Get our unique customers
products = list(grouped_purchased.id_plants.unique()) # Get our unique products that were purchased
quantity = list(grouped_purchased.hits) # All of our purchases

rows = grouped_purchased.userid.astype('category', categories = customers).cat.codes 
# Get the associated row indices
cols = grouped_purchased.id_plants.astype('category', categories = products).cat.codes 
# Get the associated column indices
purchases_sparse = sparse.csr_matrix((quantity, (rows, cols)), shape=(len(customers), len(products)))


In [28]:
purchases_sparse

<373x1109 sparse matrix of type '<type 'numpy.int64'>'
	with 1109 stored elements in Compressed Sparse Row format>

In [29]:
matrix_size = purchases_sparse.shape[0]*purchases_sparse.shape[1] # Number of possible interactions in the matrix
num_purchases = len(purchases_sparse.nonzero()[0]) # Number of items interacted with
sparsity = 100*(1-(float(num_purchases)/float(matrix_size)))

print(sparsity) 

99.7319034853


In [30]:
import random


In [37]:
# def make_train(ratings , pct = 0.2):
#     # Train test split with 20%
#     random.seed(0)
#     test_set = ratings.copy()
#     test_set[test_set !=0] =1 #Binary Preference
#     training_set= ratings.copy()
#     nonzero_i = training_Set.nonzero( # Indices with interaction
#     nonzero_pairs = list(zip(nonzero_i[0] , nonzero_i[1]))
#     num_samples = int(np.ceil(pct*len(nonzero_pairs)))
#     samples = random.sample(nonzero_pairs , num_samples)
#     user_i = [index[0] for index in samples]
#     item_i = [index[1] for index in samples]
#     training_set[user_i , item_i] = 0
#     training_set.eliminate_zeros()
#     return training_set , test_set , list(set(user_i))

SyntaxError: invalid syntax (<ipython-input-37-5c3143fe82ea>, line 9)

In [31]:
def make_train(ratings , pct = 0.2):
    # Train test split with 20%
    random.seed(0)
    test_set = ratings.copy()
    test_set[test_set !=0] =1 #Binary Preference
    training_set= ratings.copy()
    nonzero_i = training_set.nonzero() # Indices with interaction
    nonzero_pairs = list(zip(nonzero_i[0] , nonzero_i[1]))
    num_samples = int(np.ceil(pct*len(nonzero_pairs)))
    samples = random.sample(nonzero_pairs , num_samples)
    user_i = [index[0] for index in samples]
    item_i = [index[1] for index in samples]
    training_set[user_i , item_i] = 0
    training_set.eliminate_zeros()
    return training_set , test_set , list(set(user_i))

In [32]:
p_train , p_test , p_user_alt = make_train(purchases_sparse , pct=0.2)

In [33]:
p_train

<373x1109 sparse matrix of type '<type 'numpy.int64'>'
	with 887 stored elements in Compressed Sparse Row format>

In [34]:
len(p_user_alt)

177

In [35]:
# Implementing ALS uisng Implicit

import implicit


In [36]:
alpha = 15
user_vec , item_vec= implicit.alternating_least_squares((p_train*alpha).astype('double') ,
                                                        factors = 20,
                                                        regularization = 0.1,
                                                        iterations = 50
                                                         )

No handlers could be found for logger "implicit"


In [37]:
user_vec.shape

(373, 20)

In [38]:
from sklearn import metrics

def auc_score(predictions , test):
    fpr, tpr, thresholds = metrics.roc_curve(test, predictions)
    return metrics.auc(fpr , tpr)

def cal_mean_auc(training_set , alt_users , predictions , test_set):
    store_auc = [] # Buffer for any alt user
    pop_auc = [] # Popular AUC scores
    pop_items = np.array(test_set.sum(axis = 0)).reshape(-1) # Sum of item iteractions
    item_vec = predictions[1]
    for user in alt_users:
        training_row = training_set[user,:].toarray().reshape(-1)
        zero_i = np.where(training_row==0) # NO iteraction
        user_vec = predictions[0][user , :]
        pred = user_vec.dot(item_vec).toarray()[0 , zero_i].reshape(-1)
        actual = test_set[user , :].toarray()[0 , zero_i].reshape(-1)
        pop = pop_items[zero_i]
        store_auc.append(auc_score(pop , actual))
        pop_auc.append(auc_score(pop , actual))
    return float('%.3f'%np.mean(store_auc)) , float('%.3f'%np.mean(pop_auc))

In [39]:
cal_mean_auc(p_train ,p_user_alt , [sparse.csr_matrix(user_vec),sparse.csr_matrix(item_vec.T)] , p_test)

(0.5, 0.5)

In [40]:
from sklearn.preprocessing import MinMaxScaler
customers_arr = np.array(customers) # Array of customer IDs from the ratings matrix
products_arr = np.array(products) # Array of product IDs from the ratings matrix

In [51]:
def rec_items(customer_id, mf_train, user_vecs, item_vecs, customer_list, item_list, item_lookup, num_items = 10):
    cust_ind = np.where(customer_list == customer_id)[0][0] # Returns the index row of our customer id
    pref_vec = mf_train[cust_ind,:].toarray() # Get the ratings from the training set ratings matrix
    pref_vec = pref_vec.reshape(-1) + 1 # Add 1 to everything, so that items not purchased yet become equal to 1
    pref_vec[pref_vec > 1] = 0 # Make everything already purchased zero
    rec_vector = user_vecs[cust_ind,:].dot(item_vecs.T) # Get dot product of user vector and all item vectors
    # Scale this recommendation vector between 0 and 1
    min_max = MinMaxScaler()
    rec_vector_scaled = min_max.fit_transform(rec_vector.reshape(-1,1))[:,0] 
    recommend_vector = pref_vec*rec_vector_scaled 
    # Items already purchased have their recommendation multiplied by zero
    product_idx = np.argsort(recommend_vector)[::-1][:num_items] # Sort the indices of the items into order 
    # of best recommendations
    rec_list = [] # start empty list to store items
    for index in product_idx:
        code = item_list[index]
        rec_list.append([code, item_lookup.botanical_name.loc[item_lookup.id_plants == code].iloc[0]]) 
        # Append our descriptions to the list
    codes = [item[0] for item in rec_list]
    botanical_name = [item[1] for item in rec_list]
    final_frame = pd.DataFrame({'id_plants': codes, 'botanical_name': botanical_name}) # Create a dataframe 
    return final_frame[['id_plants', 'botanical_name']] # Switch order of columns around

In [None]:
rec_items(150, p_train, user_vec, item_vec, customers_arr, products_arr, item_lookup,
                       num_items = 1)

#Demo