# Top-k Hit Ratio for Implicit FeedBack

### Algorithm: Alternating Least Square Algorithm
#### Evaluation: Top-k Hit Ratio
#### Implicit library in python
#### Rail-rocket dataset

ALS makes use of Confidence Matrix and Preference matrix and calculates the User factor and Item factors. Once these factors are generated when the algorithm converges ( after a certain number of iterations (20), we can use their inner product to make the top-k recommendations.

Preference Matrix: If there is any interaction between user and item, then 1 else 0

Confidence Matrix: Created based on custom logic based on different weights for different user action. Alpha weight used as per the paper to enhance the confidence factor

View: 1, Add: 2, Transaction:3

Iterations = 20

Latent-factor = 200 ( 20 to 200 as per the paper) to split the sparse matrix

K value - 20, 50, 100 ( best result - 100)

Train Test split ( last 1 day, last 5 days)

In [1]:
import datetime
import numpy as np
import pandas as pd

In [21]:
# import scipy and implicit libraries
import scipy.sparse as sp
from scipy.sparse import vstack
from scipy import sparse
from scipy.sparse.linalg import spsolve
import implicit

In [94]:
# Creates a dictionary to map user/item id to sparse_matrix index
def create_id_to_index_dic(ids_list):
    count = 0
    ids_dic = dict()
    for iden in ids_list:
        ids_dic[iden] = count
        count += 1
    return ids_dic
# Create a dictionary to map sparse_matrix index to user/item id
def create_index_to_id_dic(ids_list):
    count = 0
    ids_dic = dict()
    for iden in ids_list:
        ids_dic[count] = iden
        count += 1
    return ids_dic

def create_confidence_matrix(train_df, user_to_item_matrix, user_id_to_index_dic, item_id_to_index_dic):
    """
        Creates a confidence matrix based on weighted frequency of user events
    """
    # Create the confidence matrix
    action_weights = [1,2,3]
    for row in train_df.itertuples():

        user_id = row[2]
        item_id = row[4]
        value = 0
        if row.event == 'view':
            value = action_weights[0]
        elif row.event == 'addtocart':
            value = action_weights[1]       
        elif row.event == 'transaction':
            value = action_weights[2]

        previous_value = user_to_item_matrix[user_id_to_index_dic[user_id], item_id_to_index_dic[item_id]]
        user_to_item_matrix[user_id_to_index_dic[user_id], item_id_to_index_dic[item_id]] = previous_value + value
        
    return user_to_item_matrix

def create_confidence_matrix2(train_df, user_to_item_matrix, user_id_to_index_dic, item_id_to_index_dic):
    """
        Creates a confidence matrix based on the highest weighted event action taken by the user
    """
    # Create the confidence matrix
    action_weights = [1,2,3]
    for row in train_df.itertuples():

        user_id = row[2]
        item_id = row[4]
        value = 0
        if row.event == 'view':
            value = action_weights[0]
        elif row.event == 'addtocart':
            value = action_weights[1]       
        elif row.event == 'transaction':
            value = action_weights[2]

        previous_value = user_to_item_matrix[user_id_to_index_dic[user_id], item_id_to_index_dic[item_id]]
        if value > previous_value:
            user_to_item_matrix[user_id_to_index_dic[user_id], item_id_to_index_dic[item_id]] = value
        
    return user_to_item_matrix

def find_sparsity(user_to_item_matrix):
    sparsity = float(len(user_to_item_matrix.nonzero()[0]))
    sparsity /= (user_to_item_matrix.shape[0] * user_to_item_matrix.shape[1])
    sparsity = 1 - sparsity
    sparsity *= 100
    print (f"Sparsity = {sparsity}")
    
def get_user_activity_count(df):
    user_activity_count = dict()
    for row in df.itertuples():
        if row.visitorid not in user_activity_count:
            user_activity_count[row.visitorid] = {'view':0 , 'addtocart':0, 'transaction':0};
        if row.event == 'addtocart':
            user_activity_count[row.visitorid]['addtocart'] += 1 
        elif row.event == 'transaction':
            user_activity_count[row.visitorid]['transaction'] += 1
        elif row.event == 'view':
            user_activity_count[row.visitorid]['view'] += 1
            
    return user_activity_count

def find_total_user_activities(activities):
    total = 0
   
    for key in activities.keys():
        total += activities[key]
            
    return total

def find_hit_ratio(model, user_to_item_trained, test_users_activities, test_df, user_id_to_index_dic, index_to_item_id_dic,filter_already_liked_items=True, N=100):
    hits = dict()
    print(f"Find hit-ratio with flag filter_already_liked_items = {filter_already_liked_items}")
    test_set_userids = set(test_df['visitorid'].unique())
    train_set_userids = set(user_id_to_index_dic.keys())
    matching_users = train_set_userids.intersection(test_set_userids)
    print(f"Total # of common userIds in TrainSet and TestSet = {len(matching_users)}")
    # Iterate through the test set
    for user_id in list(matching_users):
        # Find all the items user actually performed view/add/transact
        item_ids = set(test_df[(test_df.visitorid == int(user_id))]['itemid'].tolist())
        if user_id in user_id_to_index_dic.keys():
            # Find the top 100 recommendations
            recommendations = model.recommend(user_id_to_index_dic[user_id], user_to_item_trained, N=N, filter_already_liked_items=filter_already_liked_items)
            # convert sparse_matrix_indices to item_id
            rec_item_ids = [ index_to_item_id_dic[i[0]] for i in recommendations if i[0] in index_to_item_id_dic.keys()]
            # Check if there there is any hit between user operations and recommendations
            hit = list(item_ids.intersection(set(rec_item_ids)))
            if hit:
                hit_ratio = len(hit) * 100 / find_total_user_activities(test_users_activities[user_id])
                #print(f"Hit Ratio for user_id: {user_id} =  {hit_ratio:.3f}")
                hits[user_id] = hit_ratio
    print(f"Total # of userIds for successful Recommendation = {len(hits)}")
    print(f"Total Coverage of Test dataset = {len(hits)/len(matching_users) * 100}")
    return hits

def train_test_split(df, num_days=1):
    """
        Splits the input dataset based on the num_days parameter passed by the user. test_df = #num_days data
        Default num_days=1
        Returns train_df, test_df
    """
    print(f"Spliting the dataframe with test data = {num_days} day(s)")
    last_day = max(df['date'])
    if num_days == 1:
        test_df = df[(df.date == last_day)]
        train_df = df[(df.date != last_day)]
    elif num_days > 1:
        test_df = df[(df.date <= last_day) & (df.date > last_day + datetime.timedelta(-num_days))]
        train_df = df[(df.date <= last_day + datetime.timedelta(-num_days))]
        
    print(f"Training set length = {len(train_df)}")
    print(f"Test set length = {len(test_df)}")
    
    test_df.reset_index(drop=True, inplace=True)
    train_df.sort_values('date',inplace=True)
    
    return train_df, test_df

def filter_data_by_events_count(df, min_events_count=2):
    """
     This method will delete all the records for users whose total_events_count < min_events_count
    """
    print(f"Total Unique users in original df = {len(df['visitorid'].unique())}")
    grouped_df = df.groupby('visitorid').count()
    ids_to_delete = list(grouped_df[(grouped_df.event < min_events_count)].index)
    
    df.set_index('visitorid', drop=False, inplace=True)
    df.drop(ids_to_delete, inplace=True)
    df.reset_index(drop=True, inplace=True)
          
    print(f"Total Unique users in filtered df where # of user transactions >= {min_events_count} = {len(df['visitorid'].unique())}")
    return df

def filter_data_by_items_count(df, min_items_count=2):
    """
     This method will delete all the records for users whose total_item_count < min_items_count
    """
    print(f"Total Unique users in original df = {len(df['visitorid'].unique())}")
    grouped_df = df.groupby('itemid').count()
    ids_to_delete = list(grouped_df[(grouped_df.visitorid < min_items_count)].index)
    
    df.set_index('itemid', drop=False, inplace=True)
    df.drop(ids_to_delete, inplace=True)
    df.reset_index(drop=True, inplace=True)
          
    print(f"Total Unique users in filtered df where # of items >= {min_items_count} = {len(df['visitorid'].unique())}")
    return df    

In [95]:
# Read csv file and load pandas dataframe
df = pd.read_csv('events.csv')

In [96]:
df = filter_data_by_events_count(df,10)

Total Unique users in original df = 1407580
Total Unique users in filtered df where # of user transactions >= 10 = 23241


In [99]:
df.head()

Unnamed: 0,timestamp,visitorid,event,itemid,transactionid,date
0,1433224214164,992329,view,248676,,2015-06-02
1,1433223203944,125625,view,17655,,2015-06-02
2,1433222147345,1076270,view,262799,,2015-06-02
3,1433224644638,361387,view,43485,,2015-06-02
4,1433224303386,503970,view,448136,,2015-06-02


In [98]:
df['date'] = df['timestamp'].apply(lambda tt: datetime.date.fromtimestamp(tt/1000))

In [100]:
train_df, test_df = train_test_split(df, 1)

Spliting the dataframe with test data = 1 day(s)
Training set length = 581458
Test set length = 1846


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


In [101]:
# Find unique user_ids in training set
training_user_ids = sorted(train_df['visitorid'].unique())
user_id_to_index_dic = create_id_to_index_dic(training_user_ids)
index_to_user_id_dic = create_index_to_id_dic(training_user_ids)

# Find unique item_ids in training set
training_item_ids = sorted(train_df['itemid'].unique())
item_id_to_index_dic = create_id_to_index_dic(training_item_ids)
index_to_item_id_dic = create_index_to_id_dic(training_item_ids)

In [102]:
# Initiate a confidence_matrix
user_to_item_matrix = sp.dok_matrix((len(user_id_to_index_dic), len(item_id_to_index_dic)), dtype=np.int8)
user_to_item_matrix = create_confidence_matrix2(train_df, user_to_item_matrix, user_id_to_index_dic, item_id_to_index_dic)

In [103]:
find_sparsity(user_to_item_matrix)

Sparsity = 99.982269305458


In [104]:
model = implicit.als.AlternatingLeastSquares(factors=200, iterations=20)
alpha = 40
confidence_item_to_user = user_to_item_matrix.T * alpha

In [105]:
model.fit(confidence_item_to_user)

100%|███████████████████████████████████████████████████████████████████████████████████████████████| 20.0/20 [03:31<00:00, 11.39s/it]


In [106]:
user_to_item_trained = confidence_item_to_user.T.tocsr()
test_users_activities = get_user_activity_count(test_df)

In [128]:
hit_ratio = find_hit_ratio(model, user_to_item_trained, test_users_activities, test_df, user_id_to_index_dic, index_to_item_id_dic,True,100)

Find hit-ratio with flag filter_already_liked_items = True
Total # of common userIds in TrainSet and TestSet = 228
Total # of userIds for successful Recommendation = 31
Total Coverage of Test dataset = 13.596491228070176


In [129]:
hit_ratio1 = find_hit_ratio(model, user_to_item_trained, test_users_activities, test_df, user_id_to_index_dic, index_to_item_id_dic,False,100)

Find hit-ratio with flag filter_already_liked_items = False
Total # of common userIds in TrainSet and TestSet = 228
Total # of userIds for successful Recommendation = 121
Total Coverage of Test dataset = 53.07017543859649


In [133]:
df1 = pd.read_csv('events.csv')

In [134]:
df1 = filter_data_by_items_count(df1,10)

Total Unique users in original df = 1407580
Total Unique users in filtered df where # of items >= 10 = 1124688


In [146]:
df1 = filter_data_by_events_count(df1,5)

Total Unique users in original df = 1124688
Total Unique users in filtered df where # of user transactions >= 5 = 69528


In [147]:
df1.head()

Unnamed: 0,timestamp,visitorid,event,itemid,transactionid,date
0,1433224214164,992329,view,248676,,2015-06-02
1,1433223291897,794181,view,439202,,2015-06-02
2,1433220899221,824915,view,428805,,2015-06-02
3,1433222531378,57036,view,334662,,2015-06-02
4,1433223203944,125625,view,17655,,2015-06-02


In [135]:
df1['date'] = df1['timestamp'].apply(lambda tt: datetime.date.fromtimestamp(tt/1000))

In [148]:
train_df1, test_df1 = train_test_split(df1, 5)

Spliting the dataframe with test data = 5 day(s)
Training set length = 785848
Test set length = 18502


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


In [149]:
# Find unique user_ids in training set
training_user_ids1 = sorted(train_df1['visitorid'].unique())
user_id_to_index_dic1 = create_id_to_index_dic(training_user_ids1)
index_to_user_id_dic1= create_index_to_id_dic(training_user_ids1)

# Find unique item_ids in training set
training_item_ids1 = sorted(train_df1['itemid'].unique())
item_id_to_index_dic1 = create_id_to_index_dic(training_item_ids1)
index_to_item_id_dic1 = create_index_to_id_dic(training_item_ids1)

In [150]:
# Initiate a confidence_matrix
user_to_item_matrix1 = sp.dok_matrix((len(user_id_to_index_dic1), len(item_id_to_index_dic1)), dtype=np.int8)
user_to_item_matrix1 = create_confidence_matrix2(train_df1, user_to_item_matrix1, user_id_to_index_dic1, item_id_to_index_dic1)

In [151]:
find_sparsity(user_to_item_matrix1)

Sparsity = 99.9864246542473


In [152]:
model1 = implicit.als.AlternatingLeastSquares(factors=200, iterations=20)
alpha = 40
confidence_item_to_user1 = user_to_item_matrix1.T * alpha

In [153]:
model1.fit(confidence_item_to_user1)

100%|███████████████████████████████████████████████████████████████████████████████████████████████| 20.0/20 [02:09<00:00,  6.18s/it]


In [154]:
user_to_item_trained1 = confidence_item_to_user1.T.tocsr()
test_users_activities1 = get_user_activity_count(test_df1)

### Find Hit Ratio

In [155]:
hit_rate = find_hit_ratio(model1, user_to_item_trained1, test_users_activities1, test_df1, user_id_to_index_dic1, index_to_item_id_dic1)

Find hit-ratio with flag filter_already_liked_items = True
Total # of common userIds in TrainSet and TestSet = 1927
Total # of userIds for successful Recommendation = 380
Total Coverage of Test dataset = 19.719771665801762


In [156]:
hit_rate1 = find_hit_ratio(model1, user_to_item_trained1, test_users_activities1, test_df1, user_id_to_index_dic1, index_to_item_id_dic1,False)

Find hit-ratio with flag filter_already_liked_items = False
Total # of common userIds in TrainSet and TestSet = 1927
Total # of userIds for successful Recommendation = 929
Total Coverage of Test dataset = 48.209652309289055


In [123]:
set(hit_rate1).difference(set(hit_rate))

{2519,
 4413,
 7127,
 8484,
 10175,
 29744,
 29802,
 34893,
 37440,
 46696,
 50083,
 54625,
 54791,
 65078,
 67635,
 71167,
 71606,
 81790,
 85375,
 90729,
 91552,
 101827,
 101884,
 103405,
 104829,
 106947,
 107181,
 108997,
 113302,
 115307,
 118111,
 123595,
 125017,
 139033,
 142294,
 147586,
 148121,
 156168,
 163049,
 171718,
 175606,
 189702,
 191124,
 192989,
 198153,
 198424,
 203562,
 205325,
 207723,
 215044,
 215127,
 215168,
 215407,
 216517,
 216607,
 221170,
 258500,
 266147,
 266350,
 266417,
 267013,
 274359,
 274981,
 284146,
 284710,
 292344,
 294507,
 296431,
 298720,
 298790,
 301340,
 303753,
 310759,
 315106,
 316600,
 316850,
 320638,
 320928,
 323368,
 336228,
 338061,
 341710,
 347497,
 357318,
 359914,
 360944,
 361041,
 361366,
 364521,
 365060,
 372979,
 375007,
 378717,
 381220,
 381477,
 381781,
 383831,
 385432,
 391728,
 391789,
 396954,
 399797,
 403525,
 422523,
 427504,
 430319,
 447891,
 459187,
 463794,
 464073,
 475995,
 476359,
 481397,
 484623,

In [81]:
test_df1[(test_df1.visitorid == 108997)]

Unnamed: 0,timestamp,visitorid,event,itemid,transactionid,date
4195,1442256818904,108997,view,73732,,2015-09-14
4943,1442254634154,108997,view,73732,,2015-09-14
6396,1442342240158,108997,view,73732,,2015-09-15
7100,1442355131825,108997,view,73732,,2015-09-15
7307,1442342097630,108997,view,73732,,2015-09-15
8451,1442378239770,108997,view,73732,,2015-09-16
8664,1442342123679,108997,view,73732,,2015-09-15
8986,1442356625154,108997,transaction,73732,11170.0,2015-09-15
8997,1442342154647,108997,view,73732,,2015-09-15
9857,1442430208813,108997,view,73732,,2015-09-16


In [80]:
train_df1[(train_df1.visitorid == 108997)]

Unnamed: 0,timestamp,visitorid,event,itemid,transactionid,date
267993,1441768977640,108997,view,73732,,2015-09-08
267628,1441748657625,108997,view,73732,,2015-09-08
268434,1441748482786,108997,view,73732,,2015-09-08
265779,1441753919181,108997,view,73732,,2015-09-08
265845,1441758169257,108997,view,73732,,2015-09-08
265851,1441757926207,108997,view,73732,,2015-09-08
266273,1441759328462,108997,view,73732,,2015-09-08
279592,1441993913290,108997,view,73732,,2015-09-11
278187,1441955267722,108997,view,73732,,2015-09-11
278206,1441955215476,108997,view,73732,,2015-09-11


In [66]:
df1 = filter_data_by_items_count(df1)

Total Unique users in original df = 1407580
Total Unique users in filtered df where # of items >= 2 = 1361294


In [67]:
df1 = filter_data_by_events_count(df1,10)

Total Unique users in original df = 1361294
Total Unique users in filtered df where # of user transactions >= 10 = 22896


In [68]:
df1.head()

Unnamed: 0,timestamp,visitorid,event,itemid,transactionid
0,1433224214164,992329,view,248676,
1,1433223203944,125625,view,17655,
2,1433222147345,1076270,view,262799,
3,1433224644638,361387,view,43485,
4,1433224303386,503970,view,448136,
