# Retail Rocket Recommender System

In [0]:
!pip install implicit

Collecting implicit
[?25l  Downloading https://files.pythonhosted.org/packages/2e/d3/162237c5b41bef34faf5fce513535bc84808113dd3b2497c2437222a9bfc/implicit-0.3.8.tar.gz (783kB)
[K     |████████████████████████████████| 788kB 2.8MB/s 
Building wheels for collected packages: implicit
  Building wheel for implicit (setup.py) ... [?25l[?25hdone
  Stored in directory: /root/.cache/pip/wheels/16/6c/e7/54f2a48bd3bd8a90f5b456a3e7b2ee687e111dbe25fc9fbf9d
Successfully built implicit
Installing collected packages: implicit
Successfully installed implicit-0.3.8


In [0]:
import datetime
import numpy as np
import pandas as pd
# import scipy and implicit libraries
import scipy.sparse as sp
from scipy.sparse import vstack
from scipy import sparse
from scipy.sparse.linalg import spsolve
import implicit

In [0]:
# Read csv file and load pandas dataframe
df = pd.read_csv('sample_data/events.csv')

In [0]:
# Creates a dictionary to map user/item id to sparse_matrix index
def create_id_to_index_dic(ids_list):
    count = 0
    ids_dic = dict()
    for iden in ids_list:
        ids_dic[iden] = count
        count += 1
    return ids_dic
# Create a dictionary to map sparse_matrix index to user/item id
def create_index_to_id_dic(ids_list):
    count = 0
    ids_dic = dict()
    for iden in ids_list:
        ids_dic[count] = iden
        count += 1
    return ids_dic

def create_confidence_matrix(train_df, user_to_item_matrix, user_id_to_index_dic, item_id_to_index_dic):

    # Create the confidence matrix
    action_weights = [1,2,3]
    for row in train_df.itertuples():

        user_id = row[2]
        item_id = row[4]
        value = 0
        if row.event == 'view':
            value = action_weights[0]
        elif row.event == 'addtocart':
            value = action_weights[1]       
        elif row.event == 'transaction':
            value = action_weights[2]

        previous_value = user_to_item_matrix[user_id_to_index_dic[user_id], item_id_to_index_dic[item_id]]
        user_to_item_matrix[user_id_to_index_dic[user_id], item_id_to_index_dic[item_id]] = previous_value + value
        
    return user_to_item_matrix

def create_confidence_matrix2(train_df, user_to_item_matrix, user_id_to_index_dic, item_id_to_index_dic):

    # Create the confidence matrix
    action_weights = [1,2,3]
    for row in train_df.itertuples():

        user_id = row[2]
        item_id = row[4]
        value = 0
        if row.event == 'view':
            value = action_weights[0]
        elif row.event == 'addtocart':
            value = action_weights[1]       
        elif row.event == 'transaction':
            value = action_weights[2]

        previous_value = user_to_item_matrix[user_id_to_index_dic[user_id], item_id_to_index_dic[item_id]]
        if value > previous_value:
            user_to_item_matrix[user_id_to_index_dic[user_id], item_id_to_index_dic[item_id]] = value
        
    return user_to_item_matrix

def find_sparsity(user_to_item_matrix):
    sparsity = float(len(user_to_item_matrix.nonzero()[0]))
    sparsity /= (user_to_item_matrix.shape[0] * user_to_item_matrix.shape[1])
    sparsity = 1 - sparsity
    sparsity *= 100
    print (f"Sparsity = {sparsity}")
    
def get_user_activity_count(df):
    user_activity_count = dict()
    for row in df.itertuples():
        if row.visitorid not in user_activity_count:
            user_activity_count[row.visitorid] = {'view':0 , 'addtocart':0, 'transaction':0};
        if row.event == 'addtocart':
            user_activity_count[row.visitorid]['addtocart'] += 1 
        elif row.event == 'transaction':
            user_activity_count[row.visitorid]['transaction'] += 1
        elif row.event == 'view':
            user_activity_count[row.visitorid]['view'] += 1
            
    return user_activity_count

def find_total_user_activities(activities):
    total = 0
   
    for key in activities.keys():
        total += activities[key]
            
    return total

def find_hit_ratio(model, user_to_item_trained, test_users_activities, test_df, user_id_to_index_dic, index_to_item_id_dic,filter_already_liked_items=True):
    hits = dict()
    test_set_userids = set(test_df['visitorid'].unique())
    train_set_userids = set(user_id_to_index_dic.keys())
    #return the common index
    matching_users = train_set_userids.intersection(test_set_userids)
    print(f"Total # of common userIds in TrainSet and TestSet = {len(matching_users)}")
    # Iterate through the test set
    for user_id in list(matching_users):
        # Find all the items user actually performed view/add/transact
        item_ids = set(test_df[(test_df.visitorid == int(user_id))]['itemid'].tolist())
        if user_id in user_id_to_index_dic.keys():
            # Find the top 100 recommendations
            recommendations = model.recommend(user_id_to_index_dic[user_id], user_to_item_trained, N=100, filter_already_liked_items=filter_already_liked_items)
            # convert sparse_matrix_indices to item_id
            rec_item_ids = [ index_to_item_id_dic[i[0]] for i in recommendations if i[0] in index_to_item_id_dic.keys()]
            # Check if there there is any hit between user operations and recommendations
            hit = list(item_ids.intersection(set(rec_item_ids)))
            if hit:
                hit_ratio = len(hit) * 100 / find_total_user_activities(test_users_activities[user_id])
                #print(f"Hit Ratio for user_id: {user_id} =  {hit_ratio:.3f}")
                hits[user_id] = hit_ratio
    print(f"Total # of userIds for successful Recommendation = {len(hits)}")
    print(f"Total Coverage of Test dataset = {len(hits)/len(matching_users) * 100}")
    return hits

def train_test_split(df, num_days=1):
    last_day = max(df['date'])
    if num_days == 1:
        test_df = df[(df.date == last_day)]
        train_df = df[(df.date != last_day)]
    elif num_days > 1:
        test_df = df[(df.date <= last_day) & (df.date > last_day + datetime.timedelta(-num_days))]
        train_df = df[(df.date <= last_day + datetime.timedelta(-num_days))]
        
    print(f"Training set length = {len(train_df)}")
    print(f"Test set length = {len(test_df)}")
    
    test_df.reset_index(drop=True, inplace=True)
    train_df.sort_values('date',inplace=True)
    
    return train_df, test_df

def filter_data_by_events_count(df, min_events_count=2):
    """
     This method will delete all the records for users whose total_events_count < min_events_count
    """
    grouped_df = df.groupby('visitorid').count()
    ids_to_delete = list(grouped_df[(grouped_df.event < min_events_count)].index)
    
    df.set_index('visitorid', drop=False, inplace=True)
    df.drop(ids_to_delete, inplace=True)
    df.reset_index(drop=True, inplace=True)
    
    return df

## Filter Data with user events count < 10 and Test Data for last day

In [0]:
#filter data with events count less than 10
events_less_than_two = filter_data_by_events_count(df, 10);
df['date'] = df['timestamp'].apply(lambda tt: datetime.date.fromtimestamp(tt/1000))
#take last day data as testing data
train_df, test_df = train_test_split(events_less_than_two, 1)

Training set length = 583072
Test set length = 232


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


In [0]:
# Find unique user_ids in training set
training_user_ids = sorted(train_df['visitorid'].unique())
user_id_to_index_dic = create_id_to_index_dic(training_user_ids)
index_to_user_id_dic = create_index_to_id_dic(training_user_ids)

# Find unique item_ids in training set
training_item_ids = sorted(train_df['itemid'].unique())
item_id_to_index_dic = create_id_to_index_dic(training_item_ids)
index_to_item_id_dic = create_index_to_id_dic(training_item_ids)

In [0]:
len(training_user_ids) 
len(training_item_ids)

77050

In [0]:
# Initiate a confidence_matrix
user_to_item_matrix = sp.dok_matrix((len(user_id_to_index_dic), len(item_id_to_index_dic)), dtype=np.int8)
user_to_item_matrix = create_confidence_matrix(train_df, user_to_item_matrix, user_id_to_index_dic, item_id_to_index_dic)
find_sparsity(user_to_item_matrix)
model = implicit.als.AlternatingLeastSquares(factors=250, iterations=20, use_gpu=False)
alpha = 50
confidence_item_to_user = user_to_item_matrix.T * alpha
model.fit(confidence_item_to_user)
user_to_item_trained = confidence_item_to_user.T.tocsr()
test_users_activities = get_user_activity_count(test_df)



Sparsity = 99.98226386593797


100%|██████████| 20.0/20 [05:41<00:00, 17.37s/it]


In [0]:
# filter_already_liked_items set to true
hit_ratio = find_hit_ratio(model, user_to_item_trained, test_users_activities, test_df, user_id_to_index_dic, index_to_item_id_dic,True)

Total # of common userIds in TrainSet and TestSet = 54
Total # of userIds for successful Recommendation = 7
Total Coverage of Test dataset = 12.962962962962962


In [0]:
# filter_already_liked_items set to false
hit_ratio = find_hit_ratio(model, user_to_item_trained, test_users_activities, test_df, user_id_to_index_dic, index_to_item_id_dic,False)

Total # of common userIds in TrainSet and TestSet = 54
Total # of userIds for successful Recommendation = 26
Total Coverage of Test dataset = 48.148148148148145


### create_confidence_matrix2 Method

In [0]:
# Initiate a confidence_matrix
#filter data with use count <10 and took last 1 day data as
user_to_item_matrix_1 = sp.dok_matrix((len(user_id_to_index_dic), len(item_id_to_index_dic)), dtype=np.int8)
user_to_item_matrix_1 = create_confidence_matrix2(train_df, user_to_item_matrix_1, user_id_to_index_dic, item_id_to_index_dic)
find_sparsity(user_to_item_matrix_1)
model = implicit.als.AlternatingLeastSquares(factors=250, iterations=20, use_gpu=False)
alpha = 50
confidence_item_to_user_1 = user_to_item_matrix_1.T * alpha
model.fit(confidence_item_to_user_1)
user_to_item_trained_1 = confidence_item_to_user_1.T.tocsr()
test_users_activities_1 = get_user_activity_count(test_df)


Sparsity = 99.98226386593797


100%|██████████| 20.0/20 [06:28<00:00, 20.00s/it]


In [0]:
# filter_already_liked_items set to true
hit_ratio = find_hit_ratio(model, user_to_item_trained_1, test_users_activities_1, test_df, user_id_to_index_dic, index_to_item_id_dic,True)

Total # of common userIds in TrainSet and TestSet = 54
Total # of userIds for successful Recommendation = 9
Total Coverage of Test dataset = 16.666666666666664


In [0]:
# filter_already_liked_items set to false
hit_ratio = find_hit_ratio(model, user_to_item_trained_1, test_users_activities_1, test_df, user_id_to_index_dic, index_to_item_id_dic,False)

Total # of common userIds in TrainSet and TestSet = 54
Total # of userIds for successful Recommendation = 38
Total Coverage of Test dataset = 70.37037037037037


### Changed alpha from 50 to 60

In [0]:
# Initiate a confidence_matrix
user_to_item_matrix_2 = sp.dok_matrix((len(user_id_to_index_dic), len(item_id_to_index_dic)), dtype=np.int8)
user_to_item_matrix_2 = create_confidence_matrix(train_df, user_to_item_matrix_2, user_id_to_index_dic, item_id_to_index_dic)
find_sparsity(user_to_item_matrix_2)
model = implicit.als.AlternatingLeastSquares(factors=250, iterations=20, use_gpu=False)
alpha = 60
confidence_item_to_user_2 = user_to_item_matrix_2.T * alpha
model.fit(confidence_item_to_user_2)
user_to_item_trained_2 = confidence_item_to_user_2.T.tocsr()
test_users_activities_2 = get_user_activity_count(test_df)

Sparsity = 99.98226386593797


100%|██████████| 20.0/20 [05:50<00:00, 18.05s/it]


In [0]:
# filter_already_liked_items set to true
hit_ratio = find_hit_ratio(model, user_to_item_trained_2, test_users_activities_2, test_df, user_id_to_index_dic, index_to_item_id_dic,True)

Total # of common userIds in TrainSet and TestSet = 54
Total # of userIds for successful Recommendation = 10
Total Coverage of Test dataset = 18.51851851851852


In [0]:
# filter_already_liked_items set to false
hit_ratio = find_hit_ratio(model, user_to_item_trained_2, test_users_activities_2, test_df, user_id_to_index_dic, index_to_item_id_dic,False)

Total # of common userIds in TrainSet and TestSet = 54
Total # of userIds for successful Recommendation = 29
Total Coverage of Test dataset = 53.70370370370371


### create_confidence_matrix2 Method

In [0]:
# Initiate a confidence_matrix
user_to_item_matrix_3 = sp.dok_matrix((len(user_id_to_index_dic), len(item_id_to_index_dic)), dtype=np.int8)
user_to_item_matrix_3 = create_confidence_matrix2(train_df, user_to_item_matrix_3, user_id_to_index_dic, item_id_to_index_dic)
find_sparsity(user_to_item_matrix_3)
model = implicit.als.AlternatingLeastSquares(factors=250, iterations=20, use_gpu=False)
  alpha = t,ll.m
confidence_item_to_user_3 = user_to_item_matrix_3.T * alpha
model.fit(confidence_item_to_user_3)
user_to_item_trained_3 = confidence_item_to_user_3.T.tocsr()
test_users_activities_3 = get_user_activity_count(test_df)

Sparsity = 99.98226386593797


100%|██████████| 20.0/20 [06:03<00:00, 19.36s/it]


In [0]:
# filter_already_liked_items set to true
hit_ratio = find_hit_ratio(model, user_to_item_trained_3, test_users_activities_3, test_df, user_id_to_index_dic, index_to_item_id_dic,True)

Total # of common userIds in TrainSet and TestSet = 54
Total # of userIds for successful Recommendation = 9
Total Coverage of Test dataset = 16.666666666666664


In [0]:
# filter_already_liked_items set to false
hit_ratio = find_hit_ratio(model, user_to_item_trained_3, test_users_activities_3, test_df, user_id_to_index_dic, index_to_item_id_dic,False)

Total # of common userIds in TrainSet and TestSet = 54
Total # of userIds for successful Recommendation = 38
Total Coverage of Test dataset = 70.37037037037037


## Filter Data with user events count < 10 and Test Data for last 5 days

In [0]:
# Read csv file and load pandas dataframe
df_1 = pd.read_csv('sample_data/events.csv')
#filter data with events count less than 10
events_less_than_ten = filter_data_by_events_count(df_1, 10);
df_1['date'] = df_1['timestamp'].apply(lambda tt: datetime.date.fromtimestamp(tt/1000))
#take last day data as testing data
train_df_1, test_df_1 = train_test_split(events_less_than_ten, 5)

Training set length = 571795
Test set length = 11509


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


In [0]:
# Find unique user_ids in training set
training_user_ids_1 = sorted(train_df_1['visitorid'].unique())
user_id_to_index_dic_1 = create_id_to_index_dic(training_user_ids_1)
index_to_user_id_dic_1 = create_index_to_id_dic(training_user_ids_1)

# Find unique item_ids in training set
training_item_ids_1 = sorted(train_df_1['itemid'].unique())
item_id_to_index_dic_1 = create_id_to_index_dic(training_item_ids_1)
index_to_item_id_dic_1 = create_index_to_id_dic(training_item_ids_1)

In [0]:
len(training_user_ids_1)
len(user_id_to_index_dic_1)

23013

In [0]:
len(training_item_ids_1)

76317

### Alpha set to 50

In [0]:
# Initiate a confidence_matrix
user_to_item_matrix_4 = sp.dok_matrix((len(user_id_to_index_dic_1), len(item_id_to_index_dic_1)), dtype=np.int8)
user_to_item_matrix_4 = create_confidence_matrix(train_df_1, user_to_item_matrix_4, user_id_to_index_dic_1, item_id_to_index_dic_1)
find_sparsity(user_to_item_matrix_4)
model = implicit.als.AlternatingLeastSquares(factors=250, iterations=20, use_gpu=False)
alpha = 50
confidence_item_to_user_4 = user_to_item_matrix_4.T * alpha
model.fit(confidence_item_to_user_4)
user_to_item_trained_4 = confidence_item_to_user_4.T.tocsr()
test_users_activities_4 = get_user_activity_count(test_df_1)

Sparsity = 99.98226464763708


100%|██████████| 20.0/20 [06:21<00:00, 22.68s/it]


In [0]:
# filter_already_liked_items set to true
hit_ratio = find_hit_ratio(model, user_to_item_trained_4, test_users_activities_4, test_df_1, user_id_to_index_dic_1, index_to_item_id_dic_1,True)

Total # of common userIds in TrainSet and TestSet = 1020
Total # of userIds for successful Recommendation = 153
Total Coverage of Test dataset = 15.0


In [0]:
# filter_already_liked_items set to false
hit_ratio = find_hit_ratio(model, user_to_item_trained_4, test_users_activities_4, test_df_1, user_id_to_index_dic_1, index_to_item_id_dic_1,False)

Total # of common userIds in TrainSet and TestSet = 1020
Total # of userIds for successful Recommendation = 389
Total Coverage of Test dataset = 38.13725490196078


### Alpha set to 60

In [0]:
# Initiate a confidence_matrix
user_to_item_matrix_5 = sp.dok_matrix((len(user_id_to_index_dic_1), len(item_id_to_index_dic_1)), dtype=np.int8)
user_to_item_matrix_5 = create_confidence_matrix(train_df_1, user_to_item_matrix_5, user_id_to_index_dic_1, item_id_to_index_dic_1)
find_sparsity(user_to_item_matrix_5)
model = implicit.als.AlternatingLeastSquares(factors=250, iterations=20, use_gpu=False)
alpha = 60
confidence_item_to_user_5 = user_to_item_matrix_5.T * alpha
model.fit(confidence_item_to_user_5)
user_to_item_trained_5 = confidence_item_to_user_5.T.tocsr()
test_users_activities_5 = get_user_activity_count(test_df_1)

Sparsity = 99.98226464763708


100%|██████████| 20.0/20 [06:41<00:00, 17.99s/it]


In [0]:
# filter_already_liked_items set to true
hit_ratio = find_hit_ratio(model, user_to_item_trained_5, test_users_activities_5, test_df_1, user_id_to_index_dic_1, index_to_item_id_dic_1,True)

Total # of common userIds in TrainSet and TestSet = 1020
Total # of userIds for successful Recommendation = 158
Total Coverage of Test dataset = 15.490196078431373


In [0]:
# filter_already_liked_items set to false
hit_ratio = find_hit_ratio(model, user_to_item_trained_5, test_users_activities_5, test_df_1, user_id_to_index_dic_1, index_to_item_id_dic_1,False)

Total # of common userIds in TrainSet and TestSet = 1020
Total # of userIds for successful Recommendation = 402
Total Coverage of Test dataset = 39.411764705882355


### Use create_confidence_matrix2 Method

In [0]:
# Initiate a confidence_matrix
user_to_item_matrix_6 = sp.dok_matrix((len(user_id_to_index_dic_1), len(item_id_to_index_dic_1)), dtype=np.int8)
user_to_item_matrix_6 = create_confidence_matrix2(train_df_1, user_to_item_matrix_6, user_id_to_index_dic_1, item_id_to_index_dic_1)
find_sparsity(user_to_item_matrix_6)
model = implicit.als.AlternatingLeastSquares(factors=250, iterations=20, use_gpu=False)
alpha = 60
confidence_item_to_user_6 = user_to_item_matrix_6.T * alpha
model.fit(confidence_item_to_user_6)
user_to_item_trained_6 = confidence_item_to_user_6.T.tocsr()
test_users_activities_6 = get_user_activity_count(test_df_1)

Sparsity = 99.98226464763708


100%|██████████| 20.0/20 [06:05<00:00, 18.14s/it]


In [0]:
# filter_already_liked_items set to true
hit_ratio = find_hit_ratio(model, user_to_item_trained_6, test_users_activities_6, test_df_1, user_id_to_index_dic_1, index_to_item_id_dic_1,True)

Total # of common userIds in TrainSet and TestSet = 1020
Total # of userIds for successful Recommendation = 178
Total Coverage of Test dataset = 17.45098039215686


In [0]:
# filter_already_liked_items set to true
hit_ratio = find_hit_ratio(model, user_to_item_trained_6, test_users_activities_6, test_df_1, user_id_to_index_dic_1, index_to_item_id_dic_1,False)

Total # of common userIds in TrainSet and TestSet = 1020
Total # of userIds for successful Recommendation = 505
Total Coverage of Test dataset = 49.50980392156863


# Filter Data with user events count < 5 and Test Data for last 3 days

In [0]:
# Read csv file and load pandas dataframe
df_2 = pd.read_csv('sample_data/events.csv')
#filter data with events count less than 10
events_less_than_five = filter_data_by_events_count(df_2, 5);
df_2['date'] = df_2['timestamp'].apply(lambda tt: datetime.date.fromtimestamp(tt/1000))
#take last day data as testing data
train_df_2, test_df_2 = train_test_split(events_less_than_five, 3)

Training set length = 941151
Test set length = 7386


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


In [0]:
# Find unique user_ids in training set
training_user_ids_7 = sorted(train_df_2['visitorid'].unique())
user_id_to_index_dic_3 = create_id_to_index_dic(training_user_ids_7)
index_to_user_id_dic_3 = create_index_to_id_dic(training_user_ids_7)

# Find unique item_ids in training set
training_item_ids_7 = sorted(train_df_2['itemid'].unique())
item_id_to_index_dic_3 = create_id_to_index_dic(training_item_ids_7)
index_to_item_id_dic_3 = create_index_to_id_dic(training_item_ids_7)

In [0]:
# Initiate a confidence_matrix
user_to_item_matrix_7 = sp.dok_matrix((len(user_id_to_index_dic_3), len(item_id_to_index_dic_3)), dtype=np.int8)
user_to_item_matrix_7 = create_confidence_matrix(train_df_2, user_to_item_matrix_7, user_id_to_index_dic_3, item_id_to_index_dic_3)
find_sparsity(user_to_item_matrix_7)
model = implicit.als.AlternatingLeastSquares(factors=250, iterations=20, use_gpu=False)
alpha = 60
confidence_item_to_user_7 = user_to_item_matrix_7.T * alpha
model.fit(confidence_item_to_user_7)
user_to_item_trained_7 = confidence_item_to_user_7.T.tocsr()
test_users_activities_7 = get_user_activity_count(test_df_2)

Sparsity = 99.99357762304004


100%|██████████| 20.0/20 [12:34<00:00, 36.76s/it]


In [0]:
# filter_already_liked_items set to true
hit_ratio = find_hit_ratio(model, user_to_item_trained_7, test_users_activities_7, test_df_2, user_id_to_index_dic_3, index_to_item_id_dic_3,True)

Total # of common userIds in TrainSet and TestSet = 919
Total # of userIds for successful Recommendation = 134
Total Coverage of Test dataset = 14.58106637649619


In [0]:
# filter_already_liked_items set to false
hit_ratio = find_hit_ratio(model, user_to_item_trained_7, test_users_activities_7, test_df_2, user_id_to_index_dic_3, index_to_item_id_dic_3,False)

Total # of common userIds in TrainSet and TestSet = 919
Total # of userIds for successful Recommendation = 339
Total Coverage of Test dataset = 36.88792165397171


## Filtering Data with Events less than 2 and test data of last 2 days

In [0]:
df = pd.read_csv('sample_data/events.csv')
#filter data with events less than 2
events_less_than_two = filter_data_by_events_count(df, 2);

In [0]:
events_less_than_two['date'] = events_less_than_two['timestamp'].apply(lambda tt: datetime.date.fromtimestamp(tt/1000))

In [0]:
#take last 2 days data as testing data
train_df, test_df = train_test_split(events_less_than_two, 2)

Training set length = 1747518
Test set length = 7023


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


In [0]:
# Find unique user_ids in training set
training_user_ids = sorted(train_df['visitorid'].unique())
user_id_to_index_dic = create_id_to_index_dic(training_user_ids)
index_to_user_id_dic = create_index_to_id_dic(training_user_ids)

# Find unique item_ids in training set
training_item_ids = sorted(train_df['itemid'].unique())
item_id_to_index_dic = create_id_to_index_dic(training_item_ids)
index_to_item_id_dic = create_index_to_id_dic(training_item_ids)

In [0]:
# Initiate a confidence_matrix
user_to_item_matrix = sp.dok_matrix((len(user_id_to_index_dic), len(item_id_to_index_dic)), dtype=np.int8)
user_to_item_matrix = create_confidence_matrix2(train_df, user_to_item_matrix, user_id_to_index_dic, item_id_to_index_dic)

In [0]:
find_sparsity(user_to_item_matrix)

Sparsity = 99.99830919191533


### Training with Factors=250, iterations=20, Alpha = 50

In [0]:
model = implicit.als.AlternatingLeastSquares(factors=250, iterations=20, use_gpu=False)
alpha = 50
confidence_item_to_user = user_to_item_matrix.T * alpha

In [0]:
model.fit(confidence_item_to_user)

100%|██████████| 20.0/20 [35:54<00:00, 103.87s/it]


In [0]:
user_to_item_trained = confidence_item_to_user.T.tocsr()
test_users_activities = get_user_activity_count(test_df)

### filter_already_liked_items set to true

In [0]:
#filter_already_liked_items set to true
hit_ratio = find_hit_ratio(model, user_to_item_trained, test_users_activities, test_df, user_id_to_index_dic, index_to_item_id_dic,True)

Total # of common userIds in TrainSet and TestSet = 1300
Total # of userIds for successful Recommendation = 152
Total Coverage of Test dataset = 11.692307692307692


### filter_already_liked_items set to false

In [0]:
#filter_already_liked_items set to false
hit_ratio = find_hit_ratio(model, user_to_item_trained, test_users_activities, test_df, user_id_to_index_dic, index_to_item_id_dic, False)

Total # of common userIds in TrainSet and TestSet = 1300
Total # of userIds for successful Recommendation = 426
Total Coverage of Test dataset = 32.76923076923077


### Training with Factors=250, iterations=20, Alpha = 60

In [0]:
model = implicit.als.AlternatingLeastSquares(factors=250, iterations=20, use_gpu=False)
alpha = 60
confidence_item_to_user = user_to_item_matrix.T * alpha

In [0]:
model.fit(confidence_item_to_user)

100%|██████████| 20.0/20 [35:43<00:00, 97.25s/it] 


In [0]:
user_to_item_trained = confidence_item_to_user.T.tocsr()
test_users_activities = get_user_activity_count(test_df)

In [0]:
#filter_already_liked_items set to true
hhit_ratio = find_hit_ratio(model, user_to_item_trained, test_users_activities, test_df, user_id_to_index_dic, index_to_item_id_dic,True)

Total # of common userIds in TrainSet and TestSet = 1300
Total # of userIds for successful Recommendation = 148
Total Coverage of Test dataset = 11.384615384615385


In [0]:
#filter_already_liked_items set to true
hhit_ratio = find_hit_ratio(model, user_to_item_trained, test_users_activities, test_df, user_id_to_index_dic, index_to_item_id_dic,False)

Total # of common userIds in TrainSet and TestSet = 1300
Total # of userIds for successful Recommendation = 427
Total Coverage of Test dataset = 32.84615384615385
