# Preprocessing And Compressed Bit Vectors Evaluation

In [43]:
import os
import numpy as np
import pandas as pd
from scipy.sparse import *
from scipy import *
from sklearn.preprocessing import LabelEncoder

In [44]:
## Load dataset Movielens 100K or 1 million rating or bookrating(1million)

# Dataset source:Movielens 100k: https://grouplens.org/datasets/movielens/100k/
# Movielens 1 million: https://grouplens.org/datasets/movielens/1m/
# Book crossing: https://grouplens.org/datasets/book-crossing/

header_list = ["userid", "movieid", "movieRating", "timestamp"]
# data = pd.read_csv('movie100kratings.dat', sep='::', names=header_list)
data = pd.read_csv('movie100kratings.data', sep='\t')
nRow, nCol = data.shape
print(f'There are {nRow} rows and {nCol} columns')
data.head()

There are 100000 rows and 4 columns


Unnamed: 0,userid,movieid,movieRating,timestamp
0,196,242,3,881250949
1,186,302,3,891717742
2,22,377,1,878887116
3,244,51,2,880606923
4,166,346,1,886397596


In [45]:
data.dtypes

userid         int64
movieid        int64
movieRating    int64
timestamp      int64
dtype: object

In [46]:
## final dataset sample
sample_data = data.drop(['timestamp'],axis=1)
sample_data.head(10)
#contain 1,000,209 anonymous ratings of approximately 3,900 movies made by 6,040 MovieLens

Unnamed: 0,userid,movieid,movieRating
0,196,242,3
1,186,302,3
2,22,377,1
3,244,51,2
4,166,346,1
5,298,474,4
6,115,265,2
7,253,465,5
8,305,451,3
9,6,86,3


In [47]:
frequency = sample_data.groupby('userid').count()
print(len(frequency))

test_users = []
for i in range(len(frequency)):
    fre = frequency.iloc[i]
    break
    if (fre[0] > 100):
        test_users.append(i+1)
# print(len(test_users))

943


In [48]:
test_userid = []
test_movieid = []
test_movieRating = []

In [49]:
print(sample_data.userid.max())


943


create testing set

In [50]:
# if the user has rated more than "50" movies then add him to test dataset
for user in range(1,6040): #max userid value from prev step
    filter_df = sample_data.query("userid == " + str(user))
    if (len(filter_df) < 50):
        continue
    # add 20% of that particular user data to the testdata set
    portion = int(len(filter_df)*(0.2))
    random_indices = set(np.random.choice(len(filter_df), portion))
    for i in random_indices:
        row = filter_df.iloc[i]
        test_userid.append(row.userid)
        test_movieid.append(row.movieid)
        test_movieRating.append(row.movieRating)


In [51]:
print(len(test_userid))
print(len(test_movieid))
print(len(test_movieRating))

15865
15865
15865


In [52]:
test_df_dict = {"userid": test_userid, "movieid":test_movieid, "movieRating":test_movieRating}
convert_dict = {
    "userid" :      int,
    "movieid" :     int,
    "movieRating"  :   int
    }
test_df = pd.DataFrame(test_df_dict)
test_df = test_df.astype(convert_dict)
print(test_df.head())
test_df.shape

   userid  movieid  movieRating
0       1      243            1
1       1      154            5
2       1       29            1
3       1      111            5
4       1       52            4


(15865, 3)

In [53]:
test_df.to_csv("test_ratings.csv",index=False)
# test dataset csv

In [54]:
# data redundancy
# adding unique userid,movieid tuple to the test_tuple_set
#  trying to avoid users who have rated same movie twice
test_tuple_set = set()
print(len(test_userid))
for i in range(len(test_userid)):
    userid_movieid_tuple = (test_userid[i], test_movieid[i])
    if userid_movieid_tuple in test_tuple_set:
        print(userid_movieid_tuple)
        print(":)")
    test_tuple_set.add(userid_movieid_tuple)
print(len(test_tuple_set))
print(len(test_df))

15865
15865
15865


create Training set

In [55]:
def drop_rating(row):
    print(len(row))
    t = (row.userid, row.movieid)
    if t in test_tuple_set:
        row.movieRating = 0.0
    return row

In [56]:
# removing test data values/ratings from the training dataset and making them zero not deleting
for i in range(len(sample_data)):
    row = sample_data.iloc[i]
    t = (row.userid, row.movieid)
    if t in test_tuple_set:
        sample_data.at[i,'movieRating'] = 0.0


In [57]:
sample_data.head(10)

Unnamed: 0,userid,movieid,movieRating
0,196,242,3
1,186,302,3
2,22,377,1
3,244,51,2
4,166,346,1
5,298,474,4
6,115,265,2
7,253,465,5
8,305,451,3
9,6,86,3


In [58]:
sample_data.to_csv("train_ratings.csv", index=False)

In [59]:
len(sample_data.movieid.unique())
# total unique movies/items
# sample_data = whole data

1682

In [60]:
# converting unique movies numpy ndarray to list
unique_movies = sample_data.movieid.unique()
type (list(unique_movies))
print(unique_movies)

[ 242  302  377 ... 1637 1630 1641]


 ***ASSIGN* CONTINUOUS INDEX TO movieid**

In [61]:
# adding index to movie id thru a dict
original_movie_ids = []
mapped_movie_ids = []
movie_dict = {}
count = 1
for movie_id in unique_movies:
    original_movie_ids.append(movie_id)
    mapped_movie_ids.append(count)
    movie_dict[movie_id] = count
    count += 1

In [62]:
print(len(original_movie_ids))
print(len(mapped_movie_ids))

1682
1682


In [63]:
# creating a dataframe with colomn1 as uniquemovieids and column2 as mappedmovieids(aka index) , dataframe index vs movie index
movies_map_df_dict = {"original_movie_ids": original_movie_ids,
                      "mapped_movie_ids":mapped_movie_ids}
movies_map_df = pd.DataFrame(movies_map_df_dict)
print(movies_map_df.head())
movies_map_df.to_csv("mapped_movie_ids.csv",index=False)

   original_movie_ids  mapped_movie_ids
0                 242                 1
1                 302                 2
2                 377                 3
3                  51                 4
4                 346                 5


In [64]:
# adding a column(mapped_movie_id) to orginal wholedataset dataframe sample_data
# len(sample_data)
for i in range(len(sample_data)):
  mapped_movie_id = movie_dict[int(sample_data.iloc[i][1])]
#     print(sample_data.iloc[i][1] ,mapped_movie_id )
  sample_data.loc[sample_data.index[i], 'mapped_movie_id'] = int(mapped_movie_id)


In [65]:
sample_data.head()

Unnamed: 0,userid,movieid,movieRating,mapped_movie_id
0,196,242,3,1.0
1,186,302,3,2.0
2,22,377,1,3.0
3,244,51,2,4.0
4,166,346,1,5.0


In [66]:
users = np.array(sample_data['userid'])
items = np.array(sample_data['movieid'])
ratings = np.array(sample_data['movieRating'])
mapped_movie_ids = np.array(sample_data['mapped_movie_id'])

In [67]:
print(users[3],items[3], ratings[3])

244 51 2


In [68]:
sample_data.movieid.max()

1682

In [69]:
len(users), len(items), len(ratings), len(mapped_movie_ids)

(100000, 100000, 100000, 100000)

In [70]:
len(sample_data.userid.unique())
# unique users in wholedataset

943

In [71]:
len(sample_data.movieid.unique())
# unique movies in wholedataset

1682

In [72]:
# sample_data.to_sparse(fill_value=0)

Create Utility Matrix

In [73]:
# converting original dataframe sample_data to a matrix representation/utility matrix with the help of csr_matrix func of scipy
utility_csr = csr_matrix((ratings, (users , mapped_movie_ids.astype(int))))

In [74]:
utility_matrix = csr_matrix((ratings, (users, mapped_movie_ids.astype(int)))).toarray() # Users x Items
utility_matrix_t = utility_matrix.T
utility_matrix

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 5, 0, ..., 0, 0, 0],
       [0, 5, 5, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]])

In [75]:
print(utility_matrix[users[4]][items[4]])

0


In [76]:
# we have an extra expendable row(likely first row), column in both user_utility_matrix and item_utility_matrix
# shud be 943x1682 / 1682x943 , somewhere csr_matrix func is adding an extra row
print(utility_matrix.shape)
print(utility_matrix_t.shape)

(944, 1683)
(1683, 944)


In [77]:
np.savetxt('users_m.txt', utility_matrix, fmt="%d") # All rows as a USERS
np.savetxt('items_m.txt', utility_matrix_t, fmt="%d") # All rows as a ITEMS

In [78]:
test_data = pd.read_csv('test_ratings.csv')
test_users = test_data.userid.unique()
print(len(test_users))
test_users_list = test_users.tolist()
with open ('test_users.txt', 'w') as fo:
     fo.write(','.join(str(i) for i in test_users_list))
# unique test users in testdataset(test_ratings.csv) which is used in javacode for generating recommendations of nearestKneighbors efficiently with compressed bit vectors

568


In [79]:
## passing(copying) the above 3 txt files
# user_m txt file matrix and
# item_m file matrix, and
# test_users.txt (unique test users in testdataset)
## into java code(intelliJ) for generating recommendation through compressed bit vectors

In [80]:
# below four cells revist

In [81]:
mapped_movie_ids_dict = {}
for key in movie_dict:
    value = movie_dict[key]
    mapped_movie_ids_dict[value] = key

In [82]:
def get_original_movie_ids(mapped_movie_ids):
    original_ids = []
    for movie_id in mapped_movie_ids:
        if int(movie_id) in mapped_movie_ids_dict:
            original_ids.append(mapped_movie_ids_dict[int(movie_id)])
        else:
            original_ids.append(-1)
    return original_ids

In [83]:
temp_str = "6,15,18,19,24,25,26,27,31,32,33,231,291,321,462,470,471,473,477,620"

In [84]:
bsi_mapped_movie_ids = temp_str.split(",")
bsi_original_movie_ids = get_original_movie_ids(bsi_mapped_movie_ids)
print(bsi_original_movie_ids)

[474, 29, 274, 1042, 118, 1, 546, 95, 246, 98, 193, 520, 558, 97, 870, 44, 686, 729, 566, 372]


In [85]:
# now we evaluate compressed bit vector recommendations ( java code txt files)

In [86]:
import os
import numpy as np
import pandas as pd
from scipy.sparse import *
from scipy import *
from sklearn.preprocessing import LabelEncoder

In [87]:
test_data = pd.read_csv('test_ratings.csv')
nRow, nCol = test_data.shape
print(f'There are {nRow} rows and {nCol} columns')

There are 15865 rows and 3 columns


In [88]:
mapped_movieID_df = pd.read_csv('mapped_movie_ids.csv')

In [89]:
test_data

Unnamed: 0,userid,movieid,movieRating
0,1,243,1
1,1,154,5
2,1,29,1
3,1,111,5
4,1,52,4
...,...,...,...
15860,943,840,4
15861,943,1044,3
15862,943,1330,3
15863,943,569,2


In [90]:
mapped_movieID_df.head()

Unnamed: 0,original_movie_ids,mapped_movie_ids
0,242,1
1,302,2
2,377,3
3,51,4
4,346,5


In [91]:
# create dictionaries for movieID mapping
mapped_movieID_dict = {}
# mapped_movieID_df.size
for i in range(len(mapped_movieID_df)):
    row = mapped_movieID_df.loc[i]
#     print(row)
    mapped_movieID_dict[row.original_movie_ids] = row.mapped_movie_ids

In [92]:
original_movieID_dict = {}
for i in range(len(mapped_movieID_df)):
    row = mapped_movieID_df.loc[i]
#     print(row)
    original_movieID_dict[row.mapped_movie_ids] = row.original_movie_ids

In [93]:
# unique users in test data for evaluation
test_users = test_data.userid.unique()
test_users_list = test_users.tolist()
print(test_users_list)
print(len(test_users_list))
# used in the java file to generate test_recommendation_bsi.txt and bsi_recommendations_for_all_user.txt

[1, 2, 3, 5, 6, 7, 8, 10, 11, 12, 13, 14, 15, 16, 18, 21, 22, 23, 24, 25, 26, 28, 37, 38, 41, 42, 43, 44, 48, 49, 52, 54, 56, 57, 58, 59, 60, 62, 63, 64, 65, 69, 70, 72, 73, 75, 76, 77, 79, 81, 82, 83, 84, 85, 87, 89, 90, 91, 92, 94, 95, 96, 97, 99, 100, 101, 102, 104, 106, 109, 110, 113, 115, 116, 117, 118, 119, 121, 122, 123, 125, 128, 130, 135, 138, 141, 144, 145, 148, 151, 152, 154, 157, 158, 159, 160, 161, 164, 167, 168, 174, 176, 177, 178, 180, 181, 183, 184, 186, 187, 188, 189, 190, 193, 194, 195, 197, 198, 200, 201, 206, 207, 210, 213, 214, 215, 216, 217, 218, 221, 222, 223, 224, 226, 227, 230, 232, 233, 234, 235, 236, 239, 243, 244, 246, 248, 249, 250, 251, 253, 254, 255, 256, 257, 262, 263, 264, 267, 268, 269, 270, 271, 272, 274, 275, 276, 277, 279, 280, 283, 286, 287, 288, 290, 291, 292, 293, 294, 295, 296, 297, 298, 299, 301, 303, 305, 307, 308, 311, 312, 313, 314, 315, 316, 318, 320, 321, 322, 323, 324, 325, 326, 327, 328, 329, 330, 331, 332, 334, 336, 338, 339, 342, 343, 

In [94]:
def get_mapped_movieids(original_movieIds):
    mapped_movieIds= []
    for _id in original_movieIds:
        mapped_movieIds.append(mapped_movieID_dict[_id])
    return mapped_movieIds

In [95]:
# get Get movies which is rated (>3.0)/(change depending on dataset) by user in test data for BSI
def get_test_rec_movieIds(userId):
    user_df = test_data[test_data.userid == userId]
    user_movie_df = user_df[user_df.movieRating > 3.0]
    original_movieIds = user_movie_df.movieid.unique()
    mapped_movieIds = get_mapped_movieids(original_movieIds)
    return mapped_movieIds


In [96]:
def find_common(original_ids, rec_ids):
    original_ids_set = set(original_ids)
    common_count = 0
    if rec_ids[-1] == '':
        rec_ids.pop()
    for _id in rec_ids:
        if int(_id) in original_ids_set:
            common_count += 1
    return common_count

In [97]:
mapped_movieID_df.head()

Unnamed: 0,original_movie_ids,mapped_movie_ids
0,242,1
1,302,2
2,377,3
3,51,4
4,346,5


In [98]:
def getOrginalMoviesIds(mapped_movie_ids):
    original_movieIds= []
    for _id in mapped_movie_ids:
        original_movieIds.append(original_movieID_dict[int(_id)])
    return original_movieIds


In [99]:
print(getOrginalMoviesIds(['3','4']))

[377, 51]


In [100]:
### Calculate Precision and Recall

def getPrecision(total_count, common_count):
    if(total_count ==0):
        return 0
    return (common_count/total_count)

def getRecall(total_count, common_count):
    if(total_count ==0):
        return 1
    return (common_count/total_count)

# def hitratio(total_count, hits):
#     if(hits ==0):
#         return 1
#     return (hits/total_count)

def getf1score(precision, recall):
    if(precision == 0 and recall == 0 ):
        return 0
    return ((2 * precision * recall )/ ( precision + recall ))

In [101]:
### Evalute Bsi algorithm for all test users
## After executing the java code, copy the output
# copy test_recommendation_bsi.txt and bsi_recommendations_for_all_users.txt from the java IntelliJ env into colab jupyterNotebook env

In [102]:
file = open('test_recommendation_bsi.txt', 'a')

In [103]:
# get_test_rec_movieIds
count = 0
input_file = open('test_recommendation_bsi.txt','r') # file pulled from java code. CBV

userIds = []
actual_movies_watched = []
recomendations = []
common_movies = []
similar_usres_k = []
precision = []
recall = []
f1score = []

while (True):
    line = input_file.readline()
    if not line:
        break
    count +=1

    values = line.split(", ")
    userid = values[0]
    userK = values[1] # neighborhood size
    no_recomendations = values[2] # no of recommendations per user
    recomendations_for_user = values[3:] # actual recommendations

    recomendations_for_user.pop() ## removing last null(\n) value

    actual_movies_wated_by_users = get_test_rec_movieIds(int(userid)) # movies in test data  # actaul movies watched by user which are greater > 3
    common_count = find_common(actual_movies_wated_by_users, recomendations_for_user) # movies in algo's predictions

#     print(userId,userK,no_recomendations)
#     print(common_count)

    userIds.append(userid)
    similar_usres_k.append(int(userK))
    actual_movies_watched.append(len(actual_movies_wated_by_users))
    recomendations.append(int(no_recomendations))
    common_movies.append(common_count)

    total_count_for_precision = int(no_recomendations) # actual results
    total_count_for_recall = len(actual_movies_wated_by_users)

    temp_Precision = getPrecision(total_count_for_precision, common_count)
    precision.append(temp_Precision)
    temp_recall = getRecall(total_count_for_recall, common_count)
    recall.append(temp_recall)
    f1score.append(getf1score(temp_Precision, temp_recall))

print(count)
# 568(test_users) * userK(4 len of list) * itemK(5 len of list) = 11360 test_recommendation_bsi.txt rows count

2840


In [104]:
# storing precision and recall in new dict and dataframe
result_df_dict = {"userId":userIds,"actual_movies_watched":actual_movies_watched,
                  "recomendations":recomendations,"common_movies":common_movies,
                  "similar_usres_k":similar_usres_k, "precision": precision, "recall": recall, "f1_score": f1score}

In [105]:
result_bsi_df = pd.DataFrame(result_df_dict)
result_bsi_df.head(25)
result_bsi_df = result_bsi_df.astype({"userId": int})
result_bsi_df

Unnamed: 0,userId,actual_movies_watched,recomendations,common_movies,similar_usres_k,precision,recall,f1_score
0,1,28,5,0,5,0.00,0.000000,0.000000
1,1,28,10,0,5,0.00,0.000000,0.000000
2,1,28,25,1,5,0.04,0.035714,0.037736
3,1,28,50,1,5,0.02,0.035714,0.025641
4,1,28,100,2,5,0.02,0.071429,0.031250
...,...,...,...,...,...,...,...,...
2835,943,16,5,1,5,0.20,0.062500,0.095238
2836,943,16,10,1,5,0.10,0.062500,0.076923
2837,943,16,25,1,5,0.04,0.062500,0.048780
2838,943,16,50,2,5,0.04,0.125000,0.060606


In [106]:
result_bsi_df.sort_values('precision', ascending=False)

Unnamed: 0,userId,actual_movies_watched,recomendations,common_movies,similar_usres_k,precision,recall,f1_score
2585,864,38,5,2,5,0.4,0.052632,0.093023
2465,825,12,5,2,5,0.4,0.166667,0.235294
2205,716,31,5,2,5,0.4,0.064516,0.111111
2430,804,39,5,2,5,0.4,0.051282,0.090909
2195,712,19,5,2,5,0.4,0.105263,0.166667
...,...,...,...,...,...,...,...,...
1152,373,22,25,0,5,0.0,0.000000,0.000000
1151,373,22,10,0,5,0.0,0.000000,0.000000
1150,373,22,5,0,5,0.0,0.000000,0.000000
1149,372,10,100,0,5,0.0,0.000000,0.000000


In [107]:
result_bsi_df.sort_values('recall', ascending=False)

Unnamed: 0,userId,actual_movies_watched,recomendations,common_movies,similar_usres_k,precision,recall,f1_score
2299,752,4,100,2,5,0.02,0.5,0.038462
2371,778,2,10,1,5,0.10,0.5,0.166667
2374,778,2,100,1,5,0.01,0.5,0.019608
2373,778,2,50,1,5,0.02,0.5,0.038462
2372,778,2,25,1,5,0.04,0.5,0.074074
...,...,...,...,...,...,...,...,...
1152,373,22,25,0,5,0.00,0.0,0.000000
1151,373,22,10,0,5,0.00,0.0,0.000000
1150,373,22,5,0,5,0.00,0.0,0.000000
1149,372,10,100,0,5,0.00,0.0,0.000000


In [108]:
result_bsi_df.sort_values('f1_score', ascending=False)

Unnamed: 0,userId,actual_movies_watched,recomendations,common_movies,similar_usres_k,precision,recall,f1_score
2370,778,2,5,1,5,0.2,0.500000,0.285714
2465,825,12,5,2,5,0.4,0.166667,0.235294
5,2,6,5,1,5,0.2,0.166667,0.181818
120,41,6,5,1,5,0.2,0.166667,0.181818
2466,825,12,10,2,5,0.2,0.166667,0.181818
...,...,...,...,...,...,...,...,...
1152,373,22,25,0,5,0.0,0.000000,0.000000
1151,373,22,10,0,5,0.0,0.000000,0.000000
1150,373,22,5,0,5,0.0,0.000000,0.000000
1149,372,10,100,0,5,0.0,0.000000,0.000000


In [109]:
result_bsi_df_5 = result_bsi_df.loc[result_bsi_df['recomendations']==5]
result_bsi_df_10 = result_bsi_df.loc[result_bsi_df['recomendations']==10]
result_bsi_df_25 = result_bsi_df.loc[result_bsi_df['recomendations']==25] # final run
result_bsi_df_50 = result_bsi_df.loc[result_bsi_df['recomendations']==50]
result_bsi_df_100 = result_bsi_df.loc[result_bsi_df['recomendations']==100]

In [110]:
result_bsi_df_5.mean()

userId                   461.948944
actual_movies_watched     15.385563
recomendations             5.000000
common_movies              0.144366
similar_usres_k            5.000000
precision                  0.028873
recall                     0.009440
f1_score                   0.012932
dtype: float64

In [111]:
result_bsi_df_5.max()

userId                   943.000000
actual_movies_watched     65.000000
recomendations             5.000000
common_movies              2.000000
similar_usres_k            5.000000
precision                  0.400000
recall                     0.500000
f1_score                   0.285714
dtype: float64

In [112]:
result_bsi_df_10.mean()

userId                   461.948944
actual_movies_watched     15.385563
recomendations            10.000000
common_movies              0.234155
similar_usres_k            5.000000
precision                  0.023415
recall                     0.015109
f1_score                   0.016443
dtype: float64

In [113]:
result_bsi_df_10.max()

userId                   943.000000
actual_movies_watched     65.000000
recomendations            10.000000
common_movies              3.000000
similar_usres_k            5.000000
precision                  0.300000
recall                     0.500000
f1_score                   0.181818
dtype: float64

In [114]:
result_bsi_df_25.mean()

userId                   461.948944
actual_movies_watched     15.385563
recomendations            25.000000
common_movies              0.454225
similar_usres_k            5.000000
precision                  0.018169
recall                     0.030525
f1_score                   0.020597
dtype: float64

In [115]:
result_bsi_df_25.max()

userId                   943.000000
actual_movies_watched     65.000000
recomendations            25.000000
common_movies              5.000000
similar_usres_k            5.000000
precision                  0.200000
recall                     0.500000
f1_score                   0.146341
dtype: float64

In [116]:
result_bsi_df_50.mean()

userId                   461.948944
actual_movies_watched     15.385563
recomendations            50.000000
common_movies              0.737676
similar_usres_k            5.000000
precision                  0.014754
recall                     0.049564
f1_score                   0.021127
dtype: float64

In [117]:
result_bsi_df_100.mean()

userId                   461.948944
actual_movies_watched     15.385563
recomendations           100.000000
common_movies              1.177817
similar_usres_k            5.000000
precision                  0.011778
recall                     0.077181
f1_score                   0.019482
dtype: float64

In [118]:
###

In [119]:
userId = []
recommendations = []
input_file_1 = open('bsi_recommendations_for_all_users.txt','r')
while (True):
    line = input_file_1.readline()
    if not line:
        break
    count +=1
    values = line.split(", ")
    user_id = values[0]
    recomendations_for_user = values[1:]
    recomendations_for_user.pop()
    actual_movie_ids = getOrginalMoviesIds(recomendations_for_user)
    userId.append(user_id)
    recommendations.append(actual_movie_ids)

print("Finished")

Finished


In [120]:
result_all_user_df_dict = {"userID": userId, "recommendations": recommendations}

In [121]:
result_all_user_df = pd.DataFrame(result_all_user_df_dict)

In [122]:
result_all_user_df # one user is missing 685/686 rows are 942 # 3 user missing for 1 mil dataset , no rec for missing user

Unnamed: 0,userID,recommendations
0,1,"[474, 98, 603, 4, 143, 423, 288, 427, 480, 200..."
1,2,"[144, 246, 98, 193, 603, 100, 181, 196, 423, 4..."
2,3,"[302, 222, 144, 1, 234, 246, 98, 332, 100, 288..."
3,4,"[302, 222, 144, 1, 98, 181, 196, 248, 237, 101..."
4,5,"[474, 265, 144, 98, 193, 603, 4, 100, 196, 143..."
...,...,...
937,939,"[265, 144, 1, 234, 98, 194, 181, 196, 143, 423..."
938,940,"[302, 265, 144, 1, 193, 88, 603, 100, 181, 196..."
939,941,"[302, 222, 144, 118, 546, 98, 4, 100, 196, 288..."
940,942,"[474, 144, 1, 98, 194, 603, 181, 196, 143, 526..."


# Collaborative Filtering pyspark ( ALS Baseline)

In [123]:
!pip install pyspark



In [124]:
import os
import numpy as np
from pyspark.sql import SparkSession
from pyspark import SparkContext
import pandas as pd
from pyspark.mllib.recommendation import ALS
import math
import pyspark.sql
from pyspark.sql.functions import udf
from pyspark.sql.types import *
from pyspark.ml.evaluation import RegressionEvaluator
import time


In [125]:
# Calling spark session to register application
spark = SparkSession \
    .builder \
    .appName("Recom") \
    .config("spark.recom.demo", "1") \
    .getOrCreate()
# lambda word: (word, 1)

In [126]:
ratings_df = spark.read \
    .format("csv") \
    .option("header", "true") \
    .option("inferSchema", "true") \
    .load("train_ratings.csv")

In [127]:
ratings_df.show()
ratings_df.count()

+------+-------+-----------+
|userid|movieid|movieRating|
+------+-------+-----------+
|   196|    242|          3|
|   186|    302|          3|
|    22|    377|          1|
|   244|     51|          2|
|   166|    346|          1|
|   298|    474|          4|
|   115|    265|          2|
|   253|    465|          5|
|   305|    451|          3|
|     6|     86|          3|
|    62|    257|          2|
|   286|   1014|          5|
|   200|    222|          5|
|   210|     40|          3|
|   224|     29|          0|
|   303|    785|          3|
|   122|    387|          5|
|   194|    274|          2|
|   291|   1042|          4|
|   234|   1184|          2|
+------+-------+-----------+
only showing top 20 rows



100000

In [128]:
ratings_df = ratings_df.drop('') # reduntant> no values are dropped
ratings_df.count()

100000

### Drop test data from training set

In [129]:
ratings_df = ratings_df[ratings_df.movieRating != 0.0]
ratings_df.summary
ratings_df.count()

84135

In [130]:
(trainingData,validationData,testData) = ratings_df.randomSplit([0.6,0.2,0.2],5) # randomSplit(weights, seed)

In [131]:
trainingData.show()
print(trainingData.count())
print(validationData.count())
print(testData.count())

+------+-------+-----------+
|userid|movieid|movieRating|
+------+-------+-----------+
|     1|      2|          3|
|     1|      3|          4|
|     1|      5|          3|
|     1|      6|          5|
|     1|      7|          4|
|     1|      8|          1|
|     1|     12|          5|
|     1|     13|          5|
|     1|     14|          5|
|     1|     15|          5|
|     1|     16|          5|
|     1|     19|          5|
|     1|     20|          4|
|     1|     22|          4|
|     1|     23|          4|
|     1|     24|          3|
|     1|     25|          4|
|     1|     26|          3|
|     1|     27|          2|
|     1|     28|          4|
+------+-------+-----------+
only showing top 20 rows

50399
16881
16855


In [132]:
validation_for_predict = validationData.select('userid','movieid')
test_for_predict = testData.select('userid','movieid')

In [133]:
test_for_predict.rdd

MapPartitionsRDD[261] at javaToPython at <unknown>:0

## Using the above data for all the comparable baselines

In [134]:
# Common params for ALS
seed = 5
iterations = 10
regularization_parameter = 0.1
ranks = [4, 8, 12]

In [135]:
min_error = 1000
for rank in ranks:
    model = ALS.train(ratings_df, rank, seed=seed, iterations=iterations,
                      lambda_=regularization_parameter)

    #converting prediction into key value pair like key=(userId,movieId) and value = rating
    predictions = model.predictAll(validation_for_predict.rdd).map(lambda r: ((r[0], r[1]), r[2]))

    #joining predicted rating and original ratings to calculate error
    rates_and_preds = validationData.rdd.map(lambda r: ((int(r[0]), int(r[1])), float(r[2]))).join(predictions)

    error = math.sqrt(rates_and_preds.map(lambda r: (r[1][0] - r[1][1])**2).mean())

    print ('For rank',rank, "the RMSE is ", error)
    if error < min_error:
        min_error = error
        best_rank = rank

print ("The best model was trained with rank", best_rank)

For rank 4 the RMSE is  0.8345445460356727
For rank 8 the RMSE is  0.7746039038517858
For rank 12 the RMSE is  0.7369672126682002
The best model was trained with rank 12


In [136]:
predictions_test = model.predictAll(test_for_predict.rdd).map(lambda r: ((r[0], r[1]), r[2]))

In [137]:
predictions_test.take(5)

[((58, 1084), 3.7051529817097295),
 ((74, 1084), 3.2564156245788958),
 ((504, 1084), 3.7377045050400666),
 ((181, 1084), 1.7276882152114181),
 ((537, 1084), 3.11636490909214)]

### Get Recomendations For User

This method returns original movieId where CBV algorithm returns mapped movieId

In [138]:
def getRecommendations(user,ratings_df,trainDf,model, k): #

    userDf = ratings_df.filter(ratings_df.userid == user) # for eg user = 1 ...getting all movies(rows) user1 has rated

    mov = ratings_df.select('movieid').subtract(userDf.select('movieid')) # mov dataframe wont have whatever user1 has rated

    # measure below two only

    pred_rat = model.predictAll(mov.rdd.map(lambda x: (user, x[0]))).collect()

    recommendations = sorted(pred_rat, key=lambda x: x[2], reverse=True)[:k]

    return recommendations

In [139]:
model.predict(1,1084) # make sure product id is in the model

3.795871529474027

In [140]:
user = 1
# how many recommendations you want
k= 10

# Call getRecommendations method
derived_rec = getRecommendations(user, ratings_df, trainingData, model, k)

print ("Movies recommended for:",user)
movie_ids = []
for i in range(len(derived_rec)):
    movie_ids.append(derived_rec[i][1])
#     print (i+1,derived_rec[i][1])

print(movie_ids)
#     movies_df.filter(movies_df.movieId==derived_rec[i][1]).select('title').show()

Movies recommended for: 1
[408, 1449, 694, 838, 511, 1589, 1367, 1131, 963, 1142]


In [141]:
temp_str = "70, 235, 316, 333, 423, 441, 457, 480, 552, 553, 590, 4006, 72378, 538, 36, 300, 344, 380, 531, 371"

In [142]:
bsi_movie_ids = temp_str.split(", ") # not using bsi_movie_ids
print(bsi_movie_ids)
print(movie_ids)

['70', '235', '316', '333', '423', '441', '457', '480', '552', '553', '590', '4006', '72378', '538', '36', '300', '344', '380', '531', '371']
[408, 1449, 694, 838, 511, 1589, 1367, 1131, 963, 1142]


In [143]:
test_data = pd.read_csv('test_ratings.csv')
test_users = test_data.userid.unique()
print(len(test_users))
test_users_list = test_users.tolist()
userKs = [5] # 10,20,25 final run
itemKs = [5,10,25,50,100] # 5, 10, 25, 50 ,100
len(test_users_list)

# 568(test_users) * userK(4 len of list) * itemK(5 len of list) = 11360 test_recommendation_bsi.txt rows

568


568

In [144]:
# def recomendMoviesForAllUsers(test_users, itemKs,testData,trainingData,model):
#     output_file = open("test_recommendations_als_5_10.txt", "a")
#     for u in range(len(test_users)):
#         user = test_users[u]
#         for itemk in itemKs:
#             derived_rec = getRecommendations(user,testData,trainingData,model,itemk)

#             output_file.write(str(user)+", ")
#             output_file.write(str(5)+", ")
#             for i in range(5):
#                 output_file.write(str(derived_rec[i][1])+", ")
#             output_file.write("\n")

#             output_file.write(str(user)+", ")
#             output_file.write(str(itemk)+", ")
#             for i in range(len(derived_rec)):
#                 output_file.write(str(derived_rec[i][1])+", ")
#             output_file.write("\n")
## below snippet final run many itemKs
def recomendMoviesForAllUsers(test_users, itemKs,testData,trainingData,model):
    output_file = open("test_recommendations_als.txt", "a")
    print("total", len(test_users))
    wer = 0
    for u in range(len(test_users)):
      print(wer)
      wer=wer+1
      user = test_users[u]
      for userk in userKs:
        for itemk in itemKs:
            derived_rec = getRecommendations(user,testData,trainingData,model,itemk)
            output_file.write(str(user)+", ")
            # output_file.write(str(userk)+", ")
            output_file.write(str(itemk)+", ")
            for i in range(len(derived_rec)):
                output_file.write(str(derived_rec[i][1])+", ")
            output_file.write("\n")

In [145]:
start = time.time()
derived_rec = recomendMoviesForAllUsers(test_users_list,itemKs,testData,trainingData,model)
end = time.time()
print(end - start)


total 568
0
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274


In [146]:
###  Get movies which is rated (>3.0) by user in test data for ALS

def get_test_rec_movieIds_als(userId):
    user_df = test_data[test_data.userid == userId]
    user_movie_df = user_df[user_df.movieRating > 3.0]
    original_movieIds = user_movie_df.movieid.unique()
    return list(original_movieIds)

In [147]:
# get_test_rec_movieIds
count = 0
input_file = open('test_recommendations_als.txt','r')

userIds = []
actual_movies_watched = []
recomendations = []
common_movies = []
precision = []
recall = []
f1score = []

while (True):
    line = input_file.readline()
    if not line:
        break
    count +=1
    values = line.split(", ")
    userId = values[0]
    no_recomendations = values[1]
    recomendations_for_user = values[2:]

    recomendations_for_user.pop() ## removing last null(\n) value

    actual_movies_watched_by_users = get_test_rec_movieIds_als(int(userId)) # movies in test data
#     print(count)
    common_count = find_common(actual_movies_watched_by_users, recomendations_for_user) # movies in algo's predictions

#     print(userId,userK,no_recomendations)
#     print(common_count)

    userIds.append(userId)
    actual_movies_watched.append(len(actual_movies_watched_by_users))
    recomendations.append(int(no_recomendations))
    common_movies.append(common_count)

    total_count_for_precision = int(no_recomendations)
    total_count_for_recall = len(actual_movies_watched_by_users)

    temp_Precision = getPrecision(total_count_for_precision, common_count)
    precision.append(temp_Precision)
    temp_recall = getRecall(total_count_for_recall, common_count)
    recall.append(temp_recall)
    f1score.append(getf1score(temp_Precision, temp_recall))

print(count)
# test_recommendations_als_5_10.txt row length

2840


In [148]:
result_df_dict = {"userId":userIds,"actual_movies_watched":actual_movies_watched,
                  "recomendations":recomendations,"common_movies":common_movies,
                   "precision": precision, "recall": recall, "f1_score": f1score}

In [149]:
result_als_df = pd.DataFrame(result_df_dict)

In [150]:
result_als_df

Unnamed: 0,userId,actual_movies_watched,recomendations,common_movies,precision,recall,f1_score
0,1,28,5,0,0.00,0.000000,0.000000
1,1,28,10,0,0.00,0.000000,0.000000
2,1,28,25,1,0.04,0.035714,0.037736
3,1,28,50,3,0.06,0.107143,0.076923
4,1,28,100,8,0.08,0.285714,0.125000
...,...,...,...,...,...,...,...
2835,943,16,5,0,0.00,0.000000,0.000000
2836,943,16,10,0,0.00,0.000000,0.000000
2837,943,16,25,1,0.04,0.062500,0.048780
2838,943,16,50,3,0.06,0.187500,0.090909


In [151]:
result_als_df = result_als_df.astype({"userId": int})


In [152]:
result_als_df_5 = result_als_df.loc[result_als_df['recomendations']==5]
result_als_df_10 = result_als_df.loc[result_als_df['recomendations']==10]
result_als_df_25 = result_als_df.loc[result_als_df['recomendations']==25]
result_als_df_50 = result_als_df.loc[result_als_df['recomendations']==50]
result_als_df_100 = result_als_df.loc[result_als_df['recomendations']==100]

In [153]:
result_als_df_5.mean()

userId                   461.948944
actual_movies_watched     15.385563
recomendations             5.000000
common_movies              0.271127
precision                  0.054225
recall                     0.018825
f1_score                   0.026248
dtype: float64

In [154]:
result_als_df_10.mean()

userId                   461.948944
actual_movies_watched     15.385563
recomendations            10.000000
common_movies              0.556338
precision                  0.055634
recall                     0.038719
f1_score                   0.041907
dtype: float64

In [155]:
result_als_df_25.mean()

userId                   461.948944
actual_movies_watched     15.385563
recomendations            25.000000
common_movies              1.359155
precision                  0.054366
recall                     0.094545
f1_score                   0.063486
dtype: float64

In [156]:
result_als_df_50.mean()

userId                   461.948944
actual_movies_watched     15.385563
recomendations            50.000000
common_movies              2.448944
precision                  0.048979
recall                     0.169670
f1_score                   0.070993
dtype: float64

In [157]:
result_als_df_100.mean()

userId                   461.948944
actual_movies_watched     15.385563
recomendations           100.000000
common_movies              4.264085
precision                  0.042641
recall                     0.292110
f1_score                   0.071221
dtype: float64

## **Collaborative filtering (SVD Baseline)**

In [158]:
# !pip install scikit-surprise

In [159]:
# import time
# import pandas as pd
# from surprise import SVD, Dataset, Reader
# from surprise.model_selection import train_test_split
# from surprise import accuracy
# from collections import defaultdict

# # Load the dataset
# ratings_df = pd.read_csv('train_ratings.csv')
# reader = Reader(rating_scale=(1, 5))
# data = Dataset.load_from_df(ratings_df[['userid', 'movieid', 'movieRating']], reader)

# # Split the dataset into training and test sets
# trainset, testset = train_test_split(data, test_size=0.2)

# # Train the SVD model
# svd_model = SVD()
# start_train = time.time()
# svd_model.fit(trainset)
# end_train = time.time()
# print(f"Training time: {end_train - start_train} seconds")

# # Predict ratings for the testset
# start_pred = time.time()
# predictions = svd_model.test(testset)
# end_pred = time.time()
# print(f"Prediction time: {end_pred - start_pred} seconds")

# # Calculate RMSE
# accuracy.rmse(predictions)

# # Function to get top-N recommendations for each user
# def get_top_n(predictions, n=10):
#     top_n = defaultdict(list)
#     for uid, iid, true_r, est, _ in predictions:
#         top_n[uid].append((iid, est))

#     for uid, user_ratings in top_n.items():
#         user_ratings.sort(key=lambda x: x[1], reverse=True)
#         top_n[uid] = user_ratings[:n]

#     return top_n

# top_n = get_top_n(predictions, n=10)

# # Load test data for comparison
# test_data = pd.read_csv('test_ratings.csv')
# test_users = test_data.userid.unique()

# # Function to calculate precision, recall, and F1 score
# def calculate_metrics(top_n, test_data):
#     precision_list = []
#     recall_list = []
#     f1_list = []

#     for user in test_users:
#         # Your actual movie ids here
#         actual_movies = set(test_data[(test_data['userid'] == user) & (test_data['movieRating'] > 3)]['movieid'])
#         # Predicted movie ids
#         predicted_movies = set([iid for (iid, _) in top_n[user]])
#         tp = len(actual_movies & predicted_movies)
#         fp = len(predicted_movies - actual_movies)
#         fn = len(actual_movies - predicted_movies)

#         precision = tp / (tp + fp) if (tp + fp) != 0 else 0
#         recall = tp / (tp + fn) if (tp + fn) != 0 else 0
#         f1_score = 2 * (precision * recall) / (precision + recall) if (precision + recall) != 0 else 0

#         precision_list.append(precision)
#         recall_list.append(recall)
#         f1_list.append(f1_score)

#     return precision_list, recall_list, f1_list

# # Calculate metrics
# precision_list, recall_list, f1_list = calculate_metrics(top_n, test_data)

# # Creating result DataFrame
# result_df_dict = {
#     "userId": list(test_users),
#     "precision": precision_list,
#     "recall": recall_list,
#     "f1_score": f1_list
# }

# result_df = pd.DataFrame(result_df_dict)
# print(result_df)

## **Collaborative filtering (Non-Negative Matrix Factorization Baseline)**

## **Collaborative filtering (Neural Collaborative Filtering Baseline)**



In [160]:
!pip install tensorflow



In [163]:
import os
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Embedding, Flatten, Concatenate, Dense
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.metrics import RootMeanSquaredError
import time

# Load the data (assuming it's already preprocessed)
ratings_df = pd.read_csv("train_ratings.csv")

# Define the neural collaborative filtering model
def create_ncf_model(user_dim, movie_dim):
    user_input = Input(shape=(1,))
    movie_input = Input(shape=(1,))

    user_embedding = Embedding(user_dim+1, 10)(user_input)
    movie_embedding = Embedding(movie_dim+1, 10)(movie_input)

    user_flatten = Flatten()(user_embedding)
    movie_flatten = Flatten()(movie_embedding)

    concat = Concatenate()([user_flatten, movie_flatten])
    dense1 = Dense(64, activation='relu')(concat)
    dense2 = Dense(32, activation='relu')(dense1)
    output = Dense(1)(dense2)

    model = Model(inputs=[user_input, movie_input], outputs=output)
    model.compile(loss='mean_squared_error', optimizer=Adam(), metrics=[RootMeanSquaredError()])

    return model

def getNCFRecommendations(user,ratings_df,model, k): #

    # userDf = ratings_df.filter(ratings_df.userid == user) # for eg user = 1 ...getting all movies(rows) user1 has rated
    # mov = ratings_df.select('movieid').subtract(userDf.select('movieid')) # mov dataframe wont have whatever user1 has rated
    # # measure below two only
    # # predictions = ncf_model.predict([np.array(test_user), np.array(test_movie)]).flatten()
    # pred_rat = model.predict(mov.rdd.map(lambda x: (user, x[0]))).collect()
    # recommendations = sorted(pred_rat, key=lambda x: x[2], reverse=True)[:k]
    # return recommendations


    userDf = ratings_df.filter(ratings_df.userid == user)
    mov = ratings_df.select('movieid').subtract(userDf.select('movieid'))

    # Convert user and movie IDs to numpy arrays
    user_array = np.full(mov.count(), user)
    movie_array = np.array(mov.select('movieid').rdd.flatMap(lambda x: x).collect())

    # Use the NCF model to predict ratings for unrated movies
    predictions = model.predict([user_array, movie_array]).flatten()

    # Combine user, movie, and prediction into a list of tuples
    recommendations = list(zip(user_array, movie_array, predictions))

    # Sort by predicted ratings in descending order and select the top k
    recommendations = sorted(recommendations, key=lambda x: x[2], reverse=True)[:k]

    return recommendations

def recomendNCFMoviesForAllUsers(test_users,itemKs,testData,model):
    output_file = open("test_recommendations_ncf.txt", "a")
    print("total", len(test_users))
    wer = 0
    for u in range(len(test_users)):
      print(wer)
      wer=wer+1
      user = test_users[u]
      for userk in userKs:
        for itemk in itemKs:
            derived_rec = getNCFRecommendations(user,testData,model,itemk)
            output_file.write(str(user)+", ")
            # output_file.write(str(userk)+", ")
            output_file.write(str(itemk)+", ")
            for i in range(len(derived_rec)):
                output_file.write(str(derived_rec[i][1])+", ")
            output_file.write("\n")


user_dim = ratings_df['userid'].nunique()
movie_dim = ratings_df['movieid'].nunique()

# Prepare data for training
train_user = trainingData.select('userid').rdd.flatMap(lambda x: x).collect()
train_movie = trainingData.select('movieid').rdd.flatMap(lambda x: x).collect()
train_rating = trainingData.select('movieRating').rdd.flatMap(lambda x: x).collect()

test_user = testData.select('userid').rdd.flatMap(lambda x: x).collect()
test_movie = testData.select('movieid').rdd.flatMap(lambda x: x).collect()
test_rating = testData.select('movieRating').rdd.flatMap(lambda x: x).collect()


# Create and train the model
ncf_model = create_ncf_model(user_dim, movie_dim)

start_time = time.time()
ncf_model.fit([np.array(train_user), np.array(train_movie)], np.array(train_rating), epochs=10, batch_size=64)
end_time = time.time()
print(f"Time taken for training: {end_time - start_time} seconds")


# # Function to predict all user-movie pairs
# def predict_all(model, user_ids, movie_ids):
#     predictions = model.predict([np.array(user_ids), np.array(movie_ids)]).flatten()
#     return list(zip(user_ids, movie_ids, predictions))

# # Predict all user-movie pairs in the test set
# test_predictions = predict_all(ncf_model, test_user, test_movie)

# Display some test predictions
# print("Sample Test Predictions:")
# for i in range(5):
#     print(test_predictions[i])

# # Time taken for predictions
# start_time = time.time()
# test_predictions = predict_all(ncf_model, test_user, test_movie)
# end_time = time.time()
# print(f"Time taken for predictions: {end_time - start_time} seconds")

# Evaluate the model on the test set

# predictions = ncf_model.predict([np.array(test_user), np.array(test_movie)]).flatten()

# # Create result_df_dict
# result_df_dict_ncf = {
#     "userId": test_user,
#     "actual_movies_watched": len(test_movie),
#     "recomendations": len(test_movie),
#     "common_movies": np.sum(predictions > 3.0),
#     "precision": np.sum(predictions > 3.0) / len(test_movie),
#     "recall": np.sum(predictions > 3.0) / len(test_movie),
#     "f1_score": 2 * (np.sum(predictions > 3.0) / len(test_movie)) / (len(test_movie) + np.sum(predictions > 3.0))
# }

# print(result_df_dict_ncf)


Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Time taken for training: 42.00644040107727 seconds


In [165]:
test_data = pd.read_csv('test_ratings.csv')
test_users = test_data.userid.unique()
print(len(test_users))
test_users_list = test_users.tolist()
userKs = [5] # 10,20,25 final run
itemKs = [5,10,25,50,100] # 5, 10, 25, 50 ,100
len(test_users_list)
start = time.time()
derived_rec = recomendNCFMoviesForAllUsers(test_users_list,itemKs,testData,ncf_model)
end = time.time()
print(end - start)

568
total 568
0
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273


In [166]:
def get_test_rec_movieIds_ncf(userId):
    user_df = test_data[test_data.userid == userId]
    user_movie_df = user_df[user_df.movieRating > 3.0]
    original_movieIds = user_movie_df.movieid.unique()
    return list(original_movieIds)


# get_test_rec_movieIds


count = 0
input_file = open('test_recommendations_ncf.txt','r')

userIds = []
actual_movies_watched = []
recomendations = []
common_movies = []
precision = []
recall = []
f1score = []

while (True):
    line = input_file.readline()
    if not line:
        break
    count +=1
    values = line.split(", ")
    userId = values[0]
    no_recomendations = values[1]
    recomendations_for_user = values[2:]

    recomendations_for_user.pop() ## removing last null(\n) value

    actual_movies_watched_by_users = get_test_rec_movieIds_ncf(int(userId)) # movies in test data
#     print(count)
    common_count = find_common(actual_movies_watched_by_users, recomendations_for_user) # movies in algo's predictions

#     print(userId,userK,no_recomendations)
#     print(common_count)

    userIds.append(userId)
    actual_movies_watched.append(len(actual_movies_watched_by_users))
    recomendations.append(int(no_recomendations))
    common_movies.append(common_count)

    total_count_for_precision = int(no_recomendations)
    total_count_for_recall = len(actual_movies_watched_by_users)

    temp_Precision = getPrecision(total_count_for_precision, common_count)
    precision.append(temp_Precision)
    temp_recall = getRecall(total_count_for_recall, common_count)
    recall.append(temp_recall)
    f1score.append(getf1score(temp_Precision, temp_recall))

print(count)

result_df_dict_ncf = {"userId":userIds,"actual_movies_watched":actual_movies_watched,
                  "recomendations":recomendations,"common_movies":common_movies,
                   "precision": precision, "recall": recall, "f1_score": f1score}

5680


In [167]:
result_ncf_df = pd.DataFrame(result_df_dict_ncf)
result_ncf_df.head(25)
result_ncf_df = result_ncf_df.astype({"userId": int})
result_ncf_df

Unnamed: 0,userId,actual_movies_watched,recomendations,common_movies,precision,recall,f1_score
0,1,28,5,0,0.00,0.000000,0.000000
1,1,28,10,0,0.00,0.000000,0.000000
2,1,28,25,0,0.00,0.000000,0.000000
3,1,28,50,4,0.08,0.142857,0.102564
4,1,28,100,5,0.05,0.178571,0.078125
...,...,...,...,...,...,...,...
5675,943,16,5,0,0.00,0.000000,0.000000
5676,943,16,10,0,0.00,0.000000,0.000000
5677,943,16,25,0,0.00,0.000000,0.000000
5678,943,16,50,1,0.02,0.062500,0.030303


In [168]:
result_ncf_df_5 = result_ncf_df.loc[result_ncf_df['recomendations']==5]
result_ncf_df_10 = result_ncf_df.loc[result_ncf_df['recomendations']==10]
result_ncf_df_25 = result_ncf_df.loc[result_ncf_df['recomendations']==25] # final run
result_ncf_df_50 = result_ncf_df.loc[result_ncf_df['recomendations']==50]
result_ncf_df_100 = result_ncf_df.loc[result_ncf_df['recomendations']==100]

In [169]:
result_ncf_df_5.mean()

userId                   461.948944
actual_movies_watched     15.385563
recomendations             5.000000
common_movies              0.091549
precision                  0.018310
recall                     0.007514
f1_score                   0.009790
dtype: float64

In [170]:
result_ncf_df_10.mean()

userId                   461.948944
actual_movies_watched     15.385563
recomendations            10.000000
common_movies              0.292254
precision                  0.029225
recall                     0.021032
f1_score                   0.022341
dtype: float64

In [172]:
result_ncf_df_25.mean()

userId                   461.948944
actual_movies_watched     15.385563
recomendations            25.000000
common_movies              0.878521
precision                  0.035141
recall                     0.060918
f1_score                   0.040633
dtype: float64

In [173]:
result_ncf_df_50.mean()

userId                   461.948944
actual_movies_watched     15.385563
recomendations            50.000000
common_movies              1.688380
precision                  0.033768
recall                     0.110870
f1_score                   0.048432
dtype: float64

In [174]:
result_ncf_df_100.mean()

userId                   461.948944
actual_movies_watched     15.385563
recomendations           100.000000
common_movies              3.139085
precision                  0.031391
recall                     0.207666
f1_score                   0.052185
dtype: float64