# Jester Recommender System - Item-Based Collaborative Filtering

https://www.kaggle.com/c/jesterdsub2019

*Alex Castrelo, Gerard Marrugat, Eduard Ribas, Pilar Santolaria*

## Imports and functions

In [1]:
%matplotlib inline
import pandas as pd
import numpy as np
import datetime
import os
import sys
import random
from scipy.stats import pearsonr
from scipy.spatial.distance import euclidean

In [2]:
# Resolve the project root: Google Drive when running on Colab, otherwise the
# current working directory.
try:
    from google.colab import drive
except ImportError:
    # google.colab is not installed, so this is not Colab: work locally.
    PATH = os.getcwd() + '/'
else:
    # On Colab: mount Google Drive. A mount failure is intentionally not
    # caught, so it is reported instead of silently falling back to a local
    # path that does not contain the project.
    drive.mount('/content/drive')
    PATH = '/content/drive/My Drive/MDS/jester_jokes/'

In [3]:
# Project sub-folders, all resolved relative to PATH.
PATH_LIB = f"{PATH}library/"
PATH_DATA = f"{PATH}data/"
PATH_BATCHES = f"{PATH}batches/"
PATH_SUBMISSIONS = f"{PATH}submissions/"

# Make the modules stored in the library folder importable.
sys.path.append(PATH_LIB)

# Show what each folder contains; a missing folder makes os.listdir raise.
print("Folders content:")
for label, folder in (
    ("PATH:\t\t", PATH),
    ("PATH LIBRARY:\t", PATH_LIB),
    ("PATH DATA:\t", PATH_DATA),
    ("PATH BATCHES:\t", PATH_BATCHES),
    ("PATH SUBM.:\t", PATH_SUBMISSIONS),
):
    print(label, os.listdir(folder))

Folders content:
PATH:		 ['.git', '.gitignore', '.ipynb_checkpoints', 'batches', 'ContentBased-Gerard.ipynb', 'ContentBased.ipynb', 'data', 'ItemBasedCollaborativeFiltering.ipynb', 'Jester_Gerard.ipynb', 'library', 'LICENSE', 'mess_around_edu.ipynb', 'mess_around_gerard.ipynb', 'mess_around_gerard_draft.ipynb', 'PersonalizedPageRank.ipynb', 'README.md', 'submissions']
PATH LIBRARY:	 ['.ipynb_checkpoints', 'functions.py', '__init__.py', '__pycache__']
PATH DATA:	 ['jokes', 'submision_sample.csv', 'target_user_items.csv', 'training.csv']
PATH BATCHES:	 ['train_CBCF']
PATH SUBM.:	 ['.ipynb_checkpoints', 'submission_2019.05.29_21.11.csv', 'submission_2019.05.29_21.13.csv']


In [4]:
%load_ext autoreload
%autoreload 2
from functions import *

## Train

### Load train data

In [6]:
%%time
# Load the training ratings with load_df (presumably provided by the
# star-imported `functions` module - verify).
# NOTE(review): the path is the relative "data/training.csv" rather than being
# built from PATH_DATA; unless load_df resolves it against PATH itself, this
# only works when the working directory is the project root - confirm.
data_train = load_df("data/" + "training.csv")

Wall time: 536 ms


In [7]:
data_train.head()

Unnamed: 0,user_id,item_id,rating
0,13291,98,-0.670408
1,19559,8,1.436404
2,32928,50,1.711739
3,34459,29,-10.0
4,68339,19,4.27797


In [8]:
# Number of items
#N = 10
# Number of nearest neighbors
#NN = N

In [9]:
def pearson_sim(df, item1, item2, min_common_users=1):
    """Pearson correlation between two items over the users who rated both.

    Parameters
    ----------
    df : pandas.DataFrame
        Ratings table with 'user_id', 'item_id' and 'rating' columns.
    item1, item2
        Values of 'item_id' identifying the two items to compare.
    min_common_users : int, default 1
        Minimum number of users that must have rated both items; with fewer
        shared users the similarity is reported as 0.

    Returns
    -------
    float or int
        The Pearson correlation coefficient of the paired ratings, or 0 when
        there are too few shared users or the correlation is undefined.
    """
    ratings_item1 = df[df['item_id'] == item1]
    ratings_item2 = df[df['item_id'] == item2]

    # One row per shared user, with columns 'rating_x' (item1) and 'rating_y' (item2).
    users_common = pd.merge(ratings_item1, ratings_item2, on='user_id')

    # A correlation needs at least two paired observations: pearsonr rejects
    # shorter input, so a single shared user is treated as "no evidence".
    if len(users_common) < max(min_common_users, 2):
        return 0

    corr = pearsonr(users_common['rating_x'], users_common['rating_y'])[0]
    # pearsonr yields NaN when one of the rating vectors is constant.
    if np.isnan(corr):
        return 0
    return corr

def cos_sim(df, item1, item2, min_common_users=1):
    """Cosine similarity between two items over the users who rated both.

    `df` must provide 'user_id', 'item_id' and 'rating' columns. Returns 0
    when the items share no users, share fewer than `min_common_users` users,
    or when the similarity evaluates to NaN.
    """
    is_item1 = df['item_id'] == item1
    is_item2 = df['item_id'] == item2

    # One row per shared user: 'rating_x' belongs to item1, 'rating_y' to item2.
    shared = pd.merge(df[is_item1], df[is_item2], on='user_id')

    n_shared = len(shared)
    if n_shared == 0 or n_shared < min_common_users:
        return 0

    x = shared['rating_x']
    y = shared['rating_y']
    dot_xy = x.dot(y)
    norm_product = np.sqrt(x.dot(x) * y.dot(y))
    similarity = dot_xy / norm_product

    return 0 if np.isnan(similarity) else similarity


#def adjcos_sim(df_, item1, item2, min_common_users=1):
#    df = df_.copy()
#    user_means = df.groupby(['user_id'], axis=0)['rating'].transform('mean')
#    df['rating'] = df['rating'] - user_means
#    
#    # GET USERS OF ITEM1
#    users_item1 = df[df['item_id'] == item1]
#    
#    # GET USERS OF ITEM2
#    users_item2 = df[df['item_id'] == item2]
#    
#    # FIND SHARED USERS
#    u_common = pd.merge(users_item1, users_item2, on = 'user_id')
#    if len(u_common)==0:
#        return 0    
#    if(len(u_common)<min_common_users):
#        return 0 
#    
#    num = u_common['rating_x'].dot(u_common['rating_y'])
#    den = np.sqrt(u_common['rating_x'].dot(u_common['rating_x'])*u_common['rating_y'].dot(u_common['rating_y']))
#    adjcos = num/den
#    if(np.isnan(adjcos)):
#        return 0
#    return adjcos



#def compute_rmse(y_pred, y_true):
#    """ Compute Root Mean Squared Error. """
#    return np.sqrt(np.mean(np.power(y_pred - y_true, 2)))
#
#def evaluate(estimate_f,data_train,data_test):
#    """ RMSE-based predictive performance """
#    ids_to_estimate = zip(data_test.user_id, data_test.item_id)
#    estimated = np.array([estimate_f(u,i) if u in data_train.user_id else 0 for (u,i) in ids_to_estimate ])
#    real = data_test.rating.values
#    return compute_rmse(estimated, real)
#
#def evaluate_k(estimate_f,data_train,data_test,k):
#    """ RMSE-based predictive performance. Takes the number k of nearest neighbors as input """
#    ids_to_estimate = zip(data_test.user_id, data_test.item_id)
#    estimated = np.array([estimate_f(u,i,k) if u in data_train.user_id else 0 for (u,i) in ids_to_estimate ])
#    real = data_test.rating.values
#    return compute_rmse(estimated, real)

In [10]:
class CollaborativeFiltering:
    """Item-based collaborative filtering recommender.

    A rating is predicted as the similarity-weighted average of the ratings
    the user gave to other items. Item-item similarities are computed by
    `train()` with the configured similarity function and stored in a nested
    dict: `sim[item_a][item_b]`.
    """

    def __init__(self,df, similarity=pearson_sim):
        """Store the ratings table and the similarity function.

        df         -- DataFrame with 'user_id', 'item_id' and 'rating' columns.
        similarity -- callable `(df, item1, item2) -> number`.
        """
        self.sim_method = similarity
        self.df = df
        self.sim = {}

    def get_sim(self):
        """ Return similarity for debugging reasons """
        return self.sim

    def train(self):
        """Fill the symmetric item-item similarity table.

        Every unordered pair of distinct items found in `self.df` is scored
        once. Negative similarities are stored as 0.
        """
        all_items = set(self.df['item_id'])
        for item1 in all_items:
            self.sim.setdefault(item1, {})
            # Only users who rated item1 can be shared with another item, so
            # the similarity function is handed just their ratings. The table
            # used is the instance's own, not a notebook-level global.
            users_item1 = self.df.loc[self.df['item_id'] == item1, ['user_id']]
            data_reduced = pd.merge(self.df, users_item1, on='user_id')
            for item2 in all_items:
                if item1 == item2:
                    continue
                self.sim.setdefault(item2, {})
                # The pair was already scored from item2's side.
                if item1 in self.sim[item2]:
                    continue
                sim = self.sim_method(data_reduced, item1, item2)
                if sim < 0:
                    sim = 0
                self.sim[item1][item2] = sim
                self.sim[item2][item1] = sim

    #def get_most_similar_items(self, item_id, k):
    #    sorted_sim_of_item = sorted(self.sim[item_id].items(), key=operator.itemgetter(1), reverse = True)
    #    most_similar_items = [sorted_sim_of_item[i][0] for i in range(k-1)]
    #    return(most_similar_items)
    #        
    #def predict_k(self, user_id, item_id, k):
    #    
    #    # Extract k most similar items
    #    most_similar_items = set(self.get_most_similar_items(item_id, k))
    #    
    #    totals={}
    #    user_items=self.df[self.df['user_id'] == user_id]
    #    rating_num=0.0
    #    rating_den=0.0
    #    all_items=set(user_items['item_id'])
    #    
    #    # Intersection of k most similar items with items that have been commonly rated
    #    intersect_items = most_similar_items & all_items
    #    
    #    for other in intersect_items:
    #        if item_id==other: continue 
    #        rating_num += self.sim[item_id][other] * float(user_items[user_items['item_id']==other]['rating'])
    #        rating_den += self.sim[item_id][other]
    #
    #    if rating_den==0: 
    #        if self.df.rating[self.df['user_id']==user_id].mean()>0:
    #            # return the mean user rating if there is no similar for the computation
    #            return self.df.rating[self.df['user_id']==user_id].mean()
    #        else:
    #            # else return mean item rating 
    #            return self.df.rating[self.df['item_id']==item_id].mean()
    #    
    #    return rating_num/rating_den


    def predict(self, user_id, item_id):
        """Predict the rating `user_id` would give to `item_id`.

        Returns the similarity-weighted average of the user's ratings of the
        other items. When the total weight is 0 (no rated item is similar,
        the user has no ratings, or the item is missing from the similarity
        table) it returns the user's mean rating if that mean is positive,
        and the item's mean rating otherwise.
        """
        user_items = self.df[self.df['user_id'] == user_id]
        # One rating per item; repeated ratings of the same item are averaged
        # instead of breaking the scalar conversion.
        user_ratings = user_items.groupby('item_id')['rating'].mean()
        # An item missing from the table simply has no neighbours.
        item_sims = self.sim.get(item_id, {})

        rating_num = 0.0
        rating_den = 0.0
        for other, rating in user_ratings.items():
            if other == item_id:
                continue
            weight = item_sims.get(other, 0)
            rating_num += weight * rating
            rating_den += weight

        if rating_den == 0:
            user_mean = user_items['rating'].mean()
            # NOTE(review): a user whose mean rating is <= 0 (or NaN, i.e. an
            # unknown user) falls through to the item mean; confirm that the
            # "> 0" test is intended rather than a missing-value check.
            if user_mean > 0:
                return user_mean
            return self.df.rating[self.df['item_id'] == item_id].mean()

        return rating_num / rating_den

In [11]:
#def assign_to_set(df):
#    sampled_ids = np.random.choice(df.index,
#                                   size=np.int64(np.ceil(df.index.size * 0.01)),
#                                   replace=False)
#    df.loc[sampled_ids, 'for_testing'] = True
#    return df

In [12]:
#%%time
#### Entire data set
#data = pd.read_csv(PATH_DATA+'training.csv', sep=',')
#data['for_testing'] = False
#grouped = data.groupby('user_id', group_keys=False).apply(assign_to_set)
#data_train = data[grouped.for_testing == False]
#data_test = data[grouped.for_testing == True]
#
#print("Training data_set has "+ str(data_train.shape[0]) +" ratings")
#print("Test data set has "+ str(data_test.shape[0]) +" ratings")
#print("The dataset has ", data.item_id.nunique(), " items")
#
#data.head()

In [13]:
#def get_user_mean(user_id, df_train):
#    user = df_train[df_train["user_id"] == user_id] 
#    mean = user.rating.mean()
#    return(mean)
#    
#    
#def construct_user_mean_dict(df_train):
#    user_ids = df_train.user_id.unique()
#    
#    counter = 0
#    user_means = {}
#    for user in user_ids:
#        counter += 1
#        user_means[user] = get_user_mean(user, df_train)
#        if float(counter % 1000) == 0.0:
#            print_time(counter)
#    return(user_means)
#
#def predict_mean_ratings_on_test_set(df_train):
#    user_means = construct_user_mean_dict(df_train)
#
#    mean_ratings = []
#
#    for user in data_test.user_id:
#        try:
#            mean_ratings.append(user_means[user])
#        except:
#            mean_ratings.append(0.0)
#    return(mean_ratings)
#        
#compute_rmse(predict_mean_ratings_on_test_set(data_train), data_test.rating)

In [14]:
%%time
# Build the item-based recommender on the training ratings with Pearson
# similarity, then compute the item-item similarity table.
rec = CollaborativeFiltering(data_train, similarity=pearson_sim)
rec.train()

Wall time: 46.2 s


In [15]:
rec.predict(19559,8)

5.16559996502665

In [16]:
rec.predict(1,8)

2.740198606176384

## Predict test set

### Load Test data

In [17]:
# Load the (user, item) pairs whose rating has to be predicted.
# NOTE(review): relative path, not built from PATH_DATA - confirm how load_df
# resolves it.
df_test = load_df("data/" + "target_user_items.csv")
# Rename the unnamed first column to "id" (presumably the row identifier
# expected in the submission - verify). Note that index=str additionally maps
# every index label through str(), so the index labels become strings.
df_test.rename(index=str, columns={"Unnamed: 0": "id"}, inplace=True)
print(df_test.shape)
df_test.head(10)

(50000, 3)


Unnamed: 0,id,user_id,item_id
0,0,52841,68
1,1,51916,84
2,2,46765,65
3,3,9882,35
4,4,22323,24
5,5,40625,87
6,6,6051,34
7,7,40625,52
8,8,23789,38
9,9,19286,83


In [18]:
def predict_IBCF(row):
    """Predict the rating for one test row using the notebook-level `rec` model.

    `row` must provide "id", "user_id" and "item_id". A progress message is
    emitted through print_time whenever "id" is a multiple of 500.
    """
    row_id = row["id"]
    at_checkpoint = row_id % 500 == 0
    if at_checkpoint:
        print_time("\tpredicting row " + str(row_id))
    user, item = row["user_id"], row["item_id"]
    return rec.predict(user, item)

#### Batch predict

In [19]:
# Number of test rows predicted (and saved) per batch.
batch_size = 10000
# Exclusive end offset of every full batch: batch_size, 2*batch_size, ...
# Rows past the last full multiple of batch_size get no offset here.
batches = np.arange(batch_size, len(df_test) + 1, batch_size)
print(batches)

[10000 20000 30000 40000 50000]


In [20]:
BATCH_FOLDER = PATH_BATCHES + "train_IBCF_test/"
create_directory(BATCH_FOLDER)

Directory C:\Users\Gerard Marrugat\Documents\Máster\Foundations Of Data Science\Recommenders\jesterRecSys\jester_jokes/batches/train_IBCF_test/ was created!


##### Predict for each batch and save

In [21]:
# Predict the test set one batch at a time and save each batch to its own CSV,
# so an interrupted run can resume: batches whose file already exists are skipped.
for batch in batches:
    filename = "batch_" + str(batch) + ".csv"

    if filename in os.listdir(BATCH_FOLDER):
        print_time("batch " + str(batch) + " already exists!")
        continue

    print_time("computing batch " + str(batch) + ": " + filename)
    # `batch` is the exclusive end offset of the slice.
    df_test_batch = df_test.iloc[batch - batch_size:batch].copy()
    df_test_batch['rating'] = df_test_batch.apply(predict_IBCF, axis=1)
    df_test_batch.to_csv(BATCH_FOLDER + filename, index=False)

print("DONE")

 2019-05-29 22:58:47.35 computing batch 10000: batch_10000.csv
 2019-05-29 22:58:47.36 	predicting row 0
 2019-05-29 22:58:53.42 	predicting row 500
 2019-05-29 22:58:59.21 	predicting row 1000
 2019-05-29 22:59:05.62 	predicting row 1500
 2019-05-29 22:59:11.76 	predicting row 2000
 2019-05-29 22:59:17.79 	predicting row 2500
 2019-05-29 22:59:24.66 	predicting row 3000
 2019-05-29 22:59:30.76 	predicting row 3500
 2019-05-29 22:59:36.35 	predicting row 4000
 2019-05-29 22:59:42.13 	predicting row 4500
 2019-05-29 22:59:48.00 	predicting row 5000
 2019-05-29 22:59:53.39 	predicting row 5500
 2019-05-29 22:59:59.32 	predicting row 6000
 2019-05-29 23:00:05.17 	predicting row 6500
 2019-05-29 23:00:10.89 	predicting row 7000
 2019-05-29 23:00:16.62 	predicting row 7500
 2019-05-29 23:00:22.33 	predicting row 8000
 2019-05-29 23:00:28.53 	predicting row 8500
 2019-05-29 23:00:34.32 	predicting row 9000
 2019-05-29 23:00:40.17 	predicting row 9500
 2019-05-29 23:00:46.23 computing batch 2

##### Load prediction files in a single DF

In [24]:
# Reassemble the full prediction table from the per-batch CSV files.
batch_frames = []
for batch in batches:
    filename = "batch_" + str(batch) + ".csv"
    batch_path = BATCH_FOLDER + filename
    if os.path.isfile(batch_path):
        print_time("reading batch file " + filename)
        # Read from BATCH_FOLDER, the same location that was checked, instead
        # of a path relative to the current working directory.
        batch_frames.append(pd.read_csv(batch_path))
    else:
        print_time("batch file " + filename + " doesn't exist!")

# Concatenate once at the end (DataFrame.append is gone in pandas 2.0 and was
# quadratic in a loop). With no batch file found, keep an empty table that
# still has the expected columns.
if batch_frames:
    df_test = pd.concat(batch_frames)
else:
    df_test = pd.DataFrame(columns=['id', 'user_id', 'item_id', 'rating'])

 2019-05-29 23:11:28.01 reading batch file batch_10000.csv
 2019-05-29 23:11:28.03 reading batch file batch_20000.csv
 2019-05-29 23:11:28.05 reading batch file batch_30000.csv
 2019-05-29 23:11:28.06 reading batch file batch_40000.csv
 2019-05-29 23:11:28.08 reading batch file batch_50000.csv


In [25]:
print(df_test.shape)
df_test.head(10)

(50000, 4)


Unnamed: 0,id,user_id,item_id,rating
0,0,52841,68,4.103549
1,1,51916,84,-5.474519
2,2,46765,65,3.116544
3,3,9882,35,0.324789
4,4,22323,24,2.764165
5,5,40625,87,-2.401066
6,6,6051,34,6.945594
7,7,40625,52,-2.745516
8,8,23789,38,-0.402887
9,9,19286,83,0.774833


## Submission

In [26]:
subm_file = save_submission_file(df_test, PATH_SUBMISSIONS)

Dataframe saved: submission_2019.05.29_23.11.csv



https://www.kaggle.com/c/jesterdsub2019/submissions

In [None]:
def predict_ratings_on_test_set_item_item_class(test, recommender=None):
    """Predict a rating for every (user_id, item_id) pair of `test`.

    Parameters
    ----------
    test
        Object exposing aligned `user_id` and `item_id` iterables
        (e.g. a DataFrame with those columns).
    recommender : optional
        Object with a `predict(user_id, item_id)` method. When omitted, the
        notebook-level `rec` model is used.

    Returns
    -------
    list
        One prediction per pair, in input order.
    """
    if recommender is None:
        # The name used here before (`reco_item_adjcos`) is never defined in
        # this notebook; fall back to the recommender that is actually trained.
        recommender = rec

    rating_pred = []
    pairs = zip(test.user_id, test.item_id)
    for counter, (user_id, item_id) in enumerate(pairs, start=1):
        rating_pred.append(recommender.predict(user_id, item_id))
        # Progress message every 1000 predictions.
        if counter % 1000 == 0:
            print_time(counter)
    return rating_pred

def correct_for_NaNs(rating_pred):
    """Return the predictions as a pandas Series with missing values set to 0.

    Prints how many missing values were found. The input is read from the
    `rating_pred` argument and is not modified.
    """
    rating_pred_ = pd.Series(rating_pred)
    print(rating_pred_.isnull().sum(), "NaNs encountered!")

    return rating_pred_.fillna(0)

In [None]:
# NOTE(review): in the cells visible here, `data_test` is only assigned inside
# commented-out code and `test_ratings` is never assigned; `compute_rmse` is
# likewise only defined in commented-out code (unless the `functions` module
# provides it). This cell looks like a leftover that fails on a fresh kernel -
# confirm or remove.
compute_rmse(data_test.rating, test_ratings)

In [None]:
# Load the (user, item) pairs to predict straight from the data folder.
target_user_items = pd.read_csv(PATH_DATA+'target_user_items.csv')
# NOTE: the two bare expressions below are not the last statement of the cell,
# so the notebook does not display their values.
target_user_items.shape

target_user_items.head()

# user_id column as a NumPy array.
target_users = target_user_items.user_id.values


In [None]:
submission_ratings = predict_ratings_on_test_set_item_item_class(target_user_items)

In [None]:
submission = pd.DataFrame({
    'id': np.arange(target_user_items.shape[0]),
    'rating': submission_ratings
})

In [None]:
submission.to_csv(PATH_SUBMISSIONS+'submission.csv', index = False)