# Jester Recommendations System - Content Based

https://www.kaggle.com/c/jesterdsub2019

*Alex Castrelo, Gerard Marrugat, Eduard Ribas, Pilar Santolaria*

## Imports and functions

In [1]:
%matplotlib inline
import pandas as pd
import numpy as np
import datetime
import os
import sys
import random
from scipy.stats import pearsonr
from scipy.spatial.distance import euclidean
from sklearn.feature_extraction.text import TfidfVectorizer

In [2]:
# Resolve the project root: Google Drive when running on Colab, otherwise the
# current working directory.
try:
    from google.colab import drive
except ImportError:
    # google.colab is not importable, so we are not on Colab: work from the
    # local checkout.
    PATH = os.getcwd() + '/'
else:
    # On Colab: mount Google Drive. A mount failure is deliberately allowed to
    # propagate instead of silently falling back to a folder without the data.
    drive.mount('/content/drive')
    PATH = '/content/drive/My Drive/MDS/jester_jokes/'

In [3]:
# Sub-folders of the project root used throughout the notebook.
PATH_LIB = PATH + "library/"
PATH_DATA = PATH + "data/"
PATH_BATCHES = PATH + 'batches/'
PATH_SUBMISSIONS = PATH + "submissions/"

# Make the helper module in the library folder importable.
sys.path.append(PATH_LIB)

# Show what each folder contains as a quick sanity check of the paths.
print("Folders content:")
for label, folder in (
    ("PATH:\t\t", PATH),
    ("PATH LIBRARY:\t", PATH_LIB),
    ("PATH DATA:\t", PATH_DATA),
    ("PATH BATCHES:\t", PATH_BATCHES),
    ("PATH SUBM.:\t", PATH_SUBMISSIONS),
):
    print(label, os.listdir(folder))

Folders content:
PATH:		 ['.DS_Store', '.git', '.gitignore', '.ipynb_checkpoints', 'batches', 'ContentBased.ipynb', 'data', 'ItemBasedCollaborativeFiltering.ipynb', 'library', 'LICENSE', 'mess_around_edu.ipynb', 'mess_around_gerard.ipynb', 'notebooks', 'PersonalizedPageRank.ipynb', 'README.md', 'submissions']
PATH LIBRARY:	 ['.DS_Store', '__init__.py', '__pycache__', 'functions.py']
PATH DATA:	 ['.DS_Store', 'jokes', 'submision_sample.csv', 'target_user_items.csv', 'training.csv']
PATH BATCHES:	 ['.DS_Store', 'train_1000', 'train_100000', 'train_500000', 'train_CBCF', 'train_full', 'train_IBCF', 'train_IBCF_pers']
PATH SUBM.:	 ['.DS_Store', 'submission_2019.03.28_15.53.csv', 'submission_2019.05.22_20.51.csv', 'submission_2019.05.23_20.00.csv', 'submission_2019.05.23_20.05.csv', 'submission_2019.05.24_16.44.csv', 'submission_2019.05.24_21.06.csv', 'submission_2019.05.25_16.14.csv', 'submission_2019.05.25_21.31.csv', 'submission_2019.05.27_19.58.csv', 'submission_2019.05.29_18.28.csv', '

In [4]:
%load_ext autoreload
%autoreload 2
from functions import *

## Train

### Load train data

In [5]:
%%time
# Load the training ratings with the project helper `load_df` (imported from
# library/functions.py). The resulting frame has the columns user_id, item_id
# and rating (see the head() output below).
# NOTE(review): item_shift presumably re-bases the item ids — confirm in functions.py.
data_train = load_df(
    PATH_DATA + "training.csv",
    item_shift=True
)

CPU times: user 549 ms, sys: 79.9 ms, total: 629 ms
Wall time: 637 ms


In [6]:
data_train.head()

Unnamed: 0,user_id,item_id,rating
0,13291,97,-0.670408
1,19559,7,1.436404
2,32928,49,1.711739
3,34459,28,-10.0
4,68339,18,4.27797


In [7]:
# from:
# http://blog.untrod.com/2016/06/simple-similar-products-recommendation-engine-in-python.html

import html2text
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import linear_kernel

def get_jokes_test(jokes_dir=None, n_jokes=100):
    """Read the joke HTML files and return their plain text in a DataFrame.

    Files are expected to be named ``init<i>.html`` for ``i`` in
    ``1..n_jokes``. Each file is converted to text with
    ``html2text.html2text``.

    Parameters
    ----------
    jokes_dir : str, optional
        Folder holding the joke files. Defaults to ``PATH_DATA + "jokes/"``
        so the function also works when the working directory is not the
        project root (e.g. on Colab).
    n_jokes : int, optional
        Number of joke files to read (default 100).

    Returns
    -------
    pandas.DataFrame
        Columns ``joke`` (text) and ``id`` (joke number), indexed ``1..n_jokes``
        in file order. ``id`` holds plain integers.
    """
    if jokes_dir is None:
        jokes_dir = PATH_DATA + "jokes/"

    # Collect the rows in a list and build the frame once: growing a DataFrame
    # row by row is quadratic and DataFrame.append no longer exists in pandas 2.
    records = []
    for i in range(1, n_jokes + 1):
        with open(os.path.join(jokes_dir, "init" + str(i) + ".html")) as f:
            records.append({'joke': html2text.html2text(f.read()), 'id': i})

    # Index 1..n_jokes matches what the previous implementation produced after
    # dropping its placeholder row 0.
    return pd.DataFrame(records, columns=['joke', 'id'], index=range(1, n_jokes + 1))

jokes = get_jokes_test()

In [8]:
def similarity_matrix_content_base(jokes_df, min_df = 0.05, ngram_range=(1, 3)):
    """Compute pairwise text similarity between the jokes.

    The ``joke`` column of ``jokes_df`` is vectorized with a word-level TF-IDF
    model (English stop words removed, n-gram sizes given by ``ngram_range``,
    terms filtered by ``min_df``), and the dot product between every pair of
    TF-IDF vectors is returned as a square matrix in the row order of
    ``jokes_df``.
    """
    vectorizer = TfidfVectorizer(
        stop_words='english',
        analyzer='word',
        min_df=min_df,
        ngram_range=ngram_range,
    )

    # Project the raw joke texts into TF-IDF space.
    joke_vectors = vectorizer.fit_transform(jokes_df['joke'])

    # Pairwise dot products between all joke vectors.
    return linear_kernel(joke_vectors, joke_vectors)

# compute similarity between jokes
S_CB = similarity_matrix_content_base(jokes)

In [9]:
S_CB[0]

array([1.        , 0.        , 0.        , 0.4799554 , 0.        ,
       0.06234866, 0.        , 0.        , 0.11783031, 0.17741477,
       0.        , 0.35705838, 0.        , 0.04797216, 0.        ,
       0.        , 0.        , 0.13343466, 0.        , 0.31339494,
       0.        , 0.16514902, 0.        , 0.        , 0.        ,
       0.        , 0.28184029, 0.07069321, 0.17914217, 0.        ,
       0.20235803, 0.3653596 , 0.73379049, 0.        , 0.15905642,
       0.16017581, 0.4632751 , 0.        , 0.42725184, 0.        ,
       0.        , 0.21156796, 0.17799653, 0.09080381, 0.        ,
       0.28449808, 0.        , 0.        , 0.        , 0.07125142,
       0.        , 0.        , 0.15624135, 0.        , 0.        ,
       0.61604962, 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.74372572, 0.18619918, 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.     

In [10]:
#def content_sim(df, item1, item2):  
#    return S_CB[item1-1,item2-1]

In [12]:
class CBCollaborativeFiltering:
    """Content-based rating predictor.

    The rating of a (user, item) pair is estimated as the similarity-weighted
    average of the ratings the user gave to *other* items, with the weights
    read from an item-item similarity matrix indexed by ``item_id - 1``.
    """

    def __init__(self, df, similarity=None):
        """Store the ratings table and, optionally, the similarity matrix.

        Parameters
        ----------
        df : pandas.DataFrame
            Ratings with the columns ``user_id``, ``item_id`` and ``rating``.
        similarity : 2-D array-like, optional
            Item-item similarity matrix. When omitted, the module-level
            ``S_CB`` is looked up at prediction time, which is what the class
            did before this argument existed.
        """
        self.df = df
        self.similarity = similarity

    def predict(self, user_id, item_id):
        """Predict the rating ``user_id`` would give to ``item_id``.

        Every rating of the user on an item different from ``item_id``
        contributes ``similarity[item_id - 1, other - 1] * rating`` to the
        numerator and the similarity alone to the denominator.

        If the denominator is zero (the user has no other ratings, or none of
        the rated items is similar to ``item_id``) the method falls back to
        the user's mean rating when that mean is positive, and otherwise to
        the mean rating of ``item_id`` over all users. The fallback is NaN
        when the item has no ratings in ``df``.
        """
        similarity = S_CB if self.similarity is None else self.similarity

        user_items = self.df[self.df['user_id'] == user_id]
        others = user_items[user_items['item_id'] != item_id]

        rating_num = 0.0
        rating_den = 0.0
        # Only touch the similarity matrix when there is something to weight,
        # so users without other ratings go straight to the fallback.
        if len(others) > 0:
            other_idx = others['item_id'].values.astype(int) - 1
            weights = np.asarray(similarity)[int(item_id) - 1, other_idx]
            rating_num = np.dot(weights, others['rating'].values.astype(float))
            rating_den = weights.sum()

        if rating_den == 0:
            user_mean = user_items['rating'].mean()
            # A NaN mean (user without ratings) compares False and therefore
            # also ends up in the item-mean branch.
            if user_mean > 0:
                return user_mean
            return self.df.loc[self.df['item_id'] == item_id, 'rating'].mean()

        return rating_num / rating_den

In [14]:
%%time
rec = CBCollaborativeFiltering(data_train)#, similarity=content_sim)
#rec.train()

CPU times: user 6 µs, sys: 1 µs, total: 7 µs
Wall time: 9.06 µs


In [15]:
rec.predict(19559,8)

4.446564545642433

In [16]:
rec.predict(1,8)

-1.0817106889157686

## Predict test set

### Load Test data

In [17]:
# Load the (user, item) pairs to predict. The CSV's unnamed first column is
# renamed to "id", and the row labels are converted to strings (index=str).
df_test = load_df(PATH_DATA + "target_user_items.csv")
df_test = df_test.rename(index=str, columns={"Unnamed: 0": "id"})
print(df_test.shape)
df_test.head(10)

(50000, 3)


Unnamed: 0,id,user_id,item_id
0,0,52841,68
1,1,51916,84
2,2,46765,65
3,3,9882,35
4,4,22323,24
5,5,40625,87
6,6,6051,34
7,7,40625,52
8,8,23789,38
9,9,19286,83


In [18]:
def predict_CBCF(row):
    """Predict the rating for one test row with the global recommender ``rec``.

    ``row`` must provide ``id``, ``user_id`` and ``item_id``. A progress line
    is logged through ``print_time`` whenever ``id`` is a multiple of 500.
    """
    row_id = row["id"]
    if not row_id % 500:
        print_time("\tpredicting row " + str(row_id))
    return rec.predict(row["user_id"], row["item_id"])

#### Batch predict

In [19]:
# Number of test rows predicted and saved per batch file.
batch_size = 10000
# Exclusive end row of every batch: batch_size, 2*batch_size, ... up to the
# number of test rows.
# NOTE(review): if the row count is not a multiple of batch_size, the trailing
# rows fall into no batch and are never predicted.
batches = np.arange(batch_size,df_test.shape[0]+1,batch_size)
print(batches)

[10000 20000 30000 40000 50000]


In [20]:
BATCH_FOLDER = PATH_BATCHES + "train_CBCF/"
create_directory(BATCH_FOLDER)

Directory /Users/edu/Documents/ACADÈMIC/2018-2019 - Master on Foundations of Data Science/S2/Recommenders/jester_jokes/batches/train_CBCF/ already exists!


##### Predict for each batch and save

In [21]:
# Predict each batch of test rows and store it as its own CSV, so an
# interrupted run can resume: batches whose file already exists are skipped.
for batch in batches:
    filename = "batch_" + str(batch) + ".csv"

    if filename in os.listdir(BATCH_FOLDER):
        print_time("batch " + str(batch) + " already exists!")
        continue

    print_time("computing batch " + str(batch) + ": " + filename)
    # `batch` is the exclusive end row; the batch starts batch_size rows earlier.
    df_test_batch = df_test.iloc[batch-batch_size:batch].copy()
    df_test_batch['rating'] = df_test_batch.apply(predict_CBCF, axis=1)
    df_test_batch.to_csv(BATCH_FOLDER + filename, index=False)
print("DONE")

 2019-05-29 19:33:41.63 computing batch 10000: batch_10000.csv
 2019-05-29 19:33:41.64 	predicting row 0
 2019-05-29 19:33:48.13 	predicting row 500
 2019-05-29 19:33:54.37 	predicting row 1000
 2019-05-29 19:34:00.80 	predicting row 1500
 2019-05-29 19:34:07.00 	predicting row 2000
 2019-05-29 19:34:13.53 	predicting row 2500
 2019-05-29 19:34:20.16 	predicting row 3000
 2019-05-29 19:34:26.34 	predicting row 3500
 2019-05-29 19:34:32.52 	predicting row 4000
 2019-05-29 19:34:40.33 	predicting row 4500
 2019-05-29 19:34:48.62 	predicting row 5000
 2019-05-29 19:34:55.68 	predicting row 5500
 2019-05-29 19:35:05.13 	predicting row 6000
 2019-05-29 19:35:12.74 	predicting row 6500
 2019-05-29 19:35:20.58 	predicting row 7000
 2019-05-29 19:35:28.15 	predicting row 7500
 2019-05-29 19:35:34.49 	predicting row 8000
 2019-05-29 19:35:40.80 	predicting row 8500
 2019-05-29 19:35:47.25 	predicting row 9000
 2019-05-29 19:35:53.99 	predicting row 9500
 2019-05-29 19:36:04.24 computing batch 2

##### Load prediction files in a single DF

In [22]:
# Gather the per-batch prediction files into a single frame. The frames are
# collected in a list and concatenated once: DataFrame.append inside a loop
# copies the accumulated data every iteration and was removed in pandas 2.0.
batch_frames = []
for batch in batches:
    filename = "batch_" + str(batch) + ".csv"
    if filename in os.listdir(BATCH_FOLDER):
        print_time("reading batch file " + filename)
        df_test_batch = pd.read_csv(BATCH_FOLDER + filename)
        batch_frames.append(df_test_batch)
    else:
        print_time("batch file " + filename + " doesn't exist!")

if batch_frames:
    # Row labels are kept as read from each file (no ignore_index), matching
    # the previous append-based result.
    df_test = pd.concat(batch_frames)
else:
    # No batch file found: keep the empty frame with the expected columns.
    df_test = pd.DataFrame(columns = ['id','user_id','item_id','rating'])

 2019-05-29 19:45:32.08 reading batch file batch_10000.csv
 2019-05-29 19:45:32.09 reading batch file batch_20000.csv
 2019-05-29 19:45:32.12 reading batch file batch_30000.csv
 2019-05-29 19:45:32.14 reading batch file batch_40000.csv
 2019-05-29 19:45:32.16 reading batch file batch_50000.csv


In [23]:
print(df_test.shape)
df_test.head(10)

(50000, 4)


Unnamed: 0,id,user_id,item_id,rating
0,0,52841,68,2.19028
1,1,51916,84,-8.904854
2,2,46765,65,2.001887
3,3,9882,35,-0.644786
4,4,22323,24,3.454618
5,5,40625,87,-4.272525
6,6,6051,34,6.406365
7,7,40625,52,2.553668
8,8,23789,38,-2.634561
9,9,19286,83,-2.203059


In [29]:
# Replace every missing value (NaN) in df_test with 0, in place, before the
# submission file is written.
# NOTE(review): NaN ratings presumably come from predict()'s mean fallbacks — confirm.
df_test.fillna(0, inplace=True)

In [30]:
df_test[df_test.id==1037]

Unnamed: 0,id,user_id,item_id,rating
1037,1037,29807,100,0.0


## Submission

In [31]:
subm_file = save_submission_file(df_test, PATH_SUBMISSIONS)

Dataframe saved: submission_2019.05.29_19.50.csv



https://www.kaggle.com/c/jesterdsub2019/submissions

In [None]:
def predict_ratings_on_test_set_item_item_class(test):
    """Predict a rating for every (user_id, item_id) pair in ``test``.

    Predictions come from the global recommender ``reco_item_adjcos`` and are
    returned as a list in row order. Progress is logged through ``print_time``
    every 1000 rows.

    NOTE(review): ``reco_item_adjcos`` is not defined anywhere in the visible
    notebook — confirm where it is supposed to come from.
    """
    predictions = []
    pairs = zip(test.user_id, test.item_id)
    for n_done, (user_id, item_id) in enumerate(pairs, start=1):
        predictions.append(reco_item_adjcos.predict(user_id, item_id))
        if n_done % 1000 == 0:
            print_time(n_done)
    return predictions

def correct_for_NaNs(rating_pred):
    """Return the predictions as a Series with missing values replaced by 0.

    Parameters
    ----------
    rating_pred : sequence or pandas.Series
        Predicted ratings, possibly containing NaN/None.

    Returns
    -------
    pandas.Series
        A new Series with every missing value set to 0. The input is not
        modified. The number of missing values found is printed.
    """
    # Use the argument itself; the previous version ignored it and read the
    # global `test.rating_pred` instead.
    rating_pred_ = pd.Series(rating_pred)
    print(rating_pred_.isnull().sum(), "NaNs encountered!")

    return rating_pred_.fillna(0)

In [None]:
compute_rmse(data_test.rating, test_ratings)

In [None]:
target_user_items = pd.read_csv(PATH_DATA+'target_user_items.csv')
target_user_items.shape

target_user_items.head()

target_users = target_user_items.user_id.values


In [None]:
submission_ratings = predict_ratings_on_test_set_item_item_class(target_user_items)

In [None]:
submission = pd.DataFrame({
    'id': np.arange(target_user_items.shape[0]),
    'rating': submission_ratings
})

In [None]:
submission.to_csv(PATH_SUBMISSIONS+'submission.csv', index = False)