In [1]:
from EMF import *

In [2]:
from preprocess import * 

In [3]:
from usertouser import *

In [4]:
import sys

In [5]:
def calc_cor(user2user, ratings, u,v):
    q1 = user2user.user_rated_items(u)
    q2 = user2user.user_rated_items(v)
    Ru = np.mean(ratings[ratings['userid'] == u]['rating'].tolist())
    Rv = np.mean(ratings[ratings['userid'] == v]['rating'].tolist())
    els = [e for e in q1 if e in q2]
    den = []
    en = []
    for el in els:
        rui = ratings.loc[(ratings['userid'] == u) & (ratings['itemid'] == el)]['rating'].tolist()[0]
        rvi = ratings.loc[(ratings['userid'] == v) & (ratings['itemid'] == el)]['rating'].tolist()[0]
        en.append((rui - Ru)*(rvi - Rv))
        den.append((rui - Ru)**2 * (rvi - Rv) **2)
    if len(den) == 0 or sum(den) == 0:
        return 0 
    else:
        return np.sum(en) / np.sqrt(np.sum(den))

In [51]:
def explainable_score(user2user, users, items, ratings, theta=0):
    
    def _progress(count):
        sys.stdout.write('\rCompute Explainable score. Progress status : %.1f%%'%(float(count/len(users))*100.0))
        sys.stdout.flush()
    # initialize explainable score to zeros
    W = np.zeros((len(users), len(items)))

    for count, u in enumerate(users):            
        candidate_items = user2user.find_user_candidate_items(u)        
        for i in candidate_items:
            user_who_rated_i, similar_user_who_rated_i = \
                user2user.similar_users_who_rated_this_item(u, i)
            if user_who_rated_i.shape[0] == 0:
                w = 0.0
            else:
                l1 = np.array([])
                l2 = np.array([])
                for v in user_who_rated_i[:5]:
                    rvi = ratings.loc[(ratings['userid'] == v) & (ratings['itemid'] == i)]['rating'].tolist()[0]
                    corr = calc_cor(user2user, ratings, u,v)
                    l1 = np.append(l1,rvi * corr)
                    l2 = np.append(l2,np.abs(corr))
                if len(l2) == 0 or np.sum(l2) == 0:
                    w = 0.0
                else:
                    w = np.sum(l1) / np.sum(l2)
            W[u,i] =  w  if w > theta else 0.0
        _progress(count)
    return W


In [52]:
W = explainable_score(user2user, users, items, ratings)

Compute Explainable score. Progress status : 99.9%

In [53]:
np.unique(W)

array([0.00000000e+00, 3.16343332e-04, 1.89689195e-03, ...,
       5.00000000e+00, 5.00000000e+00, 5.00000000e+00])

In [7]:
import pandas as pd
import zipfile
import urllib.request
import sys
import os

DOWNLOAD_DESTINATION_DIR = "dataset"


def unzip(name):
    path = os.path.join(DOWNLOAD_DESTINATION_DIR, name)
    print(f"Unzipping the {name} zip file ...")
        
    with zipfile.ZipFile(path, 'r') as data:
        data.extractall(DOWNLOAD_DESTINATION_DIR)


def _progress(count, block_size, total_size):
    sys.stdout.write('\rDownload data %.1f%%' % (float(count * block_size)/float(total_size) * 100.0))
    sys.stdout.flush()


def download(url, name):
    path = os.path.join(DOWNLOAD_DESTINATION_DIR, name)
    if not os.path.exists(path):        
        os.makedirs(DOWNLOAD_DESTINATION_DIR, exist_ok=True)
        fpath, _ = urllib.request.urlretrieve(url, filename=path, reporthook=_progress)
        
        print()
        statinfo = os.stat(fpath)
        print('Successfully downloaded', name, statinfo.st_size, 'bytes.')
        unzip(name)


class mlLatestSmall:

    @staticmethod
    def load():        
        url = "http://files.grouplens.org/datasets/movielens/ml-latest-small.zip"
        name = 'ml-latest-small'
        
        download(url, f"{name}.zip")
        
        ratings_path = os.path.join(DOWNLOAD_DESTINATION_DIR, name, 'ratings.csv')
        ratings = pd.read_csv(
            ratings_path,
            sep=',',
            names=["userid", "itemid", "rating", "timestamp"],
            skiprows=1
        )

        movies_path = os.path.join(DOWNLOAD_DESTINATION_DIR, name, 'movies.csv')
        movies = pd.read_csv(
            movies_path,
            sep=',',
            names=["itemid", "title", "genres"],
            encoding='latin-1',
            skiprows=1
        )
        
        return ratings, movies


class ml100k:

    @staticmethod
    def load():        
        url = "http://files.grouplens.org/datasets/movielens/ml-100k.zip"
        name = 'ml-100k'
        
        download(url, f"{name}.zip")
        
        ratings_path = os.path.join(DOWNLOAD_DESTINATION_DIR, name, 'u.data')
        ratings = pd.read_csv(
            ratings_path,
            sep='\t',
            names=["userid", "itemid", "rating", "timestamp"],
        )
        ratings = ratings.sort_values(by=['userid', 'itemid']).reset_index(drop=True)
        ratings = ratings.drop(columns=['timestamp'])

        movies_columns = [
            'itemid', 'title', 'release date', 'video release date', 
            'IMDb URL ', 'unknown', 'Action', 'Adventure', 'Animation',
            "Children's", 'Comedy' , 'Crime' , 'Documentary' , 'Drama' , 'Fantasy' ,
            'Film-Noir', 'Horror', 'Musical' , 'Mystery' , 'Romance' , 'Sci-Fi' ,
            'Thriller' , 'War' , 'Western' ,
        ]

        movies_path = os.path.join(DOWNLOAD_DESTINATION_DIR, name, 'u.item')
        movies = pd.read_csv(
            movies_path,
            sep='|',
            names=movies_columns,
            encoding='latin-1',
        )
        # drop non necessary columns. From the third to the last column
        todrop = list(range(2, len(movies.columns)))
        movies = movies.drop(movies.columns[todrop], axis=1)
        
        return ratings, movies

In [8]:
ratings, movies = ml100k.load()

In [21]:
u = users[1]

In [9]:
user2user = UserToUser(ratings, movies)

Normalize users ratings ...
Initialize the similarity model ...
Compute nearest neighbors ...
User to user recommendation model created with success!


In [26]:
candidate_items = user2user.find_user_candidate_items(u)

In [54]:
epochs = 10


In [62]:
ratings, uencoder, iencoder = ids_encoder(ratings)

In [57]:
users = sorted(ratings.userid.unique())
items = sorted(ratings.itemid.unique())

m = len(users)
n = len(items)

# get examples as tuples of userids and itemids and labels from normalize ratings
raw_examples, raw_labels = get_examples(ratings)

# train test split
(x_train, x_test), (y_train, y_test) = train_test_split(examples=raw_examples, labels=raw_labels)


In [82]:
m,n

(943, 1682)

In [11]:
ratings.head()

Unnamed: 0,userid,itemid,rating
0,0,0,5
1,0,1,3
2,0,2,4
3,0,3,3
4,0,4,3


In [69]:
EMF = ExplainableMatrixFactorization(m, n, W, alpha=0.01, beta=0.7, lamb=0.03, k=10)

In [70]:
history = EMF.fit(x_train, y_train, epochs=50, validation_data=(x_test, y_test))

Training EMF
k=10 	 alpha=0.01 	 beta=0.7 	 lambda=0.03
epoch 1/50 - loss : 0.958 - val_loss : 1.055
epoch 2/50 - loss : 0.837 - val_loss : 0.9
epoch 3/50 - loss : 0.818 - val_loss : 0.869
epoch 4/50 - loss : 0.812 - val_loss : 0.858
epoch 5/50 - loss : 0.81 - val_loss : 0.852
epoch 6/50 - loss : 0.809 - val_loss : 0.849
epoch 7/50 - loss : 0.809 - val_loss : 0.847
epoch 8/50 - loss : 0.809 - val_loss : 0.845
epoch 9/50 - loss : 0.809 - val_loss : 0.844
epoch 10/50 - loss : 0.809 - val_loss : 0.844
epoch 11/50 - loss : 0.809 - val_loss : 0.843
epoch 12/50 - loss : 0.809 - val_loss : 0.842
epoch 13/50 - loss : 0.809 - val_loss : 0.842
epoch 14/50 - loss : 0.809 - val_loss : 0.841
epoch 15/50 - loss : 0.809 - val_loss : 0.841
epoch 16/50 - loss : 0.809 - val_loss : 0.841
epoch 17/50 - loss : 0.809 - val_loss : 0.841
epoch 18/50 - loss : 0.809 - val_loss : 0.84
epoch 19/50 - loss : 0.809 - val_loss : 0.84
epoch 20/50 - loss : 0.809 - val_loss : 0.84
epoch 21/50 - loss : 0.809 - val_loss :

In [63]:
predictions = []
for a,b in x_test:
    predictions.append(EMF.predict(a,b,uencoder,iencoder))

In [64]:
k_list = [1,5,10,15,20]

In [66]:
test_user_ids = np.array([a for a,b in x_test]).astype(int)
test_movie_ids = np.array([b for a,b in x_test]).astype(int)

In [67]:
ndcgs = []
recalls = []
mnaps = []

for k in k_list:
    ndcgs.append(EMF.calc_ndcg(np.array(predictions), k, test_user_ids, y_test, test_movie_ids))
    recalls.append(EMF.calc_recalls(k,ratings, test_user_ids, uencoder, iencoder))
    mnaps.append(EMF.calc_mnap(k,ratings, test_user_ids, uencoder, iencoder))

In [68]:
ndcgs, recalls, mnaps

([0.532571249843209,
  0.557032387502378,
  0.578034987237875,
  0.594817350828534,
  0.615358043243442],
 [0.1424432, 0.1613423, 0.17798435, 0.1956721, 0.2351381],
 [0.3116,
  0.16813266666666667,
  0.14312070238095237,
  0.12078960922410923,
  0.12419097175296324])

In [36]:
for k in [7,10,13,16,20]:
    EMF = ExplainableMatrixFactorization(m, n, W, alpha=0.01, beta=0.7, lamb=0.03, k=k)
    EMF.fit(x_train, y_train, epochs=5, validation_data=(x_test, y_test))
    ratings, uencoder, iencoder = ids_encoder(ratings)
    val = EMF.calc_mnap(5,ratings, test_user_ids, uencoder, iencoder)
    print('k:',k, 'score:',val)

Training EMF
k=7 	 alpha=0.01 	 beta=0.7 	 lambda=0.03
epoch 1/5 - loss : 0.98 - val_loss : 1.058
epoch 2/5 - loss : 0.843 - val_loss : 0.9
epoch 3/5 - loss : 0.823 - val_loss : 0.87
epoch 4/5 - loss : 0.816 - val_loss : 0.858
epoch 5/5 - loss : 0.813 - val_loss : 0.852
k: 7 score: 0.27791199999999994
Training EMF
k=10 	 alpha=0.01 	 beta=0.7 	 lambda=0.03
epoch 1/5 - loss : 0.958 - val_loss : 1.055
epoch 2/5 - loss : 0.837 - val_loss : 0.9
epoch 3/5 - loss : 0.818 - val_loss : 0.869
epoch 4/5 - loss : 0.812 - val_loss : 0.858
epoch 5/5 - loss : 0.81 - val_loss : 0.852
k: 10 score: 0.11685466666666666
Training EMF
k=13 	 alpha=0.01 	 beta=0.7 	 lambda=0.03
epoch 1/5 - loss : 0.93 - val_loss : 1.045
epoch 2/5 - loss : 0.829 - val_loss : 0.905
epoch 3/5 - loss : 0.813 - val_loss : 0.873
epoch 4/5 - loss : 0.808 - val_loss : 0.861
epoch 5/5 - loss : 0.807 - val_loss : 0.854
k: 13 score: 0.108135
Training EMF
k=16 	 alpha=0.01 	 beta=0.7 	 lambda=0.03
epoch 1/5 - loss : 0.917 - val_loss : 

In [37]:
tuning_params = dict()
tuning_params = { 
#  "k":(7,25)
  "lamb": (0.01,0.1)
#  "beta": (0.4,0.8)
 }

In [42]:
def func1(lamb):
    
    rat, movies = ml100k.load()
    ratings, uencoder, iencoder = ids_encoder(rat)
    
    users = sorted(ratings.userid.unique())
    items = sorted(ratings.itemid.unique())

    m = len(users)
    n = len(items)

    # get examples as tuples of userids and itemids and labels from normalize ratings
    raw_examples, raw_labels = get_examples(ratings)

    # train test split
    (x_train, x_test), (y_train, y_test) = train_test_split(examples=raw_examples, labels=raw_labels)
    
    recommender = ExplainableMatrixFactorization(943, 1682, W, alpha=0.01, beta=0.7, lamb=lamb, k=7)
    recommender.fit(x_train, y_train, epochs=5, validation_data=(x_test, y_test))
    ratings, uencoder, iencoder = ids_encoder(rat)
    predictions = []
    for a,b in x_test:
        predictions.append(EMF.predict(a,b,uencoder,iencoder))
    
#    val = recommender.calc_mnap(5,ratings, test_user_ids, uencoder, iencoder)
#    val1 = recommender.calc_recalls(5,ratings, test_user_ids, uencoder, iencoder)
    val2 = recommender.calc_ndcg(np.array(predictions), 5, test_user_ids, y_test, test_movie_ids)/2

    return val2

In [43]:
from bayes_opt import BayesianOptimization

optimizer = BayesianOptimization(
  f = func1,
  pbounds = tuning_params,
  random_state = 3, 
 )

optimizer.maximize(
  init_points = 3,
  n_iter = 4, 
 )

|   iter    |  target   |   lamb    |
-------------------------------------
Training EMF
k=7 	 alpha=0.01 	 beta=0.7 	 lambda=0.059571811231711805
epoch 1/5 - loss : 0.98 - val_loss : 1.058
epoch 2/5 - loss : 0.843 - val_loss : 0.9
epoch 3/5 - loss : 0.823 - val_loss : 0.87
epoch 4/5 - loss : 0.816 - val_loss : 0.858
epoch 5/5 - loss : 0.813 - val_loss : 0.852
| [0m 1       [0m | [0m 0.459   [0m | [0m 0.05957 [0m |
Training EMF
k=7 	 alpha=0.01 	 beta=0.7 	 lambda=0.07373330403562944
epoch 1/5 - loss : 0.98 - val_loss : 1.058
epoch 2/5 - loss : 0.843 - val_loss : 0.9
epoch 3/5 - loss : 0.823 - val_loss : 0.87
epoch 4/5 - loss : 0.816 - val_loss : 0.858
epoch 5/5 - loss : 0.813 - val_loss : 0.852
| [0m 2       [0m | [0m 0.459   [0m | [0m 0.07373 [0m |
Training EMF
k=7 	 alpha=0.01 	 beta=0.7 	 lambda=0.03618142650216499
epoch 1/5 - loss : 0.98 - val_loss : 1.058
epoch 2/5 - loss : 0.843 - val_loss : 0.9
epoch 3/5 - loss : 0.823 - val_loss : 0.87
epoch 4/5 - loss : 0.816 - va

TypeError: 'float' object is not subscriptable

In [111]:
optimizer.max

{'target': 0.11861233333333335, 'params': {'beta': 0.7674443631751686}}

In [None]:
beta = optimizer.max['params']['beta']
lamb = optimizer.max['params']['lamb']

In [113]:
EMF = ExplainableMatrixFactorization(m, n, W, alpha=0.01, beta=0.75, lamb=0.03, k=10)

history = EMF.fit(x_train, y_train, epochs=epochs, validation_data=(x_test, y_test))

Training EMF
k=10 	 alpha=0.01 	 beta=0.75 	 lambda=0.03
epoch 1/10 - loss : 0.968 - val_loss : 1.062
epoch 2/10 - loss : 0.847 - val_loss : 0.907
epoch 3/10 - loss : 0.828 - val_loss : 0.877
epoch 4/10 - loss : 0.823 - val_loss : 0.866
epoch 5/10 - loss : 0.82 - val_loss : 0.86
epoch 6/10 - loss : 0.82 - val_loss : 0.857
epoch 7/10 - loss : 0.819 - val_loss : 0.855
epoch 8/10 - loss : 0.819 - val_loss : 0.854
epoch 9/10 - loss : 0.819 - val_loss : 0.853
epoch 10/10 - loss : 0.819 - val_loss : 0.852


In [22]:
test_user_ids = np.array([a for a,b in x_test]).astype(int)
test_movie_ids = np.array([b for a,b in x_test]).astype(int)

In [116]:
predictions = []
for a,b in x_test:
    predictions.append(EMF.predict(a,b,uencoder,iencoder))

In [96]:
predictions[:5]

[2.314878112508595,
 3.319415207445759,
 2.72246692013486,
 4.21325621302688,
 2.668411206363998]

In [10]:
EMF.evaluate(x_test, y_test)

validation error : 0.797
