In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import random
import itertools
import Orange as org

#Predictors
import Predictors as pred
import NMF as nmf
from orangecontrib.associate.fpgrowth import *


In [6]:
class Recommender(object):
    def __init__(self, predictor=None):
        self.predictor = predictor

    def learn(self):
        if self.predictor is not None:
            self.predictor.fit(self.user_ratings)
        else:
            raise ValueError('No predictor')

    def recommend(self, user, n=10, rec_seen=True):
        # n - number of movies
        # rec_seen - already seen
        if user not in self.users:
            raise ValueError('No user with name ' + user)
        userid = self.users.index(user)
        # print(self.predictor.predict(userid))
        #print(self.predictor.predict(userid))
        mitm = [tuple(a) for a in zip(self.predictor.predict(userid), self.items)]
        if rec_seen == False:
            watched = (self.user_ratings[userid] < 1)
            mitm = [i for (i, v) in zip(mitm, watched) if v]

        # mitm = sorted(mitm, reverse=True)
        return mitm[:n]

    def parse_moviebase(self, file_name):
        data = open(file_name, 'rt', encoding='utf-8')
        self.items = []
        self.users = []
        self.user_ratings = []
        mode = 'none'
        for line in data:
            ln = line.strip()
            if not ln or ln[0] == '%': continue  # empty line or comment
            if ln == '[items]':
                # switch to parsing item data
                mode = 'items'
                continue
            if ln == '[users]':
                # switch to parsing user/rating data
                mode = 'users'
                iCount = len(self.items)
                continue
            if mode == 'items':
                self.items.append(ln)
            elif mode == 'users':
                ln = ln.split(',')
                if len(ln) != iCount + 1:  # check DB consistency
                    print("User %s has invalid number of ratings (%d)." % (ln[0], len(self.ratings[ln[0]])))
                self.user_ratings.append([])
                self.users.append(ln[0])
                for v in ln[1:]:
                    v = v.strip()
                    if v == '?':
                        self.user_ratings[-1].append(0)
                    else:
                        self.user_ratings[-1].append(float(v))
            else:
                print('Strange line in database:')
                print(line)
        self.user_ratings = np.array(self.user_ratings, dtype=np.int8)

    def getuser(self, username):
        return self.users.index(username)

    def getusers(self):
        return self.users

    def getitem(self, title):
        return self.items.index(title)

    def getitems(self):
        return self.items

    def split(self, learnsizepercentage=0.75):
        (usersNum, moviesNum) = np.shape(self.user_ratings)
        selector = np.random.rand(usersNum) < learnsizepercentage
        learnArray = self.user_ratings[selector]
        testArray = self.user_ratings[~selector]
        learnUsernames = np.array(self.users)[selector]
        testUsernames = np.array(self.users)[~selector]
        return (((learnArray, learnUsernames), (testArray, testUsernames)))

    def splitKFold(self, slices=10):
        (usersNum, moviesNum) = np.shape(self.user_ratings)
        selector = np.random.rand(usersNum)
        retArr = []
        h = 1 / slices
        for i in range(0, slices):
            s = (selector >= i / slices) & (selector < i / slices + h)
            retArr.append((self.user_ratings[s], np.array(self.users)[s]))
        return np.array(retArr)

    # Hides percentage of data in each column
    def hideRatings(self, hidepercentage=0.3):
        self.user_ratings_hidden = self.user_ratings.copy()
        for i in range(0, len(self.users)):
            ratedNumber = np.sum(self.user_ratings[i, :] != 0)
            rn = np.random.rand(len(self.items))
            sl = (self.user_ratings_hidden[i, :] == 0).astype(int)
            rn[sl] = 1
            sel = rn < hidepercentage
            dt = self.user_ratings[i, :]
            dt[sel] = 0

    def getAllData(self):
        return self.user_ratings_hidden

    def getRatingsData(self):
        return self.user_ratings

    def getMAE(self):
        original = self.getAllData()
        hidden = self.getRatingsData()
        pr = [];
        maxnumerOfitems = len(self.getitems())
        for name in self.getusers():
            data = self.recommend(name, maxnumerOfitems, True)
            #print(data)
            #print([x for x,y in data])
            pr.append([x for x, y in data])
        predicted = np.array(pr)
        s = original - hidden != 0
        return np.mean(np.abs(predicted[s] - original[s]))

    def getRMSE(self):
        original = self.getAllData()
        hidden = self.getRatingsData()
        pr = [];
        maxnumerOfitems = len(self.getitems())
        for name in self.getusers():
            data = self.recommend(name, maxnumerOfitems, True)
            pr.append([x for x, y in data])
        predicted = np.array(pr)
        s = original - hidden != 0
        return np.sqrt(np.mean(np.power(predicted[s] - original[s], 2)))

    def getPrecision(self, numberOfelements):
        original = self.getAllData()
        hidden = self.getRatingsData()
        pr = [];
        maxnumerOfitems = len(self.getitems())
        for name in self.getusers():
            data = self.recommend(name, maxnumerOfitems, True)
            pr.append([x for x, y in data])
        pr = np.array(pr)

        avpr = []
        for i in range(0, len(original)):

            sel = (original[i, :] - hidden[i, :]) != 0

            if len(original[i, :][sel]) < 1:
                break

            orW = original[i, :][sel] - np.mean(original[i, :][sel])

            prd = zip(pr[i, :][sel], np.arange(0, len(pr[i, :][sel])))

            p = [];
            for h, a in prd:
                p.append((h, a))
            p = sorted(p, reverse=True)
            p = p[:numberOfelements]

            retSel = [];
            for n, m in p:
                retSel.append(m);
            if min(numberOfelements, len(p)) != 0 and len(orW[retSel] >= 0) > 0:
                avpr.append(np.sum(orW[retSel] >= 0) / min(numberOfelements, len(p)))
        # print(avpr)
        return np.mean(np.array(avpr))

    def getRecall(self, numberOfelements):
        original = self.getAllData()
        hidden = self.getRatingsData()
        pr = [];
        maxnumerOfitems = len(self.getitems())
        for name in self.getusers():
            data = self.recommend(name, maxnumerOfitems, True)
            pr.append([x for x, y in data])
        pr = np.array(pr)

        avtr = []

        for i in range(0, len(original)):

            sel = (original[i, :] - hidden[i, :]) != 0
            if len(original[i, :][sel]) < 1:
                break

            orW = original[i, :][sel] - np.mean(original[i, sel])
            prd = zip(pr[i, :][sel], np.arange(0, len(pr[i, :][sel])))

            p = [];
            for h, a in prd:
                p.append((h, a))
            p = sorted(p, reverse=True)
            p = p[:numberOfelements]
            # print(p)
            retSel = [];
            for n, m in p:
                retSel.append(m);

            relaccIm = np.sum(orW > 0)

            relaccZg = sum(orW[retSel] > 0)

            recall = relaccZg / relaccIm
            if np.isnan(recall) == False:
                avtr.append(recall)

        m = np.mean(avtr)
        if np.isnan(m):
            return 0
        else:
            return m

    def getF1(self, numberOfelements):
        a = self.getRecall(numberOfelements)
        b = self.getPrecision(numberOfelements)
        return (2 * a * b) / (a + b)

    def parse_MovieLense(self, file_name, movie_titles_file, totalmovieslimit):

        self.items = []
        self.users = []
        self.user_ratings = []

        from numpy import genfromtxt
        data = genfromtxt(file_name, delimiter=',')
        maxuser = int(np.max(data[:, 0]))
        maxmovie = int(min(totalmovieslimit, np.max(data[:, 1])))
        # print(maxuser, maxmovie, len(data))

        # csc - UserBasedPredictor
        # csr - ItemBasedPredictor
        # dok - slope one
        from scipy.sparse import dok_matrix
        mtx = np.zeros((maxuser, maxmovie))
        # mtx = dok_matrix((maxuser, maxmovie));
        for i in range(len(data)):
            u = int(data[i, 0]) - 1
            m = int(data[i, 1]) - 1
            r = data[i, 2]
            if m < totalmovieslimit:
                #    print(maxmovie, maxuser, m, u)
                mtx[u, m] = int(r)

        users = []
        for i in range(1, int(maxuser) + 1):
            users.append(str(i))

        import pandas as pnd
        movies = [""] * int(maxmovie)
        df = pnd.read_csv(movie_titles_file)
        # df.iloc[2]["movieId"]
        # print(movies)
        for i in range(len(df)):
            movieId = int(df.iloc[i]["movieId"]);
            if movieId < totalmovieslimit:
                movies[movieId - 1] = df.iloc[i]["title"]

        self.items = movies
        self.users = users
        self.user_ratings = mtx

In [7]:
predictor = pred.RandomPredictor(1, 5)
predictor = pred.AveragePredictor(10)
predictor = pred.ViewsPredictor() #napovedovanje ogledov
predictor = pred.DeviationPredictor() #napovedovanje variacije ocen
predictor = pred.UserBasedPredictor(0,0.2)
predictor = pred.ItemBasedPredictor(0,0.2)
predictor = pred.SlopeOnePredictor()
predictor = pred.NMFPredictor(8,100,0.02)

r = Recommender(predictor)
r.parse_moviebase("moviebase2016.txt")
r.parse_MovieLense("ratingsSmal.csv", 'movies.csv', 50)
print("read")
r.hideRatings(0.3)
r.learn()
print("learned")


print("MAE:", r.getMAE())
print("RMSE:", r.getRMSE())

fac = 10
pr = r.getPrecision(fac)
re = r.getRecall(fac)
print("Precision:", pr)
print("Recall:", re)
print("F1:", (2*pr*re)/(pr+re))

read
learned
MAE: 0.903136538691
RMSE: 1.16793747934
Precision: 0.75
Recall: 1.0
F1: 0.857142857143


Prikaz prvih dveh vektorjev

In [4]:
#r.parse_moviebase("moviebase2016.txt")
#
#fac = nmf.NMF(5,100,0.02)
#(MA, MB) = fac.fit(r.getRatingsData())
#x, y = (MB[:,1], MB[:,2])
#lbl = r.getitems()
#
#import matplotlib.pyplot as plt
#for i in range(len(x)):
#    plt.plot(x[i],y[i], 'o', label = lbl[i], c=np.random.rand(3,1))
#plt.legend(bbox_to_anchor=(1.9, 1.05))
#plt.show()




In [5]:
#from numpy import genfromtxt
#data = genfromtxt('ratings.csv', delimiter=',')
#maxuser = np.max(data[:,0])
#maxmovie = np.max(data[:,1])
##print(maxuser, maxmovie, len(data))
#from scipy.sparse import dok_matrix 
#mtx = dok_matrix((maxuser, maxmovie), dtype=np.int8)
#for i in range(len(data)):
#    u = data[i, 0]-1
#    m = data[i, 1]-1
#    r = data[i, 2]
#    mtx[u,m] = int(r)
#    
#users = []
#for i in range(1,int(maxuser)):
#    users.append(str(i))
#
#import pandas as pnd
#movies = [""] * int(maxmovie)
#df = pnd.read_csv('movies.csv')
##df.iloc[2]["movieId"]
##print(movies)
#for i in range(len(df)):
#    movies[int(df.iloc[i]["movieId"])-1] = df.iloc[i]["title"]
#    
#    
#mtx[1, :]