In [37]:
from io import BytesIO

import numpy as np
import pandas as pd
from polara import RecommenderData
from polara import SVDModel
from polara import get_movielens_data
from polara.evaluation import evaluation_engine as ee
from polara.recommender.models import RecommenderModel
from polara.tools.preprocessing import filter_sessions_by_length

try:
    from pandas.io.common import ZipFile
except ImportError:
    from zipfile import ZipFile

In [38]:


def _split_genres(genres_data):
    '''Expands the pipe-separated `genres` column into one row per (movie, genre) pair.

    Returns a frame with columns `movieid`, `movienm` and `genreid`.
    '''
    genres_stacked = (genres_data['genres'].str.split('|', expand=True)
                      .stack().dropna().to_frame('genreid'))
    # stack() produces a (row label, position) MultiIndex; only the row label
    # is needed to join the genres back onto their movie
    if isinstance(genres_stacked.index, pd.MultiIndex):
        genres_stacked.index = genres_stacked.index.droplevel(-1)
    return (genres_data[['movieid', 'movienm']]
            .join(genres_stacked)
            .reset_index(drop=True))


def get_movielens_data(local_file=None, get_ratings=True, get_genres=False,
                       split_genres=True, mdb_mapping=False):
    '''Loads a MovieLens zip archive into pandas dataframes.

    Parameters
    ----------
    local_file : path or file-like, optional
        Zip archive to read. When falsy, the ml-1m archive is downloaded.
    get_ratings : bool
        Load the ratings file (columns: userid, movieid, rating, timestamp).
    get_genres : bool
        Load the movies file (columns: movieid, movienm, genres).
    split_genres : bool
        When loading genres, return one row per (movie, genre) pair with a
        `genreid` column instead of the raw pipe-separated `genres` column.
    mdb_mapping : bool
        Load the imdb/tmdb links file; only honoured for archives whose
        ratings file name contains 'latest' or '20m'.

    Returns
    -------
    A single DataFrame when exactly one dataset was loaded, otherwise a list
    of the loaded DataFrames in the order [ratings, genres, mapping]
    (an empty list when nothing was requested).
    '''
    if not local_file:
        # downloading data
        from requests import get
        zip_file_url = 'http://files.grouplens.org/datasets/movielens/ml-1m.zip'
        zip_response = get(zip_file_url)
        # fail here rather than later with an obscure "not a zip file" error
        zip_response.raise_for_status()
        zip_contents = BytesIO(zip_response.content)
    else:
        zip_contents = local_file

    ml_data = ml_genres = mapping = None
    # loading data into memory
    with ZipFile(zip_contents) as zfile:
        zip_files = pd.Series(zfile.namelist())
        zip_file = zip_files[zip_files.str.contains('ratings')].iat[0]
        is_new_format = ('latest' in zip_file) or ('20m' in zip_file)
        delimiter = ','
        header = 0 if is_new_format else None
        if get_ratings:
            zdata = zfile.read(zip_file)
            # '::' is a multi-char separator the pandas c-engine cannot parse;
            # the replacement is a no-op for files that are already comma-separated
            zdata = zdata.replace(b'::', delimiter.encode())
            ml_data = pd.read_csv(BytesIO(zdata), sep=delimiter, header=header, engine='c',
                                  names=['userid', 'movieid', 'rating', 'timestamp'],
                                  usecols=['userid', 'movieid', 'rating', 'timestamp'])

        if get_genres:
            zip_file = zip_files[zip_files.str.contains('movies')].iat[0]
            zdata = zfile.read(zip_file)
            genres_delimiter = delimiter
            if not is_new_format:
                # movie titles may contain commas, so '::' is mapped to '^'
                # to keep the file parseable by the pandas c-engine
                genres_delimiter = '^'
                zdata = zdata.replace(b'::', genres_delimiter.encode())
            genres_data = pd.read_csv(BytesIO(zdata), sep=genres_delimiter, header=header,
                                      engine='c', encoding='unicode_escape',
                                      names=['movieid', 'movienm', 'genres'])

            ml_genres = _split_genres(genres_data) if split_genres else genres_data

        if mdb_mapping and is_new_format:
            # imdb and tmdb mapping - exists only in ml-latest or 20m datasets
            zip_file = zip_files[zip_files.str.contains('links')].iat[0]
            with zfile.open(zip_file) as zdata:
                mapping = pd.read_csv(zdata, sep=',', header=0, engine='c',
                                      names=['movieid', 'imdbid', 'tmdbid'])

    res = [data for data in [ml_data, ml_genres, mapping] if data is not None]
    if len(res) == 1:
        res = res[0]
    return res

In [39]:
def remove_gaps(data):
    """Replace `movieid` and `userid` with gap-free integer codes.

    Codes are assigned in order of first appearance (0, 1, 2, ...), computed
    from the frame that is passed in rather than from a notebook global.

    The columns of `data` are overwritten in place and the same frame is
    returned, so both `remove_gaps(df)` and `df = remove_gaps(df)` work.
    """
    for column in ('movieid', 'userid'):
        # factorize numbers the distinct values by first occurrence, which is
        # what an unsorted groupby's group codes contain
        data[column] = pd.factorize(data[column])[0]
    return data

In [40]:
def normalize_timestamp(x):
    """Replace raw timestamps with their chronological position (0 = earliest).

    `np.argsort` alone returns the permutation that sorts the values, not the
    rank of each row, so the permutation is inverted here to obtain ranks.
    Ties keep their original row order (stable sort).

    Returns a new frame; the input is left untouched, because mutating the
    group passed in by `groupby.apply` is not supported by pandas.
    """
    order = np.argsort(x["timestamp"].values, kind="stable")
    ranks = np.empty(len(order), dtype=np.int64)
    # order[i] is the row holding the i-th smallest timestamp, so that row's rank is i
    ranks[order] = np.arange(len(order))
    return x.assign(timestamp=ranks)

def length_col(x):
    """Overwrite the `timestamp` column of `x` with the number of rows in `x`.

    The frame is modified in place and also returned.
    """
    n_rows = len(x)
    x['timestamp'] = n_rows
    return x

def train_test_val_split(data, train_end=0.9, val_end=0.95):
    """Split every user's interactions chronologically into train / val / test.

    For each user the rows are ranked by `timestamp` (0 = earliest, ties keep
    row order). With `n` the user's number of rows, a row at position `p` goes to

    * train when `p < train_end * n`
    * val   when `train_end * n <= p < val_end * n`
    * test  when `p >= val_end * n`

    Parameters
    ----------
    data : DataFrame with at least `userid` and `timestamp` columns
        (timestamps are expected to be non-null).
    train_end, val_end : float
        Cut points as fractions of each user's history,
        `0 <= train_end <= val_end <= 1`.

    Returns
    -------
    (data_train, data_val, data_test) : tuple of DataFrames
        Each keeps the input's row order and index; `timestamp` is replaced by
        the row's 0-based chronological position within its user inside that
        split. The input frame is not modified.
    """
    if not 0 <= train_end <= val_end <= 1:
        raise ValueError("expected 0 <= train_end <= val_end <= 1")

    def _position_in_user(frame):
        # vectorised per-user chronological rank; avoids groupby.apply, whose
        # output index layout differs between pandas versions
        ranks = frame.groupby("userid", sort=False)["timestamp"].rank(method="first")
        return (ranks - 1).astype("int64")

    def _renumber(part):
        return part.assign(timestamp=_position_in_user(part))

    data = data.assign(timestamp=_position_in_user(data))
    position = data["timestamp"]
    history_len = data.groupby("userid", sort=False)["timestamp"].transform("size")

    in_train = position < train_end * history_len
    in_test = position >= val_end * history_len
    in_val = ~in_train & ~in_test

    data_train = _renumber(data[in_train])
    data_val = _renumber(data[in_val])
    data_test = _renumber(data[in_test])
    return data_train, data_val, data_test

In [48]:
# Load the ratings from a local archive; the path is relative to the working
# directory. With the default flags only the ratings frame is returned.
ml_data = get_movielens_data("ml-10m.zip")
# Re-index user and movie ids as consecutive integer codes.
ml_data = remove_gaps(ml_data)

Unnamed: 0,userid,movieid,rating,timestamp
0,0,0,5.0,838985046
1,0,1,5.0,838983525
2,0,2,5.0,838983392
3,0,3,5.0,838983421
4,0,4,5.0,838983392


In [49]:
# Per-user split of the re-indexed ratings into train / validation / test.
(ml_data_train,
 ml_data_val,
 ml_data_test) = train_test_val_split(ml_data)

In [67]:
# Keep only the (user, movie, rating) columns of every split.
um_ml_data_train = ml_data_train[["userid", "movieid", "rating"]]
um_ml_data_val = ml_data_val[["userid", "movieid", "rating"]]
um_ml_data_test = ml_data_test[["userid", "movieid", "rating"]]

In [68]:
# Matching single-column frames holding the timestamp of every row.
ml_time_train = ml_data_train[["timestamp"]]
ml_time_val = ml_data_val[["timestamp"]]
ml_time_test = ml_data_test[["timestamp"]]

In [77]:
# Data model with 'userid' / 'movieid' / 'rating' as the user, item and
# feedback fields and a fixed seed.
# NOTE(review): the data argument is None, so no ratings frame is attached
# here - confirm whether one of the training frames was meant to be passed.
data_model = RecommenderData(None, 'userid', 'movieid', 'rating', seed=0)
# Split configuration; the exact meaning of each option is defined by polara.
# Presumably: hold out one item per test user, chosen deterministically - TODO confirm.
data_model.holdout_size = 1
data_model.random_holdout = False
data_model.warm_start = False
data_model.permute_tops = False

In [75]:
# Runs the data model's training-only preparation step.
# NOTE(review): this cell's execution count (75) is lower than that of the
# cell that builds data_model (77); re-check it on a fresh kernel.
data_model.prepare_training_only()

Preparing data...
Done.


In [None]:
class RecurentModel(RecommenderModel):
    """Item-to-item co-occurrence recommender built on polara's RecommenderModel.

    Scores for a test user are the product of the user's row in the test
    matrix with the item-item matrix R^T R computed from the training matrix.
    """

    def __init__(self, train, *args, **kwargs):
        """Create the model on top of the data model `train`.

        Assumes `train` is the polara data model (e.g. a `RecommenderData`
        instance) that the base class expects as its first argument - TODO
        confirm. Remaining arguments are forwarded to the base class unchanged.
        """
        # Forward the data model itself; field names such as 'userid' are
        # arguments of the data model, not of the recommender model.
        super(RecurentModel, self).__init__(train, *args, **kwargs)

        self.method = 'RecommenderModel' # pick some meaningful name

    def build(self):
        """Compute and store the item-to-item matrix from the training data."""
        # build model - calculate item-to-item matrix
        user_item_matrix = self.get_training_matrix()
        # rating matrix product  R^T R  gives cooccurrences count
        i2i_matrix = user_item_matrix.T.dot(user_item_matrix) # gives CSC format
        # exclude "self-links" and ensure only non-zero elements are stored
        i2i_matrix.setdiag(0)
        i2i_matrix.eliminate_zeros()
        # store matrix for generating recommendations
        self.i2i_matrix = i2i_matrix

    def get_recommendations(self):
        """Return the top-k recommendations for the test users.

        Requires `build()` to have been called so that `self.i2i_matrix` exists.
        """
        # get test users information and generate top-k recommendations
        test_matrix, test_data = self.get_test_matrix()
        # calculate predicted scores
        i2i_scores = test_matrix.dot(self.i2i_matrix)
        # prevent seen items from appearing in recommendations
        if self.filter_seen:
            self.downvote_seen_items(i2i_scores, test_data)
        # generate top-k recommendations for every test user
        top_recs = self.get_topk_elements(i2i_scores)
        return top_recs

In [102]:
import scipy as sp

# Three unit entries that all target cell (3, 3) of a 4x4 matrix: the dense
# view shows how repeated coordinates are combined on construction.
dup_values = np.ones(3)
dup_rows = [3, 3, 3]
dup_cols = [3, 3, 3]
sp.sparse.csr_matrix((dup_values, (dup_rows, dup_cols)),
                     shape=(4, 4), dtype=np.float64).todense()

matrix([[0., 0., 0., 0.],
        [0., 0., 0., 0.],
        [0., 0., 0., 0.],
        [0., 0., 0., 3.]])

In [None]:
def evaluate_recommendations(rec_outputs, rec_inputs, k):
    """Average MRR@k and recall@k over users.

    Parameters
    ----------
    rec_outputs : iterable of (user, ordered_rec_list)
        `ordered_rec_list` has one entry per time step; the last entry is
        used. Assumes each entry is a sequence of item ids ordered best-first
        - TODO confirm against the model's output format.
    rec_inputs : iterable of sequences
        The true consumption list of each user (one item per time step),
        aligned with `rec_outputs`.
    k : int
        Cut-off of the recommendation list.

    Returns
    -------
    (MRR, recall) : tuple of floats; both are NaN when there is nothing to score.
    """
    MRR_scores = []
    recall_scores = []
    for (user, ordered_rec_list), true_consumption_list in zip(rec_outputs, rec_inputs):
        # NOTE(review): indices kept from the original draft - the target is
        # the second-to-last consumed item, compared with the last time
        # step's recommendations; confirm this offset is intended.
        true_consumption = true_consumption_list[-2]
        recommended_consumption = ordered_rec_list[-1]
        top_k = list(recommended_consumption[:k])
        if true_consumption in top_k:
            # single relevant item: reciprocal of its 1-based rank, and a hit for recall
            MRR_score = 1.0 / (top_k.index(true_consumption) + 1)
            recall_score = 1.0
        else:
            MRR_score = 0.0
            recall_score = 0.0
        MRR_scores.append(MRR_score)
        recall_scores.append(recall_score)
    if not MRR_scores:
        return float('nan'), float('nan')
    recall = sum(recall_scores) / len(recall_scores)
    MRR = sum(MRR_scores) / len(MRR_scores)
    return MRR, recall
