In [1]:
import pandas as pd
import numpy as np
from scipy.sparse import csr_matrix, save_npz, load_npz

In [2]:
import json

def iter_json_data(path):
    """Lazily yield one decoded JSON object per line of a JSON Lines file.

    :param path: path to a text file that holds one JSON document per line
    :return: generator over the decoded objects (whatever ``json.loads`` returns)
    :raises json.JSONDecodeError: if a non-blank line is not valid JSON
    """
    # Explicit encoding: the platform default is not UTF-8 everywhere, which
    # would garble or reject non-ASCII review text.
    with open(path, encoding="utf-8") as f:
        for line in f:
            # Skip blank lines (e.g. a trailing empty line at the end of the
            # file) instead of failing inside json.loads.
            if not line.strip():
                continue
            yield json.loads(line)
            
def get_data_frame(path=None):
    """Load the review dump into a DataFrame with dense integer ids.

    Reviewer ids (``reviewerID``) and item ids (``asin``) are replaced by
    consecutive integers assigned in order of first appearance.

    :param path: JSON Lines file to read; when omitted, the notebook-level
        constant ``JSON_DATA_PATH`` is used (the original behaviour)
    :return: pd.DataFrame with columns ``uid``, ``iid``, ``review``,
        ``rating`` (float) and ``dt`` (int, taken from ``unixReviewTime``)
    :raises KeyError: if a record lacks one of the fields read below
    """
    if path is None:
        # Resolved at call time, so the constant may be defined in a later cell
        # as long as it exists before the function is called.
        path = JSON_DATA_PATH

    uid_to_id = {}
    iid_to_id = {}

    cols = ["uid", "iid", "review", "rating", "dt"]
    rows = []
    for d in iter_json_data(path):
        # setdefault hands out the next free integer the first time a key is seen.
        uid = uid_to_id.setdefault(d["reviewerID"], len(uid_to_id))
        iid = iid_to_id.setdefault(d["asin"], len(iid_to_id))
        review = d["reviewText"]
        rating = float(d["overall"])
        dt = int(d["unixReviewTime"])
        rows.append((uid, iid, review, rating, dt))

    # Build the frame once from the collected tuples.
    return pd.DataFrame(rows, columns=cols)

In [3]:
# Path of the review dump (one JSON object per line) read by get_data_frame().
JSON_DATA_PATH = "lab_data/Video_Games_5.json"
# NOTE(review): N is not referenced in any visible cell — presumably the length
# of the recommendation list (top-N); confirm or remove.
N = 10

# Parse the dump into a DataFrame with columns uid, iid, review, rating, dt.
df = get_data_frame()
df.head()

Unnamed: 0,uid,iid,review,rating,dt
0,0,0,Installing the game was a struggle (because of...,1.0,1341792000
1,1,0,If you like rally cars get this game you will ...,4.0,1372550400
2,2,0,1st shipment received a book instead of the ga...,1.0,1403913600
3,3,0,"I got this version instead of the PS3 version,...",3.0,1315958400
4,4,0,I had Dirt 2 on Xbox 360 and it was an okay ga...,4.0,1308009600


In [4]:
def split_df_by_dt(df, p=0.8):
    """Split ``df`` into training and test parts by review time (column ``dt``).

    The border is the ``p``-quantile of ``dt``: rows with ``dt`` at or below it
    form the training part, rows strictly above it form the test part. Test
    rows whose user or item never occurs in the training part are dropped
    (unknown users break personalised recommenders, unknown items break any
    recommender).

    :param df: reviews with at least the columns ``dt``, ``uid`` and ``iid``
    :param p: quantile of ``dt`` used as the border, e.g. ``p=0.8`` puts the
        earliest 80% of the reviews into the training part
    :return: two pd.DataFrame objects, ``(training_df, test_df)``
    """
    timestamps = df.dt
    border_dt = timestamps.quantile(p)
    print("Min=%s, border=%s, max=%s" % (timestamps.min(), border_dt, timestamps.max()))

    training_df = df[timestamps <= border_dt]
    test_df = df[timestamps > border_dt]
    print("Размер до очистки:", training_df.shape, test_df.shape)

    # Keep only test rows whose user AND item are both known from training.
    known_user = test_df.uid.isin(training_df.uid)
    known_item = test_df.iid.isin(training_df.iid)
    test_df = test_df[known_user & known_item]
    print("Размер после очистки:", training_df.shape, test_df.shape)

    return training_df, test_df

In [5]:
def clean_df(df, min_review_per_uid, min_review_per_iid):
    """Drop rows of users and items that have too few reviews.

    Users with fewer than ``min_review_per_uid`` and items with fewer than
    ``min_review_per_iid`` non-null ``review`` values are removed. Removing
    rows can push other users/items below their threshold, so the filter is
    repeated until nothing more is removed. The input frame is not modified.

    :param df: reviews with the columns ``uid``, ``iid`` and ``review``
    :return: filtered pd.DataFrame
    """
    remaining = df.copy()
    while True:
        per_user = remaining.groupby("uid")["review"].count()
        per_item = remaining.groupby("iid")["review"].count()

        sparse_uids = per_user[per_user < min_review_per_uid].index
        sparse_iids = per_item[per_item < min_review_per_iid].index

        # Fixed point reached: every surviving user and item has enough reviews.
        if len(sparse_uids) == 0 and len(sparse_iids) == 0:
            return remaining

        to_drop = remaining.uid.isin(sparse_uids) | remaining.iid.isin(sparse_iids)
        remaining = remaining[~to_drop]

In [6]:
# Chronological split at the 0.8 quantile of dt (see split_df_by_dt).
training_df, test_df = split_df_by_dt(df, p=0.8)
# Drop the name `df` (presumably to free memory).
# NOTE(review): re-running this cell alone now fails with NameError until the
# loading cell is executed again.
del df

Min=939859200, border=1377129600.0, max=1405987200
Размер до очистки: (185427, 5) (46353, 5)
Размер после очистки: (185427, 5) (19174, 5)


In [7]:
def hit_ratio(recs_dict, test_dict):
    """Compute the hit ratio of recommendations against the test data.

    A test user counts as a hit when at least one of their test items appears
    among the items recommended to them. Users without recommendations count
    as misses.

    :param recs_dict: recommendations, ``{uid: {iid: score, ...}, ...}``
    :param test_dict: ground truth, ``{uid: {iid: score, ...}, ...}``
    :return: share of test users with a hit, in ``[0, 1]``; ``0.0`` when
        ``test_dict`` is empty (previously a ZeroDivisionError)
    """
    if not test_dict:
        return 0.0

    hits = sum(
        1
        for uid, relevant in test_dict.items()
        # isdisjoint stops at the first shared item instead of building the
        # full intersection.
        if not set(relevant).isdisjoint(recs_dict.get(uid, {}))
    )
    return hits / len(test_dict)

In [8]:
def get_test_dict(test_df):
    """Convert the test DataFrame into a nested dictionary.

    :param test_df: frame with the columns ``uid``, ``iid`` and ``rating``
    :return: ``{uid: {iid: rating, ...}, ...}``; if a (uid, iid) pair occurs
        more than once, the rating of the last row wins
    """
    ratings_by_user = {}
    for uid, iid, rating in zip(test_df["uid"], test_df["iid"], test_df["rating"]):
        ratings_by_user.setdefault(uid, {})[iid] = rating
    return ratings_by_user

# Ground truth for evaluation: {uid: {iid: rating}} built from the test split.
test_dict = get_test_dict(test_df)

In [9]:
class BasicRecommender(object):
    """Common interface of all recommenders; on its own it recommends nothing."""

    def __init__(self):
        """Nothing to set up in the base class."""

    def get_recs(self, uid, top):
        """Build recommendations for the single user ``uid``.

        :return: dictionary of the form ``{iid: score, ...}`` (always empty here)
        """
        return dict()

    def get_batch_recs(self, uids, top):
        """Build recommendations for every user in ``uids``.

        :return: dictionary of the form ``{uid: {iid: score, ...}, ...}``
        """
        batch = {}
        for uid in uids:
            batch[uid] = self.get_recs(uid, top)
        return batch
    
class NonPersRecommender(BasicRecommender):
    """Non-personalised recommender: every user gets the same ranked list.

    The ranking is computed once in ``__init__`` by ``_prepare_recs`` and kept
    in ``self.recs``; subclasses override ``_prepare_recs`` to supply it.
    """

    def __init__(self, df):
        super(NonPersRecommender, self).__init__()
        self.recs = self._prepare_recs(df)

    def _prepare_recs(self, df):
        """Return the global ranking; the base implementation returns an empty one.

        :param df: training data (unused here)
        :return: pd.Series — ``get_recs`` takes its first ``top`` entries
        """
        # Explicit dtype: the default dtype of an empty Series has changed
        # between pandas versions (float64 -> object) and omitting it raises a
        # deprecation warning on some of them.
        return pd.Series([], dtype="float64")

    def get_recs(self, uid, top):
        """Return the first ``top`` entries of the global ranking.

        :param uid: ignored, the ranking is the same for every user
        :return: OrderedDict ``{iid: score, ...}`` in ranking order
        """
        from collections import OrderedDict
        return OrderedDict(self.recs[:top])

    def get_batch_recs(self, uids, top):
        """Return the same top list for every user in ``uids``.

        :return: dictionary of the form ``{uid: {iid: score, ...}, ...}``
        """
        # Built once and shared: all values are the SAME dict object, so
        # callers must not mutate a single user's entry in place.
        non_pers_recs = self.get_recs(None, top)
        return {uid: non_pers_recs for uid in uids}

In [10]:
def select_item(source_matrix, keys_to_index, key_, exclude_keys):
    """Look up the score of item ``key_`` in a one-row score matrix.

    Items listed in ``exclude_keys`` (already acquired ones) get score 0.

    :param source_matrix: 2-D indexable; only row 0 is read
    :param keys_to_index: mapping from item id to column index
    :param key_: item id to look up
    :param exclude_keys: container of item ids that must score 0
    :return: ``source_matrix[0, keys_to_index[key_]]`` or 0 for excluded items
    """
    if key_ not in exclude_keys:
        return source_matrix[0, keys_to_index[key_]]
    return 0

In [38]:
class ContentBasedRecommender_dummy(NonPersRecommender):
    """Content-based recommender over one row block of a user-item similarity matrix.

    ``sim_matrix`` is a scipy sparse matrix that holds the similarity rows of
    the users of a single block: the row of user ``uid`` is
    ``uid_to_ind[uid] - block_num * block_len`` and the column of item ``iid``
    is ``iid_to_ind[iid]``. For each user the items with the highest
    similarity are recommended; items the user already has in ``train_df``
    are never recommended.
    """

    def __init__(self, uid_to_ind, iid_to_ind, train_df, sim_matrix, block_num, block_len):
        # Deliberately start the MRO lookup *after* NonPersRecommender: its
        # __init__ eagerly calls self._prepare_recs(df), which has a different
        # signature in this class. Recommendations are built lazily in get_recs.
        super(NonPersRecommender, self).__init__()
        self.uid_to_ind = uid_to_ind
        self.iid_to_ind = iid_to_ind
        self.train_df = train_df
        self.sim_matrix = sim_matrix
        self.block_num = block_num
        self.block_len = block_len

    def get_recs(self, uids_to_recommend, top_k=10):
        """Build recommendations for several users of this block.

        Note that, unlike the base classes, this takes a collection of users.

        :param uids_to_recommend: iterable of user ids that belong to this block
        :param top_k: maximum number of items per user
        :return: ``{uid: {iid: score, ...}, ...}``; also stored in ``self.recs``
        """
        self.recs = self._prepare_recs(
            uids_to_recommend,
            self.uid_to_ind,
            self.iid_to_ind,
            self.train_df,
            self.sim_matrix,
            top_k,
            self.block_num,
            self.block_len
        )
        return self.recs

    def get_batch_recs(self, uids, top):
        """Batch interface of the base classes, delegating to ``get_recs``.

        The inherited implementation calls ``get_recs(None, top)``, which does
        not work with the batch-style ``get_recs`` of this class.

        :return: ``{uid: {iid: score, ...}, ...}``
        """
        return self.get_recs(uids, top)

    def _prepare_recs(self, uids_to_recommend, uid_to_ind, iid_to_ind, train_df, sim_matrix, top_k, block_num, block_len):
        """Rank the not-yet-acquired items of every requested user by similarity.

        :return: ``{uid: {iid: score, ...}, ...}`` with at most ``top_k`` items
            per user, ordered by descending score
        :raises KeyError: if a user id is missing from ``uid_to_ind``
        """
        # Loop-invariant work, hoisted out of the per-user loop.
        item_id_list = list(iid_to_ind.keys())
        item_ids = np.array(item_id_list)
        item_cols = np.array([iid_to_ind[iid] for iid in item_id_list], dtype=np.intp)
        row_offset = block_num * block_len

        # One pass over the training data instead of one full scan per user.
        uids = list(uids_to_recommend)
        seen = train_df[train_df.uid.isin(uids)]
        bought_by_uid = {uid: grp.values for uid, grp in seen.groupby("uid")["iid"]}

        out = {}
        for uid in uids:
            row = sim_matrix.getrow(uid_to_ind[uid] - row_offset).toarray().ravel()

            # Drop already acquired items from the candidates altogether.
            # Scoring them 0 (as before) still let them be recommended whenever
            # fewer than top_k other items had a positive score.
            if uid in bought_by_uid:
                candidates = ~np.isin(item_ids, bought_by_uid[uid])
            else:
                candidates = np.ones(len(item_ids), dtype=bool)

            scores = pd.Series(row[item_cols[candidates]], index=item_ids[candidates])
            out[uid] = scores.sort_values(ascending=False).iloc[:top_k].to_dict()
        return out

In [69]:
# Users to build recommendations for: every distinct user of the test split.
uids_to_recommend = test_df.uid.unique()

In [70]:
# Number of users that will receive recommendations.
len(uids_to_recommend)

6815

In [71]:
# Dense zero-based positions for every item / user of the training split, in
# order of first appearance. They are used below as column / row indices of the
# similarity matrices.
# (Comprehensions keep the loop variables local; the previous loop variable
# `id` shadowed the builtin for the rest of the notebook.)
id_to_ind = {iid: ind for ind, iid in enumerate(training_df.iid.unique())}
uid_to_ind = {uid: ind for ind, uid in enumerate(training_df.uid.unique())}

# Inverse mapping: position -> user id.
ind_to_uid = {ind: uid for uid, ind in uid_to_ind.items()}

In [72]:
from math import ceil

# Split the training users into Nsplit consecutive index blocks
# (presumably because the user similarity matrix is dumped block by block —
# see the 'similarity_<block>.npz' files loaded further down).
Nsplit = 20  # number of blocks
user_rows = len(training_df.uid.unique())
block = ceil(user_rows / Nsplit)

# Block boundaries: block i covers the indices block_list[i] .. block_list[i+1]-1;
# boundaries are clamped to the number of users.
block_list = [min(i * block, user_rows) for i in range(Nsplit + 1)]

In [73]:
def get_block_number(index, blocks):
    """Return the number of the block that contains ``index``.

    :param index: position to locate
    :param blocks: ascending list of block boundaries; block ``i`` covers the
        half-open range ``[blocks[i], blocks[i+1])``
    :return: block number, or ``None`` when ``index`` lies in no block
        (previously this case ran past the end of ``blocks`` and raised
        IndexError)
    """
    # Pair each boundary with its successor, so the last boundary is only ever
    # used as an upper bound.
    for i, (lower, upper) in enumerate(zip(blocks, blocks[1:])):
        if lower <= index < upper:
            return i
    return None

In [74]:
# Pair every user that needs recommendations with the number of the block
# that contains the user's row index.
uid_with_block = [
    (uid, get_block_number(uid_to_ind[uid], block_list))
    for uid in uids_to_recommend
]

In [75]:
# Same pairs as a frame, so the users of one block can be selected with a filter.
df_uids_blocks = pd.DataFrame(uid_with_block, columns=['uid', 'block'])

In [76]:
# Build the recommendations block by block and merge them into one dictionary
# {uid: {iid: score, ...}, ...}.
recs = {}
for bl in range(Nsplit):
    # Similarity matrix of block `bl`, read from the current working directory.
    similarity_matrix = load_npz('similarity_'+str(bl)+'.npz')
    recommender = ContentBasedRecommender_dummy(uid_to_ind, id_to_ind, training_df, similarity_matrix, bl, block)
    # Test users whose row index falls into this block.
    uids_to_recommend_ = df_uids_blocks[df_uids_blocks.block == bl].uid.values
    # top_k is left at the default of get_recs (10).
    recommends = recommender.get_recs(uids_to_recommend_)
    recs.update(recommends)

In [77]:
# NOTE(review): repeats the call made right after the definition of
# get_test_dict; test_df is not reassigned in between in the visible cells.
test_dict = get_test_dict(test_df)

In [78]:
# Share of test users with at least one test item among their recommendations.
hit_ratio(recs, test_dict)

0.05399853264856933

In [49]:
# NOTE(review): the execution count of this cell (49) is lower than that of the
# cells above, and the displayed {} does not match the non-zero hit ratio —
# looks like a stale debugging cell. If kept, show a small sample rather than
# the whole dictionary.
recs

{}

In [50]:
# NOTE(review): out-of-order execution count (50); `recommends` holds only the
# result of the last block processed by the loop above — looks like a stale
# debugging cell.
recommends

{}

In [62]:
# Number of users in the ground truth (the denominator of hit_ratio).
# NOTE(review): out-of-order execution count (62).
len(test_dict.keys())

6815