In [1]:
# imports that will definitely be needed
import pandas as pd
import numpy as np
# The data comes from http://jmcauley.ucsd.edu/data/amazon/
# http://snap.stanford.edu/data/amazon/productGraph/categoryFiles/reviews_Video_Games_5.json.gz
# Path of the reviews file that get_data_frame() reads line by line.
JSON_DATA_PATH = "lab_data/Video_Games_5.json"
# NOTE(review): N is not referenced anywhere in the visible cells - confirm it is still needed.
N = 10


In [2]:
import json

def iter_json_data(path):
    """Lazily yield one parsed JSON value per non-blank line of a JSON-lines file.

    The file is decoded as UTF-8 explicitly so the result does not depend on
    the platform's default encoding. Blank or whitespace-only lines (for
    example a trailing empty line) are skipped instead of raising
    ``json.JSONDecodeError``.

    :param path: path of the file to read
    :return: generator of the decoded JSON values, in file order
    """
    with open(path, encoding="utf-8") as f:
        for line in f:
            line = line.strip()
            if not line:
                # Nothing to decode on an empty line.
                continue
            yield json.loads(line)
            
def get_data_frame(path=None):
    """Load the reviews file into a DataFrame with dense integer user/item ids.

    Reviewer ids and product ids (``reviewerID`` / ``asin``) are replaced by
    consecutive integers assigned in order of first appearance.

    :param path: JSON-lines file to read; when omitted, the module-level
        ``JSON_DATA_PATH`` is used (looked up at call time)
    :return: pd.DataFrame with columns uid, iid, review, rating, dt, useful
    """
    if path is None:
        path = JSON_DATA_PATH

    uid_to_id = {}
    iid_to_id = {}

    cols = ["uid", "iid", "review", "rating", "dt", 'useful']
    rows = []

    for d in iter_json_data(path):
        # setdefault hands out the next free integer the first time a key is seen.
        uid = uid_to_id.setdefault(d["reviewerID"], len(uid_to_id))
        iid = iid_to_id.setdefault(d["asin"], len(iid_to_id))
        review = d["reviewText"]
        rating = float(d["overall"])
        dt = int(d["unixReviewTime"])
        useful = d["helpful"]
        rows.append((uid, iid, review, rating, dt, useful))

    return pd.DataFrame(rows, columns=cols)

# Read the whole reviews file into one DataFrame.
df = get_data_frame()

In [3]:
# Preview the first rows of the loaded frame.
df.head()

Unnamed: 0,uid,iid,review,rating,dt,useful
0,0,0,Installing the game was a struggle (because of...,1.0,1341792000,"[8, 12]"
1,1,0,If you like rally cars get this game you will ...,4.0,1372550400,"[0, 0]"
2,2,0,1st shipment received a book instead of the ga...,1.0,1403913600,"[0, 0]"
3,3,0,"I got this version instead of the PS3 version,...",3.0,1315958400,"[7, 10]"
4,4,0,I had Dirt 2 on Xbox 360 and it was an okay ga...,4.0,1308009600,"[2, 2]"


In [4]:
def split_df_by_dt(df, p=0.8):
    """Split df into training and test parts by review timestamp (column dt).

    The border is the p-quantile of the dt values: rows with dt at or below it
    form the training part, rows with a later dt form the test part. Test rows
    whose user or item never occurs in the training part are then dropped.
    The border and the sizes before/after that filtering are printed.

    :param p: quantile of the dt values that marks the end of the training part
    :return: tuple of two pd.DataFrame objects (training, test)
    """
    timestamps = df.dt
    border_dt = timestamps.quantile(p)
    print("Min=%s, border=%s, max=%s" % (timestamps.min(), border_dt, timestamps.max()))

    training_df = df[timestamps <= border_dt]
    test_df = df[timestamps > border_dt]
    print("Размер до очистки:", training_df.shape, test_df.shape)

    # Keep only test rows whose user AND item are both present in training
    # (unknown users break personalised models, unknown items break all models).
    user_is_known = test_df.uid.isin(training_df.uid)
    item_is_known = test_df.iid.isin(training_df.iid)
    test_df = test_df[user_is_known & item_is_known]
    print("Размер после очистки:", training_df.shape, test_df.shape)

    return training_df, test_df

In [5]:
# Chronological split at the 0.8 quantile of dt.
training_df, test_df = split_df_by_dt(df, p=0.8)
# Removes the name of the full frame; re-running this cell afterwards raises
# NameError until df is rebuilt by the loading cell.
del df

Min=939859200, border=1377129600.0, max=1405987200
Размер до очистки: (185427, 6) (46353, 6)
Размер после очистки: (185427, 6) (19174, 6)


In [18]:
def prepare_items_matrix(df, max_features=None, del_texts=True):
    """Build one averaged TF-IDF vector per item from its review texts.

    A ``TfidfVectorizer`` (English stop words) is fitted on every review in
    ``df``; the row of an item is the arithmetic mean of the TF-IDF vectors of
    that item's reviews. All reviews are vectorised in a single call and
    averaged with one sparse matrix product, instead of filtering the frame
    once per item and transforming reviews one by one. The result matches the
    per-review accumulation up to floating-point rounding.

    :param df: DataFrame with an ``iid`` column (item id) and a ``review``
        column (review text)
    :param max_features: forwarded to ``TfidfVectorizer`` to cap the vocabulary size
    :param del_texts: kept only for backward compatibility; it has no
        observable effect because the texts are local to this function
    :return: tuple ``(items_matrix, iid_to_row)`` where ``items_matrix`` is a
        sparse CSR matrix with one row per distinct item and ``iid_to_row``
        maps each iid to its row index (rows follow the order in which items
        first appear in ``df``)
    """
    import numpy as np
    from scipy.sparse import csr_matrix
    from sklearn.feature_extraction.text import TfidfVectorizer

    # item_codes[i] is the row of the item that review i belongs to;
    # codes are assigned in order of first appearance.
    item_codes, item_ids = df.iid.factorize()
    texts = df.review.tolist()

    # Fit and vectorise every review in one pass.
    tfvect = TfidfVectorizer(stop_words='english', max_features=max_features)
    review_vectors = tfvect.fit_transform(texts)

    n_reviews = review_vectors.shape[0]
    n_items = len(item_ids)
    reviews_per_item = np.bincount(item_codes, minlength=n_items)

    # (n_items x n_reviews) matrix holding 1/count at (item, review), so that
    # multiplying it by the review vectors yields the per-item mean.
    averaging = csr_matrix(
        (1.0 / reviews_per_item[item_codes], (item_codes, np.arange(n_reviews))),
        shape=(n_items, n_reviews),
    )
    items_matrix = averaging.dot(review_vectors).tocsr()

    iid_to_row = {iid: row for row, iid in enumerate(item_ids)}
    return items_matrix, iid_to_row

In [19]:
# Item vectors are built from the training reviews only; max_features keeps its default (None).
items_matrix, iid_to_row = prepare_items_matrix(training_df)

In [20]:
# Show the repr of the item matrix.
items_matrix

<10098x197239 sparse matrix of type '<class 'numpy.float64'>'
	with 7356087 stored elements in Compressed Sparse Row format>

In [22]:
class BasicRecommender(object):
    """Base recommender: produces no recommendations and defines the common API."""

    def __init__(self):
        """Nothing to set up in the base class."""

    def get_recs(self, uid, top):
        """Build recommendations for the user uid.

        :return: dict of the form {iid: score, ...}; always empty here
        """
        return {}

    def get_batch_recs(self, uids, top):
        """Build recommendations for several users by calling get_recs per user.

        :return: dict of the form {uid: {iid: score, ...}, ...}
        """
        batch = {}
        for user_id in uids:
            batch[user_id] = self.get_recs(user_id, top)
        return batch
    
class NonPersRecommender(BasicRecommender):
    """Non-personalised recommender: every user receives the same ranked list.

    The ranking is computed once in the constructor by ``_prepare_recs`` and
    stored in ``self.recs``; this base implementation yields an empty ranking.
    """

    def __init__(self, df):
        """Compute and store the shared ranking from ``df``."""
        super(NonPersRecommender, self).__init__()
        self.recs = self._prepare_recs(df)

    def _prepare_recs(self, df):
        """Return the ranking used for all users (empty in this class).

        The dtype is given explicitly: an empty Series created without one
        emits a warning on pandas 1.x and its dtype differs between versions.
        """
        return pd.Series([], dtype="float64")

    def get_recs(self, uid, top):
        """Return the first ``top`` entries of the shared ranking.

        :param uid: ignored, the ranking does not depend on the user
        :return: OrderedDict of the form {iid: score, ...}
        """
        from collections import OrderedDict
        return OrderedDict(self.recs[:top])

    def get_batch_recs(self, uids, top):
        """Return the same recommendations for every user in ``uids``.

        The ranking is built once and the very same mapping object is stored
        under each uid, so mutating one entry affects all of them.

        :return: dict of the form {uid: {iid: score, ...}, ...}
        """
        non_pers_recs = self.get_recs(None, top)
        return {uid: non_pers_recs for uid in uids}

In [23]:
def get_test_dict(test_df):
    """Convert the test DataFrame into a nested dict {uid: {iid: rating, ...}, ...}.

    When the same (uid, iid) pair occurs more than once, the rating of the
    last such row wins.
    """
    ratings_by_user = {}
    for row in test_df.itertuples():
        # setdefault returns the user's dict, creating it on first sight.
        ratings_by_user.setdefault(row.uid, {})[row.iid] = row.rating
    return ratings_by_user

# Nested {uid: {iid: rating}} view of the held-out reviews, used by the metric.
test_dict = get_test_dict(test_df)

In [24]:
def hit_ratio(recs_dict, test_dict):
    """Compute the hit-ratio metric for two dictionaries.

    A test user counts as a hit when at least one of the user's test items is
    among the items recommended to that user. Users missing from ``recs_dict``
    count as misses.

    :param recs_dict: recommendations of the form {uid: {iid: score, ...}, ...}
    :param test_dict: test data of the form {uid: {iid: score, ...}, ...}
    :return: share of test users with a hit; 0.0 when ``test_dict`` is empty
    """
    if not test_dict:
        # No users to evaluate: avoid dividing by zero.
        return 0.0

    hits = sum(
        1
        for uid, test_items in test_dict.items()
        if not set(test_items).isdisjoint(recs_dict.get(uid, {}))
    )
    return hits / len(test_dict)

In [47]:
class ContentBasedRecommender(NonPersRecommender):
    """Content-based recommender over precomputed item vectors.

    A user's profile is the weighted sum of the vectors of the items the user
    reviewed in the training data. Items the user has not reviewed are ranked
    by cosine similarity between the profile and each item vector.
    """

    def __init__(self, items_matrix, iid_to_ind, train_df, max_features=None, use_dt=True, use_ratings=False):
        """Store the item vectors and the training interactions.

        :param items_matrix: sparse matrix with one row per item
        :param iid_to_ind: dict mapping an iid to its row in ``items_matrix``
        :param train_df: training DataFrame with iid, uid, dt and rating columns
        :param max_features: unused; kept so existing calls keep working
        :param use_dt: weight each reviewed item by its timestamp divided by
            the user's latest timestamp
        :param use_ratings: weight each reviewed item by the rating the user gave it
        """
        # NonPersRecommender.__init__ is skipped on purpose: it would call
        # self._prepare_recs(df), but _prepare_recs here takes (uids, top_k).
        super(NonPersRecommender, self).__init__()

        self.items_matrix = items_matrix
        self.iid_to_ind = iid_to_ind
        self.df = train_df[['iid', 'uid', 'dt', 'rating']]
        self.use_ratings = use_ratings
        self.use_dt = use_dt
        self.DICT_LEN = self.items_matrix.shape[1]
        # The attribute name (including its spelling) is kept for backward compatibility.
        self.uinque_ids = train_df.iid.unique()
        # Last computed recommendations; filled by get_recs / get_batch_recs.
        self.recs = {}

    def get_recs(self, uids_to_recommend, top_k=10):
        """Build recommendations for every user in ``uids_to_recommend``.

        Unlike the base-class ``get_recs``, this takes an iterable of uids.

        :return: dict of the form {uid: {iid: score, ...}, ...}
        """
        self.recs = self._prepare_recs(uids_to_recommend, top_k)
        return self.recs

    def get_batch_recs(self, uids, top):
        """Build recommendations for several users.

        Overridden because the inherited version calls ``get_recs(None, top)``,
        which cannot work with this class's ``get_recs`` signature.

        :return: dict of the form {uid: {iid: score, ...}, ...}
        """
        return self.get_recs(uids, top)

    @staticmethod
    def select_item(source_matrix, keys_to_index, key_, exclude_keys):
        """Return the score of ``key_`` from row 0 of ``source_matrix``, or 0 if it is excluded.

        Declared static because it takes no ``self``; it can be called on the
        class as before and now also on an instance.
        """
        if key_ in exclude_keys:
            return 0
        return source_matrix[0, keys_to_index[key_]]

    def _prepare_recs(self, uids_to_recommend, top_k):
        """Compute the top_k most similar unseen items for each user.

        Users without any row in the training data get an empty dict, since no
        profile can be built for them.

        :return: dict of the form {uid: {iid: score, ...}, ...}
        """
        from sklearn.metrics.pairwise import cosine_similarity
        from scipy.sparse import csr_matrix

        # Built once instead of once per user.
        all_iids = set(self.uinque_ids)

        recs = {}
        for uid in uids_to_recommend:
            user_df = self.df[self.df.uid == uid]
            if user_df.empty:
                recs[uid] = {}
                continue
            user_items = user_df.iid

            if self.use_dt:
                # Timestamps relative to the user's most recent review.
                dt_weights = user_df.dt / user_df.dt.max()
            else:
                dt_weights = np.ones(len(user_items))

            if self.use_ratings:
                rating_weights = user_df.rating
            else:
                rating_weights = np.ones(len(user_items))

            # User profile: weighted sum of the vectors of the reviewed items.
            user_vect = csr_matrix((1, self.DICT_LEN))
            for iid, dt_weight, rating_weight in zip(user_items, dt_weights, rating_weights):
                item_vect = self.items_matrix.getrow(self.iid_to_ind[iid])
                user_vect = user_vect + item_vect.multiply(dt_weight * rating_weight)

            # Similarity of the profile to every item; shape (1, number of items).
            sim_vector = cosine_similarity(user_vect, self.items_matrix, dense_output=True)

            # Only items the user has not reviewed are candidates.
            candidates = all_iids.difference(set(user_items))
            scored = [(iid, sim_vector[0, self.iid_to_ind[iid]]) for iid in candidates]
            scored.sort(key=lambda pair: -pair[1])
            recs[uid] = dict(scored[:top_k])

        return recs

In [48]:
# Content-based model with both timestamp and rating weighting switched off.
recommender = ContentBasedRecommender(items_matrix, iid_to_row, training_df, use_dt=False, use_ratings=False)

In [49]:
# Recommend for every user that has rows in the test data.
uids_to_recommend = test_dict.keys()

In [50]:
# top_k is left at its default of 10 recommendations per user.
recs = recommender.get_recs(uids_to_recommend)
# Share of test users for whom at least one recommended item is among their test items.
hit_ratio(recs, test_dict)

0.07630227439471754