In [0]:
## 加载Google Drive
import os
if not os.path.exists('/content/gdrive/'):
    from google.colab import drive
    drive.mount('/content/gdrive')
else:
    pass


# 全局变量
TEST_RATE = 0.2 # 测试组占比



Go to this URL in a browser: https://accounts.google.com/o/oauth2/auth?client_id=947318989803-6bn6qk8qdgf4n4g3pfee6491hc0brc4i.apps.googleusercontent.com&redirect_uri=urn%3Aietf%3Awg%3Aoauth%3A2.0%3Aoob&scope=email%20https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fdocs.test%20https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fdrive%20https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fdrive.photos.readonly%20https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fpeopleapi.readonly&response_type=code

Enter your authorization code:
··········
Mounted at /content/gdrive


In [0]:
# 加载数据，Movie Lens - 1M数据
import pandas as pd
rnames = ['user_id', 'item_id', 'rating', 'timestamp']
ratings = pd.read_table('/content/gdrive/My Drive/Data/recommend_sys_practice/dataset/ml-1m/ratings.dat', sep='::', header=None, names=rnames, engine='python')


In [0]:
# 数据预处理
## 截取其中的user_id和movie_id列
df = pd.DataFrame(data = ratings, columns = ['user_id','item_id'])

## 拆分训练组和测试组
from sklearn.model_selection import train_test_split
train_set, test_set = train_test_split(df, test_size=TEST_RATE)

In [0]:
# 对训练集做处理，做出可供学习使用的训练集
import numpy as np
from operator import itemgetter
import time


user_list = train_set['user_id'].unique()
item_list = train_set['item_id'].unique()



popular_list = train_set.groupby(['item_id']).size()
popular_list = sorted(popular_list.items(),
                      key = itemgetter(1),
                      reverse=True)
popular_list = [i[0] for i in popular_list]

user_item_array = []

for u in user_list:
    positive_list = []
    negative_list = []
    positive_list = list(train_set[train_set['user_id']==u]['item_id'].unique())

    # Todo: 这里要试验两种处理方法，一种是positive太多的，直接抛弃；另一种是也采用
    # 这里会抛弃掉
    if len(positive_list) > len(item_list) / 5:
        negative_list = []
        positive_list = []
    else:
        p = iter(popular_list)
        while len(negative_list) < len(positive_list):
            i = next(p)
            if i in positive_list:
                pass
            else:
                negative_list.append(i)

    for i in positive_list:
        user_item_array.append([u,i,1])
    for i in negative_list:
        user_item_array.append([u,i,0])
        
user_item_array = np.array(user_item_array)

user_item_df = pd.DataFrame(data = user_item_array, 
                            columns = ['user_id', 'item_id', 'r'])


In [0]:
# 模型的超参数
class_cnt = 4
lambda_ = 0.1
alpha = 0.02

In [0]:
# 构建损失函数J(theta)
from math import exp


## 先构建p, q两个矩阵

user_list = user_item_df['user_id'].unique()
item_list = user_item_df['item_id'].unique()

arrayp = np.random.rand(class_cnt,len(user_list)) 
arrayq = np.random.rand(class_cnt,len(item_list)) 
p = pd.DataFrame(arrayp, index = range(0,class_cnt), columns = user_list)
q = pd.DataFrame(arrayq, index = range(0,class_cnt), columns = item_list)

def sigmod(x):
    if x < -100:
        return 0
    else:
        y = 1.0/(1+exp(-x))
        return y


def predict(p, q, uid, iid):
    r = (p[uid] * q[iid]).sum()
    r = sigmod(r)
    return r


## 损失函数
def calculate_Loss(df, p, q):
    J = 0 # 损失函数具体值
    e = 0 # r - \hat{r}
    t = 0 # 计数项，用于观察损失函数的收敛
    for index, row in df.iterrows():
        uid = row['user_id']
        iid = row['item_id']
        r = float(row['r'])
        e = r - predict(p,q,uid,iid)
        
        ## 这步就是随机梯度下降
#         for k in range(0, class_cnt):
        p[uid] += alpha * (e * q[iid] - lambda_ * p[uid])
        q[iid] += alpha * (e * p[uid] - lambda_ * q[iid])
        
        J += pow(e,2)
        t += 1
#         print (r, e, J)
        if t % 100000 == 0:
            print(J)
    J += lambda_ * (np.linalg.norm(p) + np.linalg.norm(q))
    return J, p, q


In [0]:
a=time.time()
for i in range(0,10):
    J, p, q = calculate_Loss(user_item_df, p, q)
    print (J)
print(time.time()-a)

14223.153090541895
29226.589512646853
45143.11376439983
62182.83174986995
79316.90833153628
97081.91828205698
115213.66685520457
133462.07299734146
151842.63435414335
170586.77363568856
189336.45839755525
208079.8177342322
226487.20878701966
244103.10637017465
259065.83900337623
20299.59062406612
40157.87823752851
59791.13403365822
79916.29804757616
99844.28637395002
119944.19871979728
140231.52067570054
160595.21737590383
181012.31110705613
201383.27743929255
221879.9526523742
242421.57002446358
262273.4725891818
280996.0177462155
296480.2506254892
20739.000912445455
41083.56210947284
61151.21132504183
81468.73672559578
101736.54438828994
122161.03066538426
142662.48239023433
163160.4402164183
183591.23835360538
203983.52270339496
224316.52042960518
244558.3593901416
264036.8618255981
282415.7665889626
297580.2270083333
20017.854608917653
39690.406931020174
59145.80548033408
79011.82370086006
98786.34912059353
118766.37604297647
138839.29833550216
158914.69175434974
178934.60043181488

In [0]:
def recommend(uid):
    rank = {}
    for iid in item_list:
        rank[iid] = predict(p,q,uid,iid)
    rank = sorted(rank.items(), 
                  key = itemgetter(1),
                  reverse=True)
    return [i[0] for i in rank][:10]

In [0]:
predict_list = {}
for uid in user_list:
    predict_list[uid] = recommend(uid)

NameError: ignored

In [0]:
# 通用的功能

# 评价函数


def evaluate(predict, test):
    # predict 和 test 都是 user_item_list 二维列表
    
    n_hit = 0
    n_recall = 0
    n_precision = 0
    n_all_items = len(set(item_list))
    all_items_recommended = []
    popularity_sum = 0
    
    uniq_item_list = set(item_list)
    
    
    gini_item_list = {}
    
    for user, items in test.items():
        rank = set(predict[user])
        n_hit += len(rank & items)
        n_recall += len(items)
        n_precision += topN
        all_items_recommended += [i for i in rank]
        popularity_sum += sum([len(item_users[i]) for i in rank])
        
        for i in rank:
            gini_item_list.setdefault(i,0)
            gini_item_list[i] += 1
    
    # 以下开始计算基尼系数
    j = 1
    n = len(gini_item_list)
    G = 0
    gini_item_list = sorted(gini_item_list.items(), key = itemgetter(1), reverse=False)
    g_sum = sum([i[1] for i in gini_item_list])
    
    for i in range(n):
        G += (2 * j - n - 1) * gini_item_list[i][1]
        j += 1
    
    return {'recall': "{0:.2%}".format(n_hit / (1.0 * n_recall)),
            'precision': "{0:.2%}".format(n_hit / (1.0 * n_precision)),
            'coverage': "{0:.2%}".format(len(set(all_items_recommended))/all_items),
            'gini_index': "{0:.2}".format(G/float(n-1)/g_sum),
            'popularity': "{:.3f}".format(math.log(popularity_sum / n_precision))
           }