# 通过随机梯度下降算法实现电影的评分预测

In [None]:
import pandas as pd
import numpy as np



class BaselineCFBySGD(object):
    '''
    Baseline rating predictor trained with stochastic gradient descent.

    A rating is estimated as ``global_mean + bu[uid] + bi[iid]`` where
    ``bu`` holds one bias per user and ``bi`` one bias per item.
    '''

    def __init__(self, number_epochs, alpha, reg, columns=("uid", "iid", "rating")):
        '''
        :param number_epochs: number of full passes over the training data
        :param alpha: learning rate
        :param reg: regularisation strength (lambda)
        :param columns: names of the user-id, item-id and rating columns, in that order
        '''
        self.number_epochs = number_epochs
        self.alpha = alpha
        self.reg = reg
        # Copy into a fresh list so instances never share (or mutate) the
        # default value or the caller's own sequence.
        self.columns = list(columns)

    def fit(self, dataset):
        '''
        Compute the global mean and learn the user / item biases.

        :param dataset: DataFrame containing at least the configured
            user-id, item-id and rating columns
        :return: None; results are stored on ``self.global_mean``,
            ``self.bu`` and ``self.bi``
        '''
        self.dataset = dataset
        user_col, item_col, rating_col = self.columns[:3]
        # One row per user: the list of rated item ids and the list of ratings.
        self.users_ratings = dataset.groupby(user_col).agg([list])[[item_col, rating_col]]
        # One row per item: the list of user ids who rated it and their ratings.
        self.items_ratings = dataset.groupby(item_col).agg([list])[[user_col, rating_col]]
        # Global average rating (mu in the formula).
        self.global_mean = self.dataset[rating_col].mean()
        # Learn the bias terms.
        self.bu, self.bi = self.sgd()

    def sgd(self):
        '''
        Optimise the user and item biases with stochastic gradient descent.

        :return: (bu, bi) dictionaries mapping user id / item id to its bias
        '''
        # Every bias starts at 0; one entry per distinct user and per distinct
        # item seen in the training data. Biases may become negative.
        bu = dict(zip(self.users_ratings.index, np.zeros(len(self.users_ratings))))
        bi = dict(zip(self.items_ratings.index, np.zeros(len(self.items_ratings))))

        # Select the three configured columns explicitly so datasets carrying
        # extra columns (or a different column order) are handled correctly.
        triples = self.dataset[self.columns[:3]]

        for i in range(self.number_epochs):
            print("iter%d " % i)
            total_error = 0  # sum of squared errors over this epoch
            for uid, iid, real_rating in triples.itertuples(index=False, name=None):
                error = real_rating - (self.global_mean + bu[uid] + bi[iid])
                total_error += error ** 2
                bu[uid] += self.alpha * (error - self.reg * bu[uid])
                bi[iid] += self.alpha * (error - self.reg * bi[iid])
            print('total_error=%f' % total_error)
        return bu, bi

    def predict(self, uid, iid):
        '''
        Predict the rating of user ``uid`` for item ``iid``.

        Both ids must have appeared in the training data; otherwise the
        dictionary lookup raises ``KeyError``. Users or items with very few
        ratings get less reliable estimates.

        :return: global_mean + bu[uid] + bi[iid]
        '''
        predict_rating = self.global_mean + self.bu[uid] + self.bi[iid]
        return predict_rating


if __name__ == '__main__':
    # (column name, dtype) pairs for the three columns that get loaded.
    dtype = [("userId", np.int32), ("movieId", np.int32), ("rating", np.float32)]
    # The data is read from the folder used by the day-1 material, so the
    # day-1 and day-2 courseware must live in the same directory.
    dataset = pd.read_csv(
        "./datasets/ml-latest-small/ratings.csv",
        usecols=range(3),
        dtype=dict(dtype),
    )
    # Build the model: 20 epochs, learning rate 0.1, regularisation 0.1.
    bcf = BaselineCFBySGD(
        number_epochs=20,
        alpha=0.1,
        reg=0.1,
        columns=["userId", "movieId", "rating"],
    )
    # Train the model to obtain bu and bi.
    bcf.fit(dataset)




iter0 
total_error=80119.897150
iter1 
total_error=74545.257842
iter2 
total_error=72746.999723
iter3 
total_error=71755.329501
iter4 
total_error=71113.752031
iter5 
total_error=70665.665443
iter6 
total_error=70338.046450
iter7 
total_error=70091.127219
iter8 
total_error=69900.986894
iter9 
total_error=69752.207665
iter10 
total_error=69634.338196
iter11 
total_error=69540.019593
iter12 
total_error=69463.916913
iter13 
total_error=69402.073374
iter14 
total_error=69351.500968
iter15 
total_error=69309.910472
iter16 
total_error=69275.527376
iter17 
total_error=69246.962848
iter18 
total_error=69223.121140
iter19 
total_error=69203.131820


In [16]:
print(np.count_nonzero(np.array(list(bcf.bu.values()))))
print(np.count_nonzero(np.array(list(bcf.bi.values()))))

610
9724


In [17]:
bcf.bu

{1: 0.8205842641888456,
 2: -0.05624456228293488,
 3: -0.3861371867433,
 4: -0.2561896079695449,
 5: 0.15145818382666598,
 6: 0.21421751511532042,
 7: -0.7257509416213495,
 8: -0.0959144352259191,
 9: -0.12301024527379503,
 10: -0.2910021824824274,
 11: 0.5382647758330744,
 12: 1.2075663306054578,
 13: 0.3995089525498591,
 14: -0.3451046543115578,
 15: -0.2540079681096897,
 16: -0.07517546766341365,
 17: 0.08890726912389058,
 18: -0.04013801887343536,
 19: -0.639844843734871,
 20: 0.2091272552151351,
 21: -0.5409612081799788,
 22: -2.0103560732310215,
 23: -0.16881266648442994,
 24: -0.19200615106744712,
 25: 0.844605809255059,
 26: -0.20121725807049592,
 27: -0.48511155428683417,
 28: -0.07386853452219902,
 29: 0.41478260902965003,
 30: 0.9432510576774717,
 31: 0.33254243597434385,
 32: 0.40356935916396547,
 33: 0.22072845038810224,
 34: 0.25941482532733695,
 35: 0.3599494333472203,
 36: -0.767311153001969,
 37: 0.36422226073662983,
 38: -0.3549412696570311,
 39: 0.5059247134562253,
 

In [18]:
#下面有报错是因为这里是死循环，输入1,1就可以预测1号用户对1号电影的评分，
# 终止了循环有报错
# while True:
#     uid = int(input("uid: "))
#     iid = int(input("iid: "))
#     print(bcf.predict(uid, iid))
#     break

In [19]:
bcf.predict(1, 3)

3.73289998848148

In [21]:
import numpy as np
import pandas as pd

dtype = [("userId", np.int32), ("movieId", np.int32), ("rating", np.float32)]
dataset = pd.read_csv("./datasets/ml-latest-small/ratings.csv", usecols=range(3), dtype=dict(dtype))

In [None]:
dataset.head()

Unnamed: 0,userId,movieId,rating
0,1,1,4.0
1,1,3,4.0
2,1,6,4.0
3,1,47,5.0
4,1,50,5.0


In [None]:
dataset.index

RangeIndex(start=0, stop=100836, step=1)

In [None]:
dataset.describe()

Unnamed: 0,userId,movieId,rating
count,100836.0,100836.0,100836.0
mean,326.127564,19435.295718,3.501557
std,182.618491,35530.987199,1.042529
min,1.0,1.0,0.5
25%,177.0,1199.0,3.0
50%,325.0,2991.0,3.5
75%,477.0,8122.0,4.0
max,610.0,193609.0,5.0


In [None]:
users_ratings = dataset.groupby('userId').agg([list])[['movieId', 'rating']]  #因为就两列，所以加不加[['movieId','rating']] 效果一样的
users_ratings

Unnamed: 0_level_0,movieId,rating
Unnamed: 0_level_1,list,list
userId,Unnamed: 1_level_2,Unnamed: 2_level_2
1,"[1, 3, 6, 47, 50, 70, 101, 110, 151, 157, 163,...","[4.0, 4.0, 4.0, 5.0, 5.0, 3.0, 5.0, 4.0, 5.0, ..."
2,"[318, 333, 1704, 3578, 6874, 8798, 46970, 4851...","[3.0, 4.0, 4.5, 4.0, 4.0, 3.5, 4.0, 4.0, 4.5, ..."
3,"[31, 527, 647, 688, 720, 849, 914, 1093, 1124,...","[0.5, 0.5, 0.5, 0.5, 0.5, 5.0, 0.5, 0.5, 0.5, ..."
4,"[21, 32, 45, 47, 52, 58, 106, 125, 126, 162, 1...","[3.0, 2.0, 3.0, 2.0, 3.0, 3.0, 4.0, 5.0, 1.0, ..."
5,"[1, 21, 34, 36, 39, 50, 58, 110, 150, 153, 232...","[4.0, 4.0, 4.0, 4.0, 3.0, 4.0, 5.0, 4.0, 3.0, ..."
...,...,...
606,"[1, 7, 11, 15, 17, 18, 19, 28, 29, 32, 36, 46,...","[2.5, 2.5, 2.5, 3.5, 4.0, 4.0, 2.0, 3.5, 4.5, ..."
607,"[1, 11, 25, 34, 36, 86, 110, 112, 150, 153, 16...","[4.0, 3.0, 3.0, 3.0, 4.0, 4.0, 5.0, 2.0, 5.0, ..."
608,"[1, 2, 3, 10, 16, 19, 21, 24, 31, 32, 34, 39, ...","[2.5, 2.0, 2.0, 4.0, 4.5, 2.0, 3.5, 2.0, 3.0, ..."
609,"[1, 10, 110, 116, 137, 150, 161, 185, 208, 231...","[3.0, 4.0, 3.0, 3.0, 3.0, 3.0, 3.0, 3.0, 3.0, ..."


In [None]:
len(users_ratings)

610

In [None]:
users_ratings.index

Int64Index([  1,   2,   3,   4,   5,   6,   7,   8,   9,  10,
            ...
            601, 602, 603, 604, 605, 606, 607, 608, 609, 610],
           dtype='int64', name='userId', length=610)

In [None]:
items_ratings = dataset.groupby('movieId').agg([list])
items_ratings

Unnamed: 0_level_0,userId,rating
Unnamed: 0_level_1,list,list
movieId,Unnamed: 1_level_2,Unnamed: 2_level_2
1,"[1, 5, 7, 15, 17, 18, 19, 21, 27, 31, 32, 33, ...","[4.0, 4.0, 4.5, 2.5, 4.5, 3.5, 4.0, 3.5, 3.0, ..."
2,"[6, 8, 18, 19, 20, 21, 27, 51, 62, 68, 82, 91,...","[4.0, 4.0, 3.0, 3.0, 3.0, 3.5, 4.0, 4.5, 4.0, ..."
3,"[1, 6, 19, 32, 42, 43, 44, 51, 58, 64, 68, 91,...","[4.0, 5.0, 3.0, 3.0, 4.0, 5.0, 3.0, 4.0, 3.0, ..."
4,"[6, 14, 84, 162, 262, 411, 600]","[3.0, 3.0, 3.0, 3.0, 1.0, 2.0, 1.5]"
5,"[6, 31, 43, 45, 58, 66, 68, 84, 103, 107, 111,...","[5.0, 3.0, 5.0, 3.0, 4.0, 4.0, 2.0, 3.0, 4.0, ..."
...,...,...
193581,[184],[4.0]
193583,[184],[3.5]
193585,[184],[3.5]
193587,[184],[3.5]


In [None]:
items_ratings.index

Int64Index([     1,      2,      3,      4,      5,      6,      7,      8,
                 9,     10,
            ...
            193565, 193567, 193571, 193573, 193579, 193581, 193583, 193585,
            193587, 193609],
           dtype='int64', name='movieId', length=9724)

In [None]:
len(items_ratings)

9724

In [None]:
#计算平均分
global_mean = dataset['rating'].mean()

In [None]:
global_mean

3.5015569

In [None]:
#初始化bu和 bi
bu = dict(zip(users_ratings.index, np.zeros(len(users_ratings))))

In [None]:
bu

{1: 0.0,
 2: 0.0,
 3: 0.0,
 4: 0.0,
 5: 0.0,
 6: 0.0,
 7: 0.0,
 8: 0.0,
 9: 0.0,
 10: 0.0,
 11: 0.0,
 12: 0.0,
 13: 0.0,
 14: 0.0,
 15: 0.0,
 16: 0.0,
 17: 0.0,
 18: 0.0,
 19: 0.0,
 20: 0.0,
 21: 0.0,
 22: 0.0,
 23: 0.0,
 24: 0.0,
 25: 0.0,
 26: 0.0,
 27: 0.0,
 28: 0.0,
 29: 0.0,
 30: 0.0,
 31: 0.0,
 32: 0.0,
 33: 0.0,
 34: 0.0,
 35: 0.0,
 36: 0.0,
 37: 0.0,
 38: 0.0,
 39: 0.0,
 40: 0.0,
 41: 0.0,
 42: 0.0,
 43: 0.0,
 44: 0.0,
 45: 0.0,
 46: 0.0,
 47: 0.0,
 48: 0.0,
 49: 0.0,
 50: 0.0,
 51: 0.0,
 52: 0.0,
 53: 0.0,
 54: 0.0,
 55: 0.0,
 56: 0.0,
 57: 0.0,
 58: 0.0,
 59: 0.0,
 60: 0.0,
 61: 0.0,
 62: 0.0,
 63: 0.0,
 64: 0.0,
 65: 0.0,
 66: 0.0,
 67: 0.0,
 68: 0.0,
 69: 0.0,
 70: 0.0,
 71: 0.0,
 72: 0.0,
 73: 0.0,
 74: 0.0,
 75: 0.0,
 76: 0.0,
 77: 0.0,
 78: 0.0,
 79: 0.0,
 80: 0.0,
 81: 0.0,
 82: 0.0,
 83: 0.0,
 84: 0.0,
 85: 0.0,
 86: 0.0,
 87: 0.0,
 88: 0.0,
 89: 0.0,
 90: 0.0,
 91: 0.0,
 92: 0.0,
 93: 0.0,
 94: 0.0,
 95: 0.0,
 96: 0.0,
 97: 0.0,
 98: 0.0,
 99: 0.0,
 100: 0.0,
 101: 0.

In [None]:
np.count_nonzero(np.array(list(bu.values())))

0

In [None]:
bi = dict(zip(items_ratings.index, np.zeros(len(items_ratings))))
bi

{1: 0.0,
 2: 0.0,
 3: 0.0,
 4: 0.0,
 5: 0.0,
 6: 0.0,
 7: 0.0,
 8: 0.0,
 9: 0.0,
 10: 0.0,
 11: 0.0,
 12: 0.0,
 13: 0.0,
 14: 0.0,
 15: 0.0,
 16: 0.0,
 17: 0.0,
 18: 0.0,
 19: 0.0,
 20: 0.0,
 21: 0.0,
 22: 0.0,
 23: 0.0,
 24: 0.0,
 25: 0.0,
 26: 0.0,
 27: 0.0,
 28: 0.0,
 29: 0.0,
 30: 0.0,
 31: 0.0,
 32: 0.0,
 34: 0.0,
 36: 0.0,
 38: 0.0,
 39: 0.0,
 40: 0.0,
 41: 0.0,
 42: 0.0,
 43: 0.0,
 44: 0.0,
 45: 0.0,
 46: 0.0,
 47: 0.0,
 48: 0.0,
 49: 0.0,
 50: 0.0,
 52: 0.0,
 53: 0.0,
 54: 0.0,
 55: 0.0,
 57: 0.0,
 58: 0.0,
 60: 0.0,
 61: 0.0,
 62: 0.0,
 63: 0.0,
 64: 0.0,
 65: 0.0,
 66: 0.0,
 68: 0.0,
 69: 0.0,
 70: 0.0,
 71: 0.0,
 72: 0.0,
 73: 0.0,
 74: 0.0,
 75: 0.0,
 76: 0.0,
 77: 0.0,
 78: 0.0,
 79: 0.0,
 80: 0.0,
 81: 0.0,
 82: 0.0,
 83: 0.0,
 85: 0.0,
 86: 0.0,
 87: 0.0,
 88: 0.0,
 89: 0.0,
 92: 0.0,
 93: 0.0,
 94: 0.0,
 95: 0.0,
 96: 0.0,
 97: 0.0,
 99: 0.0,
 100: 0.0,
 101: 0.0,
 102: 0.0,
 103: 0.0,
 104: 0.0,
 105: 0.0,
 106: 0.0,
 107: 0.0,
 108: 0.0,
 110: 0.0,
 111: 0.0,
 112: 0.

In [None]:
list(dataset.itertuples(index=False))

[Pandas(userId=1, movieId=1, rating=4.0),
 Pandas(userId=1, movieId=3, rating=4.0),
 Pandas(userId=1, movieId=6, rating=4.0),
 Pandas(userId=1, movieId=47, rating=5.0),
 Pandas(userId=1, movieId=50, rating=5.0),
 Pandas(userId=1, movieId=70, rating=3.0),
 Pandas(userId=1, movieId=101, rating=5.0),
 Pandas(userId=1, movieId=110, rating=4.0),
 Pandas(userId=1, movieId=151, rating=5.0),
 Pandas(userId=1, movieId=157, rating=5.0),
 Pandas(userId=1, movieId=163, rating=5.0),
 Pandas(userId=1, movieId=216, rating=5.0),
 Pandas(userId=1, movieId=223, rating=3.0),
 Pandas(userId=1, movieId=231, rating=5.0),
 Pandas(userId=1, movieId=235, rating=4.0),
 Pandas(userId=1, movieId=260, rating=5.0),
 Pandas(userId=1, movieId=296, rating=3.0),
 Pandas(userId=1, movieId=316, rating=3.0),
 Pandas(userId=1, movieId=333, rating=5.0),
 Pandas(userId=1, movieId=349, rating=4.0),
 Pandas(userId=1, movieId=356, rating=4.0),
 Pandas(userId=1, movieId=362, rating=5.0),
 Pandas(userId=1, movieId=367, rating=4.0

In [None]:
# Scratch version of the SGD training loop: 30 epochs with learning rate 0.1
# and regularisation 0.1, updating the notebook-level bu / bi dictionaries.
for epoch in range(30):
    print("iter%d" % epoch)
    for row in dataset.itertuples(index=False):
        uid, iid, real_rating = row
        # Residual of the current baseline estimate for this rating.
        residual = real_rating - (global_mean + bu[uid] + bi[iid])

        bu[uid] = bu[uid] + 0.1 * (residual - 0.1 * bu[uid])
        bi[iid] = bi[iid] + 0.1 * (residual - 0.1 * bi[iid])

iter0
iter1
iter2
iter3
iter4
iter5
iter6
iter7
iter8
iter9
iter10
iter11
iter12
iter13
iter14
iter15
iter16
iter17
iter18
iter19
iter20
iter21
iter22
iter23
iter24
iter25
iter26
iter27
iter28
iter29


In [None]:
bu

{1: 0.8250842466016355,
 2: -0.0584030285675993,
 3: -0.428380424021814,
 4: -0.24798103984579922,
 5: 0.15492352308509927,
 6: 0.22056063892521113,
 7: -0.7201007869635866,
 8: -0.09233839072658723,
 9: -0.09144174781996683,
 10: -0.29094432245277213,
 11: 0.5411418659842157,
 12: 1.2123999161031045,
 13: 0.40284564640118986,
 14: -0.34171646457087995,
 15: -0.23804280379719217,
 16: -0.0696404634031877,
 17: 0.0945385084121145,
 18: -0.04819630602692517,
 19: -0.6340065934059044,
 20: 0.21797958208911314,
 21: -0.4772161864989092,
 22: -2.0109774325499163,
 23: -0.1622695011791331,
 24: -0.1820711115484116,
 25: 0.8517662564468492,
 26: -0.19678446831405116,
 27: -0.4800043063968086,
 28: -0.06760444514641992,
 29: 0.42248296836381594,
 30: 0.9514466831897856,
 31: 0.3359249665291047,
 32: 0.4060391932204139,
 33: 0.22406843146471334,
 34: 0.2680740309791405,
 35: 0.36442742011311646,
 36: -0.7622868660825812,
 37: 0.3682243450973895,
 38: -0.35129873079257395,
 39: 0.509122340206774

In [None]:
bi

{1: 0.030760425401449784,
 2: -0.04879765078560369,
 3: -0.598845749329938,
 4: -1.155753147495645,
 5: -0.9858298108016408,
 6: 0.20941908379413443,
 7: -0.4918016427893213,
 8: -0.8027858035266026,
 9: -0.6590231009911913,
 10: -0.01355615939473867,
 11: -0.20511039542295717,
 12: -1.0744862324098992,
 13: -0.341021479586789,
 14: 0.023726131976525924,
 15: -0.6368749058300035,
 16: 0.2762219604373857,
 17: 0.3178760872836067,
 18: 0.12721671136495125,
 19: -0.9776882979148899,
 20: -1.0543109150753651,
 21: 0.29631050394932335,
 22: -0.27969338561647766,
 23: -0.6198811564082893,
 24: -0.5531408900028847,
 25: 0.01618113989039935,
 26: -0.1110513227700079,
 27: -0.2682126964215223,
 28: 0.6079403904265404,
 29: 0.17756959513285872,
 30: -0.5728810051409338,
 31: -0.5155456081485709,
 32: 0.25699469730757485,
 34: -0.18440110941500806,
 36: 0.18747321556385332,
 38: -1.097163470352916,
 39: 0.13360482620706104,
 40: 0.8518432380542461,
 41: 0.1674233460866126,
 42: -0.440432438512201

In [None]:
def predict(uid, iid):
    '''
    Baseline estimate for user ``uid`` on item ``iid``.

    Reads the notebook-level ``global_mean``, ``bu`` and ``bi``; a missing
    id makes the dictionary lookup raise ``KeyError``.
    '''
    baseline = global_mean
    user_bias = bu[uid]
    item_bias = bi[iid]
    return baseline + user_bias + item_bias

In [None]:
predict(1, 1)

4.357401545324619

In [None]:
list(dataset.itertuples(index=True))

[Pandas(Index=0, userId=1, movieId=1, rating=4.0),
 Pandas(Index=1, userId=1, movieId=3, rating=4.0),
 Pandas(Index=2, userId=1, movieId=6, rating=4.0),
 Pandas(Index=3, userId=1, movieId=47, rating=5.0),
 Pandas(Index=4, userId=1, movieId=50, rating=5.0),
 Pandas(Index=5, userId=1, movieId=70, rating=3.0),
 Pandas(Index=6, userId=1, movieId=101, rating=5.0),
 Pandas(Index=7, userId=1, movieId=110, rating=4.0),
 Pandas(Index=8, userId=1, movieId=151, rating=5.0),
 Pandas(Index=9, userId=1, movieId=157, rating=5.0),
 Pandas(Index=10, userId=1, movieId=163, rating=5.0),
 Pandas(Index=11, userId=1, movieId=216, rating=5.0),
 Pandas(Index=12, userId=1, movieId=223, rating=3.0),
 Pandas(Index=13, userId=1, movieId=231, rating=5.0),
 Pandas(Index=14, userId=1, movieId=235, rating=4.0),
 Pandas(Index=15, userId=1, movieId=260, rating=5.0),
 Pandas(Index=16, userId=1, movieId=296, rating=3.0),
 Pandas(Index=17, userId=1, movieId=316, rating=3.0),
 Pandas(Index=18, userId=1, movieId=333, rating

# 准确性指标评估

In [None]:
#数据集的拆分
dataset.groupby('userId').any().index

Int64Index([  1,   2,   3,   4,   5,   6,   7,   8,   9,  10,
            ...
            601, 602, 603, 604, 605, 606, 607, 608, 609, 610],
           dtype='int64', name='userId', length=610)

In [None]:
round(0.8)

1

In [None]:
dataset.where(dataset['userId'] == 1)

Unnamed: 0,userId,movieId,rating
0,1.0,1.0,4.0
1,1.0,3.0,4.0
2,1.0,6.0,4.0
3,1.0,47.0,5.0
4,1.0,50.0,5.0
...,...,...,...
100831,,,
100832,,,
100833,,,
100834,,,


In [None]:
arr = np.arange(10)
np.random.shuffle(arr)
arr

array([3, 7, 1, 8, 9, 2, 6, 4, 0, 5])

In [None]:
import pandas as pd
import numpy as np


#数据集的拆分
def data_split(data_path, x=0.8, random=False):
    '''
    Split a ratings file into a training set and a test set.

    The split is done per user so that each user's ratings are divided in
    the same proportion. (A user with very few ratings may end up with no
    test rows, because the cut position is ``round(count * x)``.)

    :param data_path: path of the ratings CSV file
    :param x: fraction of each user's ratings kept for training,
        e.g. x=0.8 leaves 0.2 for testing
    :param random: shuffle each user's ratings before cutting; default
        False cuts in file order
    :return: (trainset, testset) DataFrames with columns
        userId, movieId, rating
    '''
    print("开始切分数据集...")
    # dtypes of the columns that are loaded
    dtype = {"userId": np.int32, "movieId": np.int32, "rating": np.float32}
    # Only the first three columns are used: user id, movie id and the rating.
    ratings = pd.read_csv(data_path, dtype=dtype, usecols=range(3))
    testset_index = []
    # A single groupby pass yields each user's rows; the previous
    # implementation re-scanned the whole frame once per user.
    # dropna() keeps rows with a missing value out of the test set, as before.
    for _uid, user_rating_data in ratings.dropna().groupby('userId'):
        index = list(user_rating_data.index)
        if random:
            np.random.shuffle(index)  # shuffle this user's row labels in place
        split_at = round(len(index) * x)  # first `split_at` rows stay in training
        testset_index += index[split_at:]

    testset = ratings.loc[testset_index]
    # Everything that is not in the test set is the training set.
    trainset = ratings.drop(testset_index)
    print('完成数据集切分....')
    return trainset, testset

In [None]:
trainset, testset = data_split('../datasets/ml-latest-small/ratings.csv', random=True)

开始切分数据集...
完成数据集切分....


In [None]:
trainset.describe()

Unnamed: 0,userId,movieId,rating
count,80672.0,80672.0,80672.0
mean,326.124653,19398.744955,3.501729
std,182.624286,35429.237575,1.0429
min,1.0,1.0,0.5
25%,177.0,1199.75,3.0
50%,325.0,2997.0,3.5
75%,477.0,8169.0,4.0
max,610.0,193609.0,5.0


In [None]:
testset.describe()

Unnamed: 0,userId,movieId,rating
count,20164.0,20164.0,20164.0
mean,326.139208,19581.527772,3.500868
std,182.599836,35935.698713,1.041071
min,1.0,1.0,0.5
25%,177.0,1198.0,3.0
50%,325.0,2959.0,3.5
75%,477.0,7757.5,4.0
max,610.0,193573.0,5.0


In [None]:
trainset.groupby(trainset['userId']).count()  #这里是为了验证训练集里每一个用户的信息都有

Unnamed: 0_level_0,movieId,rating
userId,Unnamed: 1_level_1,Unnamed: 2_level_1
1,186,186
2,23,23
3,31,31
4,173,173
5,35,35
...,...,...
606,892,892
607,150,150
608,665,665
609,30,30


In [None]:
testset.groupby(testset['userId']).count()

Unnamed: 0_level_0,movieId,rating
userId,Unnamed: 1_level_1,Unnamed: 2_level_1
1,46,46
2,6,6
3,8,8
4,43,43
5,9,9
...,...,...
606,223,223
607,37,37
608,166,166
609,7,7


In [None]:
def predict(uid, iid):
    '''
    Baseline estimate for user ``uid`` on item ``iid``.

    Reads the notebook-level ``global_mean``, ``bu`` and ``bi``; a missing
    id makes the dictionary lookup raise ``KeyError``.
    '''
    baseline = global_mean
    user_bias = bu[uid]
    item_bias = bi[iid]
    return baseline + user_bias + item_bias

In [None]:
def test(testset):
    '''
    Lazily predict every row of ``testset``.

    Yields ``(uid, iid, real_rating, pred_rating)`` for each row whose
    prediction succeeds; when ``predict`` raises, the exception is printed
    and the row is skipped.
    '''
    for row in testset.itertuples(index=False):
        uid, iid, real_rating = row
        try:
            pred_rating = predict(uid, iid)
        except Exception as e:
            print(e)
            continue
        # Only reached when the prediction did not raise.
        yield uid, iid, real_rating, pred_rating

## 计算rmse，mae，和 rmse_mae

In [None]:
def accuray(predict_results, method="all"):
    '''
    Compute accuracy metrics for a collection of predictions.

    :param predict_results: iterable whose items are
        (uid, iid, real_rating, pred_rating) sequences; it is consumed once,
        so a generator cannot be evaluated a second time
    :param method: "rmse" or "mae" (case-insensitive); any other value
        returns both metrics
    :return: rmse, or mae, or the tuple (rmse, mae), each rounded to
        4 decimal places
    :raises ZeroDivisionError: if ``predict_results`` yields no items
    '''
    metric = method.lower()

    # A single pass collects everything both metrics need.
    length = 0
    squared_error_sum = 0
    absolute_error_sum = 0
    for _uid, _iid, real_rating, pred_rating in predict_results:
        diff = pred_rating - real_rating
        length += 1
        squared_error_sum += diff ** 2
        absolute_error_sum += abs(diff)

    if length == 0:
        # Same exception type the bare division used to raise, but with a
        # message that points at the usual cause.
        raise ZeroDivisionError(
            "predict_results is empty (was the generator already consumed?)")

    if metric == "rmse":
        # root of the mean of the squared errors
        return round(np.sqrt(squared_error_sum / length), 4)
    if metric == "mae":
        # mean absolute error
        return round(absolute_error_sum / length, 4)
    return round(np.sqrt(squared_error_sum / length), 4), round(absolute_error_sum / length, 4)

In [None]:
testresult = test(testset)

In [None]:
testresult

<generator object test at 0x000001A0C65C1660>

In [None]:
accuray(testresult, method='mae')

0.6576

# 上面的整体封装实现

In [None]:

import pandas as pd
import numpy as np


def data_split(data_path, x=0.8, random=False):
    '''
    Split a ratings file into a training set and a test set.

    The split is done per user so that each user's ratings are divided in
    the same proportion. (A user with very few ratings may end up with no
    test rows, because the cut position is ``round(count * x)``.)

    :param data_path: path of the ratings CSV file
    :param x: fraction of each user's ratings kept for training,
        e.g. x=0.8 leaves 0.2 for testing
    :param random: shuffle each user's ratings before cutting; default
        False cuts in file order
    :return: (trainset, testset) DataFrames with columns
        userId, movieId, rating
    '''
    print("开始切分数据集...")
    # dtypes of the columns that are loaded
    dtype = {"userId": np.int32, "movieId": np.int32, "rating": np.float32}
    # Only the first three columns are used: user id, movie id and the rating.
    ratings = pd.read_csv(data_path, dtype=dtype, usecols=range(3))

    testset_index = []
    # A single groupby pass yields each user's rows; the previous
    # implementation re-scanned the whole frame once per user.
    # dropna() keeps rows with a missing value out of the test set, as before.
    for _uid, user_rating_data in ratings.dropna().groupby("userId"):
        # shuffle() needs a mutable sequence, hence the explicit list.
        index = list(user_rating_data.index)
        if random:
            np.random.shuffle(index)
        # The first `split_at` rows stay in training, the rest go to the test set.
        split_at = round(len(index) * x)
        testset_index += index[split_at:]

    testset = ratings.loc[testset_index]
    trainset = ratings.drop(testset_index)
    print("完成数据集切分...")
    return trainset, testset


def accuray(predict_results, method="all"):
    '''
    Compute accuracy metrics for a collection of predictions.

    :param predict_results: iterable whose items are
        (uid, iid, real_rating, pred_rating) sequences; it is consumed once,
        so a generator cannot be evaluated a second time
    :param method: "rmse" or "mae" (case-insensitive); any other value
        returns both metrics
    :return: rmse, or mae, or the tuple (rmse, mae), each rounded to
        4 decimal places
    :raises ZeroDivisionError: if ``predict_results`` yields no items
    '''
    metric = method.lower()

    # A single pass collects everything both metrics need.
    length = 0
    squared_error_sum = 0
    absolute_error_sum = 0
    for _uid, _iid, real_rating, pred_rating in predict_results:
        diff = pred_rating - real_rating
        length += 1
        squared_error_sum += diff ** 2
        absolute_error_sum += abs(diff)

    if length == 0:
        # Same exception type the bare division used to raise, but with a
        # message that points at the usual cause.
        raise ZeroDivisionError(
            "predict_results is empty (was the generator already consumed?)")

    if metric == "rmse":
        # root-mean-square error
        return round(np.sqrt(squared_error_sum / length), 4)
    if metric == "mae":
        # mean absolute error
        return round(absolute_error_sum / length, 4)
    return round(np.sqrt(squared_error_sum / length), 4), round(absolute_error_sum / length, 4)


class BaselineCFBySGD(object):
    '''
    Baseline rating predictor trained with stochastic gradient descent.

    A rating is estimated as ``global_mean + bu[uid] + bi[iid]`` where
    ``bu`` holds one bias per user and ``bi`` one bias per item.
    '''

    def __init__(self, number_epochs, alpha, reg, columns=("uid", "iid", "rating")):
        '''
        :param number_epochs: number of full passes over the training data
        :param alpha: learning rate
        :param reg: regularisation strength
        :param columns: names of the user-id, item-id and rating columns, in that order
        '''
        self.number_epochs = number_epochs
        self.alpha = alpha
        self.reg = reg
        # Copy into a fresh list so instances never share (or mutate) the
        # default value or the caller's own sequence.
        self.columns = list(columns)

    def fit(self, dataset):
        '''
        Compute the global mean and learn the user / item biases.

        :param dataset: DataFrame containing at least the configured
            user-id, item-id and rating columns
        :return: None; results are stored on ``self.global_mean``,
            ``self.bu`` and ``self.bi``
        '''
        self.dataset = dataset
        user_col, item_col, rating_col = self.columns[:3]
        # Per-user lists of rated items and ratings.
        self.users_ratings = dataset.groupby(user_col).agg([list])[[item_col, rating_col]]
        # Per-item lists of rating users and ratings.
        self.items_ratings = dataset.groupby(item_col).agg([list])[[user_col, rating_col]]
        # Global average rating.
        self.global_mean = self.dataset[rating_col].mean()
        # Learn the bias terms.
        self.bu, self.bi = self.sgd()

    def sgd(self):
        '''
        Optimise the user and item biases with stochastic gradient descent.

        :return: (bu, bi) dictionaries mapping user id / item id to its bias
        '''
        # Every bias starts at 0.
        bu = dict(zip(self.users_ratings.index, np.zeros(len(self.users_ratings))))
        bi = dict(zip(self.items_ratings.index, np.zeros(len(self.items_ratings))))

        # Select the three configured columns explicitly so datasets carrying
        # extra columns (or a different column order) are handled correctly.
        triples = self.dataset[self.columns[:3]]

        for i in range(self.number_epochs):
            print("iter%d" % i)
            for uid, iid, real_rating in triples.itertuples(index=False, name=None):
                error = real_rating - (self.global_mean + bu[uid] + bi[iid])

                bu[uid] += self.alpha * (error - self.reg * bu[uid])
                bi[iid] += self.alpha * (error - self.reg * bi[iid])

        return bu, bi

    def predict(self, uid, iid):
        '''
        Predict the rating of user ``uid`` for item ``iid``.

        :raises Exception: if ``iid`` did not occur in the training data
        :raises KeyError: if ``uid`` did not occur in the training data
        '''
        # Some movie ids of the test set never occur in the training set.
        # ``bi`` has exactly one key per trained item, so a dict lookup
        # answers the membership question.
        if iid not in self.bi:
            raise Exception(
                "无法预测用户<{uid}>对电影<{iid}>的评分，因为训练集中缺失<{iid}>的数据".format(uid=uid, iid=iid))

        predict_rating = self.global_mean + self.bu[uid] + self.bi[iid]
        return predict_rating

    def test(self, testset):
        '''
        Lazily predict every row of ``testset``.

        Yields ``(uid, iid, real_rating, pred_rating)`` for each row that can
        be predicted; rows whose prediction raises are printed and skipped,
        so they do not contribute to any error metric.
        '''
        triples = testset[self.columns[:3]]
        for uid, iid, real_rating in triples.itertuples(index=False, name=None):
            try:
                pred_rating = self.predict(uid, iid)
            except Exception as e:
                print(e)
            else:
                yield uid, iid, real_rating, pred_rating


if __name__ == '__main__':
    # Random per-user split into training and test data.
    trainset, testset = data_split("../datasets/ml-latest-small/ratings.csv", random=True)

    # 20 epochs, learning rate 0.1, regularisation 0.1.
    bcf = BaselineCFBySGD(
        number_epochs=20,
        alpha=0.1,
        reg=0.1,
        columns=["userId", "movieId", "rating"],
    )
    bcf.fit(trainset)

    # test() is lazy; accuray() consumes the generator exactly once.
    pred_results = bcf.test(testset)
    rmse, mae = accuray(pred_results)

    print("rmse: ", rmse, "mae: ", mae)

开始切分数据集...
完成数据集切分...
iter0
iter1
iter2
iter3
iter4
iter5
iter6
iter7
iter8
iter9
iter10
iter11
iter12
iter13
iter14
iter15
iter16
iter17
iter18
iter19
无法预测用户<3>对电影<5764>的评分，因为训练集中缺失<5764>的数据
无法预测用户<4>对电影<4074>的评分，因为训练集中缺失<4074>的数据
无法预测用户<9>对电影<6044>的评分，因为训练集中缺失<6044>的数据
无法预测用户<10>对电影<72330>的评分，因为训练集中缺失<72330>的数据
无法预测用户<18>对电影<166015>的评分，因为训练集中缺失<166015>的数据
无法预测用户<18>对电影<107846>的评分，因为训练集中缺失<107846>的数据
无法预测用户<18>对电影<88094>的评分，因为训练集中缺失<88094>的数据
无法预测用户<19>对电影<1456>的评分，因为训练集中缺失<1456>的数据
无法预测用户<19>对电影<3692>的评分，因为训练集中缺失<3692>的数据
无法预测用户<19>对电影<1816>的评分，因为训练集中缺失<1816>的数据
无法预测用户<21>对电影<149380>的评分，因为训练集中缺失<149380>的数据
无法预测用户<21>对电影<107449>的评分，因为训练集中缺失<107449>的数据
无法预测用户<21>对电影<148675>的评分，因为训练集中缺失<148675>的数据
无法预测用户<23>对电影<7924>的评分，因为训练集中缺失<7924>的数据
无法预测用户<28>对电影<52042>的评分，因为训练集中缺失<52042>的数据
无法预测用户<28>对电影<4251>的评分，因为训练集中缺失<4251>的数据
无法预测用户<28>对电影<59129>的评分，因为训练集中缺失<59129>的数据
无法预测用户<28>对电影<6817>的评分，因为训练集中缺失<6817>的数据
无法预测用户<28>对电影<60389>的评分，因为训练集中缺失<60389>的数据
无法预测用户<28>对电影<63033>的评分，因为训练集中缺失<63033>的数据

# 2 下面是交替最小二乘法

In [None]:
users_ratings.head()

Unnamed: 0_level_0,movieId,rating
Unnamed: 0_level_1,list,list
userId,Unnamed: 1_level_2,Unnamed: 2_level_2
1,"[1, 3, 6, 47, 50, 70, 101, 110, 151, 157, 163,...","[4.0, 4.0, 4.0, 5.0, 5.0, 3.0, 5.0, 4.0, 5.0, ..."
2,"[318, 333, 1704, 3578, 6874, 8798, 46970, 4851...","[3.0, 4.0, 4.5, 4.0, 4.0, 3.5, 4.0, 4.0, 4.5, ..."
3,"[31, 527, 647, 688, 720, 849, 914, 1093, 1124,...","[0.5, 0.5, 0.5, 0.5, 0.5, 5.0, 0.5, 0.5, 0.5, ..."
4,"[21, 32, 45, 47, 52, 58, 106, 125, 126, 162, 1...","[3.0, 2.0, 3.0, 2.0, 3.0, 3.0, 4.0, 5.0, 1.0, ..."
5,"[1, 21, 34, 36, 39, 50, 58, 110, 150, 153, 232...","[4.0, 4.0, 4.0, 4.0, 3.0, 4.0, 5.0, 4.0, 3.0, ..."


In [None]:
items_ratings.head()

Unnamed: 0_level_0,userId,rating
Unnamed: 0_level_1,list,list
movieId,Unnamed: 1_level_2,Unnamed: 2_level_2
1,"[1, 5, 7, 15, 17, 18, 19, 21, 27, 31, 32, 33, ...","[4.0, 4.0, 4.5, 2.5, 4.5, 3.5, 4.0, 3.5, 3.0, ..."
2,"[6, 8, 18, 19, 20, 21, 27, 51, 62, 68, 82, 91,...","[4.0, 4.0, 3.0, 3.0, 3.0, 3.5, 4.0, 4.5, 4.0, ..."
3,"[1, 6, 19, 32, 42, 43, 44, 51, 58, 64, 68, 91,...","[4.0, 5.0, 3.0, 3.0, 4.0, 5.0, 3.0, 4.0, 3.0, ..."
4,"[6, 14, 84, 162, 262, 411, 600]","[3.0, 3.0, 3.0, 3.0, 1.0, 2.0, 1.5]"
5,"[6, 31, 43, 45, 58, 66, 68, 84, 103, 107, 111,...","[5.0, 3.0, 5.0, 3.0, 4.0, 4.0, 2.0, 3.0, 4.0, ..."


In [None]:
#交替最小二乘法
bu = dict(zip(users_ratings.index, np.zeros(len(users_ratings))))
bi = dict(zip(items_ratings.index, np.zeros(len(items_ratings))))

In [None]:
# Scratch version of alternating least squares: 15 iterations, with the
# regularisation term lambda assumed to be 0.1 for both bias types.
for epoch in range(15):
    print('iter%d' % epoch)
    # Item step: recompute every item bias from the current user biases.
    for iid, uids, ratings in items_ratings.itertuples(index=True):
        residual_sum = sum(
            rating - global_mean - bu[uid] for uid, rating in zip(uids, ratings)
        )
        # len(uids) is the number of users who rated this item.
        bi[iid] = residual_sum / (0.1 + len(uids))
    # After the loop above every bi[iid] has been refreshed.
    # User step: recompute every user bias from the fresh item biases.
    for uid, iids, ratings in users_ratings.itertuples(index=True):
        residual_sum = sum(
            rating - global_mean - bi[iid] for iid, rating in zip(iids, ratings)
        )
        # len(iids) is the number of items this user rated.
        bu[uid] = residual_sum / (0.1 + len(iids))

iter0
iter1
iter2
iter3
iter4
iter5
iter6
iter7
iter8
iter9
iter10
iter11
iter12
iter13
iter14


In [None]:
bu

{1: 0.827993780333348,
 2: 0.02085463426712349,
 3: -1.402601666178307,
 4: -0.22849243055237117,
 5: -0.0266414427678953,
 6: 0.3679469678212412,
 7: -0.3027366028496861,
 8: 0.07798229232441713,
 9: -0.1280775895955428,
 10: -0.23572252394807158,
 11: 0.42542028587625486,
 12: 1.0926730747056452,
 13: 0.22735105306978257,
 14: 0.04812018367232163,
 15: -0.2698014741030977,
 16: -0.28276268913069513,
 17: 0.25999087578835034,
 18: 0.06876106761632104,
 19: -0.5941967586761318,
 20: 0.20212148461195475,
 21: -0.12175002385313148,
 22: -1.2338804072828742,
 23: -0.3425340156617749,
 24: -0.047000342346913644,
 25: 0.8560428804963366,
 26: -0.22478494082532324,
 27: 0.04534223336273621,
 28: -0.6481516337913018,
 29: 0.28427043601820334,
 30: 0.7833213626320845,
 31: 0.3741183978015713,
 32: 0.2700399190445044,
 33: 0.10423561087481575,
 34: -0.07966564627926294,
 35: 0.6549202871010326,
 36: -0.856332551909346,
 37: 0.5863001268673638,
 38: -0.19581889634460317,
 39: 0.17234338304935434

In [None]:
len(bi)

9724

In [None]:
import pandas as pd
import numpy as np


class BaselineCFByALS(object):
    '''
    Baseline rating predictor: global mean + user bias + item bias.

    The two bias tables are fitted by alternating closed-form updates: item
    biases are recomputed with the user biases held fixed, then user biases
    with the item biases held fixed.
    '''

    def __init__(self, number_epochs, reg_bu, reg_bi, columns=("uid", "iid", "rating")):
        '''
        :param number_epochs: number of alternating sweeps to run
        :param reg_bu: regularisation term added to the denominator of the user-bias update
        :param reg_bi: regularisation term added to the denominator of the item-bias update
        :param columns: names of the user, item and rating columns, in that order
        '''
        self.number_epochs = number_epochs
        self.reg_bu = reg_bu
        self.reg_bi = reg_bi
        # Store a private copy: the default must never be shared between
        # instances, and later changes to the caller's list must not leak in.
        self.columns = list(columns)

    def fit(self, dataset):
        '''
        Fit the bias tables on a ratings frame.

        :param dataset: DataFrame holding the columns named in ``self.columns``
        :return: None; results are stored in ``self.bu`` and ``self.bi``
        '''
        self.dataset = dataset
        # Per user: the list of rated item ids and the list of matching ratings.
        self.users_ratings = dataset.groupby(self.columns[0]).agg([list])[[self.columns[1], self.columns[2]]]
        # Per item: the list of rating user ids and the list of matching ratings.
        self.items_ratings = dataset.groupby(self.columns[1]).agg([list])[[self.columns[0], self.columns[2]]]
        # Global mean rating.
        self.global_mean = self.dataset[self.columns[2]].mean()
        # Train the bias tables.
        self.bu, self.bi = self.als()

    def als(self):
        '''
        Optimise bu and bi with alternating least-squares sweeps.

        :return: bu, bi -- dicts mapping user id / item id to the fitted bias
        '''
        # All biases start at zero.
        bu = dict(zip(self.users_ratings.index, np.zeros(len(self.users_ratings))))
        bi = dict(zip(self.items_ratings.index, np.zeros(len(self.items_ratings))))

        for i in range(self.number_epochs):
            print("iter%d" % i)
            # Item step: residuals are taken against the current user biases.
            for iid, uids, ratings in self.items_ratings.itertuples(index=True):
                _sum = 0
                for uid, rating in zip(uids, ratings):
                    _sum += rating - self.global_mean - bu[uid]
                # len(uids) is the number of ratings this item received.
                bi[iid] = _sum / (self.reg_bi + len(uids))

            # User step: uses the item biases refreshed just above.
            for uid, iids, ratings in self.users_ratings.itertuples(index=True):
                _sum = 0
                for iid, rating in zip(iids, ratings):
                    _sum += rating - self.global_mean - bi[iid]
                bu[uid] = _sum / (self.reg_bu + len(iids))
        return bu, bi

    def predict(self, uid, iid):
        '''
        Predict the rating of item ``iid`` by user ``uid``.

        :return: global mean + user bias + item bias
        :raises KeyError: if ``uid`` or ``iid`` has no fitted bias
        '''
        predict_rating = self.global_mean + self.bu[uid] + self.bi[iid]
        return predict_rating


if __name__ == '__main__':
    # Column types for the three fields that are read from the ratings file.
    dtype = [("userId", np.int32), ("movieId", np.int32), ("rating", np.float32)]
    dataset = pd.read_csv(
        "../datasets/ml-latest-small/ratings.csv",
        usecols=range(3),
        dtype=dict(dtype),
    )

    bcf = BaselineCFByALS(
        number_epochs=20,
        reg_bu=25,
        reg_bi=15,
        columns=["userId", "movieId", "rating"],
    )
    bcf.fit(dataset)

    # Interactive loop: keep asking for a user id and an item id and print the
    # predicted rating. There is no exit condition; it runs until interrupted.
    while True:
        uid = int(input("uid: "))
        iid = int(input("iid: "))
        print(bcf.predict(uid, iid))

iter0
iter1
iter2
iter3
iter4
iter5
iter6
iter7
iter8
iter9
iter10
iter11
iter12
iter13
iter14
iter15
iter16
iter17
iter18
iter19
uid: 1
iid: 1
4.577542477588154


KeyboardInterrupt: Interrupted by user

# 交替最小二乘法计算rmse，mae，rmse_mae

In [None]:
import pandas as pd
import numpy as np


def data_split(data_path, x=0.8, random=False):
    '''
    Split a ratings file into a training set and a test set, user by user.

    Every user's rows are cut separately, so the share ``x`` of each user's
    ratings goes to the training set and the remainder to the test set.

    :param data_path: path of the ratings CSV; only its first three columns
        are loaded (expected to be userId, movieId, rating)
    :param x: fraction of each user's ratings kept for training, e.g. x=0.8
        leaves 0.2 for testing
    :param random: if True, shuffle each user's rows (np.random) before
        cutting; if False, the trailing rows of each user become test rows
    :return: (trainset, testset) DataFrames that keep the original row labels
    '''
    print("开始切分数据集...")
    # Types of the loaded fields.
    dtype = {"userId": np.int32, "movieId": np.int32, "rating": np.float32}
    # Only the first three columns are used: user id, movie id and rating.
    ratings = pd.read_csv(data_path, dtype=dtype, usecols=range(3))

    testset_index = []
    # One pass over the per-user groups instead of re-scanning the whole
    # frame for every user. Groups come out in ascending userId order.
    for _, user_rows in ratings.groupby("userId"):
        # Rows with missing values are never candidates for the test set.
        row_labels = list(user_rows.dropna().index)
        if random:
            # Shuffle a plain list so the cut below picks random rows.
            np.random.shuffle(row_labels)
        cut = round(len(row_labels) * x)
        testset_index += row_labels[cut:]

    testset = ratings.loc[testset_index]
    trainset = ratings.drop(testset_index)
    print("完成数据集切分...")
    return trainset, testset


def accuray(predict_results, method="all"):
    '''
    Compute accuracy metrics for a set of rating predictions.

    The input is consumed in a single pass, so a generator may be passed.

    :param predict_results: iterable of (uid, iid, real_rating, pred_rating)
    :param method: "rmse" or "mae" (case-insensitive) to get that single
        metric; any other value returns both
    :return: the requested metric rounded to 4 decimals, or the tuple
        (rmse, mae) when both are requested
    :raises ZeroDivisionError: if predict_results is empty
    '''
    count = 0
    squared_error_sum = 0
    absolute_error_sum = 0
    for _uid, _iid, real_rating, pred_rating in predict_results:
        count += 1
        diff = pred_rating - real_rating
        squared_error_sum += diff ** 2
        absolute_error_sum += abs(diff)

    rmse = round(np.sqrt(squared_error_sum / count), 4)
    mae = round(absolute_error_sum / count, 4)

    wanted = method.lower()
    if wanted == "rmse":
        return rmse
    if wanted == "mae":
        return mae
    return rmse, mae


class BaselineCFByALS(object):
    '''
    Baseline rating predictor: global mean + user bias + item bias.

    The two bias tables are fitted by alternating closed-form updates, and the
    squared training error is printed after every sweep.
    '''

    def __init__(self, number_epochs, reg_bu, reg_bi, columns=("uid", "iid", "rating")):
        '''
        :param number_epochs: number of alternating sweeps to run
        :param reg_bu: regularisation term added to the denominator of the user-bias update
        :param reg_bi: regularisation term added to the denominator of the item-bias update
        :param columns: names of the user, item and rating columns, in that order
        '''
        self.number_epochs = number_epochs
        self.reg_bu = reg_bu
        self.reg_bi = reg_bi
        # Store a private copy: the default must never be shared between
        # instances, and later changes to the caller's list must not leak in.
        self.columns = list(columns)

    def fit(self, dataset):
        '''
        Fit the bias tables on a ratings frame.

        :param dataset: DataFrame holding the columns named in ``self.columns``
            (additional columns are allowed)
        :return: None; results are stored in ``self.bu`` and ``self.bi``
        '''
        self.dataset = dataset
        # Per user: the list of rated item ids and the list of matching ratings.
        self.users_ratings = dataset.groupby(self.columns[0]).agg([list])[[self.columns[1], self.columns[2]]]
        # Per item: the list of rating user ids and the list of matching ratings.
        self.items_ratings = dataset.groupby(self.columns[1]).agg([list])[[self.columns[0], self.columns[2]]]
        # Global mean rating.
        self.global_mean = self.dataset[self.columns[2]].mean()
        # Train the bias tables.
        self.bu, self.bi = self.als()

    def als(self):
        '''
        Optimise bu and bi with alternating least-squares sweeps.

        :return: bu, bi -- dicts mapping user id / item id to the fitted bias
        '''
        # All biases start at zero.
        bu = dict(zip(self.users_ratings.index, np.zeros(len(self.users_ratings))))
        bi = dict(zip(self.items_ratings.index, np.zeros(len(self.items_ratings))))

        # Select the three configured columns by name so the error loop works
        # for frames that carry extra columns or a different column order.
        triples = self.dataset[self.columns]

        for i in range(self.number_epochs):
            print("iter%d" % i)
            total_error = 0
            # Item step: residuals are taken against the current user biases.
            for iid, uids, ratings in self.items_ratings.itertuples(index=True):
                _sum = 0
                for uid, rating in zip(uids, ratings):
                    _sum += rating - self.global_mean - bu[uid]
                bi[iid] = _sum / (self.reg_bi + len(uids))

            # User step: uses the item biases refreshed just above.
            for uid, iids, ratings in self.users_ratings.itertuples(index=True):
                _sum = 0
                for iid, rating in zip(iids, ratings):
                    _sum += rating - self.global_mean - bi[iid]
                bu[uid] = _sum / (self.reg_bu + len(iids))

            # Sum of squared residuals over the training rows after this sweep.
            for uid, iid, real_rating in triples.itertuples(index=False):
                error = real_rating - (self.global_mean + bu[uid] + bi[iid])
                total_error += error ** 2
            print("total_error:", total_error)
        return bu, bi

    def predict(self, uid, iid):
        '''
        Predict the rating of item ``iid`` by user ``uid``.

        :return: global mean + user bias + item bias
        :raises Exception: if ``iid`` does not occur in the training data
        :raises KeyError: if ``uid`` has no fitted bias
        '''
        if iid not in self.items_ratings.index:
            raise Exception(
                "无法预测用户<{uid}>对电影<{iid}>的评分，因为训练集中缺失<{iid}>的数据".format(uid=uid, iid=iid))

        predict_rating = self.global_mean + self.bu[uid] + self.bi[iid]
        return predict_rating

    def test(self, testset):
        '''
        Predict every row of a test set.

        :param testset: frame whose rows unpack as (uid, iid, real_rating)
        :return: generator of (uid, iid, real_rating, pred_rating); rows whose
            prediction raises are skipped without any report
        '''
        for uid, iid, real_rating in testset.itertuples(index=False):
            try:
                pred_rating = self.predict(uid, iid)
            except Exception as e:
                # Best effort: a row that cannot be predicted is left out.
                pass
            else:
                yield uid, iid, real_rating, pred_rating


if __name__ == '__main__':
    # Per-user random split of the ratings file into train and test parts.
    trainset, testset = data_split(
        "../datasets/ml-latest-small/ratings.csv",
        random=True,
    )

    # Fit the bias model on the training part only.
    bcf = BaselineCFByALS(
        number_epochs=20,
        reg_bu=25,
        reg_bi=15,
        columns=["userId", "movieId", "rating"],
    )
    bcf.fit(trainset)

    # test() is a generator; accuray() consumes it in a single pass.
    pred_results = bcf.test(testset)

    rmse, mae = accuray(pred_results)

    print("rmse: ", rmse, "mae: ", mae)

开始切分数据集...
完成数据集切分...
iter0
total_error: 58050.843866479096
iter1
total_error: 58229.947540153116
iter2
total_error: 58209.448543497296
iter3
total_error: 58186.95790652108
iter4
total_error: 58174.66404727968
iter5
total_error: 58168.29708890291
iter6
total_error: 58164.99540890676
iter7
total_error: 58163.27416573697
iter8
total_error: 58162.3737076483
iter9
total_error: 58161.90173808734
iter10
total_error: 58161.65411078965
iter11
total_error: 58161.52412139064
iter12
total_error: 58161.45586648612
iter13
total_error: 58161.4200221668
iter14
total_error: 58161.401197025545
iter15
total_error: 58161.391309840226
iter16
total_error: 58161.386116871196
iter17
total_error: 58161.38338938262
iter18
total_error: 58161.381956820114
iter19
total_error: 58161.38120439462
rmse:  0.8767 mae:  0.6749


In [None]:
#结论可以看出最小二乘法好一些