# 14 SVD

- 优点：简化数据，去除噪声，提高算法的结果
- 缺点：数据的转换可能难以理解

In [1]:
import numpy as np

In [2]:
# SVD of a tiny matrix whose rows are proportional ([7, 7] = 7 * [1, 1]),
# so only one singular value is non-zero.
U, Sigma, VT = np.linalg.svd(np.array([[1, 1], [7, 7]]))
for label, factor in (("U: ", U), ("Sigma: ", Sigma), ("VT: ", VT)):
    print(label, factor)

U:  [[-0.14142136 -0.98994949]
 [-0.98994949  0.14142136]]
Sigma:  [10.  0.]
VT:  [[-0.70710678 -0.70710678]
 [-0.70710678  0.70710678]]


In [4]:
def loadExData():
    """Return the small example rating matrix as a freshly built nested list.

    The result has 7 rows and 5 columns. The recommender functions in this
    file index it as ``[user, item]`` and treat a 0 entry as "not rated".

    :return: list of 7 lists, each holding 5 integer ratings
    """
    ratings = [
        [1, 1, 1, 0, 0],
        [2, 2, 2, 0, 0],
        [1, 1, 1, 0, 0],
        [5, 5, 5, 0, 0],
        [1, 1, 0, 2, 2],
        [0, 0, 0, 3, 3],
        [0, 0, 0, 1, 1],
    ]
    return ratings

In [6]:
# Decompose the example data, then rebuild it from the 3 largest singular
# values only.
Data = loadExData()
U, Sigma, VT = np.linalg.svd(Data)
print("U: ", U)
print("Sigma: ", Sigma)
print("VT: ", VT)
# np.asmatrix replaces np.mat, which was removed in NumPy 2.0; np.asmatrix
# exists in both old and new NumPy releases.
Sig3 = np.asmatrix([[Sigma[0], 0, 0],
                    [0, Sigma[1], 0],
                    [0, 0, Sigma[2]]])
# Sig3 is an np.matrix, so `*` below is matrix multiplication, not an
# element-wise product.
print(U[:, :3] * Sig3 * VT[:3, :])

U:  [[-1.77939726e-01 -1.64228493e-02  1.80501685e-02  9.48215967e-01
  -2.43879623e-01  5.88803098e-02  7.53939908e-02]
 [-3.55879451e-01 -3.28456986e-02  3.61003369e-02 -2.42667431e-02
  -1.30579552e-01 -8.77959841e-01 -2.87253136e-01]
 [-1.77939726e-01 -1.64228493e-02  1.80501685e-02 -2.86917521e-01
  -9.17843690e-01  1.99699755e-01  5.57067394e-02]
 [-8.89698628e-01 -8.21142464e-02  9.02508423e-02 -1.22552992e-01
   2.84576484e-01  2.99467924e-01  8.86811085e-02]
 [-1.33954753e-01  5.33527340e-01 -8.35107599e-01  3.60822483e-16
  -8.53483950e-16  2.25514052e-16  3.05311332e-16]
 [-2.15749771e-02  7.97677135e-01  5.13074760e-01  1.71950731e-02
  -2.25601962e-03  9.80604897e-02 -3.00138935e-01]
 [-7.19165903e-03  2.65892378e-01  1.71024920e-01 -5.15852193e-02
   6.76805885e-03 -2.94181469e-01  9.00416804e-01]]
Sigma:  [9.72140007e+00 5.29397912e+00 6.84226362e-01 1.50962387e-15
 1.15387192e-31]
VT:  [[-5.81200877e-01 -5.81200877e-01 -5.67421508e-01 -3.49564973e-02
  -3.49564973e-02]


In [7]:
def ecludSim(a, b):
    """Similarity derived from the Euclidean distance between ``a`` and ``b``.

    The distance ``d = np.linalg.norm(a - b)`` is mapped to ``1 / (1 + d)``,
    so identical inputs score 1.0 and the score shrinks toward 0 as the
    distance grows.

    :param a: first vector (anything supporting ``a - b``)
    :param b: second vector, same shape as ``a``
    :return: similarity in (0, 1]
    """
    distance = np.linalg.norm(a - b)
    return 1.0 / (1.0 + distance)

def pearsSim(a, b):
    """Pearson-correlation similarity rescaled from [-1, 1] to [0, 1].

    When fewer than 3 observations are available the correlation is not
    computed and 1.0 is returned.

    :param a: first vector; with ``rowvar=0`` a column is read as one variable
    :param b: second vector, same shape as ``a``
    :return: ``0.5 + 0.5 * r`` where ``r`` is the correlation of ``a`` and ``b``
    """
    if len(a) < 3:
        return 1.0
    # corrcoef returns the 2x2 correlation matrix; the off-diagonal entry is
    # the correlation between a and b.
    correlation = np.corrcoef(a, b, rowvar=0)[0][1]
    return 0.5 + 0.5 * correlation

def cosSim(a, b):
    """Cosine similarity rescaled from [-1, 1] to [0, 1].

    Accepts ``np.matrix`` column vectors (as passed by the recommender
    functions in this file) as well as plain ndarrays of any vector-like
    shape; both inputs are flattened before the dot product is taken.

    No guard is applied for zero-length vectors: if either norm is 0 the
    division below is a division by zero.

    :param a: first vector
    :param b: second vector with the same number of elements as ``a``
    :return: ``0.5 + 0.5 * cos(angle between a and b)``
    """
    # The previous float(a.T * b) only worked for np.matrix columns and relied
    # on implicit 1x1-array-to-scalar conversion (deprecated in NumPy 1.25).
    # np.asarray comes first because np.matrix.ravel() would stay 2-D.
    vec_a = np.asarray(a, dtype=float).ravel()
    vec_b = np.asarray(b, dtype=float).ravel()
    num = float(np.dot(vec_a, vec_b))
    denom = np.linalg.norm(vec_a) * np.linalg.norm(vec_b)
    return 0.5 + 0.5 * (num / denom)

In [8]:
# Compare the first and last item columns under each similarity measure.
# np.asmatrix replaces np.mat, which was removed in NumPy 2.0.
myMat = np.asmatrix(loadExData())
print(ecludSim(myMat[:, 0], myMat[:, 4]))
print(pearsSim(myMat[:, 0], myMat[:, 4]))
print(cosSim(myMat[:, 0], myMat[:, 4]))

0.13367660240019172
0.23768619407595815
0.5472455591261534


### 14.5 Example: a restaurant dish recommendation engine

In [9]:
def standEst(dataMat, user, simMeas, item):
    """Estimate ``user``'s rating of ``item`` from item-to-item similarity.

    For every other item the user has rated, the similarity to ``item`` is
    measured over the rows (users) that rated both items; the estimate is the
    similarity-weighted average of the user's own ratings. Each pairwise
    similarity is printed as it is computed.

    :param dataMat: rating matrix (``np.matrix``; rows are users, columns are
        items, 0 means "not rated")
    :param user: row index of the user
    :param simMeas: callable ``simMeas(col_a, col_b)`` returning a similarity
    :param item: column index of the item to estimate
    :return: estimated rating, or 0 when the similarities sum to 0
    """
    item_count = np.shape(dataMat)[1]
    weight_sum = 0.0
    weighted_rating_sum = 0.0
    for other in range(item_count):
        rating = dataMat[user, other]
        if rating == 0:
            continue
        # Rows (users) that rated both `item` and `other`.
        item_rated = dataMat[:, item].A > 0
        other_rated = dataMat[:, other].A > 0
        shared_rows = np.nonzero(np.logical_and(item_rated, other_rated))[0]
        if len(shared_rows) > 0:
            similarity = simMeas(dataMat[shared_rows, item],
                                 dataMat[shared_rows, other])
        else:
            similarity = 0
        print("the %d and %d similarity is %f" % (item, other, similarity))
        weight_sum += similarity
        weighted_rating_sum += similarity * rating
    if weight_sum == 0:
        return 0
    return weighted_rating_sum / weight_sum

In [10]:
def recommend(dataMat, user, N=3, simMeas=cosSim, estMethod=standEst):
    """Recommend the top-``N`` unrated items for ``user``.

    Every item the user has not rated (a 0 entry in the user's row) is scored
    with ``estMethod``; the results are ordered by estimated score, highest
    first.

    :param dataMat: rating matrix (``np.matrix``; rows are users, columns are
        items)
    :param user: row index of the user
    :param N: maximum number of recommendations to return
    :param simMeas: similarity function handed through to ``estMethod``
    :param estMethod: callable ``estMethod(dataMat, user, simMeas, item)``
        returning an estimated rating
    :return: list of ``(item, estimated_score)`` tuples, or the string
        ``'you rated everything'`` when the user has no unrated items
    """
    # Column indices where this user's rating is 0, i.e. not yet rated.
    unrated = np.nonzero(dataMat[user, :].A == 0)[1]
    if len(unrated) == 0:
        return 'you rated everything'
    scored = [(item, estMethod(dataMat, user, simMeas, item))
              for item in unrated]
    scored.sort(key=lambda pair: pair[1], reverse=True)
    return scored[:N]

In [13]:
# Recommend for user 2 with each of the three similarity measures.
# np.asmatrix replaces np.mat, which was removed in NumPy 2.0.
myMat = np.asmatrix(loadExData())
print("recommend: ", recommend(myMat, 2, N=3))
print("recommend: ", recommend(myMat, 2, N=3, simMeas=pearsSim))
print("recommend: ", recommend(myMat, 2, N=3, simMeas=ecludSim))

the 3 and 0 similarity is 1.000000
the 3 and 1 similarity is 1.000000
the 3 and 2 similarity is 0.000000
the 4 and 0 similarity is 1.000000
the 4 and 1 similarity is 1.000000
the 4 and 2 similarity is 0.000000
recommend:  [(3, 1.0), (4, 1.0)]
the 3 and 0 similarity is 1.000000
the 3 and 1 similarity is 1.000000
the 3 and 2 similarity is 0.000000
the 4 and 0 similarity is 1.000000
the 4 and 1 similarity is 1.000000
the 4 and 2 similarity is 0.000000
recommend:  [(3, 1.0), (4, 1.0)]
the 3 and 0 similarity is 0.500000
the 3 and 1 similarity is 0.500000
the 3 and 2 similarity is 0.000000
the 4 and 0 similarity is 0.500000
the 4 and 1 similarity is 0.500000
the 4 and 2 similarity is 0.000000
recommend:  [(3, 1.0), (4, 1.0)]


In [14]:
def svdEst(dataMat, user, simMeas, item):
    """Estimate ``user``'s rating of ``item`` using an SVD-reduced item space.

    The rating matrix is decomposed with SVD and every item is projected onto
    at most 4 leading singular directions. The estimate is the
    similarity-weighted average of the user's existing ratings, with
    similarities measured between the projected item vectors. Each pairwise
    similarity is printed as it is computed.

    :param dataMat: rating matrix (``np.matrix``; rows are users, columns are
        items, 0 means "not rated")
    :param user: row index of the user
    :param simMeas: callable ``simMeas(vec_a, vec_b)`` receiving two column
        vectors and returning a similarity
    :param item: column index of the item to estimate
    :return: estimated rating, or 0 when the similarities sum to 0
    :raises numpy.linalg.LinAlgError: if a retained singular value is exactly
        0, because the diagonal matrix is then not invertible
    """
    n = np.shape(dataMat)[1]
    simTotal = 0.0
    ratSimTotal = 0.0
    U, Sigma, _ = np.linalg.svd(dataMat)
    # Keep up to 4 singular values; a matrix with fewer than 4 rows or columns
    # has fewer, and a fixed 4x4 diagonal would not broadcast against them.
    k = min(4, len(Sigma))
    # np.asmatrix replaces np.mat, which was removed in NumPy 2.0.
    SigK = np.asmatrix(np.eye(k) * Sigma[:k])
    # Row i of xformedItems is item i expressed in the k-dimensional space.
    xformedItems = dataMat.T * U[:, :k] * SigK.I
    for j in range(n):
        userRating = dataMat[user, j]
        if userRating == 0 or j == item:
            continue
        # Use the caller-supplied measure; it used to be ignored in favour of
        # a hard-coded cosSim.
        similarity = simMeas(xformedItems[item, :].T, xformedItems[j, :].T)
        print("the %d and %d similarity is %f" % (item, j, similarity))
        simTotal += similarity
        ratSimTotal += similarity * userRating
    if simTotal == 0:
        return 0
    return ratSimTotal / simTotal

In [16]:
# Show the rating matrix, then the SVD-based recommendations for user 2.
print(myMat)
svd_recommendations = recommend(myMat, 2, N=3, estMethod=svdEst)
print("svdEst: ", svd_recommendations)

[[1 1 1 0 0]
 [2 2 2 0 0]
 [1 1 1 0 0]
 [5 5 5 0 0]
 [1 1 0 2 2]
 [0 0 0 3 3]
 [0 0 0 1 1]]
the 3 and 0 similarity is 0.489660
the 3 and 1 similarity is 0.489660
the 3 and 2 similarity is 0.511671
the 4 and 0 similarity is 0.489660
the 4 and 1 similarity is 0.489660
the 4 and 2 similarity is 0.511671
svdEst:  [(3, 1.0), (4, 1.0)]
