In [1]:
from numpy import *

In [2]:
U, Sigma, VT = linalg.svd([[1, 1], [7, 7]])
U

array([[-0.14142136, -0.98994949],
       [-0.98994949,  0.14142136]])

In [3]:
Sigma  # svd returns the diagonal of the Sigma matrix as a 1-D array of singular values, not as a full diagonal matrix

array([10.,  0.])

In [4]:
VT

array([[-0.70710678, -0.70710678],
       [-0.70710678,  0.70710678]])

In [6]:
def loadExData():
    """Return the 7x5 example matrix used by the SVD demo as a list of rows.

    A fresh nested list is built on every call, so callers may mutate the
    result freely.
    """
    exampleRows = [
        [1, 1, 1, 0, 0],
        [2, 2, 2, 0, 0],
        [1, 1, 1, 0, 0],
        [5, 5, 5, 0, 0],
        [1, 1, 0, 2, 2],
        [0, 0, 0, 3, 3],
        [0, 0, 0, 1, 1],
    ]
    return exampleRows

In [7]:
Data = loadExData()
U, Sigma, VT = linalg.svd(Data)
Sigma

array([9.72140007e+00, 5.29397912e+00, 6.84226362e-01, 4.96619610e-16,
       1.57294073e-16])

In [8]:
# Rebuild (an approximation of) the original matrix from the three largest
# singular values: U[:, :3] * diag(Sigma[:3]) * VT[:3, :]
Sig3 = mat(diag(Sigma[:3]))
U[:, :3] * Sig3 * VT[:3, :]

matrix([[ 1.00000000e+00,  1.00000000e+00,  1.00000000e+00,
         -4.04082150e-16, -3.93673809e-16],
        [ 2.00000000e+00,  2.00000000e+00,  2.00000000e+00,
          3.23525928e-16,  3.58220398e-16],
        [ 1.00000000e+00,  1.00000000e+00,  1.00000000e+00,
         -6.96599896e-16, -6.79252661e-16],
        [ 5.00000000e+00,  5.00000000e+00,  5.00000000e+00,
         -1.04950770e-16, -3.25260652e-17],
        [ 1.00000000e+00,  1.00000000e+00, -2.22044605e-16,
          2.00000000e+00,  2.00000000e+00],
        [ 1.11022302e-16,  3.88578059e-16, -5.55111512e-16,
          3.00000000e+00,  3.00000000e+00],
        [ 4.16333634e-17,  1.31838984e-16, -2.08166817e-16,
          1.00000000e+00,  1.00000000e+00]])

# 基于协同过滤的推荐引擎

In [9]:
from numpy import *
from numpy import linalg as la

In [10]:
# 相似度计算，其中inA, inB都是列向量
def ecludSim(inA, inB):
    """Similarity derived from Euclidean distance: 1 / (1 + ||inA - inB||).

    Identical vectors score 1.0; the score shrinks towards 0 as the distance
    between the two vectors grows.
    """
    distance = la.norm(inA - inB)
    return 1.0 / (1.0 + distance)

def pearsSim(inA, inB):
    """Pearson-correlation similarity rescaled from [-1, 1] to [0, 1].

    Parameters
    ----------
    inA, inB : vectors of equal length (column vectors in this notebook).

    Returns
    -------
    float
        1.0 when fewer than 3 samples are available (same shortcut as
        before), otherwise 0.5 + 0.5 * r where r is the Pearson correlation.
        If r is undefined because at least one vector is constant (zero
        variance), the neutral value 0.5 is returned instead of NaN.
    """
    if len(inA) < 3:
        return 1.0
    # A constant vector has zero variance, so corrcoef computes 0/0 and yields
    # NaN (plus a RuntimeWarning). A NaN similarity would poison the weighted
    # sums of the rating estimators, so silence the warning and map the
    # undefined correlation to "no correlation" (r = 0 -> 0.5).
    with errstate(divide='ignore', invalid='ignore'):
        corr = corrcoef(inA, inB, rowvar=False)[0][1]
    if isnan(corr):
        return 0.5
    return 0.5 + 0.5 * corr

def cosSim(inA, inB):
    """Cosine similarity rescaled from [-1, 1] to [0, 1].

    Parameters
    ----------
    inA, inB : vectors with the same number of elements. np.matrix column
        vectors (as used in this notebook), 2-D ndarray columns and plain 1-D
        arrays are all accepted; the inputs are flattened before use.

    Returns
    -------
    float
        0.5 + 0.5 * cos(angle between inA and inB). If either vector has zero
        norm the cosine is undefined; the neutral value 0.5 is returned
        instead of NaN.
    """
    # Flatten to 1-D arrays: this avoids float() on a 1x1 matrix (deprecated
    # array-to-scalar conversion) and makes the dot product independent of
    # whether the caller passed a matrix or an ndarray.
    vecA = asarray(inA, dtype=float).ravel()
    vecB = asarray(inB, dtype=float).ravel()
    denom = la.norm(vecA) * la.norm(vecB)
    if denom == 0:
        # Zero-length vector: treat as "no similarity information".
        return 0.5
    num = float(dot(vecA, vecB))
    return 0.5 + 0.5 * (num / denom)

In [12]:
myMat = mat(loadExData())
myMat

matrix([[1, 1, 1, 0, 0],
        [2, 2, 2, 0, 0],
        [1, 1, 1, 0, 0],
        [5, 5, 5, 0, 0],
        [1, 1, 0, 2, 2],
        [0, 0, 0, 3, 3],
        [0, 0, 0, 1, 1]])

In [13]:
# 欧氏距离计算相似度
ecludSim(myMat[:, 0], myMat[:, 4])

0.13367660240019172

In [14]:
ecludSim(myMat[:, 0], myMat[:, 0])

1.0

In [15]:
# 余弦计算相似度
cosSim(myMat[:, 0], myMat[:, 4])

0.5472455591261534

In [16]:
cosSim(myMat[:, 0], myMat[:, 0])

0.9999999999999999

In [17]:
# 皮尔逊计算相似度
pearsSim(myMat[:, 0], myMat[:, 4])

0.23768619407595826

In [18]:
pearsSim(myMat[:, 0], myMat[:, 0])

1.0

## 示例：餐馆菜肴推荐引擎

In [19]:
# 计算在给定相似度计算方法的条件下，用户对物品的估计评分值
# dataMat 行表示用户，列表示物品
def standEst(dataMat, user, simMeas, item):
    """Estimate `user`'s rating of `item` via item-based collaborative filtering.

    Parameters
    ----------
    dataMat : np.matrix with one row per user and one column per item; a 0
        entry means "not rated".
    user : row index of the user to estimate for.
    simMeas : callable(colA, colB) -> similarity of two item columns.
    item : column index of the item to estimate.

    Returns
    -------
    The similarity-weighted average of the user's existing ratings, or 0 when
    the accumulated similarity is 0.
    """
    itemCount = shape(dataMat)[1]
    weightSum = 0.0
    weightedRatings = 0.0
    for other in range(itemCount):
        rating = dataMat[user, other]
        if rating != 0:  # only items this user has rated contribute
            # Users who rated both `item` and `other`.
            ratedTarget = dataMat[:, item].A > 0
            ratedOther = dataMat[:, other].A > 0
            coRaters = nonzero(logical_and(ratedTarget, ratedOther))[0]
            if len(coRaters) > 0:
                weight = simMeas(dataMat[coRaters, item], dataMat[coRaters, other])
            else:
                # No common raters: the two items are treated as unrelated.
                weight = 0
            weightSum += weight
            weightedRatings += weight * rating
    return 0 if weightSum == 0 else weightedRatings / weightSum
    
def recommend(dataMat, user, N=3, simMeas=cosSim, estMethod=standEst):
    """Return the top-N (item, estimatedScore) pairs for items `user` has not rated.

    Parameters
    ----------
    dataMat : np.matrix of ratings (rows = users, columns = items, 0 = unrated).
    user : row index of the user to recommend for.
    N : maximum number of recommendations to return.
    simMeas : similarity function handed to `estMethod`.
    estMethod : callable(dataMat, user, simMeas, item) -> estimated rating.

    Returns
    -------
    A list of at most N (item, score) tuples sorted by descending score, or
    the string 'you rated everything' when the user has no unrated items.
    """
    # Column indices whose rating for this user is 0, i.e. not yet rated.
    candidates = nonzero(dataMat[user, :].A == 0)[1]
    if len(candidates) == 0:
        return 'you rated everything'
    scored = [(item, estMethod(dataMat, user, simMeas, item)) for item in candidates]
    return sorted(scored, key=lambda pair: pair[1], reverse=True)[:N]

In [26]:
myMat = mat([[4, 4, 0, 2, 2],
             [4, 0, 0, 3, 3],
             [4, 0, 0, 1, 1],
             [1, 1, 1, 2, 0],
             [2, 2, 2, 0, 0],
             [1, 1, 1, 0, 0],
             [5, 5, 5, 0, 0]])
myMat

matrix([[4, 4, 0, 2, 2],
        [4, 0, 0, 3, 3],
        [4, 0, 0, 1, 1],
        [1, 1, 1, 2, 0],
        [2, 2, 2, 0, 0],
        [1, 1, 1, 0, 0],
        [5, 5, 5, 0, 0]])

In [31]:
recommend(myMat, 2)

[(2, 2.5), (1, 2.0243290220056256)]

In [32]:
recommend(myMat, 2, simMeas=ecludSim)

[(2, 3.0), (1, 2.8266504712098603)]

In [33]:
recommend(myMat, 2, simMeas=pearsSim)

[(2, 2.5), (1, 2.0)]

### 利用SVD提高推荐的效果

In [37]:
from numpy import linalg as la

In [69]:
myMat = mat([[0, 0, 0, 0, 0, 4, 0, 0, 0, 0, 5],
             [0, 0, 0, 3, 0, 4, 0, 0, 0, 0, 3],
             [0, 0, 0, 0, 4, 0, 0, 1, 0, 4, 0],
             [3, 3, 4, 0, 0, 0, 0, 2, 2, 0, 0],
             [5, 4, 5, 0, 0, 0, 0, 5, 5, 0, 0],
             [0, 0, 0, 0, 5, 0, 1, 0, 0, 5, 0],
             [4, 3, 4, 0, 0, 0, 0, 5, 5, 0, 1],
             [0, 0, 0, 4, 0, 4, 0, 0, 0, 0, 4],
             [0, 0, 0, 2, 0, 2, 5, 0, 0, 1, 2],
             [0, 0, 0, 0, 5, 0, 0, 0, 0, 4, 0],
             [1, 0, 0, 0, 0, 0, 0, 1, 2, 0, 0]])

In [70]:
U, Sigma, VT = la.svd(myMat)
Sigma

array([15.77075346, 11.40670395, 11.03044558,  4.84639758,  3.09292055,
        2.58097379,  1.00413543,  0.72817072,  0.43800353,  0.22082113,
        0.07367823])

In [71]:
# 查看有多少个奇异值能达到总能量的90%
Sig2 = Sigma**2
sum(Sig2)

541.9999999999993

In [72]:
sum(Sig2) * 0.9

487.7999999999994

In [77]:
# The first 3 singular values already carry more than 90% of the total energy (500.5 > 487.8)
sum(Sig2[:3])

500.5002891275791

In [78]:
# 基于SVD的评分估计
def svdEst(dataMat, user, simMeas, item, numSV=4):
    """Estimate `user`'s rating of `item` using SVD-reduced item vectors.

    Items are projected into a low-dimensional space built from the leading
    singular vectors of `dataMat`; item-to-item similarity is then measured in
    that space and used to weight the user's existing ratings.

    Parameters
    ----------
    dataMat : rating matrix (rows = users, columns = items, 0 = unrated).
        Converted with asmatrix, so an ndarray is accepted as well.
    user : row index of the user to estimate for.
    simMeas : callable(colA, colB) -> similarity of two column vectors.
    item : column index of the item to estimate.
    numSV : number of singular values to keep (default 4, the previously
        hard-coded value). It is clamped to the numerical rank of `dataMat`,
        so small or rank-deficient matrices no longer fail or divide by
        near-zero singular values.

    Returns
    -------
    The similarity-weighted average of the user's ratings, or 0 when no
    similarity mass was accumulated (or the matrix has numerical rank 0).

    Raises
    ------
    ValueError
        If numSV is smaller than 1.

    Notes
    -----
    The SVD is recomputed on every call.
    """
    if numSV < 1:
        raise ValueError('numSV must be at least 1')

    dataMat = asmatrix(dataMat)
    n = shape(dataMat)[1]
    simTotal = 0.0
    ratSimTotal = 0.0

    # Singular value decomposition of the rating matrix.
    U, Sigma, VT = la.svd(dataMat)

    # Keep only singular values that are numerically non-zero (same default
    # tolerance rule as numpy.linalg.matrix_rank); inverting the noise-level
    # ones would blow the corresponding coordinates up to meaningless values.
    tol = Sigma.max() * max(shape(dataMat)) * finfo(Sigma.dtype).eps
    rank = int((Sigma > tol).sum())
    k = min(int(numSV), rank)
    if k < 1:
        return 0

    # dataMat.T * U_k * inv(diag(Sigma_k)): dividing column j by Sigma[j] is
    # the same as multiplying by the inverse diagonal matrix.
    xformedItems = (dataMat.T * U[:, :k]) / Sigma[:k]

    target = xformedItems[item, :].T
    for j in range(n):
        userRating = dataMat[user, j]
        if userRating == 0 or j == item:
            continue
        similarity = simMeas(target, xformedItems[j, :].T)
        simTotal += similarity
        ratSimTotal += similarity * userRating
    if simTotal == 0:
        return 0
    return ratSimTotal / simTotal

In [79]:
recommend(myMat, 1, estMethod=svdEst)

[(4, 3.344714938469228), (7, 3.329402072452697), (9, 3.328100876390069)]

In [80]:
recommend(myMat, 1, estMethod=svdEst, simMeas=pearsSim)

[(4, 3.346952186702173), (9, 3.3353796573274703), (6, 3.3071930278130375)]

# 基于SVD的图像压缩

In [92]:
# 输出图像
def printMat(inMat, thresh=0.8):
    """Print a 2-D matrix as rows of 0/1 characters.

    Each element strictly greater than `thresh` is printed as 1, everything
    else as 0; one output line per matrix row.

    Parameters
    ----------
    inMat : 2-D matrix or array of numbers. Any shape is accepted; the loop
        bounds come from the matrix itself rather than a fixed 32x32.
    thresh : threshold separating 0 from 1.
    """
    pixels = asarray(inMat)
    for row in pixels:
        print(''.join('1' if float(value) > thresh else '0' for value in row))

def imgCompress(numSV=3, thresh=0.8, filename='0_5.txt'):
    """Print a 0/1 text image and its low-rank SVD reconstruction.

    Parameters
    ----------
    numSV : number of singular values used for the reconstruction. Values
        larger than the number of available singular values are clamped.
    thresh : threshold passed to printMat when rendering both matrices.
    filename : path of the text image to read. Each line must start with 32
        digit characters. Defaults to '0_5.txt', the previously hard-coded
        file.
    """
    width = 32  # number of leading characters read from every line

    # Read the original image; the context manager closes the file handle
    # instead of leaving that to the garbage collector.
    with open(filename) as imgFile:
        myl = [[int(line[i]) for i in range(width)] for line in imgFile]
    myMat = asmatrix(myl)
    print('****original matrix******')
    printMat(myMat, thresh)

    # Singular value decomposition of the image.
    U, Sigma, VT = la.svd(myMat)
    numSV = min(numSV, len(Sigma))
    SigRecon = asmatrix(diag(Sigma[:numSV]))

    # Rebuild the image from the leading numSV singular triplets and print it.
    reconMat = U[:, :numSV] * SigRecon * VT[:numSV, :]
    print('****reconstructed matrix using %d sigular values******' % numSV)
    printMat(reconMat, thresh)

In [93]:
imgCompress(2)

****original matrix******
00000000000000110000000000000000
00000000000011111100000000000000
00000000000111111110000000000000
00000000001111111111000000000000
00000000111111111111100000000000
00000001111111111111110000000000
00000000111111111111111000000000
00000000111111100001111100000000
00000001111111000001111100000000
00000011111100000000111100000000
00000011111100000000111110000000
00000011111100000000011110000000
00000011111100000000011110000000
00000001111110000000001111000000
00000011111110000000001111000000
00000011111100000000001111000000
00000001111100000000001111000000
00000011111100000000001111000000
00000001111100000000001111000000
00000001111100000000011111000000
00000000111110000000001111100000
00000000111110000000001111100000
00000000111110000000001111100000
00000000111110000000011111000000
00000000111110000000111111000000
00000000111111000001111110000000
00000000011111111111111110000000
00000000001111111111111110000000
00000000001111111111111110000000
00000000000111111