In [17]:
"""Use SVD for Movie recommendation system construction.
"""

# Author: Linlin Chen<2311491603@qq.com>


'Use SVD for Movie recommendation system construction.\n'

In [1]:
import pandas as pd

In [2]:
!ls

[34mals[m[m                        movie_recommendation.ipynb


In [3]:
import os

In [4]:
filepath = os.path.join('./', 'als/')

In [5]:
filepath

'./als/'

In [6]:
!ls als

[34mml-100k[m[m                      sample_movielens_movies.txt
ml-100k.zip                  sample_movielens_ratings.txt
movie_recommendation.ipynb   test.data


In [7]:
movie_name = 'sample_movielens_movies.txt'
ratings_name = 'sample_movielens_ratings.txt'

# 数据洞察

In [8]:
!head -10 ./als/sample_movielens_movies.txt

0::Movie 0::Romance|Comedy
1::Movie 1::Action|Anime
2::Movie 2::Romance|Thriller
3::Movie 3::Action|Romance
4::Movie 4::Anime|Comedy
5::Movie 5::Action|Action
6::Movie 6::Action|Comedy
7::Movie 7::Anime|Comedy
8::Movie 8::Comedy|Action
9::Movie 9::Anime|Thriller


In [9]:
!head -3 ./als/sample_movielens_ratings.txt

0::2::3::1424380312
0::3::1::1424380312
0::5::2::1424380312


 userId |   movieId |   rating |   timestamp |

In [10]:
movie_cols = ['index', 'movie', 'movie_type']
ratings_cols = ['userId', 'movieId', 'rating','timestamp']

# 读取数据

In [12]:
movies = pd.read_table(filepath+movie_name, sep='::', names=movie_cols)
ratings = pd.read_table(filepath+ratings_name, sep='::',names = ratings_cols)

  """Entry point for launching an IPython kernel.
  """Entry point for launching an IPython kernel.
  
  


In [15]:
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"
movies.head(3)
ratings.head(3)

Unnamed: 0,index,movie,movie_type
0,0,Movie 0,Romance|Comedy
1,1,Movie 1,Action|Anime
2,2,Movie 2,Romance|Thriller


Unnamed: 0,userId,movieId,rating,timestamp
0,0,2,3,1424380312
1,0,3,1,1424380312
2,0,5,2,1424380312


# 随机采样

In [230]:
def train_test_split(data, split_col='userId', ratio=0.8):
    # 对于同一个用户按照时间戳选取前20%的数据
    test_df = pd.DataFrame()
    train_df = pd.DataFrame()
    # 取出所有的userId唯一值
    unq_userId= ratings[split_col].unique()

    for user in unq_userId:
        user_ratings = ratings[ratings[split_col]==user].sort_values(by='timestamp',ascending = True)
        # 取出前80%作为训练集
        bk_point = int(ratio * len(user_ratings))
        test_df = test_df.append(user_ratings.iloc[bk_point:, :])
        train_df = train_df.append(user_ratings.iloc[:bk_point, :])
    return train_df, test_df

In [231]:
train_df, test_df = train_test_split(ratings)

In [232]:
# 已经正确切分

# 将评分表转为矩阵

In [238]:
def trans_utility(data):
    """
        :param data:      Array-like, 2D, nx3
        :return:          utility matrix (n x m), n=users, m=items
    """
    # 取出3列值
    movie_list  = data['movieId']
    user_list   = data['userId']
    rating_list = data['rating']

    movie_unq = movie_list.unique().tolist()
    user_unq  = user_list.unique().tolist()

    # 产生对应字典
    movie_index = {movie_unq[i]:i for i in range(len(movie_unq))}
    user_index  = {user_unq[i]:i for i in range(len(user_unq))}

    pd_dict = {movie_index[movie]:[np.nan for user in user_unq] for movie in movie_unq}
    dict_df = pd.DataFrame(pd_dict)
    
    # 填充相应的值
    for i in range(len(data)):
        mov = movie_list.iloc[i]
        user = user_list.iloc[i]
        rat = rating_list.iloc[i]
        # 开始填充
        dict_df.loc[user_index[user], movie_index[mov]] = rat
    return dict_df, movie_index, user_index

In [239]:
dict_df, movie_index, user_index=trans_utility(train_df)
dict_df.head(10)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,87,88,89,90,91,92,93,94,95,96
0,3.0,1.0,1.0,1.0,2.0,2.0,1.0,1.0,1.0,1.0,...,,,,,,,,,,
1,2.0,,1.0,,,,,1.0,4.0,,...,,,,,,,,,,
2,,,1.0,1.0,1.0,1.0,1.0,,1.0,,...,,,,,,,,,,
3,1.0,5.0,,,,,,1.0,2.0,,...,,,,,,,,,,
4,,1.0,1.0,1.0,,1.0,,2.0,,1.0,...,,,,,,,,,,
5,,1.0,1.0,5.0,,1.0,3.0,,4.0,,...,,,,,,,,,,
6,3.0,,1.0,1.0,1.0,3.0,,3.0,1.0,1.0,...,,,,,,,,,,
7,2.0,,1.0,1.0,1.0,2.0,,3.0,,,...,,,,,,,,,,
8,4.0,,1.0,1.0,,,1.0,3.0,,1.0,...,3.0,,,,,,,,,
9,3.0,,1.0,,1.0,1.0,3.0,,1.0,1.0,...,,,,,,,,,,


In [96]:
# 填充完毕

In [241]:
def svd(data,k=20):
    """
        :data:    Array-like, 2D
        :k:       integer, scalar
    """
    masked_isnan = np.isnan(data)# 找出矩阵中需要覆盖的位置，标记为True
    # 转换成遮掩型
    masked_df = np.ma.masked_array(data, masked_isnan) 
    # 找出每个movie的平均得分
    mean_mat = data.mean(axis=0)

    #空值填充为0值
    result_df = pd.DataFrame(masked_df.filled(mean_mat))
    # 在进行svd分解之前，对矩阵减去均值，做平移
    mean_arr = np.tile(mean_mat,(result_df.shape[0], 1))
    cor_mat = result_df - mean_arr

    # svd分解
    U, s, V = np.linalg.svd(cor_mat, full_matrices=True)
    
    # 只取前k个重要性的特征
    sk = np.diag(s[:k])
    U = U[:, :k]
    V = V[:k, :]
    # 计算转换后的矩阵
    Usk = np.dot(U, sk)
    skV = np.dot(sk, V)
    UV = np.dot(Usk, skV)

    UV = UV + mean_arr
    return UV

In [242]:
UV = svd(dict_df, 20)

In [243]:
UV.shape

(30, 97)

In [246]:
pd.DataFrame(UV)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,87,88,89,90,91,92,93,94,95,96
0,12.368125,-6.563901,-4.577434,-3.903894,4.93861,5.981257,-6.601067,0.852916,-7.265565,-10.129897,...,1.770377,1.0,1.266617,2.995258,2.128136,1.0,1.0,2.945064,3.420375,1.0
1,1.761685,2.325236,1.931857,3.788516,2.704349,1.731647,2.556805,-0.74182,15.36883,0.464708,...,1.545285,1.0,1.835151,2.174108,1.984871,1.0,1.0,2.934507,2.204663,1.0
2,6.417049,-1.0879,-2.735745,-8.238952,-5.682709,0.079751,-10.013516,2.829589,-14.083638,1.499743,...,0.893272,1.0,1.383533,1.330106,6.002402,1.0,1.0,2.779869,3.840229,1.0
3,-7.901491,27.764834,5.532773,-1.433305,1.287353,1.350846,1.653862,-4.695451,5.002571,7.499631,...,0.447158,1.0,1.32349,0.432601,3.580974,1.0,1.0,3.436387,2.751972,1.0
4,5.794758,-3.930787,-1.895524,-11.115784,3.006153,1.022597,-0.024621,7.340196,-0.515563,-2.402972,...,3.216131,1.0,1.150739,1.789599,3.620637,1.0,1.0,3.495697,3.641099,1.0
5,-3.05883,-8.796979,-1.628034,33.158568,0.209289,0.048536,14.943723,-4.18716,19.319633,-3.47619,...,-0.878406,1.0,1.92192,2.942885,-0.934193,1.0,1.0,4.865332,3.279386,1.0
6,12.151625,-4.913739,-7.210691,-11.839328,-0.92224,12.544467,-3.049693,16.133618,-14.780288,-7.079521,...,4.101807,1.0,1.291007,3.396866,2.253882,1.0,1.0,3.81685,3.700269,1.0
7,5.095118,-1.220922,-0.107783,-8.686806,-2.823421,11.195258,1.475823,15.773453,-3.261803,-2.904622,...,2.55284,1.0,1.521507,3.167338,3.693289,1.0,1.0,1.282258,1.31816,1.0
8,24.780805,0.979759,-5.045674,-11.087572,3.732253,5.154311,-8.030501,15.080938,-5.065456,-7.098555,...,11.464508,1.0,2.306776,3.653871,1.470837,1.0,1.0,3.86756,1.190558,1.0
9,8.773143,-1.046071,-3.325808,4.557882,-0.175419,-0.631253,13.351734,1.299047,-5.857147,-9.386083,...,1.727193,1.0,1.585357,2.903524,0.17073,1.0,1.0,3.346659,3.009519,1.0


In [151]:
masked_isnan = np.isnan(dict_df)# 找出矩阵中需要覆盖的位置，标记为True
# 转换成遮掩型
masked_df = np.ma.masked_array(dict_df, masked_isnan) 
# 找出每个movie的平均得分
mean_mat = dict_df.mean(axis=0)

#空值填充为0值
result_df = pd.DataFrame(masked_df.filled(mean_mat))

In [152]:
result_df

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,87,88,89,90,91,92,93,94,95,96
0,3.0,1.0,1.0,1.0,2.0,2.0,1.0,1.0,1.0,1.0,...,1.5,1.0,1.5,2.5,1.714286,1.0,1.0,3.5,2.666667,1.0
1,2.0,1.875,1.0,1.789474,1.461538,1.266667,1.933333,1.0,4.0,2.076923,...,1.5,1.0,1.5,2.5,1.714286,1.0,1.0,3.5,2.666667,1.0
2,2.105263,1.875,1.0,1.0,1.0,1.0,1.0,1.647059,1.0,2.076923,...,1.5,1.0,1.5,2.5,1.714286,1.0,1.0,3.5,2.666667,1.0
3,1.0,5.0,1.4375,1.789474,1.461538,1.266667,1.933333,1.0,2.0,2.076923,...,1.5,1.0,1.5,2.5,1.714286,1.0,1.0,3.5,2.666667,1.0
4,2.105263,1.0,1.0,1.0,1.461538,1.0,1.933333,2.0,2.157895,1.0,...,1.5,1.0,1.5,2.5,1.714286,1.0,1.0,3.5,2.666667,1.0
5,2.105263,1.0,1.0,5.0,1.461538,1.0,3.0,1.647059,4.0,2.076923,...,1.5,1.0,1.5,2.5,1.714286,1.0,1.0,3.5,2.666667,1.0
6,3.0,1.875,1.0,1.0,1.0,3.0,1.933333,3.0,1.0,1.0,...,1.5,1.0,1.5,2.5,1.714286,1.0,1.0,3.5,2.666667,1.0
7,2.0,1.875,1.0,1.0,1.0,2.0,1.933333,3.0,2.157895,2.076923,...,1.5,1.0,1.5,2.5,1.714286,1.0,1.0,3.5,2.666667,1.0
8,4.0,1.875,1.0,1.0,1.461538,1.266667,1.0,3.0,2.157895,1.0,...,3.0,1.0,1.5,2.5,1.714286,1.0,1.0,3.5,2.666667,1.0
9,3.0,1.875,1.0,1.789474,1.0,1.0,3.0,1.647059,1.0,1.0,...,1.5,1.0,1.5,2.5,1.714286,1.0,1.0,3.5,2.666667,1.0


In [158]:
# 在进行svd分解之前，对矩阵减去均值，做平移
mean_arr = np.tile(mean_mat,(result_df.shape[0], 1))
cor_mat = result_df - mean_arr

# svd分解
U, s, V = np.linalg.svd(cor_mat, full_matrices=True)

# 评估

In [247]:
def rmse(pred, true_val):
    x = pred - true_val
    return  np.mean(np.power(x,2))

In [248]:
# 给定相应的user和item
for _, row in test_df.iterrows():
    user = row['userId']
    movie = row['movieId']
    
    # 存储结果
    pred = []
    
    if movie in movie_index:
        score = UV[user_index[user], movie_index[movie]]
        pred.append(score)
    else:
        score = UV[user_index[user], :].mean()
        pred.append(score)


In [249]:
true_val = test_df['rating']

In [250]:
rmse(pred, true_val.values)

1.5278939289576474

rmse的分数大概是1.44

# 问题

1. 最后得出了对角矩阵后，为什么要开方
2. 最后的矩阵相乘的分解式是否是满足定律？
3. 在进行svd之前减去平均值的平移方式是否必要？