In [1]:
import numpy as np 
import pandas as pd 
from surprise import Reader
from surprise import Dataset
from surprise import BaselineOnly
from surprise import accuracy
from surprise.model_selection import KFold
from collections import deque
from surprise import dump
import gc

In [2]:
# Build the training sample: parse each combined_data_{i}.txt shard, attach a
# MovieId column to every rating row, and keep a seeded 20% sample per shard.
sampled_parts = []
for i in range(1, 5):
    data1 = pd.read_csv(f'../input/netflix-prize-data/combined_data_{i}.txt', header=None, names=['UserId', 'Rating'], usecols=[0, 1])

    # Rows with a missing Rating are the movie headers ("<movie_id>:"); the
    # rows that follow, up to the next header, are that movie's ratings.
    is_header = data1['Rating'].isna()
    header_pos = np.flatnonzero(is_header.to_numpy())
    header_ids = data1.loc[is_header, 'UserId'].str[:-1].astype(int).to_numpy()

    # Rating rows per movie = distance to the next header (or to the end of
    # the shard for the last movie) minus the header row itself.
    block_lengths = np.diff(np.append(header_pos, len(data1))) - 1

    # One vectorised repeat instead of growing an array with np.append per
    # movie, which recopies the whole array on every iteration.
    movie_ids = np.repeat(header_ids, block_lengths)

    # Drop the header rows and attach the movie id to the remaining ratings.
    data1 = data1.dropna().assign(MovieId=movie_ids)

    # Randomly sample 20% of each shard for training (fixed seed).
    sampled_parts.append(data1.sample(frac=0.20, replace=False, random_state=1))

df = pd.concat(sampled_parts)
# The per-shard samples are now duplicated inside df; release them.
del sampled_parts
print('随机抽取20%后数据集大小',df.shape)

随机抽取20%后数据集大小 (20096102, 3)


In [3]:
# Free memory: the last shard's frame is no longer needed once its sample is in df.
del data1
gc.collect()

0

In [4]:
# Probe set: read probe.txt and attach a MovieId to every user row.
probe = pd.read_csv('../input/netflix-prize-data/probe.txt',header=None,names=['UserId'], usecols=[0])

# Header rows contain ':' ("<movie_id>:"); every other row is a user id that
# belongs to the most recent header above it.
is_header = probe['UserId'].str.contains(':')

# Keep the numeric part of each header, leave NaN on user rows, then
# forward-fill so each user row inherits its movie id. This replaces a
# per-header .loc slice loop and yields an integer MovieId, matching the
# integer MovieId key in df for the merge below.
header_movie_id = pd.to_numeric(probe['UserId'].where(is_header).str[:-1])
probe['MovieId'] = header_movie_id.ffill().astype(int)

# Drop the header rows; only (UserId, MovieId) pairs remain.
probe = probe[~is_header]

# Left-join the sampled ratings; pairs missing from the sample get Rating NaN.
probe = probe.merge(df, on=['UserId', 'MovieId'], how='left')

# Keep only probe pairs whose rating is present in the sampled data.
sub_probe = probe[probe['Rating'].notna()]
print('probe对应抽取训练数据的子集大小',sub_probe.shape)

probe对应抽取训练数据的子集大小 (281193, 3)


In [5]:
# Wrap the sampled ratings in a Surprise dataset, declaring a 1-5 rating scale.
rating_columns = ['UserId', 'MovieId', 'Rating']
reader = Reader(rating_scale=(1, 5))
data = Dataset.load_from_df(df[rating_columns], reader)

In [6]:
# Free memory: drop this notebook's reference to the sampled ratings frame
# now that `data` has been built from it.
del df
gc.collect()

19

In [7]:
# Baseline-only model, with biases estimated by SGD for 5 epochs.
bsl_options = {'method': 'sgd','n_epochs': 5}
algo = BaselineOnly(bsl_options=bsl_options)

# 3-fold cross-validation iterator. The seed is fixed so the fold assignment
# (and therefore the reported RMSEs) is the same on every Restart & Run All,
# consistent with the seeded sampling used to build the data.
kf = KFold(n_splits=3, random_state=1)

for trainset, testset in kf.split(data):

    # Refit the same estimator on this fold's training split, then predict
    # the held-out split. After the loop, `algo` holds the fit from the
    # final fold only.
    algo.fit(trainset)
    predictions = algo.test(testset)

    # Compute and print the Root Mean Squared Error for this fold.
    accuracy.rmse(predictions, verbose=True)

Estimating biases using sgd...
RMSE: 0.9450
Estimating biases using sgd...
RMSE: 0.9443
Estimating biases using sgd...
RMSE: 0.9442


In [8]:
# Free memory: drop the cross-validation dataset and the last fold's splits.
del data,trainset, testset
gc.collect()

0

In [10]:
# Evaluate the fitted model on the probe subset.
probe_ratings = sub_probe[['UserId', 'MovieId', 'Rating']]
probe_data = Dataset.load_from_df(probe_ratings, reader)

# Build a full trainset from the probe rows, then turn it back into a testset
# so that every probe rating is passed to the model for prediction.
trainset = probe_data.build_full_trainset()
probe_testset = trainset.build_testset()
predictions = algo.test(probe_testset)

# NOTE(review): sub_probe only contains pairs found in the sampled frame that
# the cross-validation data was built from, so this does not look like a
# held-out evaluation — confirm that this is intended.
# Compute the RMSE; it is printed and is also the cell's displayed value.
accuracy.rmse(predictions,verbose=True)

RMSE: 1.0012


1.001233254928213