In [1]:
# Created on Feb 2020
# Author: 임일
import os
import numpy as np
import pandas as pd

# 필요한 Surprise 알고리즘 불러오기
from surprise import BaselineOnly 
from surprise import KNNWithMeans, KNNBaseline, KNNBasic
from surprise import SVD
from surprise import SVDpp
from surprise import NMF
from surprise import Dataset
from surprise import accuracy
from surprise import Reader
from surprise.model_selection import cross_validate
from surprise.model_selection import train_test_split
from surprise.model_selection import GridSearchCV

- DF Load

In [3]:
# Root data directory; the listing below shows which files and folders it contains.
path = './raw_data'
os.listdir(path)

['amazon',
 'amazon_testset.csv',
 'amazon_trainset.csv',
 'ml-100k',
 'model',
 'rotten_testset.csv',
 'rotten_tomato',
 'rotten_trainset.csv']

In [4]:
# Load the Amazon train and test CSV files from the data directory.
trainset = pd.read_csv(os.path.join(path, 'amazon_trainset.csv'), encoding='utf-8')
testset = pd.read_csv(os.path.join(path, 'amazon_testset.csv'), encoding='utf-8')

In [5]:
trainset.shape

(160000, 9)

In [6]:
testset.shape

(35947, 9)

In [7]:
trainset.head(1)

Unnamed: 0,user_id,movie_id,review,rating,review_date,review_all,helpful,sentiment,emotion
0,104555,30037,They did it again!,5.0,"08 12, 2013",That '70s Show hit a few bumps with this seaso...,"[0, 0]",2,0


In [9]:
# Keep only the user / item / rating columns, in that order, for both splits.
rating_columns = ['user_id', 'movie_id', 'rating']
trainset = trainset[rating_columns]
testset = testset[rating_columns]

- Surprise 데이터 형태 변형

In [10]:
# Declare the rating scale as 1 to 5.
reader = Reader(rating_scale=(1,5))
# Wrap the training frame (reduced to user_id, movie_id, rating above) as a Surprise dataset.
data = Dataset.load_from_df(trainset, reader)
# Build the trainset from all rows of `data`; the test data comes from the separate test file.
surprise_trainset = data.build_full_trainset()

In [11]:
len(trainset)

160000

In [12]:
len(testset)

35947

In [13]:
# Turn the test frame into a list of per-row records (index excluded);
# this list is what the cells below pass to algo.test().
records = testset.to_records(index=False)
surprise_testset = [record for record in records]

### 0. Timer

In [14]:
import time
import datetime

In [15]:
def format_time(elapsed):
    """Format a duration given in seconds as a string such as ``0:01:05``.

    Args:
        elapsed: duration in seconds (int or float).

    Returns:
        The string form of a ``datetime.timedelta`` built from ``elapsed``
        rounded to whole seconds.
    """
    # Round to whole seconds first so no fractional part appears in the text.
    duration = datetime.timedelta(seconds=int(round(elapsed)))
    return f"{duration}"

In [16]:
# Smoke test of the timer helper: nothing runs between the two clock reads,
# so the printed duration is effectively zero.
start_time = time.time()
print("  Training epoch took: {:}".format(format_time(time.time() - start_time)))

  Training epoch took: 0:00:00


# 1. CF

- k = 10, 15, 25
- name = pearson_baseline, cosine

In [19]:
def cf_algorithms(k_num, name, user_based, save_dir=None):
    """Fit a KNNWithMeans model and evaluate it on the test set.

    Reads the notebook-level ``surprise_trainset`` and ``surprise_testset``.
    The RMSE is printed by ``accuracy.rmse`` and the elapsed wall-clock time
    is printed at the end.

    Args:
        k_num: value passed to ``KNNWithMeans`` as ``k``.
        name: similarity measure name stored in ``sim_options['name']``
            (this notebook uses 'pearson_baseline' and 'cosine').
        user_based: stored in ``sim_options['user_based']``.
        save_dir: optional output directory. When given, the predicted
            ratings are written to
            ``KNNWithMeans_{user_based}_{k_num}_{name}.csv`` inside it.
            With the default ``None`` no file is written.

    Returns:
        The test RMSE as returned by ``accuracy.rmse``.
    """
    start_time = time.time()

    sim_options = {'name': str(name), 'user_based': user_based}

    algo = KNNWithMeans(k=k_num, sim_options=sim_options)
    algo.fit(surprise_trainset)
    preds = algo.test(surprise_testset)
    rmse = accuracy.rmse(preds)

    if save_dir is not None:
        # One predicted rating per test row, in test-set order.
        pred_df = pd.DataFrame({'test_pred_1': [pred.est for pred in preds]})
        file_name = f"KNNWithMeans_{user_based}_{k_num}_{name}.csv"
        os.makedirs(save_dir, exist_ok=True)
        pred_df.to_csv(os.path.join(save_dir, file_name))
        print(f"{file_name} 생성 완료!")

    # The timer was started above but its result was never reported.
    print("  Training epoch took: {:}".format(format_time(time.time() - start_time)))
    return rmse

### Pearson_baseline

In [23]:
cf_algorithms(k_num=10, name='pearson_baseline', user_based=True)

Estimating biases using als...
Computing the pearson_baseline similarity matrix...
Done computing similarity matrix.
RMSE: 1.8781
KNNWithMeans_True_10_pearson_baseline.csv 생성 완료!
  Training epoch took: 0:00:01


In [24]:
cf_algorithms(k_num=25, name='pearson_baseline', user_based=True)

Estimating biases using als...
Computing the pearson_baseline similarity matrix...
Done computing similarity matrix.
RMSE: 1.8724
KNNWithMeans_True_25_pearson_baseline.csv 생성 완료!


In [20]:
cf_algorithms(k_num=10, name='pearson_baseline', user_based=False)

Estimating biases using als...
Computing the pearson_baseline similarity matrix...
Done computing similarity matrix.
RMSE: 1.1412


In [22]:
cf_algorithms(k_num=15, name='pearson_baseline', user_based=False)

Estimating biases using als...
Computing the pearson_baseline similarity matrix...
Done computing similarity matrix.
RMSE: 1.1405


In [21]:
cf_algorithms(k_num=25, name='pearson_baseline', user_based=False)

Estimating biases using als...
Computing the pearson_baseline similarity matrix...
Done computing similarity matrix.
RMSE: 1.1403


### Cosine

In [23]:
cf_algorithms(k_num=10, name='cosine', user_based=True)

Computing the cosine similarity matrix...
Done computing similarity matrix.
RMSE: 1.0579


In [24]:
cf_algorithms(k_num=15, name='cosine', user_based=True)

Computing the cosine similarity matrix...
Done computing similarity matrix.
RMSE: 1.0547


In [25]:
cf_algorithms(k_num=25, name='cosine', user_based=True)

Computing the cosine similarity matrix...
Done computing similarity matrix.
RMSE: 1.0528


In [None]:
cf_algorithms(k_num=10, name='cosine', user_based=False)

Computing the cosine similarity matrix...


In [None]:
cf_algorithms(k_num=15, name='cosine', user_based=False)

In [None]:
cf_algorithms(k_num=25, name='cosine', user_based=False)

# 2. SVD

- lr 0.005

In [None]:
start_time = time.time()

pred_list = list()  # predicted ratings, one inner list per fitted model (one model here)
rmse_list = list()  # test RMSE per fitted model
mid_time = time.time()  # not read anywhere in this cell

# SVD training is stochastic; a fixed random_state makes the RMSE reproducible
# and comparable with the other learning-rate runs.
algo = SVD(lr_all=0.005, random_state=42)
algo.fit(surprise_trainset)
preds = algo.test(surprise_testset)
ratings = [pred.est for pred in preds]  # predicted rating for each test row
pred_list.append(ratings)
rmse_list.append(accuracy.rmse(preds))  # accuracy.rmse prints the RMSE; its return value is stored

print("  Training epoch took: {:}".format(format_time(time.time() - start_time)))

- lr 0.006

In [None]:
start_time = time.time()

pred_list = list()  # predicted ratings, one inner list per fitted model (one model here)
rmse_list = list()  # test RMSE per fitted model
mid_time = time.time()  # not read anywhere in this cell

# SVD training is stochastic; a fixed random_state makes the RMSE reproducible
# and comparable with the other learning-rate runs.
algo = SVD(lr_all=0.006, random_state=42)
algo.fit(surprise_trainset)
preds = algo.test(surprise_testset)
ratings = [pred.est for pred in preds]  # predicted rating for each test row
pred_list.append(ratings)
rmse_list.append(accuracy.rmse(preds))  # accuracy.rmse prints the RMSE; its return value is stored

print("  Training epoch took: {:}".format(format_time(time.time() - start_time)))

- lr 0.007

In [None]:
start_time = time.time()

pred_list = list()  # predicted ratings, one inner list per fitted model (one model here)
rmse_list = list()  # test RMSE per fitted model
mid_time = time.time()  # not read anywhere in this cell

svd_lr = 0.007
# SVD training is stochastic; a fixed random_state makes the RMSE reproducible
# and comparable with the other learning-rate runs.
algo = SVD(lr_all=svd_lr, random_state=42)
algo.fit(surprise_trainset)
preds = algo.test(surprise_testset)
ratings = [pred.est for pred in preds]  # predicted rating for each test row
pred_list.append(ratings)
rmse_list.append(accuracy.rmse(preds))  # accuracy.rmse prints the RMSE; its return value is stored

pred_df = pd.DataFrame(np.array(pred_list).T, columns=['test_pred_1'])
# The original name was the placeholder-less f-string "SVD_0..csv"; embed the
# learning rate so the file is written as SVD_0.007.csv.
# NOTE(review): anything that reads the old "SVD_0..csv" name must be updated.
os.makedirs("./save_surprise", exist_ok=True)  # to_csv does not create missing directories
pred_df.to_csv(f"./save_surprise/SVD_{svd_lr}.csv")

print("  Training epoch took: {:}".format(format_time(time.time() - start_time)))