In [3]:
# Created on Feb 2020
# Author: 임일
import os
import numpy as np
import pandas as pd

# 필요한 Surprise 알고리즘 불러오기
from surprise import BaselineOnly 
from surprise import KNNWithMeans, KNNBaseline, KNNBasic
from surprise import SVD
from surprise import SVDpp
from surprise import NMF
from surprise import Dataset
from surprise import accuracy
from surprise import Reader
from surprise.model_selection import cross_validate
from surprise.model_selection import train_test_split
from surprise.model_selection import GridSearchCV

- DF Load

In [4]:
# Directory that holds the train/test CSV files used below
path = './data'
# List the directory contents to confirm the expected files are present
os.listdir(path)

['.ipynb_checkpoints',
 'l_testset.csv',
 'l_trainset.csv',
 'l_testset_removed.csv']

In [5]:
# Read the training and test splits from UTF-8 encoded CSV files under `path`
trainset = pd.read_csv(os.path.join(path, 'l_trainset.csv'), encoding='utf-8')
testset = pd.read_csv(os.path.join(path, 'l_testset.csv'), encoding='utf-8')

In [6]:
# (rows, columns) of the training frame
trainset.shape

(118880, 11)

In [7]:
# (rows, columns) of the test frame
testset.shape

(29120, 11)

In [10]:
# Preview the first row to inspect the available columns
trainset.head(1)

Unnamed: 0,user_id,movie_id,review_score,sentiment,emotion,review_date,review_content,origin_rating_5,rating_5,origin_rating_10,rating_10
0,9296,7594,0.6,4,2,1800-01-01,It's exciting to see a British horror film wit...,3.4,3,6.4,6


In [11]:
# Keep only the (user, item, rating) columns, in that order, for both splits.
trainset, testset = (
    frame[['user_id', 'movie_id', 'rating_10']]
    for frame in (trainset, testset)
)

- Surprise 데이터 형태 변형

In [12]:
# Declare the rating scale as 1-10 for Surprise
reader = Reader(rating_scale=(1,10))
# Wrap the training DataFrame as a Surprise Dataset
data = Dataset.load_from_df(trainset, reader)
# Build the trainset from the full dataset (no hold-out split is made here)
surprise_trainset = data.build_full_trainset()

In [13]:
# Number of rows in the training frame
len(trainset)

118880

In [14]:
# Number of rows in the test frame
len(testset)

29120

In [15]:
# Convert the test frame to records (index dropped), one record per row
records = testset.to_records(index=False)
# Materialise as a list; this is what gets passed to algo.test() later on
surprise_testset = list(records)

### 0. Timer

In [16]:
import time
import datetime

In [17]:
# Helper for displaying elapsed time
def format_time(elapsed):
    """Render a duration given in seconds as an ``h:mm:ss`` string.

    Args:
        elapsed: Duration in seconds (int or float).

    Returns:
        ``str(datetime.timedelta)`` of the duration rounded to whole seconds.
    """
    # Round to the nearest whole second before building the timedelta
    whole_seconds = int(round(elapsed))
    duration = datetime.timedelta(seconds=whole_seconds)
    return str(duration)

In [18]:
# Smoke test of the timer helper: measure a near-zero span and print it formatted
start_time = time.time()
print("  Training epoch took: {:}".format(format_time(time.time() - start_time)))

  Training epoch took: 0:00:00


# 1. CF

- k = 10, 15, 25
- name = pearson_baseline, cosine
- user_based = True, False

In [22]:
def cf_algorithms(k_num, name, user_based):
    """Fit a KNNWithMeans model, report its test RMSE and save the predictions.

    The model is trained on the notebook-level ``surprise_trainset`` and
    evaluated on ``surprise_testset``. The estimated ratings are written to
    ``./save_regression/KNNWithMeans_{user_based}_{k_num}_{name}.csv`` as a
    single column named ``test_pred_1``.

    Args:
        k_num: Value passed to ``KNNWithMeans`` as ``k``.
        name: Similarity measure, passed as ``sim_options['name']``.
        user_based: Passed as ``sim_options['user_based']``.

    Returns:
        None. The RMSE, the saved file name and the elapsed time are printed.
    """
    start_time = time.time()

    sim_options = {'name': str(name), 'user_based': user_based}

    algo = KNNWithMeans(k=k_num, sim_options=sim_options)
    algo.fit(surprise_trainset)
    preds = algo.test(surprise_testset)

    # accuracy.rmse prints the "RMSE: ..." line itself
    accuracy.rmse(preds)

    # One estimated rating per test row
    pred_df = pd.DataFrame({'test_pred_1': [pred.est for pred in preds]})

    # Create the output directory on first use so to_csv cannot fail on a
    # missing folder
    out_dir = "./save_regression"
    os.makedirs(out_dir, exist_ok=True)
    file_name = f"KNNWithMeans_{user_based}_{k_num}_{name}.csv"
    pred_df.to_csv(os.path.join(out_dir, file_name))
    print(f"{file_name} 생성 완료!")

    # Report the wall-clock time that start_time was captured for
    print("  Training epoch took: {:}".format(format_time(time.time() - start_time)))

### Pearson_baseline

In [23]:
# User-based KNNWithMeans: k=10, pearson_baseline similarity
cf_algorithms(k_num=10, name='pearson_baseline', user_based=True)

Estimating biases using als...
Computing the pearson_baseline similarity matrix...
Done computing similarity matrix.
RMSE: 1.8781
KNNWithMeans_True_10_pearson_baseline.csv 생성 완료!
  Training epoch took: 0:00:01


In [24]:
# User-based KNNWithMeans: k=25, pearson_baseline similarity
cf_algorithms(k_num=25, name='pearson_baseline', user_based=True)

Estimating biases using als...
Computing the pearson_baseline similarity matrix...
Done computing similarity matrix.
RMSE: 1.8724
KNNWithMeans_True_25_pearson_baseline.csv 생성 완료!


In [25]:
# Item-based (user_based=False) KNNWithMeans: k=10, pearson_baseline similarity
cf_algorithms(k_num=10, name='pearson_baseline', user_based=False)

Estimating biases using als...
Computing the pearson_baseline similarity matrix...
Done computing similarity matrix.
RMSE: 1.7578
KNNWithMeans_False_10_pearson_baseline.csv 생성 완료!


In [26]:
# Item-based (user_based=False) KNNWithMeans: k=25, pearson_baseline similarity
cf_algorithms(k_num=25, name='pearson_baseline', user_based=False)

Estimating biases using als...
Computing the pearson_baseline similarity matrix...
Done computing similarity matrix.
RMSE: 1.7196
KNNWithMeans_False_25_pearson_baseline.csv 생성 완료!


### Cosine

In [27]:
# User-based KNNWithMeans: k=10, cosine similarity
cf_algorithms(k_num=10, name='cosine', user_based=True)

Computing the cosine similarity matrix...
Done computing similarity matrix.
RMSE: 1.8172
KNNWithMeans_True_10_cosine.csv 생성 완료!


In [28]:
# User-based KNNWithMeans: k=15, cosine similarity
cf_algorithms(k_num=15, name='cosine', user_based=True)

Computing the cosine similarity matrix...
Done computing similarity matrix.
RMSE: 1.8069
KNNWithMeans_True_15_cosine.csv 생성 완료!


In [29]:
# User-based KNNWithMeans: k=25, cosine similarity
cf_algorithms(k_num=25, name='cosine', user_based=True)

Computing the cosine similarity matrix...
Done computing similarity matrix.
RMSE: 1.8013
KNNWithMeans_True_25_cosine.csv 생성 완료!


In [30]:
# Item-based (user_based=False) KNNWithMeans: k=10, cosine similarity
cf_algorithms(k_num=10, name='cosine', user_based=False)

Computing the cosine similarity matrix...
Done computing similarity matrix.
RMSE: 1.7386
KNNWithMeans_False_10_cosine.csv 생성 완료!


In [31]:
# Item-based (user_based=False) KNNWithMeans: k=15, cosine similarity
cf_algorithms(k_num=15, name='cosine', user_based=False)

Computing the cosine similarity matrix...
Done computing similarity matrix.
RMSE: 1.7212
KNNWithMeans_False_15_cosine.csv 생성 완료!


In [32]:
# Item-based (user_based=False) KNNWithMeans: k=25, cosine similarity
cf_algorithms(k_num=25, name='cosine', user_based=False)

Computing the cosine similarity matrix...
Done computing similarity matrix.
RMSE: 1.7064
KNNWithMeans_False_25_cosine.csv 생성 완료!


# SVD

- lr 0.005

In [None]:
# SVD run with lr_all=0.005: fit on the full trainset, score the test set,
# print the RMSE and the elapsed time.
start_time = time.time()
mid_time = time.time()

algo = SVD(lr_all=0.005)
algo.fit(surprise_trainset)
preds = algo.test(surprise_testset)

# Estimated rating for every test row
ratings = [pred.est for pred in preds]

# Single-entry containers for this run's predictions and RMSE
# (accuracy.rmse also prints the "RMSE: ..." line)
pred_list = [ratings]
rmse_list = [accuracy.rmse(preds)]

elapsed_text = format_time(time.time() - start_time)
print("  Training epoch took: {:}".format(elapsed_text))

- lr 0.006

In [None]:
# SVD run with lr_all=0.006: fit on the full trainset, score the test set,
# print the RMSE and the elapsed time.
start_time = time.time()
mid_time = time.time()

algo = SVD(lr_all=0.006)
algo.fit(surprise_trainset)
preds = algo.test(surprise_testset)

# Estimated rating for every test row
ratings = [pred.est for pred in preds]

# Single-entry containers for this run's predictions and RMSE
# (accuracy.rmse also prints the "RMSE: ..." line)
pred_list = [ratings]
rmse_list = [accuracy.rmse(preds)]

elapsed_text = format_time(time.time() - start_time)
print("  Training epoch took: {:}".format(elapsed_text))

- lr 0.007

In [None]:
# SVD run with lr_all=0.007: fit on the full trainset, score the test set,
# save the predictions, and print the RMSE and the elapsed time.
start_time = time.time()

# Learning rate for this run; also used to name the output file
learning_rate = 0.007

algo = SVD(lr_all=learning_rate)
algo.fit(surprise_trainset)
preds = algo.test(surprise_testset)
ratings = [pred.est for pred in preds]  # estimated rating for every test row

# Single-entry containers for this run's predictions and RMSE
# (accuracy.rmse also prints the "RMSE: ..." line)
pred_list = [ratings]
rmse_list = [accuracy.rmse(preds)]

pred_df = pd.DataFrame(np.array(pred_list).T, columns=['test_pred_1'])

# The original path was the truncated "SVD_0..csv"; name the file after the
# learning rate instead (-> SVD_0.007.csv) and make sure the folder exists.
save_dir = "./save_surprise"
os.makedirs(save_dir, exist_ok=True)
pred_df.to_csv(os.path.join(save_dir, f"SVD_{learning_rate}.csv"))

print("  Training epoch took: {:}".format(format_time(time.time() - start_time)))