In [3]:
# Created on Feb 2020
# Author: 임일
import os
import numpy as np
import pandas as pd

# 필요한 Surprise 알고리즘 불러오기
from surprise import BaselineOnly 
from surprise import KNNWithMeans, KNNBaseline, KNNBasic
from surprise import SVD
from surprise import SVDpp
from surprise import NMF
from surprise import Dataset
from surprise import accuracy
from surprise import Reader
from surprise.model_selection import cross_validate
from surprise.model_selection import train_test_split

In [6]:
path = './../data_bigcomp/'
os.listdir(path)

['data.zip',
 'final_rating(sentiment,emotion).csv',
 'ml-100k.zip',
 'movie_info.csv',
 'rotten_tomatoes_movies.csv',
 'rotten_user_table.csv',
 'save_surprise.zip',
 'testset_filtered.csv',
 'trainset_filtered.csv',
 'user_info.csv']

In [28]:
# csv 파일에서 불러오기
# r_cols = ['user_id', 'movie_id', 'rating', 'timestamp']
ratings_df = pd.read_csv(path + 'final_rating(sentiment,emotion).csv')

In [29]:
ratings_df = ratings_df[['user_id', 'movie_id', 'review_score' , 'sentiment', 'emotion', 'review_date', 'review_content']]

In [30]:
ratings_df['origin_rating'] = ratings_df.review_score * 4.5 + 0.5

In [31]:
# 소수점 0.5 단위로 반내림
scaled_rating = list()

for val in ratings_df.origin_rating:
    remainder = 0
    if val % 0.5 != 0.0:
        remainder = val % 0.5
    scaled_rating.append(val - remainder)

In [32]:
set(scaled_rating)

{0.5, 1.0, 1.5, 2.0, 2.5, 3.0, 3.5, 4.0, 4.5, 5.0}

In [33]:
ratings_df['rating'] = scaled_rating

In [34]:
ratings_df.head()

Unnamed: 0,user_id,movie_id,review_score,sentiment,emotion,review_date,review_content,origin_rating,rating
0,943,0,0.7,2,2,2010-02-09,Whether audiences will get behind The Lightnin...,3.65,3.5
1,7242,0,0.25,0,2,2010-02-10,Harry Potter knockoffs don't come more transpa...,1.625,1.5
2,1046,0,0.7,3,2,2010-02-10,"Percy Jackson isn't a great movie, but it's a ...",3.65,3.5
3,4895,0,0.7,4,2,2010-02-10,"Fun, brisk and imaginative",3.65,3.5
4,4517,0,0.6,1,2,2010-02-10,"Crammed with dragons, set-destroying fights an...",3.2,3.0


### 1. 20개 이상의 리뷰 남긴 User

In [35]:
group = ratings_df.groupby('user_id')
group_user_id = group['movie_id'].count()

In [36]:
def cal_rate(num, total, limit, condition):
    if condition == 'low':
        print(f"{limit}개 미만: {sum(num < limit)}, {round((sum(num < limit)/total),2)*100}%")
        return sum(num >= limit)
    
    if condition == 'high':
        print(f"{limit}개 이상: {sum(num >= limit)}, {round((sum(num > limit)/total),2)*100}%")
        return sum(num >= limit)

In [37]:
num = group_user_id.sort_values()
total = len(group_user_id)

condition = 'high'
sample_num = cal_rate(num, total, 20, condition)
# sample_num = cal_rate(num, total, 100, condition)
# sample_num = cal_rate(num, total, 500, condition)

20개 이상: 2994, 30.0%


In [38]:
group_user_id.sort_values(ascending=False)[:5]

user_id
2600    6486
8148    6356
3265    5300
2495    5212
3926    4548
Name: movie_id, dtype: int64

In [39]:
sample = group_user_id.sort_values(ascending=False)[:sample_num]
total = len(group_user_id)

In [40]:
len(sample)

2994

In [41]:
sample.index

Int64Index([2600, 8148, 3265, 2495, 3926, 5446, 5716, 7603, 7917, 1738,
            ...
            5685, 5990, 7188, 6088, 6733, 9250, 4203, 7911, 4540, 1743],
           dtype='int64', name='user_id', length=2994)

In [42]:
# 특정 인덱스에 해당하는 행(리뷰 100개 이상 남긴 유저)만 필터링
df = ratings_df[ratings_df['user_id'].isin(list(sample.index))]

In [43]:
df.shape

(721094, 9)

In [44]:
df.to_csv(path+'rotten_rating(review_20_user).csv', index=False)
# df.to_csv(path+'rotten_rating(review_100_user).csv', index=False)

### 2. 평점 파일 로드하기(리뷰 20 이상)

In [45]:
ratings_df = pd.read_csv(path + 'rotten_rating(review_20_user).csv')
# ratings_df = pd.read_csv(path + 'rotten_rating(review_100_user).csv')

In [46]:
ratings_df.head()

Unnamed: 0,user_id,movie_id,review_score,sentiment,emotion,review_date,review_content,origin_rating,rating
0,7242,0,0.25,0,2,2010-02-10,Harry Potter knockoffs don't come more transpa...,1.625,1.5
1,1046,0,0.7,3,2,2010-02-10,"Percy Jackson isn't a great movie, but it's a ...",3.65,3.5
2,4895,0,0.7,4,2,2010-02-10,"Fun, brisk and imaginative",3.65,3.5
3,4517,0,0.6,1,2,2010-02-10,"Crammed with dragons, set-destroying fights an...",3.2,3.0
4,6226,0,0.8,4,2,2010-02-10,"This action-packed fantasy adventure, based on...",4.1,4.0


In [47]:
ratings_df['review_date'] = pd.to_datetime(ratings_df.review_date)

In [48]:
ratings_df.isnull().sum()

user_id           0
movie_id          0
review_score      0
sentiment         0
emotion           0
review_date       0
review_content    0
origin_rating     0
rating            0
dtype: int64

In [49]:
# 날짜 별로 정렬
ratings_df.sort_values(by='review_date', inplace=True)

In [50]:
ratings_df.head()

Unnamed: 0,user_id,movie_id,review_score,sentiment,emotion,review_date,review_content,origin_rating,rating
73212,7403,2494,1.0,3,4,1800-01-01,"A grimly seductive end-of-the-world thriller, ...",5.0,5.0
18569,6220,970,0.2,1,2,1800-01-01,"All in all, AIrborne is not bad for what it is...",1.4,1.0
257862,9296,7594,0.6,4,2,1800-01-01,It's exciting to see a British horror film wit...,3.2,3.0
304601,7403,8800,0.7,4,2,1800-01-01,"Coming out from behind Spike Lee's camera, Ern...",3.65,3.5
392395,2343,11205,0.25,0,5,1800-01-01,It's the sort of film that can only be watched...,1.625,1.5


In [51]:
# from sklearn.model_selection import train_test_split
# trainset, testset, _, _ = train_test_split(ratings_df, ratings_df, test_size=0.3, random_state=24)

### 3. dataset 분할시 cold-start problem case 분리

In [52]:
ratings_df.shape

(721094, 9)

In [53]:
_test_ratio = 0.7 # test set 비율
num_test = int(np.ceil(ratings_df.shape[0] * _test_ratio))

In [54]:
trainset = ratings_df.iloc[:-num_test]
testset  = ratings_df.iloc[-num_test:]

print(f"trainset 개수: {len(trainset)}")
print(f"testset 개수: {len(testset)}")

trainset 개수: 216328
testset 개수: 504766


In [55]:
trainset_user_id = set(trainset.user_id)
trainset_movie_id = set(trainset.movie_id)
print(f"trainset user: {len(trainset_user_id)}")
print(f"trainset movie: {len(trainset_movie_id)}")

testset_user_id = set(testset.user_id)
testset_movie_id = set(testset.movie_id)
print(f"testset user: {len(testset_user_id)}")
print(f"testset movie: {len(testset_movie_id)}")

trainset user: 1112
trainset movie: 8521
testset user: 2732
testset movie: 15846


In [56]:
intersection_user_id = trainset_user_id.intersection(testset_user_id)
intersection_movie_id = trainset_movie_id.intersection(testset_movie_id)
print(f"교집합 user_id: {len(intersection_user_id)}")
print(f"교집합 movie_id: {len(intersection_movie_id)}")

difference_user_id = testset_user_id.difference(trainset_user_id)
difference_movie_id = testset_movie_id.difference(trainset_movie_id)
print(f"차집합 user_id: {len(difference_user_id)}") # testset에만 있는 user_id
print(f"차집합 movie_id: {len(difference_movie_id)}")

교집합 user_id: 850
교집합 movie_id: 6805
차집합 user_id: 1882
차집합 movie_id: 9041


#### 3-1. testset에서 교집합 user_id, movie_id 데이터만 추출

In [57]:
condition_inter_user  = testset['user_id'].isin(intersection_user_id)
condition_inter_movie = testset['movie_id'].isin(intersection_movie_id)

In [58]:
sum(condition_inter_user & condition_inter_movie)

28766

In [59]:
filtered_testset = testset[condition_inter_user & condition_inter_movie].copy()

In [60]:
filtered_testset

Unnamed: 0,user_id,movie_id,review_score,sentiment,emotion,review_date,review_content,origin_rating,rating
224104,8557,6677,0.70,3,0,2007-06-30,Not since Lecter has a role been this well sui...,3.650,3.5
224108,5108,6677,0.70,3,2,2007-06-30,Anthony Hopkins and Ryan Gosling are electrify...,3.650,3.5
224105,2487,6677,0.75,3,2,2007-06-30,"The main draw, of course, is watching Hopkins ...",3.875,3.5
224106,6613,6677,0.60,4,2,2007-06-30,"Enjoyable, stylishly directed thriller with a ...",3.200,3.0
441976,8103,12292,0.70,4,2,2007-06-30,"Once again, Pixar has crafted a quality story ...",3.650,3.5
...,...,...,...,...,...,...,...,...,...
691624,2495,16971,1.00,4,2,2020-10-20,...a fairly timeless thriller that still ranks...,5.000,5.0
179452,2495,5423,0.75,3,2,2020-10-21,Desperate Measures has recovered its momentum ...,3.875,3.5
523039,7229,13424,0.60,3,2,2020-10-21,A suitably bloody romp about oedipally incestu...,3.200,3.0
523040,7229,13424,0.60,3,2,2020-10-21,A suitably bloody romp about oedipally incestu...,3.200,3.0


#### 3-2. testset에서 차집합 user_id, movie_id 데이터만 추출 (cold-start용)

In [61]:
condition_differ_user  = testset['user_id'].isin(difference_user_id)
condition_differ_movie = testset['movie_id'].isin(difference_movie_id)

In [62]:
removed_testset = testset[condition_inter_user | condition_inter_movie].copy()

In [63]:
removed_testset

Unnamed: 0,user_id,movie_id,review_score,sentiment,emotion,review_date,review_content,origin_rating,rating
224104,8557,6677,0.70,3,0,2007-06-30,Not since Lecter has a role been this well sui...,3.650,3.5
224108,5108,6677,0.70,3,2,2007-06-30,Anthony Hopkins and Ryan Gosling are electrify...,3.650,3.5
224105,2487,6677,0.75,3,2,2007-06-30,"The main draw, of course, is watching Hopkins ...",3.875,3.5
224106,6613,6677,0.60,4,2,2007-06-30,"Enjoyable, stylishly directed thriller with a ...",3.200,3.0
441976,8103,12292,0.70,4,2,2007-06-30,"Once again, Pixar has crafted a quality story ...",3.650,3.5
...,...,...,...,...,...,...,...,...,...
634378,3287,15479,0.70,2,2,2020-10-26,Docu-drama about a kind man skilled in listeni...,3.650,3.5
640084,7172,15624,0.60,4,2,2020-10-27,As intricately assembled as all of Greenaway's...,3.200,3.0
578588,7172,14239,0.50,2,2,2020-10-27,"Meagerly amusing, if thanks to several scenari...",2.750,2.5
714638,6432,17562,0.75,2,2,2020-10-28,This nifty slice of sci-fi basically plays lik...,3.875,3.5


In [64]:
len(filtered_testset) / len(trainset)

0.13297400244073815

In [65]:
trainset

Unnamed: 0,user_id,movie_id,review_score,sentiment,emotion,review_date,review_content,origin_rating,rating
73212,7403,2494,1.000,3,4,1800-01-01,"A grimly seductive end-of-the-world thriller, ...",5.0000,5.0
18569,6220,970,0.200,1,2,1800-01-01,"All in all, AIrborne is not bad for what it is...",1.4000,1.0
257862,9296,7594,0.600,4,2,1800-01-01,It's exciting to see a British horror film wit...,3.2000,3.0
304601,7403,8800,0.700,4,2,1800-01-01,"Coming out from behind Spike Lee's camera, Ern...",3.6500,3.5
392395,2343,11205,0.250,0,5,1800-01-01,It's the sort of film that can only be watched...,1.6250,1.5
...,...,...,...,...,...,...,...,...,...
671667,9095,16446,1.000,4,2,2007-06-30,"For sci-fi excitement and escapist, fast-paced...",5.0000,5.0
671668,1045,16446,0.800,3,2,2007-06-30,In a summer that's seen underperforming tre-qu...,4.1000,4.0
210566,8391,6288,0.400,3,2,2007-06-30,The film's effects people perform many miracle...,2.3000,2.0
210567,6158,6288,0.300,1,4,2007-06-30,Rise of the Silver Surfer is both relentlessly...,1.8500,1.5


#### 3-3. trainset, testset 저장 (GCMC 데이터용)

In [None]:
trainset.to_csv(path+'trainset_filtered.csv', index=False)
filtered_testset.to_csv(path+'testset_filtered.csv', index=False)
removed_testset.to_csv(path+'testset_removed.csv', index=False)
# testset.to_csv(path+'testset.csv', index=False)