In [90]:
import os
import numpy as np
import pandas as pd

# 필요한 Surprise 알고리즘 불러오기
from surprise import BaselineOnly 
from surprise import KNNWithMeans, KNNBaseline, KNNBasic
from surprise import SVD
from surprise import SVDpp
from surprise import NMF
from surprise import Dataset
from surprise import accuracy
from surprise import Reader
from surprise.model_selection import cross_validate
from surprise.model_selection import train_test_split

In [91]:
path = './../data_bigcomp/'
os.listdir(path)

['data.zip',
 'final_rating(sentiment,emotion).csv',
 'ml-100k.zip',
 'movie_info.csv',
 'rotten_rating(review_20_user).csv',
 'rotten_rating(review_30_user).csv',
 'rotten_tomatoes_movies.csv',
 'rotten_user_table.csv',
 'save_surprise.zip',
 's_testset.csv',
 's_testset_removed.csv',
 's_trainset.csv',
 'testset_filtered.csv',
 'testset_removed.csv',
 'trainset_filtered.csv',
 'user_info.csv']

In [92]:
# csv 파일에서 불러오기
# r_cols = ['user_id', 'movie_id', 'rating', 'timestamp']
ratings_df = pd.read_csv(path + 'final_rating(sentiment,emotion).csv')

In [93]:
ratings_df = ratings_df[['user_id', 'movie_id', 'review_score' , 'sentiment', 'emotion', 'review_date', 'review_content']]

- 0.5 level 스케일

In [94]:
ratings_df['origin_rating_0.5'] = ratings_df.review_score * 4.5 + 0.5

In [95]:
# 소수점 0.5 단위로 반내림
scaled_rating = list()

for val in ratings_df['origin_rating_0.5']:
    remainder = 0
    if val % 0.5 != 0.0:
        remainder = val % 0.5
    scaled_rating.append(val - remainder)

In [96]:
set(scaled_rating)

{0.5, 1.0, 1.5, 2.0, 2.5, 3.0, 3.5, 4.0, 4.5, 5.0}

In [97]:
ratings_df['rating_0.5'] = scaled_rating

- 1 level 스케일

In [98]:
ratings_df['origin_rating_1'] = ratings_df.review_score * 4 + 1.0

In [99]:
ratings_df['origin_rating_1']

0         3.8
1         2.0
2         3.8
3         3.8
4         3.4
         ... 
751124    5.0
751125    3.8
751126    3.8
751127    3.8
751128    3.0
Name: origin_rating_1, Length: 751129, dtype: float64

In [100]:
scaled_rating = list()

for val in ratings_df.origin_rating_1:
    scaled_rating.append(round(val))

In [101]:
set(scaled_rating)

{1, 2, 3, 4, 5}

In [102]:
ratings_df['rating_1'] = scaled_rating

In [103]:
ratings_df.head()

Unnamed: 0,user_id,movie_id,review_score,sentiment,emotion,review_date,review_content,origin_rating_0.5,rating_0.5,origin_rating_1,rating_1
0,943,0,0.7,2,2,2010-02-09,Whether audiences will get behind The Lightnin...,3.65,3.5,3.8,4
1,7242,0,0.25,0,2,2010-02-10,Harry Potter knockoffs don't come more transpa...,1.625,1.5,2.0,2
2,1046,0,0.7,3,2,2010-02-10,"Percy Jackson isn't a great movie, but it's a ...",3.65,3.5,3.8,4
3,4895,0,0.7,4,2,2010-02-10,"Fun, brisk and imaginative",3.65,3.5,3.8,4
4,4517,0,0.6,1,2,2010-02-10,"Crammed with dragons, set-destroying fights an...",3.2,3.0,3.4,3


### 1. 30개 이상의 리뷰 남긴 User

In [104]:
group = ratings_df.groupby('user_id')
group_user_id = group['movie_id'].count()

In [105]:
def cal_rate(num, total, limit, condition):
    if condition == 'low':
        print(f"{limit}개 미만: {sum(num < limit)}, {round((sum(num < limit)/total),2)*100}%")
        return sum(num >= limit)
    
    if condition == 'high':
        print(f"{limit}개 이상: {sum(num >= limit)}, {round((sum(num > limit)/total),2)*100}%")
        return sum(num >= limit)

In [106]:
num = group_user_id.sort_values()
total = len(group_user_id)

condition = 'high'
sample_num = cal_rate(num, total, 30, condition)
# sample_num = cal_rate(num, total, 100, condition)
# sample_num = cal_rate(num, total, 500, condition)

30개 이상: 2490, 25.0%


In [107]:
group_user_id.sort_values(ascending=False)[:5]

user_id
2600    6486
8148    6356
3265    5300
2495    5212
3926    4548
Name: movie_id, dtype: int64

In [108]:
sample = group_user_id.sort_values(ascending=False)[:sample_num]
total = len(group_user_id)

In [109]:
len(sample)

2490

In [110]:
sample.index

Int64Index([2600, 8148, 3265, 2495, 3926, 5446, 5716, 7603, 7917, 1738,
            ...
            6998,  588, 3584, 4627, 6994, 2446, 5048,   70, 6736, 2715],
           dtype='int64', name='user_id', length=2490)

In [111]:
# 특정 인덱스에 해당하는 행(리뷰 100개 이상 남긴 유저)만 필터링
df = ratings_df[ratings_df['user_id'].isin(list(sample.index))]

In [112]:
df.shape

(708917, 11)

In [113]:
df.to_csv(path+'rotten_rating(review_30_user).csv', index=False)
# df.to_csv(path+'rotten_rating(review_100_user).csv', index=False)

### 2. 평점 파일 로드하기(리뷰 30 이상)

In [114]:
ratings_df = pd.read_csv(path + 'rotten_rating(review_30_user).csv')
# ratings_df = pd.read_csv(path + 'rotten_rating(review_100_user).csv')

In [115]:
ratings_df.head()

Unnamed: 0,user_id,movie_id,review_score,sentiment,emotion,review_date,review_content,origin_rating_0.5,rating_0.5,origin_rating_1,rating_1
0,7242,0,0.25,0,2,2010-02-10,Harry Potter knockoffs don't come more transpa...,1.625,1.5,2.0,2
1,1046,0,0.7,3,2,2010-02-10,"Percy Jackson isn't a great movie, but it's a ...",3.65,3.5,3.8,4
2,4895,0,0.7,4,2,2010-02-10,"Fun, brisk and imaginative",3.65,3.5,3.8,4
3,4517,0,0.6,1,2,2010-02-10,"Crammed with dragons, set-destroying fights an...",3.2,3.0,3.4,3
4,6226,0,0.8,4,2,2010-02-10,"This action-packed fantasy adventure, based on...",4.1,4.0,4.2,4


In [116]:
ratings_df['review_date'] = pd.to_datetime(ratings_df.review_date)

In [117]:
ratings_df.isnull().sum()

user_id              0
movie_id             0
review_score         0
sentiment            0
emotion              0
review_date          0
review_content       0
origin_rating_0.5    0
rating_0.5           0
origin_rating_1      0
rating_1             0
dtype: int64

In [118]:
# 날짜 별로 정렬
ratings_df.sort_values(by='review_date', inplace=True)

In [119]:
ratings_df.head()

Unnamed: 0,user_id,movie_id,review_score,sentiment,emotion,review_date,review_content,origin_rating_0.5,rating_0.5,origin_rating_1,rating_1
253595,9296,7594,0.6,4,2,1800-01-01,It's exciting to see a British horror film wit...,3.2,3.0,3.4,3
72106,7403,2494,1.0,3,4,1800-01-01,"A grimly seductive end-of-the-world thriller, ...",5.0,5.0,5.0,5
112724,9296,3644,0.8,1,2,1800-01-01,There aren't many British horror films you cou...,4.1,4.0,4.2,4
299580,7403,8800,0.7,4,2,1800-01-01,"Coming out from behind Spike Lee's camera, Ern...",3.65,3.5,3.8,4
245700,7629,7374,0.625,3,4,1800-01-01,A comedy western loaded with chuckles that onl...,3.3125,3.0,3.5,4


In [120]:
# from sklearn.model_selection import train_test_split
# trainset, testset, _, _ = train_test_split(ratings_df, ratings_df, test_size=0.3, random_state=24)

### 3. dataset 분할시 cold-start problem case 분리

In [121]:
ratings_df.shape

(708917, 11)

- timestamp가 2010년도 이후만 조회

In [122]:
ratings_df.review_date > '1990-01-01'

253595    False
72106     False
112724    False
299580    False
245700    False
          ...  
584382     True
702576     True
676277     True
676021     True
700260     True
Name: review_date, Length: 708917, dtype: bool

In [123]:
# sum((ratings_df.review_date < '2011-01-01') & (ratings_df.review_date > '2000-01-01'))

In [124]:
new_ratings_df = ratings_df[(ratings_df.review_date < '2018-01-01')]

In [125]:
len(new_ratings_df)

594402

In [126]:
_test_ratio = 0.70 # test set 비율
num_test = int(np.ceil(new_ratings_df.shape[0] * _test_ratio))
print(num_test)

416082


In [127]:
trainset = new_ratings_df.iloc[:-num_test]
testset  = new_ratings_df.iloc[-num_test:]

print(f"trainset 개수: {len(trainset)}")
print(f"testset 개수: {len(testset)}")

trainset 개수: 178320
testset 개수: 416082


In [128]:
trainset_user_id = set(trainset.user_id)
trainset_movie_id = set(trainset.movie_id)
print(f"trainset user: {len(trainset_user_id)}")
print(f"trainset movie: {len(trainset_movie_id)}")

testset_user_id = set(testset.user_id)
testset_movie_id = set(testset.movie_id)
print(f"testset user: {len(testset_user_id)}")
print(f"testset movie: {len(testset_movie_id)}")

trainset user: 855
trainset movie: 7844
testset user: 2015
testset movie: 14090


In [129]:
intersection_user_id = trainset_user_id.intersection(testset_user_id)
intersection_movie_id = trainset_movie_id.intersection(testset_movie_id)
print(f"교집합 user_id: {len(intersection_user_id)}")
print(f"교집합 movie_id: {len(intersection_movie_id)}")

difference_user_id = testset_user_id.difference(trainset_user_id)
difference_movie_id = testset_movie_id.difference(trainset_movie_id)
print(f"차집합 user_id: {len(difference_user_id)}") # testset에만 있는 user_id
print(f"차집합 movie_id: {len(difference_movie_id)}")

교집합 user_id: 708
교집합 movie_id: 6201
차집합 user_id: 1307
차집합 movie_id: 7889


#### 3-1. testset에서 교집합 user_id, movie_id 데이터만 추출

In [130]:
condition_inter_user  = testset['user_id'].isin(intersection_user_id)
condition_inter_movie = testset['movie_id'].isin(intersection_movie_id)

In [131]:
sum(condition_inter_user & condition_inter_movie)

26851

In [132]:
filtered_testset = testset[condition_inter_user & condition_inter_movie].copy()

In [133]:
filtered_testset

Unnamed: 0,user_id,movie_id,review_score,sentiment,emotion,review_date,review_content,origin_rating_0.5,rating_0.5,origin_rating_1,rating_1
392623,5127,11393,0.800,4,2,2006-06-21,Thoroughly satisfying... manages to tackle its...,4.1000,4.0,4.2,4
415285,4327,11993,0.700,4,2,2006-06-21,I wonder if anyone will consider Keillor for a...,3.6500,3.5,3.8,4
257499,7403,7725,0.700,4,2,2006-06-21,"The Hidden Blade is tranquil, touching, and, i...",3.6500,3.5,3.8,4
415284,4327,11993,0.700,4,2,2006-06-21,I wonder if anyone will consider Keillor for a...,3.6500,3.5,3.8,4
454947,2858,12595,0.850,4,2,2006-06-21,Up there with United 93 as one of the most pow...,4.3250,4.0,4.4,4
...,...,...,...,...,...,...,...,...,...,...,...
178131,2463,5465,0.875,1,0,2017-12-02,"What gives Zombie the nerve, the sheer audacit...",4.4375,4.0,4.5,4
371926,4367,10733,0.800,3,2,2017-12-06,"No party is as joyous as a Muppet celebration,...",4.1000,4.0,4.2,4
334518,8391,9771,0.700,3,2,2017-12-08,"As I've said, it's not entirely rinsed free of...",3.6500,3.5,3.8,4
19265,2463,1052,0.750,4,0,2017-12-25,The movie creates an atmosphere that is tense ...,3.8750,3.5,4.0,4


#### 3-2. testset에서 차집합 user_id, movie_id 데이터만 추출 (cold-start용)

In [134]:
condition_differ_user  = testset['user_id'].isin(difference_user_id)
condition_differ_movie = testset['movie_id'].isin(difference_movie_id)

In [135]:
removed_testset = testset[condition_inter_user | condition_inter_movie].copy()

In [136]:
removed_testset

Unnamed: 0,user_id,movie_id,review_score,sentiment,emotion,review_date,review_content,origin_rating_0.5,rating_0.5,origin_rating_1,rating_1
392623,5127,11393,0.80,4,2,2006-06-21,Thoroughly satisfying... manages to tackle its...,4.100,4.0,4.2,4
415285,4327,11993,0.70,4,2,2006-06-21,I wonder if anyone will consider Keillor for a...,3.650,3.5,3.8,4
257499,7403,7725,0.70,4,2,2006-06-21,"The Hidden Blade is tranquil, touching, and, i...",3.650,3.5,3.8,4
679890,7242,16970,0.25,2,4,2006-06-21,The film's racial/class issues are less develo...,1.625,1.5,2.0,2
415284,4327,11993,0.70,4,2,2006-06-21,I wonder if anyone will consider Keillor for a...,3.650,3.5,3.8,4
...,...,...,...,...,...,...,...,...,...,...,...
333541,2600,9743,0.50,0,1,2017-12-31,Its insistence on being absurd throughout beca...,2.750,2.5,3.0,3
363550,8103,10513,1.00,2,4,2017-12-31,"Hollywood provides few good roles for women, b...",5.000,5.0,5.0,5
155762,8103,4783,0.70,3,2,2017-12-31,"This is a totally unbelievable movie, but it i...",3.650,3.5,3.8,4
648251,8103,16094,0.70,4,2,2017-12-31,This coming of age movie reminded me a bit of ...,3.650,3.5,3.8,4


In [137]:
len(filtered_testset) / len(trainset)

0.15057761327949754

In [138]:
trainset

Unnamed: 0,user_id,movie_id,review_score,sentiment,emotion,review_date,review_content,origin_rating_0.5,rating_0.5,origin_rating_1,rating_1
253595,9296,7594,0.600,4,2,1800-01-01,It's exciting to see a British horror film wit...,3.2000,3.0,3.4,3
72106,7403,2494,1.000,3,4,1800-01-01,"A grimly seductive end-of-the-world thriller, ...",5.0000,5.0,5.0,5
112724,9296,3644,0.800,1,2,1800-01-01,There aren't many British horror films you cou...,4.1000,4.0,4.2,4
299580,7403,8800,0.700,4,2,1800-01-01,"Coming out from behind Spike Lee's camera, Ern...",3.6500,3.5,3.8,4
245700,7629,7374,0.625,3,4,1800-01-01,A comedy western loaded with chuckles that onl...,3.3125,3.0,3.5,4
...,...,...,...,...,...,...,...,...,...,...,...
566157,8589,14185,0.700,1,0,2006-06-21,... leaves the comic-book super-villain melodr...,3.6500,3.5,3.8,4
563361,4281,14097,0.700,2,2,2006-06-21,"Singer's work is generally strong, but Superma...",3.6500,3.5,3.8,4
321930,5444,9394,0.875,1,2,2006-06-21,No matter how much unique artistry they bring ...,4.4375,4.0,4.5,4
151451,2631,4680,0.400,0,2,2006-06-21,All I wanted was a magical remote control of m...,2.3000,2.0,2.6,3


#### 3-3. trainset, testset 저장 (GCMC 데이터용)

In [139]:
trainset.to_csv(path+'l_trainset.csv', index=False)
filtered_testset.to_csv(path+'l_testset.csv', index=False)
removed_testset.to_csv(path+'l_testset_removed.csv', index=False)
# testset.to_csv(path+'testset.csv', index=False)