In [22]:
from surprise import Reader, Dataset, KNNWithMeans, accuracy
from surprise.model_selection import train_test_split
from sklearn.model_selection import train_test_split
from surprise.model_selection import GridSearchCV
import pandas as pd
import numpy as np
import re
from konlpy.tag import Okt
import pickle
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import load_model

In [23]:
# Korean stopwords (mostly particles and filler words) that get_token removes
# from the tokenized reviews.
stopwords = [
    '의', '가', '이', '은', '들', '는', '좀', '잘', '걍',
    '과', '도', '를', '으로', '자', '에', '와', '한', '하다',
]

In [24]:
def text_cleaning(text) :
  """Return `text` with every character that is not a space or Hangul removed.

  Characters kept: the space character, Hangul compatibility jamo in the
  range ㄱ-ㅣ, and Hangul syllables in the range 가-힣. Each run of any other
  characters is deleted (replaced by the empty string), so spaces that
  surrounded a removed run are preserved.
  """
  return re.sub('[^ ㄱ-ㅣ가-힣]+', '', text)

In [25]:
# Single Okt analyzer instance, created once and reused by every get_token call.
okt = Okt()

def get_token(x) :
  """Split `x` into stemmed morphemes with Okt and drop the stopwords.

  Returns a list of the morphemes (in their original order) that are not
  members of the module-level `stopwords` list.
  """
  morphemes = okt.morphs(x, stem = True)  # tokenize with stemming enabled
  kept = []
  for morpheme in morphemes:
    if morpheme in stopwords:  # stopword: skip it
      continue
    kept.append(morpheme)
  return kept

In [26]:
# Load the review/rating data and drop rows that have any missing value.
df = pd.read_csv('../review_rating_hellody.csv', encoding = 'cp949')
df.dropna(axis= 0, how= 'any', inplace= True)

# Keep only spaces and Hangul in each review; the cleaned text replaces the
# raw 'review' column.
df['ko_review'] = df['review'].apply(text_cleaning)
del df['review']

# Strip leading spaces so that whitespace-only reviews become empty strings.
# regex=True is required: since pandas 2.0 str.replace treats the pattern as a
# literal string by default, in which case '^ +' never matches anything.
df['ko_review'] = df['ko_review'].str.replace('^ +', '', regex=True)
# Turn empty reviews into NaN. The result is assigned back to the column
# instead of calling replace(..., inplace=True) on the column selection, which
# pandas warns will stop working in 3.0.
df['ko_review'] = df['ko_review'].replace('', np.nan)
print(df.isnull().sum())  # per-column count of missing values about to be dropped
df.dropna(axis= 0, how= 'any', inplace= True)
df.reset_index(inplace=True, drop=True)

user_id       0
movie_id      0
rating        0
ko_review    64
dtype: int64


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['ko_review'].replace('', np.nan, inplace=True)


In [27]:
# Tokenize every cleaned review. This is slow: the author measured
# 32m 52.3s for 100k rows and 16m 52.9s for 50k rows.
df['ko_review'] = df['ko_review'].apply(get_token)

# Drop reviews that have no tokens left. The rows are selected by index label
# (not by enumerate() position), so the drop stays correct even when the
# index is not a fresh 0..n-1 range.
drop_train = df[df['ko_review'].map(len) < 1].index.tolist()
df.drop(index= drop_train, axis= 0, inplace= True)
print(df)

# Load the fitted tokenizer.
# NOTE(security): pickle.load can execute arbitrary code contained in the
# file; only load a tokenizer.pickle that comes from a trusted source.
with open('tokenizer.pickle', 'rb') as handle:
    tokenizer = pickle.load(handle)

# Convert the token lists to integer sequences with the loaded tokenizer.
X = tokenizer.texts_to_sequences(df['ko_review'])

# Vocabulary size: number of known words plus one.
vocab_size = len(tokenizer.word_index) + 1

# Pad/truncate every sequence to length 43.
# NOTE(review): 43 presumably matches the length used when the model was
# trained — confirm.
X = pad_sequences(X, maxlen= 43)

       user_id  movie_id  rating  \
0            1       100       8   
1            2       100       5   
2            3       100      10   
3            4       100       7   
4            5       100      10   
...        ...       ...     ...   
26517     2201        99       6   
26518     8454        99       6   
26519    13022        99       1   
26520     2419        99      10   
26521       88        99       1   

                                               ko_review  
0      [다니엘, 크다, 레이, 그, 블루, 컬러, 연기, 훌륭하다, 얌전하다, 고양이, ...  
1      [사랑, 눈, 을, 뜨다, 살다, 음, 을, 느끼다, 노년, 엄마, 이야기, 중반,...  
2                          [자연, 그대, 데려가다, 눈물, 방울, 로, 외면]  
3      [나이, 먹다, 인간, 본능, 살, 아, 있다, 젊음, 늙음, 도시, 시골, 현재,...  
4      [내, 엄마, 이야기, 로, 나아가다, 훗날, 내, 이야기, 로, 상상, 보다, 되...  
...                                                  ...  
26517                              [신약, 이해, 안되다, 살인, 방법]  
26518                                       [어이, 없다, 내용]  
26519                [일본애니, 똑같다, 스토리

In [28]:
# Display the padded integer sequence matrix produced by pad_sequences.
X

array([[    0,     0,     0, ...,  8654,   209,    11],
       [    0,     0,     0, ...,  2200,   120,   347],
       [    0,     0,     0, ...,  5332,    16,  2873],
       ...,
       [    0,     0,     0, ...,   496,   231,    58],
       [    0,     0,     0, ...,    63,    11, 17176],
       [    0,     0,     0, ...,   904,    23,     1]])

In [29]:
# Load the saved Keras model from disk.
model_path = 'best_model_LSTM_30.keras'
loaded_model = load_model(model_path)

# Run the loaded model on the padded sequences.
predictions = loaded_model.predict(X)

# Print the model summary.
loaded_model.summary()

[1m826/826[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 9ms/step


In [30]:
# Print the raw model outputs returned by loaded_model.predict(X).
# The trailing number is the author's note — presumably the number of
# predictions; TODO confirm.
print(predictions) #26421

[[0.5662551 ]
 [0.21420896]
 [0.9278456 ]
 ...
 [0.0683458 ]
 [0.2641374 ]
 [0.12170036]]


In [31]:
# Convert the model outputs to positive (1) / negative (0) labels:
# outputs strictly above the threshold become 1, everything else 0.
threshold = 0.5
above_threshold = predictions > threshold
predicted_labels = above_threshold.astype(int)

# Add the predicted labels to the dataframe as a new column.
df['model_label'] = predicted_labels

# Rows labelled negative / positive. Only the last expression of a cell is
# displayed, so only the positive rows are shown. The trailing numbers are
# the author's recorded row counts.
df.loc[df['model_label'] == 0] #8304
df.loc[df['model_label'] == 1] #18117

Unnamed: 0,user_id,movie_id,rating,ko_review,model_label
0,1,100,8,"[다니엘, 크다, 레이, 그, 블루, 컬러, 연기, 훌륭하다, 얌전하다, 고양이, ...",1
2,3,100,10,"[자연, 그대, 데려가다, 눈물, 방울, 로, 외면]",1
3,4,100,7,"[나이, 먹다, 인간, 본능, 살, 아, 있다, 젊음, 늙음, 도시, 시골, 현재,...",1
4,5,100,10,"[내, 엄마, 이야기, 로, 나아가다, 훗날, 내, 이야기, 로, 상상, 보다, 되...",1
10,11,100,7,"[공감, 반감, 을, 동시, 부르다]",1
...,...,...,...,...,...
26505,13020,99,10,"[끝, 급, 마무리, 되다, 늘다, 되다, 생각, 바보, 연기, 진짜, 쩔다]",1
26507,150,99,6,"[그럭저럭, 괜찮다, 긴장감, 추리]",1
26509,1359,99,7,"[처음, 부터, 마지막, 까지, 긴장감, 있다, 내, 용구성]",1
26512,1185,99,7,"[영화, 전체, 적, 분위기, 좋다, 쌩뚱맞, 인과관계, 쫌]",1


In [32]:
# Binary label derived from the numeric rating: 1 when the rating is strictly
# greater than 5, otherwise 0. Computed with a vectorized comparison instead
# of a row-by-row apply; the cast keeps the integer dtype that apply produced.
df['rating'] = df['rating'].astype(float)
df['rating_label'] = (df['rating'] > 5).astype('int64')

# Report how many rows fall on each side of the rating threshold.
positive_ratings = df[df['rating_label'] == 1]
negative_ratings = df[df['rating_label'] == 0]
print(len(positive_ratings))
print(len(negative_ratings))

21897
4524


In [33]:
# Display the dataframe, which now also carries the model_label and
# rating_label columns. The trailing number is the author's note —
# presumably the row count; TODO confirm.
df #26421

Unnamed: 0,user_id,movie_id,rating,ko_review,model_label,rating_label
0,1,100,8.0,"[다니엘, 크다, 레이, 그, 블루, 컬러, 연기, 훌륭하다, 얌전하다, 고양이, ...",1,1
1,2,100,5.0,"[사랑, 눈, 을, 뜨다, 살다, 음, 을, 느끼다, 노년, 엄마, 이야기, 중반,...",0,0
2,3,100,10.0,"[자연, 그대, 데려가다, 눈물, 방울, 로, 외면]",1,1
3,4,100,7.0,"[나이, 먹다, 인간, 본능, 살, 아, 있다, 젊음, 늙음, 도시, 시골, 현재,...",1,1
4,5,100,10.0,"[내, 엄마, 이야기, 로, 나아가다, 훗날, 내, 이야기, 로, 상상, 보다, 되...",1,1
...,...,...,...,...,...,...
26517,2201,99,6.0,"[신약, 이해, 안되다, 살인, 방법]",0,1
26518,8454,99,6.0,"[어이, 없다, 내용]",0,1
26519,13022,99,1.0,"[일본애니, 똑같다, 스토리, 범인, 까지, 한국영, 화, 뭐]",0,0
26520,2419,99,10.0,"[박해일, 때문, 점, 주다, 근데, 진짜, 시나리오, 발로썻, 나, 하나, 부터,...",0,1


In [34]:
from sklearn.preprocessing import MinMaxScaler

# Rescale the ratings into the range 1-5 and round them to integers.
# The scaler is fitted on the ratings present in df, so the mapping is
# anchored to their observed minimum and maximum.
scaler = MinMaxScaler(feature_range=(1, 5))
scaled_values = scaler.fit_transform(df[['rating']])
df['scaled_rating'] = scaled_values
df['scaled_rating'] = df['scaled_rating'].round().astype(int)

# True where the rating-derived label and the model's label are the same.
labels_agree = df['rating_label'] == df['model_label']
rating_columns = ['user_id', 'movie_id', 'scaled_rating']

# Three (user, movie, rating) tables: every row, rows where the two labels
# agree, and rows where they disagree.
df_total = df[rating_columns]
df_correspond = df.loc[labels_agree, rating_columns]
df_discord = df.loc[~labels_agree, rating_columns]

In [35]:
# 80/20 train/test split of each rating table, with a fixed seed.
# train_test_split here is scikit-learn's: its import comes after surprise's
# function of the same name and therefore replaces it.
train_total, test_total = train_test_split(df_total, test_size=0.2, random_state=42)
train_correspond, test_correspond = train_test_split(df_correspond, test_size=0.2, random_state=42)
train_discord, test_discord = train_test_split(df_discord, test_size=0.2, random_state=42)

# The ratings were rescaled to integers in 1-5, so the Reader is given that
# scale (the previous comment incorrectly said 1~10).
reader = Reader(rating_scale=(1, 5))

# Wrap each split in a Surprise Dataset.
trainset_total = Dataset.load_from_df(train_total, reader)
testset_total = Dataset.load_from_df(test_total, reader)

trainset_correspond = Dataset.load_from_df(train_correspond, reader)
testset_correspond = Dataset.load_from_df(test_correspond, reader)
# The original name was misspelled; it is kept as an alias so any existing
# reference to it keeps working.
testset_corresopnd = testset_correspond

trainset_discord = Dataset.load_from_df(train_discord, reader)
testset_discord = Dataset.load_from_df(test_discord, reader)

In [36]:
from surprise.model_selection import cross_validate
from surprise import SVD, SVDpp, SlopeOne, NMF, NormalPredictor, KNNBaseline, KNNBasic, KNNWithZScore, BaselineOnly, CoClustering

# Benchmark each Surprise algorithm (default hyper-parameters) on the full
# training data with 3-fold cross-validation, recording the mean RMSE,
# fit time and test time per algorithm.
# NOTE(review): no random_state is passed, so the figures presumably vary
# from run to run — confirm against the Surprise documentation.
total_result = []
for algorithm in [SVD(), SVDpp(), SlopeOne(), NMF(), NormalPredictor(), KNNBaseline(),
                  KNNBasic(), KNNWithMeans(), KNNWithZScore(), BaselineOnly(), CoClustering()] :
  results = cross_validate(algorithm, trainset_total, measures = ['RMSE'], cv = 3, verbose = False)

  # Average every reported measure over the folds.
  mean_scores = pd.DataFrame.from_dict(results).mean(axis = 0)
  # Use the class name directly instead of parsing it out of the object's repr.
  algorithm_name = pd.Series([type(algorithm).__name__], index=['Algorithm'])
  total_result.append(pd.concat([mean_scores, algorithm_name]))

Estimating biases using als...
Computing the msd similarity matrix...
Done computing similarity matrix.
Estimating biases using als...
Computing the msd similarity matrix...
Done computing similarity matrix.
Estimating biases using als...
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Estimating biases using als...


In [37]:
# Tabulate the benchmark results for the full data, one row per algorithm,
# ordered by ascending test RMSE.
total_result_df = (
    pd.DataFrame(total_result)
    .set_index('Algorithm')
    .sort_values('test_rmse')
)
print(total_result_df)

                 test_rmse  fit_time  test_time
Algorithm                                      
BaselineOnly      1.161898  0.034319   0.019720
SVD               1.167151  0.131195   0.017106
SVDpp             1.170497  0.126104   0.044352
KNNBaseline       1.242642  0.689675   0.410700
CoClustering      1.310165  0.535441   0.019840
KNNBasic          1.348666  0.722737   0.299761
KNNWithMeans      1.365855  0.748416   0.265357
NMF               1.370869  0.412744   0.024205
KNNWithZScore     1.372146  0.803122   0.236021
SlopeOne          1.383118  0.058611   0.038249
NormalPredictor   1.665557  0.012668   0.015458


In [38]:
# Benchmark each Surprise algorithm (default hyper-parameters) on the rows
# where the rating label and the model label agree, using 3-fold
# cross-validation and recording the mean RMSE, fit time and test time.
correspond_result = []
for algorithm in [SVD(), SVDpp(), SlopeOne(), NMF(), NormalPredictor(), KNNBaseline(),
                  KNNBasic(), KNNWithMeans(), KNNWithZScore(), BaselineOnly(), CoClustering()] :
  results = cross_validate(algorithm, trainset_correspond, measures = ['RMSE'], cv = 3, verbose = False)

  # Average every reported measure over the folds.
  mean_scores = pd.DataFrame.from_dict(results).mean(axis = 0)
  # Use the class name directly instead of parsing it out of the object's repr.
  algorithm_name = pd.Series([type(algorithm).__name__], index=['Algorithm'])
  correspond_result.append(pd.concat([mean_scores, algorithm_name]))

Estimating biases using als...
Computing the msd similarity matrix...
Done computing similarity matrix.
Estimating biases using als...
Computing the msd similarity matrix...
Done computing similarity matrix.
Estimating biases using als...
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Estimating biases using als...


In [39]:
# Tabulate the benchmark results for the label-agreement subset, one row per
# algorithm, ordered by ascending test RMSE.
correspond_result_df = (
    pd.DataFrame(correspond_result)
    .set_index('Algorithm')
    .sort_values('test_rmse')
)
print(correspond_result_df)

                 test_rmse  fit_time  test_time
Algorithm                                      
SVD               1.167157  0.109256   0.027129
BaselineOnly      1.170130  0.027408   0.028441
SVDpp             1.180391  0.094935   0.042588
KNNBaseline       1.240084  0.497605   0.221128
CoClustering      1.331819  0.590276   0.017996
KNNBasic          1.364754  0.473801   0.175846
NMF               1.393033  0.336038   0.014308
KNNWithZScore     1.406711  0.794781   0.199527
KNNWithMeans      1.407666  0.526114   0.214659
SlopeOne          1.417719  0.030385   0.024772
NormalPredictor   1.684282  0.010850   0.016122


In [40]:
# Benchmark each Surprise algorithm (default hyper-parameters) on the rows
# where the rating label and the model label disagree, using 3-fold
# cross-validation and recording the mean RMSE, fit time and test time.
discord_result = []
for algorithm in [SVD(), SVDpp(), SlopeOne(), NMF(), NormalPredictor(), KNNBaseline(),
                  KNNBasic(), KNNWithMeans(), KNNWithZScore(), BaselineOnly(), CoClustering()] :
  results = cross_validate(algorithm, trainset_discord, measures = ['RMSE'], cv = 3, verbose = False)

  # Average every reported measure over the folds.
  mean_scores = pd.DataFrame.from_dict(results).mean(axis = 0)
  # Use the class name directly instead of parsing it out of the object's repr.
  algorithm_name = pd.Series([type(algorithm).__name__], index=['Algorithm'])
  discord_result.append(pd.concat([mean_scores, algorithm_name]))

Estimating biases using als...
Computing the msd similarity matrix...
Done computing similarity matrix.
Estimating biases using als...
Computing the msd similarity matrix...
Done computing similarity matrix.
Estimating biases using als...
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Estimating biases using als...


In [41]:
# Tabulate the benchmark results for the label-disagreement subset, one row
# per algorithm, ordered by ascending test RMSE.
discord_result_df = (
    pd.DataFrame(discord_result)
    .set_index('Algorithm')
    .sort_values('test_rmse')
)
print(discord_result_df)

                 test_rmse  fit_time  test_time
Algorithm                                      
SVDpp             1.168319  0.031871   0.014914
BaselineOnly      1.175015  0.005621   0.004044
SVD               1.180070  0.031579   0.009812
KNNBaseline       1.194557  0.044844   0.099575
KNNBasic          1.235492  0.044524   0.091335
KNNWithMeans      1.281553  0.065924   0.022283
CoClustering      1.286686  0.105441   0.005546
KNNWithZScore     1.296439  0.131295   0.024973
SlopeOne          1.306815  0.020231   0.009173
NMF               1.322802  0.112598   0.010991
NormalPredictor   1.613149  0.000000   0.005609


In [42]:
# Write the full and the label-agreement rating tables to CSV (cp949, no
# index column). df_discord is not exported here.
for frame, csv_path in ((df_total, '../df_total.csv'),
                        (df_correspond, '../df_correspond.csv')):
  frame.to_csv(csv_path, encoding='cp949', index= False)
