# TF-IDF 실습

In [77]:
import pandas as pd
import numpy as np
from math import log
from tqdm import tqdm
from google.colab import drive
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.metrics import mean_squared_error, mean_absolute_error
from sklearn.model_selection import train_test_split
# Mount Google Drive (Colab) at /content/drive; the data path used later lives under it.
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


- 토이 데이터를 가지고 직접 TF-IDF를 계산해봅시다

In [2]:
# Toy corpus: each string is one document, words are separated by single spaces
document_list = [
  '먹고 싶은 사과',
  '먹고 싶은 바나나',
  '길고 노란 바나나 바나나',
  '저는 과일이 좋아요'
]

In [3]:
# Total number of documents in the toy corpus
N = len(document_list)
N

4

In [4]:
# Vocabulary: every distinct space-separated word of the corpus, in sorted order
# (a set comprehension with two nested `for` clauses flattens documents -> words)
vocab = sorted({word for document in document_list for word in document.split(' ')})
print(f'전체 단어 : {vocab}')
print(f'전체 단어 길이 : {len(vocab)}')

전체 단어 : ['과일이', '길고', '노란', '먹고', '바나나', '사과', '싶은', '저는', '좋아요']
전체 단어 길이 : 9


In [5]:
# TF, IDF 값을 구하는 함수
def tf(word, document):
  '''
  Term frequency: number of times `word` occurs in `document`.

  Counting is done with str.count on the raw document string, i.e. it counts
  non-overlapping substring matches; the document is not tokenised first.
  '''
  occurrences = document.count(word)
  return occurrences

def idf(word):
  '''
  Inverse document frequency of `word` over the global `document_list`.

  IDF = log(N / (df + 1)) where
    N  = number of documents in `document_list`
    df = number of documents that contain `word` (substring match)

  N is derived from `document_list` itself instead of the notebook-level `N`:
  that variable is rebound to the number of movies further down, which would
  silently corrupt the result if this function were called again afterwards.
  '''
  doc_count = len(document_list)
  # Document frequency: each document counts at most once, however often the word repeats
  df = sum(1 for document in document_list if word in document)

  return log(doc_count / (df + 1))

def tfidf(word, document):
  '''TF-IDF weight of `word` in `document`: tf(word, document) * idf(word).'''
  term_frequency = tf(word, document)
  inverse_document_frequency = idf(word)
  return term_frequency * inverse_document_frequency

In [6]:
# TF matrix of the toy corpus: one row per document, one column per vocabulary word
result = [
  [tf(term, document_list[doc_idx]) for term in vocab]   # TF of every vocab word in this document
  for doc_idx in range(N)
]

tf_df = pd.DataFrame(result, columns=vocab)
tf_df

Unnamed: 0,과일이,길고,노란,먹고,바나나,사과,싶은,저는,좋아요
0,0,0,0,1,0,1,1,0,0
1,0,0,0,1,1,0,1,0,0
2,0,1,1,0,2,0,0,0,0
3,1,0,0,0,0,0,0,1,1


In [7]:
# IDF of every vocabulary word.
# IDF is computed over the whole corpus (4 documents), so there is one value per word.

result = [idf(term) for term in vocab]

idf_df = pd.DataFrame(result, index=vocab, columns=['idf'])
idf_df.T

Unnamed: 0,과일이,길고,노란,먹고,바나나,사과,싶은,저는,좋아요
idf,0.693147,0.693147,0.693147,0.287682,0.287682,0.693147,0.287682,0.693147,0.693147


In [8]:
# TF-IDF matrix of the toy corpus: one row per document, one column per vocabulary word

result = [
  [tfidf(term, document_list[doc_idx]) for term in vocab]
  for doc_idx in range(N)
]

tfidf_df = pd.DataFrame(result, columns=vocab)
tfidf_df

Unnamed: 0,과일이,길고,노란,먹고,바나나,사과,싶은,저는,좋아요
0,0.0,0.0,0.0,0.287682,0.0,0.693147,0.287682,0.0,0.0
1,0.0,0.0,0.0,0.287682,0.287682,0.0,0.287682,0.0,0.0
2,0.0,0.693147,0.693147,0.0,0.575364,0.0,0.0,0.0,0.0
3,0.693147,0.0,0.0,0.0,0.0,0.0,0.0,0.693147,0.693147


# 무비렌즈 데이터로 TF-IDF 추천 수행

- 추천 아이템인 영화에 대해서 TF-IDF 벡터를 만드는 것이 핵심 목표입니다

In [9]:
# Directory (on the mounted Google Drive) that holds the "ml-latest-small" CSV files
path = '/content/drive/MyDrive/recomm_study/recomm_code/Recommend_learningspoons/data/ml-latest-small-20220921T022859Z-001/ml-latest-small/'
ratings_df = pd.read_csv(path + 'ratings.csv', encoding='utf-8')
# movies.csv is indexed by movieId, so a movie can be looked up with movies_df.loc[movie_id]
movies_df = pd.read_csv(path + 'movies.csv', encoding='utf-8', index_col='movieId')

ratings_df.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,964982703
1,1,3,4.0,964981247
2,1,6,4.0,964982224
3,1,47,5.0,964983815
4,1,50,5.0,964982931


In [10]:
# Movie metadata (title, genres): these movies are the items we will recommend
movies_df.head()

Unnamed: 0_level_0,title,genres
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1
1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
2,Jumanji (1995),Adventure|Children|Fantasy
3,Grumpier Old Men (1995),Comedy|Romance
4,Waiting to Exhale (1995),Comedy|Drama|Romance
5,Father of the Bride Part II (1995),Comedy


- 개별 영화에 대해 아이템 프로파일을 구축하려면 영화를 구성하고 있는 정보를 피쳐로 사용해야 합니다
- 영화에 대한 피쳐를 TF-IDF로 만들기 위해선 영화를 구성하는 Document와 Word가 필요합니다
- **개별 영화를 Document, 영화를 표현하는 장르들을 Word로 놓고 TF-IDF를 구해보자**

In [11]:
# Total number of movies (= documents); this is also the N needed for the IDF values.
# NOTE: this rebinds the notebook-level N that previously held the toy-corpus size.
N = len(movies_df)
N

9742

In [12]:
# Vocabulary (words) = every distinct genre found in the '|'-separated genres column, sorted
genres = sorted({genre for movie in movies_df['genres'].tolist() for genre in movie.split('|')})

print(f'전체 단어 : {genres}')
print(f'전체 단어 길이 : {len(genres)}')

전체 단어 : ['(no genres listed)', 'Action', 'Adventure', 'Animation', 'Children', 'Comedy', 'Crime', 'Documentary', 'Drama', 'Fantasy', 'Film-Noir', 'Horror', 'IMAX', 'Musical', 'Mystery', 'Romance', 'Sci-Fi', 'Thriller', 'War', 'Western']
전체 단어 길이 : 20


In [13]:
# '(no genres listed)' is treated as a genre of its own (= one word of the vocabulary)
movies_df[movies_df['genres'] == '(no genres listed)'].head()

Unnamed: 0_level_0,title,genres
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1
114335,La cravate (1957),(no genres listed)
122888,Ben-hur (2016),(no genres listed)
122896,Pirates of the Caribbean: Dead Men Tell No Tal...,(no genres listed)
129250,Superfast! (2015),(no genres listed)
132084,Let It Be Me (1995),(no genres listed)


In [14]:
# IDF of every genre (word):
#   1. count the document frequency (DF) of each genre = number of movies that list it
#   2. convert each DF into IDF = log(N / (DF + 1))

# Start every genre at a count of 0
df_dict = {genre: 0 for genre in genres}

for genre_string in tqdm(movies_df['genres']):
  movie_genres = genre_string.split('|')   # exact genre tokens of this movie
  for genre in genres:
    if genre in movie_genres:
      df_dict[genre] += 1


idf_dict = {genre: log(N / (count + 1)) for genre, count in df_dict.items()}

print(df_dict,'\n\n',idf_dict)

100%|██████████| 9742/9742 [00:00<00:00, 107016.16it/s]

{'(no genres listed)': 34, 'Action': 1828, 'Adventure': 1263, 'Animation': 611, 'Children': 664, 'Comedy': 3756, 'Crime': 1199, 'Documentary': 440, 'Drama': 4361, 'Fantasy': 779, 'Film-Noir': 87, 'Horror': 978, 'IMAX': 158, 'Musical': 334, 'Mystery': 573, 'Romance': 1596, 'Sci-Fi': 980, 'Thriller': 1894, 'War': 382, 'Western': 167} 

 {'(no genres listed)': 5.6288536528770745, 'Action': 1.6726770659756223, 'Adventure': 2.0421651396596854, 'Animation': 2.767469431854162, 'Children': 2.684414673710634, 'Comedy': 0.9528256687925191, 'Crime': 2.0941248785903963, 'Documentary': 3.095156838919642, 'Drama': 0.8035157676049136, 'Fantasy': 2.5249077946828504, 'Film-Noir': 4.706864899888282, 'Horror': 2.2976700718359777, 'IMAX': 4.115297512146256, 'Musical': 3.3700711825414214, 'Mystery': 2.8315723180469217, 'Romance': 1.8083195661514755, 'Sci-Fi': 2.295629254801125, 'Thriller': 1.6372275968499612, 'War': 3.236166725185842, 'Western': 4.060237734963229}





In [15]:
# TF-IDF vector of every movie (document) over the genre vocabulary (words)
tfidf_rows = []
movie_ids = []

for movie_id, row in tqdm(movies_df.iterrows(), total=len(movies_df)):
  # Exact genre tokens of this movie. Counting tokens (rather than substrings of the
  # raw 'A|B|C' string) applies the same matching rule as the document-frequency
  # count behind idf_dict, and does not depend on the toy-corpus tf() helper.
  movie_genres = row['genres'].split('|')

  # TF of each genre in this movie multiplied by that genre's IDF
  tfidf_rows.append([movie_genres.count(genre) * idf_dict[genre] for genre in genres])
  movie_ids.append(movie_id)

# Each movie (document) is now a vector with one dimension per genre (word)
tfidf_df = pd.DataFrame(tfidf_rows, columns=genres, index=movie_ids).sort_index()
tfidf_df

9742it [00:00, 14837.38it/s]


Unnamed: 0,(no genres listed),Action,Adventure,Animation,Children,Comedy,Crime,Documentary,Drama,Fantasy,Film-Noir,Horror,IMAX,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western
1,0.0,0.000000,2.042165,2.767469,2.684415,0.952826,0.0,0.0,0.000000,2.524908,0.0,0.0,0.0,0.0,0.0,0.00000,0.0,0.0,0.0,0.0
2,0.0,0.000000,2.042165,0.000000,2.684415,0.000000,0.0,0.0,0.000000,2.524908,0.0,0.0,0.0,0.0,0.0,0.00000,0.0,0.0,0.0,0.0
3,0.0,0.000000,0.000000,0.000000,0.000000,0.952826,0.0,0.0,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0,1.80832,0.0,0.0,0.0,0.0
4,0.0,0.000000,0.000000,0.000000,0.000000,0.952826,0.0,0.0,0.803516,0.000000,0.0,0.0,0.0,0.0,0.0,1.80832,0.0,0.0,0.0,0.0
5,0.0,0.000000,0.000000,0.000000,0.000000,0.952826,0.0,0.0,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0,0.00000,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
193581,0.0,1.672677,0.000000,2.767469,0.000000,0.952826,0.0,0.0,0.000000,2.524908,0.0,0.0,0.0,0.0,0.0,0.00000,0.0,0.0,0.0,0.0
193583,0.0,0.000000,0.000000,2.767469,0.000000,0.952826,0.0,0.0,0.000000,2.524908,0.0,0.0,0.0,0.0,0.0,0.00000,0.0,0.0,0.0,0.0
193585,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.0,0.803516,0.000000,0.0,0.0,0.0,0.0,0.0,0.00000,0.0,0.0,0.0,0.0
193587,0.0,1.672677,0.000000,2.767469,0.000000,0.000000,0.0,0.0,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0,0.00000,0.0,0.0,0.0,0.0


## 아이템 유사도 기반 추천
- 아이템끼리의 유사도를 구할 수 있습니다
- 어떤 영화가 주어졌을 때 그 영화와 cosine 유사도가 가장 높은 영화를 추천합니다

In [16]:
# cosine similarity 계산하는 함수
def cos_sim_matrix(a,b):
  '''
  Pairwise cosine similarity between the rows of `a` and the rows of `b`.

  a, b : DataFrames holding one vector per row.
  Returns a DataFrame whose rows correspond to `a` and whose columns are `b.index`.

  NOTE: the row index is built from the list `[a.index]`, not from `a.index`
  directly. Callers in this notebook depend on the resulting labels: they read
  `idx[0]` when iterating and a 'level_0' column after reset_index().
  '''
  return pd.DataFrame(data=cosine_similarity(a,b), index=[a.index], columns=b.index)

In [17]:
# Example

# Vectors of 3 movies (one row per movie)
a = [[0.2, 0.4, 1.2, 1.5],
     [0.4, 0.7, 0.3, 0.5],
     [0.3, 1.2, 1.0, 1.0]]

# Result: 3x3 matrix of pairwise cosine similarities between the three movies
# (the same call with a user vector as one argument yields recommendation scores)
cosine_similarity(a, a)

array([[1.        , 0.74907437, 0.87434505],
       [0.74907437, 1.        , 0.94147267],
       [0.87434505, 0.94147267, 1.        ]])

In [18]:
# Cosine similarity between every pair of movies, computed from their TF-IDF vectors
# Movies with a high cosine similarity are similar to each other

movie_sim_df = cos_sim_matrix(tfidf_df, tfidf_df)
movie_sim_df

Unnamed: 0,1,2,3,4,5,6,7,8,9,10,...,193565,193567,193571,193573,193579,193581,193583,193585,193587,193609
1,1.000000,0.821277,0.086580,0.080578,0.185731,0.000000,0.086580,0.657468,0.000000,0.261707,...,0.409432,0.518058,0.141984,0.539452,0.0,0.691516,0.753482,0.000000,0.461676,0.185731
2,0.821277,1.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.800544,0.000000,0.318658,...,0.000000,0.000000,0.000000,0.000000,0.0,0.359250,0.391443,0.000000,0.000000,0.000000
3,0.086580,0.000000,1.000000,0.930671,0.466160,0.000000,1.000000,0.000000,0.000000,0.000000,...,0.108904,0.000000,0.356361,0.000000,0.0,0.105457,0.114907,0.000000,0.000000,0.466160
4,0.080578,0.000000,0.930671,1.000000,0.433841,0.000000,0.930671,0.000000,0.000000,0.000000,...,0.101354,0.102011,0.567512,0.000000,0.0,0.098145,0.106940,0.365857,0.000000,0.433841
5,0.185731,0.000000,0.466160,0.433841,1.000000,0.000000,0.466160,0.000000,0.000000,0.000000,...,0.233619,0.000000,0.764462,0.000000,0.0,0.226224,0.246496,0.000000,0.000000,1.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
193581,0.691516,0.359250,0.105457,0.098145,0.226224,0.211509,0.105457,0.000000,0.397135,0.213852,...,0.661569,0.631007,0.172940,0.657066,0.0,1.000000,0.917760,0.000000,0.767757,0.226224
193583,0.753482,0.391443,0.114907,0.106940,0.246496,0.000000,0.114907,0.000000,0.000000,0.000000,...,0.543386,0.687551,0.188437,0.715945,0.0,0.917760,1.000000,0.000000,0.612723,0.246496
193585,0.000000,0.000000,0.000000,0.365857,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.278828,0.644669,0.000000,0.0,0.000000,0.000000,1.000000,0.000000,0.000000
193587,0.461676,0.000000,0.000000,0.000000,0.000000,0.275490,0.000000,0.000000,0.517266,0.278541,...,0.792853,0.821883,0.000000,0.855825,0.0,0.767757,0.612723,0.000000,1.000000,0.000000


In [19]:
# Print the k movies with the highest cosine similarity to the given title
k = 10
given_movie = 'Black Butler: Book of the Atlantic (2017)'
movie_id = movies_df.index[movies_df['title'] == given_movie][0]

ranked_sims = movie_sim_df[movie_id].sort_values(ascending=False)

# Row labels of movie_sim_df are 1-tuples (movie_id,), hence the tuple unpacking
for (recomm_movie_id,), value in ranked_sims.head(k).items():
  print(movies_df.loc[recomm_movie_id, 'title'], ' : ', value)

Black Butler: Book of the Atlantic (2017)  :  1.0
Justice League: Doom (2012)   :  0.9740752144041196
Dante's Inferno: An Animated Epic (2010)  :  0.9740752144041196
Superman/Batman: Public Enemies (2009)  :  0.9740752144041196
Triplets of Belleville, The (Les triplettes de Belleville) (2003)  :  0.9177602335851786
Mickey's Once Upon a Christmas (1999)  :  0.9177602335851786
South Park: Imaginationland (2008)  :  0.9177602335851786
Monkeybone (2001)  :  0.9177602335851786
Anomalisa (2015)  :  0.9177602335851786
Daddy, I'm A Zombie (2012)  :  0.9177602335851786


## 유저 벡터를 이용한 추천


- 데이터를 train / test로 나눕니다
- 아이템 프로파일인 TF-IDF 벡터를 만듭니다
- 아이템 벡터를 사용하여 **유저 프로파일 벡터**를 만듭니다
- 유저 프로파일 벡터와 추천 후보 아이템 벡터의 similarity를 계산하여 사용자에게 적합한 Top K 아이템을 추천한다.

- Top-K recommendation의 경우, 추천된 아이템 K에 대해서 사용자가 선호한 아이템이 얼마나 있는지를 평가한다.
- 무비렌즈 데이터를 사용할 경우 사용자가 아이템을 선호한다는 기준이 필요하다. (사용자가 아이템에 내린 절대 평점값만이 존재하기 때문에)
- **아래 실습을 수행할 때는 평점의 크기와 관계없이 평점이 존재하면 모두 선호한다고 가정하자!**

In [20]:
# Distinct users that appear in the ratings
user_id_list = ratings_df['userId'].unique()
# All movie ids (the index of movies_df), as a list and as a set for set arithmetic
movie_id_list = movies_df.index.tolist()
movie_id_set = set(movie_id_list)

In [21]:
# Number of distinct users
len(user_id_list)

610

### hit ratio로 성능 평가
- 사용자가 선호도를 표시한 아이템 가운데 한개만 제거해서 이를 테스트 데이터에 넣습니다
- 남은 데이터를 학습 데이터로 사용하여 추천 모델을 만들고 추천 결과를 생성하여 테스트 데이터에 있는 영화와 일치하는지 평가합니다

In [None]:
# 먼저 사용자 1명에 대해서 추천을 수행하고 성능을 평가해봅시다
# 사용할 train/test 데이터는 ratings_df이고, 아이템 프로파일인 tfidf vector는 이미 위에서 모두 생성하였습니다

In [24]:
# Work through one example user first: all ratings of userId 10
user_id = 10
_df = ratings_df[ratings_df['userId'] == user_id]
_df

Unnamed: 0,userId,movieId,rating,timestamp
1119,10,296,1.0,1455303387
1120,10,356,3.5,1455301685
1121,10,588,4.0,1455306173
1122,10,597,3.5,1455357645
1123,10,912,4.0,1455302254
...,...,...,...,...
1254,10,119145,1.0,1455302650
1255,10,129428,3.5,1455357384
1256,10,136020,5.0,1455302192
1257,10,137595,4.0,1455356898


In [25]:
# Hold out exactly one randomly chosen rating of the user; the rest is the training data.
# A seeded Generator makes the held-out row reproducible across runs.
rng = np.random.default_rng(42)

drop_indices = rng.choice(_df.index, 1, replace=False)
_df_train = _df.drop(drop_indices)
_df_train

Unnamed: 0,userId,movieId,rating,timestamp
1119,10,296,1.0,1455303387
1120,10,356,3.5,1455301685
1121,10,588,4.0,1455306173
1122,10,597,3.5,1455357645
1123,10,912,4.0,1455302254
...,...,...,...,...
1254,10,119145,1.0,1455302650
1255,10,129428,3.5,1455357384
1256,10,136020,5.0,1455302192
1257,10,137595,4.0,1455356898


In [27]:
# Build the user vector
# --> simplest approach: the plain mean of the TF-IDF vectors of the movies the user rated
rated_movie_list = _df_train['movieId'].tolist()    # movies the user rated (training part)
rated_tfidf_df = tfidf_df.loc[rated_movie_list, :]  # their TF-IDF vectors
user_profile_df = rated_tfidf_df.mean().to_frame().T  # column-wise mean as a 1-row frame
user_profile_df

Unnamed: 0,(no genres listed),Action,Adventure,Animation,Children,Comedy,Crime,Documentary,Drama,Fantasy,Film-Noir,Horror,IMAX,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western
0,0.0,0.312875,0.455447,0.298648,0.270373,0.541534,0.195853,0.0,0.410429,0.290637,0.0,0.01653,0.532916,0.218206,0.061113,1.001731,0.082577,0.141343,0.093127,0.0


In [31]:
# Remove the movie ids found in the training data from the recommendation candidates
# --> recommend only among movies the user has not already rated
rated_movie_set = set(rated_movie_list)
recomm_movie_list = list(movie_id_set - rated_movie_set)
recomm_movie_tfidf_df = tfidf_df.loc[recomm_movie_list, :]
recomm_movie_tfidf_df

Unnamed: 0,(no genres listed),Action,Adventure,Animation,Children,Comedy,Crime,Documentary,Drama,Fantasy,Film-Noir,Horror,IMAX,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western
1,0.0,0.000000,2.042165,2.767469,2.684415,0.952826,0.000000,0.0,0.000000,2.524908,0.0,0.00000,0.0,0.0,0.0,0.00000,0.0,0.000000,0.000000,0.0
2,0.0,0.000000,2.042165,0.000000,2.684415,0.000000,0.000000,0.0,0.000000,2.524908,0.0,0.00000,0.0,0.0,0.0,0.00000,0.0,0.000000,0.000000,0.0
3,0.0,0.000000,0.000000,0.000000,0.000000,0.952826,0.000000,0.0,0.000000,0.000000,0.0,0.00000,0.0,0.0,0.0,1.80832,0.0,0.000000,0.000000,0.0
4,0.0,0.000000,0.000000,0.000000,0.000000,0.952826,0.000000,0.0,0.803516,0.000000,0.0,0.00000,0.0,0.0,0.0,1.80832,0.0,0.000000,0.000000,0.0
5,0.0,0.000000,0.000000,0.000000,0.000000,0.952826,0.000000,0.0,0.000000,0.000000,0.0,0.00000,0.0,0.0,0.0,0.00000,0.0,0.000000,0.000000,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
163809,0.0,0.000000,2.042165,2.767469,0.000000,0.000000,0.000000,0.0,0.803516,0.000000,0.0,0.00000,0.0,0.0,0.0,0.00000,0.0,0.000000,0.000000,0.0
32743,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.803516,0.000000,0.0,2.29767,0.0,0.0,0.0,0.00000,0.0,1.637228,0.000000,0.0
98279,0.0,0.000000,0.000000,0.000000,0.000000,0.952826,0.000000,0.0,0.000000,0.000000,0.0,0.00000,0.0,0.0,0.0,0.00000,0.0,0.000000,0.000000,0.0
65514,0.0,1.672677,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.803516,0.000000,0.0,0.00000,0.0,0.0,0.0,0.00000,0.0,0.000000,3.236167,0.0


In [36]:
# Cosine similarity between every candidate movie vector and the user vector
# (one row per candidate movie; the single column 0 is the user profile)
top_k_result_df = cos_sim_matrix(recomm_movie_tfidf_df, user_profile_df)
top_k_result_df

Unnamed: 0,0
1,0.467011
2,0.364098
3,0.730954
4,0.776670
5,0.347630
...,...
163809,0.379197
32743,0.131117
98279,0.347630
65514,0.198666


In [40]:
# Top-k recommendation for the example user and hit check against the held-out movie
# 'level_0' (produced by reset_index) holds the candidate movie ids
k = 10
top_k_result_df = top_k_result_df.sort_values(by=0, ascending=False)
top_k_list = top_k_result_df.head(k).reset_index()['level_0'].tolist()

# The single rating that was removed from the user's data earlier = test item
_df_test = _df.loc[drop_indices.tolist(), :]
test_movie_id = _df_test['movieId'].iloc[0]

print("test 선호 영화: ", test_movie_id)
print("top k 추천: ", top_k_list)
print("hit 여부: ", test_movie_id in top_k_list)

test 선호 영화:  72407
top k 추천:  [852, 3270, 351, 50802, 6993, 105213, 6837, 6788, 6765, 116724]
hit 여부:  False


### simple user vector 사용 추천

In [46]:
# Leave-one-out hit ratio with the simple (mean) user vector
k=20
hit = 0

# Seeded generator: the rating held out for each user is reproducible across runs
rng = np.random.default_rng(42)

for user_id in tqdm(user_id_list):

  # Ratings of this user
  _df = ratings_df[ratings_df['userId'] == user_id]

  # Train/test split as before: hold out one randomly chosen rating
  drop_indices = rng.choice(_df.index, 1, replace=False)
  _df_train = _df.drop(drop_indices)

  # User vector = mean TF-IDF vector of the movies rated in the training part
  rated_movie_list = _df_train['movieId'].tolist()
  user_profile_df = tfidf_df.loc[rated_movie_list, :].mean().to_frame().T

  # Movies this user already rated (training part) are not recommended again
  rated_movie_set = set(rated_movie_list)
  recomm_movie_list = list(movie_id_set - rated_movie_set)
  recomm_movie_tfidf_df = tfidf_df.loc[recomm_movie_list, :]


  # Cosine similarity between candidate movie vectors and the user vector, then take the top k
  top_k_result_df = cos_sim_matrix(recomm_movie_tfidf_df, user_profile_df)
  top_k_result_df = top_k_result_df.sort_values(by=0, ascending=False)
  top_k_list = top_k_result_df[:k].reset_index()['level_0'].values.tolist()

  # Hit for this user: is the held-out movie among the top-k recommendations?
  _df_test = _df.loc[drop_indices.tolist(), :]
  test_movie_id = _df_test['movieId'].values[0]

  hit += test_movie_id in top_k_list

# Fraction of users whose held-out movie was recommended
hit_ratio = hit / len(user_id_list)
print(f'\n\n hit_ratio: {hit_ratio}')

100%|██████████| 610/610 [00:11<00:00, 53.53it/s]



 hit_ratio: 0.01639344262295082





### weighted average user vector 사용 추천 (variant)

In [60]:
# Leave-one-out hit ratio with the rating-weighted user vector
k=20
hit=0

# Seeded generator: the rating held out for each user is reproducible across runs
rng = np.random.default_rng(42)

for user_id in tqdm(user_id_list):

  # Ratings of this user
  _df = ratings_df[ratings_df['userId'] == user_id]

  # Train/test split: hold out one randomly chosen rating
  drop_indices = rng.choice(_df.index, 1, replace=False)
  _df_train = _df.drop(drop_indices)

  # User vector (variant): rating-weighted average of the rated movies' TF-IDF vectors
  # 1) take the TF-IDF vectors of the movies the user rated
  # 2) sum them weighted by the rating the user gave each movie
  # 3) divide every component by the sum of those ratings

  rated_movie_list = _df_train['movieId'].tolist()
  # (n_genres, n_rated) x (n_rated,) -> (n_genres,)
  numerator = np.matmul(tfidf_df.loc[rated_movie_list].to_numpy().T, _df_train['rating'].to_numpy())
  denominator = _df_train['rating'].sum()
  user_profile_df = pd.DataFrame([numerator], columns=tfidf_df.columns) / denominator

  # Movies this user already rated (training part) are not recommended again
  rated_movie_set = set(rated_movie_list)
  recomm_movie_list = list(movie_id_set - rated_movie_set)
  recomm_movie_tfidf_df = tfidf_df.loc[recomm_movie_list, :]

  # Cosine similarity between candidate movie vectors and the user vector, then take the top k
  top_k_result_df = cos_sim_matrix(recomm_movie_tfidf_df, user_profile_df)
  top_k_result_df = top_k_result_df.sort_values(by=0, ascending=False)
  top_k_list = top_k_result_df[:k].reset_index()['level_0'].values.tolist()

  # Hit for this user: is the held-out movie among the top-k recommendations?
  _df_test = _df.loc[drop_indices.tolist(), :]
  test_movie_id = _df_test['movieId'].values[0]

  hit += test_movie_id in top_k_list

# Fraction of users whose held-out movie was recommended
hit_ratio = hit / len(user_id_list)
print(f'\n\n hit_ratio: {hit_ratio}')

100%|██████████| 610/610 [00:11<00:00, 54.18it/s]



 hit_ratio: 0.01639344262295082





## 평점 예측 문제
- 예측 문제의 경우 추천 결과를 생성하지 않고 테스트 데이터의 평점을 직접 예측합니다
- rmse, mae 같은 지표로 성능을 평가합니다

In [62]:
# Movie-to-movie similarity matrix (movie_sim_df): check its shape
movie_sim_df.shape

(9742, 9742)

In [64]:
# Split all ratings 80% / 20% into train and test; random_state fixes the split
train_df, test_df = train_test_split(ratings_df, test_size=0.2, random_state=10)

In [65]:
# Number of ratings in each part of the split
print('train 데이터 개수:', len(train_df))
print('test 데이터 개수:', len(test_df))

train 데이터 개수: 80668
test 데이터 개수: 20168


In [66]:
# Distinct users that have at least one rating in the test data, and how many there are
test_user_list = test_df['userId'].unique()
len(test_user_list)

610

### rmse로 추천 성능 평가

- 사용자가 과거에 봤던 아이템을 기반으로 유저 프로파일이 만들어지는 원리는 동일합니다
- 예측 평점을 구하는 방법은 다음과 같습니다
    
    1. train / test 데이터로 평점 데이터를 나눕니다.
    2. 사용자 u에 대해서 train 데이터에 있는 영화들(M)과 test 데이터에 있는 예측하려는 영화(m) 사이의 유사도를 구합니다.
    3. m의 예측 평점은 M과 m 사이의 유사도를 가중치로 사용한 M의 평점들의 가중 평균입니다. (아래 구현에서는 분모인 유사도 합에 1을 더해 계산합니다.)

In [None]:
# 먼저 사용자 1명에 대해서 추천을 수행하고 성능을 평가해봅시다.
# train/test 데이터는 ratings_df를 사용합니다.
# 아이템 프로파일인 tfidf vector는 이미 위에서 모두 생성하였습니다.

In [69]:
# Training ratings of the example user, indexed by movieId (the column is kept too) and sorted
user_id = 10
is_user_row = train_df['userId'] == user_id
user_train_df = train_df[is_user_row].set_index('movieId', drop=False).sort_index()
user_train_df

Unnamed: 0_level_0,userId,movieId,rating,timestamp
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
296,10,296,1.0,1455303387
356,10,356,3.5,1455301685
588,10,588,4.0,1455306173
597,10,597,3.5,1455357645
912,10,912,4.0,1455302254
...,...,...,...,...
113275,10,113275,4.5,1455357698
113394,10,113394,4.0,1455398275
136020,10,136020,5.0,1455302192
137595,10,137595,4.0,1455356898


In [70]:
# Item-similarity rows of the movies user 10 rated in the training data
# (one row per rated movie, one column per movie in movie_sim_df)
user_sim_df = movie_sim_df.loc[user_train_df['movieId']]
user_sim_df

Unnamed: 0,1,2,3,4,5,6,7,8,9,10,...,193565,193567,193571,193573,193579,193581,193583,193585,193587,193609
296,0.060278,0.000000,0.151290,0.240932,0.324545,0.766315,0.151290,0.000000,0.000000,0.293930,...,0.075820,0.076312,0.424541,0.000000,0.0,0.073420,0.079999,0.273688,0.000000,0.324545
356,0.045248,0.000000,0.522619,0.561551,0.243624,0.000000,0.522619,0.000000,0.000000,0.000000,...,0.056915,0.057285,0.318687,0.000000,0.0,0.055114,0.060052,0.205448,0.000000,0.243624
588,0.694848,0.482628,0.079391,0.073887,0.170309,0.000000,0.079391,0.602876,0.000000,0.239976,...,0.375435,0.475041,0.130194,0.494659,0.0,0.363552,0.396129,0.000000,0.423341,0.170309
597,0.086580,0.000000,1.000000,0.930671,0.466160,0.000000,1.000000,0.000000,0.000000,0.000000,...,0.108904,0.000000,0.356361,0.000000,0.0,0.105457,0.114907,0.000000,0.000000,0.466160
912,0.000000,0.000000,0.808480,0.900989,0.000000,0.000000,0.808480,0.000000,0.000000,0.000000,...,0.000000,0.113222,0.261775,0.000000,0.0,0.000000,0.000000,0.406062,0.000000,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
113275,0.141984,0.000000,0.356361,0.567512,0.764462,0.000000,0.356361,0.000000,0.000000,0.000000,...,0.178593,0.179752,1.000000,0.000000,0.0,0.172940,0.188437,0.644669,0.000000,0.764462
113394,0.141984,0.000000,0.356361,0.567512,0.764462,0.000000,0.356361,0.000000,0.000000,0.000000,...,0.178593,0.179752,1.000000,0.000000,0.0,0.172940,0.188437,0.644669,0.000000,0.764462
136020,0.241259,0.293761,0.000000,0.000000,0.000000,0.678782,0.000000,0.366952,0.496414,0.665766,...,0.203588,0.000000,0.000000,0.000000,0.0,0.197144,0.000000,0.000000,0.256778,0.000000
137595,0.141984,0.000000,0.356361,0.567512,0.764462,0.000000,0.356361,0.000000,0.000000,0.000000,...,0.178593,0.179752,1.000000,0.000000,0.0,0.172940,0.188437,0.644669,0.000000,0.764462


In [71]:
# Ratings the user gave to those training movies (single 'rating' column, indexed by movieId)
user_rating_df = user_train_df[['rating']]
user_rating_df

Unnamed: 0_level_0,rating
movieId,Unnamed: 1_level_1
296,1.0
356,3.5
588,4.0
597,3.5
912,4.0
...,...
113275,4.5
113394,4.0
136020,5.0
137595,4.0


In [74]:
# Predicted rating of every movie, using the similarities as weights:
#   sum over rated movies of (similarity * rating) / (sum of similarities + 1)
# user_sim_sum: for each movie, the sum of its similarities to the user's rated movies
user_sim_sum = np.sum(user_sim_df.T.to_numpy(), -1)
# NOTE: 1 is added to the similarity sum in the denominator, so this is not an exact weighted mean
pred_rating = np.matmul(user_sim_df.T.to_numpy(), user_rating_df.to_numpy()).flatten() / (user_sim_sum + 1)
pred_df = pd.DataFrame(pred_rating, index=movie_sim_df.index).reset_index()
# After reset_index the movie ids are in 'level_0' and the predictions in column 0
pred_df = pred_df.rename(columns={'level_0' : 'movieId', 0 : 'predict_rating'})
pred_df

Unnamed: 0,movieId,predict_rating
0,1,3.379647
1,2,3.332194
2,3,3.215757
3,4,3.200339
4,5,3.146149
...,...,...
9737,193581,3.327307
9738,193583,3.299837
9739,193585,2.999658
9740,193587,3.416861


In [78]:
# Compare with the true ratings: join the predictions with this user's test rows on movieId

result_df = pd.merge(pred_df, test_df[test_df['userId'] == user_id], on='movieId')
result_df

Unnamed: 0,movieId,predict_rating,userId,rating,timestamp
0,1907,3.284974,10,4.0,1455306183
1,4993,3.301527,10,4.0,1455356385
2,5066,3.194188,10,3.0,1455399329
3,5377,3.200339,10,3.5,1455301898
4,6535,3.146149,10,4.0,1455398379
5,8533,3.194188,10,5.0,1455301847
6,8970,2.999658,10,1.0,1455398160
7,31685,3.215757,10,4.5,1455357602
8,33145,3.200339,10,3.0,1455398153
9,33679,3.282119,10,3.0,1455357626


In [79]:
# RMSE (square root of the mean squared error) and MAE between true and predicted ratings
rmse = np.sqrt(mean_squared_error(y_true=result_df['rating'].values, y_pred=result_df['predict_rating'].values))
mae = mean_absolute_error(result_df['rating'].values, result_df['predict_rating'].values)

print(f'rmse : {rmse}')
print(f'mae : {mae}')

rmse : 1.1342882791670417
mae : 0.8684869962217949


### 전체 유저에 대해서 예측

In [82]:
# Predict the test ratings of every user, then evaluate over all of them at once
user_result_frames = []   # one (prediction vs. true rating) frame per user

for user_id in tqdm(user_id_list):

  # Training ratings of this user, indexed and sorted by movieId
  user_train_df = train_df[train_df['userId'] == user_id]
  user_train_df.index = user_train_df['movieId']
  user_train_df = user_train_df.sort_index()
  # Similarity rows of the movies this user rated in the training data
  user_sim_df = movie_sim_df.loc[user_train_df['movieId']]

  user_rating_df = user_train_df[['rating']]

  # Similarity-weighted sum of the user's ratings divided by (sum of similarities + 1)
  user_sim_sum = np.sum(user_sim_df.T.to_numpy(), -1)
  pred_rating = np.matmul(user_sim_df.T.to_numpy(), user_rating_df.to_numpy()).flatten() / (user_sim_sum + 1)
  pred_df = pd.DataFrame(pred_rating, index=movie_sim_df.index).reset_index()
  pred_df = pred_df.rename(columns={'level_0' : 'movieId', 0 : 'predict_rating'})

  # Keep only the movies that appear in this user's test data
  temp_df = pd.merge(pred_df, test_df[test_df['userId'] == user_id], on='movieId')
  user_result_frames.append(temp_df)

# Concatenate once after the loop: growing a DataFrame with pd.concat inside the loop
# re-copies all earlier rows on every iteration
result_df = pd.concat(user_result_frames, axis=0)

rmse = np.sqrt(mean_squared_error(y_true=result_df['rating'].values, y_pred=result_df['predict_rating'].values))
mae = mean_absolute_error(result_df['rating'].values, result_df['predict_rating'].values)

print('\n\n')
print(f'rmse : {rmse}')
print(f'mae : {mae}')

100%|██████████| 610/610 [00:18<00:00, 33.83it/s]




rmse : 0.9551243628312982
mae : 0.7485065244915764



