# 13. 아이유팬이 좋아할 만한 다른 아티스트 찾기

|평가문항|상세기준|
|---|---|
|1. CSR matrix가 정상적으로 만들어졌다.|사용자와 아이템 개수를 바탕으로 정확한 사이즈로 만들었다.|
|2. MF 모델이 정상적으로 훈련되어 그럴듯한 추천이 이루어졌다.|사용자와 아이템 벡터 내적수치가 의미있게 형성되었다.|
|3. 비슷한 영화 찾기와 유저에게 추천하기의 과정이 정상적으로 진행되었다.|MF모델이 예측한 유저 선호도 및 아이템간 유사도, 기여도를 측정하고 의미를 분석해보았다.|

In [1]:
import os
import pandas as pd
import numpy as np
import scipy
import implicit

print(np.__version__)
print(scipy.__version__)
print(implicit.__version__)

1.23.5
1.9.3
0.6.2


  from .autonotebook import tqdm as notebook_tqdm


In [2]:
rating_file_path=os.getcwd() + '/ratings.dat'
ratings_cols = ['user_id', 'movie_id', 'ratings', 'timestamp']
ratings = pd.read_csv(rating_file_path, sep='::', names=ratings_cols, engine='python', encoding = "ISO-8859-1")
orginal_data_size = len(ratings)
ratings.head()

Unnamed: 0,user_id,movie_id,ratings,timestamp
0,1,1193,5,978300760
1,1,661,3,978302109
2,1,914,3,978301968
3,1,3408,4,978300275
4,1,2355,5,978824291


In [3]:
# 3점 이상만 남깁니다.
ratings = ratings[ratings['ratings']>=3]
filtered_data_size = len(ratings)

print(f'orginal_data_size: {orginal_data_size}, filtered_data_size: {filtered_data_size}')
print(f'Ratio of Remaining Data is {filtered_data_size / orginal_data_size:.2%}')

orginal_data_size: 1000209, filtered_data_size: 836478
Ratio of Remaining Data is 83.63%


In [4]:
# ratings 컬럼의 이름을 counts로 바꿉니다.
ratings.rename(columns={'ratings':'counts'}, inplace=True)
ratings['counts']

0          5
1          3
2          3
3          4
4          5
          ..
1000203    3
1000205    5
1000206    5
1000207    4
1000208    4
Name: counts, Length: 836478, dtype: int64

In [5]:
# 영화 제목을 보기 위해 메타 데이터를 읽어옵니다.
movie_file_path=os.getcwd() + '/movies.dat'
cols = ['movie_id', 'title', 'genre'] 
movies = pd.read_csv(movie_file_path, sep='::', names=cols, engine='python', encoding='ISO-8859-1')
movies.head()

Unnamed: 0,movie_id,title,genre
0,1,Toy Story (1995),Animation|Children's|Comedy
1,2,Jumanji (1995),Adventure|Children's|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama
4,5,Father of the Bride Part II (1995),Comedy


In [6]:
genre_col = movies['genre']

# "|" 기호를 기준으로 문자열 분할
genre_col_split = genre_col.str.split("|")

# 분할된 문자열을 새로운 열로 추가
movies['genre_1'] = genre_col_split.str[0]
movies['genre_2'] = genre_col_split.str[1]
movies['genre_3'] = genre_col_split.str[2]
movies.drop('genre',axis=1,inplace=True)
# 결과 확인
movies.head()

Unnamed: 0,movie_id,title,genre_1,genre_2,genre_3
0,1,Toy Story (1995),Animation,Children's,Comedy
1,2,Jumanji (1995),Adventure,Children's,Fantasy
2,3,Grumpier Old Men (1995),Comedy,Romance,
3,4,Waiting to Exhale (1995),Comedy,Drama,
4,5,Father of the Bride Part II (1995),Comedy,,


In [7]:
# genre_1, genre_2, genre_3 열에서 중복을 제거하고 모든 장르를 하나의 리스트로 만듦
genre_list = movies['genre_1'].tolist() + movies['genre_2'].tolist() + movies['genre_3'].tolist()
genre_list = list(set(genre_list))

# 장르 이름에 대한 정수 인코딩 매핑 딕셔너리를 만듦
genre2idx = {genre: idx for idx, genre in enumerate(genre_list)}

# 장르 이름을 정수로 인코딩하여 새로운 열에 추가
movies['genre_1_code'] = movies['genre_1'].map(genre2idx)
movies['genre_2_code'] = movies['genre_2'].map(genre2idx)
movies['genre_3_code'] = movies['genre_3'].map(genre2idx)

# 결과 확인
movies

Unnamed: 0,movie_id,title,genre_1,genre_2,genre_3,genre_1_code,genre_2_code,genre_3_code
0,1,Toy Story (1995),Animation,Children's,Comedy,13,2,0
1,2,Jumanji (1995),Adventure,Children's,Fantasy,10,2,6
2,3,Grumpier Old Men (1995),Comedy,Romance,,0,5,7
3,4,Waiting to Exhale (1995),Comedy,Drama,,0,11,7
4,5,Father of the Bride Part II (1995),Comedy,,,0,7,7
...,...,...,...,...,...,...,...,...
3878,3948,Meet the Parents (2000),Comedy,,,0,7,7
3879,3949,Requiem for a Dream (2000),Drama,,,11,7,7
3880,3950,Tigerland (2000),Drama,,,11,7,7
3881,3951,Two Family House (2000),Drama,,,11,7,7


In [8]:
ratings=ratings[['user_id','movie_id','counts']]
ratings[ratings['user_id']==1]

Unnamed: 0,user_id,movie_id,counts
0,1,1193,5
1,1,661,3
2,1,914,3
3,1,3408,4
4,1,2355,5
5,1,1197,3
6,1,1287,5
7,1,2804,5
8,1,594,4
9,1,919,4


In [9]:
print(ratings.shape)
print(movies.shape)

(836478, 3)
(3883, 8)


In [10]:
ratings['user_id_idx'] = pd.factorize(ratings['user_id'])[0]
ratings['movie_id_idx'] = pd.factorize(ratings['movie_id'])[0]

movies['movie_id_idx'] = movies['movie_id'].map(dict(zip(ratings['movie_id'], ratings['movie_id_idx'])))

In [11]:
ratings.head()

Unnamed: 0,user_id,movie_id,counts,user_id_idx,movie_id_idx
0,1,1193,5,0,0
1,1,661,3,0,1
2,1,914,3,0,2
3,1,3408,4,0,3
4,1,2355,5,0,4


In [12]:
movies.head()

Unnamed: 0,movie_id,title,genre_1,genre_2,genre_3,genre_1_code,genre_2_code,genre_3_code,movie_id_idx
0,1,Toy Story (1995),Animation,Children's,Comedy,13,2,0,40.0
1,2,Jumanji (1995),Adventure,Children's,Fantasy,10,2,6,513.0
2,3,Grumpier Old Men (1995),Comedy,Romance,,0,5,7,1862.0
3,4,Waiting to Exhale (1995),Comedy,Drama,,0,11,7,397.0
4,5,Father of the Bride Part II (1995),Comedy,,,0,7,7,1180.0


In [13]:
movies.isnull().sum()

movie_id           0
title              0
genre_1            0
genre_2         2025
genre_3         3347
genre_1_code       0
genre_2_code       0
genre_3_code       0
movie_id_idx     255
dtype: int64

In [14]:
movies[movies['movie_id_idx'].isnull()]

Unnamed: 0,movie_id,title,genre_1,genre_2,genre_3,genre_1_code,genre_2_code,genre_3_code,movie_id_idx
50,51,Guardian Angel (1994),Action,Drama,Thriller,9,11,14,
107,109,Headless Body in Topless Bar (1995),Comedy,,,0,7,7,
113,115,Happiness Is in the Field (1995),Comedy,,,0,7,7,
125,127,"Silence of the Palace, The (Saimt el Qusur) (1...",Drama,,,11,7,7,
131,133,Nueba Yol (1995),Comedy,Drama,,0,11,7,
...,...,...,...,...,...,...,...,...,...
3759,3829,Mad About Mambo (2000),Comedy,Romance,,0,5,7,
3786,3856,Autumn Heart (1999),Drama,,,11,7,7,
3821,3891,Turn It Up (2000),Crime,Drama,,1,11,7,
3834,3904,"Uninvited Guest, An (2000)",Drama,,,11,7,7,


In [15]:
ratings[ratings['movie_id_idx']==40]

Unnamed: 0,user_id,movie_id,counts,user_id_idx,movie_id_idx
40,1,1,5,0,40
469,6,1,4,5,40
581,8,1,4,7,40
711,9,1,5,8,40
837,10,1,5,9,40
...,...,...,...,...,...
997248,6022,1,5,6020,40
997541,6025,1,5,6023,40
998170,6032,1,4,6030,40
998360,6035,1,4,6033,40


In [16]:
genres = movies[['movie_id', 'genre_1_code', 'genre_2_code', 'genre_3_code']]
merged = pd.merge(ratings, genres, on='movie_id', how='left')
merged

Unnamed: 0,user_id,movie_id,counts,user_id_idx,movie_id_idx,genre_1_code,genre_2_code,genre_3_code
0,1,1193,5,0,0,11,7,7
1,1,661,3,0,1,13,2,3
2,1,914,3,0,2,3,5,7
3,1,3408,4,0,3,11,7,7
4,1,2355,5,0,4,13,2,0
...,...,...,...,...,...,...,...,...
836473,6040,1090,3,6038,1030,11,4,7
836474,6040,1094,5,6038,986,11,5,4
836475,6040,562,5,6038,311,0,11,7
836476,6040,1096,4,6038,142,11,7,7


In [17]:
from scipy.sparse import csr_matrix

user_to_idx = ratings['user_id_idx']
movie_to_idx = ratings['movie_id_idx']

num_user = ratings['user_id_idx'].nunique()
num_movie = ratings['movie_id_idx'].nunique()

data = np.array(ratings['counts'])

csr_data = csr_matrix((data, (user_to_idx, movie_to_idx)), shape=(num_user, num_movie))
csr_data


<6039x3628 sparse matrix of type '<class 'numpy.int64'>'
	with 836478 stored elements in Compressed Sparse Row format>

In [20]:
from implicit.als import AlternatingLeastSquares

# implicit 라이브러리에서 권장하고 있는 부분입니다. 학습 내용과는 무관합니다.
os.environ['OPENBLAS_NUM_THREADS']='1'
os.environ['KMP_DUPLICATE_LIB_OK']='True'
os.environ['MKL_NUM_THREADS']='1'

In [21]:
# Implicit AlternatingLeastSquares 모델의 선언
als_model = AlternatingLeastSquares(factors=100, regularization=0.01, use_gpu=False, iterations=15, dtype=np.float32)

<3628x6039 sparse matrix of type '<class 'numpy.int64'>'
	with 836478 stored elements in Compressed Sparse Column format>

In [22]:
# 모델 훈련
als_model.fit(csr_data)

100%|██████████| 15/15 [00:09<00:00,  1.55it/s]


lms 예제에서는 csr_data를 item X user 꼴의 matrix를 만들기 위해 Transpose를 취해서 학습을 시켯으나  
지금은 user X movie_id의 형태이므로 Transpose를 취할 이유가 없기 때문에 csr_data 형태 그대로 학습을 진행하였다.

In [23]:
exam, exam_peas = user_to_idx[1], movie_to_idx[1]
exam_vector, exam_peas_vector = als_model.user_factors[exam], als_model.item_factors[exam_peas]
exam_vector

array([ 0.11308312,  0.20748425, -0.70742697,  2.56326   ,  2.1351666 ,
       -1.2181388 , -0.21138401, -0.06221412,  1.7183218 ,  1.5865915 ,
       -1.1624329 , -1.427987  ,  1.1065447 ,  0.6304984 ,  1.0148435 ,
        2.3397913 , -0.04767592, -0.55821055,  1.1598129 , -1.2337168 ,
        0.07641721, -1.8291961 , -1.444429  ,  0.0884894 ,  1.5355394 ,
        1.7705578 ,  1.4241085 ,  0.96535534, -1.5052978 , -1.3693013 ,
        3.2263577 , -1.5069038 , -0.9621845 , -1.536803  ,  1.9995402 ,
        1.0133291 , -0.22973424, -0.15488465,  2.141314  , -0.30995598,
        1.7963873 ,  0.90549207,  1.1399453 , -0.31270903,  0.24602419,
        0.5422998 ,  2.6450503 , -0.2045513 ,  1.0134451 , -0.06360134,
        0.6448524 , -2.1638477 ,  0.33748317,  0.00724955,  1.0795144 ,
        0.0681527 ,  2.26966   , -1.1319168 ,  1.4216013 , -0.11577581,
        0.5618818 ,  0.71443695, -0.41755992, -2.026486  , -2.4871562 ,
       -0.73020446,  0.43586293,  0.69810444,  1.0466962 , -2.78

In [24]:
exam_peas_vector

array([-0.01480159,  0.03121143,  0.02548641,  0.00356119,  0.01514975,
       -0.00019845,  0.00865541,  0.02112992,  0.02329485,  0.05466151,
        0.00781976, -0.016457  ,  0.02680555, -0.00390273,  0.00238663,
       -0.00791083,  0.0057195 ,  0.0209788 ,  0.02561568, -0.007283  ,
        0.01502154,  0.00074509, -0.01925371,  0.00978601,  0.02266928,
        0.00286027, -0.00217171,  0.00925226,  0.00958812, -0.01207619,
       -0.01153665, -0.00590043,  0.00725504, -0.0091522 ,  0.00534624,
        0.01034476,  0.00874732,  0.00653183,  0.0181107 ,  0.01897374,
        0.00552965,  0.03069357,  0.00304163,  0.00150652,  0.01159961,
       -0.0007073 ,  0.01916448, -0.00505276, -0.01411222,  0.02693935,
        0.00073953, -0.02029527,  0.01081626,  0.00361592,  0.01105517,
        0.0015565 ,  0.00260683,  0.00054518,  0.02951918, -0.00519501,
       -0.01096479,  0.01854404, -0.00026959,  0.0091935 , -0.00078599,
        0.00117683,  0.00945898,  0.00272293,  0.02180519,  0.00

In [25]:
np.dot(exam_vector, exam_peas_vector)

0.6180721

In [26]:
favorite_movie_idx=movies[movies['title']=='Toy Story (1995)']['movie_id_idx'].values[0]
movie_id = movie_to_idx[favorite_movie_idx]
similar_movie_vec = als_model.similar_items(movie_id, N=15)
similar_movie_vec

(array([ 40,  50,   4,  33, 322, 110, 330,  20,  10, 255,  34, 126,  32,
        160,  22]),
 array([0.9999998 , 0.8164849 , 0.585326  , 0.58499205, 0.5526134 ,
        0.5316863 , 0.43574736, 0.4345397 , 0.4275212 , 0.40422603,
        0.35790074, 0.34685946, 0.34365398, 0.34333736, 0.32651454],
       dtype=float32))

In [28]:
def find_similar_movie(favor_movie):
    movie_id=int(movies[movies['title']==favor_movie]['movie_id_idx'].values[0])
    similar_movie_vec = als_model.similar_items(movie_id, N=15)
    
    movie_idxs=similar_movie_vec[0][1:6]
    movie_idxs=[movies[movies['movie_id_idx']==x]['movie_id'].values[0] for x in movie_idxs]
    movie_recomm=similar_movie_vec[1][1:6]
    
    movie_names=[]
    
    for idx in movie_idxs:
        movie_names.append(movies[movies['movie_id']==idx]['title'].values[0])
    
    return movie_names, movie_recomm

In [29]:
favorite_movie = 'Toy Story (1995)'
a,b = find_similar_movie(favorite_movie)

print(a)
print(b)

['Toy Story 2 (1999)', "Bug's Life, A (1998)", 'Aladdin (1992)', 'Babe (1995)', 'Groundhog Day (1993)']
[0.8164849  0.585326   0.58499205 0.5526134  0.5316863 ]


Toy Story와 비슷한 영화를 찾아보니 위와 같은 결과글 얻은 것을 알 수 있다.  
3위까지는 모두 디즈니 영화로 애니메이션으로 만든 영화로 Toy Story와 매우 유사한 영화이다.  
또한 Babe는 꼬마돼지 베이브라는 영화로 가족이 보기 좋은 영화로 유명하다.  
Groundhog Day는 사랑의 블랙홀이라는 영화로 로맨틱 코메디 영화로 유명하다.  

이정도면 유의미하게 예측을 잘 하였다고 생각된다.

In [47]:
def find_recommend_movie(user_ID):
    user_id=ratings[ratings['user_id']==user_ID]['user_id_idx'].values[0]
    recommend_movie_vec = als_model.recommend(user_id, csr_data[user_id], N=15)
    
    movie_idxs=recommend_movie_vec[0][:5]
    movie_idxs=[movies[movies['movie_id_idx']==x]['movie_id'].values[0] for x in movie_idxs]
    movie_recomm=recommend_movie_vec[1][:5]
    
    movie_names=[]
    
    for idx in movie_idxs:
        movie_names.append(movies[movies['movie_id']==idx]['title'].values[0])
    
    return movie_names, movie_recomm

def user_favorite_check(user_ID):
    favor_movie_idx=ratings[(ratings['user_id']==1) & (ratings['counts']==5)]['movie_id'].values
    favor_movie=[movies[movies['movie_id']==x]['title'].values[0] for x in favor_movie_idx]
    
    for i in favor_movie:
        print(i)

In [31]:
user_id = 1
a,b = find_recommend_movie(user_id)

print(a)
print(b)

['Lion King, The (1994)', 'Shawshank Redemption, The (1994)', 'Jungle Book, The (1967)', 'Fantasia (1940)', 'Little Mermaid, The (1989)']
[0.9996676 0.8083495 0.7803898 0.7619426 0.7271983]


In [48]:
user_favorite_check(1)

One Flew Over the Cuckoo's Nest (1975)
Bug's Life, A (1998)
Ben-Hur (1959)
Christmas Story, A (1983)
Beauty and the Beast (1991)
Sound of Music, The (1965)
Awakenings (1990)
Back to the Future (1985)
Schindler's List (1993)
Pocahontas (1995)
Last Days of Disco, The (1998)
Cinderella (1950)
Apollo 13 (1995)
Toy Story (1995)
Rain Man (1988)
Mary Poppins (1964)
Dumbo (1941)
Saving Private Ryan (1998)


1이라는 user_id를 대상으로 추천하는 영화를 골라보았다.  

라이온킹, 쇼생크 탈출, 정글북, 판타지아, 인어공주를 추천한 것을 볼 수 있다.  
그리고 해당 id를 가진 사람이 5점을 준 영화들을 확인해 본 결과, 위와 같은 결과를 얻을 수 있었다.

또한 해당 추천 목록의 영화들이 5점을 준 영화들과 비슷한 종류의 영화라는 것을 보면 유의미한 예측을 하는 것으로 생각된다.