## 콘텐츠 기반 추천시스템 구현하기
Dataset : MovieLens Latest Small (ml-latest-small) dataset


약 600명의 사용자가 약 9,000편의 영화에 매긴 약 100,000개의 평점과 약 3,600건의 태그로 구성된 데이터 (최종 업데이트: 2018년 9월)


In [None]:
# Mount Google Drive inside the Colab runtime so files stored there can be
# opened through ordinary filesystem paths.
from google.colab import drive

mount_point = '/content/drive'
drive.mount(mount_point)

Mounted at /content/drive


In [None]:
import pandas as pd
import numpy as np

In [None]:
# Load the movies, ratings and tags tables from the same dataset directory.
data_dir = '/content/drive/MyDrive/Recsys/ml-latest-small'
movie_dataset, rate_dataset, tag_dataset = (
    pd.read_csv(f'{data_dir}/{table}.csv')
    for table in ('movies', 'ratings', 'tags')
)

In [None]:
# Print the first rows of each table, in the order movies, ratings, tags.
for frame in (movie_dataset, rate_dataset, tag_dataset):
    print(frame.head())

   movieId                               title  \
0        1                    Toy Story (1995)   
1        2                      Jumanji (1995)   
2        3             Grumpier Old Men (1995)   
3        4            Waiting to Exhale (1995)   
4        5  Father of the Bride Part II (1995)   

                                        genres  
0  Adventure|Animation|Children|Comedy|Fantasy  
1                   Adventure|Children|Fantasy  
2                               Comedy|Romance  
3                         Comedy|Drama|Romance  
4                                       Comedy  
   userId  movieId  rating  timestamp
0       1        1     4.0  964982703
1       1        3     4.0  964981247
2       1        6     4.0  964982224
3       1       47     5.0  964983815
4       1       50     5.0  964982931
   userId  movieId              tag   timestamp
0       2    60756            funny  1445714994
1       2    60756  Highly quotable  1445714996
2       2    60756     will ferre

데이터 전처리

In [None]:
# Lower-case every tag so that the same word in different cases is one token.
# Missing tags are left as NaN instead of being stringified: astype(str) would
# otherwise turn them into the literal word "nan", which would become a token.
raw_tags = tag_dataset['tag']
tag_dataset['tag'] = raw_tags.where(raw_tags.isna(), raw_tags.astype(str).str.lower())

# Join all tags that share a movieId into one space-separated string.
movie_tags = (
    tag_dataset.dropna(subset=['tag'])
    .groupby('movieId')['tag']
    .agg(' '.join)
    .reset_index()
)

# Drop columns produced by a previous run of this cell so it can be re-executed:
# merging a second time would otherwise create tag_x / tag_y and break the
# movie_dataset['tag'] lookup below.
movie_dataset = movie_dataset.drop(columns=['tag', 'content'], errors='ignore')

# Attach the joined tags to the movies by movieId; movies without tags are kept.
movie_dataset = movie_dataset.merge(movie_tags, on='movieId', how='left')

# Movies that have no tags get an empty string rather than NaN.
movie_dataset['tag'] = movie_dataset['tag'].fillna('')

# Combine genres and tags into the single text field that gets vectorised.
movie_dataset['content'] = movie_dataset['genres'] + ' ' + movie_dataset['tag']

In [None]:
# Preview the combined genres + tags text for the first few movies.
content_preview = movie_dataset['content'].head()
print(content_preview)

0    Adventure|Animation|Children|Comedy|Fantasy pi...
1    Adventure|Children|Fantasy fantasy magic board...
2                             Comedy|Romance moldy old
3                                Comedy|Drama|Romance 
4                              Comedy pregnancy remake
Name: content, dtype: object


TF-IDF 벡터화

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

# A token is any run of characters that contains neither '|' nor a space, so
# the pipe-separated genres and the space-separated tag words are split alike.
content_token_pattern = r'[^| ]+'
tfidf = TfidfVectorizer(token_pattern=content_token_pattern)

# One TF-IDF row per movie, in the same row order as movie_dataset.
content_corpus = movie_dataset['content']
tfidf_matrix = tfidf.fit_transform(content_corpus)

In [None]:
# Show the TF-IDF matrix; for this sparse matrix the output is a summary header
# followed by (row, column) coordinates and their values.
print(tfidf_matrix)

<Compressed Sparse Row sparse matrix of dtype 'float64'
	with 27219 stored elements and shape (9742, 1784)>
  Coords	Values
  (0, 48)	0.14923127385752047
  (0, 90)	0.18490545901063682
  (0, 302)	0.18038846066758712
  (0, 346)	0.09576778805506665
  (0, 568)	0.17300098313924464
  (0, 1217)	0.8416771249174413
  (0, 629)	0.40432512831075756
  (1, 48)	0.11462426795143799
  (1, 302)	0.13855604603797955
  (1, 568)	0.26576347617521395
  (1, 980)	0.30552767912025
  (1, 201)	0.3577865894015698
  (1, 637)	0.6850036759600162
  (1, 1347)	0.33165713426090987
  (1, 1753)	0.31056139004241473
  (2, 346)	0.14088088145157518
  (2, 1355)	0.2026249156281861
  (2, 1052)	0.6852356240039252
  (2, 1150)	0.6852356240039252
  (3, 346)	0.5048999073185984
  (3, 1355)	0.7261829999003466
  (3, 486)	0.4666201177032598
  (4, 346)	0.17051503723707886
  (4, 1249)	0.7199029622585923
  (4, 1319)	0.6728033494323146
  :	:
  (9731, 1397)	0.5914233380250173
  (9732, 90)	0.2229200478947716
  (9732, 346)	0.23091324630695412
  (

사용자 프로파일

In [None]:
# ID of the user whose profile and recommendations are computed in the cells below.
user_id = 1

In [None]:
# Keep the ratings where this user gave 4.0 or higher.
is_target_user = rate_dataset['userId'] == user_id
is_high_rating = rate_dataset['rating'] >= 4.0
liked_movies = rate_dataset.loc[is_target_user & is_high_rating]

# Look up the movie_dataset index labels of those movies by movieId.
liked_movie_ids = liked_movies['movieId']
is_liked = movie_dataset['movieId'].isin(liked_movie_ids)
user_liked_movie = movie_dataset.loc[is_liked].index

In [None]:
# Build the user profile as the mean TF-IDF vector of the movies the user liked.
# With no liked movies there is nothing to average, so fail early with a clear
# message instead of passing an undefined mean on to the similarity step.
if len(user_liked_movie) == 0:
    raise ValueError(f"user {user_id} has no liked movies to build a profile from")

user_profile = tfidf_matrix[user_liked_movie].mean(axis=0)
user_profile = np.asarray(user_profile)  # np.matrix -> ndarray conversion (works around a version issue)

유사도 검사


코사인 유사도

In [None]:
from sklearn.metrics.pairwise import cosine_similarity

# Cosine similarity between the single profile row and every movie row,
# flattened to a 1-D array with one score per movie.
profile_row = user_profile.reshape(1, -1)
similarity_matrix = cosine_similarity(profile_row, tfidf_matrix)
user_similarities = similarity_matrix.flatten()

In [None]:
# 5) Rank movies by similarity to the user profile (descending), leaving out
#    every movie the user has already rated so only unseen titles are recommended.
rated_movie_ids = rate_dataset.loc[rate_dataset['userId'] == user_id, 'movieId']
already_rated = movie_dataset['movieId'].isin(rated_movie_ids).to_numpy()

ranked_indices = user_similarities.argsort()[::-1]
# Mask the ranked positions; the relative order of the remaining movies is unchanged.
recommended_indices = ranked_indices[~already_rated[ranked_indices]]

In [None]:
# 6) Print the results (e.g. the top 10 recommended movies).
top_positions = recommended_indices[:10]
display_columns = ['title', 'genres', 'content']
recommended_movies = movie_dataset[display_columns].iloc[top_positions]
print(recommended_movies)

                                        title  \
8597    Dragonheart 2: A New Beginning (2000)   
478                  Super Mario Bros. (1993)   
6570                Hunting Party, The (2007)   
1480                      Goonies, The (1985)   
2572  Teenage Mutant Ninja Turtles III (1993)   
7374        Sorcerer's Apprentice, The (2010)   
9394                      Maximum Ride (2016)   
4681           The Great Train Robbery (1978)   
4005                         Flashback (1990)   
3526                      Extreme Days (2001)   

                                               genres  \
8597   Action|Adventure|Comedy|Drama|Fantasy|Thriller   
478   Action|Adventure|Children|Comedy|Fantasy|Sci-Fi   
6570           Action|Adventure|Comedy|Drama|Thriller   
1480         Action|Adventure|Children|Comedy|Fantasy   
2572         Action|Adventure|Children|Comedy|Fantasy   
7374         Action|Adventure|Children|Comedy|Fantasy   
9394  Action|Adventure|Comedy|Fantasy|Sci-Fi|Thriller   
4681