In [1]:
import random
import pandas as pd
import numpy as np
import os, sys

from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import CountVectorizer

In [2]:
from google.colab import drive

# Connect the Colab runtime to Google Drive; the Drive contents become
# available under /content/drive.
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [3]:
# Load the dataset. The files come from the "ml-latest-small" MovieLens
# release (not the 32M release the earlier comment mentioned).
# Paths are anchored on the Drive mount point (/content/drive) so that this
# cell does not depend on the current working directory.
DATA_DIR = "/content/drive/MyDrive/ml-latest-small"

df_ratings = pd.read_csv(os.path.join(DATA_DIR, "ratings.csv"))
df_movies = pd.read_csv(os.path.join(DATA_DIR, "movies.csv"))
df_tags = pd.read_csv(os.path.join(DATA_DIR, "tags.csv"))

In [4]:
# Drop the unused timestamp column. Reassigning (instead of inplace=True) and
# passing errors='ignore' keeps the cell safe to re-run once the column is
# already gone; the in-place version raised KeyError on a second run.
df_ratings = df_ratings.drop(columns=['timestamp'], errors='ignore')

In [5]:
# Inspect the layout of the movie dataset: column index, then the first rows.
print("### Movie Dataset Format ###")
print()
print("Columns of Movie Dataset : ", df_movies.columns)
print()
print(df_movies.head())

### Movie Dataset Format ###

Columns of Movie Dataset :  Index(['movieId', 'title', 'genres'], dtype='object')

   movieId                               title  \
0        1                    Toy Story (1995)   
1        2                      Jumanji (1995)   
2        3             Grumpier Old Men (1995)   
3        4            Waiting to Exhale (1995)   
4        5  Father of the Bride Part II (1995)   

                                        genres  
0  Adventure|Animation|Children|Comedy|Fantasy  
1                   Adventure|Children|Fantasy  
2                               Comedy|Romance  
3                         Comedy|Drama|Romance  
4                                       Comedy  


In [6]:
# Count the distinct users and movies that appear in the ratings table.
n_users = len(df_ratings['userId'].unique())
n_items = len(df_ratings['movieId'].unique())
print("num users: {}, num items:{}".format(n_users, n_items))

num users: 610, num items:9724


In [7]:
# movie_rate maps movie_id -> [sum of ratings, number of ratings].
movie_rate = {}

for user_id, movie_id, rate in df_ratings.itertuples(index=False):
  # setdefault creates the [0, 0] accumulator the first time a movie is seen
  totals = movie_rate.setdefault(movie_id, [0, 0])
  totals[0] += rate
  totals[1] += 1

In [8]:
# movie_rate maps movie_id -> [mean rating rounded to 3 decimals, rating count].
# It is rebuilt from df_ratings rather than by dividing the running totals in
# place: the in-place version divided an already-averaged score by the count
# again whenever this cell was executed a second time.
# sort=False keeps movies in order of first appearance in df_ratings.
rating_stats = df_ratings.groupby('movieId', sort=False)['rating'].agg(['mean', 'count'])
movie_rate = {
  movie_id: [round(float(mean_rating), 3), int(n_ratings)]
  for movie_id, mean_rating, n_ratings in rating_stats.itertuples()
}

In [9]:
# Preprocessing
# Remap user ids and movie ids onto the contiguous ranges
# (0 .. n_users - 1) and (0 .. n_items - 1), in order of first appearance.

user_dict = dict()      # {user_id: user_idx}; user_id is the id in the original data, user_idx the new contiguous index
movie_dict = dict()     # {movie_id: movie_idx}; movie_id is the id in the original data, movie_idx the new contiguous index
ratings = np.zeros((n_users, n_items))   # dense user x movie matrix, 0 where no rating exists

# Columns are selected by name so the unpacking below does not depend on how
# many columns the frame currently has (e.g. if timestamp was not dropped).
for user_id, movie_id, rating in df_ratings[['userId', 'movieId', 'rating']].itertuples(index=False):
    # setdefault hands out the next free index the first time an id is seen
    u_idx = user_dict.setdefault(user_id, len(user_dict))
    m_idx = movie_dict.setdefault(movie_id, len(movie_dict))
    ratings[u_idx, m_idx] = rating

# Kept for backward compatibility: after the loop these equal the number of
# distinct users / movies, as the original running counters did.
user_idx = len(user_dict)
movie_idx = len(movie_dict)
user_idx_to_id = {v: k for k, v in user_dict.items()}

movie_idx_to_name = dict()
movie_idx_to_genre = dict()
# Explicit column selection keeps this loop working when the cell is re-run
# after df_movies has gained extra columns (score, count, weighted_score).
for movie_id, movie_name, movie_genre in df_movies[['movieId', 'title', 'genres']].itertuples(index=False):
    mapped_idx = movie_dict.get(movie_id)
    if mapped_idx is None:                      # movie never appears in the rating data -> skip
        continue
    movie_idx_to_name[mapped_idx] = movie_name
    movie_idx_to_genre[mapped_idx] = movie_genre


In [10]:
# Turn the pipe-separated genre list ("A|B|C") into a space-separated string
# ("A B C") in one vectorized pass. regex=False makes '|' a literal character
# rather than a regex alternation.
df_movies['genres'] = df_movies['genres'].str.replace('|', ' ', regex=False)

In [11]:
df_movies

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure Animation Children Comedy Fantasy
1,2,Jumanji (1995),Adventure Children Fantasy
2,3,Grumpier Old Men (1995),Comedy Romance
3,4,Waiting to Exhale (1995),Comedy Drama Romance
4,5,Father of the Bride Part II (1995),Comedy
...,...,...,...
9737,193581,Black Butler: Book of the Atlantic (2017),Action Animation Comedy Fantasy
9738,193583,No Game No Life: Zero (2017),Animation Comedy Fantasy
9739,193585,Flint (2017),Drama
9740,193587,Bungo Stray Dogs: Dead Apple (2018),Action Animation


In [13]:
# Reshape movie_rate ({movie_id: [score, count]}) into column lists that can
# be handed to a DataFrame constructor.
rates = {
  'movieId': list(movie_rate.keys()),
  'score': [stats[0] for stats in movie_rate.values()],
  'count': [stats[1] for stats in movie_rate.values()],
}

In [14]:
# One row per rated movie, with the column order spelled out explicitly.
scores = pd.DataFrame(rates, columns=['movieId', 'score', 'count'])
scores

Unnamed: 0,movieId,score,count
0,1,3.921,215
1,3,3.260,52
2,6,3.946,102
3,47,3.975,203
4,50,4.238,204
...,...,...,...
9719,160341,2.500,1
9720,160527,4.500,1
9721,160836,3.000,1
9722,163937,3.500,1


In [15]:
# Attach the per-movie score and rating count.
# Any score/count columns left over from an earlier run of this cell are
# dropped first; without that, a second run produced score_x/score_y and
# count_x/count_y columns and broke every later df_movies['count'] lookup.
# The default inner join keeps only movies that have at least one rating, and
# the merged frame gets a fresh 0..n-1 row index.
df_movies = pd.merge(
    df_movies.drop(columns=['score', 'count'], errors='ignore'),
    scores,
    on='movieId',
)

In [16]:
df_movies.head(4)

Unnamed: 0,movieId,title,genres,score,count
0,1,Toy Story (1995),Adventure Animation Children Comedy Fantasy,3.921,215
1,2,Jumanji (1995),Adventure Children Fantasy,3.432,110
2,3,Grumpier Old Men (1995),Comedy Romance,3.26,52
3,4,Waiting to Exhale (1995),Comedy Drama Romance,2.357,7


In [24]:
# m: the 0.89 quantile of the per-movie rating counts; it is also the default
# `m` of weighted_rating.
m = df_movies['count'].quantile(q=0.89)
# data: the movies whose rating count reaches that threshold.
has_enough_votes = df_movies['count'].ge(m)
data = df_movies[has_enough_votes]

In [25]:
m

24.0

In [26]:
df_movies

Unnamed: 0,movieId,title,genres,score,count
0,1,Toy Story (1995),Adventure Animation Children Comedy Fantasy,3.921,215
1,2,Jumanji (1995),Adventure Children Fantasy,3.432,110
2,3,Grumpier Old Men (1995),Comedy Romance,3.260,52
3,4,Waiting to Exhale (1995),Comedy Drama Romance,2.357,7
4,5,Father of the Bride Part II (1995),Comedy,3.071,49
...,...,...,...,...,...
9719,193581,Black Butler: Book of the Atlantic (2017),Action Animation Comedy Fantasy,4.000,1
9720,193583,No Game No Life: Zero (2017),Animation Comedy Fantasy,3.500,1
9721,193585,Flint (2017),Drama,3.500,1
9722,193587,Bungo Stray Dogs: Dead Apple (2018),Action Animation,3.500,1


In [27]:
data

Unnamed: 0,movieId,title,genres,score,count
0,1,Toy Story (1995),Adventure Animation Children Comedy Fantasy,3.921,215
1,2,Jumanji (1995),Adventure Children Fantasy,3.432,110
2,3,Grumpier Old Men (1995),Comedy Romance,3.260,52
4,5,Father of the Bride Part II (1995),Comedy,3.071,49
5,6,Heat (1995),Action Crime Thriller,3.946,102
...,...,...,...,...,...
9144,148626,"Big Short, The (2015)",Drama,3.962,26
9205,152081,Zootopia (2016),Action Adventure Animation Children Comedy,3.891,32
9374,164179,Arrival (2016),Sci-Fi,3.981,26
9415,166528,Rogue One: A Star Wars Story (2016),Action Adventure Fantasy Sci-Fi,3.926,27


In [28]:
# C: unweighted mean of the per-movie average scores in df_movies; it is the
# default `C` of weighted_rating.
C = df_movies['score'].mean()

In [29]:
print(C)
print(m)

3.262445701357466
24.0


In [30]:
def weighted_rating(x, m=m, C=C):
    """Blend a movie's own mean score with the global mean C.

    The result is ``v/(v+m) * R + m/(v+m) * C`` where ``v = x['count']`` and
    ``R = x['score']``: the more ratings a movie has relative to ``m``, the
    closer the result is to its own score; with few ratings it is pulled
    towards ``C``.

    Args:
        x: mapping-like object (e.g. a DataFrame row) with 'count' and
            'score' entries.
        m: weight given to C, expressed as a number of ratings. The default
            is the value of the global ``m`` at definition time.
        C: score to shrink towards. The default is the value of the global
            ``C`` at definition time.

    Returns:
        The weighted score.
    """
    votes = x['count']
    own_score = x['score']
    total = votes + m

    own_part = votes / total * own_score
    prior_part = m / total * C
    return own_part + prior_part

In [31]:
# weighted_rating only uses x['count'] and x['score'] with arithmetic
# operators, so it can be called once on the whole frame (column-wise) instead
# of once per row through apply(axis=1); the resulting values are the same.
df_movies['weighted_score'] = weighted_rating(df_movies)

In [32]:
df_movies.head(4)

Unnamed: 0,movieId,title,genres,score,count,weighted_score
0,1,Toy Story (1995),Adventure Animation Children Comedy Fantasy,3.921,215,3.854869
1,2,Jumanji (1995),Adventure Children Fantasy,3.432,110,3.401632
2,3,Grumpier Old Men (1995),Comedy Romance,3.26,52,3.260772
3,4,Waiting to Exhale (1995),Comedy Drama Romance,2.357,7,3.05799


In [33]:
# Bag-of-n-grams vectorizer over the genre strings: unigrams, bigrams and
# trigrams of consecutive genre tokens.
# NOTE(review): token_pattern is left at its default, which presumably splits
# hyphenated genres such as "Sci-Fi" into separate tokens -- confirm that this
# is intended.
count_vector = CountVectorizer(ngram_range=(1, 3))
count_vector

In [34]:
# Fit the vectorizer on the genre strings and encode every movie as a sparse
# n-gram count vector; one matrix row is produced per df_movies row.
c_vector_genres = count_vector.fit_transform(df_movies['genres'])
c_vector_genres

<9724x589 sparse matrix of type '<class 'numpy.int64'>'
	with 43136 stored elements in Compressed Sparse Row format>

In [35]:
c_vector_genres.shape

(9724, 589)

In [36]:
# Precompute, for every movie, the row positions of all movies ordered from
# most to least genre-similar (cosine similarity, descending). Note that what
# is stored is the ordering of positions, not the similarity values.
# The (misspelled) name is kept because get_recommend_movie_list reads it.
gerne_c_sim = np.argsort(cosine_similarity(c_vector_genres), axis=1)[:, ::-1]

In [37]:
gerne_c_sim.shape

(9724, 9724)

In [38]:
def get_recommend_movie_list(df, movie_title, top=30, n_results=20):
    """Recommend movies whose genres are similar to those of ``movie_title``.

    The ``top`` most genre-similar rows are taken from the precomputed global
    ``gerne_c_sim`` ordering, the target movie is removed if it is among them,
    and the remaining candidates are ranked by ``weighted_score``.

    Args:
        df: DataFrame with 'title' and 'weighted_score' columns. Its rows must
            be in the same order as the rows used to build ``gerne_c_sim``.
        movie_title: exact title to look up. If several rows share the title,
            the first one is used as the target.
        top: number of most-similar candidates to consider.
        n_results: maximum number of rows to return (default 20).

    Returns:
        DataFrame of at most ``n_results`` rows sorted by 'weighted_score',
        highest first. Empty if ``movie_title`` is not present in ``df``.
    """
    # Work with row positions, not index labels: both gerne_c_sim and df.iloc
    # are positional, so labels are only correct for a default 0..n-1 index.
    matches = np.flatnonzero((df['title'] == movie_title).to_numpy())
    if matches.size == 0:
        return df.iloc[0:0]
    # A single scalar target keeps the self-exclusion mask below well defined
    # even when the title occurs more than once.
    target_pos = matches[0]

    # Positions of the `top` most similar movies, most similar first.
    candidates = gerne_c_sim[target_pos, :top]
    # Exclude the target movie itself.
    candidates = candidates[candidates != target_pos]

    # Rank the candidates by weighted_score (not by vote count) and truncate.
    ranked = df.iloc[candidates].sort_values('weighted_score', ascending=False)
    return ranked.head(n_results)

In [41]:
get_recommend_movie_list(df_movies, movie_title='Deadpool 2 (2018)')

Unnamed: 0,movieId,title,genres,score,count,weighted_score
2037,2716,Ghostbusters (a.k.a. Ghost Busters) (1984),Action Comedy Sci-Fi,3.775,120,3.689574
816,1077,Sleeper (1973),Comedy Sci-Fi,3.75,28,3.524975
1182,1580,Men in Black (a.k.a. MIB) (1997),Action Comedy Sci-Fi,3.488,165,3.459358
2284,3033,Spaceballs (1987),Comedy Sci-Fi,3.483,59,3.419225
558,671,Mystery Science Theater 3000: The Movie (1996),Comedy Sci-Fi,3.486,36,3.396578
8184,103341,"World's End, The (2013)",Action Comedy Sci-Fi,3.417,18,3.328683
1441,1965,Repo Man (1984),Comedy Sci-Fi,3.381,21,3.317771
7598,87192,Attack the Block (2011),Action Comedy Sci-Fi,3.5,6,3.309957
7837,93805,Iron Sky (2012),Action Comedy Sci-Fi,3.75,2,3.29995
3270,4434,"10th Victim, The (La decima vittima) (1965)",Action Comedy Sci-Fi Thriller,4.0,1,3.291948
