In [39]:
from numpy import dot
from numpy.linalg import norm
import numpy as np
import pandas as pd
import json
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import linear_kernel
from sklearn.metrics.pairwise import cosine_similarity

In [2]:
movies = pd.read_csv('data\movies_metadata.csv', low_memory=False) # movies_metadata경로
movies.head(10) # 데이터 구성 미리보기

Unnamed: 0,adult,belongs_to_collection,budget,genres,homepage,id,imdb_id,original_language,original_title,overview,...,release_date,revenue,runtime,spoken_languages,status,tagline,title,video,vote_average,vote_count
0,False,"{'id': 10194, 'name': 'Toy Story Collection', ...",30000000,"[{'id': 16, 'name': 'Animation'}, {'id': 35, '...",http://toystory.disney.com/toy-story,862,tt0114709,en,Toy Story,"Led by Woody, Andy's toys live happily in his ...",...,1995-10-30,373554033.0,81.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,,Toy Story,False,7.7,5415.0
1,False,,65000000,"[{'id': 12, 'name': 'Adventure'}, {'id': 14, '...",,8844,tt0113497,en,Jumanji,When siblings Judy and Peter discover an encha...,...,1995-12-15,262797249.0,104.0,"[{'iso_639_1': 'en', 'name': 'English'}, {'iso...",Released,Roll the dice and unleash the excitement!,Jumanji,False,6.9,2413.0
2,False,"{'id': 119050, 'name': 'Grumpy Old Men Collect...",0,"[{'id': 10749, 'name': 'Romance'}, {'id': 35, ...",,15602,tt0113228,en,Grumpier Old Men,A family wedding reignites the ancient feud be...,...,1995-12-22,0.0,101.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Still Yelling. Still Fighting. Still Ready for...,Grumpier Old Men,False,6.5,92.0
3,False,,16000000,"[{'id': 35, 'name': 'Comedy'}, {'id': 18, 'nam...",,31357,tt0114885,en,Waiting to Exhale,"Cheated on, mistreated and stepped on, the wom...",...,1995-12-22,81452156.0,127.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Friends are the people who let you be yourself...,Waiting to Exhale,False,6.1,34.0
4,False,"{'id': 96871, 'name': 'Father of the Bride Col...",0,"[{'id': 35, 'name': 'Comedy'}]",,11862,tt0113041,en,Father of the Bride Part II,Just when George Banks has recovered from his ...,...,1995-02-10,76578911.0,106.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Just When His World Is Back To Normal... He's ...,Father of the Bride Part II,False,5.7,173.0
5,False,,60000000,"[{'id': 28, 'name': 'Action'}, {'id': 80, 'nam...",,949,tt0113277,en,Heat,"Obsessive master thief, Neil McCauley leads a ...",...,1995-12-15,187436818.0,170.0,"[{'iso_639_1': 'en', 'name': 'English'}, {'iso...",Released,A Los Angeles Crime Saga,Heat,False,7.7,1886.0
6,False,,58000000,"[{'id': 35, 'name': 'Comedy'}, {'id': 10749, '...",,11860,tt0114319,en,Sabrina,An ugly duckling having undergone a remarkable...,...,1995-12-15,0.0,127.0,"[{'iso_639_1': 'fr', 'name': 'Français'}, {'is...",Released,You are cordially invited to the most surprisi...,Sabrina,False,6.2,141.0
7,False,,0,"[{'id': 28, 'name': 'Action'}, {'id': 12, 'nam...",,45325,tt0112302,en,Tom and Huck,"A mischievous young boy, Tom Sawyer, witnesses...",...,1995-12-22,0.0,97.0,"[{'iso_639_1': 'en', 'name': 'English'}, {'iso...",Released,The Original Bad Boys.,Tom and Huck,False,5.4,45.0
8,False,,35000000,"[{'id': 28, 'name': 'Action'}, {'id': 12, 'nam...",,9091,tt0114576,en,Sudden Death,International action superstar Jean Claude Van...,...,1995-12-22,64350171.0,106.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Terror goes into overtime.,Sudden Death,False,5.5,174.0
9,False,"{'id': 645, 'name': 'James Bond Collection', '...",58000000,"[{'id': 12, 'name': 'Adventure'}, {'id': 28, '...",http://www.mgm.com/view/movie/757/Goldeneye/,710,tt0113189,en,GoldenEye,James Bond must unmask the mysterious head of ...,...,1995-11-16,352194034.0,130.0,"[{'iso_639_1': 'en', 'name': 'English'}, {'iso...",Released,No limits. No fears. No substitutes.,GoldenEye,False,6.6,1194.0


In [3]:
# movie 목록을 25000개로 줄여서 재정의
# test는 작게 -> 완성후 고치기
moviemeta = movies.head(25000)

In [4]:
# movie 데이터 컬럼 확인
moviemeta.columns

Index(['adult', 'belongs_to_collection', 'budget', 'genres', 'homepage', 'id',
       'imdb_id', 'original_language', 'original_title', 'overview',
       'popularity', 'poster_path', 'production_companies',
       'production_countries', 'release_date', 'revenue', 'runtime',
       'spoken_languages', 'status', 'tagline', 'title', 'video',
       'vote_average', 'vote_count'],
      dtype='object')

In [5]:
# 전처리 
# overview의 결측치가 있는 항목은 모두 제거 
moviemeta = moviemeta[moviemeta['overview'].notnull()].reset_index(drop=True)

moviemeta.shape

(24791, 24)

In [6]:
# 사용자에서 입력을 받을 때 영어가 아닌 영화를 제외시켜 오류를 방지하기 위해
# 원어가 영어인 영화만 골라내기

moviemeta = moviemeta[moviemeta['original_language'] == 'en']

In [7]:
# 영어 유의미하지 않은 단어 토큰을 제거 (불용어 제거)
tfidf = TfidfVectorizer(stop_words='english', max_features=50000) #메모리 과용 방지

# overview(줄거리)에 대해서 tf-idf 수행
m_tfidf_matrix = tfidf.fit_transform(moviemeta['overview'])
print(m_tfidf_matrix.shape) # 영화 수와 overview 글자 확인용

(19209, 43362)


In [8]:
# ifidf를 통해 추출한 단어 행렬을 가지고 cos유사도를 구함

# 코사인 유사도 함수 계산 방식
#def cos_sim(A, B):
#    return dot(A, B)/(norm(A)*norm(B))

m_cosine_sim = cosine_similarity(m_tfidf_matrix, m_tfidf_matrix)

In [9]:
# 코사인 유사도 계산이 실행됐는지 샘플 확인
np.round(m_cosine_sim, 7)

array([[1.       , 0.0167937, 0.       , ..., 0.03356  , 0.       ,
        0.       ],
       [0.0167937, 1.       , 0.0494804, ..., 0.       , 0.       ,
        0.       ],
       [0.       , 0.0494804, 1.       , ..., 0.       , 0.010437 ,
        0.       ],
       ...,
       [0.03356  , 0.       , 0.       , ..., 1.       , 0.       ,
        0.       ],
       [0.       , 0.       , 0.010437 , ..., 0.       , 1.       ,
        0.       ],
       [0.       , 0.       , 0.       , ..., 0.       , 0.       ,
        1.       ]])

In [10]:
# movie title와 id를 매핑할 딕셔너리를 생성 
movie_id_dic = {}
for i, c in enumerate(movies['title']): movie_id_dic[i] = c

# id와 movie title를 매핑할 딕셔너리를 생성 
id_movie_dic = {}
for i, c in movie_id_dic.items(): id_movie_dic[c] = i

In [11]:
# movie title과 id가 매핑이 됐는지 확인
movie_id_dic

{0: 'Toy Story',
 1: 'Jumanji',
 2: 'Grumpier Old Men',
 3: 'Waiting to Exhale',
 4: 'Father of the Bride Part II',
 5: 'Heat',
 6: 'Sabrina',
 7: 'Tom and Huck',
 8: 'Sudden Death',
 9: 'GoldenEye',
 10: 'The American President',
 11: 'Dracula: Dead and Loving It',
 12: 'Balto',
 13: 'Nixon',
 14: 'Cutthroat Island',
 15: 'Casino',
 16: 'Sense and Sensibility',
 17: 'Four Rooms',
 18: 'Ace Ventura: When Nature Calls',
 19: 'Money Train',
 20: 'Get Shorty',
 21: 'Copycat',
 22: 'Assassins',
 23: 'Powder',
 24: 'Leaving Las Vegas',
 25: 'Othello',
 26: 'Now and Then',
 27: 'Persuasion',
 28: 'The City of Lost Children',
 29: 'Shanghai Triad',
 30: 'Dangerous Minds',
 31: 'Twelve Monkeys',
 32: 'Wings of Courage',
 33: 'Babe',
 34: 'Carrington',
 35: 'Dead Man Walking',
 36: 'Across the Sea of Time',
 37: 'It Takes Two',
 38: 'Clueless',
 39: 'Cry, the Beloved Country',
 40: 'Richard III',
 41: 'Dead Presidents',
 42: 'Restoration',
 43: 'Mortal Kombat',
 44: 'To Die For',
 45: 'How To M

In [12]:
# movie title과 id가 매핑이 됐는지 확인
id_movie_dic

{'Toy Story': 0,
 'Jumanji': 1,
 'Grumpier Old Men': 2,
 'Waiting to Exhale': 3,
 'Father of the Bride Part II': 4,
 'Heat': 29042,
 'Sabrina': 888,
 'Tom and Huck': 7,
 'Sudden Death': 8,
 'GoldenEye': 9,
 'The American President': 10,
 'Dracula: Dead and Loving It': 11,
 'Balto': 12,
 'Nixon': 13,
 'Cutthroat Island': 14,
 'Casino': 15,
 'Sense and Sensibility': 41042,
 'Four Rooms': 17,
 'Ace Ventura: When Nature Calls': 18,
 'Money Train': 19,
 'Get Shorty': 20,
 'Copycat': 21,
 'Assassins': 22,
 'Powder': 23,
 'Leaving Las Vegas': 24,
 'Othello': 21274,
 'Now and Then': 26,
 'Persuasion': 40837,
 'The City of Lost Children': 28,
 'Shanghai Triad': 29,
 'Dangerous Minds': 30,
 'Twelve Monkeys': 31,
 'Wings of Courage': 32,
 'Babe': 33,
 'Carrington': 34,
 'Dead Man Walking': 35,
 'Across the Sea of Time': 36,
 'It Takes Two': 29129,
 'Clueless': 38,
 'Cry, the Beloved Country': 26667,
 'Richard III': 17719,
 'Dead Presidents': 41,
 'Restoration': 38571,
 'Mortal Kombat': 43,
 'To D

In [13]:
# tfidf와 cos유사도를 이용한 컨텐츠 기반 필터링 함수 

def get_recommend(title, m_cosine_sim = m_cosine_sim):
    idx = id_movie_dic[title]

    # 입력 받은 영화, 자신을 제외한 영화들 간의 코사인 유사도와 인덱스 추출
    sim_scores = [(i, c) for i, c in enumerate(m_cosine_sim[idx]) if i != idx]

    # 유사도가 높게 나온 순서대로 정렬
    sim_scores = sorted(sim_scores, key = lambda x: x[1], reverse=True)

    sim_scores[0:10] # 정렬한 목록에서 상위 10개의 인덱스와 유사도를 추출 

    sim_scores = [(movie_id_dic[i], score) for i, score in sim_scores[0:10]]
    # 인덱스를 title로 바꿔주고 유사도와 함께 리스트로 묶어줌

    return sim_scores #리스트 출력

In [14]:
# 영화를 입력받아 추천 함수를 거쳐 추천 리스트를 출력
recom = get_recommend(input('입력한 영화와 비슷한 영화를 10개 추천'))
recom

입력한 영화와 비슷한 영화를 10개 추천 Toy Story


[('P.S. I Love You', 0.5173376977761817),
 ("National Lampoon's Vacation", 0.4568016987075376),
 ('The Hill', 0.27213307411412035),
 ('Bullhead', 0.2685972116672364),
 ('A Muppet Family Christmas', 0.23196707934021946),
 ('A Foreign Affair', 0.19535908522619994),
 ('Dangerous Ground', 0.18170037646867362),
 ('Foolproof', 0.15393963160153162),
 ('Homegrown', 0.1504554719490517),
 ('Gung Ho!', 0.1410709552490119)]

In [15]:
# 보기 쉽게 데이터프레임으로 만들기
recom_df = pd.DataFrame(recom, columns = ['title','sim_score'])
recom_df

Unnamed: 0,title,sim_score
0,P.S. I Love You,0.517338
1,National Lampoon's Vacation,0.456802
2,The Hill,0.272133
3,Bullhead,0.268597
4,A Muppet Family Christmas,0.231967
5,A Foreign Affair,0.195359
6,Dangerous Ground,0.1817
7,Foolproof,0.15394
8,Homegrown,0.150455
9,Gung Ho!,0.141071


In [16]:
# 피어슨 유사도를 이용한 사용자 필터링 데이터 정의

movies.head() # 데이터 구성 미리보기

Unnamed: 0,adult,belongs_to_collection,budget,genres,homepage,id,imdb_id,original_language,original_title,overview,...,release_date,revenue,runtime,spoken_languages,status,tagline,title,video,vote_average,vote_count
0,False,"{'id': 10194, 'name': 'Toy Story Collection', ...",30000000,"[{'id': 16, 'name': 'Animation'}, {'id': 35, '...",http://toystory.disney.com/toy-story,862,tt0114709,en,Toy Story,"Led by Woody, Andy's toys live happily in his ...",...,1995-10-30,373554033.0,81.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,,Toy Story,False,7.7,5415.0
1,False,,65000000,"[{'id': 12, 'name': 'Adventure'}, {'id': 14, '...",,8844,tt0113497,en,Jumanji,When siblings Judy and Peter discover an encha...,...,1995-12-15,262797249.0,104.0,"[{'iso_639_1': 'en', 'name': 'English'}, {'iso...",Released,Roll the dice and unleash the excitement!,Jumanji,False,6.9,2413.0
2,False,"{'id': 119050, 'name': 'Grumpy Old Men Collect...",0,"[{'id': 10749, 'name': 'Romance'}, {'id': 35, ...",,15602,tt0113228,en,Grumpier Old Men,A family wedding reignites the ancient feud be...,...,1995-12-22,0.0,101.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Still Yelling. Still Fighting. Still Ready for...,Grumpier Old Men,False,6.5,92.0
3,False,,16000000,"[{'id': 35, 'name': 'Comedy'}, {'id': 18, 'nam...",,31357,tt0114885,en,Waiting to Exhale,"Cheated on, mistreated and stepped on, the wom...",...,1995-12-22,81452156.0,127.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Friends are the people who let you be yourself...,Waiting to Exhale,False,6.1,34.0
4,False,"{'id': 96871, 'name': 'Father of the Bride Col...",0,"[{'id': 35, 'name': 'Comedy'}]",,11862,tt0113041,en,Father of the Bride Part II,Just when George Banks has recovered from his ...,...,1995-02-10,76578911.0,106.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Just When His World Is Back To Normal... He's ...,Father of the Bride Part II,False,5.7,173.0


In [17]:
# movie_metadata에서 사용할 데이터 컬럼만 가져오기
movies_sub = movies[['id', 'original_title', 'original_language', 'genres']]

# id를 ratings 데이터의 movieId와 합치기 위해 똑같은 이름으로 바꿔줌
movies_sub = movies_sub.rename(columns={'id':'movieId'})

# 원어가 영어인 영화 데이터만 가져오기 > input을
movies_sub = movies_sub[movies_sub['original_language'] == 'en']

movies_sub.head()

Unnamed: 0,movieId,original_title,original_language,genres
0,862,Toy Story,en,"[{'id': 16, 'name': 'Animation'}, {'id': 35, '..."
1,8844,Jumanji,en,"[{'id': 12, 'name': 'Adventure'}, {'id': 14, '..."
2,15602,Grumpier Old Men,en,"[{'id': 10749, 'name': 'Romance'}, {'id': 35, ..."
3,31357,Waiting to Exhale,en,"[{'id': 35, 'name': 'Comedy'}, {'id': 18, 'nam..."
4,11862,Father of the Bride Part II,en,"[{'id': 35, 'name': 'Comedy'}]"


In [19]:
# 평점 데이터 불러오기
ratings = pd.read_csv('data/ratings_small.csv')

ratings.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,31,2.5,1260759144
1,1,1029,3.0,1260759179
2,1,1061,3.0,1260759182
3,1,1129,2.0,1260759185
4,1,1172,4.0,1260759205


In [20]:
# 원하는 데이터 컬럼만 가져와 subtable을 만들기
ratings_sub = ratings[['userId', 'movieId', 'rating']]

ratings_sub.head(10)

Unnamed: 0,userId,movieId,rating
0,1,31,2.5
1,1,1029,3.0
2,1,1061,3.0
3,1,1129,2.0
4,1,1172,4.0
5,1,1263,2.0
6,1,1287,2.0
7,1,1293,2.0
8,1,1339,3.5
9,1,1343,2.0


In [21]:
# ratings 데이터 통계 수치 확인
ratings_sub.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
userId,100004.0,347.01131,195.163838,1.0,182.0,367.0,520.0,671.0
movieId,100004.0,12548.664363,26369.198969,1.0,1028.0,2406.5,5418.0,163949.0
rating,100004.0,3.543608,1.058064,0.5,3.0,4.0,4.0,5.0


In [22]:
# id의 데이터타입 전환을 위해 sub로 만든 데이터프레임을 카피
movies_sub = movies_sub.copy()
ratings_sub = ratings_sub.copy()

In [23]:
# 합칠 id를 numeric으로 재정의
movies_sub.movieId = pd.to_numeric(movies_sub.movieId, errors='coerce')
ratings_sub.movieId = pd.to_numeric(ratings_sub.movieId, errors='coerce')

In [24]:
# movieid를 기준으로 ovie_metadata와 ratings data를 합치기
merge_tbl = pd.merge(ratings_sub, movies_sub, on='movieId', how='inner')
# pd.concat 함수로 위의 두가지 데이터프레임을 결합시키기
# merge_tbl = pd.concat([ratings_sub, movies_sub]) # movieId 공통

# 확인
merge_tbl.head(10)

Unnamed: 0,userId,movieId,rating,original_title,original_language,genres
0,1,1371,2.5,Rocky III,en,"[{'id': 18, 'name': 'Drama'}]"
1,4,1371,4.0,Rocky III,en,"[{'id': 18, 'name': 'Drama'}]"
2,7,1371,3.0,Rocky III,en,"[{'id': 18, 'name': 'Drama'}]"
3,19,1371,4.0,Rocky III,en,"[{'id': 18, 'name': 'Drama'}]"
4,21,1371,3.0,Rocky III,en,"[{'id': 18, 'name': 'Drama'}]"
5,22,1371,2.0,Rocky III,en,"[{'id': 18, 'name': 'Drama'}]"
6,41,1371,3.5,Rocky III,en,"[{'id': 18, 'name': 'Drama'}]"
7,78,1371,4.0,Rocky III,en,"[{'id': 18, 'name': 'Drama'}]"
8,118,1371,3.0,Rocky III,en,"[{'id': 18, 'name': 'Drama'}]"
9,130,1371,3.0,Rocky III,en,"[{'id': 18, 'name': 'Drama'}]"


In [25]:
# 장르가중치를 주기위해 장르 값을 수정, 장르 값의 메타데이터를 날리고 장르 종류만 남김
# json형식(str)을 가져와 장르의 이름만 뽑아냄.
def separ_genres(genres_str):
    genres = json.loads(genres_str.replace('\'', '"'))
    
    genrelist = []
    for c in genres:
        genrelist.append(c['name'])

    return genrelist

# 함수 실행
merge_tbl['genres'] = merge_tbl['genres'].apply(separ_genres)

# 확인
merge_tbl.head(10)

Unnamed: 0,userId,movieId,rating,original_title,original_language,genres
0,1,1371,2.5,Rocky III,en,[Drama]
1,4,1371,4.0,Rocky III,en,[Drama]
2,7,1371,3.0,Rocky III,en,[Drama]
3,19,1371,4.0,Rocky III,en,[Drama]
4,21,1371,3.0,Rocky III,en,[Drama]
5,22,1371,2.0,Rocky III,en,[Drama]
6,41,1371,3.5,Rocky III,en,[Drama]
7,78,1371,4.0,Rocky III,en,[Drama]
8,118,1371,3.0,Rocky III,en,[Drama]
9,130,1371,3.0,Rocky III,en,[Drama]


In [26]:
# 사용자마다 영화의 평가를 보기위해 pivot table을 생성, 기준은 사용자id
# 해당 테이블을 매트릭스처럼 사용

matrix_movie = merge_tbl.pivot_table(index = 'userId',
                                     columns = 'original_title',
                                     values = 'rating')

In [27]:
# 피봇테이블 확인
matrix_movie.head(15)

original_title,!Women Art Revolution,'Gator Bait,'Twas the Night Before Christmas,10 Items or Less,10 Things I Hate About You,"10,000 BC",11'09''01 - September 11,12 + 1,12 Angry Men,1408,...,Young and Innocent,Zaat,Zabriskie Point,Zapped Again!,Zardoz,Zodiac,eXistenZ,xXx,¡Three Amigos!,Мой сводный брат Франкенштейн
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,,,,,,,,,,,...,,,,,,,,,,
2,,,,,,,,,,,...,,,,,,,,,,
3,,,,,,,,,,,...,3.5,,,,,,,,,
4,,,,,,,,,,,...,,,,,,,,,,
5,,,,,,,,,,,...,3.5,,,,,,,,,
6,,,,,,,,,,,...,,,,,,,,,,
7,,,,,,,,,,,...,,,,,,,,,,
8,,,,,,,,,,,...,4.5,,,,,,,,,
9,,,,,,,,,,,...,4.0,,,,,,,,,
10,,,,,,,,,,,...,,,,,,,,,,


In [57]:
# 장르 가중치 값 설정
genre_weight = 0.05

In [58]:
# 피어슨 상관계수 함수

def pearsonR(e1, e2):
    e1_c = e1 - e1.mean()
    e2_c = e2 - e2.mean()
    
    return np.sum(e1_c * e2_c) / np.sqrt(np.sum(e1_c ** 2) * np.sum(e2_c ** 2))

In [59]:
# 피어슨 상관계수와 장르 가중치를 더한 사용자기반 필터링 함수

def recommend(input_m, matrix_movie, n, sim_genre=True):
    add_genre = merge_tbl[merge_tbl['original_title'] == input_m]['genres'].iloc[0]

    result_list = []
    
    
    for title in matrix_movie.columns:
        if title == input_m: continue

        # 피어슨 상관계수 적용
        corr = pearsonR(matrix_movie[input_m], matrix_movie[title])
        
        # 장르 가중치 적용
        if sim_genre and len(add_genre) > 0:
            temp_genres = merge_tbl[merge_tbl['original_title'] == title]['genres'].iloc[0]

            same_count = np.sum(np.isin(add_genre, temp_genres))
            corr += (genre_weight * same_count)
        
        if np.isnan(corr): continue
        
        else:
            result_list.append((title, '{:.2f}'.format(corr), temp_genres))
            
            
    result_list.sort(key = lambda r: r[1], reverse = True)

    
    return result_list[:n]

In [60]:
# input(영화) 값 받기
recommend_result = recommend(input('비슷한 취향의 영화를 추천'), matrix_movie, 10, sim_genre=True)

pd.DataFrame(recommend_result, columns = ['title', 'score', 'genres'])

비슷한 취향의 영화를 추천 Zaat


  import sys


Unnamed: 0,title,score,genres
0,Boogeyman,0.7,"[Thriller, Horror, Drama, Mystery]"
1,Merlin,0.6,"[Adventure, Drama, Fantasy]"
2,The Pledge,0.6,"[Crime, Drama, Mystery, Thriller]"
3,Bunny Lake Is Missing,0.57,"[Thriller, Mystery]"
4,Ghostbusters,0.55,"[Comedy, Fantasy]"
5,Warlords of the 21st Century,0.55,[Science Fiction]
6,Enigma,0.51,"[Adventure, Drama, Action, Thriller, Foreign]"
7,Just the Ticket,0.5,"[Comedy, Romance]"
8,Laura,0.5,"[Drama, Mystery]"
9,Le Professionnel,0.5,"[Action, Adventure, Thriller]"


In [77]:
# 두개의 필터링을 합쳐서 계산하기
def recommend(input_m, matrix_movie, n, sim_genre=True):
    add_genre = merge_tbl[merge_tbl['original_title'] == input_m]['genres'].iloc[0]
    result_list = []
    
    for title in matrix_movie.columns:
        if title == input_m: continue

        # 피어슨 상관계수 적용
        corr = pearsonR(matrix_movie[input_m], matrix_movie[title])
        
        # 장르 가중치 적용
        if sim_genre and len(add_genre) > 0:
            temp_genres = merge_tbl[merge_tbl['original_title'] == title]['genres'].iloc[0]

            same_count = np.sum(np.isin(add_genre, temp_genres))
            corr += (genre_weight * same_count)
        
        if np.isnan(corr): continue
        else:
            result_list.append((title, '{:.2f}'.format(corr)))
            
            
    result_list.sort(key = lambda r: r[1], reverse = True)
    result2 = result_list[:n]
    
    def get_recommend(title, m_cosine_sim = m_cosine_sim):
        idx = id_movie_dic[title]

        # 입력 받은 영화, 자신을 제외한 영화들 간의 코사인 유사도와 인덱스 추출
        sim_scores = [(i, c) for i, c in enumerate(m_cosine_sim[idx]) if i != idx]

        # 유사도가 높게 나온 순서대로 정렬
        sim_scores = sorted(sim_scores, key = lambda x: x[1], reverse=True)

        sim_scores = [(movie_id_dic[i], score) for i, score in sim_scores[0:10]]
        # 인덱스를 title로 바꿔주고 유사도와 함께 리스트로 묶어줌
        
        result1 = sim_scores[:n]
        return result1
    
    result3 = get_recommend(input_m) + result2
    return result3

In [78]:
recommend_result = recommend(input('비슷한 취향의 영화를 추천'), matrix_movie, 10, sim_genre=True)

pd.DataFrame(recommend_result, columns = ['title', 'scores'])

비슷한 취향의 영화를 추천 Zaat


  import sys


Unnamed: 0,title,scores
0,Return of Django,0.305407
1,"Race for Your Life, Charlie Brown",0.244061
2,Normal,0.18479
3,Port of Call,0.170582
4,Hell,0.163912
5,Return to Lonesome Dove,0.16168
6,The Razor's Edge,0.160724
7,The Merchant of Four Seasons,0.157787
8,Dead Man on Campus,0.153768
9,The Internecine Project,0.151106
