In [1]:
import numpy as np 
import pandas as pd
import re

In [2]:
movies  = pd.read_csv('ml-25m/movies.csv')
tags = pd.read_csv('ml-25m/tags.csv')
ratings = pd.read_csv('ml-25m/ratings.csv')
genome_score = pd.read_csv('ml-25m/genome-scores.csv')
genome_tags = pd.read_csv('ml-25m/genome-tags.csv')

In [3]:
movies.head(5)

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


In [4]:
movies.shape

(62423, 3)

In [5]:
tags.head(5)

Unnamed: 0,userId,movieId,tag,timestamp
0,3,260,classic,1439472355
1,3,260,sci-fi,1439472256
2,4,1732,dark comedy,1573943598
3,4,1732,great dialogue,1573943604
4,4,7569,so bad it's good,1573943455


In [6]:
tags.shape

(1093360, 4)

In [31]:
ratings.tail(5)

Unnamed: 0,userId,movieId,rating,timestamp
25000090,162541,50872,4.5,1240953372
25000091,162541,55768,2.5,1240951998
25000092,162541,56176,2.0,1240950697
25000093,162541,58559,4.0,1240953434
25000094,162541,63876,5.0,1240952515


In [8]:
ratings.shape

(25000095, 4)

In [9]:
genome_tags.head(5)

Unnamed: 0,tagId,tag
0,1,007
1,2,007 (series)
2,3,18th century
3,4,1920s
4,5,1930s


In [10]:
genome_tags.shape

(1128, 2)

In [11]:
# 불필요한 row 제거 (후속편 정보)
genome_tags = genome_tags[~genome_tags['tag'].isin(['original', 'sequel', 'good sequel','sequels'])]
genome_tags

Unnamed: 0,tagId,tag
0,1,007
1,2,007 (series)
2,3,18th century
3,4,1920s
4,5,1930s
...,...,...
1123,1124,writing
1124,1125,wuxia
1125,1126,wwii
1126,1127,zombie


In [12]:
genome_score.head(5)

Unnamed: 0,movieId,tagId,relevance
0,1,1,0.02875
1,1,2,0.02375
2,1,3,0.0625
3,1,4,0.07575
4,1,5,0.14075


In [13]:
# 'tagId'를 기준으로 genome_score와 genome_tags 데이터프레임을 병합
merged = pd.merge(genome_score, genome_tags, on='tagId')

# 'movieId' 별로 그룹화한 후, 각 영화에 대해 관련성이 높은 상위 5개 태그를 선택
top_tags = merged.groupby('movieId').apply(lambda x: x.nlargest(5, 'relevance')['tag'].tolist())

# 결과를 데이터프레임으로 변환
top_tags_df = top_tags.reset_index(name='top_relevance')

top_tags_df

Unnamed: 0,movieId,top_relevance
0,1,"[toys, computer animation, pixar animation, ki..."
1,2,"[adventure, children, fantasy, kids, special e..."
2,3,"[comedy, gunfight, romance, destiny, great]"
3,4,"[women, chick flick, divorce, girlie movie, ro..."
4,5,"[father daughter relationship, pregnancy, midl..."
...,...,...
13811,205072,"[dumb but funny, friendship, runaway, great mo..."
13812,205076,"[girlie movie, light, feel-good, oscar (best w..."
13813,205383,"[chase, suspense, clever, drama, great ending]"
13814,205425,"[stand-up comedy, comedy, highly quotable, ver..."


In [14]:
# top_tags_df와 영화의 기본 정보를 포함하는 movies 데이터프레임을 'movieId'를 기준으로 병합
movies = pd.merge(top_tags_df, movies[['movieId', 'title', 'genres']], on='movieId')

# 결측치를 공백으로 대체
movies = movies.fillna(' ')

movies

Unnamed: 0,movieId,top_relevance,title,genres
0,1,"[toys, computer animation, pixar animation, ki...",Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,"[adventure, children, fantasy, kids, special e...",Jumanji (1995),Adventure|Children|Fantasy
2,3,"[comedy, gunfight, romance, destiny, great]",Grumpier Old Men (1995),Comedy|Romance
3,4,"[women, chick flick, divorce, girlie movie, ro...",Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,"[father daughter relationship, pregnancy, midl...",Father of the Bride Part II (1995),Comedy
...,...,...,...,...
13811,205072,"[dumb but funny, friendship, runaway, great mo...",Zombieland: Double Tap (2019),Action|Comedy|Horror
13812,205076,"[girlie movie, light, feel-good, oscar (best w...",Downton Abbey (2019),Drama
13813,205383,"[chase, suspense, clever, drama, great ending]",El Camino: A Breaking Bad Movie (2019),Crime|Drama|Thriller
13814,205425,"[stand-up comedy, comedy, highly quotable, ver...",Dave Chappelle: Sticks & Stones (2019),Comedy


In [15]:
# 장르(genres) 컬럼에서 '|' 문자를 공백으로 대체하고, 모든 문자를 소문자로 변환
movies['genres'] = movies['genres'].apply(lambda x: re.sub(r'\|', ' ', x).lower())

# 각 영화의 상위 관련 태그(top_relevance) 리스트를 공백으로 구분된 문자열로 변환하고, '-' 문자는 공백으로 대체
movies['top_relevance'] = movies['top_relevance'].apply(lambda x: ' '.join(x).replace('-', ' '))

movies

Unnamed: 0,movieId,top_relevance,title,genres
0,1,toys computer animation pixar animation kids a...,Toy Story (1995),adventure animation children comedy fantasy
1,2,adventure children fantasy kids special effects,Jumanji (1995),adventure children fantasy
2,3,comedy gunfight romance destiny great,Grumpier Old Men (1995),comedy romance
3,4,women chick flick divorce girlie movie romantic,Waiting to Exhale (1995),comedy drama romance
4,5,father daughter relationship pregnancy midlife...,Father of the Bride Part II (1995),comedy
...,...,...,...,...
13811,205072,dumb but funny friendship runaway great movie ...,Zombieland: Double Tap (2019),action comedy horror
13812,205076,girlie movie light feel good oscar (best writi...,Downton Abbey (2019),drama
13813,205383,chase suspense clever drama great ending,El Camino: A Breaking Bad Movie (2019),crime drama thriller
13814,205425,stand up comedy comedy highly quotable very fu...,Dave Chappelle: Sticks & Stones (2019),comedy


In [25]:
# 영화 조회
movies[movies['title'] == 'Zombieland: Double Tap (2019)']
movies[movies['title'] == 'Jumanji (1995)']

Unnamed: 0,movieId,top_relevance,title,genres,combine_relevant
1,2,adventure children fantasy kids special effects,Jumanji (1995),adventure children fantasy,adventure children fantasy kids special effect...


In [17]:
# 관련 태그, 장르 정보 결합
movies['combine_relevant'] = movies['top_relevance']+" "+movies['genres']

In [18]:
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import CountVectorizer

# CountVectorizer 초기화 (영어 불용어 제거, 최대 특성 수 1500으로 설정)
counter_vec = CountVectorizer(stop_words='english', max_features=1500)
# 'combine_relevant' 컬럼으로 CountVectorizer 학습
counter_vec.fit(movies['combine_relevant'])

# 영화 태그에 대한 벡터 생성 및 중요도 조정 (0.3 배)
gerne_vec_tags = counter_vec.transform(movies['top_relevance'])*0.3
# 영화 장르에 대한 벡터 생성
gerne_vec_geners = counter_vec.transform(movies['genres'])*1

# 태그와 장르 벡터를 합하여 최종 벡터 생성
gerne_vec = gerne_vec_tags + gerne_vec_geners

# 생성된 벡터 간의 코사인 유사도 계산
cos_similar = cosine_similarity(gerne_vec, gerne_vec)

In [19]:
cos_similar.shape

(13816, 13816)

In [20]:
# 영화 제목을 인덱스로, 영화 데이터프레임의 인덱스를 값으로 하는 시리즈 생성
movie_title_series = pd.Series(movies.index, movies['title'])

# 주어진 영화 제목에 대해 유사한 영화를 추천하는 함수 정의
def get_recommend(title, cosine_sim=cos_similar):
    # 주어진 영화 제목에 해당하는 영화의 인덱스 찾기
    movie_name = movie_title_series[title]
    
    # 선택된 영화와 모든 영화 간의 코사인 유사도 계산
    sim_scores = list(enumerate(cosine_sim[movie_name]))

    # 유사도 점수에 따라 내림차순으로 정렬
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)

    # 가장 유사한 영화 (자기 자신 제외)부터 상위 10개 영화 선택
    sim_scores = sim_scores[1:11]

    # 추천 영화의 인덱스 추출
    movie_indices = [i[0] for i in sim_scores]

    # 추천 영화의 제목 반환
    return movies['title'].iloc[movie_indices]

In [21]:
get_recommend('Zombieland: Double Tap (2019)')

9966                                Zombieland (2009)
11768                               Zombeavers (2014)
12548    Scouts Guide to the Zombie Apocalypse (2015)
2934                  Buffy the Vampire Slayer (1992)
11957               Dead Snow 2: Red vs. Dead (2014) 
13476                               Game Night (2018)
6779       Citizen Toxie: The Toxic Avenger IV (2000)
6543                                    Versus (2000)
8682                                     Feast (2005)
1923                       Surf Nazis Must Die (1987)
Name: title, dtype: object

In [26]:
get_recommend('Jumanji (1995)')

912                        Escape to Witch Mountain (1975)
2138                         Santa Claus: The Movie (1985)
1921     NeverEnding Story II: The Next Chapter, The (1...
12926                                 Pete's Dragon (2016)
12877               Alice Through the Looking Glass (2016)
8309     Chronicles of Narnia: The Lion, the Witch and ...
8788                           Bridge to Terabithia (2007)
10467    Chronicles of Narnia: The Voyage of the Dawn T...
13676            The Nutcracker and the Four Realms (2018)
13757                                         Dumbo (2019)
Name: title, dtype: object

In [27]:
movies[movies['title']=='Father of the Bride Part II (1995)']

Unnamed: 0,movieId,top_relevance,title,genres,combine_relevant
4,5,father daughter relationship pregnancy midlife...,Father of the Bride Part II (1995),comedy,father daughter relationship pregnancy midlife...


In [28]:
user_list_movie=['Zombieland: Double Tap (2019)','Zombieland (2009)','Shaun of the Dead (2004)','Cockneys vs Zombies (2012)']

In [29]:
def get_recommend_by_user_list(user_list):
    feature_names = list(counter_vec.vocabulary_.keys())
    user_vec = np.zeros((1, len(feature_names)))
    for movie_name in user_list:
        vector_tmp = counter_vec.transform( movies[movies['title'] == movie_name]['combine_relevant'])
        user_vec+=vector_tmp.toarray()

    

    cosine = cosine_similarity(gerne_vec,user_vec)

    sim_scores = list(enumerate(cosine))
    

    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)

    sim_scores = sim_scores[0:len(user_list)+11]

    movie_indices = [i[0] for i in sim_scores]

    return movies['title'].iloc[movie_indices]

In [30]:
get_recommend_by_user_list(user_list_movie)

11258    Juan of the Dead (Juan de los Muertos) (2011)
11210                       Cockneys vs Zombies (2012)
9966                                 Zombieland (2009)
12548     Scouts Guide to the Zombie Apocalypse (2015)
7098                          Shaun of the Dead (2004)
5249               Chopper Chicks in Zombietown (1989)
4075          Return of the Living Dead Part II (1988)
10053                                  Doghouse (2009)
12563                                   Cooties (2015)
10499                     Tucker & Dale vs Evil (2010)
11957                Dead Snow 2: Red vs. Dead (2014) 
10898                             Revenant, The (2009)
2327                                 Idle Hands (1999)
13811                    Zombieland: Double Tap (2019)
13364                               Little Evil (2017)
Name: title, dtype: object