## import and preprocessing

In [1]:
import pandas as pd

In [2]:
df = pd.read_csv('./archive/tmdb_5000_movies.csv')
using_data = df[['id', 'genres', 'vote_average', 'vote_count', 'popularity', 'title', 'keywords', 'overview']]
using_data.shape

(4803, 8)

In [3]:
m = using_data['vote_count'].quantile(0.9)
using_data = using_data.loc[using_data['vote_count'] >= m]
c = using_data['vote_average'].mean()

In [4]:
def weight_rating(x, m=m, c=c):
    v = x['vote_count']
    r = x['vote_average']
    return (v / (v + m) * r) + (m / (v + m) * c)

In [5]:
using_data['score'] = using_data.apply(weight_rating, axis=1)
using_data.shape

(481, 9)

In [6]:
using_data.head()

Unnamed: 0,id,genres,vote_average,vote_count,popularity,title,keywords,overview,score
0,19995,"[{""id"": 28, ""name"": ""Action""}, {""id"": 12, ""nam...",7.2,11800,150.437577,Avatar,"[{""id"": 1463, ""name"": ""culture clash""}, {""id"":...","In the 22nd century, a paraplegic Marine is di...",7.168053
1,285,"[{""id"": 12, ""name"": ""Adventure""}, {""id"": 14, ""...",6.9,4500,139.082615,Pirates of the Caribbean: At World's End,"[{""id"": 270, ""name"": ""ocean""}, {""id"": 726, ""na...","Captain Barbossa, long believed to be dead, ha...",6.918271
2,206647,"[{""id"": 28, ""name"": ""Action""}, {""id"": 12, ""nam...",6.3,4466,107.376788,Spectre,"[{""id"": 470, ""name"": ""spy""}, {""id"": 818, ""name...",A cryptic message from Bond’s past sends him o...,6.493333
3,49026,"[{""id"": 28, ""name"": ""Action""}, {""id"": 80, ""nam...",7.6,9106,112.31295,The Dark Knight Rises,"[{""id"": 849, ""name"": ""dc comics""}, {""id"": 853,...",Following the death of District Attorney Harve...,7.492998
4,49529,"[{""id"": 28, ""name"": ""Action""}, {""id"": 12, ""nam...",6.1,2124,43.926995,John Carter,"[{""id"": 818, ""name"": ""based on novel""}, {""id"":...","John Carter is a war-weary, former military ca...",6.500396


In [7]:
using_data[['genres', 'keywords']].head()

Unnamed: 0,genres,keywords
0,"[{""id"": 28, ""name"": ""Action""}, {""id"": 12, ""nam...","[{""id"": 1463, ""name"": ""culture clash""}, {""id"":..."
1,"[{""id"": 12, ""name"": ""Adventure""}, {""id"": 14, ""...","[{""id"": 270, ""name"": ""ocean""}, {""id"": 726, ""na..."
2,"[{""id"": 28, ""name"": ""Action""}, {""id"": 12, ""nam...","[{""id"": 470, ""name"": ""spy""}, {""id"": 818, ""name..."
3,"[{""id"": 28, ""name"": ""Action""}, {""id"": 80, ""nam...","[{""id"": 849, ""name"": ""dc comics""}, {""id"": 853,..."
4,"[{""id"": 28, ""name"": ""Action""}, {""id"": 12, ""nam...","[{""id"": 818, ""name"": ""based on novel""}, {""id"":..."


In [8]:
import ast as ast

In [9]:
using_data['genres'] = using_data['genres'].apply(ast.literal_eval)

In [10]:
using_data['keywords'] = using_data['keywords'].apply(ast.literal_eval)
using_data[['genres', 'keywords']].head(2)

Unnamed: 0,genres,keywords
0,"[{'id': 28, 'name': 'Action'}, {'id': 12, 'nam...","[{'id': 1463, 'name': 'culture clash'}, {'id':..."
1,"[{'id': 12, 'name': 'Adventure'}, {'id': 14, '...","[{'id': 270, 'name': 'ocean'}, {'id': 726, 'na..."


In [11]:
using_data['genres'] = using_data['genres'].apply(lambda x: [d['name'] for d in x]).apply(lambda x: " ".join(x))

In [12]:
using_data['keywords'] = using_data['keywords'].apply(lambda x: [d['name'] for d in x]).apply(lambda x: " ".join(x))

In [13]:
using_data

Unnamed: 0,id,genres,vote_average,vote_count,popularity,title,keywords,overview,score
0,19995,Action Adventure Fantasy Science Fiction,7.2,11800,150.437577,Avatar,culture clash future space war space colony so...,"In the 22nd century, a paraplegic Marine is di...",7.168053
1,285,Adventure Fantasy Action,6.9,4500,139.082615,Pirates of the Caribbean: At World's End,ocean drug abuse exotic island east india trad...,"Captain Barbossa, long believed to be dead, ha...",6.918271
2,206647,Action Adventure Crime,6.3,4466,107.376788,Spectre,spy based on novel secret agent sequel mi6 bri...,A cryptic message from Bond’s past sends him o...,6.493333
3,49026,Action Crime Drama Thriller,7.6,9106,112.312950,The Dark Knight Rises,dc comics crime fighter terrorist secret ident...,Following the death of District Attorney Harve...,7.492998
4,49529,Action Adventure Science Fiction,6.1,2124,43.926995,John Carter,based on novel mars medallion space travel pri...,"John Carter is a war-weary, former military ca...",6.500396
...,...,...,...,...,...,...,...,...,...
4291,176,Horror Mystery Crime,7.2,2184,63.655973,Saw,shotgun based on short film sadist pistol chai...,Obsessed with teaching his victims the value o...,7.091679
4300,500,Crime Thriller,8.0,3697,66.925866,Reservoir Dogs,traitor jewelry psychopath thief heist betraya...,A botched robbery indicates a police informant...,7.655593
4302,429,Western,8.1,2311,88.377076,"The Good, the Bad and the Ugly",bounty hunter refugee gold anti hero gallows h...,While the Civil War rages between the Union an...,7.596247
4337,103,Crime Drama,8.0,2535,58.845025,Taxi Driver,vietnam veteran taxi obsession drug dealer nig...,A mentally unstable Vietnam War veteran works ...,7.564085


## content based filtering

공백 문자열로 표현된 genres와 keyword를 벡터화

In [14]:
from sklearn.feature_extraction.text import CountVectorizer

In [16]:
data = using_data
data.head(2)

Unnamed: 0,id,genres,vote_average,vote_count,popularity,title,keywords,overview,score
0,19995,Action Adventure Fantasy Science Fiction,7.2,11800,150.437577,Avatar,culture clash future space war space colony so...,"In the 22nd century, a paraplegic Marine is di...",7.168053
1,285,Adventure Fantasy Action,6.9,4500,139.082615,Pirates of the Caribbean: At World's End,ocean drug abuse exotic island east india trad...,"Captain Barbossa, long believed to be dead, ha...",6.918271


In [None]:
count_vector = CountVectorizer(ngram_range=(1, 3))
c_vector_genres = count_vector.fit_transform(data['genres'])
c_vector_genres.shape

코사인 유사도를 구한 벡터를 미리 저장

In [25]:
from sklearn.metrics.pairwise import cosine_similarity


In [28]:
genres_c_sim = cosine_similarity(c_vector_genres, c_vector_genres).argsort()[:, ::-1]

In [29]:
genres_c_sim.shape

(481, 481)

In [34]:
def get_recommend_movie_list(df, movie_title, top=30):
    # 특정 영화와 비슷한 영화를 추천해야 하기 때문에 '특정 영화' 정보를 뽑아낸다.
    target_movie_index = df[df['title'] == movie_title].index.values

    # 코사인 유사도 중 비슷한 코사인 유사도를 가진 정보를 뽑아낸다.
    sim_index = genres_c_sim[target_movie_index, :top].reshape(-1)

    # 본인을 제외
    sim_index = sim_index[sim_index != target_movie_index]

    # data frame으로 만들고 score로 정렬한 뒤 10개만 return
    result = df.iloc[sim_index].sort_values('score', ascending=False)[:10]
    return result

In [36]:
get_recommend_movie_list(data, movie_title='The Dark Knight')

Unnamed: 0,id,genres,vote_average,vote_count,popularity,title,keywords,overview,score
494,8587,Family Animation Drama,8.0,5376,90.457886,The Lion King,loss of parents wild boar uncle shaman redempt...,A young lion cub named Simba can't wait to be ...,7.735746
1532,120467,Comedy Drama,8.0,4519,74.417456,The Grand Budapest Hotel,hotel painting wartime gunfight theft mentor p...,The Grand Budapest Hotel tells of a legendary ...,7.700124
298,106646,Crime Drama Comedy,7.9,6571,95.007934,The Wolf of Wall Street,corruption sex sexuality bank humor biography ...,A New York stockbroker refuses to cooperate in...,7.695159
57,10681,Animation Family,7.8,6296,66.390712,WALL·E,romantic comedy,WALL·E is the last robot left on an Earth that...,7.610834
697,37165,Comedy Drama,7.8,4537,56.488027,The Truman Show,claustrophobia hidden camera dystopia reality ...,"Truman Burbank is the star of ""The Truman Show...",7.558642
1541,862,Animation Comedy Family,7.7,5269,73.640445,Toy Story,jealousy toy boy friendship friends rivalry bo...,"Led by Woody, Andy's toys live happily in his ...",7.509366
328,12,Animation Family,7.6,6122,85.688789,Finding Nemo,father son relationship harbor underwater fish...,"Nemo, an adventurous young clownfish, is unexp...",7.452888
42,10193,Animation Family Comedy,7.6,4597,59.995418,Toy Story 3,hostage college toy barbie animation escape da...,"Woody, Buzz, and the rest of Andy's toys haven...",7.418026
231,585,Animation Comedy Family,7.5,5996,106.815545,"Monsters, Inc.",monster infant energy supply company rivalry h...,"James Sullivan and Mike Wazowski are monsters,...",7.373988
352,10674,Animation Family Adventure,7.6,2008,67.427755,Mulan,homeland musical training daughter cricket pri...,A tomboyish girl disguises herself as a young ...,7.295541
