# 추천 시스템 - 콘텐츠 기반 필터링

Reference
- https://github.com/lsjsj92/recommender_system_with_Python
- https://www.kaggle.com/ibtesama/getting-started-with-a-movie-recommendation-system

# 1. 데이터 전처리

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from ast import literal_eval
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity

In [2]:
data = pd.read_csv('C:/Users/kjh96/OneDrive/Desktop/data/the-movies-dataset/tmdb_5000_movies.csv')
data.head()

Unnamed: 0,budget,genres,homepage,id,keywords,original_language,original_title,overview,popularity,production_companies,production_countries,release_date,revenue,runtime,spoken_languages,status,tagline,title,vote_average,vote_count
0,237000000,"[{""id"": 28, ""name"": ""Action""}, {""id"": 12, ""nam...",http://www.avatarmovie.com/,19995,"[{""id"": 1463, ""name"": ""culture clash""}, {""id"":...",en,Avatar,"In the 22nd century, a paraplegic Marine is di...",150.437577,"[{""name"": ""Ingenious Film Partners"", ""id"": 289...","[{""iso_3166_1"": ""US"", ""name"": ""United States o...",2009-12-10,2787965087,162.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}, {""iso...",Released,Enter the World of Pandora.,Avatar,7.2,11800
1,300000000,"[{""id"": 12, ""name"": ""Adventure""}, {""id"": 14, ""...",http://disney.go.com/disneypictures/pirates/,285,"[{""id"": 270, ""name"": ""ocean""}, {""id"": 726, ""na...",en,Pirates of the Caribbean: At World's End,"Captain Barbossa, long believed to be dead, ha...",139.082615,"[{""name"": ""Walt Disney Pictures"", ""id"": 2}, {""...","[{""iso_3166_1"": ""US"", ""name"": ""United States o...",2007-05-19,961000000,169.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}]",Released,"At the end of the world, the adventure begins.",Pirates of the Caribbean: At World's End,6.9,4500
2,245000000,"[{""id"": 28, ""name"": ""Action""}, {""id"": 12, ""nam...",http://www.sonypictures.com/movies/spectre/,206647,"[{""id"": 470, ""name"": ""spy""}, {""id"": 818, ""name...",en,Spectre,A cryptic message from Bond’s past sends him o...,107.376788,"[{""name"": ""Columbia Pictures"", ""id"": 5}, {""nam...","[{""iso_3166_1"": ""GB"", ""name"": ""United Kingdom""...",2015-10-26,880674609,148.0,"[{""iso_639_1"": ""fr"", ""name"": ""Fran\u00e7ais""},...",Released,A Plan No One Escapes,Spectre,6.3,4466
3,250000000,"[{""id"": 28, ""name"": ""Action""}, {""id"": 80, ""nam...",http://www.thedarkknightrises.com/,49026,"[{""id"": 849, ""name"": ""dc comics""}, {""id"": 853,...",en,The Dark Knight Rises,Following the death of District Attorney Harve...,112.31295,"[{""name"": ""Legendary Pictures"", ""id"": 923}, {""...","[{""iso_3166_1"": ""US"", ""name"": ""United States o...",2012-07-16,1084939099,165.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}]",Released,The Legend Ends,The Dark Knight Rises,7.6,9106
4,260000000,"[{""id"": 28, ""name"": ""Action""}, {""id"": 12, ""nam...",http://movies.disney.com/john-carter,49529,"[{""id"": 818, ""name"": ""based on novel""}, {""id"":...",en,John Carter,"John Carter is a war-weary, former military ca...",43.926995,"[{""name"": ""Walt Disney Pictures"", ""id"": 2}]","[{""iso_3166_1"": ""US"", ""name"": ""United States o...",2012-03-07,284139100,132.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}]",Released,"Lost in our world, found in another.",John Carter,6.1,2124


In [3]:
data = data[['id','genres', 'vote_average', 'vote_count','popularity','title',  'keywords', 'overview']]

In [4]:
tmp_m = data['vote_count'].quantile(0.89)
tmp_m

1683.8999999999987

In [5]:
tmp_data = data.copy().loc[data['vote_count'] >= tmp_m]
tmp_data.shape

(529, 8)

In [6]:
del tmp_data

m = data['vote_count'].quantile(0.9)
data = data.loc[data['vote_count'] >= m]

In [7]:
C = data['vote_average'].mean()

In [8]:
# 투표를 받은 수가 상위 10% 481개 영화의 평균 평점
print(C)
print(m)

6.9629937629937615
1838.4000000000015


In [9]:
def weighted_rating(x, m=m, C=C):
    v = x['vote_count']
    R = x['vote_average']
    
    return ( v / (v+m) * R ) + (m / (m + v) * C)

In [10]:
data['score'] = data.apply(weighted_rating, axis = 1)

In [11]:
data.head(5)

Unnamed: 0,id,genres,vote_average,vote_count,popularity,title,keywords,overview,score
0,19995,"[{""id"": 28, ""name"": ""Action""}, {""id"": 12, ""nam...",7.2,11800,150.437577,Avatar,"[{""id"": 1463, ""name"": ""culture clash""}, {""id"":...","In the 22nd century, a paraplegic Marine is di...",7.168053
1,285,"[{""id"": 12, ""name"": ""Adventure""}, {""id"": 14, ""...",6.9,4500,139.082615,Pirates of the Caribbean: At World's End,"[{""id"": 270, ""name"": ""ocean""}, {""id"": 726, ""na...","Captain Barbossa, long believed to be dead, ha...",6.918271
2,206647,"[{""id"": 28, ""name"": ""Action""}, {""id"": 12, ""nam...",6.3,4466,107.376788,Spectre,"[{""id"": 470, ""name"": ""spy""}, {""id"": 818, ""name...",A cryptic message from Bond’s past sends him o...,6.493333
3,49026,"[{""id"": 28, ""name"": ""Action""}, {""id"": 80, ""nam...",7.6,9106,112.31295,The Dark Knight Rises,"[{""id"": 849, ""name"": ""dc comics""}, {""id"": 853,...",Following the death of District Attorney Harve...,7.492998
4,49529,"[{""id"": 28, ""name"": ""Action""}, {""id"": 12, ""nam...",6.1,2124,43.926995,John Carter,"[{""id"": 818, ""name"": ""based on novel""}, {""id"":...","John Carter is a war-weary, former military ca...",6.500396


In [12]:
data.shape

(481, 9)

In [13]:
data[['genres', 'keywords']].head(2)

Unnamed: 0,genres,keywords
0,"[{""id"": 28, ""name"": ""Action""}, {""id"": 12, ""nam...","[{""id"": 1463, ""name"": ""culture clash""}, {""id"":..."
1,"[{""id"": 12, ""name"": ""Adventure""}, {""id"": 14, ""...","[{""id"": 270, ""name"": ""ocean""}, {""id"": 726, ""na..."


In [14]:
data['genres'] = data['genres'].apply(literal_eval)
data['keywords'] = data['keywords'].apply(literal_eval)

In [15]:
data[['genres', 'keywords']].head(2)

Unnamed: 0,genres,keywords
0,"[{'id': 28, 'name': 'Action'}, {'id': 12, 'nam...","[{'id': 1463, 'name': 'culture clash'}, {'id':..."
1,"[{'id': 12, 'name': 'Adventure'}, {'id': 14, '...","[{'id': 270, 'name': 'ocean'}, {'id': 726, 'na..."


In [16]:
data['genres'] = data['genres'].apply(lambda x : [d['name'] for d in x]).apply(lambda x : " ".join(x))
data['keywords'] = data['keywords'].apply(lambda x : [d['name'] for d in x]).apply(lambda x : " ".join(x))

In [17]:
data.head(2)

Unnamed: 0,id,genres,vote_average,vote_count,popularity,title,keywords,overview,score
0,19995,Action Adventure Fantasy Science Fiction,7.2,11800,150.437577,Avatar,culture clash future space war space colony so...,"In the 22nd century, a paraplegic Marine is di...",7.168053
1,285,Adventure Fantasy Action,6.9,4500,139.082615,Pirates of the Caribbean: At World's End,ocean drug abuse exotic island east india trad...,"Captain Barbossa, long believed to be dead, ha...",6.918271


In [18]:
data.to_csv('C:/Users/kjh96/OneDrive/Desktop/data/the-movies-dataset/pre_tmdb_5000_movies.csv', index = False)

In [19]:
data.shape

(481, 9)

In [20]:
data

Unnamed: 0,id,genres,vote_average,vote_count,popularity,title,keywords,overview,score
0,19995,Action Adventure Fantasy Science Fiction,7.2,11800,150.437577,Avatar,culture clash future space war space colony so...,"In the 22nd century, a paraplegic Marine is di...",7.168053
1,285,Adventure Fantasy Action,6.9,4500,139.082615,Pirates of the Caribbean: At World's End,ocean drug abuse exotic island east india trad...,"Captain Barbossa, long believed to be dead, ha...",6.918271
2,206647,Action Adventure Crime,6.3,4466,107.376788,Spectre,spy based on novel secret agent sequel mi6 bri...,A cryptic message from Bond’s past sends him o...,6.493333
3,49026,Action Crime Drama Thriller,7.6,9106,112.312950,The Dark Knight Rises,dc comics crime fighter terrorist secret ident...,Following the death of District Attorney Harve...,7.492998
4,49529,Action Adventure Science Fiction,6.1,2124,43.926995,John Carter,based on novel mars medallion space travel pri...,"John Carter is a war-weary, former military ca...",6.500396
...,...,...,...,...,...,...,...,...,...
4291,176,Horror Mystery Crime,7.2,2184,63.655973,Saw,shotgun based on short film sadist pistol chai...,Obsessed with teaching his victims the value o...,7.091679
4300,500,Crime Thriller,8.0,3697,66.925866,Reservoir Dogs,traitor jewelry psychopath thief heist betraya...,A botched robbery indicates a police informant...,7.655593
4302,429,Western,8.1,2311,88.377076,"The Good, the Bad and the Ugly",bounty hunter refugee gold anti hero gallows h...,While the Civil War rages between the Union an...,7.596247
4337,103,Crime Drama,8.0,2535,58.845025,Taxi Driver,vietnam veteran taxi obsession drug dealer nig...,A mentally unstable Vietnam War veteran works ...,7.564085


# 2. 콘텐츠 기반 필터링 추천(Content based filtering)

In [21]:
data.genres.head(2)

0    Action Adventure Fantasy Science Fiction
1                    Adventure Fantasy Action
Name: genres, dtype: object

In [56]:
count_vector = CountVectorizer(ngram_range=(1, 1))

In [57]:
count_vector

CountVectorizer(analyzer='word', binary=False, decode_error='strict',
                dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
                lowercase=True, max_df=1.0, max_features=None, min_df=1,
                ngram_range=(1, 1), preprocessor=None, stop_words=None,
                strip_accents=None, token_pattern='(?u)\\b\\w\\w+\\b',
                tokenizer=None, vocabulary=None)

In [58]:
c_vector_genres = count_vector.fit_transform(data['genres'])

In [59]:
c_vector_genres.shape

(481, 18)

In [60]:
c_vector_genres

<481x18 sparse matrix of type '<class 'numpy.int64'>'
	with 1550 stored elements in Compressed Sparse Row format>

### 2.1 유사도 측정은 코사인 유사도(cosine similarity)를 사용합니다.

In [61]:
#코사인 유사도를 구한 벡터를 미리 저장
gerne_c_sim = cosine_similarity(c_vector_genres, c_vector_genres).argsort()[:, ::-1]

In [62]:
gerne_c_sim.shape

(481, 481)

In [63]:
gerne_c_sim

array([[  0,  13, 163, ..., 298, 297, 240],
       [ 88,  11, 193, ..., 329, 330, 240],
       [376,   2,  10, ..., 314, 304, 240],
       ...,
       [478, 187,  12, ..., 326, 327,   0],
       [479, 458, 298, ..., 220, 224,   0],
       [480, 468, 294, ..., 246, 248,   0]], dtype=int64)

In [64]:
data

Unnamed: 0,id,genres,vote_average,vote_count,popularity,title,keywords,overview,score
0,19995,Action Adventure Fantasy Science Fiction,7.2,11800,150.437577,Avatar,culture clash future space war space colony so...,"In the 22nd century, a paraplegic Marine is di...",7.168053
1,285,Adventure Fantasy Action,6.9,4500,139.082615,Pirates of the Caribbean: At World's End,ocean drug abuse exotic island east india trad...,"Captain Barbossa, long believed to be dead, ha...",6.918271
2,206647,Action Adventure Crime,6.3,4466,107.376788,Spectre,spy based on novel secret agent sequel mi6 bri...,A cryptic message from Bond’s past sends him o...,6.493333
3,49026,Action Crime Drama Thriller,7.6,9106,112.312950,The Dark Knight Rises,dc comics crime fighter terrorist secret ident...,Following the death of District Attorney Harve...,7.492998
4,49529,Action Adventure Science Fiction,6.1,2124,43.926995,John Carter,based on novel mars medallion space travel pri...,"John Carter is a war-weary, former military ca...",6.500396
...,...,...,...,...,...,...,...,...,...
476,176,Horror Mystery Crime,7.2,2184,63.655973,Saw,shotgun based on short film sadist pistol chai...,Obsessed with teaching his victims the value o...,7.091679
477,500,Crime Thriller,8.0,3697,66.925866,Reservoir Dogs,traitor jewelry psychopath thief heist betraya...,A botched robbery indicates a police informant...,7.655593
478,429,Western,8.1,2311,88.377076,"The Good, the Bad and the Ugly",bounty hunter refugee gold anti hero gallows h...,While the Civil War rages between the Union an...,7.596247
479,103,Crime Drama,8.0,2535,58.845025,Taxi Driver,vietnam veteran taxi obsession drug dealer nig...,A mentally unstable Vietnam War veteran works ...,7.564085


In [65]:
def get_recommend_movie_list(df, movie_title, top=30):
    # 특정 영화와 비슷한 영화를 추천해야 하기 때문에 '특정 영화' 정보를 뽑아낸다.
    target_movie_index = df[df['title'] == movie_title].index.values
    
    #코사인 유사도 중 비슷한 코사인 유사도를 가진 정보를 뽑아낸다.
    sim_index = gerne_c_sim[target_movie_index, :top].reshape(-1)
    
    #본인을 제외
    sim_index = sim_index[sim_index != target_movie_index]

    #data frame으로 만들고 vote_count으로 정렬한 뒤 return
    result = df.iloc[sim_index].sort_values('score', ascending=False)[:10]
    return result

In [66]:
get_recommend_movie_list(data, movie_title='The Dark Knight Rises')

Unnamed: 0,id,genres,vote_average,vote_count,popularity,title,keywords,overview,score
58,155,Drama Action Crime Thriller,8.2,12002,187.322927,The Dark Knight,dc comics crime fighter secret identity scarec...,Batman raises the stakes in his war on crime. ...,8.03569
395,274,Crime Drama Thriller,8.1,4443,18.174804,The Silence of the Lambs,based on novel psychopath horror suspense seri...,"FBI trainee, Clarice Starling ventures into a ...",7.767228
461,629,Drama Crime Thriller,8.1,3254,64.025031,The Usual Suspects,law relatives theft criminal criminal mastermi...,"Held in an L.A. interrogation room, Verbal Kin...",7.689531
212,1422,Drama Thriller Crime,7.9,4339,63.429157,The Departed,undercover boston police friends mafia underco...,"To take down South Boston's Irish Mafia, the p...",7.621146
385,111,Action Crime Drama Thriller,8.0,2948,70.105981,Scarface,miami corruption capitalism cuba prohibition b...,After getting a green card in exchange for ass...,7.601698
312,146233,Drama Thriller Crime,7.9,3085,88.496873,Prisoners,pennsylvania kidnapping maze vigilante rural s...,When Keller Dover's daughter and her friend go...,7.550121
383,6977,Crime Drama Thriller,7.7,3003,53.645267,No Country for Old Men,texas drug traffic hitman united states–mexico...,"Llewelyn Moss stumbles upon dead bodies, $2 mi...",7.42014
296,393,Action Crime Thriller,7.6,3948,50.622607,Kill Bill: Vol. 2,brother brother relationship swordplay katana ...,The Bride unwaveringly continues on her roarin...,7.397616
97,272,Action Crime Drama,7.5,7359,115.040024,Batman Begins,himalaya martial arts dc comics crime fighter ...,"Driven by tragedy, billionaire Bruce Wayne ded...",7.392662
454,242582,Crime Drama Thriller,7.6,3395,89.095538,Nightcrawler,journalism underground tv station sociopath ho...,"When Lou Bloom, desperate for work, muscles in...",7.376231


In [67]:
sum(listt)/4data = data.reset_index(drop=True)

listt = []
for i in range(len(data['keywords'])) :
    a = int(len(data['keywords'][i]))
    listt.append(a)
    
len(listt)
sum(listt)/48181

SyntaxError: invalid syntax (<ipython-input-67-80d7a7a4c7f2>, line 1)

In [68]:
data[data['title'] == 'The Dark Knight Rises']

Unnamed: 0,id,genres,vote_average,vote_count,popularity,title,keywords,overview,score
3,49026,Action Crime Drama Thriller,7.6,9106,112.31295,The Dark Knight Rises,dc comics crime fighter terrorist secret ident...,Following the death of District Attorney Harve...,7.492998


# 결론 
카카오 아레나 멜론 플레이리스트 추천에 그대로 접목시킬 수 있을 듯 하다.
하지만 장르가 많을 경우에 좀 더 정확한 추천이 될 수 있을 듯 하다. 평균 4개로 그리 많지 않고 나이브한 추천이 될 가능성이 높다. 
좀 더 공부해봐야 겠지만 장르 뿐 만 아니라 제목이나 가수 이름 등등까지 고려하여 한번에 추천이 가능한지 알아봐야 겠다.