# 추천 시스템

- 콘텐츠 기반 필터링
사용자가 특정한 아이템을 매우 선호하는 경우, 그 아이템과 비슷한 콘텐츠를 가진 다른 아이템을 추천하는 방식
- 협업 필터링
사용자가 아이템에 매긴 평점 정보나 상품구매이력과 같은 사용자 행동양식을 기반으로 추천을 수행
1. 최근접이웃(Nearest Neighbor)
- 사용자 기반 : 당신과 비슷한 고객들이 다음 상품도 구매했다.
- 아이템 기반 : 이 상품을 선택한 다른 고객들은 다음 상품도 구매했다. (사용자들이 그 아이템을 좋아하는지/싫어하는지 평가한 척도가 유사한 아이템을 추천하는 방식)

2. 잠재요인(Latent Factor)

In [2]:
import pandas as pd
import numpy as np
import warnings;warnings.filterwarnings('ignore')
from sklearn.metrics import mean_squared_error

In [4]:
# Toy rating matrix: rows are users, columns are items, and NaN marks a
# rating that has not been observed.
# np.nan is used rather than the np.NaN alias, which NumPy 2.0 removed.
R = np.array([
    [4, np.nan, np.nan, 2, np.nan],
    [np.nan, 5, np.nan, 3, 1],
    [np.nan, np.nan, 3, 4, 4],
    [5, 2, 1, 2, np.nan],
])
num_users, num_items = R.shape

In [31]:
# Draw the initial latent-factor matrices from a zero-mean Gaussian with
# standard deviation 1/K. The seed is fixed so the starting point repeats.
np.random.seed(1)
K = 3  # number of latent factors
# P is drawn first (num_users x K), then Q (num_items x K).
P, Q = (
    np.random.normal(scale=1. / K, size=(n_rows, K))
    for n_rows in (num_users, num_items)
)

In [37]:
def get_rmse(R,P,Q,non_zeros):
    """Return the RMSE between observed ratings and their reconstruction.

    The prediction matrix is ``np.dot(P, Q.T)``. Only the positions listed
    in ``non_zeros`` are compared, so missing entries of ``R`` never
    contribute to the error.

    Args:
        R: Rating matrix indexed as ``R[row, col]``.
        P: Factor matrix with one row per row of ``R``.
        Q: Factor matrix with one row per column of ``R``.
        non_zeros: Sequence of tuples whose first two elements are the row
            and column index of an observed rating.

    Returns:
        Root mean squared error over the listed positions.

    Raises:
        ValueError: If ``non_zeros`` is empty.
    """
    if len(non_zeros) == 0:
        # An RMSE over zero samples is undefined; fail loudly instead of
        # returning NaN.
        raise ValueError('non_zeros must contain at least one observed rating')

    pred_matrix = np.dot(P, Q.T)
    rows = [entry[0] for entry in non_zeros]
    cols = [entry[1] for entry in non_zeros]
    # Fancy indexing picks out only the observed positions in both matrices.
    residuals = R[rows, cols] - pred_matrix[rows, cols]
    return np.sqrt(np.mean(residuals ** 2))

In [15]:
# Collect (row index, column index, rating) for every observed entry of R.
# NaN compares False against 0, so missing ratings are skipped.
non_zeros = [
    (i, j, R[i, j])
    for i, j in np.ndindex(num_users, num_items)
    if R[i, j] > 0
]

In [41]:
# Stochastic gradient descent over the observed ratings with L2
# regularisation (r_lambda) on both factor matrices.
steps = 1000
learning_rate = 0.01
r_lambda = 0.01
for step in range(steps):
    for i, j, r in non_zeros:
        # Prediction error for this single observed rating.
        eij = r - np.dot(P[i, :], Q[j, :].T)
        P[i, :] = P[i, :] + learning_rate * (eij * Q[j, :] - r_lambda * P[i, :])
        # Note: this update reads the P row that was just updated above.
        Q[j, :] = Q[j, :] + learning_rate * (eij * P[i, :] - r_lambda * Q[j, :])
    # Evaluate once per step rather than once per rating: the value seen by
    # the print below is unchanged, but the redundant evaluations are gone.
    rmse = get_rmse(R, P, Q, non_zeros)
    if (step % 50) == 0:
        print('###iteration step:', step, 'rmse:', rmse)

###iteration step: 0 rmse: 0.01655803890814364
###iteration step: 50 rmse: 0.016513859189388212
###iteration step: 100 rmse: 0.016469302176442754
###iteration step: 150 rmse: 0.016424510253994627
###iteration step: 200 rmse: 0.016379631765893563
###iteration step: 250 rmse: 0.016334805739707706
###iteration step: 300 rmse: 0.016290155305291112
###iteration step: 350 rmse: 0.016245785731998592
###iteration step: 400 rmse: 0.016201784817113825
###iteration step: 450 rmse: 0.016158224373926302
###iteration step: 500 rmse: 0.01611516214017394
###iteration step: 550 rmse: 0.01607264374956633
###iteration step: 600 rmse: 0.016030704589303422
###iteration step: 650 rmse: 0.015989371466084153
###iteration step: 700 rmse: 0.01594866405691241
###iteration step: 750 rmse: 0.015908596148844503
###iteration step: 800 rmse: 0.015869176685154265
###iteration step: 850 rmse: 0.015830410640611893
###iteration step: 900 rmse: 0.015792299749517028
###iteration step: 950 rmse: 0.01575484310887397


In [3]:
# Load the tmdb_5000_movies CSV into a DataFrame and display it.
# NOTE(review): the path is absolute and machine-specific; adjust it before
# running this cell on another machine.
movies=pd.read_csv(r'C:\Users\PC\Desktop\새 폴더\tmdb_5000_movies.csv')
movies

Unnamed: 0,budget,genres,homepage,id,keywords,original_language,original_title,overview,popularity,production_companies,production_countries,release_date,revenue,runtime,spoken_languages,status,tagline,title,vote_average,vote_count
0,237000000,"[{""id"": 28, ""name"": ""Action""}, {""id"": 12, ""nam...",http://www.avatarmovie.com/,19995,"[{""id"": 1463, ""name"": ""culture clash""}, {""id"":...",en,Avatar,"In the 22nd century, a paraplegic Marine is di...",150.437577,"[{""name"": ""Ingenious Film Partners"", ""id"": 289...","[{""iso_3166_1"": ""US"", ""name"": ""United States o...",2009-12-10,2787965087,162.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}, {""iso...",Released,Enter the World of Pandora.,Avatar,7.2,11800
1,300000000,"[{""id"": 12, ""name"": ""Adventure""}, {""id"": 14, ""...",http://disney.go.com/disneypictures/pirates/,285,"[{""id"": 270, ""name"": ""ocean""}, {""id"": 726, ""na...",en,Pirates of the Caribbean: At World's End,"Captain Barbossa, long believed to be dead, ha...",139.082615,"[{""name"": ""Walt Disney Pictures"", ""id"": 2}, {""...","[{""iso_3166_1"": ""US"", ""name"": ""United States o...",2007-05-19,961000000,169.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}]",Released,"At the end of the world, the adventure begins.",Pirates of the Caribbean: At World's End,6.9,4500
2,245000000,"[{""id"": 28, ""name"": ""Action""}, {""id"": 12, ""nam...",http://www.sonypictures.com/movies/spectre/,206647,"[{""id"": 470, ""name"": ""spy""}, {""id"": 818, ""name...",en,Spectre,A cryptic message from Bond’s past sends him o...,107.376788,"[{""name"": ""Columbia Pictures"", ""id"": 5}, {""nam...","[{""iso_3166_1"": ""GB"", ""name"": ""United Kingdom""...",2015-10-26,880674609,148.0,"[{""iso_639_1"": ""fr"", ""name"": ""Fran\u00e7ais""},...",Released,A Plan No One Escapes,Spectre,6.3,4466
3,250000000,"[{""id"": 28, ""name"": ""Action""}, {""id"": 80, ""nam...",http://www.thedarkknightrises.com/,49026,"[{""id"": 849, ""name"": ""dc comics""}, {""id"": 853,...",en,The Dark Knight Rises,Following the death of District Attorney Harve...,112.312950,"[{""name"": ""Legendary Pictures"", ""id"": 923}, {""...","[{""iso_3166_1"": ""US"", ""name"": ""United States o...",2012-07-16,1084939099,165.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}]",Released,The Legend Ends,The Dark Knight Rises,7.6,9106
4,260000000,"[{""id"": 28, ""name"": ""Action""}, {""id"": 12, ""nam...",http://movies.disney.com/john-carter,49529,"[{""id"": 818, ""name"": ""based on novel""}, {""id"":...",en,John Carter,"John Carter is a war-weary, former military ca...",43.926995,"[{""name"": ""Walt Disney Pictures"", ""id"": 2}]","[{""iso_3166_1"": ""US"", ""name"": ""United States o...",2012-03-07,284139100,132.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}]",Released,"Lost in our world, found in another.",John Carter,6.1,2124
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4798,220000,"[{""id"": 28, ""name"": ""Action""}, {""id"": 80, ""nam...",,9367,"[{""id"": 5616, ""name"": ""united states\u2013mexi...",es,El Mariachi,El Mariachi just wants to play his guitar and ...,14.269792,"[{""name"": ""Columbia Pictures"", ""id"": 5}]","[{""iso_3166_1"": ""MX"", ""name"": ""Mexico""}, {""iso...",1992-09-04,2040920,81.0,"[{""iso_639_1"": ""es"", ""name"": ""Espa\u00f1ol""}]",Released,"He didn't come looking for trouble, but troubl...",El Mariachi,6.6,238
4799,9000,"[{""id"": 35, ""name"": ""Comedy""}, {""id"": 10749, ""...",,72766,[],en,Newlyweds,A newlywed couple's honeymoon is upended by th...,0.642552,[],[],2011-12-26,0,85.0,[],Released,A newlywed couple's honeymoon is upended by th...,Newlyweds,5.9,5
4800,0,"[{""id"": 35, ""name"": ""Comedy""}, {""id"": 18, ""nam...",http://www.hallmarkchannel.com/signedsealeddel...,231617,"[{""id"": 248, ""name"": ""date""}, {""id"": 699, ""nam...",en,"Signed, Sealed, Delivered","""Signed, Sealed, Delivered"" introduces a dedic...",1.444476,"[{""name"": ""Front Street Pictures"", ""id"": 3958}...","[{""iso_3166_1"": ""US"", ""name"": ""United States o...",2013-10-13,0,120.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}]",Released,,"Signed, Sealed, Delivered",7.0,6
4801,0,[],http://shanghaicalling.com/,126186,[],en,Shanghai Calling,When ambitious New York attorney Sam is sent t...,0.857008,[],"[{""iso_3166_1"": ""US"", ""name"": ""United States o...",2012-05-03,0,98.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}]",Released,A New Yorker in Shanghai,Shanghai Calling,5.7,7


In [4]:
# Keep only the columns used below. The explicit copy makes movies_df an
# independent frame, so later column assignments are not chained
# assignments on a slice of `movies`.
movies_df = movies[['id', 'title', 'genres', 'vote_average', 'vote_count',
                    'popularity', 'keywords', 'overview']].copy()

In [5]:
# Widen the column display so the long genres/keywords strings are readable.
# The option name is spelled out in full: abbreviated names are resolved by
# pattern matching and can become ambiguous if pandas adds similar options.
pd.set_option('display.max_colwidth', 100)
movies_df[['genres', 'keywords']][:1]

Unnamed: 0,genres,keywords
0,"[{""id"": 28, ""name"": ""Action""}, {""id"": 12, ""name"": ""Adventure""}, {""id"": 14, ""name"": ""Fantasy""}, {...","[{""id"": 1463, ""name"": ""culture clash""}, {""id"": 2964, ""name"": ""future""}, {""id"": 3386, ""name"": ""sp..."


In [6]:
# literal_eval turns the string representation of a list/dict back into the
# actual Python object.
from ast import literal_eval
for column in ('genres', 'keywords'):
    movies_df[column] = movies_df[column].apply(literal_eval)

In [13]:
# Reduce each list of dicts to the list of its 'name' values.
for column in ('genres', 'keywords'):
    movies_df[column] = movies_df[column].apply(
        lambda entries: [entry['name'] for entry in entries]
    )

In [14]:
# Show the first row of the genres/keywords columns to check the conversion.
movies_df[['genres','keywords']][:1]

Unnamed: 0,genres,keywords
0,"[Action, Adventure, Fantasy, Science Fiction]","[culture clash, future, space war, space colony, society, space travel, futuristic, romance, spa..."


- 문자열로 변환된 genres 칼럼을 Count 기반으로 피처 벡터화 변환한다.
- genres문자열을 피처 벡터화 행렬로 변환한 데이터 세트를 코사인 유사도를 통해 비교, 이를 위해 데이터 세트의 레코드별로 타 레코드와 장르에서 코사인 유사도 값을 가지는 객체 생성
- 장르 유사도가 높은 영화 중에서 평점이 높은 순으로 영화 추천

In [16]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# Join each movie's genre list into one space-separated string so it can be
# fed to a text vectorizer.
movies_df['genres_literal'] = movies_df['genres'].apply(' '.join)

# TF-IDF features of the genre strings.
tfidf = TfidfVectorizer()
matrix = tfidf.fit_transform(movies_df['genres_literal'])

# Pairwise cosine similarity between all movies, shown rounded to 4 decimals.
cosine_matrix = cosine_similarity(matrix, matrix)
cosine_matrix.round(4)

array([[1.    , 0.7453, 0.4294, ..., 0.    , 0.    , 0.    ],
       [0.7453, 1.    , 0.5762, ..., 0.    , 0.    , 0.    ],
       [0.4294, 0.5762, 1.    , ..., 0.    , 0.    , 0.    ],
       ...,
       [0.    , 0.    , 0.    , ..., 1.    , 0.    , 0.    ],
       [0.    , 0.    , 0.    , ..., 0.    , 0.    , 0.    ],
       [0.    , 0.    , 0.    , ..., 0.    , 0.    , 1.    ]])

In [19]:
# For every row, column indices ordered from most to least similar
# (ascending argsort, then reversed along the columns).
sorted_ = np.argsort(cosine_matrix)[:, ::-1]

In [24]:
# Find the row(s) titled 'The Godfather' and take the first 10 entries of
# the similarity-sorted index for them.
title_movie = movies_df.loc[movies_df['title'] == 'The Godfather']
title_index = title_movie.index.to_numpy()
similar_indexes = sorted_[title_index, :10].ravel()

# Show the title and average vote of the selected movies.
movies_df[['title', 'vote_average']].iloc[similar_indexes]

Unnamed: 0,title,vote_average
1847,GoodFellas,8.2
1881,The Shawshank Redemption,8.5
3567,Monster,7.0
4079,Boys Don't Cry,7.2
3887,Trainspotting,7.8
4159,River's Edge,6.7
1243,Mean Streets,7.2
4502,Water & Power,3.0
4098,Ghost Dog: The Way of the Samurai,7.2
3699,25th Hour,7.2


In [51]:
from sklearn.feature_extraction.text import CountVectorizer

# Join each movie's genre list into one space-separated string.
movies_df['genres_literal'] = movies_df['genres'].apply(lambda x: ' '.join(x))
# Count unigrams and bigrams of the genre words.
# min_df=1 keeps every term, exactly as min_df=0 did, but is accepted by
# scikit-learn's parameter validation, which rejects an integer min_df
# below 1.
count_vect = CountVectorizer(min_df=1, ngram_range=(1, 2))
genre_mat = count_vect.fit_transform(movies_df['genres_literal'])
genre_mat

<4803x276 sparse matrix of type '<class 'numpy.int64'>'
	with 20631 stored elements in Compressed Sparse Row format>

In [52]:
from sklearn.metrics.pairwise import cosine_similarity
# Pairwise cosine similarity between the rows of the genre count matrix.
genre_sim=cosine_similarity(genre_mat,genre_mat)

In [55]:
# Reversing the ascending argsort along the columns gives, for each row of
# genre_sim, the positional indices of the compared rows ordered from
# highest to lowest similarity.
genre_sim_sorted_ind = np.argsort(genre_sim, axis=1)[:, ::-1]

In [66]:
# Same lookup as before, this time against the count-based genre similarity.
title_movie = movies_df.loc[movies_df['title'] == 'The Godfather']
title_index = title_movie.index.to_numpy()
similar_indexes = genre_sim_sorted_ind[title_index, :10].flatten()
movies_df[['title', 'vote_average']].iloc[similar_indexes]

Unnamed: 0,title,vote_average
2731,The Godfather: Part II,8.3
1243,Mean Streets,7.2
3636,Light Sleeper,5.7
1946,The Bad Lieutenant: Port of Call - New Orleans,6.0
2640,Things to Do in Denver When You're Dead,6.7
4065,Mi America,0.0
1847,GoodFellas,8.2
4217,Kids,6.8
883,Catch Me If You Can,7.7
3866,City of God,8.1


In [67]:
# Ten movies with the highest raw vote_average, shown with their vote_count.
(
    movies_df[['title', 'vote_average', 'vote_count']]
    .sort_values(by='vote_average', ascending=False)
    .head(10)
)

Unnamed: 0,title,vote_average,vote_count
3519,Stiff Upper Lips,10.0,1
4247,Me You and Five Bucks,10.0,2
4045,"Dancer, Texas Pop. 81",10.0,1
4662,Little Big Top,10.0,1
3992,Sardaarji,9.5,2
2386,One Man's Hero,9.3,2
2970,There Goes My Baby,8.5,2
1881,The Shawshank Redemption,8.5,8205
2796,The Prisoner of Zenda,8.4,11
3337,The Godfather,8.4,5893


In [68]:
# Parameters of the weighted rating:
#   m - the 0.6 quantile of vote_count
#   C - the mean vote_average over all movies
m = movies_df['vote_count'].quantile(q=0.6)
C = movies_df['vote_average'].mean()


def weighted_vote_average(record):
    """Return a vote average pulled towards the global mean for low counts.

    Reads ``record['vote_count']`` and ``record['vote_average']`` and blends
    the movie's own average with the module-level mean ``C``, weighting by
    the vote count against the module-level threshold ``m``.
    """
    votes = record['vote_count']
    rating = record['vote_average']
    total = votes + m
    own_part = (votes / total) * rating
    prior_part = (m / total) * C
    return own_part + prior_part

In [71]:
# Apply weighted_vote_average to each row (axis=1) and store the result in a
# new 'weighted_vote' column.
movies_df['weighted_vote']=movies_df.apply(weighted_vote_average,axis=1)

In [72]:
# Ten movies with the highest weighted vote.
(
    movies_df[['title', 'vote_average', 'weighted_vote', 'vote_count']]
    .sort_values(by='weighted_vote', ascending=False)
    .head(10)
)

Unnamed: 0,title,vote_average,weighted_vote,vote_count
1881,The Shawshank Redemption,8.5,8.396052,8205
3337,The Godfather,8.4,8.263591,5893
662,Fight Club,8.3,8.216455,9413
3232,Pulp Fiction,8.3,8.207102,8428
65,The Dark Knight,8.2,8.13693,12002
1818,Schindler's List,8.3,8.126069,4329
3865,Whiplash,8.3,8.123248,4254
809,Forrest Gump,8.2,8.105954,7927
2294,Spirited Away,8.3,8.105867,3840
2731,The Godfather: Part II,8.3,8.079586,3338


In [76]:
# Recommend movies similar in genre to 'The Godfather', re-ranked by
# weighted vote.
title_movie = movies_df[movies_df['title'] == 'The Godfather']
title_index = title_movie.index.values
# Take twice as many candidates as will be shown, so there is room to
# re-rank by weighted vote.
similar_indexes = genre_sim_sorted_ind[title_index, :10 * 2].reshape(-1)
# Drop the query movie itself. np.isin also works when the title matches
# several rows, where an elementwise `!=` against title_index would not
# broadcast.
similar_indexes = similar_indexes[~np.isin(similar_indexes, title_index)]
similar_movies = movies_df.iloc[similar_indexes].sort_values('weighted_vote', ascending=False)[:10]
similar_movies[['title', 'vote_average', 'weighted_vote']]

Unnamed: 0,title,vote_average,weighted_vote
2731,The Godfather: Part II,8.3,8.079586
1847,GoodFellas,8.2,7.976937
3866,City of God,8.1,7.759693
1663,Once Upon a Time in America,8.2,7.657811
883,Catch Me If You Can,7.7,7.557097
281,American Gangster,7.4,7.141396
4041,This Is England,7.4,6.739664
1149,American Hustle,6.8,6.717525
1243,Mean Streets,7.2,6.626569
2839,Rounders,6.9,6.530427


In [82]:
from surprise import SVD
from surprise import Dataset
from surprise import accuracy
from surprise.model_selection import train_test_split

In [81]:
import surprise

In [85]:
# Load the built-in 'ml-100k' dataset and split it, holding out 25% as the
# test set (random_state=0 seeds the split).
data=Dataset.load_builtin('ml-100k') 
trainset,testset=train_test_split(data,test_size=0.25,random_state=0)

In [88]:
# Seed the model as well as the split, so repeated runs give the same
# estimates.
algo = SVD(random_state=0)
algo.fit(trainset)
# test    : predicts ratings for the whole user-item rating data set passed in
# predict : returns the predicted rating for an individual user and movie
predictions = algo.test(testset)
len(predictions)

25000

In [89]:
# Show the first five prediction objects.
predictions[:5]

[Prediction(uid='120', iid='282', r_ui=4.0, est=3.5420490217434244, details={'was_impossible': False}),
 Prediction(uid='882', iid='291', r_ui=4.0, est=4.064088825693579, details={'was_impossible': False}),
 Prediction(uid='535', iid='507', r_ui=5.0, est=3.906510091281124, details={'was_impossible': False}),
 Prediction(uid='697', iid='244', r_ui=5.0, est=3.567075899332699, details={'was_impossible': False}),
 Prediction(uid='751', iid='385', r_ui=4.0, est=3.2573561385818643, details={'was_impossible': False})]