MF 잠재요인 협업필터링

In [14]:
# Mount Google Drive into the Colab runtime so that the CSV files under
# /content/drive can be read by the cells below.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [15]:
import pandas as pd
import numpy as np
from sklearn.metrics import mean_squared_error
import matplotlib.pyplot as plt
from tqdm import tqdm_notebook

In [16]:
#/content/drive/MyDrive/Classroom/인공지능 서비스 개발자 _ 강동1기/data/ml_lastest_small/ratings.csv
#/content/drive/MyDrive/Classroom/인공지능 서비스 개발자 _ 강동1기/data/ml_lastest_small/movies.csv

In [17]:
def get_rmse(R,P,Q,non_zeros):
  """Return the RMSE between R and np.dot(P, Q.T) over the listed positions.

  Args:
    R: 2-D array holding the actual ratings.
    P, Q: factor matrices; np.dot(P, Q.T) is indexed with the same
      (row, column) positions as R.
    non_zeros: sequence of tuples whose first two items are the row and
      column index of an observed rating (extra items are ignored).

  Returns:
    Square root of the mean squared error between the actual and the
    predicted values at the listed positions.
  """
  # Rebuild the full prediction matrix from the two factor matrices.
  full_pred_matrix=np.dot(P,Q.T)

  # Only the observed positions take part in the error computation.
  row_idx=[entry[0] for entry in non_zeros]
  col_idx=[entry[1] for entry in non_zeros]
  actual=R[row_idx,col_idx]
  predicted=full_pred_matrix[row_idx,col_idx]

  return np.sqrt(mean_squared_error(actual,predicted))



In [18]:
def matrix_factorization(R,K,steps=200,learning_rate=0.01,r_lambda=0.01):
  """Factorize R into P and Q with SGD so that np.dot(P, Q.T) approximates R.

  Only entries with R[i, j] > 0 are treated as observed; all other cells
  (zeros, NaN) are ignored both for training and for the reported RMSE.

  Args:
    R: 2-D numpy array of ratings.
    K: number of latent factors.
    steps: number of full passes over the observed entries.
    learning_rate: step size of each SGD update.
    r_lambda: regularization coefficient applied to P and Q.

  Returns:
    Tuple (P, Q) with shapes (R.shape[0], K) and (R.shape[1], K).

  Note:
    Calls np.random.seed(32), which resets NumPy's global random state.
    Prints the training RMSE every 10th step.
  """
  # Function-scope import: tqdm.tqdm_notebook is deprecated in favour of
  # tqdm.notebook.tqdm.
  from tqdm.notebook import tqdm

  num_users,num_items=R.shape

  # Initialise P and Q with normally distributed random values (fixed seed).
  np.random.seed(32)
  P=np.random.normal(scale=1./K,size=(num_users,K))
  Q=np.random.normal(scale=1./K,size=(num_items,K))

  # Collect (row, column, value) for every entry with R > 0. np.nonzero walks
  # the mask in row-major order, i.e. the same order as a nested i/j loop.
  R_arr=np.asarray(R)
  rows,cols=np.nonzero(R_arr>0)
  non_zeros=list(zip(rows.tolist(),cols.tolist(),R_arr[rows,cols].tolist()))

  # Stochastic gradient descent over the observed entries.
  for step in tqdm(range(steps)):
    for i,j,r in non_zeros:
      # Error between the actual and the currently predicted rating.
      eij=r-np.dot(P[i,:],Q[j,:].T)

      # NOTE: the Q update reads the P row that was just updated on the
      # previous line (sequential, not simultaneous, update).
      P[i,:]=P[i,:]+learning_rate*(eij*Q[j,:]-r_lambda*P[i,:])
      Q[j,:]=Q[j,:]+learning_rate*(eij*P[i,:]-r_lambda*Q[j,:])

    # The RMSE needs a full P.Q^T product, so compute it only when reporting.
    if (step%10) == 0:
      rmse=get_rmse(R,P,Q,non_zeros)
      print('### iteration step:',step,'rmse',rmse)
  return P,Q



In [19]:
# Load the movie metadata and the user ratings CSV files from the mounted Drive.
movies=pd.read_csv('/content/drive/MyDrive/Classroom/인공지능 서비스 개발자 _ 강동1기/data/ml_lastest_small/movies.csv')
ratings=pd.read_csv('/content/drive/MyDrive/Classroom/인공지능 서비스 개발자 _ 강동1기/data/ml_lastest_small/ratings.csv')

In [20]:
# Preview the first 10 rows of the movies table.
movies[:10]

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy
5,6,Heat (1995),Action|Crime|Thriller
6,7,Sabrina (1995),Comedy|Romance
7,8,Tom and Huck (1995),Adventure|Children
8,9,Sudden Death (1995),Action
9,10,GoldenEye (1995),Action|Adventure|Thriller


In [21]:
# Keep only the columns used downstream. Selecting them by name (instead of
# dropping the last column by position) keeps this cell safe to re-run:
# a positional drop would remove one more column on every execution.
ratings=ratings[['userId','movieId','rating']]
ratings[:10]

Unnamed: 0,userId,movieId,rating
0,1,1,4.0
1,1,3,4.0
2,1,6,4.0
3,1,47,5.0
4,1,50,5.0
5,1,70,3.0
6,1,101,5.0
7,1,110,4.0
8,1,151,5.0
9,1,157,5.0


In [22]:
# Pivot the ratings into a userId x movieId table of rating values.
ratings_matrix=pd.pivot_table(ratings,values='rating',index='userId',columns='movieId')
ratings_matrix

movieId,1,2,3,4,5,6,7,8,9,10,...,193565,193567,193571,193573,193579,193581,193583,193585,193587,193609
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,4.0,,4.0,,,4.0,,,,,...,,,,,,,,,,
2,,,,,,,,,,,...,,,,,,,,,,
3,,,,,,,,,,,...,,,,,,,,,,
4,,,,,,,,,,,...,,,,,,,,,,
5,4.0,,,,,,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
606,2.5,,,,,,2.5,,,,...,,,,,,,,,,
607,4.0,,,,,,,,,,...,,,,,,,,,,
608,2.5,2.0,2.0,,,,,,,4.0,...,,,,,,,,,,
609,3.0,,,,,,,,,4.0,...,,,,,,,,,,


In [23]:
# User-item matrix: R (the original matrix).
# Join ratings with movies on movieId to bring in the title column.
ratings.merge(movies,on='movieId')

Unnamed: 0,userId,movieId,rating,title,genres
0,1,1,4.0,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,5,1,4.0,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
2,7,1,4.5,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
3,15,1,2.5,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
4,17,1,4.5,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
...,...,...,...,...,...
100831,610,160341,2.5,Bloodmoon (1997),Action|Thriller
100832,610,160527,4.5,Sympathy for the Underdog (1971),Action|Crime|Drama
100833,610,160836,3.0,Hazard (2005),Action|Drama|Thriller
100834,610,163937,3.5,Blair Witch (2016),Horror|Thriller


In [24]:
# Keep the joined ratings + movie metadata for the title-based pivot.
rating_movies=ratings.merge(movies,on='movieId')

In [25]:
# Preview the userId x title rating table.
rating_movies.pivot_table('rating',index='userId',columns='title')

title,'71 (2014),'Hellboy': The Seeds of Creation (2004),'Round Midnight (1986),'Salem's Lot (2004),'Til There Was You (1997),'Tis the Season for Love (2015),"'burbs, The (1989)",'night Mother (1986),(500) Days of Summer (2009),*batteries not included (1987),...,Zulu (2013),[REC] (2007),[REC]² (2009),[REC]³ 3 Génesis (2012),anohana: The Flower We Saw That Day - The Movie (2013),eXistenZ (1999),xXx (2002),xXx: State of the Union (2005),¡Three Amigos! (1986),À nous la liberté (Freedom for Us) (1931)
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,,,,,,,,,,,...,,,,,,,,,4.0,
2,,,,,,,,,,,...,,,,,,,,,,
3,,,,,,,,,,,...,,,,,,,,,,
4,,,,,,,,,,,...,,,,,,,,,,
5,,,,,,,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
606,,,,,,,,,,,...,,,,,,,,,,
607,,,,,,,,,,,...,,,,,,,,,,
608,,,,,,,,,,,...,,,,,,4.5,3.5,,,
609,,,,,,,,,,,...,,,,,,,,,,


In [26]:
# Rebuild ratings_matrix with movie titles as the column labels.
ratings_matrix=pd.pivot_table(rating_movies,values='rating',index='userId',columns='title')
ratings_matrix.shape

(610, 9719)

In [27]:
%%time
# Factorize the ratings matrix using gradient descent.
# K: number of latent factors, steps=200: run 200 iterations,
# learning_rate: learning rate, r_lambda: L2 regularization coefficient
P,Q=matrix_factorization(ratings_matrix.values, K=50, steps=200, learning_rate=0.001, r_lambda=0.01)

pred_matrix=np.dot(P,Q.T) # matrix product of P and Q.T (dot product) -> predicted ratings


Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  for step in tqdm_notebook(range(steps)):


  0%|          | 0/200 [00:00<?, ?it/s]

### iteration step: 0 rmse 3.653301286812075
### iteration step: 10 rmse 2.903676876755157
### iteration step: 20 rmse 1.5205289261254398
### iteration step: 30 rmse 1.1872637331525944
### iteration step: 40 rmse 1.034235122846529
### iteration step: 50 rmse 0.9437771119864942
### iteration step: 60 rmse 0.8819722960105973
### iteration step: 70 rmse 0.8354006138994143
### iteration step: 80 rmse 0.7981498148812622
### iteration step: 90 rmse 0.7668360566260052
### iteration step: 100 rmse 0.7389592475100021
### iteration step: 110 rmse 0.7129114009819667
### iteration step: 120 rmse 0.687871206097809
### iteration step: 130 rmse 0.6635048549033575
### iteration step: 140 rmse 0.6397312115989023
### iteration step: 150 rmse 0.6165910719118193
### iteration step: 160 rmse 0.5941755622370564
### iteration step: 170 rmse 0.5725803455087053
### iteration step: 180 rmse 0.5518795355717809
### iteration step: 190 rmse 0.5321175203605515
CPU times: user 5min 24s, sys: 29.3 s, total: 5min 53s


In [28]:
#>>경사하강법


In [29]:
# Wrap the predictions in a DataFrame that reuses the row and column labels
# of ratings_matrix.
ratings_pred_matrix=pd.DataFrame(pred_matrix,index=ratings_matrix.index,columns=ratings_matrix.columns)

In [30]:
# Check the shape of the predicted ratings matrix.
ratings_pred_matrix.shape

(610, 9719)

In [31]:
# Check the shape of the original ratings matrix.
ratings_matrix.shape

(610, 9719)

In [32]:
# Preview the predicted ratings for the first 10 users.
ratings_pred_matrix[:10]

title,'71 (2014),'Hellboy': The Seeds of Creation (2004),'Round Midnight (1986),'Salem's Lot (2004),'Til There Was You (1997),'Tis the Season for Love (2015),"'burbs, The (1989)",'night Mother (1986),(500) Days of Summer (2009),*batteries not included (1987),...,Zulu (2013),[REC] (2007),[REC]² (2009),[REC]³ 3 Génesis (2012),anohana: The Flower We Saw That Day - The Movie (2013),eXistenZ (1999),xXx (2002),xXx: State of the Union (2005),¡Three Amigos! (1986),À nous la liberté (Freedom for Us) (1931)
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,3.329877,4.100679,3.864866,4.82651,4.484839,1.424854,4.194762,2.09937,4.567881,4.214466,...,1.240653,4.455618,4.157095,2.885379,2.860802,4.502518,3.193684,2.338569,4.101453,0.972949
2,2.592863,3.254862,3.094457,3.859734,3.631604,1.140111,3.596917,1.638188,3.869188,3.39616,...,0.909707,3.708883,3.351197,2.310941,2.262682,3.739569,2.71033,1.943624,3.379231,0.76709
3,2.202508,2.288866,2.156771,2.6868,2.482222,0.90595,1.941537,1.30321,2.442678,2.332133,...,0.673669,2.62761,2.410951,1.836412,1.5417,2.604251,1.884215,1.329906,1.64401,0.494645
4,2.546707,3.282678,3.06766,3.691657,3.417916,1.093749,3.041134,1.659128,2.516759,3.251099,...,1.07085,3.41367,3.115354,2.157075,2.160567,3.036363,2.083213,1.674098,3.289263,0.691259
5,2.338228,3.069485,2.914598,3.597598,3.412642,1.012854,3.149484,1.588592,3.233155,3.170628,...,0.896246,3.408117,3.091193,2.13412,2.095671,3.399839,2.261464,1.764326,3.012607,0.702507
6,2.556386,3.155322,3.148384,3.865924,3.690346,1.240711,3.732916,1.805725,3.81831,3.368651,...,0.973249,3.953068,3.312072,2.353487,2.200251,3.676899,2.34267,1.923324,3.504445,0.775738
7,2.119131,2.795907,2.700487,3.267977,3.148195,0.923222,2.77328,1.490266,3.014158,2.930265,...,0.70203,3.159299,2.814725,1.904175,1.847337,2.871051,1.851106,1.601634,3.018455,0.640415
8,2.263941,2.976385,2.84063,3.517507,3.353006,0.989437,3.241837,1.570184,3.492917,3.14598,...,0.800749,3.271287,3.057084,2.047672,2.023811,3.476941,2.459784,1.781551,2.972845,0.70486
9,2.582081,3.272603,3.13568,3.836826,3.61351,1.096828,3.47651,1.678819,3.84856,3.395807,...,0.949191,3.618416,3.2851,2.304268,2.231605,3.468185,2.474688,1.92178,3.2584,0.745029
10,2.552279,2.898823,2.845558,3.583465,3.423763,1.312138,3.435229,1.622045,3.704709,3.154183,...,0.835683,3.740867,3.125745,2.235549,2.039389,3.341033,2.344016,1.748327,3.157199,0.6552


In [33]:
# Preview the original ratings for the first 10 users, for comparison.
ratings_matrix[:10]

title,'71 (2014),'Hellboy': The Seeds of Creation (2004),'Round Midnight (1986),'Salem's Lot (2004),'Til There Was You (1997),'Tis the Season for Love (2015),"'burbs, The (1989)",'night Mother (1986),(500) Days of Summer (2009),*batteries not included (1987),...,Zulu (2013),[REC] (2007),[REC]² (2009),[REC]³ 3 Génesis (2012),anohana: The Flower We Saw That Day - The Movie (2013),eXistenZ (1999),xXx (2002),xXx: State of the Union (2005),¡Three Amigos! (1986),À nous la liberté (Freedom for Us) (1931)
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,,,,,,,,,,,...,,,,,,,,,4.0,
2,,,,,,,,,,,...,,,,,,,,,,
3,,,,,,,,,,,...,,,,,,,,,,
4,,,,,,,,,,,...,,,,,,,,,,
5,,,,,,,,,,,...,,,,,,,,,,
6,,,,,,,,,,,...,,,,,,,,,,
7,,,,,,,,,,,...,,,,,,,,,,
8,,,,,,,,,,,...,,,,,,,,,,
9,,,,,,,,,,,...,,,,,,,1.0,,,
10,,,,,,,,,,,...,,,,,,,,,,


In [39]:
#9번 사용자에게 아직 보지 않은 영화(unseen_movies)를 예측 평점 높은 순으로 추천해주기

def get_unseen_movies(ratings_matrix,userId):
  """Return the column labels of ratings_matrix that userId has not rated.

  Args:
    ratings_matrix: DataFrame with one row per user and one column per movie.
    userId: index label of the user's row.

  Returns:
    List of column labels, in column order, whose value in the user's row
    is not greater than 0 (NaN and 0 both count as unseen).
  """
  # The user's row as a Series indexed by the column labels.
  user_rating=ratings_matrix.loc[userId,:]

  # A rating above 0 means the movie was already seen. Keep those labels in
  # a set so that each membership test below is O(1).
  already_seen=set(user_rating[user_rating>0].index.tolist())

  # Every column that is not in the seen set, preserving column order.
  movies_list=ratings_matrix.columns.tolist()
  return [movie for movie in movies_list if movie not in already_seen]

In [40]:
def recomm_movie_by_userid(pred_df,userId,unseen_list,top_n=10):
  """Return the top_n highest predicted ratings among unseen_list for userId.

  Args:
    pred_df: DataFrame of predicted ratings (rows: users, columns: movies).
    userId: index label of the user's row in pred_df.
    unseen_list: column labels to choose from.
    top_n: number of entries to keep.

  Returns:
    Series of predicted ratings sorted in descending order, cut to top_n.
  """
  # Predicted scores of this user, restricted to the candidate columns.
  candidate_scores=pred_df.loc[userId,unseen_list]

  # Highest predicted rating first, then keep the leading top_n entries.
  ranked=candidate_scores.sort_values(ascending=False)
  return ranked[:top_n]


In [41]:
# Extract the titles of the movies that user 9 has not watched.

unseen_list=get_unseen_movies(ratings_matrix,9)
unseen_list[:10]

["'71 (2014)",
 "'Hellboy': The Seeds of Creation (2004)",
 "'Round Midnight (1986)",
 "'Salem's Lot (2004)",
 "'Til There Was You (1997)",
 "'Tis the Season for Love (2015)",
 "'burbs, The (1989)",
 "'night Mother (1986)",
 '(500) Days of Summer (2009)',
 '*batteries not included (1987)']

In [43]:
# Recommend movies with latent-factor collaborative filtering.
recomm_movies=recomm_movie_by_userid(ratings_pred_matrix,9,unseen_list,top_n=10)
recomm_movies

title
Three Billboards Outside Ebbing, Missouri (2017)         4.772504
Lord of the Rings: The Return of the King, The (2003)    4.687062
Bad Boy Bubby (1993)                                     4.673322
Man Bites Dog (C'est arrivé près de chez vous) (1992)    4.658136
General, The (1926)                                      4.615529
Star Wars: Episode V - The Empire Strikes Back (1980)    4.584653
Hoop Dreams (1994)                                       4.566768
Star Wars: Episode IV - A New Hope (1977)                4.565560
Big Sleep, The (1946)                                    4.562987
Monty Python and the Holy Grail (1975)                   4.542010
Name: 9, dtype: float64

In [44]:
# Convert the predicted scores into a DataFrame and display it.
pd.DataFrame(data=recomm_movies.values,
             index=recomm_movies.index,
             columns=['pred_score'])

Unnamed: 0_level_0,pred_score
title,Unnamed: 1_level_1
"Three Billboards Outside Ebbing, Missouri (2017)",4.772504
"Lord of the Rings: The Return of the King, The (2003)",4.687062
Bad Boy Bubby (1993),4.673322
Man Bites Dog (C'est arrivé près de chez vous) (1992),4.658136
"General, The (1926)",4.615529
Star Wars: Episode V - The Empire Strikes Back (1980),4.584653
Hoop Dreams (1994),4.566768
Star Wars: Episode IV - A New Hope (1977),4.56556
"Big Sleep, The (1946)",4.562987
Monty Python and the Holy Grail (1975),4.54201


In [45]:
# Keep the recommendations as a one-column DataFrame (pred_score) for export.
recomm_movies=pd.DataFrame(data=recomm_movies.values,
                          index=recomm_movies.index,
                          columns=['pred_score'])

In [46]:
# Save the recommendations to recomm_movies.csv in the current working directory.
recomm_movies.to_csv('./recomm_movies.csv')

In [None]:
#통찰(insight)
#MF(행렬분해)기반의 잠재요인 협업필터링으로 영화를 추천한 결과
#주로 SF 계열의 어두운 분위기 영화 추천되었음 >> SF 계열의 어두운 분위기 영화를 좋아하는것으로 추정됨