### 2021_10_29_4

### 아이템 기반 협업 필터링 기법을 활용한 영화 추천 시스템 만들기
* 영화 리뷰 데이터를 이용한 추천 시스템 구현

In [1]:
import pandas as pd
import numpy as np

### 첫번째 데이터 불러오기
* df : 10만개 리뷰 정보

In [2]:
columns = ["user_id", "item_id", "rating", "timestamp"]
df = pd.read_csv("../DATA/ml-100k/u.data", sep = "\t", names = columns)
print(df.shape)
print(df.head(3))

(100000, 4)
   user_id  item_id  rating  timestamp
0      196      242       3  881250949
1      186      302       3  891717742
2       22      377       1  878887116


### 두번째 데이터 불러오기
* u.item
    * item_id : 영화 정보
    * movie_title : 영화 제목
    * release date : 출시일
    * video release date : 비디오 출시일
    * IMDb URL : IMDb URL 정보
    * unknown, .. : 기타 장르 정보

In [3]:
# 장르 분야
columns = ['item_id', 'movie title', 'release date', 'video release date', 'IMDb URL', 
           'unknown', 'Action', 'Adventure', 'Animation', 'Childrens', 
           'Comedy', 'Crime', 'Documentary', 'Drama', 'Fantasy', 
           'Film-Noir', 'Horror', 'Musical', 'Mystery', 'Romance', 
           'Sci-Fi', 'Thriller', 'War', 'Western']
movies = pd.read_csv("../DATA/ml-100k/u.item", sep = "|",
                    names = columns, encoding = "latin-1")

print("movies shape : ", movies.shape)
print(movies.head(3))

movies shape :  (1682, 24)
   item_id        movie title release date  video release date  \
0        1   Toy Story (1995)  01-Jan-1995                 NaN   
1        2   GoldenEye (1995)  01-Jan-1995                 NaN   
2        3  Four Rooms (1995)  01-Jan-1995                 NaN   

                                            IMDb URL  unknown  Action  \
0  http://us.imdb.com/M/title-exact?Toy%20Story%2...        0       0   
1  http://us.imdb.com/M/title-exact?GoldenEye%20(...        0       1   
2  http://us.imdb.com/M/title-exact?Four%20Rooms%...        0       0   

   Adventure  Animation  Childrens  ...  Fantasy  Film-Noir  Horror  Musical  \
0          0          1          1  ...        0          0       0        0   
1          1          0          0  ...        0          0       0        0   
2          0          0          0  ...        0          0       0        0   

   Mystery  Romance  Sci-Fi  Thriller  War  Western  
0        0        0       0         0   

### 두 데이터 병합
* df : 10만개 리뷰 정보
* movies : 영화 정보 및 장르 정보

In [8]:
# 영화 리뷰 정보의 영화 개수 : 1682편
print("영화 리뷰 정보의 영화 개수 :", len(df.item_id.unique()))     
# 영화 정보의 영화 개수 : 1682편
print("영화 정보의 영화 개수 :", len(movies.item_id.unique())) 

영화 리뷰 정보의 영화 개수 : 1682
영화 정보의 영화 개수 : 1682


In [7]:
movie_names = movies[["item_id", "movie title"]]

c_movies_data = pd.merge(df, movie_names, on = "item_id")
print(c_movies_data.shape)
print(c_movies_data.head(3))

(100000, 5)
   user_id  item_id  rating  timestamp   movie title
0      196      242       3  881250949  Kolya (1996)
1       63      242       3  875747190  Kolya (1996)
2      226      242       5  883888671  Kolya (1996)


### 하나의 영화를 선택하고 관련 유사한 영화 10편을 추천해 주는 시스템

In [12]:
rating_c = c_movies_data.pivot_table(values = "rating", 
                                    index = "user_id",
                                    columns = "movie title",
                                    fill_value = 0)
print(rating_c.shape)
rating_c.head(3)

(943, 1664)


movie title,'Til There Was You (1997),1-900 (1994),101 Dalmatians (1996),12 Angry Men (1957),187 (1997),2 Days in the Valley (1996),"20,000 Leagues Under the Sea (1954)",2001: A Space Odyssey (1968),3 Ninjas: High Noon At Mega Mountain (1998),"39 Steps, The (1935)",...,Yankee Zulu (1994),Year of the Horse (1997),You So Crazy (1994),Young Frankenstein (1974),Young Guns (1988),Young Guns II (1990),"Young Poisoner's Handbook, The (1995)",Zeus and Roxanne (1997),unknown,Á köldum klaka (Cold Fever) (1994)
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,0,0,2,5,0,0,3,4,0,0,...,0,0,0,5,3,0,0,0,4,0
2,0,0,0,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,2,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


### 아이템 기반 협업 필터링을 위해 행열 바꾸기

In [13]:
X = rating_c.T
print(X.shape)

(1664, 943)


In [14]:
X.head(3)

user_id,1,2,3,4,5,6,7,8,9,10,...,934,935,936,937,938,939,940,941,942,943
movie title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
'Til There Was You (1997),0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1-900 (1994),0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
101 Dalmatians (1996),2.0,0.0,0.0,0.0,2.0,0.0,0.0,0.0,0.0,0.0,...,2.0,0.0,0.0,2.0,4.0,0.0,0.0,0.0,0.0,0.0


### 차원 축소
* SVD
* 사이킷런을 활용하여 SVD를 할 수 있다.
* truncated SVD를 사용하여 차원 축소한다.

In [20]:
from sklearn.decomposition import TruncatedSVD

SVD = TruncatedSVD(n_components = 12, random_state = 5)
resultant_matrix = SVD.fit_transform(X) 

print("X shape : {}".format(X.shape))
print("resultant_matrix shape :", resultant_matrix.shape)

X shape : (1664, 943)
resultant_matrix shape : (1664, 12)


### Correlation Pearson
* 피어슨 상관계수 , 코사인 유사성과 같은 다양한 유사성 측정 지표를 사용할 수 있다.
* 피어슨 상관계수를 이용하여 상관 행렬을 만들어봄.

In [21]:
### correlation matrix
corr_mat = np.corrcoef(resultant_matrix)
print(corr_mat.shape)
corr_mat

(1664, 1664)


array([[ 1.        , -0.11573577,  0.51362284, ...,  0.38310045,
         0.20193733,  0.5065142 ],
       [-0.11573577,  1.        ,  0.05820808, ...,  0.15805829,
         0.51795357,  0.27104818],
       [ 0.51362284,  0.05820808,  1.        , ...,  0.76575655,
         0.43824619,  0.19507139],
       ...,
       [ 0.38310045,  0.15805829,  0.76575655, ...,  1.        ,
         0.18043708,  0.12115972],
       [ 0.20193733,  0.51795357,  0.43824619, ...,  0.18043708,
         1.        ,  0.20126072],
       [ 0.5065142 ,  0.27104818,  0.19507139, ...,  0.12115972,
         0.20126072,  1.        ]])

### 유사 영화를 찾기
### Similar Movies to Star Wars (1977)

In [23]:
rating_c.columns.get_loc("Star Wars (1977)")

1398

In [24]:
col_idx = rating_c.columns.get_loc("Star Wars (1977)")
corr_specific = corr_mat[col_idx]    # Star Wars (1977)의 위치 행 획득
print(corr_specific.shape) 

(1664,)


In [25]:
result = pd.DataFrame({'corr_specific':corr_specific, 'Movies': rating_c.columns})
print(result.shape)
result.head()

(1664, 2)


Unnamed: 0,corr_specific,Movies
0,0.357238,'Til There Was You (1997)
1,0.421507,1-900 (1994)
2,0.593815,101 Dalmatians (1996)
3,0.722361,12 Angry Men (1957)
4,0.325221,187 (1997)


### 10개의 영화 추천

In [26]:
result.sort_values('corr_specific', ascending=False).head(10)

Unnamed: 0,corr_specific,Movies
1398,1.0,Star Wars (1977)
1234,0.988052,Return of the Jedi (1983)
1460,0.942655,Terminator 2: Judgment Day (1991)
1523,0.933978,Toy Story (1995)
1461,0.931701,"Terminator, The (1984)"
1205,0.925185,Raiders of the Lost Ark (1981)
456,0.923562,"Empire Strikes Back, The (1980)"
570,0.915965,"Fugitive, The (1993)"
414,0.914299,Die Hard (1988)
44,0.892894,Aliens (1986)


### 101 Dalmatians (1996) 관련 15개의 영화 추천

In [27]:
rating_c.columns.get_loc("101 Dalmatians (1996)")

2

In [32]:
col_idx = rating_c.columns.get_loc("101 Dalmatians (1996)")
corr_specific = corr_mat[col_idx] # 101 Dalmatians (1996)의 행 획득
print(corr_specific.shape)

result = pd.DataFrame({"corr_specific" : corr_specific, 
                       "Movies" : rating_c.columns})
print(result.shape)
result.sort_values("corr_specific", ascending = False).head(15)

(1664,)
(1664, 2)


Unnamed: 0,corr_specific,Movies
2,1.0,101 Dalmatians (1996)
693,0.944203,Homeward Bound II: Lost in San Francisco (1996)
713,0.93253,"Hunchback of Notre Dame, The (1996)"
659,0.92215,Harriet the Spy (1996)
46,0.910804,All Dogs Go to Heaven 2 (1996)
805,0.903955,Kazaam (1996)
23,0.899279,"Adventures of Pinocchio, The (1996)"
435,0.899266,Dragonheart (1996)
764,0.890192,Jack (1996)
505,0.881306,Father of the Bride Part II (1995)


In [31]:
result

Unnamed: 0,corr_specific,Movies
2,1.0,101 Dalmatians (1996)
693,0.944203,Homeward Bound II: Lost in San Francisco (1996)
713,0.93253,"Hunchback of Notre Dame, The (1996)"
659,0.92215,Harriet the Spy (1996)
46,0.910804,All Dogs Go to Heaven 2 (1996)
805,0.903955,Kazaam (1996)
23,0.899279,"Adventures of Pinocchio, The (1996)"
435,0.899266,Dragonheart (1996)
764,0.890192,Jack (1996)
505,0.881306,Father of the Bride Part II (1995)


### 아이템 기반 협업 필터링 방식 - 코사인 유사도를 활용해 보기

In [33]:
from sklearn.metrics.pairwise import cosine_similarity

In [36]:
cosine_mat = cosine_similarity(resultant_matrix)
print(cosine_mat.shape)
corr_mat

(1664, 1664)


array([[ 1.        , -0.11573577,  0.51362284, ...,  0.38310045,
         0.20193733,  0.5065142 ],
       [-0.11573577,  1.        ,  0.05820808, ...,  0.15805829,
         0.51795357,  0.27104818],
       [ 0.51362284,  0.05820808,  1.        , ...,  0.76575655,
         0.43824619,  0.19507139],
       ...,
       [ 0.38310045,  0.15805829,  0.76575655, ...,  1.        ,
         0.18043708,  0.12115972],
       [ 0.20193733,  0.51795357,  0.43824619, ...,  0.18043708,
         1.        ,  0.20126072],
       [ 0.5065142 ,  0.27104818,  0.19507139, ...,  0.12115972,
         0.20126072,  1.        ]])

In [37]:
col_idx = rating_c.columns.get_loc('101 Dalmatians (1996)')
cosine_spec = cosine_mat[col_idx]

result = pd.DataFrame( {'cosine_sim':cosine_spec, 'Movies':rating_c.columns })
result.sort_values('cosine_sim', ascending=False).head(15)

Unnamed: 0,cosine_sim,Movies
2,1.0,101 Dalmatians (1996)
693,0.94606,Homeward Bound II: Lost in San Francisco (1996)
713,0.943881,"Hunchback of Notre Dame, The (1996)"
46,0.921926,All Dogs Go to Heaven 2 (1996)
23,0.916551,"Adventures of Pinocchio, The (1996)"
659,0.909108,Harriet the Spy (1996)
505,0.892762,Father of the Bride Part II (1995)
1547,0.892041,Twister (1996)
764,0.887689,Jack (1996)
532,0.884186,Flipper (1996)
