### 2021_10_29_4

### 아이템 기반 협업 필터링 기법을 활용한 영화 추천 시스템 만들기
* 영화 리뷰 데이터를 이용한 추천 시스템 구현
* 총 리뷰 데이터 : 약 1000만건, 영화 정보 : 10681편

In [61]:
import pandas as pd
import numpy as np

### 첫번째 데이터 불러오기
* df : 10만개 리뷰 정보

In [62]:
columns = ["user_id", "item_id", "rating", "timestamp"]
rating = pd.read_csv("../DATA/ml-10M100K/ratings.dat", sep = "::", names = columns)
print(rating.shape)
print(rating.head(3))

  return func(*args, **kwargs)


(10000054, 4)
   user_id  item_id  rating  timestamp
0        1      122     5.0  838985046
1        1      185     5.0  838983525
2        1      231     5.0  838983392


### 두번째 데이터 불러오기
* u.item
    * item_id : 영화 정보
    * movie_title : 영화 제목
    * release date : 출시일
    * video release date : 비디오 출시일
    * IMDb URL : IMDb URL 정보
    * unknown, .. : 기타 장르 정보

In [63]:
# 장르 분야
columns = ['item_id', 'movie title', "unknown"]
movies = pd.read_csv("../DATA/ml-10M100K/movies.dat", sep = "::",
                    names = columns, encoding = "latin-1")

print("movies shape : ", movies.shape)
print(movies.head(3))

movies shape :  (10681, 3)
   item_id              movie title  \
0        1         Toy Story (1995)   
1        2           Jumanji (1995)   
2        3  Grumpier Old Men (1995)   

                                       unknown  
0  Adventure|Animation|Children|Comedy|Fantasy  
1                   Adventure|Children|Fantasy  
2                               Comedy|Romance  


  return func(*args, **kwargs)


### 두 데이터 병합
* df : 10만개 리뷰 정보
* movies : 영화 정보 및 장르 정보

In [7]:
# 영화 리뷰 정보의 영화 개수 : 1682편
print("영화 리뷰 정보의 영화 개수 :", len(rating.item_id.unique()))     
# 영화 정보의 영화 개수 : 1682편
print("영화 정보의 영화 개수 :", len(movies.item_id.unique())) 

영화 리뷰 정보의 영화 개수 : 10677
영화 정보의 영화 개수 : 10681


In [8]:
movie_names = movies[["item_id", "movie title"]]

c_movies_data = pd.merge(rating, movie_names, on = "item_id")
print(c_movies_data.shape)
print(c_movies_data.head(3))

(10000054, 5)
   user_id  item_id  rating   timestamp       movie title
0        1      122     5.0   838985046  Boomerang (1992)
1      139      122     3.0   974302621  Boomerang (1992)
2      149      122     2.5  1112342322  Boomerang (1992)


### 하나의 영화를 선택하고 관련 유사한 영화 10편을 추천해 주는 시스템

In [9]:
rating_c = c_movies_data.pivot_table(values = "rating", 
                                    index = "user_id",
                                    columns = "movie title",
                                    fill_value = 0)
print(rating_c.shape)
rating_c.head(3)

(69878, 10676)


movie title,"""Great Performances"" Cats (1998)",'Round Midnight (1986),'Til There Was You (1997),"'burbs, The (1989)",'night Mother (1986),*batteries not included (1987),...All the Marbles (a.k.a. The California Dolls) (1981),...And God Created Woman (Et Dieu... crÃ©a la femme) (1956),...And God Spoke (1993),...And Justice for All (1979),...,Zorba the Greek (Alexis Zorbas) (1964),"Zorro, the Gay Blade (1981)",Zulu (1964),Zus & Zo (2001),[Rec] (2007),eXistenZ (1999),ffolks (a.k.a. North Sea Hijack) (1980),loudQUIETloud: A Film About the Pixies (2006),xXx: State of the Union (2005),"Ãge d'or, L' (1930)"
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


### 아이템 기반 협업 필터링을 위해 행열 바꾸기

In [10]:
X = rating_c.T
print(X.shape)

(10676, 69878)


In [11]:
X.head(3)

user_id,1,2,3,4,5,6,7,8,9,10,...,71558,71559,71560,71561,71562,71563,71564,71565,71566,71567
movie title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
"""Great Performances"" Cats (1998)",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
'Round Midnight (1986),0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
'Til There Was You (1997),0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


### 차원 축소
* SVD
* 사이킷런을 활용하여 SVD를 할 수 있다.
* truncated SVD를 사용하여 차원 축소한다.

In [12]:
from sklearn.decomposition import TruncatedSVD

SVD = TruncatedSVD(n_components = 12, random_state = 5)
resultant_matrix = SVD.fit_transform(X) 

print("X shape : {}".format(X.shape))
print("resultant_matrix shape :", resultant_matrix.shape)

X shape : (10676, 69878)
resultant_matrix shape : (10676, 12)


### Correlation Pearson
* 피어슨 상관계수 , 코사인 유사성과 같은 다양한 유사성 측정 지표를 사용할 수 있다.
* 피어슨 상관계수를 이용하여 상관 행렬을 만들어봄.

In [13]:
### correlation matrix
corr_mat = np.corrcoef(resultant_matrix)
print(corr_mat.shape)
corr_mat

(10676, 10676)


array([[1.        , 0.91104477, 0.43040745, ..., 0.58091296, 0.51191829,
        0.86042399],
       [0.91104477, 1.        , 0.28779129, ..., 0.7131907 , 0.4264758 ,
        0.9175588 ],
       [0.43040745, 0.28779129, 1.        , ..., 0.05476156, 0.23606898,
        0.11309555],
       ...,
       [0.58091296, 0.7131907 , 0.05476156, ..., 1.        , 0.45256165,
        0.8176457 ],
       [0.51191829, 0.4264758 , 0.23606898, ..., 0.45256165, 1.        ,
        0.43937709],
       [0.86042399, 0.9175588 , 0.11309555, ..., 0.8176457 , 0.43937709,
        1.        ]])

### 유사 영화를 찾기
### 'night Mother (1986) 관련 10개의 영화 추천

In [52]:
rating_c.columns.get_loc("'night Mother (1986)")

4

In [53]:
col_idx = rating_c.columns.get_loc("'night Mother (1986)")
corr_specific = corr_mat[col_idx]    # Star Wars (1977)의 위치 행 획득
print(corr_specific.shape) 

(10676,)


In [54]:
result = pd.DataFrame({'corr_specific':corr_specific, 'Movies': rating_c.columns})
print(result.shape)
result.sort_values('corr_specific', ascending=False).head(10)

(10676, 2)


Unnamed: 0,corr_specific,Movies
4,1.0,'night Mother (1986)
4250,0.988687,Heartburn (1986)
2215,0.983593,Crimes of the Heart (1986)
253,0.978932,Agnes of God (1985)
3474,0.972053,For Roseanna (Roseanna's Grave) (1997)
9434,0.96802,Tex (1982)
5931,0.965983,Madame Sousatzka (1988)
3537,0.964287,Frances (1982)
4654,0.963117,"I Love You, Don't Touch Me! (1998)"
2172,0.961164,Cousin Bette (1998)


### 101 Dalmatians (1996) 관련 15개의 영화 추천

In [21]:
rating_c.columns.get_loc("101 Dalmatians (1996)")

22

In [22]:
col_idx = rating_c.columns.get_loc("101 Dalmatians (1996)")
corr_specific = corr_mat[col_idx] # 101 Dalmatians (1996)의 행 획득
print(corr_specific.shape)

result = pd.DataFrame({"corr_specific" : corr_specific, 
                       "Movies" : rating_c.columns})
print(result.shape)
result.sort_values("corr_specific", ascending = False).head(15)

(10676,)
(10676, 2)


Unnamed: 0,corr_specific,Movies
22,1.0,101 Dalmatians (1996)
6151,0.975378,Matilda (1996)
4598,0.966167,"Hunchback of Notre Dame, The (1996)"
10653,0.941702,Zeus and Roxanne (1997)
5139,0.938503,Jungle2Jungle (a.k.a. Jungle 2 Jungle) (1997)
8858,0.938176,Space Jam (1996)
6561,0.937814,Muppet Treasure Island (1996)
1280,0.933891,Bogus (1996)
3362,0.933605,First Kid (1996)
3369,0.925414,"First Wives Club, The (1996)"


### 아이템 기반 협업 필터링 방식 - 코사인 유사도를 활용해 보기

In [24]:
from sklearn.metrics.pairwise import cosine_similarity

In [25]:
cosine_mat = cosine_similarity(resultant_matrix)
print(cosine_mat.shape)
corr_mat

(10676, 10676)


array([[1.        , 0.91104477, 0.43040745, ..., 0.58091296, 0.51191829,
        0.86042399],
       [0.91104477, 1.        , 0.28779129, ..., 0.7131907 , 0.4264758 ,
        0.9175588 ],
       [0.43040745, 0.28779129, 1.        , ..., 0.05476156, 0.23606898,
        0.11309555],
       ...,
       [0.58091296, 0.7131907 , 0.05476156, ..., 1.        , 0.45256165,
        0.8176457 ],
       [0.51191829, 0.4264758 , 0.23606898, ..., 0.45256165, 1.        ,
        0.43937709],
       [0.86042399, 0.9175588 , 0.11309555, ..., 0.8176457 , 0.43937709,
        1.        ]])

In [26]:
col_idx = rating_c.columns.get_loc('101 Dalmatians (1996)')
cosine_spec = cosine_mat[col_idx]

result = pd.DataFrame( {'cosine_sim':cosine_spec, 'Movies':rating_c.columns })
result.sort_values('cosine_sim', ascending=False).head(15)

Unnamed: 0,cosine_sim,Movies
22,1.0,101 Dalmatians (1996)
6151,0.977458,Matilda (1996)
4598,0.959141,"Hunchback of Notre Dame, The (1996)"
8858,0.942675,Space Jam (1996)
3362,0.93484,First Kid (1996)
6561,0.932172,Muppet Treasure Island (1996)
3369,0.929352,"First Wives Club, The (1996)"
10653,0.928715,Zeus and Roxanne (1997)
279,0.927311,Alaska (1996)
4187,0.925863,Harriet the Spy (1996)
