# 1. Data

In [1]:
import pandas as pd
import numpy as np

# Path template for the dataset files; "{}" is replaced with a CSV file name.
dir_path = "datas/movielens_small/{}"

# Read the ratings file first and the movies file second, in that order.
rating_data, movie_data = (
    pd.read_csv(dir_path.format(csv_name))
    for csv_name in ("ratings.csv", "movies.csv")
)

## 1. ratings : user의 영화 평가정보가 담긴 데이터

In [2]:
# Report the dimensions of the ratings table, then preview its first five rows.
crow = len(rating_data.index)
ccol = len(rating_data.columns)
print("[count] row : {}, column : {}".format(crow, ccol))
rating_data.head(n=5)

[count] row : 100004, column : 4


Unnamed: 0,userId,movieId,rating,timestamp
0,1,31,2.5,1260759144
1,1,1029,3.0,1260759179
2,1,1061,3.0,1260759182
3,1,1129,2.0,1260759185
4,1,1172,4.0,1260759205


#### 추가정보 : 평점 데이터는 한 명의 사용자가 여러 개의 영화에 평점을 남길 수 있는 구조이기 때문에 행의 개수가 사용자의 수를 나타내지 않는다.

In [3]:
# Each row is one (user, movie) rating, so count the distinct user ids instead of rows.
unique_users = rating_data['userId'].unique()
print("user count : {}".format(unique_users.size))

user count : 671


## 2. movies : 영화 정보 데이터

In [4]:
# Report the dimensions of the movies table, then preview its first five rows.
movie_shape = movie_data.shape
crow, ccol = movie_shape[0], movie_shape[1]
print("[count] row : {}, column : {}".format(*movie_shape))
movie_data.head(n=5)

[count] row : 9125, column : 3


Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


# 2. Data Preprocessing

## 1. Needed Column Extraction

In [5]:
# Remove the 'timestamp' column by rebinding the name instead of mutating in place.
# errors='ignore' keeps this cell safe to re-run: the previous in-place drop raised
# KeyError on a second execution because the column had already been removed.
rating_data = rating_data.drop(columns='timestamp', errors='ignore')

# Report the new dimensions and preview the first five rows.
crow, ccol = rating_data.shape
print("[count] row : {}, column : {}".format(crow, ccol))
rating_data.head(5)

[count] row : 100004, column : 3


Unnamed: 0,userId,movieId,rating
0,1,31,2.5
1,1,1029,3.0
2,1,1061,3.0
3,1,1129,2.0
4,1,1172,4.0


## 2. Merge ratings - movies 
- 2개의 데이터는 movie 데이터의 movieId(1) - 사용자의 평점 데이터의 movieId(N) 칼럼으로 1:N 구조로 형성되어 있다.

In [6]:
# Attach each movie's title and genres to every rating row via the shared
# 'movieId' key (inner join, which is also pandas' default).
usr_movie_rating = rating_data.merge(movie_data, how="inner", on="movieId")

# Report the merged table's dimensions and preview its first rows.
merged_shape = usr_movie_rating.shape
crow, ccol = merged_shape
print("[count] row : {}, column : {}".format(*merged_shape))
usr_movie_rating.head()

[count] row : 100004, column : 5


Unnamed: 0,userId,movieId,rating,title,genres
0,1,31,2.5,Dangerous Minds (1995),Drama
1,7,31,3.0,Dangerous Minds (1995),Drama
2,31,31,4.0,Dangerous Minds (1995),Drama
3,32,31,4.0,Dangerous Minds (1995),Drama
4,36,31,3.0,Dangerous Minds (1995),Drama


## 3. Generate pivot table
- Item-Based CF 에는 각 사용자들의 모든 영화에 대한 평점 점수 데이터가 행렬로 구성되어 있어야 한다. 
    - 평점이 없는 영화의 경우에는 NaN, 0 처리
- 이 경우 데이터를 다음 2가지 방식으로 구성할 수 있다.
1. movie_usr_rating : 영화 - 사용자 피봇 테이블 (index : 영화, column : 사용자)
2. usr_movie_rating : 사용자 - 영화 피봇 테이블 (index : 사용자, column : 영화)

In [7]:
# Case 1: movie x user pivot table (rows: movie titles, columns: user ids).
# Cells hold the rating; pairs without a rating come out as NaN.
movie_usr_rating = usr_movie_rating.pivot_table(
    values='rating',
    index='title',
    columns='userId',
)

# Report the pivot table's dimensions and preview its first rows.
crow = len(movie_usr_rating.index)
ccol = len(movie_usr_rating.columns)
print("[count] row : {}, column : {}".format(crow, ccol))
movie_usr_rating.head()

[count] row : 9064, column : 671


userId,1,2,3,4,5,6,7,8,9,10,...,662,663,664,665,666,667,668,669,670,671
title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
"""Great Performances"" Cats (1998)",,,,,,,,,,,...,,,,,,,,,,
$9.99 (2008),,,,,,,,,,,...,,,,,,,,,,
'Hellboy': The Seeds of Creation (2004),,,,,,,,,,,...,,,,,,,,,,
'Neath the Arizona Skies (1934),,,,,,,,,,,...,,,,,,,,,,
'Round Midnight (1986),,,,,,,,,,,...,,,,,,,,,,


In [8]:
# Case 2: user x movie pivot table (rows: user ids, columns: movie titles).
# The pivot is built from a fresh ratings-movies merge rather than from
# `usr_movie_rating` itself. This cell rebinds that name to the pivoted frame, so
# the previous `usr_movie_rating = usr_movie_rating.pivot_table(...)` failed when
# the cell was run a second time (the pivoted frame has no 'rating' column).
usr_movie_rating = (
    pd.merge(rating_data, movie_data, on="movieId")
    .pivot_table('rating', index='userId', columns='title')
)
usr_movie_rating.head()

title,"""Great Performances"" Cats (1998)",$9.99 (2008),'Hellboy': The Seeds of Creation (2004),'Neath the Arizona Skies (1934),'Round Midnight (1986),'Salem's Lot (2004),'Til There Was You (1997),"'burbs, The (1989)",'night Mother (1986),(500) Days of Summer (2009),...,Zulu (1964),Zulu (2013),[REC] (2007),eXistenZ (1999),loudQUIETloud: A Film About the Pixies (2006),xXx (2002),xXx: State of the Union (2005),¡Three Amigos! (1986),À nous la liberté (Freedom for Us) (1931),İtirazım Var (2014)
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,,,,,,,,,,,...,,,,,,,,,,
2,,,,,,,,,,,...,,,,,,,,,,
3,,,,,,,,,,,...,,,,,,,,,,
4,,,,,,,,,,,...,,,,,,,,,,
5,,,,,,,,,,,...,,,,,,,,,,


In [9]:
# Peek at one randomly chosen movie title from the pivot table's index.
# A fixed random_state makes the pick reproducible across kernel restarts.
movie_usr_rating.sample(n=1, random_state=0).index

Index(['Birthday Girl (2001)'], dtype='object', name='title')

#### 이때, 본문에서는 아이템 기반으로 진행하기 때문에 index가 영화인 "영화-사용자 피봇 테이블"을 사용한다.

In [10]:
# Replace missing ratings (NaN) with 0 so every movie row is a fully numeric vector.
movie_usr_rating = movie_usr_rating.fillna(value=0)
movie_usr_rating.head()

userId,1,2,3,4,5,6,7,8,9,10,...,662,663,664,665,666,667,668,669,670,671
title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
"""Great Performances"" Cats (1998)",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
$9.99 (2008),0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
'Hellboy': The Seeds of Creation (2004),0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
'Neath the Arizona Skies (1934),0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
'Round Midnight (1986),0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


# 3. Item-Based Collaborative Filtering

## 1. Similarity value extraction

In [11]:
from sklearn.metrics.pairwise import cosine_similarity as cos

# Cosine similarity between every pair of rows of the movie x user table,
# i.e. between the rating vectors of every pair of movies.
sim_rate = cos(X=movie_usr_rating, Y=movie_usr_rating)

# Report the similarity matrix's dimensions and print its first five rows.
sim_shape = sim_rate.shape
crow, ccol = sim_shape
print("[count] row : {}, column : {}".format(*sim_shape))
print(sim_rate[:5])

[count] row : 9064, column : 9064
[[1.         0.         0.         ... 0.         0.         0.        ]
 [0.         1.         0.         ... 0.05821787 0.         0.        ]
 [0.         0.         1.         ... 0.         0.         0.        ]
 [0.16439899 0.         0.         ... 0.         0.         0.        ]
 [0.02039118 0.         0.         ... 0.         0.         0.        ]]


## 2. Similarity Data Frame
- 영화 - 사용자 구조의 피봇테이블을 코사인 유사도 평가를 진행했다는 것은 각 영화에 대한 사용자 평점 벡터(구조)에 대한 유사도 평가를 진행했다는 의미와 같다.
#### 즉, 각 영화가 받은 사용자 평점 벡터끼리의 유사도를 계산한 것이다.

In [12]:
# Label both axes of the similarity matrix with the movie titles so that
# similarities can be looked up by title.
movie_titles = movie_usr_rating.index
sim_rate_df = pd.DataFrame(data=sim_rate, index=movie_titles, columns=movie_titles)
sim_rate_df.head()

title,"""Great Performances"" Cats (1998)",$9.99 (2008),'Hellboy': The Seeds of Creation (2004),'Neath the Arizona Skies (1934),'Round Midnight (1986),'Salem's Lot (2004),'Til There Was You (1997),"'burbs, The (1989)",'night Mother (1986),(500) Days of Summer (2009),...,Zulu (1964),Zulu (2013),[REC] (2007),eXistenZ (1999),loudQUIETloud: A Film About the Pixies (2006),xXx (2002),xXx: State of the Union (2005),¡Three Amigos! (1986),À nous la liberté (Freedom for Us) (1931),İtirazım Var (2014)
title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
"""Great Performances"" Cats (1998)",1.0,0.0,0.0,0.164399,0.020391,0.0,0.014046,0.0,0.0,0.003166,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
$9.99 (2008),0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.079474,0.0,0.15633,...,0.0,0.0,0.0,0.0,0.0,0.013899,0.0,0.058218,0.0,0.0
'Hellboy': The Seeds of Creation (2004),0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.217357,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
'Neath the Arizona Skies (1934),0.164399,0.0,0.0,1.0,0.124035,0.0,0.085436,0.0,0.0,0.019259,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
'Round Midnight (1986),0.020391,0.0,0.0,0.124035,1.0,0.0,0.010597,0.143786,0.0,0.136163,...,0.0,0.0,0.0,0.121567,0.0,0.0,0.0,0.0,0.0,0.0


## 3. Use

In [13]:
# Here `title` is a movie title.
# This is item-to-item recommendation: given one movie, return the movies closest to it.
def recommended_movie(title, top_n=5):
    """Return the `top_n` movies most similar to `title`.

    Reads the `title` column of the module-level `sim_rate_df` similarity table.

    Parameters
    ----------
    title : str
        Movie title; must be a column label of `sim_rate_df`.
    top_n : int, default 5
        Number of similar movies to return.

    Returns
    -------
    pandas.Series
        Similarity scores indexed by movie title, sorted in descending order,
        never containing `title` itself.

    Raises
    ------
    KeyError
        If `title` is not present in `sim_rate_df`.
    """
    # Remove the queried movie by label. Skipping "the first sorted row" is not
    # reliable: another movie tied at similarity 1.0 can sort ahead of it, which
    # would leave the queried movie inside its own recommendations.
    scores = sim_rate_df[title].drop(labels=title)
    return scores.sort_values(ascending=False).head(top_n)

In [14]:
# Show the movies most similar to Toy Story.
recommended_movie(title="Toy Story (1995)")

title
Toy Story 2 (1999)                           0.594710
Star Wars: Episode IV - A New Hope (1977)    0.576188
Forrest Gump (1994)                          0.564534
Independence Day (a.k.a. ID4) (1996)         0.562946
Groundhog Day (1993)                         0.548023
Name: Toy Story (1995), dtype: float64

## 4. 의미

In [15]:
title = "Toy Story (1995)"

# Similarity of every other movie to `title`. The movie itself is removed by
# label instead of assuming it sorts into first place, since a tie at 1.0 could
# displace it.
other_scores = sim_rate_df[title].drop(labels=title)

# top data: the most similar movie
top_data = other_scores.idxmax()

# bottom data: the least similar movie (the first one in index order on ties)
bottom_data = other_scores.idxmin()

print("추천 상위 영화 : {} / 추천 하위 영화 : {}".format(top_data, bottom_data))

추천 상위 영화 : Toy Story 2 (1999) / 추천 하위 영화 : "Great Performances" Cats (1998)


In [16]:
# Question: did the users who rated the target movie also rate the recommended
# movie? And what about the movie that is not recommended?
title_col = movie_data['title']
target_id = movie_data.loc[title_col == title, 'movieId'].values[0]
top_id = movie_data.loc[title_col == top_data, 'movieId'].values[0]
bottom_id = movie_data.loc[title_col == bottom_data, 'movieId'].values[0]
print(target_id, top_id, bottom_id)

rated_movie_ids = rating_data['movieId']

# Users who rated the target movie
target_list = rating_data.loc[rated_movie_ids == target_id, 'userId'].values

# Users who rated the recommended movie
top_list = rating_data.loc[rated_movie_ids == top_id, 'userId'].values

# Users who rated the movie that is not recommended
bottom_list = rating_data.loc[rated_movie_ids == bottom_id, 'userId'].values

1 3114 51372


In [17]:
# For each comparison movie, keep only its raters who also rated the target movie.
top_mask = np.isin(top_list, target_list)
bottom_mask = np.isin(bottom_list, target_list)
top_result = top_list[top_mask]
bottom_result = bottom_list[bottom_mask]

# Report how many raters each comparison movie shares with the target movie.
top_overlap = top_result.size
bottom_overlap = bottom_result.size
print("추천영화 '{}'를 관람한 사용자 중에서 '{}'도 관람했던 사용자 수 : {}".format(top_data, title, top_overlap))
print("추천되지 않는 영화 '{}'를 관람한 사용자 중에서 '{}'도 관람했던 사용자 수 : {}".format(bottom_data, title, bottom_overlap))

추천영화 'Toy Story 2 (1999)'를 관람한 사용자 중에서 'Toy Story (1995)'도 관람했던 사용자 수 : 101
추천되지 않는 영화 '"Great Performances" Cats (1998)'를 관람한 사용자 중에서 'Toy Story (1995)'도 관람했던 사용자 수 : 0
