# 제2장 기본적인 추천 시스템
## 2.1 데이터 읽기

In [1]:
import pandas as pd

u_cols = ['user_id','age','sex','occupation','zip_code']
users = pd.read_csv('data/u.user',sep='|',names=u_cols,encoding='latin-1') # 구분자 |, 열이름 지정
users = users.set_index('user_id') #users의 user_id를 인덱스로 설정
users.head()

Unnamed: 0_level_0,age,sex,occupation,zip_code
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1,24,M,technician,85711
2,53,F,other,94043
3,23,M,writer,32067
4,24,M,technician,43537
5,33,F,other,15213


In [2]:
i_cols = ['movie_id', 'title', 'release date', 'video release date', 'IMDB URL', 
          'unknown', 'Action', 'Adventure', 'Animation', 'Children\'s', 'Comedy', 
          'Crime', 'Documentary', 'Drama', 'Fantasy', 'Film-Noir', 'Horror', 
          'Musical', 'Mystery', 'Romance', 'Sci-Fi', 'Thriller', 'War', 'Western']
movies = pd.read_csv('data/u.item',sep='|',names=i_cols,encoding='latin-1')
movies=movies.set_index('movie_id')
movies.head()

Unnamed: 0_level_0,title,release date,video release date,IMDB URL,unknown,Action,Adventure,Animation,Children's,Comedy,...,Fantasy,Film-Noir,Horror,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western
movie_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,Toy Story (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Toy%20Story%2...,0,0,0,1,1,1,...,0,0,0,0,0,0,0,0,0,0
2,GoldenEye (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?GoldenEye%20(...,0,1,1,0,0,0,...,0,0,0,0,0,0,0,1,0,0
3,Four Rooms (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Four%20Rooms%...,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
4,Get Shorty (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Get%20Shorty%...,0,1,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
5,Copycat (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Copycat%20(1995),0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0


In [3]:
r_cols = ['user_id', 'movie_id', 'rating', 'timestamp']
ratings = pd.read_csv('data/u.data',sep='\t',names=r_cols,encoding='latin-1')
ratings = ratings.set_index('user_id')
ratings.head()

Unnamed: 0_level_0,movie_id,rating,timestamp
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
196,242,3,881250949
186,302,3,891717742
22,377,1,878887116
244,51,2,880606923
166,346,1,886397596


## 2.2 인기제품 방식

개별 사용자에 대한 정보가 없는 경우나, 정확도에 관계없이 가장 간단한 추천을 제공해야 하는 상황에서 사용하는 방법  
모든 사람에게 똑같은 추천을 하는 방 

In [4]:
# Best-seller 추천
def recom_movie(n_items):
    movie_sort = movie_mean.sort_values(ascending=False)[:n_items]
    # sort_values(ascending=False) 데이터 값을 내림차순으로 정렬
    recom_movies = movies.loc[movie_sort.index] # loc는 인덱싱 할 때 사용(인덱스 이름,열 이름,행 이름)
    recommendations = recom_movies['title']
    return recommendations

# 한줄로 정#
def recom_movie2(n_items):
    return movies.loc[movie_mean.sort_values(ascending=False)[:n_items].index]['title']

movie_mean = ratings.groupby(['movie_id'])['rating'].mean()
recom_movie(5)

movie_id
814                         Great Day in Harlem, A (1994)
1599                        Someone Else's America (1995)
1201           Marlene Dietrich: Shadow and Light (1996) 
1122                       They Made Me a Criminal (1939)
1653    Entertaining Angels: The Dorothy Day Story (1996)
Name: title, dtype: object

## 2.3 추천 시스템의 정확도 측정

RMSE(Root Mean Squared Error)
- 예측값과 해당 실제값의 차이(error)의 제곱(squared)의 평균(mean)의 제곱근(root)

- $RMSE = \sqrt{\frac{1}{n} \sum_{i=1}^{n} (y_i - \hat{y}_i)^2}$
    - $y_i$: 실제값 (Ground Truth)  
    - $\hat{y}_i$: 예측값 (Predicted Value)  
    - $n$: 데이터 포인트의 개수  



In [8]:
import numpy as np
# 정확도 계산
def RMSE(y_true,y_pred):
    return np.sqrt(np.mean((np.array(y_true)-np.array(y_pred))**2))

rmse = []
for user in set(ratings.index): # set()은 중복을 제거하고 고유한 값을 가지는 데이터 구조를 생성 (user를 뽑기위해)
    y_true = ratings.loc[user]['rating']
    y_pred = movie_mean[ratings.loc[user]['movie_id']] # user가 평가한 영화들의 평균
    accuracy = RMSE(y_true,y_pred)
    rmse.append(accuracy)
print(np.mean(rmse))

0.996007224010567


## 2.4 사용자 집단별 추천

사용자들을 비슷한 특성의 사람들을 묶은 소집단으로 만든 다음에 각 집단의 평점평균을 바탕으로 추천하는 것

In [3]:
import pandas as pd
import numpy as np

# 데이터 읽어 오기 
u_cols = ['user_id', 'age', 'sex', 'occupation', 'zip_code']
users = pd.read_csv('data/u.user', sep='|', names=u_cols, encoding='latin-1')
i_cols = ['movie_id', 'title', 'release date', 'video release date', 'IMDB URL', 'unknown', 
          'Action', 'Adventure', 'Animation', 'Children\'s', 'Comedy', 'Crime', 'Documentary', 
          'Drama', 'Fantasy', 'Film-Noir', 'Horror', 'Musical', 'Mystery', 'Romance', 'Sci-Fi', 
          'Thriller', 'War', 'Western']
movies = pd.read_csv('data/u.item', sep='|', names=i_cols, encoding='latin-1')
r_cols = ['user_id', 'movie_id', 'rating', 'timestamp']
ratings = pd.read_csv('data/u.data', sep='\t', names=r_cols, encoding='latin-1')

# timestamp 제거
ratings=ratings.drop('timestamp',axis = 1)

# movies ID와 title 빼고 다른 데이터 제거
movies = movies[['movie_id','title']]

In [4]:
# train, test set 분
from sklearn.model_selection import train_test_split
x = ratings.copy()
y = ratings['user_id']
x_train,x_test,y_train,y_test = train_test_split(x,y,test_size=0.25,stratify=y) 
# stratify는 특정 클래스의 비율(분포)을 유지 None으로 하면 무작위

In [5]:
# 정확도(RMSE)를 계산하는 함수
def RMSE(y_true,y_pred):
    return np.sqrt(np.mean((np.array(y_true)-np.array(y_pred))**2))

# 모델별 RMSE를 계산하는 함수
def score(model):
    id_pairs = zip(x_test['user_id'],x_test['movie_id'])
    y_pred = np.array([model(user,movie) for (user,movie) in id_pairs])
    y_true = np.array(x_test['rating'])
    return RMSE(y_true, y_pred)

# train 데이터로 Full matrix 구하기
rating_matrix = x_train.pivot(index = 'user_id',columns = 'movie_id',values = 'rating')

In [6]:
# 전체 평균으로 예측치를 계산하는 기본 모델
def best_seller(user_id,movie_id):
    try :
        rating = train_mean[movie_id]
    except:
        rating = 3.0
    return rating

train_mean = x_train.groupby(['movie_id'])['rating'].mean()
score(best_seller)

1.0237545326930837

In [7]:
# Full matrix를 사용자 데이터와 merge
merged_ratings = pd.merge(x_train,users) # merge()는 공통의 이름을 가진 key가 있으면 그것을 기준으로 합친다
users = users.set_index('user_id')

# gender별 평점평균 계산
g_mean = merged_ratings[['movie_id','sex','rating']].groupby(['movie_id','sex'])['rating'].mean()

### Gender 기준 추천

In [11]:
# gender별 평균을 예측치로 돌려주는 함수
def cf_gender(user_id,movie_id):
    if movie_id in rating_matrix:
        gender = users.loc[user_id]['sex']
        if gender in g_mean[movie_id]:
            gender_rating = g_mean[movie_id][gender]
        else :
            gender_rating = 3.0
    else :
        gender_rating = 3.0
    return gender_rating

score(cf_gender)

1.0333541676225388

### 연습문제

2-1 위의 성별('gender')추천 코드를 수정해서 사용자의 직업('occupation')에 따라 집단을 나누어서 예측값을 구하는 함수를 만들고 이의 정확도를 계산하는 코드를 작성하시오

movie_id  sex
1         F      3.765432
          M      3.892308
2         F      3.500000
          M      3.117021
3         F      2.538462
Name: rating, dtype: float64

In [16]:
# 직업별 평점평균 계산
o_mean = merged_ratings[['movie_id', 'occupation', 'rating']].groupby(['movie_id', 'occupation'])['rating'].mean()

def cf_occupation(user_id,movie_id):
    if movie_id in rating_matrix:
        occupation = users.loc[user_id]['occupation']
        if occupation in o_mean[movie_id]:
            occupation_rating = o_mean[movie_id][occupation]
        else :
            occupation_rating = 3.0
    else :
        occupation_rating = 3.0
    return occupation_rating

score(cf_occupation)

1.1116366168470408

2-2 사용자의 성별과 직업을 동시에 고려한 집단을 나누어서 예측값을 구하는 함수를ㄹ 만들고 이의 정확도을 계산하는 코드를 작성하시오

In [32]:
g_o_mean = merged_ratings[['movie_id','occupation','sex','rating']].groupby(['movie_id','sex','occupation'])['rating'].mean()
g_o_mean.head(20)
users.loc[1:15]

Unnamed: 0_level_0,age,sex,occupation,zip_code
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1,24,M,technician,85711
2,53,F,other,94043
3,23,M,writer,32067
4,24,M,technician,43537
5,33,F,other,15213
6,42,M,executive,98101
7,57,M,administrator,91344
8,36,M,administrator,5201
9,29,M,student,1002
10,53,M,lawyer,90703


In [37]:
def cf_occ_sex(user_id, movie_id):
    if movie_id in rating_matrix:
        gender = users.loc[user_id]['sex']
        occupation = users.loc[user_id]['occupation']
        if (gender,occupation) in g_o_mean[movie_id]:
            occ_sex_rating = g_o_mean[movie_id][gender][occupation]
        elif occupation in g_o_mean[movie_id]:
            occ_sex_rating = o_mean[movie_id][occupation]
        elif gender in g_o_mean[movie_id]:
            occ_sex_rating = g_mean[movie_id][gender]
        else :
            occ_sex_rating = 3.0
    else:
        occ_sex_rating = 3.0
    return occ_sex_rating

score(cf_occ_sex)

1.1329243917692464

## 2.5 내용 기반 필터링 추천(CB)

1. 아이템 간의 유사도를 어떤 방법으로 계산할 것인가  
    **tf-idf** : 아이템 간의 유사도를 측정하는 지표. 
    - 어떤 단어가 해당 문서(아이템)에 얼마나 자주 등장(term frequency), 다른 문서에 비해서 상대적으로 얼마나 자주 등장하는가(inverse document frequency)를 계산
    - tf와 idf를 조합해서 각 문서(아이템)에 등장하는 모든 단어의 가중치(중요도)를 계산
    - 이들 단어의 가중치가 문서 간에 얼마나 유사한지를 *cosin similarity* 지표를 사용해서 계산

<br>

2. 사용자가 좋게 평가한 아이템을 몇 개나 선정할 것인가
3. 만일 2에서 다수의 아이템을 선정한 경우에 각 아이템과 유사도가 높은 아이템이 별도로 존재할 것인데 이를 어떻게 결합할 것인가


In [1]:
import kagglehub

# Download latest version
path = kagglehub.dataset_download("rounakbanik/the-movies-dataset")

print("Path to dataset files:", path)

Path to dataset files: /Users/igwanhyeong/.cache/kagglehub/datasets/rounakbanik/the-movies-dataset/versions/7


In [4]:
import pandas as pd

# Data 읽기
movies = pd.read_csv('data/movies_metadata.csv', encoding='latin-1', low_memory=False)
movies = movies[['id', 'title', 'overview']]
movies.head(10)

Unnamed: 0,id,title,overview
0,862,Toy Story,"Led by Woody, Andy's toys live happily in his ..."
1,8844,Jumanji,When siblings Judy and Peter discover an encha...
2,15602,Grumpier Old Men,A family wedding reignites the ancient feud be...
3,31357,Waiting to Exhale,"Cheated on, mistreated and stepped on, the wom..."
4,11862,Father of the Bride Part II,Just when George Banks has recovered from his ...
5,949,Heat,"Obsessive master thief, Neil McCauley leads a ..."
6,11860,Sabrina,An ugly duckling having undergone a remarkable...
7,45325,Tom and Huck,"A mischievous young boy, Tom Sawyer, witnesses..."
8,9091,Sudden Death,International action superstar Jean Claude Van...
9,710,GoldenEye,James Bond must unmask the mysterious head of ...


In [5]:
# 데이터 전처리
movies = movies.dropna()
movies['overview']=movies['overview'].fillna(' ')
print(len(movies))

# 불용어를 english로 지정하고 tf-idf 계산
from sklearn.feature_extraction.text import TfidfVectorizer
tfidf = TfidfVectorizer(stop_words='english') # 불용어 english로 지정 (a, the, and 등등)
tfidf_matrix = tfidf.fit_transform(movies['overview'])

# Cosin 유사도 계산
from sklearn.metrics.pairwise import cosine_similarity
cosine_sim = cosine_similarity(tfidf_matrix,tfidf_matrix)
cosine_sim = pd.DataFrame(cosine_sim,index=movies.index,columns=movies.index)

44506


In [8]:
# index-title을 뒤집는다
indices = pd.Series(movies.index,index=movies['title'])

# 영화제목을 받아서 추천 영화를 돌려주는 함수
def content_recommender(title,n_of_recomm):
    # title에서 영화 index 받아오기
    idx = indices[title]
    # 주어진 영화와 다른 영화의 similarity를 가져온다
    sim_scores = cosine_sim[idx]
    # similarity 기준으로 정렬하고 n_of_recomm만큼 가져오기(자기 자신 제외)
    sim_scores = sim_scores.sort_values(ascending=False)[1:n_of_recomm+1]
    # 영화 title 반환
    return movies.loc[sim_scores.index]['title']

# 추천받기
print(content_recommender('The Lion King',5))
print(content_recommender('The Dark Knight Rises',10))

34682    How the Lion Cub and the Turtle Sang a Song
9353                               The Lion King 1Â½
9115                  The Lion King 2: Simba's Pride
42829                                           Prey
25654                                 Fearless Fagan
Name: title, dtype: object
12481                                      The Dark Knight
150                                         Batman Forever
1328                                        Batman Returns
15511                           Batman: Under the Red Hood
585                                                 Batman
21194    Batman Unmasked: The Psychology of the Dark Kn...
9230                    Batman Beyond: Return of the Joker
18035                                     Batman: Year One
19792              Batman: The Dark Knight Returns, Part 1
3095                          Batman: Mask of the Phantasm
Name: title, dtype: object
