# 추천 시스템
- 사용자 집단별 추천
    - 성별 

In [64]:
import pandas as pd
import numpy as np

## 전처리

In [2]:
# Users: load the user table from users.csv and preview the first three rows
users = pd.read_csv('users.csv')
users[:3]

Unnamed: 0,user_id,age,gender,job,zip_code
0,1,24,M,technician,85711
1,2,53,F,other,94043
2,3,23,M,writer,32067


In [3]:
# Movie ratings: load the rating table from ratings.csv and preview the first three rows
ratings = pd.read_csv('ratings.csv')
ratings[:3]

Unnamed: 0,user_id,movie_id,rating,timestamp
0,1,55,5,875072688
1,1,203,4,878542231
2,1,183,5,875072262


In [4]:
# Movie information: load the movie table from movies.csv and preview the first three rows
movies = pd.read_csv('movies.csv')
movies[:3]

Unnamed: 0,movie_id,title,release date,imdb url,action,adventure,animation,children,comedy,crime,...,fantasy,film-noir,horror,musical,mystery,romance,sci-fi,thriller,war,western
0,1,Toy Story (1995),01-Jan-1995,http://us.imdb.com/M/title-exact?Toy%20Story%2...,0,0,1,1,1,0,...,0,0,0,0,0,0,0,0,0,0
1,2,GoldenEye (1995),01-Jan-1995,http://us.imdb.com/M/title-exact?GoldenEye%20(...,1,1,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
2,3,Four Rooms (1995),01-Jan-1995,http://us.imdb.com/M/title-exact?Four%20Rooms%...,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0


In [5]:
# Drop the timestamp column from ratings
del ratings['timestamp']

In [6]:
ratings.columns

Index(['user_id', 'movie_id', 'rating'], dtype='object')

In [7]:
# Keep only the movie_id and title columns of movies
movies = movies[['movie_id', 'title']]
movies[:3]

Unnamed: 0,movie_id,title
0,1,Toy Story (1995)
1,2,GoldenEye (1995)
2,3,Four Rooms (1995)


## 데이터 셋 분리
- Train data
    - 모델을 학습하기 위한 데이터 셋
    - 학습은 최적의 파라미터를 찾는 것
    - 학습을 위한 데이터
    <br><br>
- Test data
    - 모델의 '최종 성능'을 평가하기 위한 데이터 셋
    - 모델 학습에 관여하지 않음
    <br><br>
- Train data로 학습을 하고, Test data로 최종 성능 평가

In [8]:
# train, test set 분리
from sklearn.model_selection import train_test_split

In [9]:
# x : a copy of the full ratings table (the rows that get split into train/test)
# y : the user_id of every rating row
x= ratings.copy()
y= ratings.user_id

In [58]:
# x_train : training data
# x_test : data held out for measuring accuracy
# y_train, y_test : user IDs
# NOTE: random_state is not passed here, so every run produces a different split.
x_train, x_test, y_train, y_test = train_test_split(
    x, 
    y,
    test_size=0.25,
    stratify=y,
)
# x (features) : attributes / variables / input values
# y (target, label) : output values
# test_size : fraction of the data used as the test set
# random_state : integer random seed; fixing it makes the split identical on every run
# shuffle : shuffle the data before splitting (default = True)
# stratify : split so that the given labels keep the same proportions in both sets (avoids a biased split)

In [57]:
x_train.groupby('user_id').size()

user_id
1      208
2       47
3       43
4       17
5      127
      ... 
939     38
940     86
941     19
942     57
943    125
Length: 943, dtype: int64

In [56]:
ratings.groupby('user_id').size()

user_id
1      272
2       62
3       54
4       24
5      175
      ... 
939     49
940    107
941     22
942     79
943    168
Length: 943, dtype: int64

In [11]:
x.shape

(100000, 3)

In [12]:
x_train.shape

(75000, 3)

In [50]:
x_train

Unnamed: 0,user_id,movie_id,rating
29539,290,423,5
99012,933,187,4
8374,83,728,4
22211,224,751,3
56042,497,575,3
...,...,...,...
3643,25,177,3
78879,727,465,2
74050,669,127,5
23791,242,275,5


In [14]:
x_test.shape

(25000, 3)

In [15]:
y_train.shape

(75000,)

In [16]:
y_train

57314    505
48129    429
14064    140
32534    307
34018    314
        ... 
91798    869
59784    533
16601    174
59727    533
46318    413
Name: user_id, Length: 75000, dtype: int64

In [17]:
y_test.shape

(25000,)

## 정확도 (Accuracy)
- 10분 동안 줄넘기 횟수

이름 | 홍길동 | 박보검 | 이미자
- | - | - | -
예측 | 50 | 35 | 40
실제 | 60 | 20 | 45

- 오차(잔차) : 실제값 - 예측값 ($y - \hat{y}$)
    - 오차 : 10, -15, 5
- 오차(잔차) 합 : 양수 오차와 음수 오차가 서로 상쇄되어 합이 0에 가까워질 수 있음 (위 예시: 10 - 15 + 5 = 0) -> 상쇄를 막기 위해 오차를 제곱한 뒤 합을 구함
- 평균 제곱 오차(MSE : Mean Squared Error) : 오차 제곱의 평균
    - 오차를 제곱하기 때문에 이상치(극단값)의 영향이 더욱 커지는 단점이 있음
- 평균 제곱근 오차(RMSE : Root Mean Squared Error) : MSE(오차 제곱의 평균)의 제곱근
    - 0에 가까울수록 예측이 실제값에 가까운(훈련이 잘 된) 상태

In [61]:
## MSE: mean of the squared errors (10, -15, 5) from the jump-rope example
mse = sum(error ** 2 for error in (10, -15, 5)) / 3
mse

116.66666666666667

In [62]:
## RMSE: square root of the mse value computed in the previous cell
import math
math.sqrt(mse) # the original note reads this value as a poorly trained (underfitting) state

10.801234497346433

### RMSE 정의

In [65]:
## Define the function that computes the accuracy (RMSE)
def RMSE(y_true, y_pred):
    """Return the root mean square error between two equally long sequences.

    Both arguments are converted with ``np.array`` before subtracting, so
    lists, tuples and pandas Series are all accepted. The result is a plain
    Python float.
    """
    residuals = np.array(y_true) - np.array(y_pred)
    mean_squared_error = np.mean(residuals ** 2)
    return math.sqrt(mean_squared_error)

### 모델별 예측치의 정확도 계산

In [89]:
## Define the function that computes the RMSE of a model
## (measures the accuracy of each model's predictions)
def score(model):
    """Return the RMSE of ``model`` over every row of ``x_test``.

    ``model`` is called as ``model(user_id, movie_id)`` once per test row and
    its return value is used as the predicted rating. The shape of the
    prediction array is printed before the error is computed.
    """
    predictions = []
    for user, movie in zip(x_test['user_id'], x_test['movie_id']):
        predictions.append(model(user, movie))

    y_pred = np.array(predictions)
    print('y_pred shape >>', y_pred.shape)

    # RMSE takes the true ratings first, then the predictions.
    return RMSE(np.array(x_test['rating']), y_pred)

In [67]:
x_test[:2]

Unnamed: 0,user_id,movie_id,rating
42273,387,33,3
59841,533,566,4


In [70]:
# zip : pairs up the elements of the given iterables and yields them as tuples
# single-use : a zip object can only be iterated once
for z in zip([1, 3, 5], ['a', 'b', 'c']):
    print(z)

(1, 'a')
(3, 'b')
(5, 'c')


## 모델

In [93]:
# Mean rating of each movie, computed from the training rows only
train_mean = x_train.groupby('movie_id').rating.mean()

In [94]:
train_mean

movie_id
1       3.890490
2       3.313131
3       3.076923
4       3.562130
5       3.400000
          ...   
1678    1.000000
1679    3.000000
1680    2.000000
1681    3.000000
1682    3.000000
Name: rating, Length: 1643, dtype: float64

### best-seller 모델 : 예측

In [77]:
# Baseline model: use each movie's mean rating as the prediction
def best_seller(user_id, movie_id):
    """Predict a rating as the movie's mean rating in the training set.

    Args:
        user_id: Not used by this model; accepted so that the function can be
            called as ``model(user_id, movie_id)``.
        movie_id: Label looked up in ``train_mean``.

    Returns:
        ``train_mean[movie_id]`` when the movie has a training mean,
        otherwise the fallback rating 3.0.
    """
    try:
        rating = train_mean[movie_id]
    except KeyError:
        # Only a missing label is expected here; any other exception should
        # surface instead of being silently turned into the fallback rating.
        # The split is stratified by user_id, so some movie_ids end up with
        # no rows in the training data.
        print('movie_id 없음 >>', movie_id)
        rating = 3.0
    return rating

## best-seller 정확도 측정

In [95]:
score(best_seller) # 괜찮은 수준

movie_id 없음 >> 1494
movie_id 없음 >> 1586
movie_id 없음 >> 1641
movie_id 없음 >> 1671
movie_id 없음 >> 1486
movie_id 없음 >> 1460
movie_id 없음 >> 1656
movie_id 없음 >> 1366
movie_id 없음 >> 1506
movie_id 없음 >> 1339
movie_id 없음 >> 1497
movie_id 없음 >> 1398
movie_id 없음 >> 1506
movie_id 없음 >> 1520
movie_id 없음 >> 1447
movie_id 없음 >> 1506
movie_id 없음 >> 957
movie_id 없음 >> 1497
movie_id 없음 >> 1080
movie_id 없음 >> 1525
movie_id 없음 >> 1599
movie_id 없음 >> 1546
movie_id 없음 >> 1619
movie_id 없음 >> 1630
movie_id 없음 >> 1130
movie_id 없음 >> 957
movie_id 없음 >> 1637
movie_id 없음 >> 1507
movie_id 없음 >> 1498
movie_id 없음 >> 1235
movie_id 없음 >> 1398
movie_id 없음 >> 1574
movie_id 없음 >> 1647
movie_id 없음 >> 1649
movie_id 없음 >> 1614
movie_id 없음 >> 1668
movie_id 없음 >> 1340
movie_id 없음 >> 1606
movie_id 없음 >> 1653
movie_id 없음 >> 1579
movie_id 없음 >> 1673
movie_id 없음 >> 1613
movie_id 없음 >> 1656
movie_id 없음 >> 1373
movie_id 없음 >> 1080
movie_id 없음 >> 814
y_pred shape >> (25000,)


1.0238175298731347

## Gender 기준 추천 모델

In [99]:
# 영화별 성별별 평점 평균 계산
users[:2]

Unnamed: 0,user_id,age,gender,job,zip_code
0,1,24,M,technician,85711
1,2,53,F,other,94043


In [100]:
movies[:2]

Unnamed: 0,movie_id,title
0,1,Toy Story (1995)
1,2,GoldenEye (1995)


In [102]:
ratings[:2]

Unnamed: 0,user_id,movie_id,rating
0,1,55,5
1,1,203,4


In [111]:
# merge demo: two small frames that share the 'id' column
# ids 1 and 2 appear in both frames; 3 appears only in `one`, 5 only in `two`
one = pd.DataFrame({
    'id' : [1, 2, 3],
    'name' : ['홍길동', '이미자', '박보검']
})

two = pd.DataFrame({
    'id' : [1, 2, 5],
    'hire_date' : [2020, 2010, 2002]
})

one

Unnamed: 0,id,name
0,1,홍길동
1,2,이미자
2,3,박보검


In [105]:
two

Unnamed: 0,id,hire_date
0,1,2020
1,2,2010
2,3,2002


In [112]:
pd.merge(one, two)

Unnamed: 0,id,name,hire_date
0,1,홍길동,2020
1,2,이미자,2010


In [116]:
# Merge x_train with users (no key is passed; presumably joined on the shared user_id column)
merged_ratings = pd.merge(x_train, users)
merged_ratings

Unnamed: 0,user_id,movie_id,rating,age,gender,job,zip_code
0,928,268,5,21,M,student,55408
1,928,114,5,21,M,student,55408
2,928,328,3,21,M,student,55408
3,928,172,5,21,M,student,55408
4,928,191,5,21,M,student,55408
...,...,...,...,...,...,...,...
74995,895,748,3,31,F,librarian,32301
74996,895,13,5,31,F,librarian,32301
74997,895,301,4,31,F,librarian,32301
74998,895,151,5,31,F,librarian,32301


In [120]:
# Mean rating for every (movie_id, gender) pair in the merged training data
g_mean = merged_ratings.groupby(['movie_id', 'gender']).rating.mean()
g_mean

movie_id  gender
1         F         3.833333
          M         3.910506
2         F         3.529412
          M         3.268293
3         F         2.846154
                      ...   
1678      M         1.000000
1679      M         3.000000
1680      M         2.000000
1681      M         3.000000
1682      M         3.000000
Name: rating, Length: 3028, dtype: float64

In [121]:
x_train.shape

(75000, 3)

In [122]:
users.shape

(943, 5)

In [133]:
# Make user_id the index of users (modifies users in place) so rows can be selected by user_id label
users.set_index('user_id', inplace=True)
users

Unnamed: 0_level_0,age,gender,job,zip_code
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1,24,M,technician,85711
2,53,F,other,94043
3,23,M,writer,32067
4,24,M,technician,43537
5,33,F,other,15213
...,...,...,...,...
939,26,F,student,33319
940,32,M,administrator,02215
941,20,M,student,97229
942,48,F,librarian,78209


In [127]:
# Pivot table : a table whose values sit where a row label and a column label intersect

# Convert the training data into a full matrix
# full matrix : a 2-D array in which every row (user_id) and every column (movie_id) is present
rating_matrix = x_train.pivot(index='user_id', columns='movie_id', values='rating')
rating_matrix

movie_id,1,2,3,4,5,6,7,8,9,10,...,1672,1674,1675,1676,1677,1678,1679,1680,1681,1682
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,5.0,3.0,4.0,3.0,,5.0,4.0,1.0,,,...,,,,,,,,,,
2,4.0,,,,,,,,,2.0,...,,,,,,,,,,
3,,,,,,,,,,,...,,,,,,,,,,
4,,,,,,,,,,,...,,,,,,,,,,
5,4.0,3.0,,,,,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
939,,,,,,,,,,,...,,,,,,,,,,
940,,,,,,,4.0,5.0,3.0,,...,,,,,,,,,,
941,5.0,,,,,,4.0,,,,...,,,,,,,,,,
942,,,,,,,,,,,...,,,,,,,,,,


In [135]:
users

Unnamed: 0_level_0,age,gender,job,zip_code
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1,24,M,technician,85711
2,53,F,other,94043
3,23,M,writer,32067
4,24,M,technician,43537
5,33,F,other,15213
...,...,...,...,...
939,26,F,student,33319
940,32,M,administrator,02215
941,20,M,student,97229
942,48,F,librarian,78209


## 정확도 계산

In [126]:
## Define the function that computes the RMSE of a model
## (measures the accuracy of each model's predictions)
def score(model):
    """Return the RMSE of ``model`` over every row of ``x_test``.

    ``model`` is called as ``model(user_id, movie_id)`` once per test row and
    its return value is used as the predicted rating.
    """
    test_pairs = zip(x_test['user_id'], x_test['movie_id'])
    predicted = np.array([model(*pair) for pair in test_pairs])
    actual = np.array(x_test['rating'])

    # RMSE takes the true ratings first, then the predictions.
    return RMSE(actual, predicted)

In [130]:
# Gender-based recommendation model
# Returns the mean rating of the user's gender group for the movie as the prediction
def cf_gender(user_id, movie_id):
    """Predict a rating as the movie's mean rating among users of the same gender.

    Args:
        user_id: Index label of the user in ``users``.
        movie_id: Movie to predict; must be a column of ``rating_matrix``
            to have any training statistics.

    Returns:
        ``g_mean[movie_id][gender]`` when the movie is in ``rating_matrix``
        and has a mean for the user's gender, otherwise the fallback
        rating 3.0.
    """
    # `in` on a DataFrame tests the column labels, i.e. the movie_ids.
    if movie_id not in rating_matrix:
        return 3.0

    # One .loc[row, column] lookup instead of building the whole row first.
    gender = users.loc[user_id, 'gender']

    # Per-gender means of this movie, looked up once and reused.
    movie_means = g_mean[movie_id]
    if gender in movie_means:
        return movie_means[gender]

    # No rating from this gender group for this movie.
    return 3.0

In [131]:
score(cf_gender)

1.0340351804042918