# 1. 기본 CF 알고리즘

In [16]:
# 관련 라이브러리 import
import os
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split

In [17]:
# 데이터 읽어오기(user, item, data)

u_cols = ['user_id','age','sex','occupation', 'zip_code'] # 사용자아이디, 나이, 성별, 직업, 우편번호
users = pd.read_csv('data/u.user', 
                    sep='|', 
                    names=u_cols, 
                    encoding = 'latin-1')
users = users.set_index('user_id')


i_cols = ['movie_id','title', 'release date', 'video release date',
          'IMDB URL', 'unknown', 'Action', 'Adventure', 'Animation',
          'Children\'s','Comedy','Crime', 'Documentary','Drama', 
          'Fantasy','Film-Noir','Horror','Musical','Mystery','Romance',
          'Sci-Fi','Thriller','War','Western']
movies = pd.read_csv('data/u.item', 
                     sep='|', 
                     names=i_cols,
                     encoding = 'latin-1')
movies = movies.set_index('movie_id')


r_cols = ['user_id','movie_id','rating','timestamp']
ratings = pd.read_csv('data/u.data', 
                     sep='\t', 
                     names=r_cols,
                     encoding = 'latin-1')

In [18]:
# 정확도(RMSE)를 계산하는 함수
def RMSE(y_true, y_pred) :
    return np.sqrt(np.mean((np.array(y_true) - np.array(y_pred))**2))

In [19]:
# 데이터 셋 만들기
x = ratings.copy()
y = ratings['user_id']

x_train, x_test, y_train, y_test = train_test_split(x, y,
                                                    test_size=0.25,
                                                    stratify=y) # 계층화 추출
# rating full matrix 만들기
rating_matrix = x_train.pivot(index='user_id', columns='movie_id', values='rating')
rating_matrix






movie_id,1,2,3,4,5,6,7,8,9,10,...,1672,1673,1674,1676,1677,1678,1679,1680,1681,1682
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,5.0,3.0,,,3.0,5.0,4.0,,5.0,3.0,...,,,,,,,,,,
2,4.0,,,,,,,,,2.0,...,,,,,,,,,,
3,,,,,,,,,,,...,,,,,,,,,,
4,,,,,,,,,,,...,,,,,,,,,,
5,4.0,,,,,,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
939,,,,,,,,,,,...,,,,,,,,,,
940,,,,2.0,,,4.0,5.0,,,...,,,,,,,,,,
941,,,,,,,4.0,,,,...,,,,,,,,,,
942,,,,,,,,,,,...,,,,,,,,,,


In [20]:
# 모델별 RMSE를 계산하는 함수
def score(model):
    id_pairs = zip(x_test['user_id'], x_test['movie_id']) # 튜플 형태로 만들어줌
    y_pred = np.array([model(user, movie) for (user, movie) in id_pairs])
    y_true = np.array(x_test['rating'])

    return RMSE(y_true, y_pred)

In [21]:
# 코사인 유사도 계산
from sklearn.metrics.pairwise import cosine_similarity
matrix_dummy = rating_matrix.copy().fillna(0)
user_similarity = cosine_similarity(matrix_dummy, matrix_dummy)

user_similarity


user_similarity = pd.DataFrame(user_similarity, 
                               index=rating_matrix.index, 
                               columns=rating_matrix.index)

user_similarity



user_id,1,2,3,4,5,6,7,8,9,10,...,934,935,936,937,938,939,940,941,942,943
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,1.000000,0.171083,0.054619,0.076060,0.300220,0.352463,0.333078,0.242558,0.104790,0.246290,...,0.273779,0.128720,0.221339,0.160129,0.160642,0.090626,0.266020,0.130620,0.136572,0.310929
2,0.171083,1.000000,0.105289,0.241558,0.085756,0.195299,0.081814,0.091880,0.139125,0.150971,...,0.162501,0.254854,0.298298,0.343632,0.309788,0.216930,0.199892,0.073075,0.093525,0.077609
3,0.054619,0.105289,1.000000,0.243770,0.026596,0.067916,0.060450,0.076347,0.053770,0.015402,...,0.039719,0.053726,0.144296,0.066771,0.102558,0.014341,0.127822,0.103262,0.121880,0.032886
4,0.076060,0.241558,0.243770,1.000000,0.043104,0.062315,0.066815,0.154791,0.075124,0.043036,...,0.070292,0.050041,0.132820,0.253212,0.193860,0.040073,0.120758,0.192361,0.067293,0.021004
5,0.300220,0.085756,0.026596,0.043104,1.000000,0.136533,0.266783,0.257463,0.038523,0.169377,...,0.211844,0.080832,0.123709,0.066973,0.105461,0.029796,0.188388,0.108506,0.107730,0.223490
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
939,0.090626,0.216930,0.014341,0.040073,0.029796,0.080669,0.077842,0.123462,0.000000,0.060242,...,0.044328,0.387426,0.114298,0.206357,0.363568,1.000000,0.023514,0.137404,0.035171,0.059891
940,0.266020,0.199892,0.127822,0.120758,0.188388,0.335069,0.269593,0.139736,0.134007,0.268311,...,0.232279,0.118922,0.147124,0.125120,0.137493,0.023514,1.000000,0.095943,0.156981,0.213370
941,0.130620,0.073075,0.103262,0.192361,0.108506,0.122812,0.046500,0.136091,0.053179,0.063072,...,0.027280,0.210329,0.191881,0.303657,0.188320,0.137404,0.095943,1.000000,0.065354,0.075503
942,0.136572,0.093525,0.121880,0.067293,0.107730,0.250205,0.214248,0.155072,0.081674,0.140731,...,0.123596,0.022669,0.065999,0.054333,0.053454,0.035171,0.156981,0.065354,1.000000,0.118189


In [25]:
# 영화(movie_id)의 가중평균 rating을 계산하는 함수 : CF_simple

# CF_simple
def CF_simple(user_id, movie_id) :
    if movie_id in rating_matrix.columns:
        sim_scores = user_similarity[user_id].copy()
        movie_ratings = rating_matrix[movie_id].copy()
        none_rating_idx = movie_ratings[movie_ratings.isnull()].index
        movie_ratings = movie_ratings.dropna()
        sim_scores = sim_scores.drop(none_rating_idx)
        mean_rating = np.dot(sim_scores, movie_ratings)/sim_scores.sum()
    else :
        mean_rating=3.0
    return mean_rating

score(CF_simple)




# 정확도 계산
#score(CF_simple)

1.0189101347361194

In [None]:
user_similarity

In [26]:
import os
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split

# 데이터 읽어오기(user, item, data)

u_cols = ['user_id','age','sex','occupation', 'zip_code'] # 사용자아이디, 나이, 성별, 직업, 우편번호
users = pd.read_csv('data/u.user', 
                    sep='|', 
                    names=u_cols, 
                    encoding = 'latin-1')
users = users.set_index('user_id')


i_cols = ['movie_id','title', 'release date', 'video release date',
          'IMDB URL', 'unknown', 'Action', 'Adventure', 'Animation',
          'Children\'s','Comedy','Crime', 'Documentary','Drama', 
          'Fantasy','Film-Noir','Horror','Musical','Mystery','Romance',
          'Sci-Fi','Thriller','War','Western']
movies = pd.read_csv('data/u.item', 
                     sep='|', 
                     names=i_cols,
                     encoding = 'latin-1')
movies = movies.set_index('movie_id')

'u.data'
r_cols = ['user_id','movie_id','rating','timestamp']
ratings = pd.read_csv('data/u.data', 
                     sep='\t', 
                     names=r_cols,
                     encoding = 'latin-1')

In [27]:
# 정확도(RMSE)를 계산하는 함수
def RMSE(y_true, y_pred) :
    return np.sqrt(np.mean((np.array(y_true) - np.array(y_pred))**2))

In [37]:
# 데이터 셋 만들기
x = ratings.copy()
y = ratings['user_id']


x_train, x_test, y_train, y_test = train_test_split(x, y,
                                                    test_size=0.25,
                                                    random_state = 0) #계층화 추출
rating_matrix = x_train.pivot(index='user_id', 
                              columns='movie_id',
                              values='rating')

In [38]:
# 코사인 유사도 계산
from sklearn.metrics.pairwise import cosine_similarity
matrix_dummy = rating_matrix.copy().fillna(0)
user_similarity = cosine_similarity(matrix_dummy, matrix_dummy)


user_similarity = pd.DataFrame(user_similarity,
                               index = rating_matrix.index,
                               columns = rating_matrix.index)
print(user_similarity)

user_id       1         2         3         4         5         6         7    \
user_id                                                                         
1        1.000000  0.140887  0.036981  0.069143  0.306388  0.302852  0.339506   
2        0.140887  1.000000  0.097246  0.181611  0.048586  0.160920  0.092840   
3        0.036981  0.097246  1.000000  0.211377  0.030859  0.038523  0.055345   
4        0.069143  0.181611  0.211377  1.000000  0.039629  0.056862  0.077217   
5        0.306388  0.048586  0.030859  0.039629  1.000000  0.187204  0.304344   
...           ...       ...       ...       ...       ...       ...       ...   
939      0.080099  0.114472  0.021812  0.000000  0.040104  0.106194  0.074990   
940      0.263132  0.208187  0.120308  0.194381  0.233320  0.224265  0.299872   
941      0.089671  0.079739  0.091164  0.115340  0.083141  0.085547  0.044938   
942      0.141146  0.149775  0.126480  0.145046  0.098550  0.186713  0.179860   
943      0.331283  0.118609 

In [39]:
#Neighbor size를 정해서 예측치를 계산하는 함수
def CF_knn(user_id, movie_id, neighbor_size=0):
    if movie_id in rating_matrix.columns:
        sim_scores = user_similarity[user_id].copy()
        movie_ratings = rating_matrix[movie_id].copy()
        none_rating_idx = movie_ratings[movie_ratings.isnull()].index
        movie_ratings = movie_ratings.drop(none_rating_idx)
        sim_scores = sim_scores.drop(none_rating_idx)
        if neighbor_size == 0 :
            mean_rating = np.dot(sim_scores, movie_ratings)/sim_scores.sum()
        else :
            if len(sim_scores) > 0 :
                neighbor_size = min(neighbor_size, len(sim_scores))
                sim_scores = np.array(sim_scores)
                movie_ratings = np.array(movie_ratings)
                user_idx = np.argsort(sim_scores)
                sim_scores = sim_scores[user_idx][-neighbor_size:]
                movie_ratings = movie_ratings[user_idx][-neighbor_size:]
                mean_rating = np.dot(sim_scores, movie_ratings)/sim_scores.sum()
            else :
                mean_rating = 3.0   
    
    else : 
        mean_rating=3.0
    return mean_rating




score(CF_knn)


1.0251251692635397

In [40]:
# 기존 score 함수에 neighbor_size 인자값 추가

# 모델별 RMSE를 계산하는 함수
def score(model, neighbor_size=0):
    id_pairs = zip(x_test['user_id'], x_test['movie_id']) # 튜플 형태로 만들어줌
    y_pred = np.array([model(user, movie, neighbor_size) for (user, movie) in id_pairs])
    y_true = np.array(x_test['rating'])

    return RMSE(y_true, y_pred)



# 정확도 계산
score(CF_knn, neighbor_size=30)


1.0200919120771832

# 3. 최적의 이웃크기 설정

In [35]:
# neighbor size가 10,20,30,40,50,60인 경우에 대해서 RMSE를 계산하고 이를 출력

for neighbor_size in [37,38,39,40,41,42,43,44] :
    print('neighbor_size = %d : RMSE = %.4f' %(neighbor_size, score(CF_knn, neighbor_size)))





neighbor_size = 37 : RMSE = 1.0181
neighbor_size = 38 : RMSE = 1.0180
neighbor_size = 39 : RMSE = 1.0178
neighbor_size = 40 : RMSE = 1.0179
neighbor_size = 41 : RMSE = 1.0178
neighbor_size = 42 : RMSE = 1.0178
neighbor_size = 43 : RMSE = 1.0180
neighbor_size = 44 : RMSE = 1.0181
