In [1]:
import pandas as pd 
import numpy as np

# Read the three data tables with pandas' default CSV parsing.
users, movies, ratings = (
    pd.read_csv(path)
    for path in ('./data/users.dat', './data/movies.dat', './data/ratings.dat')
)

In [2]:
# Show the first row of each table to check which columns were loaded.
display(users.head(1), movies.head(1), ratings.head(1))

Unnamed: 0,user_id,age,sex,occupation,zip_code
0,1,24,M,technician,85711


Unnamed: 0,movie_id,title,release data,video release date,IMDB URL,unknown,Action,Adventure,Animation,Children's,...,Fantasy,Film-Noir,Horror,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western
0,1,Toy Story (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Toy%20Story%2...,0,0,0,1,1,...,0,0,0,0,0,0,0,0,0,0


Unnamed: 0,user_id,movie_id,rating,timestamp
0,196,242,3,881250949


In [3]:
def RMSE(y_true, y_pred):
    """Return the root-mean-square error between two equal-length sequences.

    Args:
        y_true: observed values (anything `np.array` accepts).
        y_pred: predicted values, same length as `y_true`.

    Returns:
        sqrt(mean((y_true - y_pred) ** 2)).
    """
    errors = np.array(y_true) - np.array(y_pred)
    squared_errors = errors ** 2
    return np.sqrt(squared_errors.mean())

def score(model):
    """Return the RMSE of `model` on the held-out ratings.

    `model` is called as model(user_id, movie_id) once per row of the
    module-level `X_test` frame; the predictions are compared with the
    frame's 'rating' column.
    """
    predictions = []
    for user, movie in zip(X_test['user_id'], X_test['movie_id']):
        predictions.append(model(user, movie))
    predicted = np.array(predictions)
    actual = np.array(X_test['rating'])
    return RMSE(actual, predicted)

In [4]:
# Split the ratings into train and test sets.
# Stratifying on user_id keeps roughly the same 75/25 proportion of every
# user's ratings in each set. random_state is fixed so that the split, and
# every RMSE computed from it, is reproducible on Restart & Run All.
from sklearn.model_selection import train_test_split
X = ratings.copy()
y = ratings['user_id']
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.25, stratify=y, random_state=42
)

In [5]:
# User x movie table built from the training rows only; pairs with no
# training rating become NaN.
rating_matrix = X_train.pivot(values='rating', index='user_id', columns='movie_id')

In [6]:
rating_matrix.head(3)

movie_id,1,2,3,4,5,6,7,8,9,10,...,1671,1672,1673,1674,1676,1677,1679,1680,1681,1682
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,,3.0,4.0,3.0,3.0,5.0,4.0,,5.0,,...,,,,,,,,,,
2,4.0,,,,,,,,,2.0,...,,,,,,,,,,
3,,,,,,,,,,,...,,,,,,,,,,


# 1. CF - Basic

In [7]:
from sklearn.metrics.pairwise import cosine_similarity
# Zero-filled copy of the rating table (unrated cells become 0), used as input
# for the similarity computations; fillna already returns a new frame.
matrix_dummy = rating_matrix.fillna(0)

def CF_simple(user_id, movie_id):
    """Predict a rating as the similarity-weighted mean over every rater of the movie.

    Reads the module-level `rating_matrix` (user x movie ratings, NaN where a
    user has not rated a movie) and `user_similarity` (user x user weights).

    Args:
        user_id: user to predict for; must be a column label of `user_similarity`.
        movie_id: movie whose rating is predicted.

    Returns:
        The weighted mean of the ratings given to `movie_id`, weighted by each
        rater's similarity to `user_id`. Falls back to 3.0 when `movie_id` is
        not a column of `rating_matrix`, or when the raters' weights sum to
        zero (the weighted mean is undefined there and would otherwise be NaN).

    Raises:
        KeyError: if `user_id` is not a column of `user_similarity`.
    """
    default_rating = 3.0
    if movie_id not in rating_matrix:
        return default_rating

    # Keep only the users who actually rated this movie, and their weights.
    movie_ratings = rating_matrix[movie_id].dropna()
    sim_scores = user_similarity[user_id].loc[movie_ratings.index]

    weight_sum = sim_scores.sum()
    if weight_sum == 0:
        # 0 / 0 would yield NaN, and a single NaN makes the whole RMSE NaN.
        return default_rating

    return np.dot(sim_scores, movie_ratings) / weight_sum

## 1-1. Cosine Similarity

In [8]:
# Pairwise cosine similarity between user rating vectors, labelled by user_id
# on both axes so it can be indexed as user_similarity[user_id].
user_similarity = pd.DataFrame(
    cosine_similarity(matrix_dummy, matrix_dummy),
    index=rating_matrix.index,
    columns=rating_matrix.index,
)

In [9]:
user_similarity

user_id,1,2,3,4,5,6,7,8,9,10,...,934,935,936,937,938,939,940,941,942,943
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,1.000000,0.155472,0.029953,0.048229,0.319510,0.317509,0.302881,0.178880,0.050243,0.263410,...,0.277396,0.102880,0.234792,0.111586,0.140139,0.109098,0.184561,0.063781,0.138621,0.309066
2,0.155472,1.000000,0.087895,0.115989,0.079474,0.218022,0.095117,0.091750,0.157889,0.138032,...,0.152933,0.226645,0.295963,0.285987,0.276337,0.164168,0.163701,0.131862,0.119131,0.141060
3,0.029953,0.087895,1.000000,0.301616,0.000000,0.050628,0.053202,0.071056,0.058100,0.046407,...,0.042770,0.039006,0.142344,0.058652,0.097027,0.019962,0.070692,0.087505,0.058147,0.000000
4,0.048229,0.115989,0.301616,1.000000,0.042636,0.039240,0.057958,0.144765,0.133165,0.064464,...,0.038727,0.049667,0.104723,0.122210,0.064158,0.000000,0.113161,0.140075,0.131272,0.058549
5,0.319510,0.079474,0.000000,0.042636,1.000000,0.168325,0.275268,0.210084,0.037524,0.146048,...,0.265183,0.066759,0.086486,0.055100,0.101145,0.082512,0.126100,0.111417,0.134791,0.230152
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
939,0.109098,0.164168,0.019962,0.000000,0.082512,0.083371,0.048053,0.070526,0.000000,0.045978,...,0.052639,0.336240,0.187435,0.161944,0.375890,1.000000,0.042099,0.102734,0.022462,0.070946
940,0.184561,0.163701,0.070692,0.113161,0.126100,0.204923,0.241766,0.133205,0.054264,0.245951,...,0.210468,0.073448,0.108008,0.122090,0.108725,0.042099,1.000000,0.099805,0.199808,0.151126
941,0.063781,0.131862,0.087505,0.140075,0.111417,0.070188,0.059523,0.183289,0.133256,0.047201,...,0.047852,0.174561,0.232987,0.229674,0.253297,0.102734,0.099805,1.000000,0.000000,0.000000
942,0.138621,0.119131,0.058147,0.131272,0.134791,0.249542,0.239040,0.131036,0.099698,0.149536,...,0.139699,0.000000,0.040784,0.023999,0.033520,0.022462,0.199808,0.000000,1.000000,0.167563


In [8]:
score(CF_simple)

1.0144316669472162

## 1-2. Pearson Correlation Coefficient

In [16]:
# Pearson correlation between users, computed on the zero-filled matrix (so an
# unrated movie counts as a rating of 0). Replaces the cosine-based
# `user_similarity`; unlike cosine on non-negative ratings, values can be negative.
user_similarity = matrix_dummy.transpose().corr(method="pearson")

In [18]:
user_similarity

user_id,1,2,3,4,5,6,7,8,9,10,...,934,935,936,937,938,939,940,941,942,943
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,1.000000,0.074367,-0.006028,0.034859,0.192396,0.207876,0.192667,0.227489,0.008460,0.182204,...,0.199129,0.065911,0.158488,0.103771,0.108851,0.076527,0.207330,0.112171,0.091264,0.232930
2,0.074367,1.000000,0.086285,0.198324,-0.004568,0.166105,0.035687,0.033144,0.155299,0.041892,...,0.093863,0.243800,0.260024,0.344691,0.210037,0.219712,0.183259,0.171769,0.048686,0.025006
3,-0.006028,0.086285,1.000000,0.188477,-0.008539,0.017726,0.008912,0.056490,0.009164,-0.012169,...,0.005004,0.039802,0.083191,0.043398,0.074022,0.015899,0.126065,0.038320,0.030749,-0.002068
4,0.034859,0.198324,0.188477,1.000000,-0.027163,0.038207,0.046847,0.065988,0.120893,0.035606,...,0.011308,0.034846,0.094933,0.165950,0.045315,0.025715,0.164395,0.131990,0.129134,0.003657
5,0.192396,-0.004568,-0.008539,-0.027163,1.000000,0.064443,0.193414,0.175568,0.012209,0.086562,...,0.169738,0.061172,0.005888,0.004757,0.069500,0.051846,0.153061,0.056293,0.014874,0.181491
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
939,0.076527,0.219712,0.015899,0.025715,0.051846,0.042062,0.016873,0.109328,-0.014695,-0.011679,...,0.050200,0.378691,0.146515,0.159594,0.324678,1.000000,0.065071,0.142163,-0.010433,0.048948
940,0.207330,0.183259,0.126065,0.164395,0.153061,0.208977,0.149417,0.108642,0.121965,0.162459,...,0.144852,0.097814,0.110687,0.129825,0.087797,0.065071,1.000000,0.109227,0.132393,0.093384
941,0.112171,0.171769,0.038320,0.131990,0.056293,0.100899,-0.016316,0.091620,0.182873,0.064755,...,-0.003920,0.141464,0.284623,0.046754,0.243257,0.142163,0.109227,1.000000,0.018671,0.057651
942,0.091264,0.048686,0.030749,0.129134,0.014874,0.199871,0.170241,0.088709,0.056386,0.174053,...,0.130603,0.023638,0.015052,0.041370,0.027808,-0.010433,0.132393,0.018671,1.000000,0.106404


In [19]:
score(CF_simple)

1.2357939891827803

# 2. CF - Neighborhood based

In [23]:
def score(model, neighbor_size=0):
    """Return the RMSE of a neighbourhood-based `model` on the held-out ratings.

    `model` is called as model(user_id, movie_id, neighbor_size) once per row
    of the module-level `X_test` frame; the predictions are compared with the
    frame's 'rating' column.
    """
    predictions = []
    for user, movie in zip(X_test['user_id'], X_test['movie_id']):
        predictions.append(model(user, movie, neighbor_size))
    predicted = np.array(predictions)
    actual = np.array(X_test['rating'])
    return RMSE(actual, predicted)

def cf_knn(user_id, movie_id, neighbor_size=0):
    """Predict a rating from the users most similar to `user_id` who rated the movie.

    Reads the module-level `rating_matrix` (user x movie ratings, NaN where
    unrated) and `user_similarity` (user x user weights).

    Args:
        user_id: user to predict for; must be a column label of `user_similarity`.
        movie_id: movie whose rating is predicted.
        neighbor_size: how many of the most similar raters to use. 0 means use
            every rater of the movie.

    Returns:
        The similarity-weighted mean rating over the selected raters. Falls
        back to 3.0 when `movie_id` is not a column of `rating_matrix`, when
        `neighbor_size` is non-zero and at most one user rated the movie, or
        when the selected weights sum to zero (the mean would otherwise be NaN).

    Raises:
        KeyError: if `user_id` is not a column of `user_similarity`.
    """
    default_rating = 3.0
    if movie_id not in rating_matrix:
        return default_rating

    # Keep only the users who actually rated this movie, and their weights.
    movie_ratings = rating_matrix[movie_id].dropna()
    sim_scores = user_similarity[user_id].loc[movie_ratings.index]

    if neighbor_size != 0:
        if len(sim_scores) <= 1:
            return default_rating
        # Cannot use more neighbours than there are raters.
        neighbor_size = min(neighbor_size, len(sim_scores))
        sim_scores = np.array(sim_scores)
        movie_ratings = np.array(movie_ratings)
        # argsort is ascending, so the most similar raters are at the tail.
        order = np.argsort(sim_scores)
        sim_scores = sim_scores[order][-neighbor_size:]
        movie_ratings = movie_ratings[order][-neighbor_size:]

    weight_sum = sim_scores.sum()
    if weight_sum == 0:
        # 0 / 0 would yield NaN, and a single NaN makes the whole RMSE NaN.
        return default_rating

    return np.dot(sim_scores, movie_ratings) / weight_sum

In [52]:
score(cf_knn, neighbor_size = 30)

1.0155257884242075

In [53]:
ratings.head()

Unnamed: 0,user_id,movie_id,rating,timestamp
0,196,242,3,881250949
1,186,302,3,891717742
2,22,377,1,878887116
3,244,51,2,880606923
4,166,346,1,886397596


In [24]:
# For actual recommendations there is no need for a train/test split, so the
# rating table (and the user similarity derived from it) uses the full data set.
rating_matrix = ratings.pivot_table(index='user_id', columns='movie_id', values='rating')

In [25]:
from sklearn.metrics.pairwise import cosine_similarity
# Rebuild the zero-filled matrix and the cosine user similarity from the
# current `rating_matrix`; both axes are labelled by user_id.
matrix_dummy = rating_matrix.fillna(0)
user_similarity = pd.DataFrame(
    cosine_similarity(matrix_dummy, matrix_dummy),
    index=rating_matrix.index,
    columns=rating_matrix.index,
)

In [32]:
def recom_movie(user_id, n_items, neighbor_size=30):
    """Return the titles of the unseen movies with the highest predicted rating.

    Reads the module-level `rating_matrix` and `movies`, and scores every
    movie the user has not rated with `cf_knn`.

    Args:
        user_id: row label of `rating_matrix` to recommend for.
        n_items: maximum number of titles to return.
        neighbor_size: forwarded to `cf_knn`.

    Returns:
        A Series of titles ordered by descending predicted rating. Movies the
        user has already rated are never included, so fewer than `n_items`
        titles are returned when fewer unseen movies exist.

    Raises:
        KeyError: if `user_id` is not a row of `rating_matrix`.
    """
    user_ratings = rating_matrix.loc[user_id]
    # Only unrated movies are candidates. Scoring rated movies as 0 (as before)
    # still let them through whenever n_items exceeded the unseen count.
    unseen = user_ratings[user_ratings.isnull()].index
    predicted = pd.Series(
        [cf_knn(user_id, movie, neighbor_size) for movie in unseen],
        index=unseen,
        dtype=float,
    )
    top = predicted.sort_values(ascending=False).iloc[:n_items]

    # Look titles up by movie_id whether or not `movies` has already been
    # re-indexed by movie_id; a plain .loc on the default index would use row
    # positions as labels and return the wrong titles.
    if 'movie_id' in movies.columns:
        titles = movies.set_index('movie_id')['title']
    else:
        titles = movies['title']
    return titles.loc[top.index]

In [38]:
# Index `movies` by movie_id so titles can be looked up with .loc[movie_id].
# Guarded and non-inplace so that re-running this cell is a no-op instead of a
# KeyError (the column no longer exists once it has become the index).
if 'movie_id' in movies.columns:
    movies = movies.set_index('movie_id')

In [42]:
recom_movie(user_id=2, n_items=5, neighbor_size=30)

movie_id
1293                         Star Kid (1997)
1467    Saint of Fort Washington, The (1993)
1189                      Prefontaine (1997)
1500               Santa with Muscles (1996)
318                  Schindler's List (1993)
Name: title, dtype: object

## 2-1. 최적의 이웃 크기

In [43]:
# Split the ratings into train and test sets again, because `rating_matrix`
# was rebuilt from the full data for the recommendation demo.
# Stratifying on user_id keeps roughly the same 75/25 proportion of every
# user's ratings in each set. random_state is fixed so that the split, and
# every RMSE computed from it, is reproducible on Restart & Run All.
from sklearn.model_selection import train_test_split
X = ratings.copy()
y = ratings['user_id']
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.25, stratify=y, random_state=42
)

In [44]:
# User x movie table built from the training rows only.
rating_matrix = X_train.pivot_table(index='user_id', columns='movie_id', values='rating')

In [46]:
from sklearn.metrics.pairwise import cosine_similarity
# Rebuild the zero-filled matrix and the cosine user similarity from the
# current `rating_matrix`; both axes are labelled by user_id.
matrix_dummy = rating_matrix.fillna(0)
user_similarity = pd.DataFrame(
    cosine_similarity(matrix_dummy, matrix_dummy),
    index=rating_matrix.index,
    columns=rating_matrix.index,
)

In [48]:
# Report the test RMSE of cf_knn for neighbourhood sizes 10, 20, ..., 60.
for neighbor_size in range(10, 61, 10):
    print('Neighbor size = %d : RMSE = %.4f' % (neighbor_size, score(cf_knn, neighbor_size)))

Neighbor size = 10 : RMSE = 1.0208
Neighbor size = 20 : RMSE = 1.0053
Neighbor size = 30 : RMSE = 1.0032
Neighbor size = 40 : RMSE = 1.0035
Neighbor size = 50 : RMSE = 1.0036
Neighbor size = 60 : RMSE = 1.0037


## 2-2. 사용자의 평가경향을 고려한 CF

In [63]:
# Mean rating per user (row-wise; unrated NaN cells are skipped by default).
rating_mean = rating_matrix.mean(axis=1)
# Deviation of every rating from the mean of the user who gave it.
rating_bias = rating_matrix.sub(rating_mean, axis=0)

In [79]:
def cf_knn_bias(user_id, movie_id, neighbor_size=0):
    """Predict a rating as the user's mean plus a neighbour-weighted deviation.

    Reads the module-level `rating_bias` (each rating minus its user's mean,
    NaN where unrated), `rating_mean` (mean rating per user) and
    `user_similarity` (user x user weights).

    Args:
        user_id: user to predict for; must be a label of `rating_mean` and a
            column label of `user_similarity`.
        movie_id: movie whose rating is predicted.
        neighbor_size: how many of the most similar raters to use. 0 means use
            every rater of the movie.

    Returns:
        rating_mean[user_id] plus the similarity-weighted mean deviation of the
        selected raters. Returns just rating_mean[user_id] when `movie_id` is
        not a column of `rating_bias`, when `neighbor_size` is non-zero and at
        most one user rated the movie, or when the selected weights sum to
        zero (the weighted deviation would otherwise be NaN).

    Raises:
        KeyError: if `user_id` is missing from `rating_mean` or `user_similarity`.
    """
    user_mean = rating_mean[user_id]
    if movie_id not in rating_bias:
        return user_mean

    # Keep only the users who actually rated this movie, and their weights.
    deviations = rating_bias[movie_id].dropna()
    sim_scores = user_similarity[user_id].loc[deviations.index]

    if neighbor_size != 0:
        if len(sim_scores) <= 1:
            return user_mean
        # Cannot use more neighbours than there are raters.
        neighbor_size = min(neighbor_size, len(sim_scores))
        sim_scores = np.array(sim_scores)
        deviations = np.array(deviations)
        # argsort is ascending, so the most similar raters are at the tail.
        order = np.argsort(sim_scores)
        sim_scores = sim_scores[order][-neighbor_size:]
        deviations = deviations[order][-neighbor_size:]

    weight_sum = sim_scores.sum()
    if weight_sum == 0:
        # 0 / 0 would yield NaN; with no usable evidence, predict the user's mean.
        return user_mean

    return np.dot(sim_scores, deviations) / weight_sum + user_mean

In [65]:
score(cf_knn_bias, 30)

0.9452426205639461

- 연습문제(3-2) : 사용자ID 지정하면 5개 영화 추천

In [66]:
# Mean rating per user (row-wise; unrated NaN cells are skipped by default).
rating_mean = rating_matrix.mean(axis=1)
# Deviation of every rating from the mean of the user who gave it.
rating_bias = rating_matrix.sub(rating_mean, axis=0)

In [71]:
rating_mean[:2]

user_id
1    3.617647
2    3.608696
dtype: float64

In [70]:
rating_bias.head(2)

movie_id,1,2,3,4,5,6,7,8,9,10,...,1670,1671,1672,1673,1674,1675,1677,1678,1680,1682
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,1.382353,-0.617647,0.382353,-0.617647,-0.617647,1.382353,,-2.617647,1.382353,-0.617647,...,,,,,,,,,,
2,,,,,,,,,,-1.608696,...,,,,,,,,,,


In [80]:
def recom_movie2(user_id, n_items, neighbor_size=30):
    """Return the titles of the unseen movies with the highest bias-corrected prediction.

    Reads the module-level `rating_matrix` and `movies`, and scores every
    movie the user has not rated with `cf_knn_bias`.

    Args:
        user_id: row label of `rating_matrix` to recommend for.
        n_items: maximum number of titles to return.
        neighbor_size: forwarded to `cf_knn_bias`.

    Returns:
        A Series of titles ordered by descending predicted rating. Movies the
        user has already rated are never included, so fewer than `n_items`
        titles are returned when fewer unseen movies exist.

    Raises:
        KeyError: if `user_id` is not a row of `rating_matrix`.
    """
    user_ratings = rating_matrix.loc[user_id]
    # Only unrated movies are candidates. Scoring rated movies as 0 (as before)
    # still let them through whenever n_items exceeded the unseen count.
    unseen = user_ratings[user_ratings.isnull()].index
    predicted = pd.Series(
        [cf_knn_bias(user_id, movie, neighbor_size) for movie in unseen],
        index=unseen,
        dtype=float,
    )
    top = predicted.sort_values(ascending=False).iloc[:n_items]

    # Look titles up by movie_id whether or not `movies` has already been
    # re-indexed by movie_id; a plain .loc on the default index would use row
    # positions as labels and return the wrong titles.
    if 'movie_id' in movies.columns:
        titles = movies.set_index('movie_id')['title']
    else:
        titles = movies['title']
    return titles.loc[top.index]

In [82]:
recom_movie2(user_id=1, n_items=5, neighbor_size=30)

movie_id
1467    Saint of Fort Washington, The (1993)
1500               Santa with Muscles (1996)
1367                            Faust (1994)
64          Shawshank Redemption, The (1994)
1449                  Pather Panchali (1955)
Name: title, dtype: object

## 2-3. CF 정확도 개선 방법

In [10]:
rating_matrix.head(2)

movie_id,1,2,3,4,5,6,7,8,9,10,...,1671,1672,1673,1674,1676,1677,1679,1680,1681,1682
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,,3.0,4.0,3.0,3.0,5.0,4.0,,5.0,,...,,,,,,,,,,
2,4.0,,,,,,,,,2.0,...,,,,,,,,,,


- 사용자별 공통 평가 수 계산

In [21]:
# Mean rating per user (row-wise; unrated NaN cells are skipped by default).
rating_mean = rating_matrix.mean(axis=1)
# Deviation of every rating from the mean of the user who gave it.
rating_bias = rating_matrix.sub(rating_mean, axis=0)

In [11]:
# 1.0 where a rating greater than 0 exists, 0.0 elsewhere (NaN > 0 is False).
rating_binary1 = (rating_matrix > 0).to_numpy().astype(float)
# Transposed view (movies x users) for the co-rating product.
rating_binary2 = rating_binary1.transpose()

In [18]:
rating_matrix.shape

(943, 1635)

In [15]:
# counts.loc[u, v] is the number of movies rated by both user u and user v
# (the diagonal holds each user's own number of ratings).
counts = pd.DataFrame(
    rating_binary1 @ rating_binary2,
    index=rating_matrix.index,
    columns=rating_matrix.index,
).fillna(0)

In [19]:
counts.head(2)

user_id,1,2,3,4,5,6,7,8,9,10,...,934,935,936,937,938,939,940,941,942,943
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,204.0,12.0,4.0,4.0,49.0,51.0,74.0,14.0,2.0,40.0,...,44.0,11.0,30.0,9.0,20.0,9.0,24.0,4.0,16.0,48.0
2,12.0,46.0,5.0,3.0,4.0,20.0,11.0,3.0,4.0,10.0,...,11.0,8.0,20.0,10.0,16.0,5.0,9.0,4.0,5.0,9.0


In [20]:
from sklearn.metrics.pairwise import cosine_similarity
# Rebuild the zero-filled matrix and the cosine user similarity from the
# current `rating_matrix`; both axes are labelled by user_id.
matrix_dummy = rating_matrix.fillna(0)
user_similarity = pd.DataFrame(
    cosine_similarity(matrix_dummy, matrix_dummy),
    index=rating_matrix.index,
    columns=rating_matrix.index,
)

In [26]:
counts[1]

user_id
1      204.0
2       12.0
3        4.0
4        4.0
5       49.0
       ...  
939      9.0
940     24.0
941      4.0
942     16.0
943     48.0
Name: 1, Length: 943, dtype: float64

In [None]:
def cf_knn_bias_sig(user_id, movie_id, neighbor_size=0):
    """Bias-corrected KNN prediction that ignores low-significance neighbours.

    A rater contributes only when they rated `movie_id` and the number of
    movies they have rated in common with `user_id` (taken from `counts`) is
    at least `SIG_LEVEL`.

    Reads the module-level `rating_bias`, `rating_mean`, `user_similarity`,
    `counts` and `SIG_LEVEL`.
    NOTE(review): `SIG_LEVEL` is not assigned in the visible part of this
    notebook; it must be defined before this function is called.

    Args:
        user_id: user to predict for; must be a label of `rating_mean` and a
            column label of `user_similarity` and `counts`.
        movie_id: movie whose rating is predicted.
        neighbor_size: how many of the most similar eligible raters to use.
            0 means use every eligible rater.

    Returns:
        rating_mean[user_id] plus the similarity-weighted mean deviation of the
        selected raters. Returns just rating_mean[user_id] when `movie_id` is
        not a column of `rating_bias`, when `neighbor_size` is non-zero and at
        most one eligible rater remains, or when the selected weights sum to
        zero (including when no eligible rater remains).

    Raises:
        KeyError: if `user_id` is missing from `rating_mean`,
            `user_similarity` or `counts`.
    """
    user_mean = rating_mean[user_id]
    if movie_id not in rating_bias:
        return user_mean

    movie_ratings = rating_bias[movie_id]
    no_rating = movie_ratings.isnull()  # users who did not rate this movie
    low_significance = counts[user_id] < SIG_LEVEL  # too few co-rated movies
    # Bug fix: the exclusion mask used to be overwritten by a second
    # isnull()-only index, so the significance filter was never applied.
    excluded = no_rating | low_significance
    movie_ratings = movie_ratings[~excluded]
    sim_scores = user_similarity[user_id].loc[movie_ratings.index]

    if neighbor_size != 0:
        if len(sim_scores) <= 1:
            return user_mean
        # Cannot use more neighbours than there are eligible raters.
        neighbor_size = min(neighbor_size, len(sim_scores))
        sim_scores = np.array(sim_scores)
        movie_ratings = np.array(movie_ratings)
        # argsort is ascending, so the most similar raters are at the tail.
        order = np.argsort(sim_scores)
        sim_scores = sim_scores[order][-neighbor_size:]
        movie_ratings = movie_ratings[order][-neighbor_size:]

    weight_sum = sim_scores.sum()
    if weight_sum == 0:
        # 0 / 0 would yield NaN; with no usable evidence, predict the user's mean.
        return user_mean

    return np.dot(sim_scores, movie_ratings) / weight_sum + user_mean