In [1]:
import pandas as pd 
import numpy as np

users = pd.read_csv('./data/users.dat')
movies = pd.read_csv('./data/movies.dat')
ratings = pd.read_csv('./data/ratings.dat')

In [3]:
def RMSE(y_true, y_pred):
    return np.sqrt(np.mean((np.array(y_true) - np.array(y_pred)) ** 2))

def score(model):
    id_pairs = zip(X_test['user_id'], X_test['movie_id'])
    y_pred = np.array([model(user, movie) for (user, movie) in id_pairs])
    y_true = np.array(X_test['rating'])
    return RMSE(y_true, y_pred)

In [4]:
# train, test set 분리
from sklearn.model_selection import train_test_split
X = ratings.copy()
y = ratings['user_id']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, stratify=y)

In [6]:
rating_matrix = X_train.pivot(index='user_id', columns='movie_id', values='rating')

In [7]:
rating_matrix.head(3)

movie_id,1,2,3,4,5,6,7,8,9,10,...,1668,1670,1671,1672,1676,1678,1679,1680,1681,1682
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,5.0,3.0,,3.0,3.0,5.0,4.0,1.0,5.0,,...,,,,,,,,,,
2,4.0,,,,,,,,,,...,,,,,,,,,,
3,,,,,,,,,,,...,,,,,,,,,,


## 1. CF - Basic

In [8]:
from sklearn.metrics.pairwise import cosine_similarity
matrix_dummy = rating_matrix.copy().fillna(0)

def CF_simple(user_id, movie_id):
    if movie_id in rating_matrix:
        sim_scores = user_similarity[user_id].copy()
        movie_ratings = rating_matrix[movie_id].copy()
        none_rating_idx = movie_ratings[movie_ratings.isnull()].index # movie_id에 대한 rating이 없는 user_id 인덱스
        movie_ratings = movie_ratings.dropna()
        sim_scores = sim_scores.drop(none_rating_idx) # movie_id를 본 user_id의 rating만 고려
        mean_rating = np.dot(sim_scores, movie_ratings) / sim_scores.sum() # 유사도점수에 따라 가중 평균
    else:
        mean_rating = 3.0
    
    return mean_rating

### 1-1. Cosine Similarity

In [9]:
user_similarity = cosine_similarity(matrix_dummy, matrix_dummy)
user_similarity = pd.DataFrame(user_similarity, index=rating_matrix.index, columns=rating_matrix.index)

In [10]:
user_similarity

user_id,1,2,3,4,5,6,7,8,9,10,...,934,935,936,937,938,939,940,941,942,943
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,1.000000,0.122783,0.034260,0.053001,0.296665,0.316940,0.375612,0.243086,0.036222,0.280419,...,0.288422,0.066090,0.161228,0.140982,0.152450,0.091661,0.246097,0.061604,0.162045,0.318269
2,0.122783,1.000000,0.063952,0.114960,0.070018,0.200390,0.101835,0.072888,0.119129,0.131799,...,0.160496,0.292399,0.284862,0.298897,0.264301,0.176323,0.123997,0.160599,0.138900,0.099308
3,0.034260,0.063952,1.000000,0.154378,0.000000,0.056130,0.036976,0.051944,0.057170,0.057298,...,0.000000,0.000000,0.098401,0.032872,0.076989,0.014893,0.114864,0.023881,0.119152,0.014837
4,0.053001,0.114960,0.154378,1.000000,0.012728,0.057293,0.080665,0.106117,0.000000,0.015973,...,0.039189,0.048096,0.148360,0.105263,0.137355,0.038748,0.186681,0.124270,0.103337,0.015441
5,0.296665,0.070018,0.000000,0.012728,1.000000,0.134780,0.307168,0.218751,0.036451,0.157113,...,0.266349,0.056476,0.068370,0.048367,0.103518,0.067260,0.181420,0.128475,0.129864,0.290368
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
939,0.091661,0.176323,0.014893,0.038748,0.067260,0.120211,0.096789,0.126030,0.052016,0.060325,...,0.059957,0.333019,0.187324,0.160126,0.381978,1.000000,0.022906,0.176001,0.040028,0.113395
940,0.246097,0.123997,0.114864,0.186681,0.181420,0.292867,0.235803,0.202398,0.054518,0.251796,...,0.256283,0.021324,0.117974,0.146234,0.093765,0.022906,1.000000,0.088156,0.202061,0.166589
941,0.061604,0.160599,0.023881,0.124270,0.128475,0.110873,0.053949,0.047156,0.083412,0.101512,...,0.035711,0.118671,0.161973,0.153473,0.232907,0.176001,0.088156,1.000000,0.071320,0.000000
942,0.162045,0.138900,0.119152,0.103337,0.129864,0.241129,0.218732,0.120655,0.054635,0.130782,...,0.208746,0.075908,0.064802,0.078536,0.074138,0.040028,0.202061,0.071320,1.000000,0.110036


In [8]:
score(CF_simple)

1.0144316669472162

### 1-2. Pearson Correlation Coefficient

In [16]:
user_similarity = matrix_dummy.T.corr(method="pearson")

In [18]:
user_similarity

user_id,1,2,3,4,5,6,7,8,9,10,...,934,935,936,937,938,939,940,941,942,943
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,1.000000,0.074367,-0.006028,0.034859,0.192396,0.207876,0.192667,0.227489,0.008460,0.182204,...,0.199129,0.065911,0.158488,0.103771,0.108851,0.076527,0.207330,0.112171,0.091264,0.232930
2,0.074367,1.000000,0.086285,0.198324,-0.004568,0.166105,0.035687,0.033144,0.155299,0.041892,...,0.093863,0.243800,0.260024,0.344691,0.210037,0.219712,0.183259,0.171769,0.048686,0.025006
3,-0.006028,0.086285,1.000000,0.188477,-0.008539,0.017726,0.008912,0.056490,0.009164,-0.012169,...,0.005004,0.039802,0.083191,0.043398,0.074022,0.015899,0.126065,0.038320,0.030749,-0.002068
4,0.034859,0.198324,0.188477,1.000000,-0.027163,0.038207,0.046847,0.065988,0.120893,0.035606,...,0.011308,0.034846,0.094933,0.165950,0.045315,0.025715,0.164395,0.131990,0.129134,0.003657
5,0.192396,-0.004568,-0.008539,-0.027163,1.000000,0.064443,0.193414,0.175568,0.012209,0.086562,...,0.169738,0.061172,0.005888,0.004757,0.069500,0.051846,0.153061,0.056293,0.014874,0.181491
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
939,0.076527,0.219712,0.015899,0.025715,0.051846,0.042062,0.016873,0.109328,-0.014695,-0.011679,...,0.050200,0.378691,0.146515,0.159594,0.324678,1.000000,0.065071,0.142163,-0.010433,0.048948
940,0.207330,0.183259,0.126065,0.164395,0.153061,0.208977,0.149417,0.108642,0.121965,0.162459,...,0.144852,0.097814,0.110687,0.129825,0.087797,0.065071,1.000000,0.109227,0.132393,0.093384
941,0.112171,0.171769,0.038320,0.131990,0.056293,0.100899,-0.016316,0.091620,0.182873,0.064755,...,-0.003920,0.141464,0.284623,0.046754,0.243257,0.142163,0.109227,1.000000,0.018671,0.057651
942,0.091264,0.048686,0.030749,0.129134,0.014874,0.199871,0.170241,0.088709,0.056386,0.174053,...,0.130603,0.023638,0.015052,0.041370,0.027808,-0.010433,0.132393,0.018671,1.000000,0.106404


In [19]:
score(CF_simple)

1.2357939891827803

## 2. CF - Neighborhood based

In [51]:
def score(model, neighbor_size=0):
    id_pairs = zip(X_test['user_id'], X_test['movie_id'])
    y_pred = np.array([model(user, movie, neighbor_size) for (user, movie) in id_pairs])
    y_true = np.array(X_test['rating'])
    return RMSE(y_true, y_pred)

def cf_knn(user_id, movie_id, neighbor_size=0):
    if movie_id in rating_matrix:
        sim_scores = user_similarity[user_id].copy()
        movie_ratings = rating_matrix[movie_id].copy()
        none_rating_idx = movie_ratings[movie_ratings.isnull()].index
        movie_ratings = movie_ratings.drop(none_rating_idx)
        sim_scores = sim_scores.drop(none_rating_idx)
        
        if neighbor_size == 0:
            mean_rating = np.dot(sim_scores, movie_ratings) / sim_scores.sum()
        else:
            if len(sim_scores) > 1:
                neighbor_size = min(neighbor_size, len(sim_scores))
                sim_scores = np.array(sim_scores)
                movie_ratings = np.array(movie_ratings)
                user_idx = np.argsort(sim_scores)
                sim_scores = sim_scores[user_idx][-neighbor_size:]
                movie_ratings = movie_ratings[user_idx][-neighbor_size:]
                mean_rating = np.dot(sim_scores, movie_ratings) / sim_scores.sum()
            else:
                mean_rating = 3.0
    else:
        mean_rating = 3.0
    
    return mean_rating

In [52]:
score(cf_knn, neighbor_size = 30)

1.0155257884242075

In [53]:
ratings.head()

Unnamed: 0,user_id,movie_id,rating,timestamp
0,196,242,3,881250949
1,186,302,3,891717742
2,22,377,1,878887116
3,244,51,2,880606923
4,166,346,1,886397596


In [58]:
# 실제 추천을 할 때에는 train/test set을 나눌 필요없으므로 전체 데이터로 사용자 간 유사도를 구함
rating_matrix = ratings.pivot_table(values='rating', index='user_id', columns='movie_id')

In [59]:
from sklearn.metrics.pairwise import cosine_similarity
matrix_dummy = rating_matrix.copy().fillna(0)
user_similarity = cosine_similarity(matrix_dummy, matrix_dummy)
user_similarity = pd.DataFrame(user_similarity, index=rating_matrix.index, columns=rating_matrix.index)

In [73]:
def recom_movie(user_id, n_items, neighbor_size=30):
    user_movie = rating_matrix.loc[user_id].copy()
    for movie in rating_matrix:
        if pd.notnull(user_movie.loc[movie]):
            user_movie.loc[movie] = 0 # 사용자가 이미 본 영화는 추천안함
        else:
            user_movie.loc[movie] = cf_knn(user_id, movie, neighbor_size)
        
    movie_sort = user_movie.sort_values(ascending=False)[:n_items]
    recom_movies = movies.loc[movie_sort.index]
    recommendations = recom_movies['title']
    return recommendations

In [74]:
recom_movie(user_id=2, n_items=5, neighbor_size=30)

movie_id
1293                     Ayn Rand: A Sense of Life (1997)
1467                                     Cure, The (1995)
1189                              That Old Feeling (1997)
1500    Prisoner of the Mountains (Kavkazsky Plennik) ...
318                       Everyone Says I Love You (1996)
Name: title, dtype: object