# Movie dataset recommend

## 1. Dataset

In [17]:
from scipy import spatial

In [18]:
rating_df = pd.read_csv('ratings_small.csv')
rating_df.drop('timestamp', axis=1 ,inplace=True)
rating_df.tail()

Unnamed: 0,userId,movieId,rating
99999,671,6268,2.5
100000,671,6269,4.0
100001,671,6365,4.0
100002,671,6385,2.5
100003,671,6565,3.5


## 2. Check dataset

In [19]:
unique_user = rating_df['userId'].unique()
len(unique_user)

671

In [20]:
unique_movie = rating_df['movieId'].unique()
len(unique_movie)

9066

In [21]:
# ratring
rating_df.groupby('rating').size().reset_index(name='rating_counts')

Unnamed: 0,rating,rating_counts
0,0.5,1101
1,1.0,3326
2,1.5,1687
3,2.0,7271
4,2.5,4449
5,3.0,20064
6,3.5,10538
7,4.0,28750
8,4.5,7723
9,5.0,15095


In [22]:
# user_rating 분포
user_counts_df = rating_df.groupby('userId').size().reset_index(name='user_rating_count')
user_counts_df.sort_values('user_rating_count', ascending=False, inplace=True)
user_counts_df.head()

Unnamed: 0,userId,user_rating_count
546,547,2391
563,564,1868
623,624,1735
14,15,1700
72,73,1610


In [23]:
# movie 분포
movie_counts_df = rating_df.groupby('movieId').size().reset_index(name='movie_rating_count')
movie_counts_df.sort_values('movie_rating_count', ascending=False, inplace=True)
movie_counts_df.head()

Unnamed: 0,movieId,movie_rating_count
321,356,341
266,296,324
284,318,311
525,593,304
232,260,291


## 3. Preprocessing

In [24]:
# user 최소 평가 수, movie 최소 평가 수
user_limit, movie_limit = 365, 100

In [25]:
filtered_userId = user_counts_df[user_counts_df['user_rating_count'] > user_limit]
filtered_userId = list(filtered_userId['userId'])
len(filtered_userId), filtered_userId[:5]

(59, [547, 564, 624, 15, 73])

In [26]:
filtered_movieId = movie_counts_df[movie_counts_df['movie_rating_count'] > movie_limit]
filtered_movieId = list(filtered_movieId['movieId'])
len(filtered_movieId), filtered_movieId[:5]

(149, [356, 296, 318, 593, 260])

In [27]:
# 10004 -> 5570
filtered_df = rating_df[rating_df['userId'].isin(filtered_userId)]

filtered_df = filtered_df[filtered_df['movieId'].isin(filtered_movieId)]
len(filtered_df)

5570

In [28]:
filtered_df.head()

Unnamed: 0,userId,movieId,rating
962,15,1,2.0
963,15,2,2.0
965,15,6,4.0
966,15,10,3.0
974,15,25,3.0


## 4. Pivot table

In [29]:
user_df = filtered_df.pivot_table(values='rating', index='userId', columns='movieId', aggfunc=np.average, fill_value=0, dropna=False)
user_df.tail()

movieId,1,2,6,10,25,32,34,36,39,47,...,6377,6539,6874,7153,7361,7438,8961,33794,58559,79132
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
615,4.0,0.0,3.5,0.0,0.0,3.0,0.0,0.0,0.0,4.5,...,4.5,3.5,4.5,4.0,4.0,4.0,4.0,4.0,4.5,4.0
624,5.0,3.0,0.0,4.0,0.0,2.0,0.0,0.0,0.0,4.0,...,3.5,3.5,4.0,4.5,0.0,3.5,4.5,3.5,3.5,4.0
654,5.0,3.0,0.0,4.0,0.0,5.0,4.5,4.5,0.0,4.5,...,5.0,4.5,4.5,5.0,4.0,4.0,5.0,4.5,0.0,0.0
664,3.5,0.0,4.0,0.0,0.0,5.0,0.0,0.0,0.0,4.5,...,0.0,4.0,4.0,4.0,4.0,4.0,4.0,4.0,4.5,5.0
665,0.0,3.0,0.0,0.0,0.0,4.0,2.0,0.0,2.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


## 5. Function

In [30]:
def cosine_similarity(vector_1, vector_2):
    idx = vector_1.nonzero()[0]
    if len(idx) == 0:
        return -1
    vector_1, vector_2 = np.array(vector_1)[idx], np.array(vector_2)[idx]
    
    idx = vector_2.nonzero()[0]
    if len(idx) == 0:
        return -1
    vector_1, vector_2 = np.array(vector_1)[idx], np.array(vector_2)[idx]
    
    return 1 - spatial.distance.cosine(vector_1, vector_2)

In [31]:
# test code - cosine_similarity
vector_1 = np.array([1,0,3,0,5])
vector_2 = np.array([5,3,0,1,5])
cosine_similarity(vector_1, vector_2)

0.8320502943378437

In [32]:
# test code - cosine_similarity
cosine_similarity(user_df.loc[15], user_df.loc[19])

0.9501250301799182

In [33]:
def similarity_matrix(user_df, similarity_func):
    
    users = user_df.index
    
    df = user_df.T
    
    matrix = []
    for idx_1, value_1 in df.items():
        row = []
        for idx_2, value_2 in df.items():
            row.append(similarity_func(value_1, value_2))
        matrix.append(row)
        
    return pd.DataFrame(matrix, index= users, columns=users)

In [34]:
# test code - similarity_matrix
sm_df = similarity_matrix(user_df, cosine_similarity)
sm_df.tail()

userId,15,19,23,30,48,56,73,102,105,119,...,580,587,596,605,607,615,624,654,664,665
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
615,0.913701,0.986879,0.977971,0.976351,0.990701,0.986844,0.977249,0.977529,0.981156,0.975449,...,0.98418,0.982973,0.984113,0.945157,0.979301,1.0,0.977023,0.991736,0.991745,0.97756
624,0.933455,0.973056,0.974726,0.970812,0.977897,0.964744,0.968863,0.967962,0.974071,0.964382,...,0.967887,0.974954,0.970422,0.949962,0.968329,0.977023,1.0,0.978762,0.977336,0.959331
654,0.917356,0.979269,0.981476,0.978836,0.987746,0.977249,0.97678,0.976687,0.983877,0.977521,...,0.983758,0.97794,0.981081,0.955068,0.98024,0.991736,0.978762,1.0,0.993751,0.973894
664,0.930106,0.979273,0.985208,0.974926,0.993049,0.976032,0.982777,0.976597,0.982344,0.982795,...,0.98368,0.97312,0.988273,0.956637,0.988647,0.991745,0.977336,0.993751,1.0,0.974557
665,0.903008,0.95424,0.967124,0.951942,0.97652,0.94866,0.962269,0.937813,0.964187,0.967379,...,0.962619,0.955417,0.954341,0.92442,0.960054,0.97756,0.959331,0.973894,0.974557,1.0


In [75]:
def mean_score(user_df, sm_df, target, closer_count):
    
    sms_df = sm_df.drop(target)
    sms_df.sort_values(target, ascending=False)
    sms_df = sms_df[target][:closer_count]
    
    smsw_df = user_df.loc[sms_df.index]
    
    ms_df = pd.DataFrame(columns=user_df.columns)
    ms_df.loc['user'] = user_df.loc[target]
    ms_df.loc['mean'] = smsw_df.mean()
    
    return ms_df

In [76]:
# test code - mean_score
ms_df = mean_score(user_df, sm_df, 48, 10)
ms_df

movieId,1,2,6,10,25,32,34,36,39,47,...,6377,6539,6874,7153,7361,7438,8961,33794,58559,79132
user,4.0,3.5,0.0,0.0,0.0,0.0,3.0,0.0,0.0,0.0,...,4.0,4.0,0.0,4.0,4.5,0.0,3.5,0.0,4.0,4.0
mean,2.6,1.25,2.7,2.0,2.25,2.6,2.4,2.25,1.45,4.05,...,1.85,1.95,1.85,2.5,1.9,1.6,1.3,1.9,1.45,1.4


## 6. Recommend func

In [56]:
def recommend(ms_df):
    recommend_df = ms_df.T
    recommend_df = recommend_df[recommend_df['user'] == 0]
    recommend_df = recommend_df.sort_values('mean', ascending=False)
    
    return recommend_df, list(recommend_df.index)

In [70]:
# test code - recommend
recommend_df, recommend_list = recommend(ms_df)
print(recommend_list[:10])
recommend_df.head()

[1208, 1213, 50, 1198, 1196, 1193, 1221, 260, 1036, 47]


Unnamed: 0_level_0,user,mean
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1
1208,0.0,4.7
1213,0.0,4.4
50,0.0,4.3
1198,0.0,4.25
1196,0.0,4.25


## 7. MAE

In [71]:
def mae(value, pred):
    idx = value.nonzero()[0]
    value, pred = np.array(value)[idx], np.array(pred)[idx]
    
    idx = pred.nonzero()[0]
    value, pred = np.array(value)[idx], np.array(pred)[idx]
    
    return sum(np.abs(value - pred)) / len(idx)

In [72]:
# test code - MAE
mae(ms_df.loc['user'], ms_df.loc['mean'])

1.34453125

In [63]:
def evaluate (user_df, sm_df, closer_count, algorithm):
    users = user_df.index
    evaluate_list = []
    
    for target in users:
        pred_df = mean_score(user_df, sm_df, target, closer_count)
        evaluate_list.append(algorithm(pred_df.loc['user'], pred_df.loc['mean']))
        
    return np.average(evaluate_list)

In [65]:
# test code - evaluate
evaluate(user_df, sm_df, 10, mae)

1.2550454175330121

In [68]:
start, end = 2, 30

for closer_count in range(start, end+1):
    print(closer_count, evaluate(user_df, sm_df, closer_count, mae))

2 1.3814250858071688
3 1.275097639817603
4 1.2094074821105159
5 1.3050461778620523
6 1.3246275766991435
7 1.2311636621195423
8 1.2761566957251878
9 1.2561241235115717
10 1.2550454175330121
11 1.2542228170235707
12 1.275346215307806
13 1.2993443322828602
14 1.3391508570106374
15 1.3562680703092493
16 1.3471410085424833
17 1.3686166965787816
18 1.3682563528865233
19 1.3813134350455234
20 1.384239580756688
21 1.354592796537474
22 1.3661788048874124
23 1.3895042667894872
24 1.4195456927761512
25 1.4196079861436561
26 1.4275149304242982
27 1.434993448370625
28 1.429677818535073
29 1.4265488855275679
30 1.422003306822126
