In [99]:
import pandas as pd
from implicit.als import AlternatingLeastSquares
import os
import numpy as np

In [100]:
rating_file_path=os.getenv('HOME') + '/aiffel/recommendata_iu/data/ml-1m/ratings.dat'
ratings_cols = ['user_id', 'movie_id', 'rating', 'timestamp']
ratings = pd.read_csv(rating_file_path, sep='::', names=ratings_cols, engine='python', encoding = "ISO-8859-1")
orginal_data_size = len(ratings)
ratings.head()

Unnamed: 0,user_id,movie_id,rating,timestamp
0,1,1193,5,978300760
1,1,661,3,978302109
2,1,914,3,978301968
3,1,3408,4,978300275
4,1,2355,5,978824291


In [101]:
# 3점 이상만 남깁니다.
ratings = ratings[ratings['rating']>=3]
filtered_data_size = len(ratings)

print(f'orginal_data_size: {orginal_data_size}, filtered_data_size: {filtered_data_size}')
print(f'Ratio of Remaining Data is {filtered_data_size / orginal_data_size:.2%}')

orginal_data_size: 1000209, filtered_data_size: 836478
Ratio of Remaining Data is 83.63%


In [102]:
# rating 컬럼의 이름을 count로 바꿉니다.
ratings.rename(columns={'rating':'count'}, inplace=True)

In [103]:
ratings['count']

0          5
1          3
2          3
3          4
4          5
          ..
1000203    3
1000205    5
1000206    5
1000207    4
1000208    4
Name: count, Length: 836478, dtype: int64

In [104]:
# 영화 제목을 보기 위해 메타 데이터를 읽어옵니다.
movie_file_path=os.getenv('HOME') + '/aiffel/recommendata_iu/data/ml-1m/movies.dat'
cols = ['movie_id', 'title', 'genre'] 
movies = pd.read_csv(movie_file_path, sep='::', names=cols, engine='python', encoding='ISO-8859-1')
movies.head()

Unnamed: 0,movie_id,title,genre
0,1,Toy Story (1995),Animation|Children's|Comedy
1,2,Jumanji (1995),Adventure|Children's|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama
4,5,Father of the Bride Part II (1995),Comedy


In [131]:
data = ratings.copy()
data

Unnamed: 0,user_id,movie_id,count,timestamp
0,1,1193,5,978300760
1,1,661,3,978302109
2,1,914,3,978301968
3,1,3408,4,978300275
4,1,2355,5,978824291
...,...,...,...,...
1000203,6040,1090,3,956715518
1000205,6040,1094,5,956704887
1000206,6040,562,5,956704746
1000207,6040,1096,4,956715648


In [132]:
del data['timestamp']
data

Unnamed: 0,user_id,movie_id,count
0,1,1193,5
1,1,661,3
2,1,914,3
3,1,3408,4
4,1,2355,5
...,...,...,...
1000203,6040,1090,3
1000205,6040,1094,5
1000206,6040,562,5
1000207,6040,1096,4


In [146]:
movies.head()

Unnamed: 0,movie_id,title,genre
0,1,Toy Story (1995),Animation|Children's|Comedy
1,2,Jumanji (1995),Adventure|Children's|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama
4,5,Father of the Bride Part II (1995),Comedy


In [134]:
# my information 
my_favoriteID = [ 1 , 2 , 16, 29 ,26]
my_count = [ 5, 4, 3, 3, 3]

my_movielist = pd.DataFrame({'user_id': 0, 'movie_id': my_favoriteID, 'count':my_count})

#my_movielist

if not data.isin({'user_id':[0]})['user_id'].any():  
    data = data.append(my_movielist)

data.tail(10)

Unnamed: 0,user_id,movie_id,count
1000203,6040,1090,3
1000205,6040,1094,5
1000206,6040,562,5
1000207,6040,1096,4
1000208,6040,1097,4
0,0,1,5
1,0,2,4
2,0,16,3
3,0,29,3
4,0,26,3


In [141]:
user_unique = data['user_id'].unique()
movie_unique = data['movie_id'].unique()

In [147]:
# make csr
from scipy.sparse import csr_matrix

num_user=data['user_id'].max() + 1
num_movie=movies['movie_id'].max()

print(num_user)
print(num_movie)

6041
3952


In [137]:
data

Unnamed: 0,user_id,movie_id,count
0,1,1193,5
1,1,661,3
2,1,914,3
3,1,3408,4
4,1,2355,5
...,...,...,...
0,0,1,5
1,0,2,4
2,0,16,3
3,0,29,3


In [150]:
csr_data=csr_matrix((data['count'], (data['user_id'], data['movie_id'])), shape=(num_user + 1, num_movie + 1))
csr_data

<6042x3953 sparse matrix of type '<class 'numpy.longlong'>'
	with 836483 stored elements in Compressed Sparse Row format>

## 학습하기

In [151]:
from implicit.als import AlternatingLeastSquares
import numpy as np

als_model = AlternatingLeastSquares(factors=100, regularization=0.01, use_gpu=False, iterations=15, dtype=np.float32)

In [152]:
# input type is item * user
als_model.fit(csr_data.T)

  0%|          | 0/15 [00:00<?, ?it/s]

## 유사한 영화 찾기

In [153]:
movie_id = 3
user_vector, Grumpier_vector = als_model.user_factors[0], als_model.item_factors[3]

In [154]:
user_vector

array([-0.02093385, -0.44808263, -0.02276553, -0.03789078,  0.02510111,
       -0.3695668 ,  0.64492744,  0.53560144,  0.3609642 ,  0.19775678,
       -0.90862906, -0.15557925,  0.36067852, -0.22516184, -0.15079811,
       -0.05691286,  0.06202944,  0.14074975, -0.23646751, -0.29021516,
        0.11153993,  0.87384516,  0.23666759, -0.13561726,  0.40904215,
       -0.25497738, -0.2311631 ,  0.24180387,  0.09935629, -0.22328779,
        0.32644132,  0.01102107,  0.22113805, -0.44558787,  0.08873332,
        0.18583643,  0.15442066, -0.04233646, -0.589035  ,  0.49844077,
       -0.7147084 ,  0.26572818,  0.02116974,  0.6413126 ,  0.15966727,
        0.08290743,  0.63347703, -0.4143685 , -0.22094567, -0.03900662,
        0.20037839,  0.6417458 ,  0.00396901, -0.7558378 ,  0.20925051,
        0.10505114, -0.08205782, -0.10324182, -0.13661313, -0.11242778,
       -0.11852805,  0.07939927,  0.20885952, -0.12189148, -0.4027625 ,
       -0.08026271, -0.01175683,  0.3035905 ,  0.3052293 , -0.45

In [155]:
Grumpier_vector

array([ 7.57466257e-03,  3.81963439e-02,  1.35099944e-02, -1.07813263e-02,
        3.94512853e-03,  7.78263621e-03, -1.65786389e-02,  1.24526042e-02,
       -1.39642823e-02,  8.00105557e-03, -1.65003352e-02,  6.64030621e-03,
        1.36311492e-03,  2.48992397e-03, -1.52502907e-03,  3.61137302e-03,
        1.38818529e-02, -1.23179061e-02,  1.62316803e-02,  3.81069840e-03,
        1.24938255e-02,  7.85670709e-03, -1.03630973e-02, -1.52892563e-02,
        5.28515130e-03,  2.32069455e-02,  3.13536674e-02,  1.54892467e-02,
       -5.14147617e-03,  9.14509408e-03,  4.49298183e-03,  8.43215920e-03,
       -6.25188695e-04,  1.71166100e-02,  1.30442092e-02, -1.54697690e-02,
        1.36441626e-02,  5.29046217e-03, -3.34890932e-03,  2.43675453e-03,
        6.57826057e-03, -1.43883610e-03,  5.83786238e-03, -3.90665233e-03,
        3.26548750e-03, -1.08614722e-02, -3.39003420e-03,  6.47617970e-03,
       -2.85761640e-03,  1.08852424e-02, -3.92099703e-03,  2.64473236e-03,
        4.09066351e-03,  

In [156]:
np.dot(user_vector, Grumpier_vector)

-0.036084387

In [157]:
similar_movie = als_model.similar_items(movie_id, N=15)
similar_movie

[(3, 1.0),
 (3450, 0.718196),
 (370, 0.5628439),
 (276, 0.5517234),
 (804, 0.52551264),
 (3869, 0.50813913),
 (186, 0.5010795),
 (237, 0.5008113),
 (1894, 0.496832),
 (1461, 0.4926365),
 (1409, 0.49140295),
 (1390, 0.49138695),
 (520, 0.48851928),
 (1457, 0.48800793),
 (2558, 0.48506513)]

In [199]:
title = movies[movies['movie_id']==similar_movie[0][0]]['title'].values[0]
title

'Grumpier Old Men (1995)'

In [231]:
def get_movie(movie_id):
	return movies[movies['movie_id']== movie_id]['title'].values[0]
	

In [234]:
def get_similar_movie(movie_id, N=15):
	similar_movie = als_model.similar_items(movie_id, N + 1)
	for i in similar_movie:
		title = get_movie(i[0])
		if movie_id == i[0]:
			continue
		print('similarity: {}, 	{}'.format(round(i[1], 4), title))

In [233]:
get_similar_movie(3)

similarity: 0.7182000279426575, 	Grumpy Old Men (1993)
similarity: 0.5627999901771545, 	Naked Gun 33 1/3: The Final Insult (1994)
similarity: 0.5516999959945679, 	Milk Money (1994)
similarity: 0.5254999995231628, 	She's the One (1996)
similarity: 0.5080999732017517, 	Naked Gun 2 1/2: The Smell of Fear, The (1991)
similarity: 0.5011000037193298, 	Nine Months (1995)
similarity: 0.5008000135421753, 	Forget Paris (1995)
similarity: 0.4968000054359436, 	Six Days Seven Nights (1998)
similarity: 0.4925999939441681, 	Vegas Vacation (1997)
similarity: 0.49140000343322754, 	Michael (1996)
similarity: 0.49140000343322754, 	My Fellow Americans (1996)
similarity: 0.4884999990463257, 	Robin Hood: Men in Tights (1993)
similarity: 0.4880000054836273, 	Fools Rush In (1997)
similarity: 0.48510000109672546, 	Forces of Nature (1999)
similarity: 0.477400004863739, 	Multiplicity (1996)


## 영화 추천하기

In [222]:
user = 0

movie_recommended = als_model.recommend(user, csr_data, N=20, filter_already_liked_items=True)

In [243]:
for i in movie_recommended:
	print('score: {}, {}' .format(i[1], get_movie(i[0])))

score: 0.4335787892341614, Toy Story 2 (1999)
score: 0.293207585811615, Bug's Life, A (1998)
score: 0.22639738023281097, Mask, The (1994)
score: 0.20469099283218384, Aladdin (1992)
score: 0.16231714189052582, Hook (1991)
score: 0.1596899926662445, Twelve Monkeys (1995)
score: 0.15715646743774414, Wrong Trousers, The (1993)
score: 0.15678977966308594, Close Shave, A (1995)
score: 0.15655234456062317, Babe (1995)
score: 0.15463928878307343, Santa Clause, The (1994)
score: 0.14413881301879883, Indian in the Cupboard, The (1995)
score: 0.14345693588256836, Willy Wonka and the Chocolate Factory (1971)
score: 0.13660846650600433, Groundhog Day (1993)
score: 0.1335115134716034, Chicken Run (2000)
score: 0.13350725173950195, Lion King, The (1994)
score: 0.13349300622940063, Dragonheart (1996)
score: 0.13023680448532104, Pi (1998)
score: 0.12000543624162674, Hercules (1997)
score: 0.11911635100841522, Dances with Wolves (1990)
score: 0.11874398589134216, Shallow Grave (1994)
