In [1]:
import pandas as pd
import numpy as np
import os
from scipy.sparse import csr_matrix
from implicit.als import AlternatingLeastSquares

In [2]:
os.environ['OPENBLAS_NUM_THREADS']='1'
os.environ['KMP_DUPLICATE_LIB_OK']='True'
os.environ['MKL_NUM_THREADS']='1'

# 데이터 불러오기

In [3]:
rating_file_path=os.getenv('HOME') + '/aiffel/exp9/recommendata_iu/data/ml-1m/ratings.dat'
ratings_cols = ['user_id', 'movie_id', 'rating', 'timestamp']
ratings = pd.read_csv(rating_file_path, sep='::', names=ratings_cols, engine='python', encoding = "ISO-8859-1")
orginal_data_size = len(ratings)
ratings.head()

Unnamed: 0,user_id,movie_id,rating,timestamp
0,1,1193,5,978300760
1,1,661,3,978302109
2,1,914,3,978301968
3,1,3408,4,978300275
4,1,2355,5,978824291


# 평점 3점 미만 데이터 삭제

In [4]:
ratings = ratings[ratings['rating']>=3]
filtered_data_size = len(ratings)

print(f'orginal_data_size: {orginal_data_size}, filtered_data_size: {filtered_data_size}')
print(f'Ratio of Remaining Data is {filtered_data_size / orginal_data_size:.2%}')

orginal_data_size: 1000209, filtered_data_size: 836478
Ratio of Remaining Data is 83.63%


# rating을 count로 수정

In [5]:
ratings.rename(columns={'rating':'count'}, inplace=True)

In [6]:
ratings['count']

0          5
1          3
2          3
3          4
4          5
          ..
1000203    3
1000205    5
1000206    5
1000207    4
1000208    4
Name: count, Length: 836478, dtype: int64

# 사용하지 않을 데이터인 timestamp 삭제

In [7]:
using_cols = ['user_id', 'movie_id', 'count']
ratings = ratings[using_cols]
ratings.head(10)

Unnamed: 0,user_id,movie_id,count
0,1,1193,5
1,1,661,3
2,1,914,3
3,1,3408,4
4,1,2355,5
5,1,1197,3
6,1,1287,5
7,1,2804,5
8,1,594,4
9,1,919,4


# 영화 타이틀 불러오기

In [8]:
movie_file_path=os.getenv('HOME') + '/aiffel/exp9/recommendata_iu/data/ml-1m/movies.dat'
cols = ['movie_id', 'title', 'genre'] 
movies = pd.read_csv(movie_file_path, sep='::', names=cols, engine='python', encoding='ISO-8859-1')
movies.head()

Unnamed: 0,movie_id,title,genre
0,1,Toy Story (1995),Animation|Children's|Comedy
1,2,Jumanji (1995),Adventure|Children's|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama
4,5,Father of the Bride Part II (1995),Comedy


# 영화 타이틀 확인

In [9]:
set(movies['title'])

{'Birds, The (1963)',
 'Poltergeist III (1988)',
 'Jaws 3-D (1983)',
 'Great Mouse Detective, The (1986)',
 'Ladyhawke (1985)',
 'Here on Earth (2000)',
 'Clean Slate (Coup de Torchon) (1981)',
 'Dracula (1958)',
 "Pot O' Gold (1941)",
 'Slumber Party Massacre II, The (1987)',
 'Superstar (1999)',
 'Heaven & Earth (1993)',
 'Schizopolis (1996)',
 'Day the Earth Stood Still, The (1951)',
 'Fabulous Baker Boys, The (1989)',
 'Awfully Big Adventure, An (1995)',
 'Sleeping Beauty (1959)',
 'Cat on a Hot Tin Roof (1958)',
 'Perfect Blue (1997)',
 'Star Is Born, A (1937)',
 'Identification of a Woman (Identificazione di una donna) (1982)',
 'Re-Animator (1985)',
 'Koyaanisqatsi (1983)',
 'Aladdin and the King of Thieves (1996)',
 'Passion of Mind (1999)',
 'Eraserhead (1977)',
 'Stuart Saves His Family (1995)',
 'Big Fella (1937)',
 'Legends of the Fall (1994)',
 'Perfect Murder, A (1998)',
 'Limbo (1999)',
 'Wonderland (1997)',
 'Governess, The (1998)',
 'Waiting to Exhale (1995)',
 'Richar

# 가능하면 목록에 있는 영화를 선호 영화에 추가하고자 실제로 목록에 영화가 있는지 검색

In [10]:
print(movies.loc[movies['title'] == 'Forrest Gump (1994)'])

     movie_id                title               genre
352       356  Forrest Gump (1994)  Comedy|Romance|War


# 총 6039명의 사용자 확인

In [14]:
ratings['user_id'].nunique()

6039

# 총 3628개의 영화

In [15]:
ratings['movie_id'].nunique()

3628

# 시청 상위 영화 목록

In [16]:
movie_count = ratings.groupby('movie_id')['user_id'].count()
movie_count.sort_values(ascending=False).head(30)

movie_id
2858    3211
260     2910
1196    2885
1210    2716
2028    2561
589     2509
593     2498
1198    2473
1270    2460
2571    2434
480     2413
2762    2385
608     2371
110     2314
1580    2297
527     2257
1197    2252
2396    2213
1617    2210
318     2194
858     2167
1265    2121
1097    2102
2997    2066
2716    2051
296     2030
356     2022
1240    2019
1       2000
457     1941
Name: user_id, dtype: int64

# 영화 타이틀을 movie_id에 맞춰서 병합

In [18]:
ratings = pd.merge(ratings, movies[['title', 'movie_id']], on='movie_id', how='left')
ratings

Unnamed: 0,user_id,movie_id,count,title
0,1,1193,5,One Flew Over the Cuckoo's Nest (1975)
1,1,661,3,James and the Giant Peach (1996)
2,1,914,3,My Fair Lady (1964)
3,1,3408,4,Erin Brockovich (2000)
4,1,2355,5,"Bug's Life, A (1998)"
...,...,...,...,...
836473,6040,1090,3,Platoon (1986)
836474,6040,1094,5,"Crying Game, The (1992)"
836475,6040,562,5,Welcome to the Dollhouse (1995)
836476,6040,1096,4,Sophie's Choice (1982)


# 영화 추천의 정확성을 높이기 위해 15개의 영화 선호 영화로 선정

In [19]:
my_favorite = ['Forrest Gump (1994)', 'Truman Show, The (1998)', 'Edward Scissorhands (1990)', 'Gattaca (1997)', 'Sixth Sense, The (1999)', 'Aladdin (1992)' , 'Sound of Music, The (1965)' ,'Star Is Born, A (1937)' ,"Shawshank Redemption, The (1994)" ,'Toy Story (1995)', "Singin' in the Rain (1952)", 'Titanic (1997)', 'Iron Giant, The (1999)', 'Terminator 2: Judgment Day (1991)', 'Matrix, The (1999)']

my_playlist = pd.DataFrame({'user_id': ['donghyun']*15, 'title': my_favorite, 'count':[5]*15})

if not ratings.isin({'user_id':['donghyun']})['user_id'].any():
    ratings = ratings.append(my_playlist, ignore_index=True)

ratings.tail(10) 

Unnamed: 0,user_id,movie_id,count,title
836483,donghyun,,5,Aladdin (1992)
836484,donghyun,,5,"Sound of Music, The (1965)"
836485,donghyun,,5,"Star Is Born, A (1937)"
836486,donghyun,,5,"Shawshank Redemption, The (1994)"
836487,donghyun,,5,Toy Story (1995)
836488,donghyun,,5,Singin' in the Rain (1952)
836489,donghyun,,5,Titanic (1997)
836490,donghyun,,5,"Iron Giant, The (1999)"
836491,donghyun,,5,Terminator 2: Judgment Day (1991)
836492,donghyun,,5,"Matrix, The (1999)"


In [20]:
user_unique = ratings['user_id'].unique()
movie_unique = ratings['title'].unique()

user_to_idx = {v:k for k,v in enumerate(user_unique)}
movie_to_idx = {v:k for k,v in enumerate(movie_unique)}

In [21]:
print(user_to_idx['donghyun'])
print(movie_to_idx['Toy Story (1995)'])

6039
40


# 인덱싱 진행

In [22]:
temp_user_data = ratings['user_id'].map(user_to_idx.get).dropna()
if len(temp_user_data) == len(ratings):
    print('user_id column indexing OK!!')
    ratings['user_id'] = temp_user_data 
else:
    print('user_id column indexing Fail!!')

temp_movie_data = ratings['title'].map(movie_to_idx.get).dropna()
if len(temp_movie_data) == len(ratings):
    print('movie column indexing OK!!')
    ratings['movie_id'] = temp_movie_data
else:
    print('movie column indexing Fail!!')

ratings

user_id column indexing OK!!
movie column indexing OK!!


Unnamed: 0,user_id,movie_id,count,title
0,0,0,5,One Flew Over the Cuckoo's Nest (1975)
1,0,1,3,James and the Giant Peach (1996)
2,0,2,3,My Fair Lady (1964)
3,0,3,4,Erin Brockovich (2000)
4,0,4,5,"Bug's Life, A (1998)"
...,...,...,...,...
836488,6039,663,5,Singin' in the Rain (1952)
836489,6039,27,5,Titanic (1997)
836490,6039,851,5,"Iron Giant, The (1999)"
836491,6039,92,5,Terminator 2: Judgment Day (1991)


In [23]:
ratings.tail(20)

Unnamed: 0,user_id,movie_id,count,title
836473,6038,1030,3,Platoon (1986)
836474,6038,986,5,"Crying Game, The (1992)"
836475,6038,311,5,Welcome to the Dollhouse (1995)
836476,6038,142,4,Sophie's Choice (1982)
836477,6038,26,4,E.T. the Extra-Terrestrial (1982)
836478,6039,160,5,Forrest Gump (1994)
836479,6039,385,5,"Truman Show, The (1998)"
836480,6039,250,5,Edward Scissorhands (1990)
836481,6039,431,5,Gattaca (1997)
836482,6039,38,5,"Sixth Sense, The (1999)"


# CSR 데이터 생성

In [24]:
num_user = ratings['user_id'].nunique()
num_movie = ratings['movie_id'].nunique()

csr_data = csr_matrix((ratings['count'], (ratings.user_id, ratings.movie_id)), shape=(num_user, num_movie))
csr_data

<6040x3628 sparse matrix of type '<class 'numpy.longlong'>'
	with 836493 stored elements in Compressed Sparse Row format>

In [25]:
csr_data_transpose = csr_data.T
csr_data_transpose

<3628x6040 sparse matrix of type '<class 'numpy.longlong'>'
	with 836493 stored elements in Compressed Sparse Column format>

# 모델링

In [26]:
als_model = AlternatingLeastSquares(factors=200, regularization=0.01, use_gpu=False, iterations=20, dtype=np.float32)
als_model.fit(csr_data_transpose)

  0%|          | 0/20 [00:00<?, ?it/s]

# 사용자설정 아이디와 선호 목록 안에 있던 영화의 내적

In [27]:
donghyun = user_to_idx['donghyun']
toy_story = movie_to_idx['Toy Story (1995)']
donghyun_vector = als_model.user_factors[donghyun]
toy_story_vector = als_model.item_factors[toy_story]

# 기존에 선호 목록에 들어 있던 Toy Story의 선호도가 0.906으로 나옴  
# 학습한 모델이 선호 영화를 정확히 파악하고 있음을 확인

In [28]:
np.dot(donghyun_vector,toy_story_vector)

0.90608996

# 선호 목록에 들어가지 않은 영화중에서도 전혀 관련이 없을 법한 영화를 선정하여 내적 계산

In [29]:
singin_in_the_rain = movie_to_idx['Psycho (1960)']
singin_in_the_rain_vector = als_model.item_factors[singin_in_the_rain]

# 마이너스 값으로 전혀 연관성이 없음을 잘 보여줌

In [30]:
np.dot(donghyun_vector,singin_in_the_rain_vector)

-0.08396674

In [31]:
idx_to_movie = {v:k for k,v in movie_to_idx.items()}

def get_similar_movie(title: str):
    movie_id = movie_to_idx[title]
    similar_movie = als_model.similar_items(movie_id)
    similar_movie = [idx_to_movie[i[0]] for i in similar_movie]
    return similar_movie

# 입력한 영화와 비슷한 영화 리스트 도출

In [32]:
get_similar_movie('Star Is Born, A (1937)')

['Star Is Born, A (1937)',
 'Life of Émile Zola, The (1937)',
 'Rain (1932)',
 'Son of the Sheik, The (1926)',
 'Jamaica Inn (1939)',
 'Algiers (1938)',
 'Voyage of the Damned (1976)',
 'Murder! (1930)',
 'Red Dwarf, The (Le Nain rouge) (1998)',
 "Barney's Great Adventure (1998)"]

# 선호 영화를 기반으로한 영화 추천 진행

In [33]:
user = user_to_idx['donghyun']
movie_recommended = als_model.recommend(user, csr_data, N=20, filter_already_liked_items=True)
movie_recommended

[(110, 0.5422491),
 (330, 0.5153725),
 (121, 0.49608997),
 (10, 0.38439628),
 (50, 0.37702325),
 (9, 0.36129385),
 (2, 0.35419494),
 (479, 0.3538732),
 (384, 0.35336328),
 (141, 0.332785),
 (39, 0.32994974),
 (248, 0.3172221),
 (87, 0.31550863),
 (200, 0.31431127),
 (175, 0.2975533),
 (354, 0.2967982),
 (508, 0.2910764),
 (548, 0.28438428),
 (301, 0.27872598),
 (317, 0.2703116)]

# 추천 상위권의 유사도 값이 0.5 내외의 값을 보이는 만큼 상당히 추천이 잘 이루어지고 있음
# 실제로도 추천 리스트의 영화들이 선호 영화 리스트에 있는 영화와 비슷한 장르인 것을 확인

In [34]:
[idx_to_movie[i[0]] for i in movie_recommended]

['Groundhog Day (1993)',
 'Lion King, The (1994)',
 'Silence of the Lambs, The (1991)',
 'Beauty and the Beast (1991)',
 'Toy Story 2 (1999)',
 'Wizard of Oz, The (1939)',
 'My Fair Lady (1964)',
 'Contact (1997)',
 'Jerry Maguire (1996)',
 'Fugitive, The (1993)',
 'Apollo 13 (1995)',
 'Good Will Hunting (1997)',
 'Braveheart (1995)',
 'Terminator, The (1984)',
 'Men in Black (1997)',
 'West Side Story (1961)',
 'Ghost (1990)',
 'Fantasia (1940)',
 'Rushmore (1998)',
 'Twelve Monkeys (1995)']

# 추천이 상당히 잘 이루어진 것으로 보이지만 결국 전체 대중의 전반적인 경향성에 기반한 추천이라는 점에서 개개인의 성향을 반영한 추천이 아니라는 점에서 효과적인 추천이라고 할 수 있을지는 미지수
# 특히 장르적 기준이 아니라 정서적인 기준 같이 데이터화 하기 어려운 기준을 가진 사람들에게는 추천 시스템의 효용을 기대하기 어려움
# 개개인의 경향성도 데이터화하여 비슷한 경향성의 유저의 데이터를 중심으로 추천 시스템이 작동하게 만든다면 훨씬 더 유의미한 추천 시스템이 만들어질 것이라고 생각됨