# 데이터 불러오기

In [3]:
import pandas as pd
import os
# Read the ml-1m ratings file. Fields are separated by '::' and the column
# names are supplied explicitly.
home_dir = os.getenv('HOME')
rating_file_path = home_dir + '/aiffel/recommendata_iu/data/ml-1m/ratings.dat'
ratings_cols = ['user_id', 'movie_id', 'rating', 'timestamp']
ratings = pd.read_csv(
    rating_file_path,
    sep='::',
    names=ratings_cols,
    engine='python',  # multi-character separators require the python engine
    encoding="ISO-8859-1",
)
# Row count before any filtering, used later to report the retained ratio.
orginal_data_size = len(ratings)
ratings.head()

Unnamed: 0,user_id,movie_id,rating,timestamp
0,1,1193,5,978300760
1,1,661,3,978302109
2,1,914,3,978301968
3,1,3408,4,978300275
4,1,2355,5,978824291


In [4]:
# Keep only the ratings that are 3 or higher.
keep_mask = ratings['rating'] >= 3
ratings = ratings[keep_mask]
filtered_data_size = len(ratings)

# Report how many rows survived the filter, in absolute and relative terms.
print(f'orginal_data_size: {orginal_data_size}, filtered_data_size: {filtered_data_size}')
print(f'Ratio of Remaining Data is {filtered_data_size / orginal_data_size:.2%}')

orginal_data_size: 1000209, filtered_data_size: 836478
Ratio of Remaining Data is 83.63%


In [5]:
# Rename the 'rating' column to 'count'.
# Reassign rather than use inplace=True: `ratings` is the result of a boolean
# filter, and mutating such a slice in place can raise SettingWithCopyWarning.
ratings = ratings.rename(columns={'rating': 'count'})

In [6]:
ratings['count']

0          5
1          3
2          3
3          4
4          5
          ..
1000203    3
1000205    5
1000206    5
1000207    4
1000208    4
Name: count, Length: 836478, dtype: int64

In [7]:
# Read the movie metadata so that titles can be shown instead of ids.
movie_file_path = os.getenv('HOME') + '/aiffel/recommendata_iu/data/ml-1m/movies.dat'
cols = ['movie_id', 'title', 'genre']
movies = pd.read_csv(
    movie_file_path,
    sep='::',
    names=cols,
    engine='python',  # multi-character separators require the python engine
    encoding='ISO-8859-1',
)
movies

Unnamed: 0,movie_id,title,genre
0,1,Toy Story (1995),Animation|Children's|Comedy
1,2,Jumanji (1995),Adventure|Children's|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama
4,5,Father of the Bride Part II (1995),Comedy
...,...,...,...
3878,3948,Meet the Parents (2000),Comedy
3879,3949,Requiem for a Dream (2000),Drama
3880,3950,Tigerland (2000),Drama
3881,3951,Two Family House (2000),Drama


## 데이터 병합

In [8]:
# Attach the movie title to every rating row.
# A single inner join on 'movie_id' is all that is needed; an outer join would
# additionally keep movies without any rating as rows full of NaN.
data = pd.merge(ratings, movies, on='movie_id', how='inner')

# Only the user, the title and the (renamed) rating are used from here on.
data = data[["user_id", "title", "count"]]
data.sort_values("user_id")

Unnamed: 0,user_id,title,count
0,1,One Flew Over the Cuckoo's Nest (1975),5
31113,1,Antz (1998),4
31674,1,"Girl, Interrupted (1999)",4
32044,1,Hercules (1997),4
32415,1,Aladdin (1992),4
...,...,...,...
657728,6040,Vanya on 42nd Street (1994),4
393446,6040,M*A*S*H (1970),4
253075,6040,Big Night (1996),3
127665,6040,Shakespeare in Love (1998),3


-----------------------

# 내가 좋아하는 영화 데이터 입력

## 영화 종류와 평가인원 수는?

In [9]:
# Number of distinct titles in the metadata table.
unique_titles = movies["title"].unique()
print("영화 종류 : ", len(unique_titles))

# Number of distinct users left in the filtered ratings.
unique_users = ratings["user_id"].unique()
print("평가 인원 : ", len(unique_users))

영화 종류 :  3883
평가 인원 :  6039


### 좋아하는 영화 검색을 위해 title을 소문자로 변경

In [10]:
# Lower-case every title in `data` so that later lookups can use lower-case names.
data["title"] = data["title"].str.lower()
data

Unnamed: 0,user_id,title,count
0,1,one flew over the cuckoo's nest (1975),5
1,2,one flew over the cuckoo's nest (1975),5
2,12,one flew over the cuckoo's nest (1975),4
3,15,one flew over the cuckoo's nest (1975),4
4,17,one flew over the cuckoo's nest (1975),5
...,...,...,...
836473,5851,one little indian (1973),5
836474,5854,slaughterhouse (1987),4
836475,5854,"promise, the (versprechen, das) (1994)",3
836476,5938,"five wives, three secretaries and me (1998)",4


### 영화 검색

In [11]:
# Option 1: search by title substring.
# `movies` still holds the original mixed-case titles (only `data` was
# lower-cased), so the match has to be case-insensitive; regex=False treats the
# query as plain text. The result is printed explicitly because a notebook cell
# only displays its last expression.
search_result = movies[movies['title'].str.contains('toy story', case=False, regex=False)]
print(search_result)

# Option 2: show a random sample of ten titles taken from the rating rows.
import random
sampleList = random.sample(list(data["title"]), 10)
sampleList

['pulp fiction (1994)',
 'dirty dancing (1987)',
 'armageddon (1998)',
 'miracle on 34th street (1994)',
 'babe (1995)',
 'secret garden, the (1993)',
 'insider, the (1999)',
 'nurse betty (2000)',
 'peter pan (1953)',
 "boys don't cry (1999)"]

## 모델 검증을 위한 사용자 초기 정보 세팅 (내 기호에 맞춰 데이터 입력)

In [12]:
my_favorite = ["toy story (1995)", "toy story 2 (1999)", "truman show, the (1998)", "aliens (1986)"]

# One row per favourite title, each scored 5 under the synthetic user 'wooyong'.
# Lengths are derived from the list so that adding a title needs no other edit.
my_favorite_list = pd.DataFrame({
    'user_id': ["wooyong"] * len(my_favorite),
    'title': my_favorite,
    'count': [5] * len(my_favorite),
})

# Add the rows only when 'wooyong' is not already present, so that re-running
# the cell does not duplicate them.
if not data['user_id'].isin(["wooyong"]).any():
    # DataFrame.append was deprecated in pandas 1.4 and removed in 2.0.
    # pd.concat produces the same frame (the original row labels are kept).
    data = pd.concat([data, my_favorite_list])

data.tail(10)  # check that the rows were added


Unnamed: 0,user_id,title,count
836472,5717,master ninja i (1984),4
836473,5851,one little indian (1973),5
836474,5854,slaughterhouse (1987),4
836475,5854,"promise, the (versprechen, das) (1994)",3
836476,5938,"five wives, three secretaries and me (1998)",4
836477,5948,identification of a woman (identificazione di ...,5
0,wooyong,toy story (1995),5
1,wooyong,toy story 2 (1999),5
2,wooyong,"truman show, the (1998)",5
3,wooyong,aliens (1986),5


In [13]:
data

Unnamed: 0,user_id,title,count
0,1,one flew over the cuckoo's nest (1975),5
1,2,one flew over the cuckoo's nest (1975),5
2,12,one flew over the cuckoo's nest (1975),4
3,15,one flew over the cuckoo's nest (1975),4
4,17,one flew over the cuckoo's nest (1975),5
...,...,...,...
836477,5948,identification of a woman (identificazione di ...,5
0,wooyong,toy story (1995),5
1,wooyong,toy story 2 (1999),5
2,wooyong,"truman show, the (1998)",5



# User_id, 영화제목 토크나이징

In [14]:
# Map every distinct user id to a consecutive integer index.
user_unique = data['user_id'].unique()
user_to_idx = {user: idx for idx, user in enumerate(user_unique)}

# Do the same for every distinct movie title.
title_unique = data['title'].unique()
title_to_idx = {title: idx for idx, title in enumerate(title_unique)}

print(title_to_idx["toy story (1995)"])

40


* 'toy story (1995)'는 40이라는 인덱스를 부여받았다.

In [15]:
# Translate user ids to integer indices; rows that fail to map are dropped,
# so a length mismatch means some id was not found in the dictionary.
temp_user_data = data['user_id'].map(user_to_idx.get).dropna()
user_rows_ok = len(temp_user_data) == len(data)
if user_rows_ok:
    print('user_id column indexing OK!!')
    # Replace the column with the indexed Series.
    data['user_id'] = temp_user_data
else:
    print('user_id column indexing Fail!!')


# Same procedure for the title column.
temp_title_data = data['title'].map(title_to_idx.get).dropna()
title_rows_ok = len(temp_title_data) == len(data)
if title_rows_ok:
    print('title column indexing OK!!')
    data['title'] = temp_title_data
else:
    print('title column indexing Fail!!')

data

user_id column indexing OK!!
title column indexing OK!!


Unnamed: 0,user_id,title,count
0,0,0,5
1,1,0,5
2,2,0,4
3,3,0,4
4,4,0,5
...,...,...,...
836477,1648,3627,5
0,6039,40,5
1,6039,50,5
2,6039,385,5


In [16]:
# Scale the scores by 1/5 (a score of 5 becomes 1.0, as the output below shows).
# NOTE(review): implicit's ALS presumably interprets these values as confidence
# weights — confirm that shrinking them to at most 1.0 is intended.
data["count"] *= (1/5)
data

Unnamed: 0,user_id,title,count
0,0,0,1.0
1,1,0,1.0
2,2,0,0.8
3,3,0,0.8
4,4,0,1.0
...,...,...,...
836477,1648,3627,1.0
0,6039,40,1.0
1,6039,50,1.0
2,6039,385,1.0


# CSR matrix로 변환

In [17]:
from scipy.sparse import csr_matrix





# Matrix dimensions: one row per distinct user, one column per distinct title
# (the recorded output below shows a 6040 x 3628 matrix).
num_user = data['user_id'].nunique()
num_title = data['title'].nunique()

# Build the (user x title) sparse matrix whose entries are the scaled scores.
values = data["count"]
row_idx, col_idx = data.user_id, data.title
csr_data = csr_matrix((values, (row_idx, col_idx)), shape=(num_user, num_title))
csr_data



<6040x3628 sparse matrix of type '<class 'numpy.float64'>'
	with 836482 stored elements in Compressed Sparse Row format>

In [18]:
from implicit.als import AlternatingLeastSquares
import os
import numpy as np

# Settings recommended by the implicit library; unrelated to the lesson content.
# NOTE(review): thread-count variables like these are normally read when the
# BLAS/OpenMP runtime is loaded, and numpy is imported just above — confirm
# that setting them at this point still has an effect.
os.environ['OPENBLAS_NUM_THREADS']='1'
os.environ['KMP_DUPLICATE_LIB_OK']='True'
os.environ['MKL_NUM_THREADS']='1'

In [19]:
# Declare the implicit AlternatingLeastSquares model.
# NOTE(review): factors=1000 is large relative to the 6040 x 3628 matrix this
# notebook builds — confirm it is intended; it may relate to the overfitting
# question raised further down.
als_model = AlternatingLeastSquares(
    factors=1000,
    regularization=0.01,
    use_gpu=False,
    iterations=30,
    dtype=np.float32,
)

In [20]:
# The ALS model takes an (item x user) matrix as input, so transpose the
# (user x item) matrix built earlier.
csr_data_transpose = csr_data.transpose()
csr_data_transpose

<3628x6040 sparse matrix of type '<class 'numpy.float64'>'
	with 836482 stored elements in Compressed Sparse Column format>

In [21]:
# Train the model on the transposed (item x user) matrix.
als_model.fit(csr_data_transpose)

  0%|          | 0/30 [00:00<?, ?it/s]

In [22]:
# Integer indices of the user and of the movie ...
wooyong = user_to_idx["wooyong"]
toy_story2 = title_to_idx['toy story 2 (1999)']

# ... and the factor vectors the trained model holds for them.
wooyong_vector = als_model.user_factors[wooyong]
toy_sotry2_vector = als_model.item_factors[toy_story2]

print('슝=3')

슝=3


In [23]:
wooyong_vector

array([-4.18912135e-02,  5.55646606e-03,  9.22156498e-02,  6.04223683e-02,
        2.00013459e-01,  8.61355737e-02,  8.69912095e-03,  5.92524335e-02,
        4.29167412e-02, -2.76182033e-02,  8.46340731e-02, -6.66107088e-02,
       -1.51178837e-01, -8.51858482e-02, -3.47199589e-02,  1.35271728e-01,
        7.61293247e-02,  1.52314799e-02, -2.70121489e-02, -1.43137529e-01,
        7.53935948e-02, -6.31735101e-02, -7.61426315e-02,  8.96581449e-03,
        1.76085159e-02,  5.23868836e-02, -1.29359573e-01,  1.10394778e-02,
       -7.40859238e-03,  8.09728205e-02, -1.78860664e-01, -6.11130195e-03,
       -1.27969265e-01, -8.42344463e-02, -3.85021903e-02,  1.46228939e-01,
        4.08879779e-02,  5.29193096e-02,  9.55437422e-02,  1.50990725e-01,
        1.15488164e-01,  4.06403355e-02, -7.71744270e-03, -8.47998261e-02,
       -5.63782863e-02,  8.73709768e-02, -3.17583792e-02,  7.45781437e-02,
        9.62241068e-02, -9.95896570e-03, -2.06969958e-02,  6.23912625e-02,
        6.35988042e-02,  

In [24]:
toy_sotry2_vector

array([-1.23765776e-02,  3.81259322e-02,  3.85458060e-02,  2.49334108e-02,
        3.01065072e-02,  4.29433919e-02,  6.13730634e-03,  1.17951864e-02,
       -1.29598882e-02,  8.83598253e-03,  8.01778305e-03, -1.57687571e-02,
       -1.14244083e-02, -1.15042673e-02, -3.63702327e-02,  6.41507879e-02,
        1.83773749e-02,  3.69887762e-02,  4.54845838e-02, -2.98144612e-02,
        4.96622026e-02, -3.14281285e-02,  3.02458312e-02, -1.99681669e-02,
        3.42683233e-02,  2.92183310e-02, -2.22831927e-02, -8.01656302e-03,
       -8.65120499e-04,  5.38711734e-02, -1.86679419e-02,  6.00914052e-03,
       -6.99203927e-03, -1.64639670e-02, -9.96585190e-03,  5.61745241e-02,
        4.35943604e-02, -2.25567929e-02,  3.30814384e-02,  6.68191984e-02,
       -2.32137311e-02,  3.55173349e-02,  6.23424351e-02, -1.89306680e-02,
       -3.68604213e-02,  3.22782472e-02, -1.42511930e-02,  7.42624374e-03,
        4.50452743e-03,  1.59334410e-02,  8.00636690e-03,  1.94630120e-02,
        1.64373759e-02, -

In [25]:
# Dot product of the user's factor vector and the movie's factor vector.
print("토이스토리2 : ",np.dot(wooyong_vector, toy_sotry2_vector))

토이스토리2 :  0.9740257


### 오버피팅인가?

In [36]:
# The user's factor vector is the same for every comparison below.
wooyong = user_to_idx["wooyong"]
wooyong_vector = als_model.user_factors[wooyong]

# Score each of the remaining favourite titles against the user vector.
toy_story = title_to_idx['toy story (1995)']
toy_sotry_vector = als_model.item_factors[toy_story]
print("토이스토리1 : ",np.dot(wooyong_vector, toy_sotry_vector))

truman = title_to_idx['truman show, the (1998)']
truman_vector = als_model.item_factors[truman]
print("트루먼쇼    : ",np.dot(wooyong_vector, truman_vector))

aliens = title_to_idx['aliens (1986)']
aliens_vector = als_model.item_factors[aliens]
print("에일리언    : ",np.dot(wooyong_vector, aliens_vector))



토이스토리1 :  0.96307224
트루먼쇼    :  0.87785393
에일리언    :  0.9435483


* 좋아하는 영화 리스트로 넣은 영화의 np.dot값을 모두 구한 결과 모두 0.87이상의 값이 나왔다.
 - 이것이 오버피팅인지 아닌지 알 수 없고, 페이지 최하단에 있는 "나에게 추천하는 영화 리스트" 역시 오버피팅에 의한 결과인지는 모르겠다.
 - 하지만 추천시스템의 매력이 이런게 아닌가 싶다

In [27]:
# Score a title that is not in the favourites list, for comparison.
promise_title = 'promise, the (versprechen, das) (1994)'
promise = title_to_idx[promise_title]
promise_vector = als_model.item_factors[promise]
np.dot(wooyong_vector, promise_vector)

-0.001130208

In [28]:
# Ask the model for the 15 items most similar to one favourite movie.
# The recorded output below shows (item index, score) pairs.
favorite_movie = 'toy story 2 (1999)'
movie_id = title_to_idx[favorite_movie]
similar_movie = als_model.similar_items(movie_id, N=15)
similar_movie

[(50, 1.0000001),
 (40, 0.23167254),
 (2137, 0.20654005),
 (1056, 0.19633244),
 (2749, 0.19423422),
 (2144, 0.19248906),
 (2970, 0.1909466),
 (3543, 0.1907865),
 (3453, 0.19062735),
 (1246, 0.19047646),
 (3619, 0.19003619),
 (3201, 0.18925205),
 (1983, 0.18920991),
 (3395, 0.18913202),
 (3437, 0.18908928)]

In [29]:
# Invert title_to_idx to obtain an index -> title lookup.
idx_to_title = {idx: title for title, idx in title_to_idx.items()}
# Each entry of similar_movie starts with the item index; keep only the titles.
[idx_to_title[pair[0]] for pair in similar_movie]

['toy story 2 (1999)',
 'toy story (1995)',
 'tigger movie, the (2000)',
 'vie est belle, la (life is rosey) (1987)',
 'careful (1992)',
 'amityville curse, the (1990)',
 'digimon: the movie (2000)',
 'small wonders (1996)',
 'woo (1998)',
 'homeward bound ii: lost in san francisco (1996)',
 'smoking/no smoking (1993)',
 'allnighter, the (1987)',
 'air bud: golden receiver (1998)',
 'castaway cowboy, the (1974)',
 'assassination (1987)']

## 비슷한 유형의 영화 추천 함수

In [30]:
def get_similar_movie(movie_name: str, n: int = 10) -> list:
    """Return the titles of the movies most similar to *movie_name*.

    Args:
        movie_name: Movie title. An exact key of ``title_to_idx`` is used
            as-is; otherwise the title is stripped and lower-cased before the
            lookup, because the indexed titles were lower-cased.
        n: Number of similar items to request from the model.

    Returns:
        A list of titles in the order returned by
        ``als_model.similar_items``.

    Raises:
        KeyError: If the title is not present in ``title_to_idx``.
    """
    key = movie_name if movie_name in title_to_idx else movie_name.strip().lower()
    if key not in title_to_idx:
        raise KeyError(f"unknown movie title: {movie_name!r}")
    movie_id = title_to_idx[key]
    similar_movie = als_model.similar_items(movie_id, N=n)
    # Each result entry starts with the item index; translate it back to a title.
    return [idx_to_title[item[0]] for item in similar_movie]

print("슝=3")

슝=3


In [31]:
get_similar_movie("antz (1998)")

['antz (1998)',
 'perfect blue (1997)',
 'haunted world of edward d. wood jr., the (1995)',
 'meet the deedles (1998)',
 'lassie (1994)',
 "doug's 1st movie (1999)",
 'anna (1996)',
 'celestial clockwork (1994)',
 'buck and the preacher (1972)',
 'mamma roma (1962)']

## 나에게 추천하는 영화

In [32]:
# Look up the integer index of the synthetic user.
user = user_to_idx['wooyong']
# recommend() takes the (user x item) CSR matrix, not the transposed one.
movie_recommended = als_model.recommend(user, csr_data, N=20, filter_already_liked_items=True)
movie_recommended

[(428, 0.0321986),
 (1725, 0.029286575),
 (481, 0.028376864),
 (516, 0.027806532),
 (2200, 0.027507983),
 (1646, 0.026931664),
 (1348, 0.026586518),
 (2054, 0.026404718),
 (2227, 0.02631785),
 (491, 0.026263997),
 (374, 0.026193928),
 (982, 0.025991287),
 (1182, 0.025893766),
 (605, 0.025450805),
 (2290, 0.025352744),
 (1861, 0.025163032),
 (1496, 0.02489403),
 (2126, 0.024768522),
 (1329, 0.024635559),
 (1701, 0.024416953)]

In [33]:
# Translate the recommended item indices back into titles.
[idx_to_title[rec[0]] for rec in movie_recommended]

['alive (1993)',
 'beach, the (2000)',
 'kolya (1996)',
 'sabrina (1995)',
 'bottle rocket (1996)',
 'yellow submarine (1968)',
 'creature comforts (1990)',
 'bad boys (1995)',
 'end of the affair, the (1999)',
 'bronx tale, a (1993)',
 'exorcist, the (1973)',
 'in the name of the father (1993)',
 'red dawn (1984)',
 'cool runnings (1993)',
 'ninth gate, the (2000)',
 'pawnbroker, the (1965)',
 'peacemaker, the (1997)',
 'rounders (1998)',
 'modern times (1936)',
 'my own private idaho (1991)']

# 고찰
- CSR 매트릭스(희소 행렬)에 대한 개념 이해가 어려웠을 뿐, 흥미로운 주제였다.
- 내적 값은 0.97이 나왔는데도 왜 추천 영화는 마음에 안 드는 영화투성이인지 모르겠다.
- 추천시스템의 맹점인지 epoch수를 줄여서 0.25가 나올때는 제법 마음에드는 영화가 많았다.
- [정답은 이거다!] 라는게 없으니 애매한 분야인 것 같다.