# 1- Import Libraries

In [1]:
import numpy as np
import scipy
import implicit

# Report the installed version of each library imported above, one per line.
for _module in (np, scipy, implicit):
    print(_module.__version__)

1.26.3
1.11.4
0.7.2


# 2- Loading Dataset

In [2]:
import os
import pandas as pd

# Directory that holds ratings.dat. It defaults to the original local path, but
# can be overridden with the RECOMMEND_DATA_DIR environment variable so the
# notebook is not tied to a single user's home directory.
ratings_data_dir = os.getenv('RECOMMEND_DATA_DIR', '/Users/kenny_jung/aiffel/data/recommendata_iu')
rating_file_path = os.path.join(ratings_data_dir, 'ratings.dat')

# Column names are supplied explicitly through `names`.
ratings_cols = ['user_id', 'movie_id', 'ratings', 'timestamp']

# '::' is a multi-character separator, so the Python parsing engine is requested.
ratings = pd.read_csv(
    rating_file_path,
    sep='::',
    names=ratings_cols,
    engine='python',
    encoding="ISO-8859-1",
)

# Row count before any filtering. The variable name (including its spelling)
# is kept unchanged because later cells reference it.
orginal_data_size = len(ratings)
ratings.head()

Unnamed: 0,user_id,movie_id,ratings,timestamp
0,1,1193,5,978300760
1,1,661,3,978302109
2,1,914,3,978301968
3,1,3408,4,978300275
4,1,2355,5,978824291


In [6]:
# Read the movie metadata so that movie titles can be looked up by movie_id.
# The directory defaults to the original local path, but can be overridden with
# the RECOMMEND_DATA_DIR environment variable so the notebook is not tied to a
# single user's home directory. (`os` and `pd` are imported in the ratings
# loading cell.)
movies_data_dir = os.getenv('RECOMMEND_DATA_DIR', '/Users/kenny_jung/aiffel/data/recommendata_iu')
movie_file_path = os.path.join(movies_data_dir, 'movies.dat')

# Column names are supplied explicitly through `names`.
cols = ['movie_id', 'title', 'genre']

# '::' is a multi-character separator, so the Python parsing engine is requested.
movies = pd.read_csv(
    movie_file_path,
    sep='::',
    names=cols,
    engine='python',
    encoding='ISO-8859-1',
)
movies.head()

Unnamed: 0,movie_id,title,genre
0,1,Toy Story (1995),Animation|Children's|Comedy
1,2,Jumanji (1995),Adventure|Children's|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama
4,5,Father of the Bride Part II (1995),Comedy


# 3- Preprocessing

In [3]:
# Keep only the rows whose rating is 3 or higher.
ratings = ratings.loc[ratings['ratings'].ge(3)]
filtered_data_size = ratings.shape[0]

# Report how many rows survived the filter, as absolute counts and as a ratio.
print(f'orginal_data_size: {orginal_data_size}, filtered_data_size: {filtered_data_size}')
print(f'Ratio of Remaining Data is {filtered_data_size / orginal_data_size:.2%}')

orginal_data_size: 1000209, filtered_data_size: 836478
Ratio of Remaining Data is 83.63%


In [4]:
# Rename the 'ratings' column to 'counts'.
# The result is reassigned instead of using inplace=True: `ratings` was produced
# by boolean filtering, and mutating such a frame in place can trigger a
# SettingWithCopyWarning in some pandas versions.
ratings = ratings.rename(columns={'ratings': 'counts'})

In [5]:
# Display the 'counts' column of the ratings frame.
ratings.loc[:, 'counts']

0          5
1          3
2          3
3          4
4          5
          ..
1000203    3
1000205    5
1000206    5
1000207    4
1000208    4
Name: counts, Length: 836478, dtype: int64

In [7]:
# Display the ratings frame (pandas abbreviates the middle rows of a long frame).
ratings

Unnamed: 0,user_id,movie_id,counts,timestamp
0,1,1193,5,978300760
1,1,661,3,978302109
2,1,914,3,978301968
3,1,3408,4,978300275
4,1,2355,5,978824291
...,...,...,...,...
1000203,6040,1090,3,956715518
1000205,6040,1094,5,956704887
1000206,6040,562,5,956704746
1000207,6040,1096,4,956715648


# 4- EDA

In [8]:
# Number of distinct users in the ratings frame.
ratings.loc[:, 'user_id'].nunique()

6039

In [9]:
# Number of distinct movies in the ratings frame.
ratings.loc[:, 'movie_id'].nunique()

3628

In [18]:
# Most popular movies: the 30 movie_ids with the most rows in `ratings`.
movie_count = ratings.groupby('movie_id')['user_id'].count()

# The count column keeps the name 'user_id', inherited from the counted column.
top30movie = movie_count.sort_values(ascending=False).head(30).to_frame()

# Build the movie_id -> title lookup once instead of re-scanning `movies` for
# every id. drop_duplicates keeps the first title per movie_id, matching the
# previous `.values[0]` behaviour. A movie_id with no entry in `movies` now
# yields NaN instead of raising IndexError.
title_by_movie_id = movies.drop_duplicates(subset='movie_id').set_index('movie_id')['title']
top30movie['movie_title'] = top30movie.index.map(title_by_movie_id)

print(type(top30movie))
top30movie

<class 'pandas.core.frame.DataFrame'>


Unnamed: 0_level_0,user_id,movie_title
movie_id,Unnamed: 1_level_1,Unnamed: 2_level_1
2858,3211,American Beauty (1999)
260,2910,Star Wars: Episode IV - A New Hope (1977)
1196,2885,Star Wars: Episode V - The Empire Strikes Back...
1210,2716,Star Wars: Episode VI - Return of the Jedi (1983)
2028,2561,Saving Private Ryan (1998)
589,2509,Terminator 2: Judgment Day (1991)
593,2498,"Silence of the Lambs, The (1991)"
1198,2473,Raiders of the Lost Ark (1981)
1270,2460,Back to the Future (1985)
2571,2434,"Matrix, The (1999)"
