# Anime dataset 불러오기
* anime.csv : 애니메이션의 title, genre 등 item에 대한 info가 담긴 data
* rating.csv : 사용자(user)가 애니메이션(item)에 매긴 rating이 담긴 data

In [1]:
from google.colab import files
# Upload the dataset files into the Colab runtime via google.colab's upload helper;
# the result is kept in `uploaded`.
uploaded = files.upload()

Saving anime.csv to anime.csv
Saving rating.csv to rating.csv


In [119]:
import pandas as pd
import numpy as np
import scipy as sp
from sklearn.metrics.pairwise import cosine_similarity
import operator
%matplotlib inline

# Data Preprocessing (데이터 전처리)
* dataset 구조 파악
* 두 데이터셋을 하나로 병합
* pivot table 생성

In [120]:
# Load both CSV files into DataFrames: `anime` holds per-title metadata,
# `rating` holds per-user ratings of titles.
anime = pd.read_csv('anime.csv')
rating = pd.read_csv('rating.csv')

In [121]:
# Inspect the columns of the anime table (first 3 rows).
anime.head(3)

Unnamed: 0,anime_id,name,genre,type,episodes,rating,members
0,32281,Kimi no Na wa.,"Drama, Romance, School, Supernatural",Movie,1,9.37,200630
1,5114,Fullmetal Alchemist: Brotherhood,"Action, Adventure, Drama, Fantasy, Magic, Mili...",TV,64,9.26,793665
2,28977,Gintama°,"Action, Comedy, Historical, Parody, Samurai, S...",TV,51,9.25,114262


In [122]:
# Keep only titles whose `type` is 'TV'.
anime_tv = anime[anime['type']=='TV']    
anime_tv.head()

Unnamed: 0,anime_id,name,genre,type,episodes,rating,members
1,5114,Fullmetal Alchemist: Brotherhood,"Action, Adventure, Drama, Fantasy, Magic, Mili...",TV,64,9.26,793665
2,28977,Gintama°,"Action, Comedy, Historical, Parody, Samurai, S...",TV,51,9.25,114262
3,9253,Steins;Gate,"Sci-Fi, Thriller",TV,24,9.17,673572
4,9969,Gintama&#039;,"Action, Comedy, Historical, Parody, Samurai, S...",TV,51,9.16,151266
5,32935,Haikyuu!!: Karasuno Koukou VS Shiratorizawa Ga...,"Comedy, Drama, School, Shounen, Sports",TV,10,9.15,93351


In [123]:
# Summary statistics of the `members` column for TV anime.
# NOTE(review): the original comment treats `members` as the number of ratings
# per title -- confirm what this column actually counts.
anime_tv.members.describe()

count    3.787000e+03
mean     4.268366e+04
std      8.912101e+04
min      1.200000e+01
25%      4.830000e+02
50%      5.947000e+03
75%      4.424600e+04
max      1.013917e+06
Name: members, dtype: float64

In [124]:
# Keep only the top 50% of TV anime by `members`.
# The literal 5947 is the 50% (median) value reported by describe() in the
# previous cell; it must be updated by hand if the dataset changes.
anime_tv = anime_tv[anime_tv.members >= 5947]    
anime_tv.shape

(1894, 7)

In [125]:
# Inspect the columns of the rating table (first 3 rows).
rating.head(3)

Unnamed: 0,user_id,anime_id,rating
0,1,20,-1
1,1,24,-1
2,1,79,-1


In [126]:
# Convert the sentinel rating -1 to NaN.
# Only the `rating` column is touched so that user_id / anime_id can never be
# altered, and a plain value replacement is used: the sentinel is an integer,
# so regex matching does not apply.
rating['rating'] = rating['rating'].replace(-1, np.nan)
rating.head(3)

Unnamed: 0,user_id,anime_id,rating
0,1,20,
1,1,24,
2,1,79,


In [127]:
# Join every user rating with the metadata of the (filtered) TV anime it refers
# to, matching on anime_id. Both tables carry a `rating` column: the one coming
# from the rating table gets the '_user' suffix and is then renamed to
# `user_rating`. Only the three columns needed downstream are kept.
merged = (
    rating.merge(anime_tv, on='anime_id', suffixes=('_user', ''))
    .rename(columns={'rating_user': 'user_rating'})
    .loc[:, ['user_id', 'name', 'user_rating']]
)

In [128]:
# Work on a subset to keep the computation tractable: only rows whose
# user_id is at most 20000 (an id cutoff, not a count of distinct users).
merged_sub = merged[merged.user_id <= 20000]

In [130]:
# Preview the subset: one row per (user_id, name) with the user's rating.
merged_sub.head()

Unnamed: 0,user_id,name,user_rating
0,1,Naruto,
1,3,Naruto,8.0
2,5,Naruto,6.0
3,6,Naruto,
4,10,Naruto,


In [131]:
# Build the pivot table: one row per user_id, one column per anime name,
# cells hold user_rating (NaN where the user has no rating for that title).
# NOTE(review): pivot_table aggregates duplicates with its default aggfunc
# (mean per pandas docs) if a user has several rows for the same name.
piv = merged_sub.pivot_table(index = ['user_id'], columns = ['name'], values = 'user_rating')
print(piv.shape)
piv.head()

(18716, 1795)


name,.hack//Roots,.hack//Sign,.hack//Tasogare no Udewa Densetsu,009-1,07-Ghost,11eyes,12-sai.: Chicchana Mune no Tokimeki,30-sai no Hoken Taiiku,91 Days,A-Channel,...,Zoku Natsume Yuujinchou,Zoku Sayonara Zetsubou Sensei,Zombie-Loan,"Zone of the Enders: Dolores, I",ef: A Tale of Melodies.,ef: A Tale of Memories.,iDOLM@STER Xenoglossia,s.CRY.ed,xxxHOLiC,xxxHOLiC Kei
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,,,,,,,,,,,...,,,,,,,,,,
2,,,,,,,,,,,...,,,,,,,,,,
3,,,,,,,,,,,...,,,7.0,,,,,,,
5,,,,,,,,,,,...,,7.0,,,,,,,2.0,
7,,,,,,,,,,,...,,,,,,,,,,


In [104]:
# Normalisation: mean-centre each user's ratings and scale them by that user's
# rating range (max - min). Computed with vectorised row-wise arithmetic
# instead of a per-row Python lambda; NaNs are skipped by mean/max/min.
user_mean = piv.mean(axis=1)
user_range = piv.max(axis=1) - piv.min(axis=1)
piv_norm = piv.sub(user_mean, axis=0).div(user_range, axis=0)
# Unrated cells become 0; so do users whose ratings are all identical,
# because their range is 0 and 0/0 evaluates to NaN.
piv_norm = piv_norm.fillna(0)
# Transpose so that rows are anime names and columns are user_ids.
piv_norm = piv_norm.T
# Drop user columns that are entirely zero (users carrying no usable signal).
piv_norm = piv_norm.loc[:, (piv_norm != 0).any(axis=0)]

In [132]:
# Convert the normalised matrix (rows: anime names, columns: user_ids) to a
# SciPy CSR sparse matrix.
# NOTE(review): `sp.sparse` relies on the scipy.sparse submodule being loaded;
# the sklearn import above presumably takes care of that -- confirm.
piv_sparse = sp.sparse.csr_matrix(piv_norm.values)

In [133]:
# User-user similarity matrix: cosine similarity between the rows of the
# transposed matrix, i.e. between users.
user_similarity = cosine_similarity(piv_sparse.T)

# Item-item similarity matrix: cosine similarity between the rows of
# piv_sparse, i.e. between anime titles.
item_similarity = cosine_similarity(piv_sparse)

In [134]:
# Wrap the similarity arrays in DataFrames so they can be looked up by label:
# user_sim_df is labelled by user_id (the columns of piv_norm),
# item_sim_df by anime name (the index of piv_norm).
user_sim_df = pd.DataFrame(user_similarity, index = piv_norm.columns, columns = piv_norm.columns)
item_sim_df = pd.DataFrame(item_similarity, index = piv_norm.index, columns = piv_norm.index)

In [135]:
# Preview the user-user similarity matrix.
user_sim_df.head()

user_id,3,5,7,8,10,11,12,14,16,17,...,19989,19990,19992,19993,19994,19995,19996,19997,19998,20000
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
3,1.0,0.064821,0.167627,0.020365,0.125977,0.04523309,0.108629,0.005399,0.077388,0.248077,...,0.101602,0.155584,0.0,0.074104,0.034628,0.083195,-0.052886,-0.016283,0.003794,-0.020254
5,0.064821,1.0,0.073114,0.016531,0.03157,0.005464404,0.094435,0.107404,0.007147,0.131081,...,0.057388,-0.024971,0.011666,0.083377,0.117525,0.048341,-0.026435,-0.029484,0.040754,-0.005498
7,0.167627,0.073114,1.0,-0.013548,0.020401,-0.01515745,0.061591,0.122852,0.048526,0.165479,...,-0.035113,0.052594,-0.064844,0.051428,0.068708,0.028304,0.0,0.0,0.046984,0.01866
8,0.020365,0.016531,-0.013548,1.0,-0.052705,0.01892828,0.002786,-0.055035,0.0,0.001329,...,0.019713,0.0,0.0,0.034493,0.007308,-0.011986,0.0,0.0,0.046962,-0.069722
10,0.125977,0.03157,0.020401,-0.052705,1.0,-1.665335e-16,0.008811,0.044415,0.0,0.071618,...,0.19008,0.21398,0.0,0.0,0.113134,0.140668,0.0,0.0,0.0,0.0


In [109]:
# Given a user ID, print the users whose taste is most similar to that user.
def top_users(user, n=10, sim_df=None):
    """Print the ``n`` users most similar to ``user``.

    Args:
        user: Label of the user to look up in the similarity matrix.
        n: Number of similar users to print (default 10).
        sim_df: Square user-by-user similarity DataFrame. Defaults to the
            notebook-level ``user_sim_df``.

    Returns:
        A message string if ``user`` is not present in the similarity matrix;
        otherwise ``None`` (the result is printed).
    """
    if sim_df is None:
        sim_df = user_sim_df

    if user not in sim_df.columns:
        return 'ID {} 유저의 정보가 없습니다.'.format(user)

    print('ID {}와 가장 유사한 유저는 다음과 같습니다.:\n'.format(user))
    # Remove the queried user explicitly instead of assuming it sorts first:
    # another user with similarity 1.0 could otherwise push it into the
    # results. A single sort yields both the neighbour labels and their scores.
    neighbours = (
        sim_df[user]
        .drop(labels=user, errors='ignore')
        .sort_values(ascending=False)
        .head(n)
    )
    for other_id, sim in neighbours.items():
        print('ID {0}, 유사도: {1:.2f}'.format(other_id, sim))

In [110]:
# Example: list the users most similar to user 20000.
top_users(20000)

ID 20000와 가장 유사한 유저는 다음과 같습니다.:

ID 11085, 유사도: 0.86
ID 2560, 유사도: 0.65
ID 17099, 유사도: 0.52
ID 16914, 유사도: 0.45
ID 1368, 유사도: 0.44
ID 6936, 유사도: 0.41
ID 2069, 유사도: 0.39
ID 5639, 유사도: 0.39
ID 19042, 유사도: 0.37
ID 12883, 유사도: 0.37


In [136]:
# Preview the item-item (anime title) similarity matrix.
item_sim_df.head()

name,.hack//Roots,.hack//Sign,.hack//Tasogare no Udewa Densetsu,009-1,07-Ghost,11eyes,12-sai.: Chicchana Mune no Tokimeki,30-sai no Hoken Taiiku,91 Days,A-Channel,...,Zoku Natsume Yuujinchou,Zoku Sayonara Zetsubou Sensei,Zombie-Loan,"Zone of the Enders: Dolores, I",ef: A Tale of Melodies.,ef: A Tale of Memories.,iDOLM@STER Xenoglossia,s.CRY.ed,xxxHOLiC,xxxHOLiC Kei
name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
.hack//Roots,1.0,0.208888,0.294037,0.013417,0.035021,0.042777,0.017516,0.022737,-0.010055,0.014429,...,-0.052543,-0.015453,0.010984,-0.005756,-0.02425,-0.016565,0.013604,0.021661,-0.026765,-0.025869
.hack//Sign,0.208888,1.0,0.168604,0.025281,0.042555,0.040025,0.002459,0.034466,-0.014535,0.033941,...,-0.031616,-0.01956,0.035092,0.014673,-0.023841,-0.006569,-0.012319,0.007937,-0.020464,-0.026013
.hack//Tasogare no Udewa Densetsu,0.294037,0.168604,1.0,0.037288,0.020993,0.038395,-0.000361,0.002783,0.00096,0.016956,...,-0.035796,-0.023923,0.029486,0.019901,-0.01684,-0.023182,0.001717,0.010111,-0.045788,-0.03394
009-1,0.013417,0.025281,0.037288,1.0,0.006068,0.031984,0.029974,0.050457,-0.034925,0.052102,...,-0.003461,0.011822,-0.005609,0.003363,-0.006517,-0.005316,-0.002141,-0.010789,0.013313,0.007863
07-Ghost,0.035021,0.042555,0.020993,0.006068,1.0,0.086233,0.009937,0.044858,-0.010993,0.014321,...,-0.060371,-0.012864,0.041818,0.00202,-0.026509,-0.025946,-0.001227,0.022191,-0.017103,-0.014243


In [137]:
# Given an anime title, print the titles most similar to it.
def top_items(anime, n=10, sim_df=None):
    """Print the ``n`` anime titles most similar to ``anime``.

    Args:
        anime: Title to look up in the similarity matrix.
        n: Number of similar titles to print (default 10).
        sim_df: Square title-by-title similarity DataFrame. Defaults to the
            notebook-level ``item_sim_df``.

    Returns:
        A message string if ``anime`` is not present in the similarity matrix;
        otherwise ``None`` (the result is printed).
    """
    if sim_df is None:
        sim_df = item_sim_df

    # Mirror top_users: report an unknown title instead of raising KeyError.
    if anime not in sim_df.columns:
        return '애니메이션 \'{}\'의 정보가 없습니다.'.format(anime)

    print('애니메이션 \'{}\'와 가장 유사한 애니메이션은 다음과 같습니다.:\n'.format(anime))
    # Remove the queried title explicitly rather than assuming it sorts first.
    ranked = (
        sim_df[anime]
        .drop(labels=anime, errors='ignore')
        .sort_values(ascending=False)
        .head(n)
    )
    for rank, title in enumerate(ranked.index, start=1):
        print('{}번 : {}'.format(rank, title))

In [142]:
# Example: list the titles most similar to 'Naruto'.
top_items('Naruto')

애니메이션 'Naruto'와 가장 유사한 애니메이션은 다음과 같습니다.:

1번 : Bleach
2번 : Dragon Ball GT
3번 : Yu☆Gi☆Oh! Duel Monsters
4번 : Dragon Ball Z
5번 : Fairy Tail
6번 : Pokemon Advanced Generation
7번 : Ao no Exorcist
8번 : D.Gray-man
9번 : Highschool of the Dead
10번 : Dragon Ball
