# Library

In [90]:
# 데이터 처리 및 분석을 위한 라이브러리
import numpy as np  # 수학 연산 및 배열 연산을 위한 라이브러리
import pandas as pd  # 데이터 프레임을 다루기 위한 라이브러리

# 데이터 시각화를 위한 라이브러리
import matplotlib.pyplot as plt  # 그래프 및 차트 그리기
import seaborn as sns  # 시각화 기능을 향상시키는 라이브러리

# 머신러닝 관련 라이브러리
from sklearn.cluster import KMeans  # K-means 클러스터링 알고리즘 (비지도 학습)
from sklearn.preprocessing import LabelEncoder, StandardScaler  # 데이터 전처리를 위한 도구
from sklearn.feature_extraction.text import TfidfVectorizer  # TF-IDF 벡터 변환 (텍스트 데이터 벡터화)
from sklearn.metrics.pairwise import sigmoid_kernel  # 시그모이드 커널을 이용한 유사도 측정
from sklearn.metrics.pairwise import cosine_similarity  # 코사인 유사도를 계산하는 함수

# 추천 시스템 관련 라이브러리 (Surprise 라이브러리 사용)
from surprise import SVD  # SVD(특이값 분해) 기반 추천 시스템 알고리즘
from surprise import Dataset, Reader  # 데이터셋 로딩 및 처리
from surprise.model_selection import train_test_split  # 추천 시스템용 데이터 분할
from surprise import accuracy  # 추천 시스템 평가 (RMSE 등 측정)

# 경고 메시지 무시 (불필요한 경고를 숨기기 위해 사용)
import warnings
warnings.filterwarnings('ignore')

# 자연어 처리 관련 라이브러리
import nltk  # 자연어 처리(NLP)를 위한 라이브러리
import re  # 정규 표현식 (문자열 처리)
import string  # 문자열 관련 기능 제공
from nltk.tokenize import word_tokenize  # 문장을 단어 단위로 토큰화
from nltk.corpus import stopwords  # 불용어(의미 없는 단어) 제거
from nltk.stem import PorterStemmer  # 어간 추출 (동사의 변형을 정규화)

# 실행 시간 측정 (성능 비교 등 활용)
import time

# 최근접 이웃 알고리즘을 위한 라이브러리
from scipy.sparse import csr_matrix  # 희소 행렬(대부분이 0인 행렬) 변환
from sklearn.neighbors import NearestNeighbors  # 최근접 이웃 알고리즘 (KNN 등)


# Load Data

In [91]:
rating = pd.read_csv('./data/rating.csv')
anime = pd.read_csv('./data/anime.csv')

# Data Summary

In [92]:
rating.head()

Unnamed: 0,user_id,anime_id,rating
0,1,20,-1
1,1,24,-1
2,1,79,-1
3,1,226,-1
4,1,241,-1


In [93]:
anime.head()

Unnamed: 0,anime_id,name,genre,type,episodes,rating,members
0,32281,Kimi no Na wa.,"Drama, Romance, School, Supernatural",Movie,1,9.37,200630
1,5114,Fullmetal Alchemist: Brotherhood,"Action, Adventure, Drama, Fantasy, Magic, Mili...",TV,64,9.26,793665
2,28977,Gintama°,"Action, Comedy, Historical, Parody, Samurai, S...",TV,51,9.25,114262
3,9253,Steins;Gate,"Sci-Fi, Thriller",TV,24,9.17,673572
4,9969,Gintama&#039;,"Action, Comedy, Historical, Parody, Samurai, S...",TV,51,9.16,151266


In [94]:
anime[anime.name=='Death Note']
# print(anime[anime.name=='One Punch Man'])
# print(anime[anime.name=='One Piece'])

Unnamed: 0,anime_id,name,genre,type,episodes,rating,members
40,1535,Death Note,"Mystery, Police, Psychological, Supernatural, ...",TV,37,8.71,1013917


In [95]:
anime.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 12294 entries, 0 to 12293
Data columns (total 7 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   anime_id  12294 non-null  int64  
 1   name      12294 non-null  object 
 2   genre     12232 non-null  object 
 3   type      12269 non-null  object 
 4   episodes  12294 non-null  object 
 5   rating    12064 non-null  float64
 6   members   12294 non-null  int64  
dtypes: float64(1), int64(2), object(4)
memory usage: 672.5+ KB


In [96]:
print(f'anime shape: {anime.shape}\nrating shape: {rating.shape}')

anime shape: (12294, 7)
rating shape: (7813737, 3)


# Check Missing Values

In [97]:
rating.isna().sum()

user_id     0
anime_id    0
rating      0
dtype: int64

In [98]:
anime.isna().sum()

anime_id      0
name          0
genre        62
type         25
episodes      0
rating      230
members       0
dtype: int64

# Remove Missing Rows

In [99]:
anime.dropna(axis=0, inplace=True)
anime.isna().sum()

anime_id    0
name        0
genre       0
type        0
episodes    0
rating      0
members     0
dtype: int64

In [100]:
anime.describe()

Unnamed: 0,anime_id,rating,members
count,12017.0,12017.0,12017.0
mean,13638.001165,6.478264,18348.88
std,11231.076675,1.023857,55372.5
min,1.0,1.67,12.0
25%,3391.0,5.89,225.0
50%,9959.0,6.57,1552.0
75%,23729.0,7.18,9588.0
max,34519.0,10.0,1013917.0


In [101]:
anime.episodes.value_counts()

episodes
1      5571
2      1075
12      810
13      571
26      514
       ... 
358       1
366       1
201       1
172       1
125       1
Name: count, Length: 187, dtype: int64

# Check Duplicates

In [102]:
duplicated_anime = anime[anime.duplicated()].shape[0] #.shape[0] → 데이터프레임의 행(row) 개수를 의미
#anime[anime.duplicated()] -> duplicated()가 True인 행만 선택해서 새로운 데이터프레임을 만듦.
#anime.duplicated() -> 중복된 행인지(True/False) 확인
print(f'count of duplicate anime: {duplicated_anime}')

count of duplicate anime: 0


In [103]:
duplicated_rating = rating[rating.duplicated()].shape[0]
print(f'count of dupliacte anime: {duplicated_rating}') # 찐빠 발생

count of dupliacte anime: 1


# Remove Duplicates

In [104]:
rating.drop_duplicates(keep='first', inplace=True) # 첫 번쨰 등장한 값 유지

duplicated_rating = rating[rating.duplicated()].shape[0]
print(f'count of duplicated anime after removing: {duplicated_rating}')

count of duplicated anime after removing: 0


# Create Dateset

In [None]:
df = pd.merge(anime, rating, on='anime_id')
df.to_csv("./data/anime_rating_merged.csv", index=False)

In [None]:
# df = pd.read_csv('./data/anime_rating_merged.csv')
# df27364 = df[df['user_id'] == 27364]
# df27364.to_csv("./data/anime_rating_27364.csv", index=False)

In [89]:
# df27364.tail()

Unnamed: 0,anime_id,name,genre,type,episodes,rating_x,members,user_id,rating_y
7813580,10368,Teleclub no Himitsu,Hentai,OVA,2,4.67,148,27364,-1
7813588,5541,The Satisfaction,Hentai,OVA,1,4.37,166,27364,-1
7813601,5621,Violence Gekiga David no Hoshi,Hentai,OVA,4,4.88,219,27364,-1
7813604,6133,Violence Gekiga Shin David no Hoshi: Inma Dens...,Hentai,OVA,1,4.98,175,27364,-1
7813608,26081,Yasuji no Pornorama: Yacchimae!!,Hentai,Movie,1,5.46,142,27364,-1


In [106]:
df.tail(20)

Unnamed: 0,anime_id,name,genre,type,episodes,rating_x,members,user_id,rating_y
7813590,5541,The Satisfaction,Hentai,OVA,1,4.37,166,39532,-1
7813591,5541,The Satisfaction,Hentai,OVA,1,4.37,166,48766,-1
7813592,5541,The Satisfaction,Hentai,OVA,1,4.37,166,58483,1
7813593,9316,Toushindai My Lover: Minami tai Mecha-Minami,Hentai,OVA,1,4.15,211,20171,7
7813594,9316,Toushindai My Lover: Minami tai Mecha-Minami,Hentai,OVA,1,4.15,211,39532,-1
7813595,9316,Toushindai My Lover: Minami tai Mecha-Minami,Hentai,OVA,1,4.15,211,48766,-1
7813596,9316,Toushindai My Lover: Minami tai Mecha-Minami,Hentai,OVA,1,4.15,211,58483,1
7813597,5543,Under World,Hentai,OVA,1,4.28,183,39532,-1
7813598,5543,Under World,Hentai,OVA,1,4.28,183,48766,-1
7813599,5543,Under World,Hentai,OVA,1,4.28,183,49503,4


In [None]:
# df = df.rename(columns={'rating_x': 'user_rating'})
# df = df.drop('rating_y', axis=1)

# df.to_csv('anime_rating.csv', index=False) # 인덱스 미포함.

In [114]:
df = df.rename(columns={'rating_x': 'anime_rating'})
df = df.rename(columns={'rating_y': 'user_rating'})
df = df[df['user_rating'] != -1] # user_rating이 -1인 행 제거
df.to_csv("./data/anime_rating_-1.csv", index=False)
print("user_rating=-1 개수:", (df['user_rating'] == -1).sum())

user_rating=-1 개수: 0


In [115]:
df = pd.read_csv('./data/anime_rating_-1.csv')
print(f'dataset shape: {df.shape}')

dataset shape: (6337145, 9)


In [116]:
df.head(20)

Unnamed: 0,anime_id,name,genre,type,episodes,anime_rating,members,user_id,user_rating
0,32281,Kimi no Na wa.,"Drama, Romance, School, Supernatural",Movie,1,9.37,200630,99,5
1,32281,Kimi no Na wa.,"Drama, Romance, School, Supernatural",Movie,1,9.37,200630,152,10
2,32281,Kimi no Na wa.,"Drama, Romance, School, Supernatural",Movie,1,9.37,200630,244,10
3,32281,Kimi no Na wa.,"Drama, Romance, School, Supernatural",Movie,1,9.37,200630,271,10
4,32281,Kimi no Na wa.,"Drama, Romance, School, Supernatural",Movie,1,9.37,200630,322,10
5,32281,Kimi no Na wa.,"Drama, Romance, School, Supernatural",Movie,1,9.37,200630,398,10
6,32281,Kimi no Na wa.,"Drama, Romance, School, Supernatural",Movie,1,9.37,200630,462,8
7,32281,Kimi no Na wa.,"Drama, Romance, School, Supernatural",Movie,1,9.37,200630,490,10
8,32281,Kimi no Na wa.,"Drama, Romance, School, Supernatural",Movie,1,9.37,200630,548,10
9,32281,Kimi no Na wa.,"Drama, Romance, School, Supernatural",Movie,1,9.37,200630,570,10


In [118]:
df.describe()

Unnamed: 0,anime_id,anime_rating,members,user_id,user_rating
count,6337145.0,6337145.0,6337145.0,6337145.0,6337145.0
mean,8902.547,7.675013,184576.7,36747.95,7.808543
std,8881.674,0.6699057,190953.2,21013.37,1.57244
min,1.0,2.0,33.0,1.0,1.0
25%,1239.0,7.29,46803.0,18985.0,7.0
50%,6213.0,7.7,117091.0,36815.0,8.0
75%,14075.0,8.15,256325.0,54873.0,9.0
max,34475.0,9.37,1013917.0,73516.0,10.0


In [119]:
# df = pd.read_csv("./data/anime_rating.csv")

# # user_id가 27364인 데이터만 필터링
# df_filtered = df[df['user_id'] == 27364]

# df_filtered.to_csv("./data/anime_rating_27364.csv", index=False)

# Preprocessing Function

In [121]:
df = df.copy() #데이터프레임을 복사하여 원본을 보호, 원본 데이터프레임(df)을 직접 변경하는 것이 아니라 안전하게 수정 가능.
df['user_rating'].replace(to_replace=-1, value=np.nan, inplace=True) #user_rating 컬럼에서 -1 값을 NaN(결측값)으로 변환
df = df.dropna(axis=0) # NaN 이 포함된 행(row) 삭제
print("Null values after final pre-processing:")
df.isna().sum() # 각 컬럼별로 결측값 개수를 출력

Null values after final pre-processing:


anime_id        0
name            0
genre           0
type            0
episodes        0
anime_rating    0
members         0
user_id         0
user_rating     0
dtype: int64

In [122]:
def lower_text(text): # lower_text 함수선언, text라는 입력값(문자열)을 받음.
    """
        to lowercase # 함수 설명 작성 -> 소문자로 변환 가능
    """
    text = text.lower() # 모든 문자 소문자로 변환
    return text #소문자로 변환된 문자열 반환 

# 왜 소문자로 변환할까?
# Naruto와 naruto를 같은 단어로 인식하기 위해!
# 머신러닝/딥러닝 모델이 불필요한 차이를 학습하지 않도록!

In [123]:
def clean_text(text):
    """
        data preprocessing 
    """
    
    # to lowercase
    text = text.lower()

    # remove sybmols and other words
    text = re.sub(r'<[^>]*>', '', text) # <html> 같은 태그 제거
    text = re.sub(r'http\S+', '', text) # URL 제거
    text = re.sub(r'&quot;', '', text) # 특수 기호 제거
    text = re.sub(r'.hack//', '', text) # ".hack//"같은 패턴 제거
    text = re.sub(r'&#039;', '', text) # '&#039;' -> '' (어포스트로피 깨짐 현상 제거)
    text = re.sub(r'A&#039;s', '', text) # A&#039;s -> ''
    text = re.sub(r'I&#039;', 'I\'', text) # 'I&#039;' → 'I\'' (아포스트로피 복구)
    text = re.sub(r'&amp;', 'and', text) # '&amp;' → 'and' (HTML 인코딩 복구)
  
    # remove punctuation
    text = text.translate(str.maketrans('', '', string.punctuation))

    # remove number
    #text = re.sub(r'\d+', '', text)

    # tokenization
    #words = word_tokenize(text)

    # remove stopwords
    #stop_words = set(stopwords.words('english'))
    #words = [word for word in words if word not in stop_words]

    # stemming
    #stemmer = PorterStemmer()
    #words = [stemmer.stem(word) for word in words]

    # join words
    #text = ' '.join(words)
    
    return text

# Data Preprocessing

In [124]:
# start_time = time.time()
# df['name']=df['name'].apply(clean_text)
# anime['name']=anime['name'].apply(clean_text)
# end_time = time.time()
# elapsed_time = end_time - start_time
# print("process time:", elapsed_time, "sec.")

start_time = time.time() #time.time()을 사용하여 코드 실행이 시작되는 시간을 저장
df['name']=df['name'].apply(clean_text) #df의 "name" 컬럼의 모든 값에 clean_text() 적용
anime['name'] = anime['name'].apply(clean_text) #anime 데이터프레임에도 동일한 작업 수행
end_time = time.time() #실행이 끝나는 시점의 시간 기록
elapsed_time = end_time - start_time #실행 시간을 초 단위로 계산 
print("process time: ", elapsed_time, " sec.")

process time:  38.06562924385071  sec.


# Popularity-Based Recommender 인기 기반 추천

### User rating 순위 뽑아보기

In [153]:
def popularity_recommender_u(df, selected_features):
    """
        recommender system with popularity-based
    """
    # grouping & calculating mean value 
    grouped_df = df.groupby(selected_features).agg({'user_rating': 'mean'}).reset_index()
    # sorting to rating
    sorted_df = grouped_df.sort_values('user_rating', ascending=False)
    # give the recommedations
    recommendations = sorted_df.head(15)
    return recommendations

In [154]:
df.columns

Index(['anime_id', 'name', 'genre', 'type', 'episodes', 'anime_rating',
       'members', 'user_id', 'user_rating'],
      dtype='object')

In [155]:
df.head(10)

Unnamed: 0,anime_id,name,genre,type,episodes,anime_rating,members,user_id,user_rating
0,32281,Kimi no Na wa.,"Drama, Romance, School, Supernatural",Movie,1,9.37,200630,99,5
1,32281,Kimi no Na wa.,"Drama, Romance, School, Supernatural",Movie,1,9.37,200630,152,10
2,32281,Kimi no Na wa.,"Drama, Romance, School, Supernatural",Movie,1,9.37,200630,244,10
3,32281,Kimi no Na wa.,"Drama, Romance, School, Supernatural",Movie,1,9.37,200630,271,10
4,32281,Kimi no Na wa.,"Drama, Romance, School, Supernatural",Movie,1,9.37,200630,322,10
5,32281,Kimi no Na wa.,"Drama, Romance, School, Supernatural",Movie,1,9.37,200630,398,10
6,32281,Kimi no Na wa.,"Drama, Romance, School, Supernatural",Movie,1,9.37,200630,462,8
7,32281,Kimi no Na wa.,"Drama, Romance, School, Supernatural",Movie,1,9.37,200630,490,10
8,32281,Kimi no Na wa.,"Drama, Romance, School, Supernatural",Movie,1,9.37,200630,548,10
9,32281,Kimi no Na wa.,"Drama, Romance, School, Supernatural",Movie,1,9.37,200630,570,10


In [156]:
# according to anime names
selected_features = ['name']
popularity_recommender_u(df, selected_features)

Unnamed: 0,name,user_rating
4330,Ketsuinu,10.0
6492,Only You: Viva! Cabaret Club,10.0
1275,Choegang Top Plate,10.0
3223,Hello Kitty no Tomatta Big Ben,10.0
1277,Chogattai Majutsu Robot Ginguiser Specials,10.0
7359,STAR BEAT!: Hoshi no Kodou,10.0
3205,Hello Kitty no Circus ga Yatte Kita,10.0
3044,Hamster Club,10.0
223,Ajisai no Uta,10.0
7940,Shiroi Zou,10.0


In [158]:
# according to members
selected_features = ['members']
popularity_recommender_u(df, selected_features)

Unnamed: 0,members,user_rating
3391,8028,9.5
6004,114262,9.449495
6256,200630,9.426313
5749,80679,9.389788
6467,793665,9.322741
6140,151266,9.272552
6463,673572,9.261326
6396,336376,9.236398
6428,425855,9.234586
5757,81109,9.202258


----

### Anime rating 뽑아보기

In [159]:
def popularity_recommender_a(df, selected_features):
    """
        recommender system with popularity-based
    """
    # grouping & calculating mean value 
    grouped_df = df.groupby(selected_features).agg({'anime_rating': 'mean'}).reset_index()
    # sorting to rating
    sorted_df = grouped_df.sort_values('anime_rating', ascending=False)
    # give the recommedations
    recommendations = sorted_df.head(15)
    return recommendations

In [160]:
selected_features = ['name']
popularity_recommender_a(df, selected_features)

Unnamed: 0,name,anime_rating
4395,Kimi no Na wa.,9.37
2364,Fullmetal Alchemist: Brotherhood,9.26
2755,Gintama°,9.25
8255,Steins;Gate,9.17
2746,Gintama&#039;,9.16
2985,Haikyuu!!: Karasuno Koukou VS Shiratorizawa Ga...,9.15
3529,Hunter x Hunter (2011),9.13
2700,Ginga Eiyuu Densetsu,9.11
2747,Gintama&#039;: Enchousen,9.11
2743,Gintama Movie: Kanketsu-hen - Yorozuya yo Eien...,9.1


In [161]:
# according to members
selected_features = ['members']
popularity_recommender_a(df, selected_features)

Unnamed: 0,members,anime_rating
6256,200630,9.37
6467,793665,9.26
6004,114262,9.25
6463,673572,9.17
6140,151266,9.16
5868,93351,9.15
6428,425855,9.13
5749,80679,9.11
5757,81109,9.11
5669,72534,9.1


---

### First genre 생성

In [162]:
# create first genre
df['first_genre'] = df['genre'].apply(lambda x: x.split(',')[0].strip() if ',' in x else x)

In [163]:
df.columns

Index(['anime_id', 'name', 'genre', 'type', 'episodes', 'anime_rating',
       'members', 'user_id', 'user_rating', 'first_genre'],
      dtype='object')

In [167]:
selected_features = ['first_genre']
popularity_recommender_u(df, selected_features)

Unnamed: 0,first_genre,user_rating
14,Josei,8.574034
28,Sci-Fi,8.502633
21,Mystery,8.355527
24,Psychological,8.327117
6,Drama,7.952341
9,Game,7.87067
0,Action,7.867908
4,Dementia,7.863052
1,Adventure,7.798738
2,Cars,7.756006


In [166]:
selected_features = ['first_genre']
popularity_recommender_a(df, selected_features)

Unnamed: 0,first_genre,anime_rating
14,Josei,8.469407
28,Sci-Fi,8.389518
24,Psychological,8.219053
21,Mystery,8.214965
4,Dementia,7.848917
6,Drama,7.810982
9,Game,7.764884
0,Action,7.733717
1,Adventure,7.703489
2,Cars,7.689872


In [168]:
#according to type
selected_features = ['type']
popularity_recommender_u(df, selected_features)

Unnamed: 0,type,user_rating
0,Movie,7.92258
5,TV,7.89916
4,Special,7.463638
3,OVA,7.334584
2,ONA,7.229329
1,Music,7.214282


In [169]:
#according to type
selected_features = ['type']
popularity_recommender_a(df, selected_features)

Unnamed: 0,type,anime_rating
0,Movie,7.832864
5,TV,7.753773
4,Special,7.349545
3,OVA,7.215986
2,ONA,7.069045
1,Music,7.049417


---

# Clustering and Collaborative Recommender