In [10]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# Content-based analysis

## 데이터 불러오기

In [4]:
# Load the books-with-genres table from disk and preview its first three rows.
books_csv_path = 'books_with_genres.csv'
data = pd.read_csv(books_csv_path)
data.head(n=3)

Unnamed: 0,id,book_id,best_book_id,work_id,books_count,isbn,isbn13,authors,original_publication_year,original_title,...,work_ratings_count,work_text_reviews_count,ratings_1,ratings_2,ratings_3,ratings_4,ratings_5,image_url,small_image_url,genre
0,1,2767052,2767052,2792775,272,439023483,9780439000000.0,Suzanne Collins,2008.0,The Hunger Games,...,4942365,155254,66715,127936,560092,1481305,2706317,https://images.gr-assets.com/books/1447303603m...,https://images.gr-assets.com/books/1447303603s...,"Young Adult, Fiction, Fantasy, Science Fiction..."
1,2,3,3,4640799,491,439554934,9780440000000.0,"J.K. Rowling, Mary GrandPré",1997.0,Harry Potter and the Philosopher's Stone,...,4800065,75867,75504,101676,455024,1156318,3011543,https://images.gr-assets.com/books/1474154022m...,https://images.gr-assets.com/books/1474154022s...,"Young Adult, Magic, Childrens, Middle Grade, A..."
2,3,41865,41865,3212258,226,316015849,9780316000000.0,Stephenie Meyer,2005.0,Twilight,...,3916824,95009,456191,436802,793319,875073,1355439,https://images.gr-assets.com/books/1361039443m...,https://images.gr-assets.com/books/1361039443s...,"Fantasy, Young Adult, Romance, Fiction, Vampir..."


In [6]:
# Missing-value handling: discard every row that has at least one missing
# value in any column. The frame is modified in place, so the surviving rows
# keep their original index labels (the index is left with gaps).
data.dropna(axis=0, how='any', inplace=True)

## content 결합

In [7]:
# Normalise the 'genre' column so genres are separated by exactly one space:
# every run of commas and/or whitespace collapses to a single space, and
# leading/trailing blanks are trimmed. (The previous second step replaced a
# space with a space, i.e. did nothing, so ", " separators left double spaces.)
# regex=True is spelled out because the default of Series.str.replace differs
# between pandas versions.
data['genre'] = (
    data['genre']
    .str.replace(r'[\s,]+', ' ', regex=True)
    .str.strip()
)

# Build the 'content' column: title, authors and genres joined by single
# spaces into one text field per book.
data['content'] = data['title'] + ' ' + data['authors'] + ' ' + data['genre']


### TF-IDF Vectorization

In [11]:
# Build a TF-IDF vectorizer that drops English stop words.
tfidf = TfidfVectorizer(stop_words='english')

# Learn the vocabulary from the 'content' column and vectorize it in one pass.
book_texts = data['content']
tfidf_matrix = tfidf.fit_transform(book_texts)

# Report the matrix dimensions: (number of books, number of terms).
matrix_shape = tfidf_matrix.shape
print(matrix_shape)

(7643, 12062)


### Calculate cosine similarity

In [14]:
# Pairwise cosine similarity between every pair of rows of the TF-IDF matrix.
# With the second argument omitted, scikit-learn compares the matrix against
# itself, which is what passing it twice did.
cosine_sim = cosine_similarity(tfidf_matrix)
cosine_sim.shape

(7643, 7643)

### Implementing the recommendation function
추천함수 구현

In [24]:
# Map each title to its row *position* in `data`.
# Positions (0 .. len(data) - 1) are used instead of `data.index` labels
# because the similarity matrix is addressed by position, and the labels are
# no longer contiguous once rows have been dropped from `data`.
indices = pd.Series(range(len(data)), index=data['title'])
# When a title occurs more than once, keep only its first occurrence so that a
# lookup always yields a single position rather than a Series.
indices = indices[~indices.index.duplicated(keep='first')]


def get_recommendations(title, cosine_sim=cosine_sim, top_n=10):
    """Return the titles of the books most similar to the given one.

    Args:
        title: Exact title of the query book, as it appears in data['title'].
        cosine_sim: Square similarity matrix whose row/column i corresponds
            to the i-th row (by position) of `data`.
        top_n: Maximum number of recommendations to return (default 10).

    Returns:
        A pandas Series of recommended titles ordered from most to least
        similar, carrying the index labels of `data`. If `title` is not
        present in the data, a message string is returned instead.
    """
    # Row position of the query book (None when the title is unknown).
    idx = indices.get(title)

    if idx is None:
        return "해당 제목의 책이 데이터에 없습니다."

    # Pair every other book's position with its similarity to the query book.
    # The query book is excluded by position rather than by assuming it sorts
    # first, so ties at the top score can never make it recommend itself.
    sim_scores = [
        (pos, score)
        for pos, score in enumerate(cosine_sim[idx])
        if pos != idx
    ]

    # Highest similarity first; the sort is stable, so ties keep row order.
    sim_scores.sort(key=lambda pair: pair[1], reverse=True)

    # Positions of the top_n most similar books.
    book_indices = [pos for pos, _ in sim_scores[:top_n]]

    # Positional lookup, consistent with the positions stored in `indices`.
    return data['title'].iloc[book_indices]


# Example usage
print(get_recommendations('The Hunger Games (The Hunger Games, #1)'))


506     The Hunger Games Trilogy Boxset (The Hunger Ga...
16                   Catching Fire (The Hunger Games, #2)
19                      Mockingjay (The Hunger Games, #3)
1819                                    Hunger (Gone, #2)
5945                    The Quillan Games (Pendragon, #7)
2324                    The Player of Games (Culture, #2)
505                           The One (The Selection, #3)
3114     A Hunger Like No Other (Immortals After Dark #2)
6768                                                   S.
1530     Gregor the Overlander (Underland Chronicles, #1)
Name: title, dtype: object
