# 책 추천 시스템 - 콘텐츠 기반 필터링(Content-based Filtering)
- `Good Books Dataset`
    - https://www.kaggle.com/zygmunt/goodbooks-10k
- ratings, books, tag, book_tags, to_read의 10k(10,000) 데이터
- Tfidf Vectorizer를 사용해 authors, tag_name, 그리고 authors+tag_name을 합친 것을 기반으로 유사한 책 찾기/추천

In [57]:
# 데이터 확인
import pandas as pd
import numpy as np

import os
#print(os.listdir('./goodbooks-10k/'))

# books 데이터 읽기
# Book metadata
books = pd.read_csv('./Good Books/books.csv', encoding='ISO-8859-1') # 이번 데이터들은 encoding을 ISO-8859-1로 읽어야함
books.head()

# bestbookid is the most popular edition for a given work. Generally it's the same as goodreadsbookid, differs occasionally.
# books_count is the number of editions for a given work.
# rating 1 ~ 5의 의미는 별점 1점부터 5점의 수

Unnamed: 0,id,book_id,best_book_id,work_id,books_count,isbn,isbn13,authors,original_publication_year,original_title,...,ratings_count,work_ratings_count,work_text_reviews_count,ratings_1,ratings_2,ratings_3,ratings_4,ratings_5,image_url,small_image_url
0,1,2767052,2767052,2792775,272,439023483,9780439000000.0,Suzanne Collins,2008.0,The Hunger Games,...,4780653,4942365,155254,66715,127936,560092,1481305,2706317,https://images.gr-assets.com/books/1447303603m...,https://images.gr-assets.com/books/1447303603s...
1,2,3,3,4640799,491,439554934,9780440000000.0,"J.K. Rowling, Mary GrandPrÃ©",1997.0,Harry Potter and the Philosopher's Stone,...,4602479,4800065,75867,75504,101676,455024,1156318,3011543,https://images.gr-assets.com/books/1474154022m...,https://images.gr-assets.com/books/1474154022s...
2,3,41865,41865,3212258,226,316015849,9780316000000.0,Stephenie Meyer,2005.0,Twilight,...,3866839,3916824,95009,456191,436802,793319,875073,1355439,https://images.gr-assets.com/books/1361039443m...,https://images.gr-assets.com/books/1361039443s...
3,4,2657,2657,3275794,487,61120081,9780061000000.0,Harper Lee,1960.0,To Kill a Mockingbird,...,3198671,3340896,72586,60427,117415,446835,1001952,1714267,https://images.gr-assets.com/books/1361975680m...,https://images.gr-assets.com/books/1361975680s...
4,5,4671,4671,245494,1356,743273567,9780743000000.0,F. Scott Fitzgerald,1925.0,The Great Gatsby,...,2683664,2773745,51992,86236,197621,606158,936012,947718,https://images.gr-assets.com/books/1490528560m...,https://images.gr-assets.com/books/1490528560s...


In [58]:
# ratings 데이터 읽기
# book-user-rating triples
ratings = pd.read_csv('./Good Books/ratings.csv', encoding='ISO-8859-1')
ratings.head()
# rating 데이터에는 Book_id와 User_id 그리고 해당 유저가 준 rating 점수가 있음

Unnamed: 0,book_id,user_id,rating
0,1,314,5
1,1,439,3
2,1,588,5
3,1,1169,4
4,1,1185,4


In [59]:
# book_tags 데이터 읽기
# Tag applications, sorted by goodreadsbookid ASC and count DESC
book_tags = pd.read_csv('./Good Books/book_tags.csv', encoding='ISO-8859-1')
book_tags.head()
# Book의 id와 tag의 id가 있음

Unnamed: 0,goodreads_book_id,tag_id,count
0,1,30574,167697
1,1,11305,37174
2,1,11557,34173
3,1,8717,12986
4,1,33114,12716


In [92]:
# tags 데이터 읽기
# Tag id-name mapping
tags = pd.read_csv('./Good Books/tags.csv')
tags.tail()
# Tag의 id와 해당 tag와 연결되는 name이 있음

Unnamed: 0,tag_id,tag_name
34247,34247,Ｃhildrens
34248,34248,Ｆａｖｏｒｉｔｅｓ
34249,34249,Ｍａｎｇａ
34250,34250,ＳＥＲＩＥＳ
34251,34251,ｆａｖｏｕｒｉｔｅｓ


In [63]:
# to_read 데이터 읽기
# Books marked by users "to read"; Sorted by userid and bookid
to_read = pd.read_csv('./Good Books/to_read.csv')
to_read.head()
# 유저가 어떤 책을 읽었는지에 대한 id가 적혀있음

Unnamed: 0,user_id,book_id
0,1,112
1,1,235
2,1,533
3,1,1198
4,1,1874


In [62]:
# book_tags와 tags를 merge
tags_join_DF = pd.merge(book_tags, tags, left_on='tag_id', right_on='tag_id', how='inner')
tags_join_DF.head()
# Tag_id와 tag_name을 book_id가 있는 데이터 프레임과 merge함

Unnamed: 0,goodreads_book_id,tag_id,count,tag_name
0,1,30574,167697,to-read
1,2,30574,24549,to-read
2,3,30574,496107,to-read
3,5,30574,11909,to-read
4,6,30574,298,to-read


## Authors로 유사한 책 찾기
- Tfidf

In [64]:
# books의 authors 컬럼
books['authors'][:5]

0                 Suzanne Collins
1    J.K. Rowling, Mary GrandPrÃ©
2                 Stephenie Meyer
3                      Harper Lee
4             F. Scott Fitzgerald
Name: authors, dtype: object

In [65]:
# authors로 Tfidf 수행
from sklearn.feature_extraction.text import TfidfVectorizer

tf = TfidfVectorizer(analyzer='word', ngram_range=(1,2), min_df=0, stop_words='english')
tfidf_matrix = tf.fit_transform(books['authors'])
tfidf_matrix

<10000x14742 sparse matrix of type '<class 'numpy.float64'>'
	with 43235 stored elements in Compressed Sparse Row format>

In [66]:
# 코사인 유사도 측정
# sklearn의 linear_kernel을 사용해서 작가명으로 만든 Tfidf매트릭스를 유사도 행렬로 생성
from sklearn.metrics.pairwise import linear_kernel

cosine_sim = linear_kernel(tfidf_matrix, tfidf_matrix)
cosine_sim

array([[1., 0., 0., ..., 0., 0., 0.],
       [0., 1., 0., ..., 0., 0., 0.],
       [0., 0., 1., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 1., 0., 0.],
       [0., 0., 0., ..., 0., 1., 0.],
       [0., 0., 0., ..., 0., 0., 1.]])

In [67]:
# Hobbit과 유사한 책 찾기
titles = books['title']
indices = pd.Series(books.index, index=books['title'])
indices['The Hobbit'] # The Hobbit의 index는 6

6

In [68]:
# 유사도 값 호출
# 유사도 행렬에서 hobbit의 인덱스의 행 불러오기
cosine_sim[indices['The Hobbit']]

array([0., 0., 0., ..., 0., 0., 0.])

In [70]:
# 유사도 결과를 인덱스를 가진 list 형으로
print(cosine_sim[indices['The Hobbit']].shape) # 총 1만개의 책 데이터가 있음
list(enumerate(cosine_sim[indices['The Hobbit']]))[:3]
# 유사도 행렬에서 The Hobbit의 인덱스만 가져오고, 
# 해당 컬럼(다른 책 인덱스)과 코사인 유사도 점수를 enumerate를 사용하여 
# 튜플형식으로 만들고, 해당 데이터를 list에 넣는다

(10000,)


[(0, 0.0), (1, 0.0), (2, 0.0)]

In [71]:
# 가장 유사한 책의 인덱스 찾기
sim_scores = list(enumerate(cosine_sim[indices['The Hobbit']]))
sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True) # 유사도 순으로 정렬
sim_scores[:3]
# 호빗과 가장 유사한 책의 인덱스(여기서는 열)와 코사인 점수를 정렬하여 출력함
# 완전 똑같은 1점도 보인다. 18번, 154번
# 참고로 맨 앞에 (6, 1.0)은 본인 자신

[(6, 1.0), (18, 1.0), (154, 1.0)]

In [None]:
print(f'Index 6번의 책 이름 :', books['title'][6])
print(f'Index 18번의 책 이름 :', books['title'][18])
print(f'Index 154번의 책 이름 :', books['title'][154])

In [72]:
# 작가로 본 유사 책 검색
sim_scores = sim_scores[1:11] # 자기 자신은 제외한 10개 유사 책 조회
book_indices = [i[0] for i in sim_scores] # index 번호 추출
titles.iloc[book_indices]
# 그 외의 다른 책들도 대부분 Hobbit이긴 하나, 아마 작가가 동일인일 가능성이 높다.
# 사실 생각해 보면 작가이름으로만 Tfidf를 했기 때문에, 작가 이름이 같다면 모두 동일한 점수(1)로 나올것이다

18      The Fellowship of the Ring (The Lord of the Ri...
154            The Two Towers (The Lord of the Rings, #2)
160     The Return of the King (The Lord of the Rings,...
188     The Lord of the Rings (The Lord of the Rings, ...
963     J.R.R. Tolkien 4-Book Boxed Set: The Hobbit an...
4975        Unfinished Tales of NÃºmenor and Middle-Earth
2308                               The Children of HÃºrin
610              The Silmarillion (Middle-Earth Universe)
8271                   The Complete Guide to Middle-Earth
1128     The History of the Hobbit, Part One: Mr. Baggins
Name: title, dtype: object

## Tag로 유사한 책 찾기
- Tfidf

In [73]:
# book dataframe에 tag 포함 시키기
# Books 데이터 프레임에, 앞에서 만든 tagid와 tag name을 merge함
books_with_tags = pd.merge(books, tags_join_DF, left_on='book_id', right_on='goodreads_book_id', how='inner')
books_with_tags.head()

Unnamed: 0,id,book_id,best_book_id,work_id,books_count,isbn,isbn13,authors,original_publication_year,original_title,...,ratings_2,ratings_3,ratings_4,ratings_5,image_url,small_image_url,goodreads_book_id,tag_id,count,tag_name
0,1,2767052,2767052,2792775,272,439023483,9780439000000.0,Suzanne Collins,2008.0,The Hunger Games,...,127936,560092,1481305,2706317,https://images.gr-assets.com/books/1447303603m...,https://images.gr-assets.com/books/1447303603s...,2767052,30574,11314,to-read
1,1,2767052,2767052,2792775,272,439023483,9780439000000.0,Suzanne Collins,2008.0,The Hunger Games,...,127936,560092,1481305,2706317,https://images.gr-assets.com/books/1447303603m...,https://images.gr-assets.com/books/1447303603s...,2767052,11305,10836,fantasy
2,1,2767052,2767052,2792775,272,439023483,9780439000000.0,Suzanne Collins,2008.0,The Hunger Games,...,127936,560092,1481305,2706317,https://images.gr-assets.com/books/1447303603m...,https://images.gr-assets.com/books/1447303603s...,2767052,11557,50755,favorites
3,1,2767052,2767052,2792775,272,439023483,9780439000000.0,Suzanne Collins,2008.0,The Hunger Games,...,127936,560092,1481305,2706317,https://images.gr-assets.com/books/1447303603m...,https://images.gr-assets.com/books/1447303603s...,2767052,8717,35418,currently-reading
4,1,2767052,2767052,2792775,272,439023483,9780439000000.0,Suzanne Collins,2008.0,The Hunger Games,...,127936,560092,1481305,2706317,https://images.gr-assets.com/books/1447303603m...,https://images.gr-assets.com/books/1447303603s...,2767052,33114,25968,young-adult


In [75]:
# 이번에는 tag로 Tfidf
# 앞에선 작가 이름으로 Tfidf를 했고, 이번엔 Tag로 해본다
tf1 = TfidfVectorizer(analyzer='word', ngram_range=(1,2), min_df=0, stop_words='english')
tfidf_matrix1 = tf1.fit_transform(books_with_tags['tag_name'].head(10000))
cosine_sim1 = linear_kernel(tfidf_matrix1, tfidf_matrix1)

In [76]:
# 추천책을 반환하는 함수
# 이번에는 책의 제목을 넣으면 추천책을 반환하는 함수를 작성
titles1 = books['title']
indices1 = pd.Series(books.index, index=books['title'])

def tags_recommendations(title):
    idx = indices[title]
    sim_scores = list(enumerate(cosine_sim1[idx]))
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)
    sim_scores = sim_scores[1:11] # sim_scores = sim_scores[1:11]은 총 10개를 가리키며, 1부터 한것은 0번은 입력한 책 제목 자신이 나오기 떄문임
    book_indices = [i[0] for i in sim_scores]
    return titles.iloc[book_indices]

In [78]:
# Tag로 찾아본 The Hobbits와 유사책
tags_recommendations('The Hobbit').head(10)
# 헝거게임, 듄 등 호빗과 비슷한 판타지 장르가 나오는듯 하다

16             Catching Fire (The Hunger Games, #2)
31                                  Of Mice and Men
107    Confessions of a Shopaholic (Shopaholic, #1)
125                       Dune (Dune Chronicles #1)
149                                    The Red Tent
206          One for the Money (Stephanie Plum, #1)
214                                Ready Player One
231             The Gunslinger (The Dark Tower, #1)
253          Shiver (The Wolves of Mercy Falls, #1)
313                         Inkheart (Inkworld, #1)
Name: title, dtype: object

## Authors와 Tag를 합쳐서 유사한 책 찾기
- Tfidf

In [80]:
# 임시로 book id 마다 tag를 붙이고
# book_id에 있는 모든 tag_name들을 한번에 모아놓음
temp_df = books_with_tags.groupby('book_id')['tag_name'].apply(' '.join).reset_index()
temp_df.head()

Unnamed: 0,book_id,tag_name
0,1,to-read fantasy favorites currently-reading yo...
1,2,to-read fantasy favorites currently-reading yo...
2,3,to-read fantasy favorites currently-reading yo...
3,5,to-read fantasy favorites currently-reading yo...
4,6,to-read fantasy young-adult fiction harry-pott...


In [83]:
# 그걸 books에 합치고
books = pd.merge(books, temp_df, left_on='book_id', right_on='book_id', how='inner')
books.head()
# 이번에는 tag name이 하나의 컬럼에 여러개가 들어있음

Unnamed: 0,id,book_id,best_book_id,work_id,books_count,isbn,isbn13,authors,original_publication_year,original_title,...,work_ratings_count,work_text_reviews_count,ratings_1,ratings_2,ratings_3,ratings_4,ratings_5,image_url,small_image_url,tag_name
0,1,2767052,2767052,2792775,272,439023483,9780439000000.0,Suzanne Collins,2008.0,The Hunger Games,...,4942365,155254,66715,127936,560092,1481305,2706317,https://images.gr-assets.com/books/1447303603m...,https://images.gr-assets.com/books/1447303603s...,to-read fantasy favorites currently-reading yo...
1,2,3,3,4640799,491,439554934,9780440000000.0,"J.K. Rowling, Mary GrandPrÃ©",1997.0,Harry Potter and the Philosopher's Stone,...,4800065,75867,75504,101676,455024,1156318,3011543,https://images.gr-assets.com/books/1474154022m...,https://images.gr-assets.com/books/1474154022s...,to-read fantasy favorites currently-reading yo...
2,3,41865,41865,3212258,226,316015849,9780316000000.0,Stephenie Meyer,2005.0,Twilight,...,3916824,95009,456191,436802,793319,875073,1355439,https://images.gr-assets.com/books/1361039443m...,https://images.gr-assets.com/books/1361039443s...,to-read fantasy favorites currently-reading yo...
3,4,2657,2657,3275794,487,61120081,9780061000000.0,Harper Lee,1960.0,To Kill a Mockingbird,...,3340896,72586,60427,117415,446835,1001952,1714267,https://images.gr-assets.com/books/1361975680m...,https://images.gr-assets.com/books/1361975680s...,to-read favorites currently-reading young-adul...
4,5,4671,4671,245494,1356,743273567,9780743000000.0,F. Scott Fitzgerald,1925.0,The Great Gatsby,...,2773745,51992,86236,197621,606158,936012,947718,https://images.gr-assets.com/books/1490528560m...,https://images.gr-assets.com/books/1490528560s...,to-read favorites currently-reading young-adul...


In [85]:
# 저자 이름과 tag name을 합치고
books['corpus'] = (pd.Series(books[['authors','tag_name']].fillna('').values.tolist()).str.join(' '))
books['corpus'][:3]
# corpus라는 컬럼에 저자와 태그가 한번에 모두 있음

0    Suzanne Collins to-read fantasy favorites curr...
1    J.K. Rowling, Mary GrandPrÃ© to-read fantasy f...
2    Stephenie Meyer to-read fantasy favorites curr...
Name: corpus, dtype: object

In [87]:
# Tfidf를 수행
# 작가 이름와 Tag name을 합친것을 Tfidf를 실행함
tf_corpus = TfidfVectorizer(analyzer='word', ngram_range=(1,2), min_df=0, stop_words='english')
tfidf_matrix_corpus = tf_corpus.fit_transform(books['corpus'])
cosine_sim_corpus = linear_kernel(tfidf_matrix_corpus, tfidf_matrix_corpus)

titles = books['title']
indices = pd.Series(books.index, index=books['title'])

In [88]:
# 추천 함수를 만들고
def corpus_recommendations(title):
    idx = indices[title]
    sim_scores = list(enumerate(cosine_sim_corpus[idx]))
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)
    sim_scores = sim_scores[1:11]
    book_indices = [i[0] for i in sim_scores]
    return titles.iloc[book_indices]

In [89]:
# Hobbit과 비슷한 책은?
corpus_recommendations('The Hobbit')

188     The Lord of the Rings (The Lord of the Rings, ...
154            The Two Towers (The Lord of the Rings, #2)
160     The Return of the King (The Lord of the Rings,...
18      The Fellowship of the Ring (The Lord of the Ri...
610              The Silmarillion (Middle-Earth Universe)
4975        Unfinished Tales of NÃºmenor and Middle-Earth
2308                               The Children of HÃºrin
963     J.R.R. Tolkien 4-Book Boxed Set: The Hobbit an...
465                             The Hobbit: Graphic Novel
8271                   The Complete Guide to Middle-Earth
Name: title, dtype: object

In [90]:
# Twilight과 비슷한 것은?
corpus_recommendations('Twilight (Twilight, #1)')

51                                 Eclipse (Twilight, #3)
48                                New Moon (Twilight, #2)
991                    The Twilight Saga (Twilight, #1-4)
833                         Midnight Sun (Twilight, #1.5)
731     The Short Second Life of Bree Tanner: An Eclip...
1618    The Twilight Saga Complete Collection  (Twilig...
4087    The Twilight Saga: The Official Illustrated Gu...
2020             The Twilight Collection (Twilight, #1-3)
72                                The Host (The Host, #1)
219     Twilight: The Complete Illustrated Movie Compa...
Name: title, dtype: object

In [91]:
# Romeo와 Juliet과 유사한 것은?
corpus_recommendations('Romeo and Juliet')

352                      Othello
769                Julius Caesar
124                       Hamlet
153                      Macbeth
247    A Midsummer Night's Dream
838       The Merchant of Venice
854                Twelfth Night
529       Much Ado About Nothing
713                    King Lear
772      The Taming of the Shrew
Name: title, dtype: object

- Tfidf를 사용했는데, 만약 작가나 태그만 사용한다면 같은 작가, 같은 태그의 책들만 추천해 줬을것이다.
- 하지만 하나의 컬럼에 모아서 Tfidf를 하였을땐 다른 결과가 나왔다.
- 또 다른 더 나은 방법이 있는지 찾아서 시도해야겠다.

---