# 1. Goodbooks-10k 
- Link : https://www.kaggle.com/zygmunt/goodbooks-10k

In [1]:
import pandas as pd
import numpy as np
import plotnine 
from plotnine import *
import os, sys, gc
from tqdm.notebook import tqdm
import warnings 
warnings.filterwarnings('ignore')

In [2]:
path = './input/books/'
print(os.listdir(path))

['book_tags.csv', 'ratings.csv', 'books.csv', 'sample_book.xml', 'to_read.csv', 'train.csv', 'test.csv', 'tags.csv']


In [3]:
books = pd.read_csv(path + "books.csv")
book_tags = pd.read_csv(path + "book_tags.csv")
train = pd.read_csv(path + "train.csv")
test = pd.read_csv(path + "test.csv")
tags = pd.read_csv(path + "tags.csv")
to_read = pd.read_csv(path + "to_read.csv")

In [4]:
train['book_id'] = train['book_id'].astype(str)
test['book_id'] = test['book_id'].astype(str)
books['book_id'] = books['book_id'].astype(str)

In [5]:
popular_rec_model = books.sort_values(by='books_count', ascending=False)['book_id'].values[0:500]

In [6]:
sol = test.groupby(['user_id'])['book_id'].agg({'unique'}).reset_index()
gt = {}
for user in tqdm(sol['user_id'].unique()): 
    gt[user] = list(sol[sol['user_id'] == user]['unique'].values[0])

HBox(children=(FloatProgress(value=0.0, max=53424.0), HTML(value='')))




In [7]:
rec_df = pd.DataFrame()
rec_df['user_id'] = train['user_id'].unique()

## TF-IDF를 이용한 Contents Based Model 

In [8]:
from sklearn.feature_extraction.text import TfidfVectorizer

tfidf = TfidfVectorizer(stop_words='english')
tfidf_matrix = tfidf.fit_transform(books['title'])
print(tfidf_matrix.shape)

(10000, 9019)


In [9]:
from sklearn.metrics.pairwise import cosine_similarity
cosine_matrix = cosine_similarity(tfidf_matrix, tfidf_matrix)
cosine_matrix.shape

(10000, 10000)

In [10]:
# book title와 id를 매핑할 dictionary를 생성해줍니다. 
book2id = {}
for i, c in enumerate(books['title']): book2id[i] = c

# id와 book title를 매핑할 dictionary를 생성해줍니다. 
id2book = {}
for i, c in book2id.items(): id2book[c] = i
    
# book_id와 title를 매핑할 dictionary를 생성해줍니다.
bookid2book = {}
for i, j in zip(books['title'].values, books['book_id'].values):
    bookid2book[i] = j

In [11]:
books['title'].head()

0              The Hunger Games (The Hunger Games, #1)
1    Harry Potter and the Sorcerer's Stone (Harry P...
2                              Twilight (Twilight, #1)
3                                To Kill a Mockingbird
4                                     The Great Gatsby
Name: title, dtype: object

In [12]:
idx = id2book['Twilight (Twilight, #1)']  
sim_scores = [(book2id[i], c) for i, c in enumerate(cosine_matrix[idx]) if i != idx] 
sim_scores = sorted(sim_scores, key = lambda x: x[1], reverse=True)
sim_scores[0:10] 

[('The Twilight Saga (Twilight, #1-4)', 0.920347418277986),
 ('The Twilight Collection (Twilight, #1-3)', 0.8786339079447184),
 ('The Twilight Saga Complete Collection  (Twilight, #1-4 + 3.5)',
  0.7697532056304309),
 ('Twilight and History', 0.7465001575650626),
 ('The Twilight Saga: The Official Illustrated Guide (Twilight, #4.5)',
  0.7045174300631831),
 ('Twilight Eyes', 0.6770737331426326),
 ('Twilight (The Mediator, #6)', 0.6377631333498953),
 ('New Moon (Twilight, #2)', 0.6185575138625542),
 ('Eclipse (Twilight, #3)', 0.612819563854136),
 ('The Servants of Twilight', 0.5837817298466093)]

0. 학습셋에서 제목이 있는 경우에 대해서만 진행
1. 각 유저별로 읽은 책의 목록을 수집 
2. 읽은 책과 유사한 책 추출 
3. 모든 책에 대해서 유사도를 더한 값을 계산 
4. 3에서 유사도가 가장 높은 순서대로 추출 

In [13]:
train = pd.merge(train, books[['book_id', 'title']], how='left', on='book_id')
train.head()

Unnamed: 0,user_id,book_id,title
0,1,4893,
1,2,8855,
2,3,9049,
3,4,3273,Moloka'i
4,5,4829,


In [14]:
# 0. 학습셋에서 제목이 있는 경우에 대해서만 진행
tf_train = train[train['title'].notnull()].reset_index(drop=True)
tf_train['idx2title'] = tf_train['title'].apply(lambda x: id2book[x])
tf_train.head()

Unnamed: 0,user_id,book_id,title,idx2title
0,4,3273,Moloka'i,1215
1,7,4138,Naked,343
2,7,4588,Extremely Loud and Incredibly Close,248
3,9,8676,Unlimited Power : The New Science Of Personal ...,4701
4,10,5907,The Hobbit,6


In [15]:
idx2title2book = {}
for i, j in zip(tf_train['idx2title'].values, tf_train['book_id'].values):
    idx2title2book[i] = j

In [16]:
# 1. 각 유저별로 읽은 책의 목록을 수집 
user = 7
read_list = tf_train.groupby(['user_id'])['idx2title'].agg({'unique'}).reset_index()
seen = read_list[read_list['user_id'] == user]['unique'].values[0]
seen

array([343, 248])

In [17]:
# 2. 읽은 책과 유사한 책 추출 
## 343번째 책과 다른 책들간의 유사도 
cosine_matrix[343]

array([0., 0., 0., ..., 0., 0., 0.])

In [18]:
# 2. 읽은 책과 유사한 책 추출 
total_cosine_sim = np.zeros(len(book2id))
for book_ in seen: 
    # 3. 모든 책에 대해서 유사도를 더한 값을 계산 
    # 343번째 책과 248의 유사도가 모두 결합된 유사도
    total_cosine_sim += cosine_matrix[book_]

In [19]:
# 4. 3에서 유사도가 가장 높은 순서대로 추출
sim_scores = [(i, c) for i, c in enumerate(total_cosine_sim) if i not in seen] # 자기 자신을 제외한 영화들의 유사도 및 인덱스를 추출 
sim_scores = sorted(sim_scores, key = lambda x: x[1], reverse=True) # 유사도가 높은 순서대로 정렬 
sim_scores[0:5]

[(4809, 0.793036327171204),
 (6199, 0.6915730356677104),
 (3194, 0.6607508855409738),
 (1570, 0.6390974315343301),
 (7393, 0.5820260477746269)]

In [20]:
book2id[4809]

'The Naked and the Dead'

In [21]:
bookid2book[book2id[4809]]

'12467'

In [22]:
tf_train['user_id'].unique()

array([    4,     7,     9, ..., 53416, 53419, 53424])

In [23]:
tf_train.head()

Unnamed: 0,user_id,book_id,title,idx2title
0,4,3273,Moloka'i,1215
1,7,4138,Naked,343
2,7,4588,Extremely Loud and Incredibly Close,248
3,9,8676,Unlimited Power : The New Science Of Personal ...,4701
4,10,5907,The Hobbit,6


In [24]:
## 전체 영화에 대해서 진행 
total_rec_list = {}

read_list1 = train.groupby(['user_id'])['book_id'].agg({'unique'}).reset_index()
read_list2 = tf_train.groupby(['user_id'])['idx2title'].agg({'unique'}).reset_index()

for user in tqdm(train['user_id'].unique()):
    rec_list = []
        
    # 만약 TF-IDF 소속의 추천대상이라면 Contents 기반의 추천 
    if user in tf_train['user_id'].unique():
        # 1. 각 유저별로 읽은 책의 목록을 수집 
        seen = read_list2[read_list2['user_id'] == user]['unique'].values[0]
        # 2. 읽은 책과 유사한 책 추출 
        total_cosine_sim = np.zeros(len(book2id))
        for book_ in seen: 
            # 3. 모든 책에 대해서 유사도를 더한 값을 계산 
            # 343번째 책과 248의 유사도가 모두 결합된 유사도
            total_cosine_sim += cosine_matrix[book_]
            
        # 4. 3에서 유사도가 가장 높은 순서대로 추출
        sim_scores = [(bookid2book[book2id[i]], c) for i, c in enumerate(total_cosine_sim) if i not in seen] # 자기 자신을 제외한 영화들의 유사도 및 인덱스를 추출 
        recs = sorted(sim_scores, key = lambda x: x[1], reverse=True)[0:300] # 유사도가 높은 순서대로 정렬 
        for rec in recs: 
            if rec not in seen:
                rec_list.append(rec)   
        
    # 그렇지 않으면 인기도 기반의 추천 
    else: 
        seen = read_list1[read_list1['user_id'] == user]['unique'].values[0]
        for rec in popular_rec_model[0:400]:
            if rec not in seen:
                rec_list.append(rec)
                
    total_rec_list[user] = rec_list[0:200]

HBox(children=(FloatProgress(value=0.0, max=53382.0), HTML(value='')))




In [25]:
import six
import math

# https://github.com/kakao-arena/brunch-article-recommendation/blob/master/evaluate.py

class evaluate():
    def __init__(self, recs, gt, topn=100):
        self.recs = recs
        self.gt = gt 
        self.topn = topn 
        
    def _ndcg(self):
        Q, S = 0.0, 0.0
        for u, seen in six.iteritems(self.gt):
            seen = list(set(seen))
            rec = self.recs.get(u, [])
            if not rec or len(seen) == 0:
                continue

            dcg = 0.0
            idcg = sum([1.0 / math.log(i + 2, 2) for i in range(min(len(seen), len(rec)))])
            for i, r in enumerate(rec):
                if r not in seen:
                    continue
                rank = i + 1
                dcg += 1.0 / math.log(rank + 1, 2)
            ndcg = dcg / idcg
            S += ndcg
            Q += 1
        return S / Q


    def _map(self):
        n, ap = 0.0, 0.0
        for u, seen in six.iteritems(self.gt):
            seen = list(set(seen))
            rec = self.recs.get(u, [])
            if not rec or len(seen) == 0:
                continue

            _ap, correct = 0.0, 0.0
            for i, r in enumerate(rec):
                if r in seen:
                    correct += 1
                    _ap += (correct / (i + 1.0))
            _ap /= min(len(seen), len(rec))
            ap += _ap
            n += 1.0
        return ap / n


    def _entropy_diversity(self):
        sz = float(len(self.recs)) * self.topn
        freq = {}
        for u, rec in six.iteritems(self.recs):
            for r in rec:
                freq[r] = freq.get(r, 0) + 1
        ent = -sum([v / sz * math.log(v / sz) for v in six.itervalues(freq)])
        return ent
    
    def _evaluate(self):
        print('MAP@%s: %s' % (self.topn, self._map()))
        print('NDCG@%s: %s' % (self.topn, self._ndcg()))
        print('EntDiv@%s: %s' % (self.topn, self._entropy_diversity()))

In [26]:
evaluate_func = evaluate(recs=total_rec_list, gt = gt, topn=200)
evaluate_func._evaluate()

MAP@200: 8.152674144049246e-05
NDCG@200: 0.0008393198172755216
EntDiv@200: 6.916260625417244


 ## Word2vec을 이용한 추천시스템 
 - Tag간의 유사도 
 - 제목간의 유사도 
 - 책의 읽은 순서를 통한 유사도 

In [27]:
agg = train.groupby(['user_id'])['book_id'].agg({'unique'})
agg.head()

Unnamed: 0_level_0,unique
user_id,Unnamed: 1_level_1
1,[4893]
2,[8855]
3,[9049]
4,[3273]
5,"[4829, 6703]"


In [28]:
# int형식은 Word2vec에서 학습이 안되어서 String으로 변경해줍니다. 
sentence = []
for user_sentence in agg['unique'].values:
    sentence.append(list(map(str, user_sentence)))

In [29]:
# Word2vec의 학습을 진행해줍니다. 
from gensim.models import Word2Vec
embedding_model = Word2Vec(sentence, size=20, window = 5, 
                           min_count=1, workers=4, iter=200, sg=1)

In [30]:
embedding_model.wv.most_similar(positive=['4893'], topn=10)

[('9714', 0.8639865517616272),
 ('8675', 0.853424608707428),
 ('9984', 0.8509962558746338),
 ('8618', 0.8410573601722717),
 ('6291', 0.8388713002204895),
 ('2108', 0.8207754492759705),
 ('1778', 0.8174813389778137),
 ('9934', 0.8113306164741516),
 ('8524', 0.8078303337097168),
 ('9650', 0.8058333396911621)]

In [31]:
## 전체 영화에 대해서 진행 
total_rec_list = {}

read_list = train.groupby(['user_id'])['book_id'].agg({'unique'}).reset_index()
for user in tqdm(train['user_id'].unique()):
    rec_list = []     
    seen = read_list1[read_list1['user_id'] == user]['unique'].values[0]
    word2vec_dict = {}
    for book in seen: 
        for i in embedding_model.wv.most_similar(positive=[book], topn=300):
            if i[0] not in seen: 
                if i[0] not in word2vec_dict.keys(): 
                    word2vec_dict[i[0]] = i[1]
                else:
                    word2vec_dict[i[0]] += i[1]
                
    rec_list = list(dict(sorted(word2vec_dict.items(), key = lambda x: x[1], reverse=True)).keys())
    total_rec_list[user] = rec_list[0:200]

HBox(children=(FloatProgress(value=0.0, max=53382.0), HTML(value='')))




In [32]:
evaluate_func = evaluate(recs=total_rec_list, gt = gt, topn=200)
evaluate_func._evaluate()

MAP@200: 0.06359985421018921
NDCG@200: 0.19385573523449956
EntDiv@200: 8.905510194684979


### 태그를 통한 유사도 계산 

In [33]:
book_tags.columns = ['book_id', 'tag_id', 'count']
book_tags['book_id'] = book_tags['book_id'].astype(str)
book_tags['tag_id'] = book_tags['tag_id'].astype(str)

tags['tag_id'] = tags['tag_id'].astype(str)

book_tags = pd.merge(book_tags, tags, how='left', on='tag_id')
book_tags.head()

Unnamed: 0,book_id,tag_id,count,tag_name
0,1,30574,167697,to-read
1,1,11305,37174,fantasy
2,1,11557,34173,favorites
3,1,8717,12986,currently-reading
4,1,33114,12716,young-adult


In [34]:
agg = book_tags.groupby(['book_id'])['tag_name'].agg({'unique'}).reset_index()
agg.head()

Unnamed: 0,book_id,unique
0,1,"[to-read, fantasy, favorites, currently-readin..."
1,10,"[to-read, favorites, fantasy, currently-readin..."
2,10006,"[to-read, fiction, currently-reading, rory-gil..."
3,1000751,"[to-read, classics, childrens, fiction, curren..."
4,10008056,"[to-read, default, currently-reading, krimi, c..."


In [35]:
# 태그간의 유사도 계산 
# int형식은 Word2vec에서 학습이 안되어서 String으로 변경해줍니다. 
sentence = []
for user_sentence in agg['unique'].values:
    sentence.append(list(map(str, user_sentence)))

In [36]:
from gensim.models import doc2vec
doc_vectorizer = doc2vec.Doc2Vec(
    dm=0,            # PV-DBOW / default 1
    dbow_words=1,    # w2v simultaneous with DBOW d2v / default 0
    window=10,        # distance between the predicted word and context words
    size=100,        # vector size
    alpha=0.025,     # learning-rate
    seed=1234,
    min_count=5,    # ignore with freq lower
    min_alpha=0.025, # min learning-rate
    workers=4,   # multi cpu
    hs = 1,          # hierar chical softmax / default 0
    negative = 10   # negative sampling / default 5
)

In [37]:
from collections import namedtuple

TaggedDocument = namedtuple('TaggedDocument', 'words tags')
tagged_train_docs = [TaggedDocument(c, [d]) for c, d in agg[['unique', 'book_id']].values]

In [38]:
doc_vectorizer.build_vocab(tagged_train_docs)
print(str(doc_vectorizer))

Doc2Vec(dbow+w,d100,n10,hs,w10,mc5,s0.001,t4)


In [39]:
# 벡터 문서 학습
from time import time

start = time()

for epoch in tqdm(range(5)):
    doc_vectorizer.train(tagged_train_docs, total_examples=doc_vectorizer.corpus_count, epochs=doc_vectorizer.iter)
    doc_vectorizer.alpha -= 0.002 # decrease the learning rate
    doc_vectorizer.min_alpha = doc_vectorizer.alpha # fix the learning rate, no decay

#doc_vectorizer.train(tagged_train_docs, total_examples=doc_vectorizer.corpus_count, epochs=doc_vectorizer.iter)
end = time()
print("During Time: {}".format(end-start))

HBox(children=(FloatProgress(value=0.0, max=5.0), HTML(value='')))


During Time: 407.58349442481995


In [40]:
doc_vectorizer.docvecs.most_similar('1', topn=20)

[('136251', 0.8540500402450562),
 ('15881', 0.8274269104003906),
 ('5', 0.7960681915283203),
 ('6', 0.7919667959213257),
 ('3', 0.7812910676002502),
 ('862041', 0.770203709602356),
 ('2', 0.7157841920852661),
 ('10', 0.7057254314422607),
 ('99298', 0.683951735496521),
 ('1317181', 0.6641982793807983),
 ('3950967', 0.6619352102279663),
 ('28187', 0.6554629802703857),
 ('100464', 0.6545640230178833),
 ('111450', 0.6466459631919861),
 ('6294', 0.6448465585708618),
 ('4502507', 0.6405039429664612),
 ('28194', 0.6279330253601074),
 ('13837', 0.6260802745819092),
 ('18116', 0.6238935589790344),
 ('119322', 0.6228699088096619)]

In [41]:
train.head()

Unnamed: 0,user_id,book_id,title
0,1,4893,
1,2,8855,
2,3,9049,
3,4,3273,Moloka'i
4,5,4829,


In [42]:
# tag 정보가 있는 책이 있고 아닌 책이 있어서 해당 책만 추출 
agg['type'] = '1'
train = pd.merge(train, agg, how='left', on='book_id')

In [43]:
## 전체 영화에 대해서 진행 
total_rec_list = {}

read_list1 = train.groupby(['user_id'])['book_id'].agg({'unique'}).reset_index()
read_list2 = train[train['type'] == '1'].groupby(['user_id'])['book_id'].agg({'unique'}).reset_index()
for user in tqdm(train['user_id'].unique()):
    rec_list = []
    if user in read_list2['user_id'].unique():
        seen = read_list2[read_list2['user_id'] == user]['unique'].values[0]
        doc2vec_dict = {}
        for book in seen: 
            for i in doc_vectorizer.docvecs.most_similar(positive=[book], topn=300): 
                if i[0] not in doc2vec_dict.keys(): 
                    doc2vec_dict[i[0]] = i[1]
                else:
                    doc2vec_dict[i[0]] += i[1]

        rec_list = list(dict(sorted(doc2vec_dict.items(), key = lambda x: x[1], reverse=True)).keys())
    else:
        
        seen = read_list1[read_list1['user_id'] == user]['unique'].values[0]
        for rec in popular_rec_model[0:300]:
            if rec not in seen:
                rec_list.append(rec)
    total_rec_list[user] = rec_list[0:200]

HBox(children=(FloatProgress(value=0.0, max=53382.0), HTML(value='')))




In [44]:
evaluate_func = evaluate(recs=total_rec_list, gt = gt, topn=200)
evaluate_func._evaluate()

MAP@200: 0.00016221527094582744
NDCG@200: 0.0016506821367251478
EntDiv@200: 6.98870286802475
