In [None]:
# 책 추천 예제
# https://towardsdatascience.com/how-did-we-build-book-recommender-systems-in-an-hour-part-2-k-nearest-neighbors-and-matrix-c04b3c2ef55c
import pandas as pd
import numpy as np
from scipy.sparse import csr_matrix
import sklearn
from sklearn.decomposition import TruncatedSVD

def _read_bx_csv(path, column_names):
    """Load one semicolon-separated, latin-1 encoded CSV and rename its columns.

    Malformed rows are skipped with a warning rather than aborting the load.
    ``on_bad_lines='warn'`` (pandas >= 1.3) replaces ``error_bad_lines=False``,
    which was deprecated in pandas 1.3 and removed in pandas 2.0.

    Parameters
    ----------
    path : str
        Location of the CSV file.
    column_names : list of str
        Names assigned to the columns, in file order.

    Returns
    -------
    pandas.DataFrame
    """
    frame = pd.read_csv(path, sep=';', on_bad_lines='warn', encoding='latin-1')
    frame.columns = column_names
    return frame


book = _read_bx_csv(
    'BX-Books.csv',
    ['ISBN', 'bookTitle', 'bookAuthor', 'yearOfPublication', 'publisher', 'imageUrlS', 'imageUrlM', 'imageUrlL'],
)
user = _read_bx_csv('BX-Users.csv', ['userID', 'Location', 'Age'])
rating = _read_bx_csv('BX-Book-Ratings.csv', ['userID', 'ISBN', 'bookRating'])

In [None]:
# Preview the first rows of the book metadata table.
book.head()

In [None]:
# Preview the first rows of the user table.
user.head()

In [None]:
# Preview the first rows of the rating table.
rating.head()

In [None]:
# Join ratings with book metadata on ISBN, then discard the book columns
# that the rest of the notebook does not use.
columns = ['yearOfPublication', 'publisher', 'imageUrlS', 'imageUrlM', 'imageUrlL']
combine_book_rating = rating.merge(book, on='ISBN').drop(columns=columns)
combine_book_rating.head()

In [None]:
# Spot-check one title: every rating row recorded for "Flesh Tones: A Novel".
is_flesh_tones = combine_book_rating["bookTitle"].eq("Flesh Tones: A Novel")
combine_book_rating.loc[is_flesh_tones]

In [None]:
# Discard the rating rows that have no book title.
has_title = combine_book_rating['bookTitle'].notna()
combine_book_rating = combine_book_rating[has_title]

In [None]:
# For every title, count the (non-missing) ratings it received and expose the
# result as a two-column frame: bookTitle, totalRatingCount.
ratings_per_title = combine_book_rating.groupby('bookTitle')['bookRating'].count()
book_ratingCount = ratings_per_title.rename('totalRatingCount').reset_index()

In [None]:
# Spot-check the rating count computed for "Flesh Tones: A Novel".
book_ratingCount.loc[book_ratingCount["bookTitle"].eq("Flesh Tones: A Novel")]

In [None]:
# Preview the per-title rating counts.
book_ratingCount.head()

In [None]:
# Attach each title's total rating count to every individual rating row
# (left join of combine_book_rating with book_ratingCount on bookTitle).
rating_with_totalRatingCount = pd.merge(
    combine_book_rating,
    book_ratingCount,
    how='left',
    on='bookTitle',
)
rating_with_totalRatingCount.head()

In [None]:
# Summary statistics of how many ratings each book received
# (floats are displayed with three decimals from here on).
pd.options.display.float_format = '{:.3f}'.format
rating_count_summary = book_ratingCount['totalRatingCount'].describe()
print(rating_count_summary)

In [None]:
# Per the statistics above, even the median book was rated only once,
# so look at the rating-count quantiles from 0.90 upward in steps of 0.01.
top_quantiles = np.arange(.9, 1, .01)
print(book_ratingCount['totalRatingCount'].quantile(top_quantiles))

In [None]:
# Author's note: only about 1% of the books received 50 or more ratings.
# The data holds too many books, so keep only that top 1%
# (2444 books in total, per the author's note).
popularity_threshold = 50
is_popular = rating_with_totalRatingCount['totalRatingCount'] >= popularity_threshold
rating_popular_book = rating_with_totalRatingCount[is_popular]
rating_popular_book.head()

In [None]:
# Number of distinct titles that passed the popularity filter.
len(rating_popular_book['bookTitle'].unique())

In [None]:
# Join every popular-book rating with its user's profile (left join on userID),
# so that the ratings can afterwards be filtered by the user's Location.
combined = pd.merge(rating_popular_book, user, how='left', on='userID')


In [None]:
# Sanity check: preview the ratings whose user Location mentions "usa" or "canada".
# The pattern previously read "use|canada" (typo for "usa").
# na=False treats a missing Location as "no match" so the mask is purely boolean.
combined[combined['Location'].str.contains("usa|canada", na=False)]

In [None]:
# Keep only the ratings from users whose Location mentions "usa" or "canada";
# the Age column is not needed afterwards.
# na=False: a missing Location counts as "no match", so the mask never contains
# NaN (pandas refuses to index with a mask that contains NaN).
# NOTE(review): this is a substring match, so any Location that merely contains
# the letters "usa" passes too — confirm that is acceptable.
in_us_or_canada = combined['Location'].str.contains("usa|canada", na=False)
us_canada_user_rating = combined[in_us_or_canada].drop(columns='Age')
us_canada_user_rating.head()

# 입력 데이터 만들기

In [None]:
# Keep a single row per (userID, bookTitle) pair and report how many rows were dropped.
dedup_keys = ['userID', 'bookTitle']
if us_canada_user_rating.duplicated(dedup_keys).any():
    initial_rows = len(us_canada_user_rating)
    print('Initial dataframe shape {0}'.format(us_canada_user_rating.shape))

    us_canada_user_rating = us_canada_user_rating.drop_duplicates(dedup_keys)
    current_rows = len(us_canada_user_rating)
    print('New dataframe shape {0}'.format(us_canada_user_rating.shape))
    print('Removed {0} rows'.format(initial_rows - current_rows))

In [None]:
# Display the de-duplicated US/Canada rating table.
us_canada_user_rating

In [None]:
# Book x user rating matrix: one row per title, one column per user,
# with 0 wherever the user did not rate the book.
us_canada_user_rating_pivot = (
    us_canada_user_rating
    .pivot(index='bookTitle', columns='userID', values='bookRating')
    .fillna(0)
)
# The same values stored as a compressed sparse row matrix.
us_canada_user_rating_matrix = csr_matrix(us_canada_user_rating_pivot.to_numpy())

In [None]:
# Preview the book x user rating matrix.
us_canada_user_rating_pivot.head()

In [None]:
# (number of books, number of users)
us_canada_user_rating_pivot.shape

In [None]:
# Data type: Compressed Sparse Row matrix
us_canada_user_rating_matrix

# Item에 kNN 적용

In [None]:
from sklearn.neighbors import NearestNeighbors

# Brute-force nearest-neighbour search over the book rows, using cosine distance.
knn_params = {'metric': 'cosine', 'algorithm': 'brute'}
model_knn = NearestNeighbors(**knn_params)
model_knn.fit(us_canada_user_rating_matrix)

In [None]:
# Draw a random book (row position) and show its rating vector.
n_books = us_canada_user_rating_pivot.shape[0]
query_index = np.random.choice(n_books)
print(query_index)
us_canada_user_rating_pivot.iloc[query_index]

In [None]:
#print(us_canada_user_rating_pivot.iloc[query_index, :].values)
#print(us_canada_user_rating_pivot.iloc[query_index, :].values.reshape(1, -1))

In [None]:
#ttemp = us_canada_user_rating_pivot.iloc[query_index, :].values.reshape(1, -1)[0]
#ttemp[ttemp > 0]

In [None]:
# Draw a random book, fetch its 6 nearest rows from the kNN model and print
# neighbours 1..5 (the first returned neighbour is not printed).
query_index = np.random.choice(us_canada_user_rating_pivot.shape[0])
query_vector = us_canada_user_rating_pivot.iloc[query_index, :].values.reshape(1, -1)
distances, indices = model_knn.kneighbors(query_vector, n_neighbors = 6)
print(query_index)

book_titles = us_canada_user_rating_pivot.index
print('Recommendations for {0}:\n'.format(book_titles[query_index]))
neighbours = zip(indices.flatten()[1:], distances.flatten()[1:])
for rank, (neighbour_pos, distance) in enumerate(neighbours, start=1):
    print('{0}: {1}, with distance of {2}:'.format(rank, book_titles[neighbour_pos], distance))

In [None]:
# Raw kNN output: neighbour row positions, then their distances.
print(indices, distances, sep='\n')

# Item-based Collaborative Filtering Using Matrix Factorization

In [None]:
# This time put the users on the rows: user x book rating matrix,
# with 0 wherever the user did not rate the book.
user_book_ratings = us_canada_user_rating.pivot(
    index='userID',
    columns='bookTitle',
    values='bookRating',
)
us_canada_user_rating_pivot2 = user_book_ratings.fillna(0)

In [None]:
# Preview the user x book rating matrix.
us_canada_user_rating_pivot2.head()

In [None]:
# (number of users, number of books)
us_canada_user_rating_pivot2.shape

In [None]:
# Column labels of the user x book matrix (the book titles).
us_canada_user_rating_pivot2.columns

In [None]:
# Transpose the raw user x book values into a book x user frame.
# Row/column labels are still the default integers at this point.
book_by_user_values = us_canada_user_rating_pivot2.to_numpy().transpose()
X = pd.DataFrame(book_by_user_values)
X.head()

In [None]:
#us_canada_user_rating_pivot2.T

In [None]:
# Label X: columns take the user IDs, rows take the book titles.
X.columns, X.index = us_canada_user_rating_pivot2.index, us_canada_user_rating_pivot2.columns
X.head()

In [None]:
# Truncated SVD: reduce each row of X to 12 components (seeded for repeatability).
import sklearn
from sklearn.decomposition import TruncatedSVD

svd_params = dict(n_components=12, random_state=17)
SVD = TruncatedSVD(**svd_params)
matrix = SVD.fit_transform(X)
matrix.shape

In [None]:
# Correlation-coefficient matrix between the rows of the SVD-reduced matrix.
import warnings

# RuntimeWarning is silenced from here on.
warnings.simplefilter("ignore", RuntimeWarning)
corr = np.corrcoef(matrix)
corr.shape

In [None]:
# Find the position of the query book among the book titles.
us_canada_book_title = us_canada_user_rating_pivot2.columns
us_canada_book_list = us_canada_book_title.tolist()
query_title = "The Green Mile: Coffey's Hands (Green Mile Series)"
coffey_hands = us_canada_book_list.index(query_title)
print(coffey_hands)

In [None]:
# Row of the correlation matrix that belongs to the query book.
corr_coffey_hands  = corr[coffey_hands]

In [None]:
# Correlation of the query book with every book.
print(corr_coffey_hands)

In [None]:
# Titles whose correlation with the query book exceeds 0.9.
# The query book is excluded by position rather than by `corr < 1.0`: its
# self-correlation equals 1 only up to floating-point rounding, so the strict
# comparison may fail to remove it, and it would also discard any other book
# whose correlation happens to round to exactly 1.0.
is_query_book = np.arange(len(corr_coffey_hands)) == coffey_hands
list(us_canada_book_title[(corr_coffey_hands > 0.9) & ~is_query_book])

# Collaborative Filtering Using NMF

In [None]:
# Display the book x user rating matrix.
X

In [None]:
# Number of items each user rated: count the strictly positive entries per user column.
review_count = pd.DataFrame((X > 0).sum())


In [None]:
# IDs of the users who rated more than 20 items.
rated_more_than_20 = review_count[0].gt(20)
id_selected = review_count.index[rated_more_than_20.to_numpy()]

In [None]:
# Restrict X to the columns of the selected user IDs.
X2 = X.loc[:, id_selected]

In [None]:
# Preview the matrix restricted to the selected users.
X2.head()

In [None]:
# Create the NMF model object with 200 components.
# random_state fixes the seed so that repeated runs give the same factorization
# (same seed value as the TruncatedSVD cell).
from sklearn.decomposition import NMF
model = NMF(n_components=200, random_state=17)

In [None]:
# Fit the NMF model on the book x user matrix X2.
# W is the book x cluster (component) matrix.
W = model.fit_transform(X2)

In [None]:
# H is the cluster (component) x user matrix.
H = model.components_

In [None]:
# (number of components, number of users)
H.shape

In [None]:
# Turn H into a DataFrame (values rounded to 2 decimals) whose columns carry X2's column labels.
rounded_components = model.components_.round(2)
H = pd.DataFrame(rounded_components, columns=X2.columns)

In [None]:
# Turn W into a DataFrame: transform X2 with the fitted model, round to
# 2 decimals, and label the rows with X's row labels.
rounded_weights = model.transform(X2).round(2)
W = pd.DataFrame(rounded_weights, index=X.index)

In [None]:
# Preview the component x user matrix.
H.head()

In [None]:
# Preview the book x component matrix.
W.head()

In [None]:
# Reconstruct the rating matrix as W x H (rounded to 2 decimals) and give it
# the same row and column labels as X2.
approx_ratings = np.round(np.dot(W, H), 2)
reconstructed = pd.DataFrame(approx_ratings, index=X2.index, columns=X2.columns)

In [None]:
# Display the reconstructed rating matrix.
reconstructed

In [None]:
# Reconstructed scores for user id 638 that are greater than 3.
temp = reconstructed.loc[reconstructed[638] > 3, [638]]

In [None]:
# How many reconstructed scores for user 638 are greater than 3.
int((reconstructed[638] > 3).sum())

In [None]:
# Display the selected scores for user 638.
temp

In [None]:
# Show the selected scores for user 638 from highest to lowest.
temp.sort_values(by=638,ascending=False)

In [None]:
# Original ratings given by user id 638 that are greater than 0.
X.loc[X[638] > 0, [638]]

In [None]:
# Number of items each user rated: count the strictly positive entries per user column.
review_count = pd.DataFrame((X > 0).sum())

In [None]:
# Rows at positions 100..199 of the per-user rating counts.
review_count.iloc[100:200]

## 이번에는 같은 내용을 User-Book Matrix를 이용하여 계산해본다.

In [None]:
# Transpose X2 so that its former columns (users) become the rows.
X2_tr = X2.T

In [None]:
# Preview the transposed matrix.
X2_tr.head()

In [None]:
# Factorize the transposed matrix with its own NMF instance.
# This cell used to create `model_tr` but then fit and read the earlier `model`,
# which left `model_tr` unused and overwrote the first factorization held by `model`.
# random_state fixes the seed so reruns are repeatable (same value as the SVD cell).
model_tr = NMF(n_components=200, random_state=17)
W = model_tr.fit_transform(X2_tr)
# Learned components, rounded to 2 decimals, labelled with X2_tr's columns.
H = pd.DataFrame(np.round(model_tr.components_, 2), columns=X2_tr.columns)

In [None]:
# Reconstruct the transposed matrix as W x H (rounded to 2 decimals),
# labelled like X2_tr.
approx_ratings_tr = np.round(np.dot(W, H), 2)
reconstructed = pd.DataFrame(approx_ratings_tr, index=X2_tr.index, columns=X2_tr.columns)

In [None]:
# Display the reconstructed (transposed) matrix.
reconstructed

In [None]:
# Transpose back so the layout matches X2 again.
reconstructed2 = reconstructed.T

In [None]:
# Display the transposed-back reconstruction.
reconstructed2

In [None]:
# How many reconstructed scores for user 638 are greater than 2.
int((reconstructed2[638] > 2).sum())

In [None]:
# Reconstructed scores above 2 for user 638, shown from highest to lowest.
temp = reconstructed2.loc[reconstructed2[638] > 2, [638]]
temp.sort_values(638, ascending=False)