In [26]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split

In [55]:
# Load the combined ratings table from a path relative to the working directory.
# NOTE(review): the name `all` shadows Python's built-in all() from here on.
# Other cells reference this name, so renaming it means touching every one of them.
all = pd.read_csv('data_new/all_data.csv', encoding='utf-8')
all.head()

Unnamed: 0,User-ID,Location,Age,country,ISBN,Book-Rating,Book-Title,Book-Author,Year-Of-Publication,Publisher
0,183,"porto, porto, portugal",27.0,portugal,8401422825,9,Fahrenheit 451,Ray Bradbury,1993.0,"Plaza &amp Janes Editores, S.A."
1,183,"porto, porto, portugal",27.0,portugal,8420639133,0,Temor y Temblor,Soren Kierkegaard,2001.0,Alianza
2,183,"porto, porto, portugal",27.0,portugal,8433920081,0,El Almuerzo Desnudo,William S. Burroughs,1995.0,Anagrama
3,183,"porto, porto, portugal",27.0,portugal,8440630794,7,Cuentos del Planeta Tierra,Arthur C. Clarke,1993.0,Ediciones B
4,183,"porto, porto, portugal",27.0,portugal,8476409419,8,Estudios sobre el amor,Jose Ortega Y Gaset,2001.0,Downtown Book Center


In [56]:
# List the column labels of the loaded table.
all.columns

Index(['User-ID', 'Location', 'Age', 'country', 'ISBN', 'Book-Rating',
       'Book-Title', 'Book-Author', 'Year-Of-Publication', 'Publisher'],
      dtype='object')

In [57]:
# Discard the descriptive book/location columns; what remains is
# User-ID, Age, country, ISBN and Book-Rating.
user_ratings = all.drop(
    columns=['Location', 'Book-Title', 'Book-Author', 'Year-Of-Publication', 'Publisher']
)

# Remove rating records whose rating is 0.
user_no_ratings_index = user_ratings.index[user_ratings['Book-Rating'].eq(0).to_numpy()]
user_ratings = user_ratings.drop(index=user_no_ratings_index)

# Remove rating records of users younger than 8 or older than 100.
# Rows with a missing Age compare False on both sides and are therefore kept.
user_age_outliers_index = user_ratings.index[
    (user_ratings['Age'].lt(8) | user_ratings['Age'].gt(100)).to_numpy()
]
user_ratings = user_ratings.drop(index=user_age_outliers_index)

user_ratings.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 35297 entries, 0 to 100725
Data columns (total 5 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   User-ID      35297 non-null  int64  
 1   Age          35297 non-null  float64
 2   country      35215 non-null  object 
 3   ISBN         35297 non-null  object 
 4   Book-Rating  35297 non-null  int64  
dtypes: float64(1), int64(2), object(2)
memory usage: 1.6+ MB


In [59]:
# Bucket every rater's age by decade (27.0 -> 2, 37.0 -> 3) using floor division
# on the whole column at once, then store the buckets as an integer column.
age_group_values = (user_ratings['Age'] // 10).astype('int64').tolist()
user_ratings['Age-Group'] = age_group_values
user_ratings.head()

Unnamed: 0,User-ID,Age,country,ISBN,Book-Rating,Age-Group
0,183,27.0,portugal,8401422825,9,2
3,183,27.0,portugal,8440630794,7,2
4,183,27.0,portugal,8476409419,8,2
5,242,37.0,germany,3257203659,9,3
6,242,37.0,germany,3257207522,10,3


In [60]:
# x is a full copy of user_ratings, so it still contains the 'Book-Rating' column;
# the resulting train/test frames therefore carry their own labels.
x = user_ratings.copy()
y = user_ratings['Book-Rating']
# 70/30 split, stratified on the rating value, with a fixed seed for repeatability.
train, test, y_train, y_test = train_test_split(x, y, test_size=0.3, stratify=y, random_state=42)

train.head()

Unnamed: 0,User-ID,Age,country,ISBN,Book-Rating,Age-Group
78427,216401,30.0,france,2842190114,8,3
87884,240567,33.0,usa,044022165X,6,3
58310,163319,34.0,germany,3257056109,9,3
67614,186593,33.0,france,2253141976,5,3
898,3538,31.0,spain,8426105084,5,3


In [61]:
def RMSE(y_true, y_pred):
    """Return the root-mean-square error between true and predicted values.

    Both arguments are converted with ``np.array`` before subtraction, so any
    array-like (list, ndarray, pandas Series) is accepted.
    """
    residuals = np.array(y_true) - np.array(y_pred)
    mean_squared_error = np.mean(residuals ** 2)
    return np.sqrt(mean_squared_error)

def baseline(user_id, ISBN):
    """Predict a rating as the mean rating stored for the book in ``train_mean``.

    Parameters
    ----------
    user_id : unused; present so the function matches the ``model(user_id, ISBN)``
        signature that ``score`` calls.
    ISBN : key looked up in the module-level ``train_mean``.

    Returns
    -------
    The value stored for ``ISBN`` in ``train_mean``, or 5.5 when ``train_mean``
    has no entry for it (presumably the midpoint of the rating scale).
    """
    try:
        return train_mean[ISBN]
    except KeyError:
        # Only an unknown book falls back to the default. Anything else, such as
        # train_mean not being defined yet, must surface rather than silently
        # score every prediction as 5.5.
        return 5.5

def score(model):
    """Evaluate ``model`` on the module-level ``test`` split and return its RMSE.

    ``model`` is called as ``model(user_id, ISBN)`` once per row of ``test``,
    in row order; the collected predictions are compared with the
    'Book-Rating' column through ``RMSE``.
    """
    predictions = []
    for uid, isbn in zip(test['User-ID'], test['ISBN']):
        predictions.append(model(uid, isbn))
    actual = np.array(test['Book-Rating'])
    return RMSE(actual, np.array(predictions))

# Mean training rating per book, indexed by ISBN; baseline() reads this global,
# so it must be bound before score(baseline) is called on the next line.
train_mean = train.groupby(['ISBN'])['Book-Rating'].mean()
# Evaluate the per-book-mean baseline on the held-out split (RMSE).
print(f"Bestseller Model Score: {score(baseline)}")

# Wide user x book table of training ratings: one row per User-ID, one column per ISBN.
# In the cells shown here it is only used for `ISBN in rating_matrix`, i.e. a
# column-membership test for "was this book seen in training".
# NOTE(review): DataFrame.pivot raises ValueError if a (User-ID, ISBN) pair occurs twice.
rating_matrix = train.pivot(values='Book-Rating', index='User-ID', columns='ISBN')
rating_matrix.head()

Bestseller Model Score: 2.506472541236802


ISBN,000104799X,000160418X,000225056X,000412913X,000458726X,000470973X,000599134X,000647425X,000648185X,000648199X,...,B0000C2W5U,B0000C7BNG,B0000DAPP1,B0000T6KHI,B0000T6KIM,B0000VZEH8,B0000VZEJQ,B0001FZGRQ,B0001GMSV2,B000234N3A
User-ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
183,,,,,,,,,,,...,,,,,,,,,,
242,,,,,,,,,,,...,,,,,,,,,,
254,,,,,,,,,,,...,,,,,,,,,,
388,,,,,,,,,,,...,,,,,,,,,,
392,,,,,,,,,,,...,,,,,,,,,,


In [62]:
# Mean rating per (ISBN, rater country): for each book, the average score given
# by training users from each country. Rows with a missing country form no group.
country_mean = (
    train
    .groupby(['ISBN', 'country'])['Book-Rating']
    .mean()
)
country_mean

ISBN        country       
000104799X  canada             7.0
000160418X  england            7.0
000225056X  usa               10.0
000412913X  united kingdom     8.0
000458726X  new zealand        9.0
                              ... 
B0000VZEJQ  germany           10.0
B0001FZGRQ  usa                9.0
B0001GMSV2  canada             8.0
            usa                8.0
B000234N3A  usa                9.0
Name: Book-Rating, Length: 18318, dtype: float64

In [80]:
# User table keyed by User-ID. Note the column here is 'Country' (capital C),
# unlike the lower-case 'country' column of the ratings table.
users = pd.read_csv('data_new/users_with_country.csv').set_index('User-ID')

# Decade bucket of each user's age (27.0 -> 2), computed on the whole column at once.
age_group = (users['Age'] // 10).astype('int64').tolist()
users['Age-Group'] = age_group
users.head()

Unnamed: 0_level_0,Location,Age,Country,Age-Group
User-ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
183,"porto, porto, portugal",27.0,portugal,2
242,"neuffen, baden-wuerttemberg, germany",37.0,germany,3
254,"minneapolis, minnesota, usa",24.0,usa,2
388,"nashville, tennessee, usa",35.0,usa,3
392,"barcelona, catalunya, spain",44.0,spain,4


In [81]:
def cf_country(user_id, ISBN):
    """Predict a rating as the mean given to the book by users from the same country.

    Parameters
    ----------
    user_id : label looked up in the index of the module-level ``users`` table.
    ISBN : book identifier; checked against the columns of ``rating_matrix``
        to decide whether the book was seen in training.

    Returns
    -------
    ``country_mean[(ISBN, country)]`` when that pair exists, otherwise 5.5.
    The fallback covers a book unseen in training, a country with no training
    rating for the book, and a user whose country is missing.
    """
    if ISBN not in rating_matrix:  # book does not occur in the training data
        return 5.5

    country = users.loc[user_id, 'Country']  # country of the requesting user

    # Test the full (ISBN, country) key in one step. A book seen in training can
    # still be absent from country_mean when every training rater of it had a
    # missing country, because groupby drops NaN keys. Slicing country_mean[ISBN]
    # first would raise KeyError in that case.
    key = (ISBN, country)
    if key in country_mean.index:
        return country_mean.loc[key]
    return 5.5

# Evaluate cf_country on the held-out split via score() and print the resulting RMSE.
print(f"cf_country Model Score: {score(cf_country)}")

cf_country Model Score: 2.5906243177721056


In [84]:
# Mean rating per (ISBN, rater age group): for each book, the average score given
# by training users in each decade-of-age bucket.
age_group_mean = (
    train
    .groupby(['ISBN', 'Age-Group'])['Book-Rating']
    .mean()
)
age_group_mean.head()

ISBN        Age-Group
000104799X  6             7.0
000160418X  6             7.0
000225056X  2            10.0
000412913X  4             8.0
000458726X  2             8.0
Name: Book-Rating, dtype: float64

In [85]:
def cf_age(user_id, ISBN):
    """Predict a rating as the mean given to the book by users of the same age group.

    Returns the entry of ``age_group_mean`` for (ISBN, the user's 'Age-Group');
    returns 5.5 when the book is not a column of ``rating_matrix`` or when no
    training rater of the book shares the user's age group.
    """
    if ISBN not in rating_matrix:  # book does not occur in the training data
        return 5.5

    user_age_group = (users.loc[user_id])['Age-Group']  # age bucket of the requesting user
    per_group_means = age_group_mean[ISBN]  # this book's means, indexed by age group
    if user_age_group in per_group_means:
        return per_group_means[user_age_group]
    return 5.5

# Evaluate cf_age on the held-out split via score() and print the resulting RMSE.
print(f"cf_age Model Score: {score(cf_age)}")

cf_age Model Score: 2.6659366532464763
