In [2]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import StandardScaler
from sklearn.metrics.pairwise import cosine_similarity

In [44]:
# Load the per-rating table and discard the 'Unnamed: 0' column (presumably an
# index saved by an earlier to_csv call -- TODO confirm).
# NOTE(review): `all` shadows the Python builtin of the same name; the name is
# kept because later cells refer to it.
all = pd.read_csv('data_new/all_data_with_author_cluster.csv').drop(columns=['Unnamed: 0'])
all.head()

Unnamed: 0,User-ID,ISBN,Book-Rating,Location,Age,country,Book-Title,Book-Author,Year-Of-Publication,Publisher,Author_2cluster,Author_3cluster
0,276762,3453092007,8,"duisburg, nordrhein-westfalen, germany",25.0,germany,Die zweite Haut.,Dean Koontz,1995.0,Heyne,1,2
1,159390,3404122879,10,"dannenfels, rheinland-pfalz, germany",36.0,germany,"Vater Himmel, Mutter Erde.",Sue Harrison,1995.0,L?¼bbe,2,3
2,159390,1853260207,6,"dannenfels, rheinland-pfalz, germany",36.0,germany,Jane Eyre (Wordsworth Classics),Charlotte Bronte,1997.0,NTC/Contemporary Publishing Company,2,2
3,159390,3442426677,7,"dannenfels, rheinland-pfalz, germany",36.0,germany,Die Grast?¤nzer.,Susan Power,1997.0,Goldmann,1,2
4,159390,3426771365,8,"dannenfels, rheinland-pfalz, germany",36.0,germany,Nieten in Nadelstreifen. Deutschlands Manager ...,G?¼nter Ogger,1995.0,"Droemersche Verlagsanstalt Th. Knaur Nachf., G...",1,1


In [45]:
# Flag rows whose reader age or publication year lies outside the accepted
# range, then remove them by index label. Comparisons against NaN are False,
# so rows with a missing age or year are not flagged.
age_out_of_range = (all['Age'] < 8) | (all['Age'] > 100)
year_out_of_range = (all['Year-Of-Publication'] >= 2020) | (all['Year-Of-Publication'] < 1500)
outlier_idx = all.loc[age_out_of_range | year_out_of_range].index
all = all.drop(outlier_idx)
all.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 35292 entries, 0 to 35521
Data columns (total 12 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   User-ID              35292 non-null  int64  
 1   ISBN                 35292 non-null  object 
 2   Book-Rating          35292 non-null  int64  
 3   Location             35292 non-null  object 
 4   Age                  35292 non-null  float64
 5   country              35210 non-null  object 
 6   Book-Title           33397 non-null  object 
 7   Book-Author          33396 non-null  object 
 8   Year-Of-Publication  33397 non-null  float64
 9   Publisher            33397 non-null  object 
 10  Author_2cluster      35292 non-null  int64  
 11  Author_3cluster      35292 non-null  int64  
dtypes: float64(2), int64(4), object(6)
memory usage: 3.5+ MB


In [46]:
# Decade bucket of each reader's age (e.g. 25.0 -> 2, 36.0 -> 3), computed with
# floor division on the whole column and kept as a plain list of ints.
age_group_values = (all['Age'] // 10).astype(int).tolist()
all['Age-Group'] = age_group_values
all.head()

Unnamed: 0,User-ID,ISBN,Book-Rating,Location,Age,country,Book-Title,Book-Author,Year-Of-Publication,Publisher,Author_2cluster,Author_3cluster,Age-Group
0,276762,3453092007,8,"duisburg, nordrhein-westfalen, germany",25.0,germany,Die zweite Haut.,Dean Koontz,1995.0,Heyne,1,2,2
1,159390,3404122879,10,"dannenfels, rheinland-pfalz, germany",36.0,germany,"Vater Himmel, Mutter Erde.",Sue Harrison,1995.0,L?¼bbe,2,3,3
2,159390,1853260207,6,"dannenfels, rheinland-pfalz, germany",36.0,germany,Jane Eyre (Wordsworth Classics),Charlotte Bronte,1997.0,NTC/Contemporary Publishing Company,2,2,3
3,159390,3442426677,7,"dannenfels, rheinland-pfalz, germany",36.0,germany,Die Grast?¤nzer.,Susan Power,1997.0,Goldmann,1,2,3
4,159390,3426771365,8,"dannenfels, rheinland-pfalz, germany",36.0,germany,Nieten in Nadelstreifen. Deutschlands Manager ...,G?¼nter Ogger,1995.0,"Droemersche Verlagsanstalt Th. Knaur Nachf., G...",1,1,3


In [47]:
# Persist the filtered table (now including 'Age-Group') as UTF-8 CSV without
# writing the index column.
all.to_csv('data_new/all_data_with_age_group_and_author_cluster.csv', index=False, encoding='utf-8')

In [48]:
# Load the per-ISBN RGB feature table (freq/mean/median/dominant value per
# channel); presumably derived from cover images -- TODO confirm.
# NOTE(review): the variable says "movies" although rows are keyed by ISBN.
movies_with_rgb = pd.read_csv('data_new/ISBN_image_rgb.csv')
movies_with_rgb.head()

Unnamed: 0,ISBN,freq_R,freq_G,freq_B,mean_R,mean_G,mean_B,median_R,median_G,median_B,dominant_R,dominant_G,dominant_B
0,000160418X,200.937581,224.802151,249.884615,184.595336,187.272806,185.140984,193.597908,193.604444,205.783268,193,204,215
1,000712032X,140.301,101.541528,52.005924,160.825655,135.51773,93.794715,152.523578,142.941991,73.073817,128,96,57
2,013091830X,109.580252,46.130504,49.98147,107.649468,79.081354,66.716077,109.580252,59.543649,53.237324,111,61,61
3,014000971X,247.707408,245.850584,238.332888,202.526818,201.951598,191.964463,221.387688,220.476633,207.90402,237,235,226
4,014003885X,250.97222,251.416521,248.071251,147.078088,143.570468,139.169199,149.323084,143.438285,137.226505,35,30,26


In [57]:
# Keep only the columns that describe a book (drop the user/rating columns) and
# collapse repeated rows so each distinct combination appears once.
user_level_columns = ['User-ID', 'Book-Rating', 'Location', 'Age', 'country', 'Age-Group']
movies = all.drop(columns=user_level_columns).drop_duplicates()
movies.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 22089 entries, 0 to 35521
Data columns (total 7 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   ISBN                 22089 non-null  object 
 1   Book-Title           21905 non-null  object 
 2   Book-Author          21904 non-null  object 
 3   Year-Of-Publication  21905 non-null  float64
 4   Publisher            21905 non-null  object 
 5   Author_2cluster      22089 non-null  int64  
 6   Author_3cluster      22089 non-null  int64  
dtypes: float64(1), int64(2), object(4)
memory usage: 1.3+ MB


In [58]:
# Left join on ISBN: every row of `movies` is kept, and ISBNs with no record in
# the RGB table receive NaN in the RGB feature columns.
movie_info = movies.merge(movies_with_rgb, on='ISBN', how='left')

In [59]:
# Bucket each publication year into the 5-year bin starting at a multiple of 5
# (e.g. 1997.0 -> 1995.0). This is a single vectorized Series expression rather
# than a per-element Python loop; rows with a missing year simply stay NaN.
publication_year = movie_info['Year-Of-Publication']
movie_info['Pub_Year_Group'] = publication_year - (publication_year % 5)

# `pub_year` stays a plain list because the following cell passes it to
# pd.get_dummies.
pub_year = movie_info['Pub_Year_Group'].tolist()
movie_info.head()

  pub_year.append(year - (year % 5))


Unnamed: 0,ISBN,Book-Title,Book-Author,Year-Of-Publication,Publisher,Author_2cluster,Author_3cluster,freq_R,freq_G,freq_B,mean_R,mean_G,mean_B,median_R,median_G,median_B,dominant_R,dominant_G,dominant_B,Pub_Year_Group
0,3453092007,Die zweite Haut.,Dean Koontz,1995.0,Heyne,1,2,25.423763,13.25615,15.092863,94.528354,53.574605,30.860437,70.423813,21.056648,25.86546,201.0,123.0,62.0,1995.0
1,3404122879,"Vater Himmel, Mutter Erde.",Sue Harrison,1995.0,L?¼bbe,2,3,193.106621,171.382147,127.22703,149.020801,101.050603,92.814399,154.829088,75.18669,79.45613,176.0,167.0,141.0,1995.0
2,1853260207,Jane Eyre (Wordsworth Classics),Charlotte Bronte,1997.0,NTC/Contemporary Publishing Company,2,2,26.15612,25.752898,29.361271,126.011309,113.480181,99.807248,129.667024,112.531261,91.413362,201.0,184.0,162.0,1995.0
3,3442426677,Die Grast?¤nzer.,Susan Power,1997.0,Goldmann,1,2,98.330955,170.660799,69.664447,86.247736,109.982683,42.403505,91.360521,95.737149,34.171008,46.0,55.0,18.0,1995.0
4,3426771365,Nieten in Nadelstreifen. Deutschlands Manager ...,G?¼nter Ogger,1995.0,"Droemersche Verlagsanstalt Th. Knaur Nachf., G...",1,1,71.676877,66.321471,62.608258,118.671777,119.891576,112.008173,106.610079,111.613069,101.967329,59.0,57.0,54.0,1995.0


In [54]:
# Save the current `movie_info` table as UTF-8 CSV without the index column.
# NOTE(review): this cell's execution count (54) is lower than that of the
# cells above it, so the file on disk may not reflect them; re-run in order.
movie_info.to_csv('data_new/rated_movie_info.csv', index=False, encoding='utf-8')

In [61]:
# One-hot encode the 5-year publication bins, one indicator column per bin.
# With the get_dummies defaults, NaN entries get no column of their own and
# therefore produce an all-zero row.
pub_year_df = pd.get_dummies(pub_year)

In [86]:
# Append the one-hot year-bin columns to the book table; rows are aligned on
# the index of the two frames.
movie_with_pubyear = pd.concat(objs=[movie_info, pub_year_df], axis='columns')
movie_with_pubyear.head()

Unnamed: 0,ISBN,Book-Title,Book-Author,Year-Of-Publication,Publisher,Author_2cluster,Author_3cluster,freq_R,freq_G,freq_B,...,1960.0,1965.0,1970.0,1975.0,1980.0,1985.0,1990.0,1995.0,2000.0,2005.0
0,3453092007,Die zweite Haut.,Dean Koontz,1995.0,Heyne,1,2,25.423763,13.25615,15.092863,...,0,0,0,0,0,0,0,1,0,0
1,3404122879,"Vater Himmel, Mutter Erde.",Sue Harrison,1995.0,L?¼bbe,2,3,193.106621,171.382147,127.22703,...,0,0,0,0,0,0,0,1,0,0
2,1853260207,Jane Eyre (Wordsworth Classics),Charlotte Bronte,1997.0,NTC/Contemporary Publishing Company,2,2,26.15612,25.752898,29.361271,...,0,0,0,0,0,0,0,1,0,0
3,3442426677,Die Grast?¤nzer.,Susan Power,1997.0,Goldmann,1,2,98.330955,170.660799,69.664447,...,0,0,0,0,0,0,0,1,0,0
4,3426771365,Nieten in Nadelstreifen. Deutschlands Manager ...,G?¼nter Ogger,1995.0,"Droemersche Verlagsanstalt Th. Knaur Nachf., G...",1,1,71.676877,66.321471,62.608258,...,0,0,0,0,0,0,0,1,0,0


In [87]:
# Remove the descriptive text columns and the raw/binned year columns, leaving
# ISBN, the author-cluster labels, the RGB features and the year indicators.
non_feature_columns = ['Book-Title', 'Book-Author', 'Year-Of-Publication', 'Publisher', 'Pub_Year_Group']
movie_with_pubyear = movie_with_pubyear.drop(columns=non_feature_columns)
movie_with_pubyear.head()

Unnamed: 0,ISBN,Author_2cluster,Author_3cluster,freq_R,freq_G,freq_B,mean_R,mean_G,mean_B,median_R,...,1960.0,1965.0,1970.0,1975.0,1980.0,1985.0,1990.0,1995.0,2000.0,2005.0
0,3453092007,1,2,25.423763,13.25615,15.092863,94.528354,53.574605,30.860437,70.423813,...,0,0,0,0,0,0,0,1,0,0
1,3404122879,2,3,193.106621,171.382147,127.22703,149.020801,101.050603,92.814399,154.829088,...,0,0,0,0,0,0,0,1,0,0
2,1853260207,2,2,26.15612,25.752898,29.361271,126.011309,113.480181,99.807248,129.667024,...,0,0,0,0,0,0,0,1,0,0
3,3442426677,1,2,98.330955,170.660799,69.664447,86.247736,109.982683,42.403505,91.360521,...,0,0,0,0,0,0,0,1,0,0
4,3426771365,1,1,71.676877,66.321471,62.608258,118.671777,119.891576,112.008173,106.610079,...,0,0,0,0,0,0,0,1,0,0


In [88]:
# Replace every remaining NaN with 0, then list the resulting columns.
# NOTE(review): this also zero-fills the RGB features of books that had no
# match in the RGB table, before those features are standardized in a later
# cell -- confirm that treating them as 0 is intended.
movie_with_pubyear = movie_with_pubyear.fillna(0)
movie_with_pubyear.columns

Index([           'ISBN', 'Author_2cluster', 'Author_3cluster',
                'freq_R',          'freq_G',          'freq_B',
                'mean_R',          'mean_G',          'mean_B',
              'median_R',        'median_G',        'median_B',
            'dominant_R',      'dominant_G',      'dominant_B',
                  1900.0,            1910.0,            1920.0,
                  1925.0,            1930.0,            1935.0,
                  1940.0,            1945.0,            1950.0,
                  1955.0,            1960.0,            1965.0,
                  1970.0,            1975.0,            1980.0,
                  1985.0,            1990.0,            1995.0,
                  2000.0,            2005.0],
      dtype='object')

In [89]:
# Select the 12 RGB feature columns: four statistics, each with R, G and B
# channels, in the order freq, mean, median, dominant.
rgb_columns = [
    f'{stat}_{channel}'
    for stat in ('freq', 'mean', 'median', 'dominant')
    for channel in 'RGB'
]
rgb_data = movie_with_pubyear[rgb_columns]
rgb_data.head()

Unnamed: 0,freq_R,freq_G,freq_B,mean_R,mean_G,mean_B,median_R,median_G,median_B,dominant_R,dominant_G,dominant_B
0,25.423763,13.25615,15.092863,94.528354,53.574605,30.860437,70.423813,21.056648,25.86546,201.0,123.0,62.0
1,193.106621,171.382147,127.22703,149.020801,101.050603,92.814399,154.829088,75.18669,79.45613,176.0,167.0,141.0
2,26.15612,25.752898,29.361271,126.011309,113.480181,99.807248,129.667024,112.531261,91.413362,201.0,184.0,162.0
3,98.330955,170.660799,69.664447,86.247736,109.982683,42.403505,91.360521,95.737149,34.171008,46.0,55.0,18.0
4,71.676877,66.321471,62.608258,118.671777,119.891576,112.008173,106.610079,111.613069,101.967329,59.0,57.0,54.0


In [90]:
# Standardize each RGB feature column with a StandardScaler fitted on the same
# data, then wrap the resulting array in a DataFrame with the original column
# names (the new frame gets a fresh default row index).
scaler = StandardScaler()
rgb_scaled_array = scaler.fit_transform(rgb_data)
rgb_scaled = pd.DataFrame(rgb_scaled_array, columns=rgb_data.columns)
rgb_scaled.head()

Unnamed: 0,freq_R,freq_G,freq_B,mean_R,mean_G,mean_B,median_R,median_G,median_B,dominant_R,dominant_G,dominant_B
0,-0.375284,-0.484939,-0.44131,0.457858,-0.005228,-0.289844,0.091887,-0.470081,-0.323958,1.659639,0.897054,0.196443
1,1.438753,1.321571,0.910354,1.193923,0.713088,0.728991,1.150124,0.326784,0.555377,1.367693,1.461298,1.299521
2,-0.367362,-0.34217,-0.269318,0.883119,0.901149,0.843988,0.834653,0.876544,0.751575,1.659639,1.679301,1.592744
3,0.413445,1.31333,0.216495,0.346006,0.848232,-0.100018,0.354382,0.629313,-0.187678,-0.15043,0.02504,-0.41793
4,0.125094,0.121306,0.13144,0.783979,0.998154,1.044633,0.545575,0.863027,0.924748,0.001382,0.050688,0.084739


In [98]:
# Swap the raw RGB columns for their standardized versions; concat aligns the
# two frames on the row index.
movie_temp = movie_with_pubyear.drop(columns=list(rgb_scaled.columns))
movie_final = pd.concat([movie_temp, rgb_scaled], axis='columns')
# NOTE(review): the export below is disabled, yet a later cell reads
# 'data_new/movie_info_scaled.csv'; that file must already exist on disk for a
# clean Restart & Run All.
# movie_final.to_csv('data_new/movie_info_scaled.csv', index=False, encoding='utf-8')
movie_final.head()

Unnamed: 0,ISBN,Author_2cluster,Author_3cluster,1900.0,1910.0,1920.0,1925.0,1930.0,1935.0,1940.0,...,freq_B,mean_R,mean_G,mean_B,median_R,median_G,median_B,dominant_R,dominant_G,dominant_B
0,3453092007,1,2,0,0,0,0,0,0,0,...,-0.44131,0.457858,-0.005228,-0.289844,0.091887,-0.470081,-0.323958,1.659639,0.897054,0.196443
1,3404122879,2,3,0,0,0,0,0,0,0,...,0.910354,1.193923,0.713088,0.728991,1.150124,0.326784,0.555377,1.367693,1.461298,1.299521
2,1853260207,2,2,0,0,0,0,0,0,0,...,-0.269318,0.883119,0.901149,0.843988,0.834653,0.876544,0.751575,1.659639,1.679301,1.592744
3,3442426677,1,2,0,0,0,0,0,0,0,...,0.216495,0.346006,0.848232,-0.100018,0.354382,0.629313,-0.187678,-0.15043,0.02504,-0.41793
4,3426771365,1,1,0,0,0,0,0,0,0,...,0.13144,0.783979,0.998154,1.044633,0.545575,0.863027,0.924748,0.001382,0.050688,0.084739


In [99]:
# Cosine similarity between every pair of books, labelled by ISBN on both axes.
# NOTE(review): re-running this cell fails because 'ISBN' has already been moved
# into the index by the first run.

movie_final = movie_final.set_index('ISBN')
isbn_labels = movie_final.index
movie_similarity = pd.DataFrame(
    cosine_similarity(movie_final, movie_final),  # (22089, 22089)
    index=isbn_labels,
    columns=isbn_labels,
)
movie_similarity.head()

ISBN,3453092007,3404122879,1853260207,3442426677,3426771365,3404117905,3442452643,3404283139,3404139895,3404920821,...,3777611484,3785708963,078949647X,1580621317,8420430773,8429729941,8432231096,8474105706,043913224X,1558584099
ISBN,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
3453092007,1.0,0.64696,0.754845,0.471185,0.362551,0.342001,0.283455,0.612814,0.65672,0.157504,...,0.232861,0.398515,0.274864,0.332305,0.570764,0.492353,0.414715,0.154091,0.578693,0.472573
3404122879,0.64696,1.0,0.824134,0.743267,0.708774,0.333795,0.867599,0.854322,0.367237,0.801135,...,-0.236655,0.408738,0.832566,-0.175495,0.703002,0.101731,0.053982,0.734576,0.959123,0.754996
1853260207,0.754845,0.824134,1.0,0.496594,0.73424,0.289332,0.657408,0.788086,0.417076,0.492762,...,-0.193487,0.599909,0.70056,-0.123456,0.92345,0.122314,0.067639,0.662771,0.843165,0.878046
3442426677,0.471185,0.743267,0.496594,1.0,0.676206,0.662615,0.603474,0.75024,0.555265,0.667576,...,0.063463,0.486521,0.45612,0.169216,0.271459,0.379804,0.29724,0.38279,0.763988,0.480256
3426771365,0.362551,0.708774,0.73424,0.676206,1.0,0.381203,0.715912,0.829421,0.386788,0.688763,...,-0.285613,0.856811,0.646987,-0.167532,0.67754,0.053583,-0.038607,0.701196,0.794447,0.837117


## DEMOGRAPHIC MODEL

In [32]:
# Load the ratings table and keep only the columns used by the demographic
# models (user, book, rating, country, age group).

ratings = pd.read_csv('data_new/all_data_with_age_group_and_author_cluster.csv')
ratings = ratings.drop(columns=['Location', 'Age', 'Book-Title', 'Book-Author', 'Year-Of-Publication', 'Publisher', 'Author_2cluster', 'Author_3cluster'])

# Trim surrounding whitespace from the country names. Anything that is not a
# string (e.g. the NaN read_csv yields for an empty field) becomes None. The
# explicit type check replaces a bare `except:` that would also have hidden
# unrelated errors.
new_country = [
    country.strip() if isinstance(country, str) else None
    for country in ratings['country'].values
]

# Dropping and re-adding the column moves 'country' to the last position.
ratings = ratings.drop(columns=['country'])
ratings['country'] = new_country
ratings.head()

Unnamed: 0,User-ID,ISBN,Book-Rating,Age-Group,country
0,276762,3453092007,8,2,germany
1,159390,3404122879,10,3,germany
2,159390,1853260207,6,3,germany
3,159390,3442426677,7,3,germany
4,159390,3426771365,8,3,germany


In [4]:
# Rebuild the book-to-book cosine similarity matrix from the saved, scaled
# feature table.
# NOTE(review): this rebinds `movie_info`, which earlier cells used for the
# unscaled book table.

movie_info = pd.read_csv('data_new/movie_info_scaled.csv').set_index('ISBN')
isbn_labels = movie_info.index
movie_similarity = pd.DataFrame(
    cosine_similarity(movie_info, movie_info),  # (22089, 22089)
    index=isbn_labels,
    columns=isbn_labels,
)
movie_similarity.head()

ISBN,3453092007,3404122879,1853260207,3442426677,3426771365,3404117905,3442452643,3404283139,3404139895,3404920821,...,3777611484,3785708963,078949647X,1580621317,8420430773,8429729941,8432231096,8474105706,043913224X,1558584099
ISBN,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
3453092007,1.0,0.64696,0.754845,0.471185,0.362551,0.342001,0.283455,0.612814,0.65672,0.157504,...,0.232861,0.398515,0.274864,0.332305,0.570764,0.492353,0.414715,0.154091,0.578693,0.472573
3404122879,0.64696,1.0,0.824134,0.743267,0.708774,0.333795,0.867599,0.854322,0.367237,0.801135,...,-0.236655,0.408738,0.832566,-0.175495,0.703002,0.101731,0.053982,0.734576,0.959123,0.754996
1853260207,0.754845,0.824134,1.0,0.496594,0.73424,0.289332,0.657408,0.788086,0.417076,0.492762,...,-0.193487,0.599909,0.70056,-0.123456,0.92345,0.122314,0.067639,0.662771,0.843165,0.878046
3442426677,0.471185,0.743267,0.496594,1.0,0.676206,0.662615,0.603474,0.75024,0.555265,0.667576,...,0.063463,0.486521,0.45612,0.169216,0.271459,0.379804,0.29724,0.38279,0.763988,0.480256
3426771365,0.362551,0.708774,0.73424,0.676206,1.0,0.381203,0.715912,0.829421,0.386788,0.688763,...,-0.285613,0.856811,0.646987,-0.167532,0.67754,0.053583,-0.038607,0.701196,0.794447,0.837117


In [33]:
# Hold out 30% of the rating rows for evaluation (fixed seed for repeatability).
# `x` is the full ratings frame, so `train`/`test` still carry 'Book-Rating'.

x, y = ratings.copy(), ratings['Book-Rating']
train, test, y_train, y_test = train_test_split(x, y, test_size=0.3, random_state=42)
tuple(len(part) for part in (train, test, y_train, y_test))

(24704, 10588, 24704, 10588)

In [34]:
# Overall mean rating over the training split; used as the fallback prediction.

total_mean = train['Book-Rating'].mean()
print(total_mean)

7.732431994818653


In [35]:
def RMSE(y_true, y_pred):
    """Return the root-mean-square error between two equally shaped sequences.

    Both arguments are converted to NumPy arrays, so lists, Series and arrays
    are all accepted.
    """
    residuals = np.array(y_true) - np.array(y_pred)
    mean_squared_error = np.mean(residuals ** 2)
    return np.sqrt(mean_squared_error)

def baseline(user_id, ISBN):
    """Predict the global mean training rating for any (user, book) pair.

    Both arguments are ignored; they exist so the function matches the
    ``model(user_id, ISBN)`` call made by ``score``.
    """
    return total_mean

def score(model):
    """Return the RMSE of ``model`` over the held-out ``test`` ratings.

    ``model`` is called as ``model(user_id, ISBN)`` once per test row and must
    return a numeric rating prediction.
    """
    predictions = [
        model(user_id, book)
        for user_id, book in zip(test['User-ID'], test['ISBN'])
    ]
    return RMSE(np.array(test['Book-Rating']), np.array(predictions))

baseline_rmse = score(baseline)
print(f"Baseline Model Score: {baseline_rmse}")

# User x ISBN matrix of training ratings; pairs without a rating are NaN.
rating_matrix = train.pivot(index='User-ID', columns='ISBN', values='Book-Rating')
rating_matrix.head()

Baseline Model Score: 1.8323849921826738


ISBN,000104799X,000160418X,000222674X,000225056X,000225414X,000225946X,000412913X,000458726X,000599134X,000617616X,...,B00009NDAN,B0000AA9JB,B0000C2W5U,B0000C7BNG,B0000DAPP1,B0000T6KHI,B0000T6KIM,B0000VZEH8,B0001GMSV2,B000234N3A
User-ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
183,,,,,,,,,,,...,,,,,,,,,,
242,,,,,,,,,,,...,,,,,,,,,,
254,,,,,,,,,,,...,,,,,,,,,,
388,,,,,,,,,,,...,,,,,,,,,,
392,,,,,,,,,,,...,,,,,,,,,,


In [36]:
# Load the user table keyed by User-ID and add each user's decade age bucket
# (e.g. 27.0 -> 2), computed with floor division on the whole column.
users = pd.read_csv('data_new/users_with_country.csv').set_index('User-ID')

age_group = (users['Age'] // 10).astype(int).tolist()
users['Age-Group'] = age_group
users.head()

Unnamed: 0_level_0,Location,Age,Country,Age-Group
User-ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
183,"porto, porto, portugal",27.0,portugal,2
242,"neuffen, baden-wuerttemberg, germany",37.0,germany,3
254,"minneapolis, minnesota, usa",24.0,usa,2
388,"nashville, tennessee, usa",35.0,usa,3
392,"barcelona, catalunya, spain",44.0,spain,4


In [11]:
# Show which users rated ISBN 1577193563 in the training split.
rating_matrix['1577193563'].dropna()

User-ID
81682    9.0
Name: 1577193563, dtype: float64

In [169]:
# Mean training rating per (ISBN, country). Rows with a missing country are
# left out of the groups, so an ISBN can be present in `rating_matrix` and
# still be absent here.
country_mean = train.groupby(['ISBN', 'country'])['Book-Rating'].mean()
probe_isbn = '1577193563'
probe_isbn in country_mean, probe_isbn in rating_matrix

(False, True)

In [39]:
# Mean rating per (ISBN, user country): for each book, the average rating given
# by training users from each country.
country_mean = train.groupby(['ISBN', 'country'])['Book-Rating'].mean()

def cf_country(user_id, ISBN):
    """Predict a rating from the mean given to this book by the user's country.

    Returns ``total_mean`` when the book is not in the training data or when no
    training user from the same country rated it.
    """
    if ISBN not in rating_matrix:  # book never seen in the training split
        return total_mean

    country = users.loc[user_id, 'Country']  # the user's country
    if ISBN in country_mean and country in country_mean[ISBN]:
        return country_mean[ISBN][country]
    return total_mean

print(f"cf_country Model Score: {score(cf_country)}")

cf_country Model Score: 1.9473792005988748


In [41]:
# Mean rating per country over all books; used as a fallback when no
# (ISBN, country) mean is available.
# NOTE(review): this cell redefines `cf_country` and relies on `country_mean`
# computed in an earlier cell.
country_total_mean = train.groupby('country')['Book-Rating'].mean()

def cf_country(user_id, ISBN):
    """Predict a rating from country-level means, most specific first.

    Order of preference: the (ISBN, country) mean, then the country's overall
    mean, then ``total_mean``.
    """
    country = users.loc[user_id, 'Country']  # the user's country

    has_book_country_mean = (
        ISBN in rating_matrix  # book appears in the training split
        and ISBN in country_mean
        and country in country_mean[ISBN]
    )
    if has_book_country_mean:
        return country_mean[ISBN][country]
    if country in country_total_mean:
        return country_total_mean[country]
    return total_mean

print(f"cf_country_plus_country_total_mean Model Score: {score(cf_country)}")

cf_country_plus_country_total_mean Model Score: 1.9343594610415693


In [42]:
# Inspect the (ISBN, country) -> mean rating Series.
country_mean

ISBN        country       
000104799X  canada             7.5
000160418X  england            7.0
000222674X  united kingdom     9.0
000225056X  usa               10.0
000225414X  united kingdom     7.0
                              ... 
B0000T6KHI  usa               10.0
B0000VZEH8  usa                9.0
B0001GMSV2  canada             8.0
            usa                8.0
B000234N3A  usa                9.0
Name: Book-Rating, Length: 18378, dtype: float64

In [43]:
# Mean rating per (ISBN, user age group): for each book, the average rating
# given by training users in each decade age bucket.
age_group_mean = train.groupby(['ISBN', 'Age-Group'])['Book-Rating'].mean()
age_group_mean.head()

ISBN        Age-Group
000104799X  5             8.0
            6             7.0
000160418X  6             7.0
000222674X  5             9.0
000225056X  2            10.0
Name: Book-Rating, dtype: float64

In [44]:
def cf_age(user_id, ISBN):
    """Predict a rating from the mean given to this book by the user's age group.

    Returns ``total_mean`` when the book is not in the training data, when it
    has no entry in ``age_group_mean``, or when no training user from the same
    age group rated it.
    """
    if ISBN not in rating_matrix:  # book never seen in the training split
        return total_mean

    age_group = users.loc[user_id, 'Age-Group']  # the user's decade age bucket

    # Same guard as cf_country: a book can be in `rating_matrix` yet have no
    # grouped mean, and an unguarded lookup would raise KeyError.
    if ISBN in age_group_mean:
        ratings_by_age_group = age_group_mean[ISBN]
        if age_group in ratings_by_age_group:
            return ratings_by_age_group[age_group]
    return total_mean

print(f"cf_age Model Score: {score(cf_age)}")

cf_age Model Score: 1.9196606982239313
