## Load Data

In [7]:
import os
import pandas as pd
import numpy as np
from datetime import datetime

In [15]:
root_path = os.path.join(os.getcwd(), 'data/')

raw_users = pd.read_csv(root_path + 'users.csv')
raw_books = pd.read_csv(root_path + 'books.csv')
raw_ratings = pd.read_csv(root_path + 'train_ratings.csv')

train_ratings = pd.read_csv('./data/train_ratings.csv')
test_ratings = pd.read_csv('./data/test_ratings.csv')

## users 전처리 함수
- location
    - 소문자 변환, 공백 및 특수문자 삭제
    - , 기준으로 앞에서부터 city, state, country 분리
    - state 결측치에 city 기준으로 최빈 state 대체
    - country 결측치에 city 기준으로 최빈 country 대체
    - 미국이면 state 사용, 그외는 country 사용
    - 결측치는 'na' 입력
- age
    - 결측치는 0 입력
    - 결측치, 20세 미만부터 70세 미만, 70세 이상 100세 미만 총 8개 구간으로 범주화
- (선택) 데이터가 적은 class는 others로 통합
    - location
    - 시간이 오래 걸림

In [16]:
def users_preprocess(raw_users) :
    users = raw_users.copy()
    
    # location
    users['location'] = users['location'].str.lower().replace('[^0-9a-zA-Z:,]', '', regex=True)
    users['city'] = users['location'].apply(lambda x: x.split(',')[-3].strip())
    users['state'] = users['location'].apply(lambda x: x.split(',')[-2].strip())
    users['country'] = users['location'].apply(lambda x: x.split(',')[-1].strip())
    users = users.replace('na', np.nan)
    users = users.replace('', np.nan)
    users.drop(columns=['location'], inplace=True)
    
    city_state_map = dict(users.groupby('city')['state']
                          .value_counts().sort_values().index.tolist())
    city_country_map = dict(users.groupby('city')['country']
                            .value_counts().sort_values().index.tolist())
    users['state'] = users['city'].map(city_state_map)
    users['country'] = users['city'].map(city_country_map)
    
    # users['location'] = users['country'].copy()
    # users['location'] = np.where(users['location']=='usa',
    #                          users['state'],
    #                          users['location'])
    # users['location'].fillna('na', inplace=True)
    
    users['country'].fillna('na', inplace=True)
    users['state'].fillna('na', inplace=True)
    users['city'].fillna('na', inplace=True)
    
    # age
    users['age'].fillna(0, inplace=True)
    bins = [0, 1, 20, 30, 40, 50, 60, 70, 100]
    users['age_bin'] = pd.cut(x=users['age'], bins=bins, right=False, labels=range(8))
#########
    # 선택
    # location_cnt = users['location'].value_counts()
    # low_cnt_location = location_cnt[location_cnt < 10].index.tolist()
    # for location in low_cnt_location :
    #     users['location'] = np.where(users['location']==location,
    #                                  'others', users['location'])
##########
    users.drop(columns=['age'], inplace=True)
    
    return users

In [17]:
users = users_preprocess(raw_users)

In [18]:
users

Unnamed: 0,user_id,city,state,country,age_bin
0,8,timmins,ontario,canada,0
1,11400,ottawa,ontario,canada,4
2,11676,na,na,na,0
3,67544,toronto,ontario,canada,3
4,85526,victoria,britishcolumbia,canada,3
...,...,...,...,...,...
68087,278376,danville,california,usa,5
68088,278621,victoria,britishcolumbia,canada,7
68089,278636,irvington,newyork,usa,0
68090,278659,vancouver,britishcolumbia,canada,3


## books 전처리 함수
- isbn 
    - img_url 에서 파싱해서 대체
    - books, ratings 같이 처리
- book_author
    - 소문자 변환, 공백 및 특수문자 삭제
- year_of_publication
    - 1950년 이전 ~ 2010년 이전까지 7개 구간으로 범주화
- publisher
    - 소문자 변환, 공백 및 특수문자 삭제
- category -> major_cat
    - 소문자 변환, 공백 및 특수문자 삭제
    - 동일 저자에 대해 최빈 카테고리로 통일
        - ex) 저자가 stephenking인 모든 책의 카테고리는 fiction
    - 남은 결측치에 대해서 출판사의 최빈 카테고리 적용 (통일은 X)
    - 그래도 남은 결측치에는 'na'
    - 최종적으로 같은 카테고리로 분류 가능한 경우를 묶어서 major_cat으로 정리
- summary
    - 있으면 1, 없으면 0으로 변환
- isbn_area
    - 참고 : https://en.wikipedia.org/wiki/List_of_ISBN_registration_groups
    - isbn의 접두사 부분을 파싱해 책이 출판된 국가/지역/언어권 구분
- book_title, img_url, img_path, language, category
    - 삭제
- (선택) 데이터가 적은 class는 others로 통합
    - author, publisher, major_cat, isbn_area
    - 시간이 오래 걸림

In [20]:
def isbn_area(isbn) :
    if isbn[0] in ('0', '1') :
        return '1'
    if isbn[0] in ('2', '3', '4', '5', '7') :
        return isbn[0]
    # 6으로 시작하는 경우 없음
    if isbn[0] == '8' :
        return isbn[:2]
    if isbn[0] == '9' :
        if int(isbn[:2]) < 95 :
            return isbn[:2]
        if int(isbn[:2]) < 99 :
            return isbn[:3]
        else :
            return isbn[:4]
    else :
        return 'others'

def books_ratings_preprocess(raw_books, raw_ratings) :
    books = raw_books.copy()
    ratings = raw_ratings.merge(raw_books[['isbn', 'img_url']], how='left', on='isbn')
    
    # isbn
    ratings['isbn'] = ratings['img_url'].apply(lambda x: x.split('P/')[1][:10])
    books['isbn'] = books['img_url'].apply(lambda x: x.split('P/')[1][:10])
    
    # book_author
    books['book_author'] = books['book_author'].str\
                        .lower().replace('[^0-9a-zA-Z]', '', regex=True)
    
    # year_of_publication
    bins = [0, 1950, 1960, 1970, 1980, 1990, 2000, 2010]
    books['year_of_publication'] = pd.cut(x=books['year_of_publication'],
                                          bins=bins, right=False, labels=range(7))
    
    # publisher
    books['publisher'] = books['publisher'].str\
                        .lower().replace('[^0-9a-zA-Z]', '', regex=True)
    
    # category
    books['category'] = books['category'].str\
                        .lower().replace('[^0-9a-zA-Z]', '', regex=True)
    author_cat_map = dict(books.groupby('book_author')['category']
                      .value_counts().sort_values().index.tolist())
    books['category'] = books['book_author'].map(author_cat_map)
    publisher_cat_map = dict(books.groupby('publisher')['category']
                      .value_counts().sort_values().index.tolist())
    books['category'] = books['category'].fillna(
                        books['publisher'].map(publisher_cat_map))
    books['category'].fillna('na', inplace=True)
    major_cat = ['fiction', 'juvenilefiction', 'juvenilenonfiction', 'biography',
            'histor', 'religio', 'science', 'social', 'politic', 'humor',
            'spirit', 'business', 'cook', 'health', 'famil', 'computer',
            'travel', 'self', 'poet', 'language', 'art', 'language art',
            'literary', 'criticism', 'nature', 'philosoph', 'reference', 'drama',
            'sport', 'transportation', 'comic', 'craft', 'education', 'crime',
            'music', 'animal', 'garden', 'detective', 'house', 'tech', 'photograph',
            'adventure', 'game', 'architect', 'law', 'antique', 'friend',
            'sciencefiction', 'fantasy', 'mathematic', 'design', 'actor',
            'horror', 'adultery']
    books['major_cat'] = books['category'].copy()
    for category in major_cat :
        books['major_cat'] = np.where(books['category'].str.contains(category),
                                     category, books['major_cat'])
        
    # summary
    books['summary'] = np.where(books['summary'].notnull(), 1, 0)
    
    # isbn_area
    books['isbn_area'] = books['isbn'].apply(isbn_area)
    
#     # 선택
#     aut_cnt = books['book_author'].value_counts()
#     low_cnt_aut = aut_cnt[aut_cnt < 10].index.tolist()
#     for aut in low_cnt_aut :
#         books['book_author'] = np.where(books['book_author']==aut,
#                                      'others', books['book_author'])


#     # 선택
#     pub_cnt = books['publisher'].value_counts()
#     low_cnt_pub = pub_cnt[pub_cnt < 10].index.tolist()
#     for pub in low_cnt_pub :
#         books['publisher'] = np.where(books['publisher']==pub,
#                                      'others', books['publisher'])


#     # 선택
#     cat_cnt = books['major_cat'].value_counts()
#     low_cnt_cat = cat_cnt[cat_cnt < 10].index.tolist()
#     for cat in low_cnt_cat :
#         books['major_cat'] = np.where(books['major_cat']==cat,
#                                      'others', books['major_cat'])


#     # 선택
#     area_cnt = books['isbn_area'].value_counts()
#     low_cnt_area = area_cnt[area_cnt < 10].index.tolist()
#     for area in low_cnt_cat :
#         books['isbn_area'] = np.where(books['isbn_area']==area,
#                                      'others', books['isbn_area'])
    
    
    
    ratings.drop(columns=['img_url'], inplace=True)
    books.drop(columns=['book_title', 'img_url', 'language', 'category', 'img_path'],
               inplace=True)
    
    return books, ratings
    

In [21]:
books, ratings = books_ratings_preprocess(raw_books, raw_ratings)

In [93]:
# users.to_csv('minho_users.csv')
# books.to_csv('minho_books.csv')
# ratings.to_csv('minho_ratings.csv')

In [22]:
users.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 68092 entries, 0 to 68091
Data columns (total 5 columns):
 #   Column   Non-Null Count  Dtype   
---  ------   --------------  -----   
 0   user_id  68092 non-null  int64   
 1   city     68092 non-null  object  
 2   state    68092 non-null  object  
 3   country  68092 non-null  object  
 4   age_bin  68092 non-null  category
dtypes: category(1), int64(1), object(3)
memory usage: 2.1+ MB


In [23]:
books.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 149570 entries, 0 to 149569
Data columns (total 7 columns):
 #   Column               Non-Null Count   Dtype   
---  ------               --------------   -----   
 0   isbn                 149570 non-null  object  
 1   book_author          149570 non-null  object  
 2   year_of_publication  149570 non-null  category
 3   publisher            149570 non-null  object  
 4   summary              149570 non-null  int64   
 5   major_cat            149570 non-null  object  
 6   isbn_area            149570 non-null  object  
dtypes: category(1), int64(1), object(5)
memory usage: 7.0+ MB


In [71]:
ratings.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 306795 entries, 0 to 306794
Data columns (total 3 columns):
 #   Column   Non-Null Count   Dtype 
---  ------   --------------   ----- 
 0   user_id  306795 non-null  int64 
 1   isbn     306795 non-null  object
 2   rating   306795 non-null  int64 
dtypes: int64(2), object(1)
memory usage: 7.0+ MB


In [25]:
train_ratings = pd.read_csv('./data/train_ratings.csv')
test_ratings = pd.read_csv('./data/test_ratings.csv')
train_df = train_ratings.merge(users, on='user_id', how='left').merge(books[['isbn', 'book_author', 'year_of_publication', 'publisher', 'major_cat', 'isbn_area']], on='isbn', how='left')
test_df = test_ratings.merge(users, on='user_id', how='left').merge(books[['isbn', 'book_author', 'year_of_publication', 'publisher', 'major_cat', 'isbn_area']], on='isbn', how='left')

In [26]:
train_df.book_author.value_counts(dropna=False)

stephenking        3759
johngrisham        2937
noraroberts        2396
jamespatterson     1908
jkrowling          1416
                   ... 
hisakomatsubara       1
andygoldsworthy       1
mauricekrafft         1
alanrayburn           1
mapsco                1
Name: book_author, Length: 52185, dtype: int64

In [27]:
train_df['age_bin'] = train_df['age_bin'].astype('str')
train_df['year_of_publication'] = train_df['year_of_publication'].astype('str')

test_df['age_bin'] = test_df['age_bin'].astype('str')
test_df['year_of_publication'] = test_df['year_of_publication'].astype('str')

train_df['book_author'] = train_df['book_author'].fillna('stephenking')
test_df['book_author'] = test_df['book_author'].fillna('stephenking')

# # lanugage 전처리
# train_df['language'] = train_df['language'].fillna('en')
# test_df['language'] = test_df['language'].fillna('en')

In [32]:
from sklearn.model_selection import train_test_split
from catboost import CatBoostRegressor

X_train, X_val, y_train, y_val = train_test_split(train_df.drop(['user_id', 'isbn', 'rating'], axis=1), train_df['rating'], test_size = 0.2, random_state = 42)
# cbr = CatBoostRegressor(cat_features = list(X_train.drop(['age', 'year_of_publication'], axis=1).columns) )

In [33]:
cbr = CatBoostRegressor(cat_features = list(X_train.columns) )

In [34]:
cbr.fit(X_train, y_train, verbose=False)

  self._init_pool(data, label, cat_features, text_features, embedding_features, embedding_features_data, pairs, weight,


<catboost.core.CatBoostRegressor at 0x28fc36800>

In [35]:
from sklearn.metrics import mean_squared_error

y_pred = cbr.predict(X_val)
mean_squared_error(y_pred, y_val, squared=False)

2.193428330532013

In [37]:
import optuna
from optuna import Trial
from optuna.samplers import TPESampler

In [38]:
# random sampler
sampler = TPESampler(seed=42)

# define function
def objective(trial):
    # suggest_{type}: 범위 내의 {type}값 선택
    
    cbr_param = {
        'iterations':trial.suggest_int("iterations", 800, 1600),
        'od_wait':trial.suggest_int('od_wait', 10, 50),
        'learning_rate' : trial.suggest_float('learning_rate',0.005, 0.5),
        'reg_lambda': trial.suggest_float('reg_lambda',0.1 ,10),
        'subsample': trial.suggest_float('subsample',0,1),
        'random_strength': trial.suggest_float('random_strength',10,50),
        'depth': trial.suggest_int('depth',2, 10),
        'min_data_in_leaf': trial.suggest_int('min_data_in_leaf',1,5),
        'leaf_estimation_iterations': trial.suggest_int('leaf_estimation_iterations',1,15),
        # 'bagging_temperature' :trial.suggest_float('bagging_temperature', 0.01, 100.00), 
        # 'colsample_bylevel':trial.suggest_float('colsample_bylevel', 0.4, 1.0),
    }

    # Generate model
    cbr = CatBoostRegressor(**cbr_param, cat_features = list(X_train.columns))
    cbr = cbr.fit(X_train, y_train, eval_set=[(X_val, y_val)], 
                           verbose=0, early_stopping_rounds=25)
                           
    # 평가지표 원하는 평가 지표가 있을 시 바꾸어 준다.
    RMSE = mean_squared_error(y_val, cbr.predict(X_val), squared=False)
    return RMSE

optuna_cbr = optuna.create_study(direction='minimize', sampler=sampler)
optuna_cbr.optimize(objective, n_trials=50)

[32m[I 2023-04-19 17:56:37,858][0m A new study created in memory with name: no-name-825cd3fe-5cd0-474f-82d4-702789b0a421[0m
  self._init_pool(data, label, cat_features, text_features, embedding_features, embedding_features_data, pairs, weight,
[32m[I 2023-04-19 17:57:00,394][0m Trial 0 finished with value: 2.211641301604309 and parameters: {'iterations': 1100, 'od_wait': 48, 'learning_rate': 0.36733700119664553, 'reg_lambda': 6.026718993550663, 'subsample': 0.15601864044243652, 'random_strength': 16.239780813448107, 'depth': 2, 'min_data_in_leaf': 5, 'leaf_estimation_iterations': 10}. Best is trial 0 with value: 2.211641301604309.[0m
  self._init_pool(data, label, cat_features, text_features, embedding_features, embedding_features_data, pairs, weight,
[32m[I 2023-04-19 17:57:24,802][0m Trial 1 finished with value: 2.20210330875792 and parameters: {'iterations': 1367, 'od_wait': 10, 'learning_rate': 0.4851053768201872, 'reg_lambda': 8.341182143924176, 'subsample': 0.212339110678

In [39]:
cbr_trial = optuna_cbr.best_trial
cbr_trial_params = cbr_trial.params
print('Best Trial: score {},\nparams {}'.format(cbr_trial.value, cbr_trial_params))

Best Trial: score 2.1914280120892213,
params {'iterations': 1383, 'od_wait': 40, 'learning_rate': 0.048874458688324886, 'reg_lambda': 3.491598561612883, 'subsample': 0.8356954568350058, 'random_strength': 39.72019974335227, 'depth': 8, 'min_data_in_leaf': 1, 'leaf_estimation_iterations': 14}


In [40]:
cbr_trial_params

{'iterations': 1383,
 'od_wait': 40,
 'learning_rate': 0.048874458688324886,
 'reg_lambda': 3.491598561612883,
 'subsample': 0.8356954568350058,
 'random_strength': 39.72019974335227,
 'depth': 8,
 'min_data_in_leaf': 1,
 'leaf_estimation_iterations': 14}

In [41]:
# Modeling fit
cbr = CatBoostRegressor(**cbr_trial_params, cat_features = list(X_train.drop(['age_bin', 'year_of_publication'], axis=1).columns), verbose=False)
# cbr = CatBoostRegressor(**cbr_trial_params, verbose=False)
cbr_study = cbr.fit(X_train, y_train)

  self._init_pool(data, label, cat_features, text_features, embedding_features, embedding_features_data, pairs, weight,


In [42]:
y_pred = cbr_study.predict(test_df.drop(['user_id','isbn', 'rating'], axis=1)) 
submit = pd.DataFrame({'user_id': np.array(test_df.index), 'isbn': test_df['isbn'], 'rating': y_pred })
now = datetime.now()
submit.to_csv('./submit/submission_' + str(now) + '_catboost.csv', index=False)

  self._init_pool(data, label, cat_features, text_features, embedding_features, embedding_features_data, pairs, weight,


In [43]:
dict(zip(list(X_train.columns), cbr.feature_importances_))

{'city': 26.463785709040568,
 'state': 9.690552561850126,
 'country': 6.312690317444285,
 'age_bin': 9.860203796896554,
 'book_author': 15.784783256618164,
 'year_of_publication': 3.799621197071933,
 'publisher': 12.67359795453161,
 'major_cat': 9.553973542252189,
 'isbn_area': 5.860791664294598}