In [1]:
# !pip install catboost
# !pip install lightgbm

In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import re

In [3]:
books = pd.read_csv('../data/books.csv')
users = pd.read_csv('../data/users.csv')
train_ratings = pd.read_csv('../data/train_ratings.csv')
test_ratings = pd.read_csv('../data/test_ratings.csv')

In [4]:
books.head()

Unnamed: 0,isbn,book_title,book_author,year_of_publication,publisher,img_url,language,category,summary,img_path
0,2005018,Clara Callan,Richard Bruce Wright,2001.0,HarperFlamingo Canada,http://images.amazon.com/images/P/0002005018.0...,en,['Actresses'],"In a small town in Canada, Clara Callan reluct...",images/0002005018.01.THUMBZZZ.jpg
1,60973129,Decision in Normandy,Carlo D'Este,1991.0,HarperPerennial,http://images.amazon.com/images/P/0060973129.0...,en,['1940-1949'],"Here, for the first time in paperback, is an o...",images/0060973129.01.THUMBZZZ.jpg
2,374157065,Flu: The Story of the Great Influenza Pandemic...,Gina Bari Kolata,1999.0,Farrar Straus Giroux,http://images.amazon.com/images/P/0374157065.0...,en,['Medical'],"Describes the great flu epidemic of 1918, an o...",images/0374157065.01.THUMBZZZ.jpg
3,399135782,The Kitchen God's Wife,Amy Tan,1991.0,Putnam Pub Group,http://images.amazon.com/images/P/0399135782.0...,en,['Fiction'],A Chinese immigrant who is convinced she is dy...,images/0399135782.01.THUMBZZZ.jpg
4,425176428,What If?: The World's Foremost Military Histor...,Robert Cowley,2000.0,Berkley Publishing Group,http://images.amazon.com/images/P/0425176428.0...,en,['History'],"Essays by respected military historians, inclu...",images/0425176428.01.THUMBZZZ.jpg


### users - preprocessingtrain_ratings

In [5]:
users.head()

Unnamed: 0,user_id,location,age
0,8,"timmins, ontario, canada",
1,11400,"ottawa, ontario, canada",49.0
2,11676,"n/a, n/a, n/a",
3,67544,"toronto, ontario, canada",30.0
4,85526,"victoria, british columbia, canada",36.0


In [6]:
def age_map(x: int) -> int:
    x = int(x)
    if x < 20:
        return 1
    elif x >= 20 and x < 30:
        return 2
    elif x >= 30 and x < 40:
        return 3
    elif x >= 40 and x < 50:
        return 4
    elif x >= 50 and x < 60:
        return 5
    else:
        return 6

In [7]:
users['age'] = users['age'].fillna(users['age'].mean())
users['age'] = users['age'].apply(age_map)
users['age'] = users['age'].astype('str')

In [8]:
users.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 68092 entries, 0 to 68091
Data columns (total 3 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   user_id   68092 non-null  int64 
 1   location  68092 non-null  object
 2   age       68092 non-null  object
dtypes: int64(1), object(2)
memory usage: 1.6+ MB


In [9]:
users['location'] = users['location'].str.replace(r'[^0-9a-zA-Z:,]', '', regex=True)
users['location_city'] = users['location'].apply(lambda x: x.split(',')[0])
users['location_state'] = users['location'].apply(lambda x: x.split(',')[1])
users['location_country'] = users['location'].apply(lambda x: x.split(',')[2])
users = users.drop(['location'], axis=1)

In [10]:
loc_city2idx = {v:k for k,v in enumerate(users['location_city'].unique())}
loc_state2idx = {v:k for k,v in enumerate(users['location_state'].unique())}
loc_country2idx = {v:k for k,v in enumerate(users['location_country'].unique())}
users['location_city'] = users['location_city'].map(loc_city2idx)
users['location_state'] = users['location_state'].map(loc_state2idx)
users['location_country'] = users['location_country'].map(loc_country2idx)

In [11]:
# na -> NULL
users = users.replace('na', np.nan)
# , , , -> NULL
users = users.replace('', np.nan)

In [12]:
users.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 68092 entries, 0 to 68091
Data columns (total 5 columns):
 #   Column            Non-Null Count  Dtype 
---  ------            --------------  ----- 
 0   user_id           68092 non-null  int64 
 1   age               68092 non-null  object
 2   location_city     68092 non-null  int64 
 3   location_state    68092 non-null  int64 
 4   location_country  68092 non-null  int64 
dtypes: int64(4), object(1)
memory usage: 2.6+ MB


In [13]:
users.head()

Unnamed: 0,user_id,age,location_city,location_state,location_country
0,8,3,0,0,0
1,11400,4,1,0,0
2,11676,3,2,1,1
3,67544,3,3,0,0
4,85526,3,4,2,0


### books - preprocessing

In [14]:
books.head()

Unnamed: 0,isbn,book_title,book_author,year_of_publication,publisher,img_url,language,category,summary,img_path
0,2005018,Clara Callan,Richard Bruce Wright,2001.0,HarperFlamingo Canada,http://images.amazon.com/images/P/0002005018.0...,en,['Actresses'],"In a small town in Canada, Clara Callan reluct...",images/0002005018.01.THUMBZZZ.jpg
1,60973129,Decision in Normandy,Carlo D'Este,1991.0,HarperPerennial,http://images.amazon.com/images/P/0060973129.0...,en,['1940-1949'],"Here, for the first time in paperback, is an o...",images/0060973129.01.THUMBZZZ.jpg
2,374157065,Flu: The Story of the Great Influenza Pandemic...,Gina Bari Kolata,1999.0,Farrar Straus Giroux,http://images.amazon.com/images/P/0374157065.0...,en,['Medical'],"Describes the great flu epidemic of 1918, an o...",images/0374157065.01.THUMBZZZ.jpg
3,399135782,The Kitchen God's Wife,Amy Tan,1991.0,Putnam Pub Group,http://images.amazon.com/images/P/0399135782.0...,en,['Fiction'],A Chinese immigrant who is convinced she is dy...,images/0399135782.01.THUMBZZZ.jpg
4,425176428,What If?: The World's Foremost Military Histor...,Robert Cowley,2000.0,Berkley Publishing Group,http://images.amazon.com/images/P/0425176428.0...,en,['History'],"Essays by respected military historians, inclu...",images/0425176428.01.THUMBZZZ.jpg


In [15]:
books.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 149570 entries, 0 to 149569
Data columns (total 10 columns):
 #   Column               Non-Null Count   Dtype  
---  ------               --------------   -----  
 0   isbn                 149570 non-null  object 
 1   book_title           149570 non-null  object 
 2   book_author          149569 non-null  object 
 3   year_of_publication  149570 non-null  float64
 4   publisher            149570 non-null  object 
 5   img_url              149570 non-null  object 
 6   language             82343 non-null   object 
 7   category             80719 non-null   object 
 8   summary              82343 non-null   object 
 9   img_path             149570 non-null  object 
dtypes: float64(1), object(9)
memory usage: 11.4+ MB


In [16]:
# category 대괄호 제거
books.loc[books[books['category'].notnull()].index, 'category'] = books[books['category'].notnull()]['category'].apply(lambda x: re.sub('[\W_]+',' ',x).strip())

In [17]:
# 카테고리 수 줄이기
categories = ['garden','crafts','physics','adventure','music','fiction','nonfiction','science','science fiction','social','homicide',
 'sociology','disease','religion','christian','philosophy','psycholog','mathemat','agricult','environmental',
 'business','poetry','drama','literary','travel','motion picture','children','cook','literature','electronic',
 'humor','animal','bird','photograph','computer','house','ecology','family','architect','camp','criminal','language','india']

for category in categories:
    books.loc[books[books['category'].str.contains(category,na=False)].index,'category_high'] = category

In [18]:
category_high_df = pd.DataFrame(books['category_high'].value_counts()).reset_index()
category_high_df.columns = ['category','count']

In [19]:
# 5개 이하인 항목은 others로 묶어주도록 하겠습니다.
others_list = category_high_df[category_high_df['count']<5]['category'].values
books.loc[books[books['category_high'].isin(others_list)].index, 'category_high']='others'

In [20]:
books.head()

Unnamed: 0,isbn,book_title,book_author,year_of_publication,publisher,img_url,language,category,summary,img_path,category_high
0,2005018,Clara Callan,Richard Bruce Wright,2001.0,HarperFlamingo Canada,http://images.amazon.com/images/P/0002005018.0...,en,Actresses,"In a small town in Canada, Clara Callan reluct...",images/0002005018.01.THUMBZZZ.jpg,
1,60973129,Decision in Normandy,Carlo D'Este,1991.0,HarperPerennial,http://images.amazon.com/images/P/0060973129.0...,en,1940 1949,"Here, for the first time in paperback, is an o...",images/0060973129.01.THUMBZZZ.jpg,
2,374157065,Flu: The Story of the Great Influenza Pandemic...,Gina Bari Kolata,1999.0,Farrar Straus Giroux,http://images.amazon.com/images/P/0374157065.0...,en,Medical,"Describes the great flu epidemic of 1918, an o...",images/0374157065.01.THUMBZZZ.jpg,
3,399135782,The Kitchen God's Wife,Amy Tan,1991.0,Putnam Pub Group,http://images.amazon.com/images/P/0399135782.0...,en,Fiction,A Chinese immigrant who is convinced she is dy...,images/0399135782.01.THUMBZZZ.jpg,
4,425176428,What If?: The World's Foremost Military Histor...,Robert Cowley,2000.0,Berkley Publishing Group,http://images.amazon.com/images/P/0425176428.0...,en,History,"Essays by respected military historians, inclu...",images/0425176428.01.THUMBZZZ.jpg,


In [21]:
loc_title2idx = {v:k for k,v in enumerate(books['book_title'].unique())}
loc_author2idx = {v:k for k,v in enumerate(books['book_author'].unique())}
loc_publisher2idx = {v:k for k,v in enumerate(books['publisher'].unique())}
loc_language2idx = {v:k for k,v in enumerate(books['language'].unique())}
loc_category2idx = {v:k for k,v in enumerate(books['category'].unique())}
loc_category_high2idx = {v:k for k,v in enumerate(books['category_high'].unique())}

books['book_title'] = books['book_title'].map(loc_title2idx)
books['book_author'] = books['book_author'].map(loc_author2idx)
books['publisher'] = books['publisher'].map(loc_publisher2idx)
books['language'] = books['language'].map(loc_language2idx)
books['category'] = books['category'].map(loc_category2idx)
books['category_high'] = books['category_high'].map(loc_category_high2idx)

### Merge

In [22]:
users.head()

Unnamed: 0,user_id,age,location_city,location_state,location_country
0,8,3,0,0,0
1,11400,4,1,0,0
2,11676,3,2,1,1
3,67544,3,3,0,0
4,85526,3,4,2,0


In [23]:
books.head()

Unnamed: 0,isbn,book_title,book_author,year_of_publication,publisher,img_url,language,category,summary,img_path,category_high
0,2005018,0,0,2001.0,0,http://images.amazon.com/images/P/0002005018.0...,0,0,"In a small town in Canada, Clara Callan reluct...",images/0002005018.01.THUMBZZZ.jpg,0
1,60973129,1,1,1991.0,1,http://images.amazon.com/images/P/0060973129.0...,0,1,"Here, for the first time in paperback, is an o...",images/0060973129.01.THUMBZZZ.jpg,0
2,374157065,2,2,1999.0,2,http://images.amazon.com/images/P/0374157065.0...,0,2,"Describes the great flu epidemic of 1918, an o...",images/0374157065.01.THUMBZZZ.jpg,0
3,399135782,3,3,1991.0,3,http://images.amazon.com/images/P/0399135782.0...,0,3,A Chinese immigrant who is convinced she is dy...,images/0399135782.01.THUMBZZZ.jpg,0
4,425176428,4,4,2000.0,4,http://images.amazon.com/images/P/0425176428.0...,0,4,"Essays by respected military historians, inclu...",images/0425176428.01.THUMBZZZ.jpg,0


In [24]:
books_and_ratings = pd.merge(left = books , right = train_ratings, how = 'right', on = 'isbn')
train = pd.merge(left = users , right = books_and_ratings, how = 'right', on = 'user_id')

In [25]:
books_and_ratings_test = pd.merge(left = books , right = test_ratings, how = 'right', on = 'isbn')
test = pd.merge(left = users , right = books_and_ratings_test, how = 'right', on = 'user_id')

In [26]:
train_ratings.shape

(306795, 3)

In [27]:
print(train.shape)
print(test.shape)

(306795, 17)
(76699, 17)


In [28]:
train = train.set_index('user_id')
test = test.set_index('user_id')

In [45]:
test.head(10)

Unnamed: 0_level_0,age,location_city,location_state,location_country,isbn,book_title,book_author,year_of_publication,publisher,img_url,language,category,summary,img_path,category_high,rating
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1
11676,3,2,1,1,2005018,0,0,2001.0,0,http://images.amazon.com/images/P/0002005018.0...,0,0,"In a small town in Canada, Clara Callan reluct...",images/0002005018.01.THUMBZZZ.jpg,0,0
116866,3,1,3,2,2005018,0,0,2001.0,0,http://images.amazon.com/images/P/0002005018.0...,0,0,"In a small town in Canada, Clara Callan reluct...",images/0002005018.01.THUMBZZZ.jpg,0,0
152827,4,1,0,0,60973129,1,1,1991.0,1,http://images.amazon.com/images/P/0060973129.0...,0,1,"Here, for the first time in paperback, is an o...",images/0060973129.01.THUMBZZZ.jpg,0,0
157969,3,15,11,3,374157065,2,2,1999.0,2,http://images.amazon.com/images/P/0374157065.0...,0,2,"Describes the great flu epidemic of 1918, an o...",images/0374157065.01.THUMBZZZ.jpg,0,0
67958,3,25,19,3,399135782,3,3,1991.0,3,http://images.amazon.com/images/P/0399135782.0...,0,3,A Chinese immigrant who is convinced she is dy...,images/0399135782.01.THUMBZZZ.jpg,0,0
213191,3,36,9,3,399135782,3,3,1991.0,3,http://images.amazon.com/images/P/0399135782.0...,0,3,A Chinese immigrant who is convinced she is dy...,images/0399135782.01.THUMBZZZ.jpg,0,0
245827,3,43,3,2,399135782,3,3,1991.0,3,http://images.amazon.com/images/P/0399135782.0...,0,3,A Chinese immigrant who is convinced she is dy...,images/0399135782.01.THUMBZZZ.jpg,0,0
11676,3,2,1,1,671870432,5,5,1993.0,5,http://images.amazon.com/images/P/0671870432.0...,1,5,,images/0671870432.01.THUMBZZZ.jpg,0,0
263256,3,20,15,3,1558746218,10,10,1998.0,10,http://images.amazon.com/images/P/1558746218.0...,1,5,,images/1558746218.01.THUMBZZZ.jpg,0,0
226745,3,60,3,2,1881320189,13,13,1994.0,13,http://images.amazon.com/images/P/1881320189.0...,0,3,A beautifully narrated novel of time and place...,images/1881320189.01.THUMBZZZ.jpg,0,0


In [29]:
category_cols = ['location_city', 'location_state', 'location_country', 'book_title', 'book_author', 'publisher', 'language', 'category', 'category_high']
# location_cols = ['location_city', 'location_state', 'location_country']
for col in category_cols:
    train[col] = train[col].astype('str')
    test[col] = test[col].astype('str')

In [30]:
train

Unnamed: 0_level_0,age,location_city,location_state,location_country,isbn,book_title,book_author,year_of_publication,publisher,img_url,language,category,summary,img_path,category_high,rating
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1
8,3,0,0,0,0002005018,0,0,2001.0,0,http://images.amazon.com/images/P/0002005018.0...,0,0,"In a small town in Canada, Clara Callan reluct...",images/0002005018.01.THUMBZZZ.jpg,0,4
67544,3,3,0,0,0002005018,0,0,2001.0,0,http://images.amazon.com/images/P/0002005018.0...,0,0,"In a small town in Canada, Clara Callan reluct...",images/0002005018.01.THUMBZZZ.jpg,0,7
123629,3,5,0,0,0002005018,0,0,2001.0,0,http://images.amazon.com/images/P/0002005018.0...,0,0,"In a small town in Canada, Clara Callan reluct...",images/0002005018.01.THUMBZZZ.jpg,0,8
200273,3,6,0,0,0002005018,0,0,2001.0,0,http://images.amazon.com/images/P/0002005018.0...,0,0,"In a small town in Canada, Clara Callan reluct...",images/0002005018.01.THUMBZZZ.jpg,0,8
210926,3,7,0,0,0002005018,0,0,2001.0,0,http://images.amazon.com/images/P/0002005018.0...,0,0,"In a small town in Canada, Clara Callan reluct...",images/0002005018.01.THUMBZZZ.jpg,0,9
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
278843,2,2220,9,3,0743525493,135431,1323,2002.0,2340,http://images.amazon.com/images/P/0743525493.0...,1,5,,images/0743525493.01.THUMBZZZ.jpg,0,7
278851,3,33,17,3,067161746X,109970,72,1987.0,228,http://images.amazon.com/images/P/067161746X.0...,0,7,A tongue-in-cheek survival guide for single pe...,images/067161746X.01.THUMBZZZ.jpg,0,6
278851,3,33,17,3,0884159221,135433,62056,1985.0,11569,http://images.amazon.com/images/P/0884159221.0...,1,5,,images/0884159221.01.THUMBZZZ.jpg,0,7
278851,3,33,17,3,0912333022,135434,62057,1997.0,7026,http://images.amazon.com/images/P/0912333022.0...,0,3,These hilarious stories by the creator of publ...,images/0912333022.01.THUMBZZZ.jpg,0,7


In [31]:
train_preprocessed = train.drop(['img_url', 'summary', 'img_path', 'isbn'], axis=1)

In [32]:
test_preprocessed = test.drop(['img_url', 'summary', 'img_path', 'isbn'], axis=1)

### Split / Modeling

In [33]:
from sklearn.model_selection import train_test_split
from catboost import CatBoostRegressor
# from lightgbm import LGBMRegressor

In [48]:
X_train, X_val, y_train, y_val = train_test_split(train_preprocessed.drop(['rating'], axis=1), train_preprocessed['rating'], test_size = 0.2, random_state = 42)

In [49]:
cbr = CatBoostRegressor()
# lgbm = LGBMRegressor()

In [50]:
cbr.fit(X_train, y_train)
# lgbm.fit(X_train, y_train)

# cbr.fit(train_preprocessed.drop(['rating'], axis=1), train_preprocessed['rating'])

Learning rate set to 0.097676
0:	learn: 2.4294786	total: 14.6ms	remaining: 14.6s
1:	learn: 2.4260555	total: 25.9ms	remaining: 12.9s
2:	learn: 2.4232633	total: 36.8ms	remaining: 12.2s
3:	learn: 2.4210133	total: 49.2ms	remaining: 12.2s
4:	learn: 2.4189451	total: 60.5ms	remaining: 12s
5:	learn: 2.4169500	total: 72.1ms	remaining: 11.9s
6:	learn: 2.4154660	total: 83.1ms	remaining: 11.8s
7:	learn: 2.4126360	total: 93.7ms	remaining: 11.6s
8:	learn: 2.4112312	total: 103ms	remaining: 11.4s
9:	learn: 2.4095387	total: 114ms	remaining: 11.2s
10:	learn: 2.4081201	total: 124ms	remaining: 11.1s
11:	learn: 2.4067973	total: 134ms	remaining: 11.1s
12:	learn: 2.4057250	total: 145ms	remaining: 11s
13:	learn: 2.4046577	total: 155ms	remaining: 11s
14:	learn: 2.4036092	total: 167ms	remaining: 10.9s
15:	learn: 2.4014626	total: 178ms	remaining: 11s
16:	learn: 2.3999955	total: 188ms	remaining: 10.9s
17:	learn: 2.3990415	total: 198ms	remaining: 10.8s
18:	learn: 2.3981838	total: 208ms	remaining: 10.8s
19:	learn: 

<catboost.core.CatBoostRegressor at 0x7f9119e6eaf0>

In [51]:
y_pred = cbr.predict(X_val)

# y_pred = cbr.predict(train_preprocessed.drop(['rating'], axis=1)) 

In [54]:
from sklearn.metrics import mean_squared_error
mean_squared_error(y_pred, y_val, squared=False)

2.3098306153810118

In [60]:
cbr.fit(train_preprocessed.drop(['rating'], axis=1), train_preprocessed['rating'])

Learning rate set to 0.101182
0:	learn: 2.4292655	total: 19.3ms	remaining: 19.3s
1:	learn: 2.4257287	total: 32.5ms	remaining: 16.2s
2:	learn: 2.4229911	total: 45.2ms	remaining: 15s
3:	learn: 2.4204890	total: 58.5ms	remaining: 14.6s
4:	learn: 2.4184708	total: 71.8ms	remaining: 14.3s
5:	learn: 2.4166395	total: 85.7ms	remaining: 14.2s
6:	learn: 2.4141691	total: 98.7ms	remaining: 14s
7:	learn: 2.4122472	total: 113ms	remaining: 14s
8:	learn: 2.4103599	total: 127ms	remaining: 14s
9:	learn: 2.4085823	total: 139ms	remaining: 13.8s
10:	learn: 2.4071726	total: 151ms	remaining: 13.6s
11:	learn: 2.4058121	total: 162ms	remaining: 13.4s
12:	learn: 2.4039141	total: 175ms	remaining: 13.3s
13:	learn: 2.4021798	total: 186ms	remaining: 13.1s
14:	learn: 2.4011096	total: 197ms	remaining: 12.9s
15:	learn: 2.3998392	total: 209ms	remaining: 12.9s
16:	learn: 2.3983907	total: 221ms	remaining: 12.8s
17:	learn: 2.3975450	total: 232ms	remaining: 12.7s
18:	learn: 2.3963343	total: 243ms	remaining: 12.5s
19:	learn: 2

<catboost.core.CatBoostRegressor at 0x7f9119e6eaf0>

In [64]:
y_pred = cbr.predict(test_preprocessed.drop(['rating'], axis=1)) 

In [65]:
submit = pd.DataFrame({'user_id': np.array(test.index), 'isbn': test['isbn'], 'rating': y_pred })
submit.head()

Unnamed: 0_level_0,user_id,isbn,rating
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
11676,11676,2005018,7.446558
116866,116866,2005018,6.109824
152827,152827,60973129,7.607782
157969,157969,374157065,7.856301
67958,67958,399135782,7.598187


In [68]:
submit.to_csv('submission_04121052_catboost.csv', index=False)