In [1]:
import pandas as pd
import catboost

In [2]:
# Load the training set and normalise the '  num_pages' column name
# (the key being renamed carries two leading spaces).
df = pd.read_csv(r'dataset/books_train.csv').rename(columns={'  num_pages': 'num_pages'})

In [3]:
# Preview the first five rows of the training frame.
df.head(n=5)

Unnamed: 0,bookID,title,authors,average_rating,isbn,isbn13,language_code,num_pages,ratings_count,text_reviews_count,publication_date,publisher
0,26237,Revolutionary Girl Utena Vol. 3: To Sprout,Chiho Saito/Be-Pas,4.05,1591162076,9781591162070,eng,200,1153,16,2/4/2004,VIZ Media LLC
1,33448,Positioning: The Battle for Your Mind,Al Ries/Jack Trout,4.04,71359168,9780071359160,en-US,246,126,9,1/18/2001,McGraw-Hill Education
2,13739,Twelve Fair Kingdoms,Suzette Haden Elgin,3.99,425058506,9780425058503,eng,195,141,10,3/1/1983,Berkley
3,2711,The Riverside Chaucer,Geoffrey Chaucer/Larry Dean Benson/F.N. Robinson,4.18,395290317,9780395290316,enm,1327,7760,152,12/12/1987,Houghton Mifflin
4,40540,PHP and MySQL Web Development (Developer's Lib...,Luke Welling/Laura Thomson,3.96,672326728,752063326725,en-US,1008,590,31,10/1/2004,Sams


In [4]:
from sklearn.model_selection import train_test_split

# Target to predict, and the feature matrix with the target plus the
# identifier/date columns removed.
target = df['average_rating']
features = df.drop(
    columns=['average_rating', 'bookID', 'isbn', 'isbn13', 'publication_date']
)

# 80% train / 20% hold-out, with a fixed seed so the split is reproducible.
X_train, X_test, Y_train, Y_test = train_test_split(
    features,
    target,
    train_size=0.8,
    random_state=42,
)

In [5]:
# Columns handed to CatBoost as categorical features (shared by both pools).
categorical_columns = ['language_code', 'title', 'authors', 'publisher']

train_dataset = catboost.Pool(data=X_train, label=Y_train, cat_features=categorical_columns)
test_dataset = catboost.Pool(data=X_test, label=Y_test, cat_features=categorical_columns)

The task is to rank books by rating, and the target (`average_rating`) is a continuous value, so a regression model is appropriate. I decided to use CatBoostRegressor because it can take categorical features as-is, without any manual encoding.

In [6]:
# Hyper-parameter grid: 3 depths x 3 learning rates x 5 iteration counts.
grid = {
    'depth': [6, 8, 10],
    'learning_rate': [0.01, 0.05, 0.1],
    'iterations': [30, 50, 100, 200, 250],
}

# Seeded regressor; verbose=1 prints the per-iteration training log.
cb = catboost.CatBoostRegressor(random_state=42, verbose=1)

# Run the search on the training pool. The returned dict (best 'params' and
# 'cv_results') is the cell's displayed output.
cb.grid_search(param_grid=grid, X=train_dataset)

0:	learn: 3.9087552	test: 3.9130152	best: 3.9130152 (0)	total: 54ms	remaining: 1.57s
1:	learn: 3.8700126	test: 3.8742992	best: 3.8742992 (1)	total: 55.6ms	remaining: 778ms
2:	learn: 3.8318804	test: 3.8361024	best: 3.8361024 (2)	total: 58.8ms	remaining: 529ms
3:	learn: 3.7941512	test: 3.7982803	best: 3.7982803 (3)	total: 60.6ms	remaining: 394ms
4:	learn: 3.7567977	test: 3.7608080	best: 3.7608080 (4)	total: 65ms	remaining: 325ms
5:	learn: 3.7197889	test: 3.7237775	best: 3.7237775 (5)	total: 66.7ms	remaining: 267ms
6:	learn: 3.6832639	test: 3.6871482	best: 3.6871482 (6)	total: 68.1ms	remaining: 224ms
7:	learn: 3.6470753	test: 3.6508723	best: 3.6508723 (7)	total: 70.7ms	remaining: 194ms
8:	learn: 3.6111132	test: 3.6148191	best: 3.6148191 (8)	total: 72.1ms	remaining: 168ms
9:	learn: 3.5753484	test: 3.5790787	best: 3.5790787 (9)	total: 73.2ms	remaining: 146ms
10:	learn: 3.5401514	test: 3.5438415	best: 3.5438415 (10)	total: 80.2ms	remaining: 139ms
11:	learn: 3.5051683	test: 3.5088578	best: 3.

{'params': {'depth': 6, 'iterations': 200, 'learning_rate': 0.1},
 'cv_results': defaultdict(list,
             {'iterations': [0,
               1,
               2,
               3,
               4,
               5,
               6,
               7,
               8,
               9,
               10,
               11,
               12,
               13,
               14,
               15,
               16,
               17,
               18,
               19,
               20,
               21,
               22,
               23,
               24,
               25,
               26,
               27,
               28,
               29,
               30,
               31,
               32,
               33,
               34,
               35,
               36,
               37,
               38,
               39,
               40,
               41,
               42,
               43,
               44,
               45,
               46,
    

In [7]:
from sklearn.metrics import mean_squared_error

# Mean squared error of the tuned model on the hold-out split.
# scikit-learn metrics expect (y_true, y_pred); the arguments are passed by
# keyword so the order stays correct if the metric is later swapped for an
# asymmetric one (MSE itself is symmetric, so the value is unchanged).
mean_squared_error(y_true=Y_test.values, y_pred=cb.predict(X_test))

0.09019046748413204

The MSE is not very good, but other students got about the same result, so the features in this dataset are probably only weakly related to the target.

In [8]:
# Load the test set and apply the same '  num_pages' column-name fix
# that is applied to the training data.
test_df = pd.read_csv('dataset/books_test.csv').rename(columns={'  num_pages': 'num_pages'})
test_df.head()

Unnamed: 0,bookID,title,authors,isbn,isbn13,language_code,num_pages,ratings_count,text_reviews_count,publication_date,publisher
0,2538,El hombre duplicado,José Saramago/Pilar del Río,8466312803,9788466312806,spa,380,1295,106,9/1/2004,Punto de Lectura
1,31912,Buffy the Vampire Slayer and Philosophy: Fear ...,James B. South/William Irwin,812695313,9780812695311,eng,335,2519,85,3/13/2003,Open Court
2,7716,Plays Pleasant,George Bernard Shaw/Dan H. Laurence/W.J. McCor...,140437940,9780140437942,eng,336,265,10,3/27/2003,Penguin Classics
3,7811,A Friend of the Earth,T. Coraghessan Boyle,747553467,9780747553465,eng,275,30,3,10/8/2001,Bloomsbury Paperbacks
4,19379,Mark Twain's Own Autobiography: The Chapters f...,Mark Twain/Michael J. Kiskis,299125408,9780299125400,eng,301,9,1,10/1/1990,University of Wisconsin Press


In [9]:
# Remove the identifier/date columns before predicting, as was done for the
# training features.
test_features = test_df.drop(columns=['bookID', 'isbn', 'isbn13', 'publication_date'])

# Submission frame: one predicted average_rating per bookID.
new_df = pd.DataFrame(
    data={
        'bookID': test_df.bookID.values,
        'average_rating': cb.predict(test_features),
    }
)
new_df

Unnamed: 0,bookID,average_rating
0,2538,3.827274
1,31912,3.955903
2,7716,3.896989
3,7811,3.733553
4,19379,3.877230
...,...,...
2776,26449,4.023425
2777,32298,4.237143
2778,28456,3.944373
2779,14719,3.878349


In [10]:
# Write the predictions to disk, omitting the DataFrame index.
new_df.to_csv(path_or_buf='dataset/books_submission.csv', index=False)