In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
np.random.seed(42)

# Table of Contents :
* [1. Problem Introduction](#section1)
* [2. Sampling](#section2)
* [3. Feature engineering](#section3)
    -  [Target Encodeing](#section31)
    -  [Scaling](#section32)
* [4. Modelling](#section4)
    -  [Linear regression](#section41)
    -  [Random Forest](#section42)
* [5. Fine Tune Model](#section5)
    -  [Random Seacrh](#section51)





<a id="section1"></a>
# Problem Introduction : 
Book price prediction is a task that involves estimating the price of a book based on various factors such as its attributes, market conditions, and historical data. It can be approached as a regression problem, where the goal is to build a predictive model that can accurately predict the price of a book given its features.


In [2]:
book_df = pd.read_csv('cleaned_book.csv')
print(book_df.shape)
book_df.columns

(3830, 16)


Index(['id', 'name', 'price', 'original_price', 'discount', 'discount_rate',
       'rating_average', 'review_count', 'short_description',
       'all_time_quantity_sold', 'authors', 'categories', 'publisher_vn',
       'book_cover', 'number_of_page', 'manufacturer'],
      dtype='object')

In [3]:
X = book_df[['categories', 'publisher_vn','book_cover', 'number_of_page', 'manufacturer']]
y = book_df[['original_price']]
test_size = 0.2
random_state = 42
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_size, random_state=random_state)


In [4]:
print(X.shape, X.columns)
print(y.shape, y.columns)
print(X_train.shape, X_train.columns)
print(y_train.shape, y_train.columns)

(3830, 5) Index(['categories', 'publisher_vn', 'book_cover', 'number_of_page',
       'manufacturer'],
      dtype='object')
(3830, 1) Index(['original_price'], dtype='object')
(3064, 5) Index(['categories', 'publisher_vn', 'book_cover', 'number_of_page',
       'manufacturer'],
      dtype='object')
(3064, 1) Index(['original_price'], dtype='object')


<a id="section31"></a>
## Target Encodeing: 


In [5]:
X_train

Unnamed: 0,categories,publisher_vn,book_cover,number_of_page,manufacturer
3142,Tin Học Văn Phòng,Thái Hà,Bìa mềm,360,Nhà Xuất Bản Công Thương
3263,Sách Y Học,Thái Hà,Bìa mềm,264,Nhà Xuất Bản Công Thương
829,Sách Kiến Thức - Kỹ Năng Cho Trẻ,Tân Việt,Bìa mềm,31,Nhà Xuất Bản Mỹ Thuật
700,Sách nghệ thuật sống đẹp,Công ty cổ phần Ahora,Bìa mềm,556,Nhà Xuất Bản Thế Giới
3779,Sách tiếng Việt,Công ty Sách Bảo Trang,Bìa mềm,192,Nhà Xuất Bản Tri Thức
...,...,...,...,...,...
1130,Sách Học Tiếng Anh,Zenbooks,Bìa mềm,327,Nhà Xuất Bản Đà Nẵng
1294,Sách tham khảo cấp III,Cty Sách Sách Hay,Bìa mềm,204,Nhà Xuất Bản Đại Học Quốc Gia Hà Nội
860,Sách Làm Cha Mẹ,Đinh Tị,Bìa mềm,396,Nhà Xuất Bản Thanh Niên
3507,Sách Tâm Lý Tuổi Teen,First News - Trí Việt,Bìa mềm,152,Nhà Xuất Bản Tổng Hợp


In [6]:
from category_encoders import MEstimateEncoder

# Create the encoder instance. Choose m to control noise.
encoder = MEstimateEncoder(cols=['categories',"publisher_vn",'book_cover','manufacturer'])

# Fit the encoder on the training set.
encoder.fit(X_train, y_train)

# Encode the  data

X_train_encode = encoder.transform(X_train)
X_test_encode = encoder.transform(X_test)
print(X_train_encode.shape, X_train_encode)



(3064, 5)          categories   publisher_vn     book_cover  number_of_page  \
3142  177550.865403  146753.719575  141483.233209             360   
3263  159453.944781  146753.719575  141483.233209             264   
829   100696.923096  105234.412971  141483.233209              31   
700   131412.820537  163476.923221  141483.233209             556   
3779  162966.011930  188453.846443  141483.233209             192   
...             ...            ...            ...             ...   
1130  192190.446659  123287.044550  141483.233209             327   
1294  185118.414936  112953.846443  141483.233209             204   
860   150237.428252  119494.230805  141483.233209             396   
3507  115283.076940  158884.835170  141483.233209             152   
3174  150169.230805  159453.846443  141483.233209             120   

       manufacturer  
3142  152998.557701  
3263  152998.557701  
829   281648.160560  
700   203974.463871  
3779  194849.946346  
...             ...  
1130  1

In [7]:
print(X_test_encode.shape, X_test_encode)


(766, 5)          categories   publisher_vn     book_cover  number_of_page  \
2922  106790.298101  169134.855805  141483.233209             328   
3716  208616.849831  156907.692885  305274.464338             620   
1964  298946.684360  257927.179526  141483.233209             724   
3621  221177.622404  146753.719575  141483.233209             276   
1694  268602.564116  211194.924669  141483.233209             476   
...             ...            ...            ...             ...   
1070  192190.446659  211194.924669  141483.233209             556   
2417   52300.037896   59592.414534  141483.233209             192   
433   112817.948814  109271.564295  141483.233209              80   
807   150237.428252   99144.729366  141483.233209             688   
371   133258.299626  159889.903851  141483.233209             145   

       manufacturer  
2922  195651.065092  
3716  203974.463871  
1964  140957.577506  
3621  152998.557701  
1694  192534.760221  
...             ...  
1070  19

<a id="section32"></a>

# SCALING


In [8]:
from sklearn.preprocessing import StandardScaler
# Create a StandardScaler instance
scaler = StandardScaler()

# Fit the scaler on the training data
scaler.fit(X_train_encode)

X_train_normalized = scaler.transform(X_train_encode)

# Normalize the testing data using the same scaler
X_test_normalized = scaler.transform(X_test_encode)



<a id="section41"></a>
# Linear Regression : 

In [9]:
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import cross_val_score

linRegressor = LinearRegression()

scores = cross_val_score(linRegressor, X_train_normalized, y_train, cv=5, scoring='neg_mean_squared_error')

# Convert the negative MSE scores to positive RMSE scores
rmse_scores = np.sqrt(-scores)

# Print the RMSE scores for each fold
print("RMSE scores:", rmse_scores)
print("Mean RMSE:", rmse_scores.mean())

RMSE scores: [86010.72222065 88914.56453534 71216.49616304 90418.80553456
 88223.08345698]
Mean RMSE: 84956.73438211411


<a id="section42"></a>
# Random Forest : 

In [10]:
from sklearn.ensemble import RandomForestRegressor

forest_reg = RandomForestRegressor(n_estimators=100, random_state=42)
scores = cross_val_score(forest_reg, X_train_normalized, y_train, cv=5, scoring='neg_mean_squared_error')

# Convert the negative MSE scores to positive RMSE scores
rmse_scores = np.sqrt(-scores)

# Print the RMSE scores for each fold
print("RMSE scores:", rmse_scores)
print("Mean RMSE:", rmse_scores.mean())

  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)


RMSE scores: [83640.68260803 87327.12332847 74192.03021814 87631.80091609
 95775.10556408]
Mean RMSE: 85713.3485269628


<a id="section51"></a>

# RANDOMIZE SEARCH

In [11]:
from sklearn.model_selection import RandomizedSearchCV

params = {
        'n_estimators': [np.random.randint(low=1, high=100)],
        'max_features': [np.random.randint(low=1, high=5)],
    }

forest_reg = RandomForestRegressor(random_state=42)
rnd_search = RandomizedSearchCV(forest_reg, param_distributions=params,
                                n_iter=10, cv=5, scoring='neg_mean_squared_error', random_state=42)
rnd_search.fit(X_train_normalized, y_train)


  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  self.best_estimator_.fit(X, y, **fit_params)


In [12]:
rnd_search.best_params_

{'n_estimators': 52, 'max_features': 1}

In [13]:
forest_reg_tuning = RandomForestRegressor(n_estimators=72, max_features=1 ,random_state=42)
scores = cross_val_score(forest_reg_tuning, X_train_normalized, y_train, cv=5, scoring='neg_mean_squared_error')

# Convert the negative MSE scores to positive RMSE scores
rmse_scores = np.sqrt(-scores)

# Print the RMSE scores for each fold
print("RMSE scores:", rmse_scores)
print("Mean RMSE:", rmse_scores.mean())

  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)


RMSE scores: [84858.31141495 85106.90452188 69829.65116489 87117.74967399
 90226.21622413]
Mean RMSE: 83427.76659996875


In [15]:
forest_reg_tuning.fit( X_train_normalized, y_train)
y_pred = forest_reg_tuning.predict(X_test_encode)
mse = mean_squared_error(y_test, y_pred)
# Convert the negative MSE scores to positive RMSE scores
rmse_scores = np.sqrt(mse)

# Print the RMSE scores for each fold
print("RMSE scores:", rmse_scores)
print("Mean RMSE:", rmse_scores.mean())

  forest_reg_tuning.fit( X_train_normalized, y_train)


RMSE scores: 641195.3465878738
Mean RMSE: 641195.3465878738


