In [53]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
# Seed NumPy's global random generator so later code that draws from np.random is repeatable.
np.random.seed(42)

# Table of Contents :
* [1. Problem Goal](#section1)
* [2. Sampling](#section2)
* [3. Feature engineering](#section3)
    -  [Target Encoding](#section31)
* [4. Modelling](#section4)
    -  [Linear regression](#section41)



<a id="section1"></a>
# Problem Goal : 
Book price prediction is a task that involves estimating the price of a book based on various factors such as its attributes, market conditions, and historical data. It can be approached as a regression problem, where the goal is to build a predictive model that can accurately predict the price of a book given its features.


<a id="section2"></a>
# SAMPLING:

In [54]:
# Load the cleaned book dataset from the working directory.
book_df = pd.read_csv(filepath_or_buffer='cleaned_book.csv')

# Report (row count, column count), then show the column labels as the cell result.
print((len(book_df.index), len(book_df.columns)))
book_df.columns

(3830, 16)


Index(['id', 'name', 'price', 'original_price', 'discount', 'discount_rate',
       'rating_average', 'review_count', 'short_description',
       'all_time_quantity_sold', 'authors', 'categories', 'publisher_vn',
       'book_cover', 'number_of_page', 'manufacturer'],
      dtype='object')

In [55]:
# Predictors: four descriptive attributes plus the page count.
X = book_df.loc[:, ['categories', 'publisher_vn', 'book_cover', 'number_of_page', 'manufacturer']]
# Target: kept as a single-column frame rather than a Series.
y = book_df.loc[:, ['original_price']]

# Hold out 20% of the rows for evaluation; the fixed random_state makes the split repeatable.
test_size, random_state = 0.2, 42
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=test_size,
    random_state=random_state,
)


In [56]:
# Sanity check: show the shape and column labels of the full and training-side frames.
for frame in (X, y, X_train, y_train):
    print(frame.shape, frame.columns)

(3830, 5) Index(['categories', 'publisher_vn', 'book_cover', 'number_of_page',
       'manufacturer'],
      dtype='object')
(3830, 1) Index(['original_price'], dtype='object')
(3064, 5) Index(['categories', 'publisher_vn', 'book_cover', 'number_of_page',
       'manufacturer'],
      dtype='object')
(3064, 1) Index(['original_price'], dtype='object')


<a id="section3"></a>
<a id="section31"></a>
## Target Encoding


In [57]:
from category_encoders import MEstimateEncoder

# Encode the four listed columns; number_of_page is not listed.
# `m` is not passed here, so the encoder runs with its library default.
encoder = MEstimateEncoder(
    cols=['categories', 'publisher_vn', 'book_cover', 'manufacturer'],
)

# Fit on the training rows only, so the test rows do not influence the encoding.
encoder.fit(X_train, y_train)

# Apply the fitted encoder to both partitions.
X_train_encode, X_test_encode = (
    encoder.transform(part) for part in (X_train, X_test)
)
print(X_train_encode.shape, X_train_encode)



(3064, 5)          categories   publisher_vn     book_cover  number_of_page  \
3142  177550.865403  146753.719575  141483.233209             360   
3263  159453.944781  146753.719575  141483.233209             264   
829   100696.923096  105234.412971  141483.233209              31   
700   131412.820537  163476.923221  141483.233209             556   
3779  162966.011930  188453.846443  141483.233209             192   
...             ...            ...            ...             ...   
1130  192190.446659  123287.044550  141483.233209             327   
1294  185118.414936  112953.846443  141483.233209             204   
860   150237.428252  119494.230805  141483.233209             396   
3507  115283.076940  158884.835170  141483.233209             152   
3174  150169.230805  159453.846443  141483.233209             120   

       manufacturer  
3142  152998.557701  
3263  152998.557701  
829   281648.160560  
700   203974.463871  
3779  194849.946346  
...             ...  
1130  1

In [58]:
# Print the encoded test set's shape followed by the encoded frame itself.
print(X_test_encode.shape, X_test_encode)


(766, 5)          categories   publisher_vn     book_cover  number_of_page  \
2922  106790.298101  169134.855805  141483.233209             328   
3716  208616.849831  156907.692885  305274.464338             620   
1964  298946.684360  257927.179526  141483.233209             724   
3621  221177.622404  146753.719575  141483.233209             276   
1694  268602.564116  211194.924669  141483.233209             476   
...             ...            ...            ...             ...   
1070  192190.446659  211194.924669  141483.233209             556   
2417   52300.037896   59592.414534  141483.233209             192   
433   112817.948814  109271.564295  141483.233209              80   
807   150237.428252   99144.729366  141483.233209             688   
371   133258.299626  159889.903851  141483.233209             145   

       manufacturer  
2922  195651.065092  
3716  203974.463871  
1964  140957.577506  
3621  152998.557701  
1694  192534.760221  
...             ...  
1070  19

<a id="section4"></a>
<a id="section41"></a>
# Linear Regression

In [59]:
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error

# Fit a linear model on the encoded training features (fit() returns the estimator itself).
regressor = LinearRegression().fit(X_train_encode, y_train)

# Score on the held-out rows: predict, then compare against the true prices.
y_predict = regressor.predict(X_test_encode)
mse = mean_squared_error(y_test, y_predict)

print('Mean Squared Error:', mse)

Mean Squared Error: 7135861691.474946
