In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestRegressor
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.metrics import mean_squared_log_error

In [2]:
# Load the datasets
train_data = pd.read_excel('Data_Train.xlsx')
test_data = pd.read_excel('Data_Test.xlsx')

In [3]:
# Preprocessing function
def preprocess_data(df):
    df = df.fillna('')
    df['Reviews'] = df['Reviews'].str.extract(r'(\d+)').astype(int)
    df['Ratings'] = df['Ratings'].str.extract(r'(\d+)').astype(int)
    return df

train_data = preprocess_data(train_data)
test_data = preprocess_data(test_data)

In [4]:
# Define feature columns and target column
feature_columns = ['Title', 'Author', 'Edition', 'Reviews', 'Ratings', 'Synopsis', 'Genre', 'BookCategory']
target_column = 'Price'

In [5]:
# Split the training data into features and target
X = train_data[feature_columns]
y = train_data[target_column]

In [6]:
# Further split the training data into training and validation sets
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

In [7]:
# Define preprocessing steps
text_features = ['Title', 'Author', 'Synopsis', 'Genre', 'BookCategory']
numeric_features = ['Reviews', 'Ratings']

text_transformer = Pipeline(steps=[
    ('tfidf', TfidfVectorizer(max_features=100))
])

numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler())
])

preprocessor = ColumnTransformer(
    transformers=[
        ('text_title', text_transformer, 'Title'),
        ('text_author', text_transformer, 'Author'),
        ('text_synopsis', text_transformer, 'Synopsis'),
        ('text_genre', text_transformer, 'Genre'),
        ('text_bookcategory', text_transformer, 'BookCategory'),
        ('num', numeric_transformer, numeric_features)
    ])

In [8]:
# Define the model
model = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('regressor', RandomForestRegressor(n_estimators=100, random_state=42))
])

In [9]:
# Train the model
model.fit(X_train, y_train)

In [10]:
# Predict on the validation set
y_val_pred = model.predict(X_val)

In [11]:
# Calculate RMSLE on the validation set
rmsle_score = np.sqrt(mean_squared_log_error(y_val, y_val_pred))
print('Validation RMSLE:', rmsle_score)

Validation RMSLE: 0.6484846574254421


In [12]:
# Predict on the test data
y_test_pred = model.predict(test_data[feature_columns])

In [13]:
# Save predictions
predictions = pd.DataFrame({'Price': y_test_pred})
predictions.to_csv('book_price_predictions.csv', index=False)

print("Predictions saved to 'book_price_predictions.csv'.")

Predictions saved to 'book_price_predictions.csv'.
