In [92]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score

In [94]:
df = pd.read_csv("bestsellers with categories.csv")

In [98]:
data = {
    'Name': ['Book1', 'Book2', 'Book3', 'Book4', 'Book5'],
    'Author': ['Author1', 'Author2', 'Author3', 'Author4', 'Author5'],
    'User Rating': [4.5, 4.0, 4.8, 3.5, 4.2],
    'Reviews': [2000, 1500, 3000, 800, 1200],
    'Price': [15.99, 20.99, 18.50, 22.00, 12.99],
    'Year': [2021, 2020, 2022, 2019, 2021],
    'Genre': ['Fantasy', 'Sci-Fi', 'Romance', 'Thriller', 'Fantasy']
}

In [124]:
# Create DataFrame
df = pd.DataFrame(data)

# Step 3: Handle Missing Values (if any)
df['Author'] = df['Author'].fillna('Unknown')  # Assign directly
df['Genre'] = df['Genre'].fillna('Unknown')  # Assign directly
df['Reviews'] = df['Reviews'].fillna(df['Reviews'].mean())  # Assign directly
df['Price'] = df['Price'].fillna(df['Price'].mean())  # Assign directly
df['Year'] = df['Year'].fillna(df['Year'].mean())  # Assign directly

# Step 4: Encode Categorical Features (Author, Genre)
label_encoder = LabelEncoder()
df['Author'] = label_encoder.fit_transform(df['Author'])
df['Genre'] = label_encoder.fit_transform(df['Genre'])


In [102]:
# Step 5: Define Features (X) and Target (y)
X = df.drop(columns=['Name', 'User Rating'])  # Drop non-relevant columns like 'Name' and 'User Rating'
y = df['User Rating']  # Target column is 'User Rating'

# Step 6: Split the Data into Training and Testing Sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Step 7: Feature Scaling (Scale numerical columns)
scaler = StandardScaler()
X_train[['Reviews', 'Price', 'Year']] = scaler.fit_transform(X_train[['Reviews', 'Price', 'Year']])
X_test[['Reviews', 'Price', 'Year']] = scaler.transform(X_test[['Reviews', 'Price', 'Year']])


In [108]:
# Step 8: Train a RandomForest Regressor Model
model = RandomForestRegressor(random_state=42)
model.fit(X_train, y_train)

# Step 9: Make Predictions
y_pred = model.predict(X_test)

# Step 10: Evaluate the Model Performance
mse = mean_squared_error(y_test, y_pred)
rmse = mse ** 0.5

# Check if there are at least two samples in the test set before calculating R²
if len(y_test) > 1:
    r2 = r2_score(y_test, y_pred)
    print(f"R^2 Score: {r2:.2f}")
else:
    print("R^2 Score is not well-defined with less than two samples.")

# Print the evaluation metrics
print(f"Mean Squared Error: {mse:.2f}")
print(f"Root Mean Squared Error: {rmse:.2f}")


R^2 Score is not well-defined with less than two samples.
Mean Squared Error: 0.05
Root Mean Squared Error: 0.22


In [122]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestRegressor
from sklearn.pipeline import Pipeline
from sklearn.metrics import mean_squared_error, r2_score
import pandas as pd

# Assuming 'df' is the DataFrame containing the necessary columns
# Drop the 'Name' column since it's not useful for prediction
X = df.drop(columns=['User Rating', 'Name'])  # Remove 'Name' and 'User Rating'
y = df['User Rating']

# Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Create the ColumnTransformer for preprocessing
column_transformer = ColumnTransformer(
    transformers=[
        ('author_genre', OneHotEncoder(handle_unknown='ignore'), ['Author', 'Genre']),
        ('scaler', StandardScaler(), ['Reviews', 'Price', 'Year'])
    ],
    remainder='passthrough'  # Keep other columns as they are (none left here after drop)
)

# Create the pipeline with preprocessing and model training
pipeline = Pipeline(steps=[
    ('preprocessor', column_transformer),
    ('model', RandomForestRegressor(random_state=42))
])

# Fit the pipeline to the training data
pipeline.fit(X_train, y_train)

# Step 9: Make Predictions
y_pred = pipeline.predict(X_test)

# Step 10: Evaluate the Model Performance
mse = mean_squared_error(y_test, y_pred)
rmse = mse ** 0.5
r2 = r2_score(y_test, y_pred)

# Print the evaluation metrics
print(f"Mean Squared Error: {mse:.2f}")
print(f"Root Mean Squared Error: {rmse:.2f}")
print(f"R^2 Score: {r2:.2f}")

# Step 11: Predict User Rating for a New Book

# Create a DataFrame for the new data (without the 'Name' column)
new_data = pd.DataFrame([{
    'Author': 'J.K. Rowling',  # Original value before encoding
    'Reviews': 3000,
    'Price': 18.99,
    'Year': 2021,
    'Genre': 'Fantasy'  # Original value before encoding
}])

# Use the pipeline to predict the User Rating for the new data
predicted_rating = pipeline.predict(new_data)
print(f"Predicted User Rating for the new book: {predicted_rating[0]:.2f}")


Mean Squared Error: 0.08
Root Mean Squared Error: 0.29
R^2 Score: nan
Predicted User Rating for the new book: 4.43


