# Amazon Bestselling Books Analysis

In this analysis, we look for keywords in book titles that are associated with higher ratings. The results are correlations, not evidence that a keyword causes a higher rating.

In [1]:
# Import necessary libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
import seaborn as sns
import re
from wordcloud import WordCloud

ModuleNotFoundError: No module named 'wordcloud'

In [None]:
# Read the bestseller CSV (relative path) into a DataFrame
file_path = 'BestSeller Books of Amazon.csv'
df = pd.read_csv(filepath_or_buffer=file_path)
# Show the first five rows as the cell output
df.head(n=5)

## Data Preprocessing
Clean the data by removing special characters and converting ratings and prices to numerical values.

In [None]:
# Strip everything except digits and the decimal point from 'Price', then cast to float.
# Matching "anything that is not a digit or a dot" removes the currency symbol and the
# thousands separators regardless of how the rupee sign was decoded (a real '₹' or its
# mojibake form 'â‚¹'), instead of depending on one specific character sequence.
df['Price'] = df['Price'].replace(r'[^0-9.]', '', regex=True).astype(float)
# Convert 'Rating' to numerical
df['Rating'] = df['Rating'].astype(float)
# Display the cleaned data
df.head()

## Text Analysis
Tokenize the book titles and perform text analysis to extract keywords.

In [None]:
# Function to clean and tokenize book titles
def clean_text(text):
    """Normalize a book title and split it into lowercase word tokens.

    Every character that is neither an ASCII letter (a-z, A-Z) nor whitespace
    is removed, the remainder is lowercased and split on whitespace.

    Parameters
    ----------
    text : str
        Raw title text.

    Returns
    -------
    list of str
        Lowercase tokens in their original order; an empty list when no
        letters remain.
    """
    # The previous call passed `re.I | re.A` as the fourth positional argument
    # of re.sub, which is `count`, not `flags`: the flags were never applied and
    # only the first 258 matches were removed. No flags are needed here: the
    # class already lists both letter cases, and leaving re.A out keeps `\s`
    # matching Unicode whitespace (e.g. no-break space) exactly as before, so
    # words separated by such characters are still split apart.
    letters_only = re.sub(r'[^a-zA-Z\s]', '', text)
    # split() with no argument discards leading/trailing whitespace itself.
    return letters_only.lower().split()

# Tokenize each title with clean_text and store the token lists in a 'tokens' column
df['tokens'] = df['Book Name'].apply(func=clean_text)
# Preview the frame including the new column
df.head(n=5)

## Correlation Analysis
Analyze the correlation between the presence of certain keywords in book titles and the ratings.

In [None]:
# Build a bag-of-words count matrix from the titles, reusing clean_text as the tokenizer.
# token_pattern=None: the default pattern is unused once a custom tokenizer is supplied,
# and leaving it set makes scikit-learn emit a warning about the ignored parameter.
vectorizer = CountVectorizer(tokenizer=clean_text, token_pattern=None)
X = vectorizer.fit_transform(df['Book Name'])
# One column per token. Reuse df's index so each row stays aligned with its source row
# even when df's index is not the default 0..n-1 range (e.g. after rows were filtered).
X_df = pd.DataFrame(X.toarray(), columns=vectorizer.get_feature_names_out(), index=df.index)
# Add the ratings; assignment aligns on the shared index. clean_text lowercases its
# tokens, so the capitalized 'Rating' column cannot collide with a token column.
X_df['Rating'] = df['Rating']
# Pairwise correlations between every token-count column and the rating
correlation_matrix = X_df.corr()
# Token-vs-rating correlations, strongest positive first
correlation_matrix['Rating'].sort_values(ascending=False)

## Visualization
Create visualizations to present the findings.

In [None]:
# Correlation of each title keyword with the rating. Drop the 'Rating' entry itself:
# its self-correlation of 1.0 would otherwise be drawn as the largest "word".
keyword_corr = correlation_matrix['Rating'].drop(labels='Rating', errors='ignore').dropna()
# A word cloud encodes magnitude as font size, so only positively correlated keywords
# are kept; negative correlations have no meaningful size representation here.
keyword_corr = keyword_corr[keyword_corr > 0]
# Generate a word cloud of the keywords most positively correlated with rating
wordcloud = WordCloud(width=800, height=400, background_color='white').generate_from_frequencies(keyword_corr.to_dict())
# Display the word cloud
plt.figure(figsize=(10, 5))
plt.imshow(wordcloud, interpolation='bilinear')
plt.title('Title keywords positively correlated with rating')
plt.axis('off')
plt.show()