# Amazon Best Sellers Data Analysis (2009–2019)
Author: Filza Rahman  
Description: An analysis of top-selling books using Python, pandas, and matplotlib.


In [None]:
# Step 1: Import Required Libraries
import pandas as pd
import matplotlib.pyplot as plt


## Step 2: Load the Dataset
Load the CSV file containing the Amazon bestsellers data.

In [None]:
# Read the bestsellers CSV (expected in the working directory) into a DataFrame.
csv_path = 'bestsellers.csv'
df = pd.read_csv(csv_path)

# Quick sanity check: confirmation message, first rows, dimensions, column names.
print(" Data successfully loaded!\n")
print(df.head())
print("\nShape of dataset:", df.shape)
print("\nColumns:", df.columns.tolist())


## Step 3: Clean the Data
- Remove duplicates  
- Rename columns for clarity  
- Convert Price column to float

In [None]:
# Drop rows that are exact duplicates so repeated records are not counted twice.
df.drop_duplicates(inplace=True)

# Rename the columns that the later analysis cells refer to.
# NOTE(review): "Year" becomes "Publication Year" here; confirm the source
# column really holds the publication year rather than the year on the chart.
df.rename(columns={
    "Name": "Title",
    "User Rating": "Rating",
    "Year": "Publication Year"
}, inplace=True)

# Make sure prices are stored as floats.
df["Price"] = df["Price"].astype(float)

print("\n✅ Data cleaned and ready for analysis!")
# DataFrame.info() writes its summary to stdout itself and returns None, so
# wrapping it in print() would add a stray "None" line after the summary.
df.info()


## Step 4: Exploratory Data Analysis (EDA)
### 4.1 Distribution of Books by Genre

In [None]:
# Tally how many rows fall under each genre.
genre_counts = df['Genre'].value_counts()
print("\nBooks by Genre:\n", genre_counts)

# Bar chart of the tallies, one bar per genre.
bar_colors = ['skyblue', 'lightgreen']
fig, ax = plt.subplots(figsize=(6, 4))
ax.bar(genre_counts.index, genre_counts.values, color=bar_colors)
ax.set_title("Distribution of Books by Genre")
ax.set_xlabel("Genre")
ax.set_ylabel("Count")
fig.tight_layout()
plt.show()

### 4.2 Top 10 Authors by Number of Bestsellers

In [None]:
# Count rows per author and keep the ten most frequent.
author_counts = df['Author'].value_counts().iloc[:10]
print("\nTop 10 Authors:\n", author_counts)

# barh draws the first entry at the bottom, so reverse the order to put the
# most frequent author at the top of the chart.
bottom_to_top = author_counts.iloc[::-1]

fig, ax = plt.subplots(figsize=(10, 5))
ax.barh(bottom_to_top.index, bottom_to_top.values, color='lightcoral')
ax.set_title("Top 10 Best-Selling Authors (2009–2019)")
ax.set_xlabel("Number of Books in Top 50")
ax.set_ylabel("Author")
fig.tight_layout()
plt.show()

### 4.3 Average Rating by Genre

In [None]:
# Mean rating per genre, ordered from highest to lowest.
mean_rating = df.groupby("Genre")["Rating"].mean()
avg_rating_by_genre = mean_rating.sort_values(ascending=False)
print("\nAverage Rating by Genre:\n", avg_rating_by_genre)

fig, ax = plt.subplots(figsize=(6, 4))
ax.bar(
    avg_rating_by_genre.index,
    avg_rating_by_genre.values,
    color=['orange', 'purple'],
)
ax.set_title("Average Rating by Genre")
ax.set_xlabel("Genre")
ax.set_ylabel("Average Rating")
# The y-axis is restricted to the 4.5-4.7 range.
ax.set_ylim(4.5, 4.7)
fig.tight_layout()
plt.show()

### 4.4 Yearly Trends in Bestsellers

In [None]:
# Number of rows per year, ordered chronologically by the year value.
year_tally = df['Publication Year'].value_counts()
books_per_year = year_tally.sort_index()
print("\nBooks per Year:\n", books_per_year)

# Line chart of the yearly counts with a marker on every data point.
fig, ax = plt.subplots(figsize=(10, 5))
ax.plot(
    books_per_year.index,
    books_per_year.values,
    marker='o',
    linestyle='-',
    color='blue',
)
ax.set_title("Number of Bestsellers per Year (2009–2019)")
ax.set_xlabel("Year")
ax.set_ylabel("Number of Books")
ax.grid(True)
fig.tight_layout()
plt.show()

### 4.5 Relationship: Rating vs. Price

In [None]:
# Scatter of price against rating, drawn as one series per genre so each
# genre gets its own colour and legend entry.
fig, ax = plt.subplots(figsize=(7, 5))
for genre_name in df['Genre'].unique():
    genre_rows = df.loc[df['Genre'] == genre_name]
    ax.scatter(
        genre_rows['Price'],
        genre_rows['Rating'],
        label=genre_name,
        alpha=0.7,
    )
ax.set_title("Book Price vs. Rating by Genre")
ax.set_xlabel("Price ($)")
ax.set_ylabel("Rating")
ax.legend()
fig.tight_layout()
plt.show()

### 4.6 Correlation Between Numeric Variables

In [None]:
# Pairwise correlation between the four numeric columns.
numeric_columns = ['Rating', 'Reviews', 'Price', 'Publication Year']
corr = df[numeric_columns].corr()
print("\nCorrelation Matrix:\n", corr)

# Draw the matrix as an image; one tick per column on each axis.
fig, ax = plt.subplots(figsize=(6, 4))
heatmap = ax.imshow(corr, cmap='Blues', interpolation='none')
fig.colorbar(heatmap, ax=ax)

tick_positions = range(len(corr))
ax.set_xticks(tick_positions)
ax.set_xticklabels(corr.columns, rotation=45)
ax.set_yticks(tick_positions)
ax.set_yticklabels(corr.columns)

ax.set_title("Correlation Heatmap")
fig.tight_layout()
plt.show()

## Step 5: Export Key Insights

In [None]:
# Map each output file name to the summary series written into it.
exports = {
    "top_10_authors.csv": author_counts,
    "avg_rating_by_genre.csv": avg_rating_by_genre,
    "books_per_year.csv": books_per_year,
}

# Write every file first, then report what was written.
for file_name, summary in exports.items():
    summary.to_csv(file_name)

print("\nCSV files exported successfully:")
for file_name in exports:
    print("- " + file_name)

## Step 6: Summary Insights

In [None]:
# Print the hard-coded summary, framed by a divider line above and below.
divider = "---------------------------------------------------"
insights = (
    "• Fiction books maintain slightly higher ratings on average than Non-Fiction.",
    "• A few authors like Jeff Kinney and Suzanne Collins dominated the charts.",
    "• 2014–2017 saw a rise in the number of top sellers, suggesting strong publishing cycles.",
    "• Price has minimal correlation with Rating, but expensive books tend to get fewer reviews.",
    "• Consistently high-rated authors correlate with popular long-running series.",
)

print("\nKEY INSIGHTS:")
print(divider)
for insight in insights:
    print(insight)
print(divider)
print("\nAnalysis complete!")