# In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Load the dataset (the path is resolved against the current working directory)
df = pd.read_csv("./amz_uk_price_prediction_dataset.csv")

# Display basic info
# DataFrame.info() prints its report itself and returns None, so wrapping it
# in display() would only add a stray "None" to the output.
df.info()
display(df.head())

### Part 1: Understanding Product Categories ###
# Number of listings per category, and the five largest categories
category_counts = df['category'].value_counts()
top_categories = category_counts.head(5)
print("Top 5 Most Listed Product Categories:")
print(top_categories)

# Bar chart with one bar per category
fig, ax = plt.subplots(figsize=(12, 6))
category_counts.plot(kind='bar', color='skyblue', ax=ax)
ax.set_title("Product Listings by Category")
ax.set_xlabel("Category")
ax.set_ylabel("Count")
# Keep the category names vertical so neighbouring labels do not overlap
ax.tick_params(axis='x', labelrotation=90)
plt.show()

# Pie chart showing how the five largest categories split between themselves
pie_colors = ["#ff9999", "#66b3ff", "#99ff99", "#ffcc99", "#c2c2f0"]
fig, ax = plt.subplots(figsize=(8, 8))
top_categories.plot(kind='pie', autopct='%1.1f%%', colors=pie_colors, ax=ax)
ax.set_title("Top 5 Product Categories")
# Blank out the y-label that the pie plot would otherwise show
ax.set_ylabel("")
plt.show()

### Part 2: Delving into Product Pricing ###
prices = df['price']

# Measures of Centrality for Price
mean_price = prices.mean()
median_price = prices.median()
# mode() can return several tied values; report the first one
mode_price = prices.mode()[0]
print(f"Mean Price: {mean_price:.2f}, Median Price: {median_price:.2f}, Mode Price: {mode_price:.2f}")

# Measures of Dispersion for Price
price_variance = prices.var()
price_std = prices.std()
price_range = prices.max() - prices.min()
price_iqr = prices.quantile(0.75) - prices.quantile(0.25)
print(f"Price Variance: {price_variance:.2f}, Std Dev: {price_std:.2f}, Range: {price_range:.2f}, IQR: {price_iqr:.2f}")

# 95th percentile of price: the shared upper cut-off for both plots below
price_p95 = prices.quantile(0.95)

# Histogram of Prices
# Only prices up to the cut-off are binned. Binning the full range and then
# cropping the axis would spend most of the 50 bins on off-screen outliers
# and leave the visible part of the histogram with very coarse bins.
plt.figure(figsize=(10,5))
sns.histplot(prices[prices <= price_p95], bins=50, kde=True, color='purple')
plt.title("Price Distribution")
plt.xlabel("Price")
plt.ylabel("Frequency")
plt.xlim([0, price_p95]) # Limit x-axis to avoid extreme outliers
plt.show()

# Boxplot for Prices
# The box statistics are computed from all prices; the x-limit only crops the
# view, so points beyond the cut-off exist but are not shown.
plt.figure(figsize=(8,6))
sns.boxplot(x=prices, color='orange')
plt.title("Boxplot of Product Prices")
plt.xlabel("Price")
plt.xlim([0, price_p95])
plt.show()

### Part 3: Unpacking Product Stars ###
# Measures of Centrality for Stars
mean_stars = df['stars'].mean()
median_stars = df['stars'].median()
# mode() can return several tied values; report the first one
mode_stars = df['stars'].mode()[0]
print(f"Mean stars: {mean_stars:.2f}, Median stars: {median_stars:.2f}, Mode stars: {mode_stars:.2f}")

# Measures of Dispersion for Stars
stars_variance = df['stars'].var()
stars_std = df['stars'].std()
stars_iqr = df['stars'].quantile(0.75) - df['stars'].quantile(0.25)
print(f"stars Variance: {stars_variance:.2f}, Std Dev: {stars_std:.2f}, IQR: {stars_iqr:.2f}")

# Skewness and Kurtosis
stars_skewness = df['stars'].skew()
stars_kurtosis = df['stars'].kurt()
print(f"stars Skewness: {stars_skewness:.2f}, Kurtosis: {stars_kurtosis:.2f}")

# Histogram of Stars
plt.figure(figsize=(10,5))
sns.histplot(df['stars'], bins=20, kde=True, color='blue')
# Title spelling corrected from "starss"
plt.title("Distribution of Product Stars")
plt.xlabel("stars")
plt.ylabel("Frequency")
plt.show()
