In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import scipy.stats as stats

# Load the dataset into a DataFrame.
# NOTE(review): the address below is the Kaggle dataset landing page rather
# than a direct link to a CSV file - confirm that read_csv can parse what it
# returns, or point this at the downloaded CSV instead.
dataset_location = 'https://www.kaggle.com/datasets/asaniczka/uk-optimal-product-price-prediction/'
df = pd.read_csv(dataset_location)

# Part 1: Analyzing Best-Seller Trends Across Product Categories
# Contingency table of raw counts: one row per category, one column per
# isBestSeller value.
crosstab = pd.crosstab(df['category'], df['isBestSeller'])

# Keep an untouched copy of the counts. The derived proportion column added
# to `crosstab` below must not leak into the chi-square / Cramér's V inputs:
# it would inflate the sample size and the table's column count.
best_seller_counts = crosstab.copy()

# Calculate proportion of best-sellers for each category.
# Selecting columns with `== 1` matches both integer (0/1) and boolean
# (False/True) column labels, since True == 1.
row_totals = best_seller_counts.sum(axis=1)
flagged_counts = best_seller_counts.loc[:, best_seller_counts.columns == 1].sum(axis=1)
crosstab['proportion_best_sellers'] = flagged_counts / row_totals
crosstab_sorted = crosstab.sort_values(by='proportion_best_sellers', ascending=False)
print(crosstab_sorted.head())

# Chi-square test and Cramér's V, both computed on the pure count table.
chi2, p, dof, expected = stats.chi2_contingency(best_seller_counts)
n = best_seller_counts.to_numpy().sum()  # total number of observations
# Cramér's V = sqrt(chi2 / (n * (min(rows, cols) - 1)))
cramers_v = np.sqrt(chi2 / (n * (min(best_seller_counts.shape) - 1)))

print(f"Chi-square test p-value: {p}")
print(f"Cramér's V: {cramers_v}")

# Stacked bar chart of the per-category isBestSeller counts; the derived
# proportion column is dropped so only the counts are stacked.
status_counts = crosstab_sorted.drop(columns='proportion_best_sellers')
ax = status_counts.plot(kind='bar', stacked=True, figsize=(12, 8))
ax.set_title('Distribution of Best-Seller Status Across Categories')
ax.set_xlabel('Category')
ax.set_ylabel('Count')
plt.show()

# Part 2: Exploring Product Prices and Ratings Across Categories and Brands
# Remove outliers in product prices using fences at 1.5 * IQR beyond the
# first and third quartiles.
price = df['price']
Q1, Q3 = price.quantile(0.25), price.quantile(0.75)
IQR = Q3 - Q1
lower_fence = Q1 - 1.5 * IQR
upper_fence = Q3 + 1.5 * IQR
# Only rows strictly outside the fences are flagged; everything else is kept.
is_price_outlier = (price < lower_fence) | (price > upper_fence)
df_filtered = df[~is_price_outlier]

# Violin plot of price distributions for the 20 most frequent categories
# in the outlier-filtered data.
category_counts = df_filtered['category'].value_counts()
top_20_categories = category_counts.index[:20]
in_top_20 = df_filtered['category'].isin(top_20_categories)
df_top_20 = df_filtered[in_top_20]

plt.figure(figsize=(15, 10))
sns.violinplot(data=df_top_20, x='category', y='price')
plt.xticks(rotation=90)
plt.title('Distribution of Prices Across Top 20 Categories')
plt.show()

# Bar chart of the mean price for the 10 most frequent categories,
# ordered from highest to lowest mean.
top_10_categories = df_filtered['category'].value_counts().index[:10]
in_top_10 = df_filtered['category'].isin(top_10_categories)
df_top_10 = df_filtered[in_top_10]
price_by_category = df_top_10.groupby('category')['price']
avg_price = price_by_category.mean().sort_values(ascending=False)

plt.figure(figsize=(12, 8))
sns.barplot(x=avg_price.index, y=avg_price.values)
plt.xlabel('Category')
plt.ylabel('Average Price')
plt.xticks(rotation=90)
plt.title('Average Price of Products for Top 10 Categories')
plt.show()

# Box plot of the rating distribution within each of the top 10 categories.
plt.figure(figsize=(15, 10))
sns.boxplot(data=df_top_10, x='category', y='rating')
plt.xticks(rotation=90)
plt.title('Distribution of Ratings Across Top 10 Categories')
plt.show()

# Part 3: Investigating the Interplay Between Product Prices and Ratings
# Correlation coefficient between price and rating on the filtered data
# (Series.corr with its default method).
filtered_prices = df_filtered['price']
filtered_ratings = df_filtered['rating']
correlation = filtered_prices.corr(filtered_ratings)
print(f"Correlation between price and rating: {correlation}")

# Scatter plot of price (y-axis) against rating (x-axis) for the filtered data.
plt.figure(figsize=(10, 6))
scatter_ax = sns.scatterplot(data=df_filtered, x='rating', y='price')
scatter_ax.set_title('Relationship Between Product Rating and Price')
scatter_ax.set_xlabel('Rating')
scatter_ax.set_ylabel('Price')
plt.show()

# Correlation heatmap
# Restrict the frame to numeric and boolean columns before correlating.
# Since pandas 2.0, DataFrame.corr() no longer drops non-numeric columns by
# default and raises when it meets text columns; selecting the columns
# explicitly works on every pandas version and reproduces the columns the
# older default used.
numeric_columns = df_filtered.select_dtypes(include=['number', 'bool'])
plt.figure(figsize=(10, 8))
sns.heatmap(numeric_columns.corr(), annot=True, cmap='coolwarm')
plt.title('Correlation Heatmap')
plt.show()

# QQ plot comparing the filtered prices against a normal distribution.
qq_sample = df_filtered['price']
plt.figure(figsize=(10, 6))
stats.probplot(qq_sample, plot=plt, dist="norm")
plt.title('QQ Plot for Product Prices')
plt.show()
