In [None]:
from pathlib import Path

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import scipy.stats as stats
import seaborn as sns
from scipy.stats import pearsonr
from scipy.stats.contingency import association

In [None]:
# Locate the dataset. A copy of the CSV sitting in the notebook's working
# directory is preferred so the analysis runs on any machine; otherwise fall
# back to the absolute location the notebook was originally written against.
DATA_FILENAME = "amz_uk_price_prediction_dataset.csv"
LEGACY_DATA_PATH = "C:/Users/Pc/Desktop/Ironhack/Labs/lab-eda-bivariate/amz_uk_price_prediction_dataset.csv"
amazon_data = DATA_FILENAME if Path(DATA_FILENAME).exists() else LEGACY_DATA_PATH

# Load the raw dataset and preview the first rows
amz_df = pd.read_csv(amazon_data)
amz_df.head()

Part 1: Analyzing Best-Seller Trends Across Product Categories
1. Crosstab Analysis:

In [None]:
# Contingency table: one row per category, one column per best-seller flag
crosstab = pd.crosstab(index=amz_df['category'], columns=amz_df['isBestSeller'])

# Raw counts
print(crosstab)

# Share of best-sellers within each category. The row total is evaluated
# before the new column is attached, so it only covers the two count columns.
crosstab['Proportion Best-Seller'] = crosstab[True].div(crosstab.sum(axis=1))

# Rank the categories from the highest best-seller share to the lowest
crosstab_sorted = crosstab.sort_values(by='Proportion Best-Seller', ascending=False)

# Ranked table
print(crosstab_sorted)

The category with the highest proportion of best-sellers is 'Grocery'

2. Statistical Tests (Chi-Square and Cramér's V):

In [None]:
# `crosstab` carries a derived 'Proportion Best-Seller' column once the
# crosstab cell has run. Both statistics are only meaningful on the raw
# observed counts, so strip that column by name first (no-op if it is absent).
observed_counts = crosstab.drop(columns='Proportion Best-Seller', errors='ignore')

# Chi-square test of independence on the observed counts
chi2, p, _, _ = stats.chi2_contingency(observed_counts)

# Print the results of the Chi-square test
print(f"Chi-square statistic: {chi2}")
print(f"P-value: {p}")

# Cramér's V (strength of association) on the same observed counts
cramers_v = association(observed_counts.values, method="cramer")
print(f"Cramér's V: {cramers_v}")


Chi-square: The relationship between product category and best-seller distribution is statistically significant, so the two variables are not independent.  

Cramér's V is a measure of the strength of association between two categorical variables, ranging from 0 to 1. In this case, Cramér's V is 0.1222829439760564, which indicates a weak association between the two variables, despite the large chi-square statistic and significant p-value.

3. Visualizations:

In [None]:
# Create a crosstab to count the number of best-sellers vs non-best-sellers by category
category_best_seller = pd.crosstab(amz_df['category'], amz_df['isBestSeller'])
print (category_best_seller)

In [None]:
# Share of best-sellers per category. The row total is taken from the count
# columns only: if this cell is re-run, 'Proportion Best-Seller' already exists
# on the frame and must not be folded into the denominator.
count_columns = category_best_seller.drop(columns='Proportion Best-Seller', errors='ignore')
category_best_seller['Proportion Best-Seller'] = category_best_seller[True] / count_columns.sum(axis=1)

# Sort categories by the proportion of best-sellers in descending order
category_best_seller_sorted = category_best_seller.sort_values('Proportion Best-Seller', ascending=False)

# Select the top N categories (e.g., top 20)
top_n = 20
top_categories = category_best_seller_sorted.head(top_n)

# Horizontal bar chart of the top N categories, drawn on an explicit axes
fig, ax = plt.subplots(figsize=(10, 8))
top_categories['Proportion Best-Seller'].plot(kind='barh', color='salmon', ax=ax)

# Customize the plot
ax.set_title(f'Top {top_n} Categories by Proportion of Best-Sellers', fontsize=16)
ax.set_xlabel('Proportion of Best-Sellers', fontsize=12)
ax.set_ylabel('Product Category', fontsize=12)

# Reverse the Y-axis so the highest proportion is at the top
ax.invert_yaxis()

# Adjust layout for a better fit
fig.tight_layout()

# Show the plot
plt.show()

Part 2: Exploring Product Prices and Ratings Across Categories and Brands
    
    0. Preliminary Step: Remove outliers in product prices.

In [None]:
# Tukey's rule: a price is an outlier when it lies more than 1.5 * IQR
# below the first quartile or above the third quartile.
Q1 = amz_df['price'].quantile(0.25)
Q3 = amz_df['price'].quantile(0.75)

# Interquartile range
IQR = Q3 - Q1

# Inclusive fences for acceptable prices
lower_bound = Q1 - 1.5 * IQR
upper_bound = Q3 + 1.5 * IQR

# Keep only rows whose price lies within the fences (both ends inclusive).
# `.copy()` makes the result an independent frame, so adding columns to it
# later does not raise SettingWithCopyWarning or depend on `amz_df`.
amz_df_no_outliers = amz_df[amz_df['price'].between(lower_bound, upper_bound)].copy()

# Show the new dataframe with outliers removed
amz_df_no_outliers.head()

1. Violin plots

In [None]:
# Filter the top 20 categories based on count (number of products) in the outlier-free dataset
top_20_categories = amz_df_no_outliers['category'].value_counts().nlargest(20).index
filtered_data_no_outliers = amz_df_no_outliers[amz_df_no_outliers['category'].isin(top_20_categories)]

# Violin plot of the price distribution across the top 20 categories.
# `palette` without `hue` is deprecated in recent seaborn releases, so the x
# variable is also passed as `hue` and the redundant legend is suppressed —
# the same pattern the ratings box plot in this notebook already uses.
plt.figure(figsize=(12, 8))
sns.violinplot(
    data=filtered_data_no_outliers,
    x='category',
    y='price',
    hue='category',
    palette='muted',
    legend=False,
)

# Customize the plot
plt.title('Price Distribution Across Top 20 Categories (Outlier-Free)', fontsize=16)
plt.xlabel('Product Category', fontsize=12)
plt.ylabel('Price', fontsize=12)
plt.xticks(rotation=90)  # Rotate x-axis labels for better readability
plt.tight_layout()

# Show the plot
plt.show()


In [None]:
# Median price per category on the outlier-free data
median_prices_by_category = (
    amz_df_no_outliers
    .groupby('category')['price']
    .median()
)

# Most expensive categories (by median) first
median_prices_sorted = median_prices_by_category.sort_values(ascending=False)

# Two-column table with readable headers
median_prices_table = (
    median_prices_sorted
    .rename_axis('Category')
    .reset_index(name='Median Price')
)

# Display the table
print(median_prices_table)

Desktop PCs is the category with the highest median price

2. Bar charts

In [None]:
# The ten categories with the most products in the outlier-free dataset,
# and the rows belonging to them
top_10_categories = (
    amz_df_no_outliers['category']
    .value_counts()
    .nlargest(10)
    .index
)
top_10_data_no_outliers = amz_df_no_outliers.loc[
    amz_df_no_outliers['category'].isin(top_10_categories)
]

In [None]:
# Mean price per top-10 category, highest average first
avg_price_by_category = (
    top_10_data_no_outliers
    .groupby('category')['price']
    .mean()
    .sort_values(ascending=False)
)

In [None]:
# Bar chart comparing the average price of the top 10 categories.
# `palette` without `hue` is deprecated in recent seaborn releases, so the
# category labels are also passed as `hue` and the redundant legend is hidden.
plt.figure(figsize=(12, 8))
sns.barplot(
    x=avg_price_by_category.index,
    y=avg_price_by_category.values,
    hue=avg_price_by_category.index,
    palette='viridis',
    legend=False,
)

# Customize the plot
plt.title('Average Price by Product Category (Top 10)', fontsize=16)
plt.xlabel('Product Category', fontsize=12)
plt.ylabel('Average Price', fontsize=12)
plt.xticks(rotation=45)  # Rotate x-axis labels for better readability
plt.tight_layout()

# Show the plot
plt.show()

In [None]:
# Largest average price among the top-10 categories, and the category it belongs to
highest_avg_price = avg_price_by_category.max()
highest_avg_price_category = avg_price_by_category.idxmax()

print(
    f"The product category with the highest average price is '{highest_avg_price_category}' with an average price of {highest_avg_price:.2f}"
)

3. Box plots

In [None]:
# Recompute the ten most populated categories (outlier-free data) and keep
# only the products that belong to them
top_10_categories = amz_df_no_outliers['category'].value_counts().nlargest(n=10).index
top_10_data_no_outliers = amz_df_no_outliers.loc[
    amz_df_no_outliers['category'].isin(top_10_categories)
]

In [None]:
# Box plot of ratings across the top 10 categories.
# `palette` without `hue` is deprecated in recent seaborn releases, so the x
# variable is also passed as `hue` and the redundant legend is suppressed.
plt.figure(figsize=(12, 8))
sns.boxplot(
    x='category',
    y='stars',
    data=top_10_data_no_outliers,
    hue='category',
    palette='muted',
    legend=False,
)

# Customize the plot
plt.title('Distribution of Ratings Across Top 10 Categories', fontsize=16)
plt.xlabel('Product Category', fontsize=12)
plt.ylabel('Product Rating (Stars)', fontsize=12)
plt.xticks(rotation=45)  # Rotate x-axis labels for better readability
plt.tight_layout()

# Show the plot
plt.show()

In [None]:
# Median star rating per top-10 category, best-rated first
median_rating_by_category = (
    top_10_data_no_outliers
    .groupby('category')['stars']
    .median()
    .sort_values(ascending=False)
)

# Top median rating and the category that achieves it
highest_median_rating = median_rating_by_category.max()
highest_median_rating_category = median_rating_by_category.idxmax()

print(
    f"The product category with the highest median rating is '{highest_median_rating_category}' with a median rating of {highest_median_rating:.2f}"
)

In [None]:
# Ratings per top-10 category as box plots; colouring by category via `hue`
# with the legend turned off, since the x-axis already names each category
plt.figure(figsize=(12, 8))
sns.boxplot(
    data=top_10_data_no_outliers,
    x='category',
    y='stars',
    hue='category',
    palette='muted',
    legend=False,
)

# Title, axis labels and tilted category names
plt.title('Distribution of Ratings Across Top 10 Categories', fontsize=16)
plt.xlabel('Product Category', fontsize=12)
plt.ylabel('Product Rating (Stars)', fontsize=12)
plt.xticks(rotation=45)
plt.tight_layout()

# Render the figure
plt.show()

Part 3: Investigating the Interplay Between Product Prices and Ratings

1. Correlation coefficients

In [None]:
# Pearson correlation between price and star rating on the outlier-free data
correlation = amz_df_no_outliers['price'].corr(
    amz_df_no_outliers['stars'],
    method='pearson',
)

# Report the coefficient rounded to two decimals
print(f"The correlation coefficient between product price and rating is: {correlation:.2f}")

Weak Negative Correlation: Since the value is close to 0, it suggests that price and rating are not strongly related. The negative sign means that, in a very weak way, as the price increases, ratings tend to slightly decrease (or vice versa). However, this is very weak and likely not meaningful in practical terms.

Visualisations

In [None]:
# Rating on the x-axis against price on the y-axis; semi-transparent markers
# keep dense regions readable
plt.figure(figsize=(10, 6))
sns.scatterplot(
    x='stars',
    y='price',
    data=amz_df_no_outliers,
    alpha=0.5,
)

# Title and axis labels
plt.title('Scatter Plot: Price vs Rating (Stars)', fontsize=16)
plt.xlabel('Rating (Stars)', fontsize=12)
plt.ylabel('Price', fontsize=12)
plt.tight_layout()

# Render the figure
plt.show()

In [None]:
# Log-transform the price (log1p handles zero prices safely).
# `.assign` returns a new frame that is rebound to the same name, so the column
# is never written into a filtered slice of `amz_df` (which would raise
# SettingWithCopyWarning) and re-running the cell gives the same result.
amz_df_no_outliers = amz_df_no_outliers.assign(
    log_price=np.log1p(amz_df_no_outliers['price'])
)

# Scatter plot with log-transformed Price on Y-axis and Rating on X-axis
plt.figure(figsize=(10, 6))
sns.scatterplot(data=amz_df_no_outliers, x='stars', y='log_price', alpha=0.5)

# Customize the plot
plt.title('Scatter Plot: Log-transformed Price vs Rating (Stars)', fontsize=16)
plt.xlabel('Rating (Stars)', fontsize=12)
plt.ylabel('Log-transformed Price', fontsize=12)
plt.tight_layout()

# Show the plot
plt.show()

The heatmap

In [None]:
# Names of the float64 / int64 columns in the outlier-free frame
numerical_cols = amz_df_no_outliers.select_dtypes(include=['float64', 'int64']).columns

# Pairwise correlations between those columns
correlation_matrix = amz_df_no_outliers.loc[:, numerical_cols].corr()

# Annotated heatmap of the correlation matrix
plt.figure(figsize=(12, 8))
sns.heatmap(
    correlation_matrix,
    annot=True,
    fmt='.2f',
    cmap='coolwarm',
    cbar=True,
    square=True,
    linewidths=0.5,
)

# Title and layout
plt.title('Correlation Heatmap of Numerical Variables', fontsize=16)
plt.tight_layout()

# Render the figure
plt.show()

QQ Plot

In [None]:
# Compare the outlier-free price quantiles with those of a normal distribution
plt.figure(figsize=(8, 6))
stats.probplot(
    amz_df_no_outliers['price'],
    dist="norm",
    plot=plt,
)

# Replace probplot's default title with a descriptive one
plt.title('QQ Plot: Checking if Product Prices Follow a Normal Distribution', fontsize=16)
plt.tight_layout()

# Render the figure
plt.show()