In [None]:
# ================================
# Lab: EDA Univariate - Solution
# ================================

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Plot configuration: stock matplotlib style and a (10, 6) default figure size.
plt.style.use("default")
plt.rcParams.update({"figure.figsize": (10, 6)})

# Load the dataset, coerce `price` and `stars` to numeric (values that
# cannot be parsed become NaN), then drop every row that is missing a
# price, a star rating, or a category.
df = (
    pd.read_csv("amz_uk_price_prediction_dataset.csv")
    .assign(
        price=lambda frame: pd.to_numeric(frame["price"], errors="coerce"),
        stars=lambda frame: pd.to_numeric(frame["stars"], errors="coerce"),
    )
    .dropna(subset=["price", "stars", "category"])
)

# ================================
# Part 1: Understanding Product Categories
# ================================

# Frequency table: number of products per category, as returned by
# value_counts().
category_counts = df["category"].value_counts()

# Take the leading slices once; the printout and both charts reuse them.
top_10_categories = category_counts.iloc[:10]
top_5_categories = category_counts.iloc[:5]

print("Frequency Table - Categories:\n", top_10_categories)

# Bar chart of the ten leading categories.
top_10_categories.plot.bar()
plt.gca().set(
    title="Top 10 Product Categories",
    xlabel="Category",
    ylabel="Number of Products",
)
# Tilt the category names so long labels do not overlap.
plt.xticks(rotation=45, ha="right")
plt.show()

# Pie chart of the five leading categories; percentages are relative to
# those five only, not to the whole dataset.
top_5_categories.plot.pie(autopct="%1.1f%%")
# Blank y-label replaces whatever label the pandas plot would show there.
plt.gca().set(title="Proportion of Top 5 Product Categories", ylabel="")
plt.show()

# ================================
# Part 2: Delving into Product Pricing
# ================================

# --- Price: summary statistics -------------------------------------------
# Centrality. mode() returns a Series; its first entry is used.
price_mean = df["price"].mean()
price_median = df["price"].median()
price_mode = df["price"].mode()[0]

# Dispersion. The IQR is the gap between the 75th and 25th percentiles,
# both taken from a single quantile() call.
price_variance = df["price"].var()
price_std = df["price"].std()
price_range = df["price"].max() - df["price"].min()
price_quartiles = df["price"].quantile([0.25, 0.75])
price_iqr = price_quartiles.loc[0.75] - price_quartiles.loc[0.25]

print("Price - Mean:", price_mean)
print("Price - Median:", price_median)
print("Price - Mode:", price_mode)

print("Price - Variance:", price_variance)
print("Price - Std:", price_std)
print("Price - Range:", price_range)
print("Price - IQR:", price_iqr)

# --- Price: plots ----------------------------------------------------------
# Histogram with 50 bins.
plt.hist(df["price"], bins=50)
plt.gca().set(
    title="Distribution of Product Prices",
    xlabel="Price (£)",
    ylabel="Frequency",
)
plt.show()

# Horizontal boxplot.
plt.boxplot(df["price"], vert=False)
plt.gca().set(title="Boxplot of Product Prices", xlabel="Price (£)")
plt.show()

# ================================
# Part 3: Unpacking Product Ratings
# ================================

# --- Ratings: summary statistics -------------------------------------------
# Centrality. mode() returns a Series; its first entry is used.
rating_mean = df["stars"].mean()
rating_median = df["stars"].median()
rating_mode = df["stars"].mode()[0]

# Dispersion. The IQR is the gap between the 75th and 25th percentiles,
# both taken from a single quantile() call.
rating_variance = df["stars"].var()
rating_std = df["stars"].std()
rating_range = df["stars"].max() - df["stars"].min()
rating_quartiles = df["stars"].quantile([0.25, 0.75])
rating_iqr = rating_quartiles.loc[0.75] - rating_quartiles.loc[0.25]

# Shape of the distribution (kurt() is the same method as kurtosis()).
rating_skewness = df["stars"].skew()
rating_kurtosis = df["stars"].kurt()

print("Ratings - Mean:", rating_mean)
print("Ratings - Median:", rating_median)
print("Ratings - Mode:", rating_mode)

print("Ratings - Variance:", rating_variance)
print("Ratings - Std:", rating_std)
print("Ratings - Range:", rating_range)
print("Ratings - IQR:", rating_iqr)

print("Ratings - Skewness:", rating_skewness)
print("Ratings - Kurtosis:", rating_kurtosis)

# --- Ratings: plots ----------------------------------------------------------
# Histogram with 20 bins.
plt.hist(df["stars"], bins=20)
plt.gca().set(
    title="Distribution of Product Ratings",
    xlabel="Stars",
    ylabel="Frequency",
)
plt.show()

# Horizontal boxplot.
plt.boxplot(df["stars"], vert=False)
plt.gca().set(title="Boxplot of Product Ratings", xlabel="Stars")
plt.show()
