# Lab - EDA Bivariate Analysis

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy.stats import chi2_contingency

In [None]:
# Load the Amazon UK product listing dataset from the local data folder.
df = pd.read_csv(filepath_or_buffer='./data/amz_uk_price_prediction_dataset.csv')

# Bare last expression -> rich (truncated) preview of the loaded frame.
df

### 1.1 Crosstab Analysis
Create a crosstab between the product category and the isBestSeller status.

In [None]:
# Category x best-seller counts. margins=True appends a 'Total' row and a
# 'Total' column holding the row/column sums alongside the raw counts.
contingency_table = pd.crosstab(
    index=df['category'],
    columns=df['isBestSeller'],
    margins=True,
    margins_name='Total',
)

Are there categories where being a best-seller is more prevalent?

In [None]:
# Share of best-sellers within each category, sorted from highest to lowest.
#
# A proportion is best-sellers / ALL products in the category, i.e.
# True / (True + False). Dividing True by False gives the odds instead, and
# yields inf for a category that has no non-best-sellers.
#
# .assign() returns a new frame, so `contingency_table` keeps only its count
# columns and stays valid as input for the statistical tests and plots below.
sorted_contingency_table = (
    contingency_table
    .assign(Proportion=lambda counts: counts[True] / (counts[True] + counts[False]))
    .sort_values(by='Proportion', ascending=False)
)

sorted_contingency_table

### 1.2 Statistical Tests

1.2.1 Conduct a Chi-square test to determine if the best-seller distribution is independent of the product category.

In [None]:
# Chi-square test of independence between product category and best-seller status.
#
# chi2_contingency expects ONLY the observed cell counts. The crosstab was built
# with margins, so strip the 'Total' row/column (and the derived 'Proportion'
# column, if an earlier cell added it) before testing; leaving them in distorts
# the statistic, the degrees of freedom and the expected frequencies.
# errors='ignore' makes the drop a no-op for any label that is not present.
observed = contingency_table.drop(
    index='Total', columns=['Total', 'Proportion'], errors='ignore'
)

chi2, p_value, dof, expected = chi2_contingency(observed)

print(f'\nChi-square statistic: {chi2}')
print(f'\nP-value: {p_value}')
print(f'\nDegrees of Freedom: {dof}')  # (n_categories - 1) * (n_statuses - 1)
print('\nExpected Frequencies:')
print(expected)

1.2.2 Compute Cramér's V to understand the strength of association between best-seller status and category.

In [None]:
# Cramér's V = sqrt(chi2 / (n * min(rows - 1, cols - 1)))
#
# Every term must come from the observed counts alone: the 'Total' margins
# (and a 'Proportion' column, if present) would count each observation several
# times in n and inflate the table shape used for min_dim.
# errors='ignore' makes the drop a no-op for any label that is not present.
observed = contingency_table.drop(
    index='Total', columns=['Total', 'Proportion'], errors='ignore'
)

# Recompute the statistic on the same clean table so chi2, n and min_dim are
# guaranteed to describe the same data.
chi2_stat = chi2_contingency(observed)[0]

n = observed.to_numpy().sum()  # total number of observations
min_dim = min(observed.shape) - 1  # minimum of (number of rows - 1, number of columns - 1)
cramers_v = np.sqrt(chi2_stat / (n * min_dim))

print("Cramér's V:", cramers_v)

### 1.3 Visualizations

Visualize the relationship between product categories and the best-seller status using a stacked bar chart.

In [None]:
# Stacked bar chart of best-seller vs non-best-seller counts for every category.
#
# Plot the raw counts only: the 'Total' margin row/column (and a 'Proportion'
# column, if present) are not categories/statuses and would otherwise be
# stacked on top of the real counts and mislabelled by the two-entry legend.
# errors='ignore' makes the drop a no-op for any label that is not present.
category_counts = contingency_table.drop(
    index='Total', columns=['Total', 'Proportion'], errors='ignore'
)

fig, ax = plt.subplots(figsize=(40, 6))
category_counts.plot(kind='bar', stacked=True, color=['coral', 'teal'], ax=ax)

ax.set_title('Relationship between Product Category and Best Seller Status')
ax.set_xlabel('Category')
ax.set_ylabel('Count')
# Column order from the crosstab is False, True -> 'No', 'Yes'.
ax.legend(title='Best seller status', labels=['No', 'Yes'])

plt.show()

In [None]:
# Stacked bar chart restricted to the 20 categories with the most products.
#
# Work from the raw counts only: drop the 'Total' margin row/column (and a
# 'Proportion' column, if present) so the margin is not ranked as a category
# and row sums are not inflated. errors='ignore' makes the drop a no-op for
# any label that is not present.
category_counts = contingency_table.drop(
    index='Total', columns=['Total', 'Proportion'], errors='ignore'
)

# Total products per category, largest first.
sorted_categories = category_counts.sum(axis=1).sort_values(ascending=False)

# top categories: with a descending sort the largest are at the head
# (tail(20) would select the 20 smallest instead).
top_20_categories = sorted_categories.head(20).index

top_20_contingency_table = category_counts.loc[top_20_categories]

fig, ax = plt.subplots(figsize=(8, 6))
top_20_contingency_table.plot(kind='bar', stacked=True, color=['coral', 'teal'], ax=ax)

ax.set_title('Top 20 Best Selling vs Non Best Selling Categories')
ax.set_xlabel('Category')
ax.set_ylabel('Count')
# Column order from the crosstab is False, True -> 'No', 'Yes'.
ax.legend(title='Best Seller Status', labels=['No', 'Yes'])

plt.tight_layout()
plt.show()