# 👟 Adidas Shoe Data Analysis

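This notebook explores the Adidas shoe dataset. It loads the shoe dimension, shoe fact, and country dimension tables, checks them for missing values and duplicate rows, and then examines product, color, and geographic distributions, followed by cross-category comparisons and a short list of key insights.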

In [None]:
# Import required libraries
import pandas as pd
import numpy as np
import plotly.express as px
import plotly.graph_objects as go
import seaborn as sns
import matplotlib.pyplot as plt

# Set plotting style
# The bare 'seaborn' style sheet was deprecated in Matplotlib 3.6 (renamed to
# 'seaborn-v0_8') and is no longer accepted by later releases. Prefer the new
# name and fall back to the old one only on Matplotlib versions that lack it.
plt.style.use('seaborn-v0_8' if 'seaborn-v0_8' in plt.style.available else 'seaborn')
sns.set_palette('husl')

## 1. Data Loading and Initial Exploration

In [None]:
# Read each CSV file into its own DataFrame
shoes_dim = pd.read_csv('shoes_dim.csv')
shoes_fact = pd.read_csv('shoes_fact.csv')
country_dim = pd.read_csv('country_dim.csv')

# Print a labelled DataFrame.info() summary for every table, in load order
for _heading, _table in (
    ("\nShoes Dimension Table Info:", shoes_dim),
    ("\nShoes Fact Table Info:", shoes_fact),
    ("\nCountry Dimension Table Info:", country_dim),
):
    print(_heading)
    _table.info()

## 2. Data Quality Analysis

In [None]:
def analyze_data_quality(df, title):
    """Print a missing-value and duplicate-row report for one DataFrame.

    Parameters
    ----------
    df : pandas.DataFrame
        Table to inspect.
    title : str
        Label placed in the report heading.

    Returns
    -------
    None
        The report is emitted through ``print`` and ``display``.
    """
    print(f"\n{title} Data Quality Analysis")
    print("-" * 50)

    # Per-column null counts and their share of all rows, as a percentage
    null_counts = df.isnull().sum()
    null_share = ((null_counts / len(df)) * 100).round(2)
    report = pd.DataFrame(
        {'Missing Values': null_counts, 'Missing Percentage': null_share}
    )

    print("\nMissing Values Analysis:")
    # Only columns with at least one missing entry are shown
    has_missing = report['Missing Values'] > 0
    display(report.loc[has_missing])

    # Rows that exactly repeat an earlier row
    repeated_rows = df.duplicated().sum()
    print(f"\nDuplicate Records: {repeated_rows}")

# Run the same quality report over every loaded table
for _table, _label in (
    (shoes_dim, "Shoes Dimension"),
    (shoes_fact, "Shoes Fact"),
    (country_dim, "Country Dimension"),
):
    analyze_data_quality(_table, _label)

## 3. Product Analysis

In [None]:
# Gender Distribution
# Drawn on an explicitly created Axes rather than pyplot's implicit current figure
gender_counts = shoes_dim['gender'].value_counts()
fig, ax = plt.subplots(figsize=(10, 6))
gender_counts.plot(kind='bar', ax=ax, rot=0)
ax.set_title('Gender Distribution')
ax.set_xlabel('Gender')
ax.set_ylabel('Count')
plt.show()

# Usage Categories
# Drawn on an explicitly created Axes rather than pyplot's implicit current figure
usage_counts = shoes_dim['best_for_wear'].value_counts()
fig, ax = plt.subplots(figsize=(12, 6))
usage_counts.plot(kind='bar', ax=ax, rot=45)
ax.set_title('Usage Categories Distribution')
ax.set_xlabel('Category')
ax.set_ylabel('Count')
plt.show()

## 4. Color Analysis

In [None]:
# Analyze color patterns
def analyze_colors(df, top_n=10):
    """Plot the most frequent values of the three color columns side by side.

    Each panel shows the ``top_n`` most common values of one column, ranked by
    that column's own counts. The counts are deliberately not combined into a
    single DataFrame first: doing so re-aligns the three Series on a shared
    index, which discards the frequency ordering that ``value_counts``
    produces, so a later ``head`` would no longer select the top colors.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns ``dominant_color``, ``sub_color1`` and
        ``sub_color2``.
    top_n : int, default 10
        Number of most frequent values to show in each panel.

    Returns
    -------
    None
        The figure is rendered with ``plt.show()``.
    """
    panels = (
        ('dominant_color', 'Top Dominant Colors'),
        ('sub_color1', 'Top Sub-Color 1'),
        ('sub_color2', 'Top Sub-Color 2'),
    )

    fig, axes = plt.subplots(1, len(panels), figsize=(20, 6))

    for ax, (column, title) in zip(axes, panels):
        # value_counts() is sorted by descending frequency, so head() is the top N
        top_counts = df[column].value_counts().head(top_n)
        if top_counts.empty:
            # Nothing to draw for this column; keep the titled panel and move on
            ax.set_title(title)
            continue
        top_counts.plot(kind='bar', ax=ax, title=title)
        ax.set_xlabel('Color')
        ax.set_ylabel('Count')
        ax.tick_params(axis='x', rotation=45)

    plt.tight_layout()
    plt.show()

analyze_colors(shoes_dim)

## 5. Geographic Analysis

In [None]:
# Analyze geographic distribution
currency_counts = country_dim['currency'].value_counts()
metric_counts = country_dim['shoe_metric'].value_counts()

print("Currency Distribution:")
display(currency_counts)

print("\nShoe Metric Distribution:")
display(metric_counts)

# Create a summary table
# The two counts are indexed by values of different columns. Placing them side
# by side as DataFrame columns would align them on the union of both indexes,
# so rows would only line up where a currency label happens to equal a
# shoe-metric label and every other cell would be NaN. Stack them instead,
# keyed by the attribute each count describes.
geo_summary = pd.concat(
    {'Currency': currency_counts, 'Shoe Metric': metric_counts},
    names=['Attribute', 'Value'],
).to_frame('Count')

display(geo_summary)

## 6. Cross Analysis

In [None]:
# Gender vs Usage Category
gender_usage = pd.crosstab(shoes_dim['gender'], shoes_dim['best_for_wear'])

# DataFrame.plot opens its own figure unless it is handed an Axes, so a bare
# plt.figure(figsize=...) beforehand only leaves an empty figure behind and the
# requested size is never applied to the chart. Create the Axes explicitly and
# pass it in.
fig, ax = plt.subplots(figsize=(12, 6))
gender_usage.plot(kind='bar', stacked=True, ax=ax)
ax.set_title('Gender Distribution Across Usage Categories')
ax.set_xlabel('Gender')
ax.set_ylabel('Count')
# Pin the legend's upper-left corner just outside the right edge of the axes
ax.legend(title='Usage Category', bbox_to_anchor=(1.05, 1), loc='upper left')
fig.tight_layout()
plt.show()

# Color preferences by gender
color_gender = pd.crosstab(shoes_dim['dominant_color'], shoes_dim['gender'])
# Share of each color within a gender: every column is divided by its own total
color_gender_pct = color_gender.div(color_gender.sum(axis=0), axis=1) * 100

# crosstab orders its rows by color label, so head(10) would pick the first ten
# labels in sort order rather than the most common colors. Rank the colors by
# their total count across genders and select those rows instead.
top_colors = color_gender.sum(axis=1).nlargest(10).index

# Pass an explicit Axes: DataFrame.plot would otherwise open its own figure and
# leave the one created by plt.figure(figsize=...) empty.
fig, ax = plt.subplots(figsize=(12, 6))
color_gender_pct.loc[top_colors].plot(kind='bar', ax=ax, rot=45)
ax.set_title('Color Preferences by Gender (Top 10 Colors)')
ax.set_xlabel('Color')
ax.set_ylabel('Percentage')
ax.legend(title='Gender')
fig.tight_layout()
plt.show()

## 7. Key Insights

In [None]:
# Generate key insights
print("Key Insights:")
print("-" * 50)

# Total products (row count of the shoe dimension table)
total_shoes = len(shoes_dim)
print(f"Total unique shoes: {total_shoes}")

# Gender distribution; each share is taken over all rows of shoes_dim
gender_dist = shoes_dim['gender'].value_counts()
print("\nGender Distribution:")
for gender, count in gender_dist.items():
    share = count / total_shoes * 100
    print(f"{gender}: {count} ({share:.1f}%)")

# Most common categories, then most common colors.
# value_counts() sorts by descending frequency, so head(5) is the top five.
for _heading, _column in (
    ("\nTop 5 Usage Categories:", 'best_for_wear'),
    ("\nTop 5 Dominant Colors:", 'dominant_color'),
):
    print(_heading)
    display(shoes_dim[_column].value_counts().head(5))