## Have a quick look at H&M catalogues

See https://www.kaggle.com/competitions/h-and-m-personalized-fashion-recommendations/data for more information

## Python Imports

In [None]:
import pandas as pd

## Main CSV

### Load and easy checks

In [None]:
# Read the articles catalogue CSV (path is relative to this notebook) and
# preview the first rows.
df = pd.read_csv("../data/H&M/articles.csv")
df.head()

In [None]:
# (rows, columns) of the table, followed by its column labels.
df.shape, df.columns

In [None]:
# Number of rows that exactly repeat an earlier row.
df.duplicated().sum()

In [None]:
# Missing-value count for each column.
df.isna().sum()

In [None]:
# Number of distinct values in each column (missing values are not counted
# by default).
df.nunique()

### The data looks quite clean: only a few descriptions are missing!

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px

In [None]:
# List the column labels available in the catalogue.
df.columns

In [None]:
# Helper that draws a horizontal Plotly histogram of one column's value counts
def create_histogram_plotly(df, column, title_suffix=""):
    """Build a horizontal Plotly histogram counting the values of one column.

    Args:
        df: DataFrame holding the data to plot.
        column: Name of the column to count; its values go on the y-axis.
        title_suffix: Text placed after "Distribution of" in the figure
            title. When empty, a prettified form of ``column`` is used.

    Returns:
        The figure created by ``px.histogram``, with its y-axis categories
        ordered by ascending total.
    """
    # Human-readable form of the column name: underscores become spaces and
    # each word is capitalised.
    pretty_name = column.replace('_', ' ').title()
    heading = title_suffix or pretty_name

    # Allow 20 px per distinct value so long category lists stay legible,
    # but never go below 400 px.
    n_distinct = len(df[column].unique())
    plot_height = max(400, n_distinct * 20)

    fig = px.histogram(
        df,
        y=column,
        title=f'Distribution of {heading}',
        labels={'count': 'Frequency', column: pretty_name},
        height=plot_height,
        orientation='h',  # horizontal bars leave room for long category names
    )

    fig.update_layout(
        xaxis_title="Frequency",
        yaxis_title=pretty_name,
        showlegend=False,
        margin={'l': 200, 'r': 50, 't': 50, 'b': 50},  # wide left margin for long labels
        font={'size': 12},
    )

    # Order the bars by how often each category occurs.
    fig.update_yaxes(categoryorder="total ascending")

    return fig

# Build one histogram per categorical column. Each (column, title) pair below
# maps, in order, onto fig1 .. fig10.
(
    fig1, fig2,          # colour columns
    fig3, fig4, fig5,    # product-related distributions
    fig6, fig7, fig8,    # department and organisation
    fig9, fig10,         # appearance-related
) = [
    create_histogram_plotly(df, column_name, plot_title)
    for column_name, plot_title in (
        ('colour_group_name', 'Colour Groups'),
        ('perceived_colour_value_name', 'Perceived Colour Values'),
        ('product_type_name', 'Product Types'),
        ('product_group_name', 'Product Groups'),
        ('garment_group_name', 'Garment Groups'),
        ('department_name', 'Departments'),
        ('section_name', 'Sections'),
        ('index_name', 'Index Names'),
        ('graphical_appearance_name', 'Graphical Appearances'),
        ('perceived_colour_master_name', 'Master Colour Categories'),
    )
]

# Render fig3 .. fig10. Note that fig1 and fig2 are built above but are not
# displayed by this cell.
for fig in (fig3, fig4, fig5, fig6, fig7, fig8, fig9, fig10):
    fig.show()


In [None]:
# Distinct values found in the colour group column.
df['colour_group_name'].unique()

In [None]:
# Product type names as a plain list, most frequent first (value_counts
# sorts by descending count by default).
df["product_type_name"].value_counts().index.to_list()