In [15]:
import polars as pl
import plotly.graph_objects as go
from plotly.subplots import make_subplots

# Load the data
df = pl.read_csv('data/2026-01-30-Jesta tech test - fashion_sample.csv')

# Parse dates
df = df.with_columns([
    pl.col('date').str.to_date(),
    pl.col('launch_date').str.to_date()
])

# Calculate derived fields
df = df.with_columns([
    ((pl.col('original_price') - pl.col('current_price')) / pl.col('original_price')).alias('markdown_pct'),
    ((pl.col('current_price') - pl.col('cost')) / pl.col('current_price')).alias('margin')
])

# Display basic info
print("Dataset shape:", df.shape)
print("\nData types:")
print(df.schema)
print("\nBasic statistics:")
print(df.describe())

Dataset shape: (100, 11)

Data types:
Schema({'date': Date, 'style': String, 'site': String, 'sales': Int64, 'inventory': Int64, 'original_price': Float64, 'current_price': Float64, 'cost': Float64, 'launch_date': Date, 'markdown_pct': Float64, 'margin': Float64})

Basic statistics:
shape: (9, 12)
┌────────────┬────────────┬────────────┬───────────┬───┬───────┬────────────┬───────────┬──────────┐
│ statistic  ┆ date       ┆ style      ┆ site      ┆ … ┆ cost  ┆ launch_dat ┆ markdown_ ┆ margin   │
│ ---        ┆ ---        ┆ ---        ┆ ---       ┆   ┆ ---   ┆ e          ┆ pct       ┆ ---      │
│ str        ┆ str        ┆ str        ┆ str       ┆   ┆ f64   ┆ ---        ┆ ---       ┆ f64      │
│            ┆            ┆            ┆           ┆   ┆       ┆ str        ┆ f64       ┆          │
╞════════════╪════════════╪════════════╪═══════════╪═══╪═══════╪════════════╪═══════════╪══════════╡
│ count      ┆ 100        ┆ 100        ┆ 100       ┆ … ┆ 100.0 ┆ 100        ┆ 100.0     ┆ 100.0

In [21]:
# Visualization 1: Sales and Inventory Over Time
import plotly.io as pio

dates = df['date'].to_list()
sales = df['sales'].to_list()
inventory = df['inventory'].to_list()
pio.renderers.default = "browser"
fig = make_subplots(rows=2, cols=1, shared_xaxes=True,
                    subplot_titles=('Sales Pattern - Stockouts Force Zero Sales',
                                    'Inventory Levels - Two Stockout Periods Visible'))

# Sales
fig.add_trace(go.Scatter(x=dates, y=sales, mode='lines+markers', name='Daily Sales',
                         marker=dict(size=4)), row=1, col=1)

# Stockout periods
fig.add_vrect(x0=dates[32], x1=dates[40], fillcolor='red', opacity=0.2,
              annotation_text='Stockout 1', row=1, col=1)
fig.add_vrect(x0=dates[60], x1=dates[69], fillcolor='red', opacity=0.2,
              annotation_text='Stockout 2', row=1, col=1)
fig.add_vline(x=dates[69], line_dash='dash', line_color='green', row=1, col=1)

# Inventory
fig.add_trace(go.Scatter(x=dates, y=inventory, fill='tozeroy', name='Inventory',
                         fillcolor='rgba(255,165,0,0.5)', line=dict(color='darkorange')),
              row=2, col=1)
fig.add_hline(y=0, line_dash='dash', line_color='red', opacity=0.5, row=2, col=1)

fig.update_layout(height=600, title_text='Sales and Inventory Over Time')
fig.update_yaxes(title_text='Sales', row=1, col=1)
fig.update_yaxes(title_text='Inventory', row=2, col=1)
fig.show()

# Identify stockout days
stockout_days = df.filter(pl.col('inventory') == 0)
print(f"\nStockout days: {stockout_days.height}")

# Save visualization
fig.write_html("vis1.html")
fig.show()


Stockout days: 39


In [22]:
# Visualization 2: Price Changes and Margin Impact
fig = make_subplots(rows=3, cols=1, shared_xaxes=True,
                    subplot_titles=('Price Evolution - Markdown Period After Day 70',
                                    'Markdown Percentage Over Time',
                                    'Margin Evolution'))

original_price = df['original_price'].to_list()
current_price = df['current_price'].to_list()
cost = df['cost'].to_list()
markdown_pct = df['markdown_pct'].to_list()
margin = df['margin'].to_list()

# Prices
fig.add_trace(go.Scatter(x=dates, y=original_price, name='Original Price',
                         line=dict(dash='dash')), row=1, col=1)
fig.add_trace(go.Scatter(x=dates, y=current_price, name='Current Price',
                         line=dict(width=2)), row=1, col=1)
fig.add_trace(go.Scatter(x=dates, y=cost, name='Cost',
                         line=dict(dash='dot', color='gray')), row=1, col=1)

# Markdown
fig.add_trace(go.Bar(x=dates, y=[m * 100 for m in markdown_pct], name='Markdown %',
                     marker_color='red', opacity=0.6), row=2, col=1)

# Margin
fig.add_trace(go.Scatter(x=dates, y=[m * 100 for m in margin], name='Margin %',
                         line=dict(color='green', width=2)), row=3, col=1)
fig.add_hline(y=10, line_dash='dash', line_color='red', row=3, col=1,
              annotation_text='Min Margin (10%)')

fig.update_layout(height=800, title_text='Price Changes and Margin Impact')
fig.update_yaxes(title_text='Price ($)', row=1, col=1)
fig.update_yaxes(title_text='Markdown %', row=2, col=1)
fig.update_yaxes(title_text='Margin %', row=3, col=1)
fig.show()

# Data quality check
print("\nData Quality Check:")
print(f"Null counts:\n{df.null_count()}")
print(f"\nNegative sales: {df.filter(pl.col('sales') < 0).height}")
print(f"Negative inventory: {df.filter(pl.col('inventory') < 0).height}")
print(f"Price > Original: {df.filter(pl.col('current_price') > pl.col('original_price')).height}")


Data Quality Check:
Null counts:
shape: (1, 11)
┌──────┬───────┬──────┬───────┬───┬──────┬─────────────┬──────────────┬────────┐
│ date ┆ style ┆ site ┆ sales ┆ … ┆ cost ┆ launch_date ┆ markdown_pct ┆ margin │
│ ---  ┆ ---   ┆ ---  ┆ ---   ┆   ┆ ---  ┆ ---         ┆ ---          ┆ ---    │
│ u32  ┆ u32   ┆ u32  ┆ u32   ┆   ┆ u32  ┆ u32         ┆ u32          ┆ u32    │
╞══════╪═══════╪══════╪═══════╪═══╪══════╪═════════════╪══════════════╪════════╡
│ 0    ┆ 0     ┆ 0    ┆ 1     ┆ … ┆ 0    ┆ 0           ┆ 0            ┆ 0      │
└──────┴───────┴──────┴───────┴───┴──────┴─────────────┴──────────────┴────────┘

Negative sales: 0
Negative inventory: 0
Price > Original: 0


In [23]:
# Part B: Feature Engineering

# 1. Stockout Detection
# Flag periods where inventory=0 but there were recent sales (rolling 7-day window)
df = df.with_columns([
    pl.col('sales').rolling_sum(window_size=7).shift(1).alias('recent_sales_7d')
])

df = df.with_columns([
    ((pl.col('inventory') == 0) & (pl.col('recent_sales_7d') > 0)).alias('is_stockout')
])

# 2. Product Lifecycle
# Calculate weeks since launch and classify
df = df.with_columns([
    ((pl.col('date') - pl.col('launch_date')).dt.total_days() / 7).cast(pl.Int32).alias('weeks_since_launch')
])

df = df.with_columns([
    pl.when(pl.col('weeks_since_launch') <= 4)
    .then(pl.lit('new'))
    .when(pl.col('weeks_since_launch') <= 8)
    .then(pl.lit('growth'))
    .when(pl.col('markdown_pct') > 0.1)
    .then(pl.lit('clearance'))
    .otherwise(pl.lit('mature'))
    .alias('lifecycle_stage')
])

# 3. Inventory Health
# Days of supply (inventory / avg daily sales over last 7 days)
df = df.with_columns([
    pl.col('sales').rolling_mean(window_size=7).alias('avg_daily_sales_7d')
])

df = df.with_columns([
    pl.when(pl.col('avg_daily_sales_7d') > 0)
    .then(pl.col('inventory') / pl.col('avg_daily_sales_7d'))
    .otherwise(pl.lit(None))
    .alias('days_of_supply'),

    # Markdown depth classification
    pl.when(pl.col('markdown_pct') == 0)
    .then(pl.lit('full_price'))
    .when(pl.col('markdown_pct') <= 0.2)
    .then(pl.lit('light_markdown'))
    .when(pl.col('markdown_pct') <= 0.4)
    .then(pl.lit('moderate_markdown'))
    .otherwise(pl.lit('deep_markdown'))
    .alias('markdown_depth'),

    # Margin status
    pl.when(pl.col('margin') >= 0.3)
    .then(pl.lit('healthy'))
    .when(pl.col('margin') >= 0.1)
    .then(pl.lit('acceptable'))
    .when(pl.col('margin') >= 0)
    .then(pl.lit('low'))
    .otherwise(pl.lit('negative'))
    .alias('margin_status')
])

# Display feature summary
print("=== Feature Engineering Summary ===\n")

print("1. Stockout Detection:")
print(f"   Total stockout days: {df.filter(pl.col('is_stockout')).height}")
print(f"   Stockout rate: {df.filter(pl.col('is_stockout')).height / df.height * 100:.1f}%\n")

print("2. Product Lifecycle Distribution:")
print(df.group_by('lifecycle_stage').agg(pl.len().alias('days')).sort('days', descending=True))

print("\n3. Inventory Health:")
print(f"   Avg days of supply: {df['days_of_supply'].mean():.1f}")
print("\n   Markdown depth distribution:")
print(df.group_by('markdown_depth').agg(pl.len().alias('days')).sort('days', descending=True))
print("\n   Margin status distribution:")
print(df.group_by('margin_status').agg(pl.len().alias('days')).sort('days', descending=True))

# Show new columns
print("\n=== New Feature Columns ===")
print(df.select(['date', 'is_stockout', 'weeks_since_launch', 'lifecycle_stage',
                 'days_of_supply', 'markdown_depth', 'margin_status']).head(10))

=== Feature Engineering Summary ===

1. Stockout Detection:
   Total stockout days: 32
   Stockout rate: 32.0%

2. Product Lifecycle Distribution:
shape: (4, 2)
┌─────────────────┬──────┐
│ lifecycle_stage ┆ days │
│ ---             ┆ ---  │
│ str             ┆ u32  │
╞═════════════════╪══════╡
│ clearance       ┆ 31   │
│ growth          ┆ 28   │
│ mature          ┆ 23   │
│ new             ┆ 18   │
└─────────────────┴──────┘

3. Inventory Health:
   Avg days of supply: 5.2

   Markdown depth distribution:
shape: (3, 2)
┌───────────────────┬──────┐
│ markdown_depth    ┆ days │
│ ---               ┆ ---  │
│ str               ┆ u32  │
╞═══════════════════╪══════╡
│ full_price        ┆ 69   │
│ deep_markdown     ┆ 16   │
│ moderate_markdown ┆ 15   │
└───────────────────┴──────┘

   Margin status distribution:
shape: (3, 2)
┌───────────────┬──────┐
│ margin_status ┆ days │
│ ---           ┆ ---  │
│ str           ┆ u32  │
╞═══════════════╪══════╡
│ healthy       ┆ 84   │
│ low           

In [24]:
# Visualization: Feature Engineering Results
fig = make_subplots(rows=2, cols=2,
                    subplot_titles=('Stockout Periods', 'Product Lifecycle Stage',
                                    'Days of Supply', 'Margin Status Over Time'),
                    specs=[[{}, {}], [{}, {}]])

# 1. Stockout visualization
stockout_flags = [1 if x else 0 for x in df['is_stockout'].to_list()]
fig.add_trace(go.Scatter(x=dates, y=inventory, name='Inventory',
                         line=dict(color='blue')), row=1, col=1)
fig.add_trace(go.Scatter(x=dates, y=[s * max(inventory) for s in stockout_flags],
                         name='Stockout', fill='tozeroy',
                         fillcolor='rgba(255,0,0,0.3)'), row=1, col=1)

# 2. Lifecycle stage
lifecycle_map = {'new': 1, 'growth': 2, 'mature': 3, 'clearance': 4}
lifecycle_numeric = [lifecycle_map.get(x, 0) for x in df['lifecycle_stage'].to_list()]
fig.add_trace(go.Scatter(x=dates, y=lifecycle_numeric, name='Lifecycle',
                         mode='lines', line=dict(color='purple', width=2)), row=1, col=2)

# 3. Days of supply
dos = df['days_of_supply'].to_list()
fig.add_trace(go.Scatter(x=dates, y=dos, name='Days of Supply',
                         line=dict(color='orange')), row=2, col=1)
fig.add_hline(y=14, line_dash='dash', line_color='green', row=2, col=1)
fig.add_hline(y=7, line_dash='dash', line_color='red', row=2, col=1)

# 4. Margin status
margin_map = {'healthy': 3, 'acceptable': 2, 'low': 1, 'negative': 0}
margin_numeric = [margin_map.get(x, 0) for x in df['margin_status'].to_list()]
fig.add_trace(go.Scatter(x=dates, y=margin_numeric, name='Margin Status',
                         line=dict(color='green', width=2)), row=2, col=2)

fig.update_layout(height=700, title_text='Feature Engineering Results', showlegend=False)
fig.show()