## Part 1: Setup and Data Loading

In [None]:
# Import required libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from pathlib import Path

# Pandas display: show every column and never wrap wide frames across lines
for option_name, option_value in (
    ('display.max_columns', None),
    ('display.expand_frame_repr', False),
):
    pd.set_option(option_name, option_value)

# Default (width, height) used for every matplotlib figure in this notebook
plt.rcParams.update({'figure.figsize': [12, 6]})

## Part 2: Load and Prepare Time Series Data

In [None]:
# Time series analysis for the 6 product groups
# - loads data, maps product groups to names, creates daily time series
import pandas as pd
import numpy as np
from pathlib import Path

# Resolve the CSV location: the notebook's own folder wins, otherwise fall
# back to the shared data-preparation folder.
local_csv = Path('umsatzdaten_gekuerzt.csv')
data_path = local_csv if local_csv.exists() else Path('../0_DataPreparation/umsatzdaten_gekuerzt.csv')

print('Loading', data_path)
df = pd.read_csv(data_path, parse_dates=['Datum'])
print('Raw rows:', len(df))

# Restrict the rows to product groups 1..6 and attach a readable name to each
groups = {1: 'bread', 2: 'rolls', 3: 'croissant', 4: 'pastry', 5: 'cakes', 6: 'seasonal'}
is_known_group = df['Warengruppe'].isin(groups.keys())
df = df.loc[is_known_group].copy()
df['Product'] = df['Warengruppe'].map(groups)

# Sum revenue per (date, product), then pivot to one row per date and one
# column per product; date/product pairs without sales become 0.
revenue_long = df.groupby(['Datum', 'Product'])['Umsatz'].sum()
daily = revenue_long.unstack(fill_value=0).sort_index()

# Calendar days that are absent from the data are inserted as all-zero rows so
# the index is a gap-free daily range.
full_idx = pd.date_range(daily.index.min(), daily.index.max(), freq='D')
daily = daily.reindex(full_idx).fillna(0)
daily.index.name = 'Datum'

print(f'Daily data shape: {daily.shape}')
print(f'Date range: {daily.index.min()} to {daily.index.max()}')
daily.head()

## Part 3: Generate German Holidays

In [None]:
import pandas as pd
from pathlib import Path

# Build the table of German holidays for the years covered by `daily` using
# the python-holidays package (more reliable than the external CSV source).
# If the package is missing it is installed on the fly and then imported; the
# holiday table itself is built once, after the import has succeeded.
try:
    import holidays
    print("Using holidays package to generate German holidays...")
except ImportError:
    print("holidays package not found. Installing...")
    import subprocess
    import sys
    # `sys.executable -m pip` installs into the interpreter running this
    # kernel; a bare `pip` is resolved via PATH and may target another
    # environment, in which case the import below would still fail.
    subprocess.check_call([sys.executable, '-m', 'pip', 'install', 'holidays', '-q'])
    import holidays

# Years spanned by the daily sales index (both ends inclusive)
start_year = daily.index.min().year
end_year = daily.index.max().year
years = list(range(start_year, end_year + 1))

# One row per holiday date, in chronological order.
# NOTE(review): no state/subdivision is passed to holidays.Germany, so
# presumably only nationwide holidays are generated - confirm this is intended.
de_h = holidays.Germany(years=years)
hol_list = [{'date': pd.to_datetime(d), 'holiday': name} for d, name in sorted(de_h.items())]
# Explicit columns keep the frame well-formed even if no holiday was generated
hol_df = pd.DataFrame(hol_list, columns=['date', 'holiday'])

# Save CSV
out_path = Path('holidays_germany.csv')
hol_df.to_csv(out_path, index=False)
print(f'✓ Saved {len(hol_df)} holidays to: {out_path.absolute()}')
print(f'\nDate range: {hol_df["date"].min()} to {hol_df["date"].max()}')
display(hol_df.head(15))

## Part 4: Holiday Impact Analysis

In [None]:
# Holiday Impact Timeline Analysis
# Analyze which product groups sold more around which holidays.
# For every holiday a 15-day window is examined: the 7 days before, the
# holiday itself and the 7 days after.

import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
from pathlib import Path

# Load holidays CSV (prefer the notebook folder, fall back to data preparation)
hol_path = Path('holidays_germany.csv')
if not hol_path.exists():
    hol_path = Path('../0_DataPreparation/holidays_germany.csv')

holidays_df = pd.read_csv(hol_path, parse_dates=['date'])
# Strip any time-of-day component so dates compare equal to the daily index
holidays_df['date'] = pd.to_datetime(holidays_df['date']).dt.normalize()

print(f"Loaded {len(holidays_df)} holidays")
print(f"Holiday date range: {holidays_df['date'].min()} to {holidays_df['date'].max()}")

# Filter holidays to those within the sales data range
holidays_df = holidays_df[(holidays_df['date'] >= daily.index.min()) &
                          (holidays_df['date'] <= daily.index.max())].reset_index(drop=True)

print(f"Holidays in data range: {len(holidays_df)}")
print(f"\nHolidays covered:")
display(holidays_df)

# Number of days examined on each side of a holiday
window_days = 7


def _pct_change(value, baseline):
    """Return the change of `value` relative to `baseline`, in percent.

    The result is NaN when either input is missing or when `baseline` is 0:
    a relative change from zero is undefined, and dividing by a tiny epsilon
    instead would yield astronomically large percentages that dominate every
    ranking and colour scale built from the results.
    """
    if pd.isna(value) or pd.isna(baseline) or baseline == 0:
        return np.nan
    return (value - baseline) / baseline * 100


# For each holiday, compute sales metrics in the surrounding days (±7 days)
results = []

for hol_date, hol_name in zip(holidays_df['date'], holidays_df['holiday']):
    # Window: `window_days` before to `window_days` after the holiday
    start_window = hol_date - pd.Timedelta(days=window_days)
    end_window = hol_date + pd.Timedelta(days=window_days)

    # Extract sales in this window
    window_data = daily.loc[(daily.index >= start_window) & (daily.index <= end_window)]
    if window_data.empty:
        continue

    # Per-product averages, computed once per holiday for all products.
    # The mean of an empty slice is NaN, which covers windows cut off at the
    # start or end of the data.
    before_mean = window_data.loc[window_data.index < hol_date].mean()   # days -7 .. -1
    after_mean = window_data.loc[window_data.index > hol_date].mean()    # days +1 .. +7
    window_mean = window_data.mean()
    holiday_row = daily.loc[hol_date] if hol_date in daily.index else None

    for product in daily.columns:
        holiday_sales = holiday_row[product] if holiday_row is not None else np.nan
        avg_before = before_mean[product]
        avg_after = after_mean[product]

        results.append({
            'holiday': hol_name,
            'holiday_date': hol_date,
            'product': product,
            'sales_holiday_day': holiday_sales,
            'avg_sales_before': avg_before,
            'avg_sales_after': avg_after,
            # Both changes are measured against the pre-holiday average
            'pct_change_on_holiday': _pct_change(holiday_sales, avg_before),
            'pct_change_after': _pct_change(avg_after, avg_before),
            'avg_window': window_mean[product]
        })

# Create results dataframe: one row per (holiday date, product)
results_df = pd.DataFrame(results)
print(f"\nAnalysis complete: {len(results_df)} holiday-product combinations")

# Save to CSV
results_df.to_csv('holiday_timeline_analysis.csv', index=False)
print("Saved to: holiday_timeline_analysis.csv")

display(results_df.head(15))

## Part 5: Visualizations

In [None]:
# Visualization 1: Heatmap of % Change on Holiday Day vs Product
# Shows which products had the biggest sales lift/drop on each holiday

import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd

# Pivot: holidays × products showing % change on the holiday day.
# results_df holds one row per holiday *date* and product, so a holiday name
# recurs for every year in the data. The occurrences are averaged; taking only
# the first one would silently discard all other years.
pivot_holiday = results_df.pivot_table(
    index='holiday',
    columns='product',
    values='pct_change_on_holiday',
    aggfunc='mean'
)

# Sort holidays by their average impact across products (largest first)
holiday_order = pivot_holiday.mean(axis=1).sort_values(ascending=False).index
pivot_holiday = pivot_holiday.reindex(holiday_order)

fig, ax = plt.subplots(figsize=(10, 12))
sns.heatmap(
    pivot_holiday,
    annot=True,
    fmt='.1f',
    cmap='RdYlGn',
    center=0,  # 0% change sits at the neutral midpoint of the colour map
    cbar_kws={'label': 'Mean % change vs 7-day avg before'},
    linewidths=0.5,
    ax=ax
)
ax.set_title('Holiday Impact Heatmap: % Change in Sales on Holiday Day\n(Green = higher sales, Red = lower sales)', fontsize=12, fontweight='bold')
ax.set_xlabel('Product Group')
ax.set_ylabel('Holiday')
fig.tight_layout()
plt.show()

print("Insights from heatmap:")
print("- Green cells: Products that sold MORE on that holiday")
print("- Red cells: Products that sold LESS on that holiday")
print("- Darker colors: Larger magnitude of change")

In [None]:
# Visualization 2: Holiday-Product Combinations with the Largest Sales Changes
# Left panel: the largest increases on a holiday; right panel: the largest
# decreases. Each bar is one holiday occurrence (one year) for one product.

top_n = 8
ranking_cols = ['holiday', 'holiday_date', 'product', 'pct_change_on_holiday']


def _with_bar_label(frame):
    """Return a copy of `frame` with a 'label' column 'holiday (year) - product'.

    The year is part of the label because the same holiday/product pair can
    appear several times (once per year) and would otherwise be
    indistinguishable on the axis. `assign` builds a new frame, which avoids
    the SettingWithCopyWarning raised by adding a column to a slice.
    """
    year = pd.to_datetime(frame['holiday_date']).dt.year.astype(str)
    return frame.assign(label=frame['holiday'] + ' (' + year + ') - ' + frame['product'])


def _plot_ranked_changes(ax, frame, title, highlight_color, direction):
    """Draw one horizontal bar per row of `frame` on `ax`.

    direction is +1 for the increases panel and -1 for the decreases panel.
    Bars whose sign matches `direction` use `highlight_color`, all others are
    gray; the value annotation is placed just beyond the end of each bar.
    """
    values = frame['pct_change_on_holiday'].tolist()
    positions = range(len(values))
    bar_colors = [highlight_color if v * direction > 0 else 'gray' for v in values]

    ax.barh(positions, values, color=bar_colors)
    ax.set_yticks(positions)
    ax.set_yticklabels(frame['label'], fontsize=10)
    ax.set_xlabel('% Change in Sales', fontsize=11, fontweight='bold')
    ax.set_title(title, fontsize=12, fontweight='bold')
    ax.axvline(0, color='black', linestyle='-', linewidth=0.8)
    ax.grid(axis='x', alpha=0.3)
    for pos, v in enumerate(values):
        ax.text(v + 2 * direction, pos, f'{v:.1f}%', va='center',
                ha='left' if direction > 0 else 'right', fontsize=9, fontweight='bold')


# The top_n rows with the highest and the lowest % change on the holiday day
top_positive = _with_bar_label(results_df.nlargest(top_n, 'pct_change_on_holiday')[ranking_cols])
top_negative = _with_bar_label(results_df.nsmallest(top_n, 'pct_change_on_holiday')[ranking_cols])

fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(16, 6))
_plot_ranked_changes(ax1, top_positive, f'Top {top_n}: Biggest Sales INCREASES on Holidays', 'green', 1)
_plot_ranked_changes(ax2, top_negative, f'Top {top_n}: Biggest Sales DECREASES on Holidays', 'red', -1)

plt.tight_layout()
plt.show()

print("\n✓ Top holiday-product combinations:")
print("\nBiggest uplifts (Green):")
display(top_positive.sort_values('pct_change_on_holiday', ascending=False))
print("\nBiggest downturns (Red):")
display(top_negative.sort_values('pct_change_on_holiday'))

In [None]:
# Visualization 3: Product Sales Timeline Around Key Holidays
# Shows the ±7-day window (15 days including the holiday) for key holidays

import pandas as pd
import matplotlib.pyplot as plt
import numpy as np

# Select key holidays: those whose % change on the holiday day varies most
# (standard deviation over all result rows carrying that holiday name)
pivot_holiday_std = results_df.groupby('holiday')['pct_change_on_holiday'].std().sort_values(ascending=False)
key_holidays = pivot_holiday_std.head(6).index.tolist()

print("Showing timeline analysis for key holidays:")
for h in key_holidays:
    print(f"  - {h}")

# Create figure with subplots (one per key holiday)
fig, axes = plt.subplots(3, 2, figsize=(16, 12))
axes = axes.flatten()

window_days = 7

for ax, holiday_name in zip(axes, key_holidays):
    # A holiday name can match several dates (one per year); the first
    # matching row of holidays_df is plotted and its date shown in the title.
    hol_date = holidays_df.loc[holidays_df['holiday'] == holiday_name, 'date'].iloc[0]

    # Window around the holiday
    start_window = hol_date - pd.Timedelta(days=window_days)
    end_window = hol_date + pd.Timedelta(days=window_days)
    window_data = daily.loc[(daily.index >= start_window) & (daily.index <= end_window)]
    pre_holiday = window_data.loc[window_data.index < hol_date]
    days_offset = (window_data.index - hol_date).days

    for product in daily.columns.tolist():
        pre_holiday_avg = pre_holiday[product].mean()

        # Without a positive baseline (zero or NaN) a % change is undefined.
        # Such products are skipped: plotting their raw sales instead would
        # mix currency values into an axis labelled in percent.
        if not pre_holiday_avg > 0:
            continue

        # % change of each day in the window relative to the pre-holiday average
        normalized = (window_data[product] / pre_holiday_avg - 1) * 100
        ax.plot(days_offset, normalized.values, marker='o', label=product, linewidth=2, markersize=4)

    # Format axes
    ax.axvline(0, color='red', linestyle='--', linewidth=2, alpha=0.7, label='Holiday')
    ax.axhline(0, color='gray', linestyle=':', linewidth=1, alpha=0.5)
    ax.set_xlabel('Days from Holiday', fontsize=10)
    ax.set_ylabel('% Change from Pre-Holiday Average', fontsize=10)
    ax.set_title(f'{holiday_name} ({hol_date:%Y-%m-%d})', fontsize=11, fontweight='bold')
    ax.grid(True, alpha=0.3)
    ax.legend(loc='best', fontsize=8)
    ax.set_xticks(range(-window_days, window_days + 1))

# Hide panels that stay empty when fewer holidays than subplots are available
for unused_ax in axes[len(key_holidays):]:
    unused_ax.set_visible(False)

plt.suptitle(f'Product Sales Timeline Around Key Holidays\n(±{window_days} days window)',
             fontsize=13, fontweight='bold', y=0.995)
plt.tight_layout()
plt.show()

print("\nTimeline Interpretation:")
print("- Red dashed line: Holiday day (0)")
print("- Values above 0: Sales higher than pre-holiday average")
print("- Values below 0: Sales lower than pre-holiday average")
print("- Steep lines: Rapid changes in sales around the holiday")

In [None]:
# Visualization 4: Product Ranking by Holiday Impact

import pandas as pd
import matplotlib.pyplot as plt

# results_df holds one row per holiday *date* and product, so every holiday
# name occurs once per year. Ranking those rows directly would rank
# product-year pairs, and the "top 3" of a holiday could be the same product
# three times. Each (holiday, product) pair is therefore averaged over all of
# its occurrences first, and the products are ranked on those averages.
metric_cols = ['pct_change_on_holiday', 'avg_sales_before', 'sales_holiday_day']
holiday_product_avg = (
    results_df
    .groupby(['holiday', 'product'], as_index=False)[metric_cols]
    .mean()
)

# Rank 1 = largest average % change within the holiday; pairs without a
# defined % change (NaN) are ranked last so the rank is always an integer.
holiday_product_avg['Rank'] = (
    holiday_product_avg
    .groupby('holiday')['pct_change_on_holiday']
    .rank(method='first', ascending=False, na_option='bottom')
    .astype(int)
)

rankings_df = (
    holiday_product_avg
    .rename(columns={
        'holiday': 'Holiday',
        'product': 'Product',
        'pct_change_on_holiday': 'Sales Change %',
        'avg_sales_before': 'Avg Sales Before',
        'sales_holiday_day': 'Holiday Sales',
    })
    .sort_values(['Holiday', 'Rank'])
    .reset_index(drop=True)
    [['Holiday', 'Product', 'Sales Change %', 'Rank', 'Avg Sales Before', 'Holiday Sales']]
)

# Show top-ranked products per holiday
print("=" * 80)
print("BEST PERFORMING PRODUCTS BY HOLIDAY")
print("=" * 80)

top_products_by_holiday = rankings_df[rankings_df['Rank'] == 1].sort_values('Sales Change %', ascending=False)
display(top_products_by_holiday[['Holiday', 'Product', 'Sales Change %']].head(15))

# Create a detailed summary table
print("\n" + "=" * 80)
print("COMPLETE PRODUCT RANKING BY HOLIDAY (Top 3 per Holiday)")
print("=" * 80)

# Top three products per holiday, with the numbers formatted as text for display
top_three = rankings_df[rankings_df['Rank'] <= 3]
summary_table = pd.DataFrame({
    'Holiday': top_three['Holiday'],
    'Rank': top_three['Rank'],
    'Product': top_three['Product'],
    '% Change': top_three['Sales Change %'].map('{:.1f}%'.format),
    'Pre-Holiday Avg': top_three['Avg Sales Before'].map('{:.0f}'.format),
    'Holiday Sales': top_three['Holiday Sales'].map('{:.0f}'.format),
}).reset_index(drop=True)
display(summary_table)

# Save to CSV
summary_table.to_csv('product_ranking_by_holiday.csv', index=False)
print("\n✓ Summary saved to: product_ranking_by_holiday.csv")

In [None]:
# Visualization 5: Best Holiday for Each Product

import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Find the best holiday occurrence for each product. Rows without a defined
# % change are dropped first so that idxmax never meets an all-NaN group and
# the integer casts in the summary below never meet a NaN.
valid_results = results_df.dropna(subset=['pct_change_on_holiday'])
best_by_product = valid_results.loc[valid_results.groupby('product')['pct_change_on_holiday'].idxmax()]
best_by_product = best_by_product.sort_values('pct_change_on_holiday', ascending=False)

# Create visualization
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(16, 6))

# Bar chart: best performing holiday per product.
# Bar colour: -100% .. +100% is mapped linearly onto the 0..1 range of RdYlGn.
colors = plt.cm.RdYlGn([(x + 100) / 200 for x in best_by_product['pct_change_on_holiday']])
bars = ax1.barh(best_by_product['product'], best_by_product['pct_change_on_holiday'], color=colors, edgecolor='black', linewidth=1.5)
ax1.set_xlabel('% Sales Increase', fontsize=12, fontweight='bold')
ax1.set_title('Best Holiday For Each Product Group', fontsize=13, fontweight='bold')
ax1.grid(axis='x', alpha=0.3)

# Annotate each bar: holiday name beyond the bar end, signed value in its middle
for i, (holiday_name, pct) in enumerate(zip(best_by_product['holiday'], best_by_product['pct_change_on_holiday'])):
    ax1.text(pct + 3, i, f"{holiday_name}",
            va='center', fontsize=10, fontweight='bold')
    # '+' in the format spec prints the real sign; a literal '+' prefix would
    # render a negative best value as '+-x%'.
    ax1.text(pct / 2, i, f"{pct:+.1f}%",
            va='center', ha='center', fontsize=11, fontweight='bold', color='white')

# Heatmap: Product × Top 10 Holidays (by mean % change over all products/years)
top_holidays = results_df.groupby('holiday')['pct_change_on_holiday'].mean().nlargest(10).index.tolist()
# A holiday name recurs once per year in results_df; average the occurrences
# instead of keeping only the first one.
pivot_top = results_df[results_df['holiday'].isin(top_holidays)].pivot_table(
    index='holiday',
    columns='product',
    values='pct_change_on_holiday',
    aggfunc='mean'
)

# Rows follow the ranking that selected them (largest mean impact first)
pivot_top = pivot_top.reindex(top_holidays)

sns.heatmap(pivot_top, annot=True, fmt='.0f', cmap='RdYlGn', center=0,
           cbar_kws={'label': '% Change'}, ax=ax2, linewidths=0.5)
ax2.set_title('Top 10 Holidays: Sales Impact by Product', fontsize=13, fontweight='bold')
ax2.set_xlabel('Product Group', fontsize=12, fontweight='bold')
ax2.set_ylabel('Holiday', fontsize=12, fontweight='bold')

plt.tight_layout()
plt.show()

# Print summary table
print("\n" + "="*80)
print("BEST HOLIDAY FOR EACH PRODUCT - SUMMARY")
print("="*80)
summary = best_by_product[['product', 'holiday', 'pct_change_on_holiday', 'sales_holiday_day', 'avg_sales_before']].copy()
summary.columns = ['Product', 'Best Holiday', '% Increase', 'Holiday Sales', 'Pre-Holiday Avg']
summary['Pre-Holiday Avg'] = summary['Pre-Holiday Avg'].round(0).astype(int)
summary['Holiday Sales'] = summary['Holiday Sales'].round(0).astype(int)
summary['% Increase'] = summary['% Increase'].round(1)
display(summary)

print("\n✓ Analysis complete! All visualizations and data files generated.")

## Summary: Key Findings

### 🎯 Best Performing Product-Holiday Combinations

**PASTRY** - The Star Performer:
- **Christi Himmelfahrt (Ascension)**: +163% sales
- **Pfingstmontag (Whit Monday)**: +160% sales
- **Ostermontag (Easter Monday)**: +115% sales
- **Erster Mai (Labour Day)**: +110% sales

**CROISSANT** - Strong Spring Performance:
- **Erster Mai**: +108% sales
- **Pfingstmontag**: +103% sales

### 📉 Poorest Performing Combinations

- **Rolls** see -100% on most holidays (no sales at all)
- **Bread** declines sharply on Easter and unity holidays (-50% to -87%)
- **Cakes** disappear on Neujahr (-100%)

### 📊 Holiday Effect Patterns

1. **Spring Holidays** (Easter, May 1, May holidays)
   - Strong uplift: pastry (+50% to +163%), croissants (+50% to +108%)
   - Significant decline: rolls (-100%), bread (-50% to -87%)

2. **German Unity Day** (Oct 3)
   - Moderate uplifts: pastry, croissant, cakes (+42% to +99%)
   - Moderate declines: bread, rolls (-48% to -53%)

3. **Christmas/Winter Holidays**
   - All products show -100% (likely shop closed or no data)

### 💼 Business Recommendations

1. **Stock up on pastry and croissants** 5-7 days before Spring holidays
2. **Reduce rolls production** on confirmed public holidays
3. **Promote bread/cakes** during holiday periods to boost sales
4. **Investigate Christmas data** - the -100% values suggest operational or data issues

### 📁 Generated Files

1. `holiday_timeline_analysis.csv` - Complete dataset (276 holiday-product combinations)
2. `product_ranking_by_holiday.csv` - Top 3 products ranked per holiday
3. `holidays_germany.csv` - German public holidays 2013-2018