In [5]:
import os
import pandas as pd
import matplotlib.pyplot as plt
import warnings
from DataProcessor import DataProcessor

warnings.simplefilter(action='ignore', category=FutureWarning)

preprocessor = DataProcessor()

# Assuming the data is loaded into a dataframe called df
df = preprocessor.process_nrw_data("06_16")

# Convert the timestamp to datetime
df['timestamp'] = pd.to_datetime(df['timestamp'])

# Group by id, store_skuId, and store_storeId to find the number of stores per product variant
product_store_counts = df.groupby(['id', 'store_skuId'])['store_storeId'].nunique().reset_index()
product_store_counts = product_store_counts[product_store_counts['store_storeId'] >= 5]

# Filter the main dataframe to include only those product variants
df_filtered = df.merge(product_store_counts[['id', 'store_skuId']], on=['id', 'store_skuId'])

# Find stock out events
df_filtered['is_stock_out'] = (df_filtered['store_quantity'] == 0) & (df_filtered['store_quantity'].shift() > 0)

# Identify product variants with at least 2 stock out events in different stores
stock_out_events = df_filtered.groupby(['id', 'store_skuId', 'store_storeId'])['is_stock_out'].max().reset_index()
stock_out_counts = stock_out_events.groupby(['id', 'store_skuId'])['is_stock_out'].sum().reset_index()
stock_out_counts = stock_out_counts[stock_out_counts['is_stock_out'] >= 2]

# Further filter the dataframe to include only those variants
df_final = df_filtered.merge(stock_out_counts[['id', 'store_skuId']], on=['id', 'store_skuId'])

# Get a list of unique product variants to plot
product_variants_to_plot = df_final[['id', 'store_skuId', 'sub_category']].drop_duplicates()

# Debugging: Print the number of unique product variants identified
print(f"Number of unique product variants to plot: {len(product_variants_to_plot)}")

# Ensure the directory exists
output_directory = r'C:\Users\felix\Repos\decathlon_plots_marked_stock_outs'
os.makedirs(output_directory, exist_ok=True)

line_styles = ['-', '--', '-.', ':']
markers = ['o', 'v', '^', '<', '>', 's', 'p', '*', 'h', 'H', 'D', 'd', 'P', 'X']

for sub_category, group in product_variants_to_plot.groupby('sub_category'):
    count = 0
    for (product_id, store_skuId), product_group in group.groupby(['id', 'store_skuId']):
        if count >= 600:
            break
        product_data = df_final[(df_final['id'] == product_id) & (df_final['store_skuId'] == store_skuId)]
        
        # Debugging: Print the number of stores and the time range for each product variant
        num_stores = product_data['store_storeId'].nunique()
        time_range = product_data['timestamp'].min(), product_data['timestamp'].max()
        print(f"Plotting for product {product_id}, SKU {store_skuId}: {num_stores} stores, time range: {time_range}")

        plt.figure(figsize=(12, 8))
        for i, (store_id, store_group) in enumerate(product_data.groupby('store_storeId')):
            store_group = store_group.sort_values('timestamp')
            plt.plot(
                store_group['timestamp'], 
                store_group['store_quantity'], 
                label=store_group['store_storeName'].iloc[0],
                linestyle=line_styles[i % len(line_styles)],
                marker=markers[i % len(markers)]
            )
            plt.text(store_group['timestamp'].iloc[-1], store_group['store_quantity'].iloc[-1], store_group['store_storeName'].iloc[0])

            # Highlight stock out events
            stock_out_indices = store_group[(store_group['store_quantity'] == 0) & (store_group['store_quantity'].shift() > 0)].index
            plt.scatter(store_group.loc[stock_out_indices, 'timestamp'],
                        store_group.loc[stock_out_indices, 'store_quantity'],
                        edgecolor='black', facecolor='none', s=100, linewidth=1.5)

        plt.title(f"{product_data['product_name'].iloc[0]} - {sub_category} (price: {product_data['price'].iloc[0]})")
        plt.xlabel('Time')
        plt.ylabel('Quantity')
        plt.legend(loc='best')
        
        # Save the plot
        output_path = os.path.join(output_directory, f"{count}_{sub_category}_{product_id}_{store_skuId}.png")
        plt.savefig(output_path)
        plt.close()  # Close the figure to free memory
        
        count += 1


File already exists: processed_nrw_data/06_16_nrw_stock_data.csv. Loading DataFrame from file.
Number of unique product variants to plot: 302
Plotting for product 864538.0, SKU 4073788: 12 stores, time range: (Timestamp('2024-05-14 19:54:41.567973'), Timestamp('2024-06-14 18:40:59.516664'))
Plotting for product 864559.0, SKU 4073804: 12 stores, time range: (Timestamp('2024-05-14 19:54:41.567973'), Timestamp('2024-06-14 18:40:59.516664'))
Plotting for product 960034.0, SKU 4073894: 12 stores, time range: (Timestamp('2024-05-14 19:54:41.567973'), Timestamp('2024-06-14 18:40:59.516664'))
Plotting for product 8492643.0, SKU 2558046: 12 stores, time range: (Timestamp('2024-05-14 19:54:41.567973'), Timestamp('2024-06-14 18:40:59.516664'))
Plotting for product 8492643.0, SKU 4899198: 12 stores, time range: (Timestamp('2024-05-14 19:54:41.567973'), Timestamp('2024-06-14 18:40:59.516664'))
Plotting for product 8492643.0, SKU 4899199: 12 stores, time range: (Timestamp('2024-05-14 19:54:41.567973