In [6]:
import warnings
import pandas as pd
from DataProcessor import DataProcessor

# Silence FutureWarning messages for the remainder of the kernel session.
warnings.simplefilter(action='ignore', category=FutureWarning)

# Read in the combined stock data for the "06_17" snapshot.
preprocessor = DataProcessor()
df = preprocessor.process_nrw_data("06_17")
# Bike-sale rows are not relevant for this analysis, so they are removed up front.
df = preprocessor.remove_fahrrad_sale(df)
df['timestamp'] = pd.to_datetime(df['timestamp'])

# A product variant is identified by (id, store_skuId).
variant_keys = ['id', 'store_skuId']

# Count the distinct stores per product variant and keep only the variants
# that appear in at least 5 stores.
product_store_counts = (
    df.groupby(variant_keys)['store_storeId']
    .nunique()
    .reset_index()
    .loc[lambda counts: counts['store_storeId'] >= 5]
)

# Inner join on the variant keys restricts the main dataframe to those variants.
df_filtered = df.merge(product_store_counts[variant_keys], on=variant_keys)

# Find stock out events.
# A stock-out event is an observation with quantity 0 whose previous observation
# *of the same (id, store_skuId, store_storeId) series* had a positive quantity.
# The comparison is therefore done per series and in chronological order: a
# frame-wide shift() would compare the first row of one series against the last
# row of whichever unrelated series happens to sit above it.
series_keys = ['id', 'store_skuId', 'store_storeId']
df_filtered = (
    df_filtered
    .sort_values(series_keys + ['timestamp'])
    .reset_index(drop=True)
)
previous_quantity = df_filtered.groupby(series_keys)['store_quantity'].shift()
df_filtered['is_stock_out'] = (df_filtered['store_quantity'] == 0) & (previous_quantity > 0)

# Per store: True if the variant had at least one stock-out event in that store.
stock_out_events = df_filtered.groupby(series_keys)['is_stock_out'].max().reset_index()
# Per variant: number of distinct stores in which a stock-out event occurred.
stock_out_counts = stock_out_events.groupby(['id', 'store_skuId'])['is_stock_out'].sum().reset_index()
# Keep product variants with stock-out events in at least 2 different stores.
stock_out_counts = stock_out_counts[stock_out_counts['is_stock_out'] >= 2]

# Filter the dataframe to include only those variants.
df_relevant_variants = df_filtered.merge(stock_out_counts[['id', 'store_skuId']], on=['id', 'store_skuId'])


File already exists: processed_nrw_data/06_17_nrw_stock_data.csv. Loading DataFrame from file.


In [7]:
import warnings
import pandas as pd
import numpy as np
from DataProcessor import DataProcessor

# Index the filtered dataframe by timestamp so each series can be resampled.
# Guarded so that re-running this cell (when 'timestamp' is already the index
# and no longer a column) does not fail with a KeyError.
if df_relevant_variants.index.name != 'timestamp':
    df_relevant_variants = df_relevant_variants.assign(
        timestamp=pd.to_datetime(df_relevant_variants['timestamp'])
    ).set_index('timestamp')

# Ensure that identifier / descriptive columns are stored as categoricals.
categorical_columns = ['id', 'store_skuId', 'store_storeId', 'main_category', 'sub_category', 'product_name', 'brand', 'store_storeName']
df_relevant_variants = df_relevant_variants.astype({col: 'category' for col in categorical_columns})

# Group by id, store_skuId, and store_storeId.
# observed=True: the keys are categoricals, and without it pandas also considers
# every id x store_skuId x store_storeId combination that never occurs in the data.
grouped = df_relevant_variants.groupby(['id', 'store_skuId', 'store_storeId'], observed=True)

# Resample each series to a daily grid and forward fill the quantity.
# NOTE(review): ffill on the resampler carries the last observation at or before
# each day boundary forward, so the first day of a series may be NaN and
# observations after the final day boundary are not reflected - TODO confirm
# this is the intended definition of the daily quantity.
df_daily = grouped['store_quantity'].resample('D').ffill().reset_index()

# One series per (product, variant, store); lags 1..21 days are generated.
series_keys = ['id', 'store_skuId', 'store_storeId']
lag_steps = range(1, 22)

# Create quantity lag features.
# observed=True restricts the grouping to key combinations that actually occur
# (relevant when the key columns are categoricals). The lag columns are built in
# one go and attached with a single concat instead of 21 separate inserts.
quantity_by_series = df_daily.groupby(series_keys, observed=True)['store_quantity']
quantity_lags = pd.concat(
    {f'quantity_lag_{lag}': quantity_by_series.shift(lag) for lag in lag_steps},
    axis=1,
)
df_daily = pd.concat([df_daily, quantity_lags], axis=1)

# Flag days on which this store has zero stock of the variant.
df_daily['stock_out'] = (df_daily['store_quantity'] == 0)

# Count stock outs of the same variant on the same day in *other* stores:
# total stock-outs across all stores minus this store's own flag.
df_daily['stock_out_other_stores'] = (
    df_daily.groupby(['id', 'store_skuId', 'timestamp'], observed=True)['stock_out'].transform('sum')
    - df_daily['stock_out']
)

# Create lag features for stock outs in other stores.
other_stock_outs_by_series = df_daily.groupby(series_keys, observed=True)['stock_out_other_stores']
other_stock_out_lags = pd.concat(
    {f'stock_out_other_stores_lag_{lag}': other_stock_outs_by_series.shift(lag) for lag in lag_steps},
    axis=1,
)
df_daily = pd.concat([df_daily, other_stock_out_lags], axis=1)

series_keys = ['id', 'store_skuId', 'store_storeId']

# Get the latest daily timestamp for each product variant and store.
# observed=True: with categorical keys the aggregation would otherwise emit a row
# (with NaT) for every key combination, including ones that never occur.
latest_timestamp = (
    df_daily.groupby(series_keys, observed=True)['timestamp'].max().reset_index()
)

# Keep only the row of each series that sits on its latest timestamp; that row
# carries the latest quantity together with all lag features.
df_final = df_daily.merge(latest_timestamp, on=series_keys + ['timestamp'])

# Descriptive columns taken from the original (non-resampled) dataframe.
additional_columns = ['id', 'store_skuId', 'store_storeId', 'main_category', 'sub_category', 'product_name', 'brand', 'price', 'store_storeName']
df_relevant_variants_reset = df_relevant_variants.reset_index()

# Exactly one metadata row per series: the most recent observation.
# De-duplicating on all columns would keep one row per distinct price (or name)
# a series ever had, and the merge below would then multiply the series' rows.
df_additional_info = (
    df_relevant_variants_reset
    .sort_values('timestamp', kind='mergesort')
    .drop_duplicates(subset=series_keys, keep='last')[additional_columns]
)

# Merge additional information to df_final; validate guards the one-row-per-series
# assumption on the metadata side.
df_final = df_final.merge(df_additional_info, on=series_keys, validate='many_to_one')

# Select the final relevant columns.
selected_columns = [
    'id', 'main_category', 'sub_category', 'product_name', 'brand', 'price',
    'store_skuId', 'store_storeId', 'store_storeName', 'timestamp', 'store_quantity'
]
selected_columns += [f'quantity_lag_{i}' for i in range(1, 22)]
selected_columns += [f'stock_out_other_stores_lag_{i}' for i in range(1, 22)]

df_final = df_final[selected_columns].rename(
    columns={'store_quantity': 'quantityLatest', 'timestamp': 'latestTimestamp'}
)

# Add the earliest raw observation timestamp of each (product, variant, store) series.
# The 'timestamp' column of df_relevant_variants_reset comes from the datetime index
# of df_relevant_variants, so no further conversion is needed before taking the min.
# observed=True: the grouping keys are categoricals, and without it the aggregation
# would produce a NaT row for every id x store_skuId x store_storeId combination,
# including the ones that never occur in the data.
earliest_timestamp = (
    df_relevant_variants_reset
    .groupby(['id', 'store_skuId', 'store_storeId'], observed=True)['timestamp']
    .min()
    .reset_index()
)
# Rename before merging so the new column cannot collide with an existing one.
df_final = df_final.merge(
    earliest_timestamp.rename(columns={'timestamp': 'earliestTimestamp'}),
    on=['id', 'store_skuId', 'store_storeId'],
)


# Display the prepared dataframe
#import ace_tools as tools; tools.display_dataframe_to_user(name="Prepared DataFrame", dataframe=df_final)


In [8]:
# Write the prepared feature table to CSV; the row index is not written.
output_csv = 'processed_variant_development_per_store.csv'
df_final.to_csv(output_csv, index=False)