In [1]:
import warnings
import pandas as pd
from data_analysis.DataProcessor import DataProcessor

warnings.simplefilter(action='ignore', category=FutureWarning)

# read in combined stock data
preprocessor = DataProcessor()
# create df to current date
df = preprocessor.process_nrw_data("06_28")

df['timestamp'] = pd.to_datetime(df['timestamp'])
df['price'] = pd.to_numeric(df['price'], errors='coerce')

File already exists: C:\Users\FelixNeubauer\UniRepos\webscraper\data_analysis\processed_nrw_data\06_28_nrw_stock_data.csv. Loading DataFrame from file.


In [2]:
import pandas as pd
import numpy as np
from IPython.display import display

# Define price ranges and labels
def define_price_ranges(df, price_bins):
    price_labels = [f'{price_bins[i]}-{price_bins[i+1]}' for i in range(len(price_bins)-1)]
    price_labels.append(f'{price_bins[-1]}+')
    df['price_range'] = pd.cut(df['price'], bins=price_bins + [np.inf], labels=price_labels, include_lowest=True)
    return df

# Filter DataFrame to include only product variants that were in stock at least once
def filter_in_stock_variants(df):
    in_stock_variants = df[df['store_quantity'] > 0][['id', 'store_skuId', 'store_storeId']].drop_duplicates()
    df_filtered = df.merge(in_stock_variants, on=['id', 'store_skuId', 'store_storeId'])
    return df_filtered

# Function to identify restocks and calculate restock amounts
def identify_and_calculate_restocks(df):
    df = df.sort_values(by=['id', 'store_skuId', 'store_storeId', 'timestamp'])
    df['quantity_diff'] = df.groupby(['id', 'store_skuId', 'store_storeId'])['store_quantity'].diff()
    df['restock'] = df['quantity_diff'] > 0
    df['restock_amount'] = df['quantity_diff'].where(df['restock'], 0)
    df['quantity_before_restock'] = df['store_quantity'] - df['restock_amount']
    return df

# Calculate average stock quantity per sub-category, price range, and store
def calculate_average_stock(df, price_bins):
    df = df[df['store_quantity'] >= 0]
    df = filter_in_stock_variants(df)
    df = define_price_ranges(df, price_bins)
    
    # Calculate unique product variants count
    unique_variants_count = df.groupby(['sub_category', 'price_range', 'store_storeId'])['id'].nunique().reset_index(name='unique_variants_count')
    
    avg_stock_quantity_per_store = df.groupby(['store_storeId', 'sub_category', 'price_range'])['store_quantity'].mean().reset_index()
    avg_stock_quantity_per_store.rename(columns={'store_quantity': 'avg_stock_quantity'}, inplace=True)
    
    avg_stock_quantity_combined = df.groupby(['sub_category', 'price_range'])['store_quantity'].mean().reset_index()
    avg_stock_quantity_combined.rename(columns={'store_quantity': 'avg_stock_quantity'}, inplace=True)
    avg_stock_quantity_combined['store_storeId'] = 'All Stores'
    
    combined_avg_stock_quantity = pd.concat([avg_stock_quantity_per_store, avg_stock_quantity_combined], ignore_index=True)
    
    # Calculate unique variants count for "All Stores"
    unique_variants_count_combined = df.groupby(['sub_category', 'price_range'])['id'].nunique().reset_index(name='unique_variants_count')
    unique_variants_count_combined['store_storeId'] = 'All Stores'
    
    combined_unique_variants_count = pd.concat([unique_variants_count, unique_variants_count_combined], ignore_index=True)
    
    combined_avg_stock_quantity = combined_avg_stock_quantity.merge(combined_unique_variants_count, on=['sub_category', 'price_range', 'store_storeId'], how='left')
    
    # Calculate avg_count_per_variant
    combined_avg_stock_quantity['avg_count_per_variant'] = combined_avg_stock_quantity['avg_stock_quantity'] / combined_avg_stock_quantity['unique_variants_count']
    
    return combined_avg_stock_quantity

# Calculate average restock amount and typical restock points
def calculate_average_restock(df, price_bins):
    df = filter_in_stock_variants(df)
    df = define_price_ranges(df, price_bins)
    df = identify_and_calculate_restocks(df)
    
    # Calculate restock info per store
    avg_restock_amount_per_store = df[df['restock']].groupby(['store_storeId', 'sub_category', 'price_range'])['restock_amount'].mean().reset_index()
    avg_restock_amount_per_store.rename(columns={'restock_amount': 'avg_restock_amount'}, inplace=True)
    
    avg_quantity_before_restock_per_store = df[df['restock']].groupby(['store_storeId', 'sub_category', 'price_range'])['quantity_before_restock'].mean().reset_index()
    avg_quantity_before_restock_per_store.rename(columns={'quantity_before_restock': 'avg_quantity_before_restock'}, inplace=True)
    
    combined_restock_info_per_store = pd.merge(avg_restock_amount_per_store, avg_quantity_before_restock_per_store, on=['store_storeId', 'sub_category', 'price_range'])
    
    # Combine restock info across all stores
    avg_restock_amount_combined = df[df['restock']].groupby(['sub_category', 'price_range'])['restock_amount'].mean().reset_index()
    avg_restock_amount_combined.rename(columns={'restock_amount': 'avg_restock_amount'}, inplace=True)
    
    avg_quantity_before_restock_combined = df[df['restock']].groupby(['sub_category', 'price_range'])['quantity_before_restock'].mean().reset_index()
    avg_quantity_before_restock_combined.rename(columns={'quantity_before_restock': 'avg_quantity_before_restock'}, inplace=True)
    avg_restock_amount_combined['store_storeId'] = 'All Stores'
    avg_quantity_before_restock_combined['store_storeId'] = 'All Stores'
    
    combined_restock_info_combined = pd.merge(avg_restock_amount_combined, avg_quantity_before_restock_combined, on=['sub_category', 'price_range', 'store_storeId'])
    
    combined_restock_info = pd.concat([combined_restock_info_per_store, combined_restock_info_combined], ignore_index=True)
    
    return combined_restock_info

# Calculate reorder stock level and time to restock
# considered useless
'''
def calculate_reorder_stock_and_time(df, price_bins):
    df = filter_in_stock_variants(df)
    df = define_price_ranges(df, price_bins)
    df = identify_and_calculate_restocks(df)
    
    # Calculate reorder stock level per store
    reorder_stock_level_per_store = df[df['restock']].groupby(['store_storeId', 'sub_category', 'price_range'])['quantity_before_restock'].mean().reset_index()
    reorder_stock_level_per_store.rename(columns={'quantity_before_restock': 'reorder_stock_level'}, inplace=True)
    
    # Calculate time to restock per store
    df['timestamp'] = pd.to_datetime(df['timestamp'])
    df['time_to_restock'] = df.groupby(['id', 'store_skuId', 'store_storeId'])['timestamp'].diff().shift(-1)
    
    avg_time_to_restock_per_store = df[df['restock']].groupby(['store_storeId', 'sub_category', 'price_range'])['time_to_restock'].mean().reset_index()
    avg_time_to_restock_per_store.rename(columns={'time_to_restock': 'avg_time_to_restock'}, inplace=True)
    
    combined_reorder_info_per_store = pd.merge(reorder_stock_level_per_store, avg_time_to_restock_per_store, on=['store_storeId', 'sub_category', 'price_range'])
    
    # Combine reorder stock level and time to restock across all stores
    reorder_stock_level_combined = df[df['restock']].groupby(['sub_category', 'price_range'])['quantity_before_restock'].mean().reset_index()
    reorder_stock_level_combined.rename(columns={'quantity_before_restock': 'reorder_stock_level'}, inplace=True)
    reorder_stock_level_combined['store_storeId'] = 'All Stores'
    
    avg_time_to_restock_combined = df[df['restock']].groupby(['sub_category', 'price_range'])['time_to_restock'].mean().reset_index()
    avg_time_to_restock_combined.rename(columns={'time_to_restock': 'avg_time_to_restock'}, inplace=True)
    avg_time_to_restock_combined['store_storeId'] = 'All Stores'
    
    combined_reorder_info_combined = pd.merge(reorder_stock_level_combined, avg_time_to_restock_combined, on=['sub_category', 'price_range', 'store_storeId'])
    
    combined_reorder_info = pd.concat([combined_reorder_info_per_store, combined_reorder_info_combined], ignore_index=True)
    
    return combined_reorder_info
'''

# price bins 
price_bins = [0, 10, 20, 50, 100, 200, 500, 1000]

# Ensure rows with sub_category "fahrrad_sale" are removed
df = df[df['sub_category'] != 'fahrrad_sale']

# Calculate the average stock quantity
avg_stock_quantity_df = calculate_average_stock(df, price_bins)

# Display the average stock quantity results
display(avg_stock_quantity_df)

# Save average stock quantity DataFrame to CSV
avg_stock_quantity_df.to_csv('data/avg_stock_quantity.csv', index=False)

# Calculate the average restock amount and quantity before restock
restock_info_df = calculate_average_restock(df, price_bins)

# Display the restock information results
display(restock_info_df)

# Save restock information DataFrame to CSV
restock_info_df.to_csv('data/restock_info.csv', index=False)

# Calculate reorder stock levels and time to restock
# reorder_info_df = calculate_reorder_stock_and_time(df, price_bins)

# Display the reorder information results
#display(reorder_info_df)

# Save reorder information DataFrame to CSV
#reorder_info_df.to_csv('data/reorder_info.csv', index=False)


Unnamed: 0,store_storeId,sub_category,price_range,avg_stock_quantity,unique_variants_count,avg_count_per_variant
0,0070008500085,campingmobel,0-10,14.576577,3,4.858859
1,0070008500085,campingmobel,10-20,38.096154,6,6.349359
2,0070008500085,campingmobel,20-50,19.478599,6,3.246433
3,0070008500085,campingmobel,50-100,5.127413,7,0.732488
4,0070008500085,campingmobel,100-200,2.864865,2,1.432432
...,...,...,...,...,...,...
619,All Stores,zelte,50-100,7.782873,9,0.864764
620,All Stores,zelte,100-200,11.316799,12,0.943067
621,All Stores,zelte,200-500,3.340947,10,0.334095
622,All Stores,zelte,500-1000,2.349282,3,0.783094


Unnamed: 0,store_storeId,sub_category,price_range,avg_restock_amount,avg_quantity_before_restock
0,0070008500085,campingmobel,0-10,7.000000,15.625000
1,0070008500085,campingmobel,10-20,9.434783,27.043478
2,0070008500085,campingmobel,20-50,10.312500,18.750000
3,0070008500085,campingmobel,50-100,2.500000,3.833333
4,0070008500085,campingmobel,100-200,1.800000,3.200000
...,...,...,...,...,...
619,All Stores,zelte,50-100,4.906736,6.678756
620,All Stores,zelte,100-200,5.296474,9.471154
621,All Stores,zelte,200-500,2.196507,3.187773
622,All Stores,zelte,500-1000,1.390244,1.890244


Unnamed: 0,store_storeId,sub_category,price_range,reorder_stock_level,avg_time_to_restock
0,0070008500085,campingmobel,0-10,15.625000,1 days 06:25:17.440987750
1,0070008500085,campingmobel,10-20,27.043478,1 days 03:46:19.017142608
2,0070008500085,campingmobel,20-50,18.750000,1 days 03:20:13.633356833
3,0070008500085,campingmobel,50-100,3.833333,1 days 04:54:37.120702416
4,0070008500085,campingmobel,100-200,3.200000,1 days 03:06:12.020191200
...,...,...,...,...,...
619,All Stores,zelte,50-100,6.678756,1 days 05:21:24.408106911
620,All Stores,zelte,100-200,9.471154,1 days 03:48:50.682295637
621,All Stores,zelte,200-500,3.187773,1 days 06:18:57.255594241
622,All Stores,zelte,500-1000,1.890244,1 days 07:05:10.285796592
