In [1]:
from data_analysis.StockDataCombiner import StockDataCombiner

# Disabled preparation step: combine the CSV files found under
# '../decathlon_scraper/data/sportgear/stocks' and save the result as
# 'combined_stock_data.csv'.
# NOTE(review): presumably only needs to be uncommented and re-run when the
# combined file has to be regenerated — confirm.
#combiner = StockDataCombiner('../decathlon_scraper/data/sportgear/stocks')
#combined_df = combiner.combine_csv_files()
#combiner.save_combined_data('combined_stock_data.csv')

In [2]:
# Read in the combined stock data
import pandas as pd

# Parse store_storeId as text instead of letting pandas infer a numeric dtype:
# numeric parsing drops leading zeros and, if any value is missing, turns the
# ids into floats (e.g. "123.0"), which can never match a 13-character store id.
df = pd.read_csv('combined_stock_data.csv', dtype={'store_storeId': str})

In [3]:
# Show which columns the combined data provides
column_labels = df.columns
print(column_labels)

In [4]:
# For a first insight, keep only the features considered relevant.
RELEVANT_COLUMNS = [
    'id', 'timestamp', 'main_category', 'sub_category', 'product_name',
    'brand', 'price', 'store_skuId', 'store_storeId', 'store_storeName',
    'store_quantity', 'store_availabilityInfo', 'store_clickNcollect1h',
]

# .copy() makes the reduced frame independent of the full one; without it the
# column assignments in later cells raise SettingWithCopyWarning.
df = df[RELEVANT_COLUMNS].copy()

# save reduced version as csv
#df.to_csv('reduced_features_stock_data.csv', index=False)

In [5]:
# Show the dtype pandas assigned to each column
column_dtypes = df.dtypes
print(column_dtypes)

In [6]:
# Cast store_storeId to the generic 'object' dtype (note: this is NOT pandas'
# 'category' dtype).
# NOTE(review): presumably done so the store id is handled as a label rather
# than a number and is picked up by select_dtypes(include='object') in the
# frequency listing — confirm.
df['store_storeId'] = df['store_storeId'].astype('object')

In [7]:
# Print how often each distinct value occurs, for every object-typed column
categorical_columns = df.select_dtypes(include='object').columns
for column_name in categorical_columns:
    frequencies = df[column_name].value_counts()
    print(frequencies)

In [8]:
# Print the first two rows as a quick sanity check
print(df.iloc[:2])

# Analysis of Stock Changes in an Area

In [10]:
# Enum listing the stores of the area to analyse
import decathlon_scraper.StoresEnum as StoresEnum

# Normalise the store ids: string type, no surrounding whitespace, left-padded
# with zeros to 13 characters.
df['store_storeId'] = df['store_storeId'].astype(str).str.strip().str.zfill(13)

# Filter for a specific area: build the list of store ids from the enum values,
# ensuring no leading/trailing whitespace.
nrw_area = StoresEnum.StoresNRW
nrw_stores = [str(store.value).strip() for store in nrw_area]

# Keep only rows of stores in the area. .copy() gives an independent frame so
# that adding or overwriting columns on it later does not raise
# SettingWithCopyWarning.
nrw_df = df[df['store_storeId'].isin(nrw_stores)].copy()

# For every product (id, store_skuId) count the number of DISTINCT stores it
# appears in. Counting rows instead would also count repeated observations of
# the same store at different timestamps, so "count > 1" would not mean
# "available in more than one store".
product_counts = (
    nrw_df.groupby(['id', 'store_skuId'])['store_storeId']
    .nunique()
    .reset_index(name='count')
)

# Print the results
print(product_counts)

# Products that are available in more than one store
same_products_across_stores = product_counts[product_counts['count'] > 1]

# Print the products that are available in more than one store
print(same_products_across_stores)


In [13]:
import matplotlib.pyplot as plt
import seaborn as sns

# Merge the same_products_across_stores DataFrame with nrw_df to attach the
# main_category of every product.
merged_df = same_products_across_stores.merge(nrw_df, on=['id', 'store_skuId'])

# Drop duplicates to ensure each product is counted only once per main category
unique_products = merged_df.drop_duplicates(subset=['id', 'store_skuId', 'main_category'])

# Count the unique products per main_category. The column names are set
# explicitly because the names produced by value_counts().reset_index()
# differ between pandas versions.
category_counts = unique_products['main_category'].value_counts().reset_index()
category_counts.columns = ['main_category', 'product_count']

# Plot the results on an explicit Axes.
fig, ax = plt.subplots(figsize=(12, 6))

# Passing `palette` without `hue` is deprecated in seaborn 0.13, so the x
# variable is also used as hue. dodge=False keeps full-width bars on older
# seaborn versions, which would otherwise offset the bars per hue level.
sns.barplot(
    data=category_counts,
    x='main_category',
    y='product_count',
    hue='main_category',
    dodge=False,
    palette='viridis',
    ax=ax,
)

# The hue legend only repeats the x tick labels; remove it if seaborn drew one.
if ax.get_legend() is not None:
    ax.get_legend().remove()

ax.set_xlabel('Main Category')
ax.set_ylabel('Number of Unique Products')
ax.set_title('Number of Unique Products by Main Category in NRW Stores')
ax.tick_params(axis='x', labelrotation=45)
plt.show()


In [14]:

# Enum listing the stores of the area to analyse
import decathlon_scraper.StoresEnum as StoresEnum

# Normalise the store ids: string type, no surrounding whitespace, left-padded
# with zeros to 13 characters.
df['store_storeId'] = df['store_storeId'].astype(str).str.strip().str.zfill(13)

# Filter for a specific area: build the list of store ids from the enum values,
# ensuring no leading/trailing whitespace.
nrw_area = StoresEnum.StoresNRW
nrw_stores = [str(store.value).strip() for store in nrw_area]

# Keep only rows of stores in the area (independent copy of the slice).
nrw_df = df[df['store_storeId'].isin(nrw_stores)].copy()

# The per-product counts / merge / de-duplication that used to be recomputed
# here were never used by this cell and have been dropped; only the aggregation
# below feeds the plot.

# Aggregate stock quantity per category and timestamp
aggregated_data = (
    nrw_df.groupby(['timestamp', 'main_category'])['store_quantity']
    .sum()
    .reset_index()
)

# Convert timestamp to datetime so the x axis is a real time axis
aggregated_data['timestamp'] = pd.to_datetime(aggregated_data['timestamp'])

# One column per main_category, one row per timestamp
pivot_data = aggregated_data.pivot(index='timestamp', columns='main_category', values='store_quantity')

# Plot on an explicit Axes. Calling plt.figure() and then
# DataFrame.plot(figsize=...) opens two figures, leaving the first one empty,
# so the figure is created once and handed to pandas via `ax`.
fig, ax = plt.subplots(figsize=(14, 8))
pivot_data.plot(kind='line', marker='o', ax=ax)
ax.set_xlabel('Timestamp')
ax.set_ylabel('Total Stock Quantity')
ax.set_title('Changes in Stock Quantity by Main Category Over Time')
ax.legend(title='Main Category', bbox_to_anchor=(1.05, 1), loc='upper left')
ax.grid(True)
fig.tight_layout()
plt.show()


In [15]:
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd

# Enum listing the stores of the area to analyse
import decathlon_scraper.StoresEnum as StoresEnum

# Normalise the store ids: string type, no surrounding whitespace, left-padded
# with zeros to 13 characters.
df['store_storeId'] = df['store_storeId'].astype(str).str.strip().str.zfill(13)

# Filter for a specific area: build the list of store ids from the enum values,
# ensuring no leading/trailing whitespace.
nrw_area = StoresEnum.StoresNRW
nrw_stores = [str(store.value).strip() for store in nrw_area]

# Keep only rows of stores in the area. .copy() is required because columns
# are assigned on this frame below; assigning on the bare boolean-filtered
# slice raises SettingWithCopyWarning.
nrw_df = df[df['store_storeId'].isin(nrw_stores)].copy()

# Convert timestamp to datetime
nrw_df['timestamp'] = pd.to_datetime(nrw_df['timestamp'])

# Sort by product, SKU, store and time so that shift(1) within each group
# yields the chronologically previous observation.
product_store_keys = ['id', 'store_skuId', 'store_storeId']
nrw_df = nrw_df.sort_values(by=product_store_keys + ['timestamp'])

# Identify products that went out of stock: quantity is 0 now and was > 0 at
# the previous observation of the same product/SKU/store. The first
# observation of each group has no predecessor (NaN) and is never flagged.
nrw_df['previous_quantity'] = nrw_df.groupby(product_store_keys)['store_quantity'].shift(1)
went_to_zero = (nrw_df['store_quantity'] == 0) & (nrw_df['previous_quantity'] > 0)
out_of_stock = nrw_df[went_to_zero]

# Aggregate the number of out-of-stock transitions by timestamp.
# NOTE(review): each row is one product/SKU/store transition, so a product
# selling out in several stores at the same timestamp is counted once per
# store — confirm this matches the "Unique Products" wording of the title.
out_of_stock_counts = out_of_stock.groupby('timestamp').size().reset_index(name='out_of_stock_count')

# Plot the results
fig, ax = plt.subplots(figsize=(14, 8))
sns.lineplot(data=out_of_stock_counts, x='timestamp', y='out_of_stock_count', marker='o', ax=ax)
ax.set_xlabel('Timestamp')
ax.set_ylabel('Number of Products Out of Stock')
ax.set_title('Number of Unique Products Going Out of Stock Over Time')
ax.grid(True)
fig.tight_layout()
plt.show()


In [16]:
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd

# Enum listing the stores of the area to analyse
import decathlon_scraper.StoresEnum as StoresEnum

# Normalise the store ids: string type, no surrounding whitespace, left-padded
# with zeros to 13 characters.
df['store_storeId'] = df['store_storeId'].astype(str).str.strip().str.zfill(13)

# Filter for a specific area: build the list of store ids from the enum values,
# ensuring no leading/trailing whitespace.
nrw_area = StoresEnum.StoresNRW
nrw_stores = [str(store.value).strip() for store in nrw_area]

# Keep only rows of stores in the area. .copy() is required because columns
# are assigned on this frame below; assigning on the bare boolean-filtered
# slice raises SettingWithCopyWarning.
nrw_df = df[df['store_storeId'].isin(nrw_stores)].copy()

# Convert timestamp to datetime
nrw_df['timestamp'] = pd.to_datetime(nrw_df['timestamp'])

# Sort by product, SKU, store and time so that shift(1) within each group
# yields the chronologically previous observation.
product_store_keys = ['id', 'store_skuId', 'store_storeId']
nrw_df = nrw_df.sort_values(by=product_store_keys + ['timestamp'])

# Identify products that went out of stock: quantity is 0 now and was > 0 at
# the previous observation of the same product/SKU/store.
nrw_df['previous_quantity'] = nrw_df.groupby(product_store_keys)['store_quantity'].shift(1)
went_to_zero = (nrw_df['store_quantity'] == 0) & (nrw_df['previous_quantity'] > 0)
out_of_stock = nrw_df[went_to_zero]

# Get the unique id and store_skuId combinations that went out of stock
out_of_stock_products = out_of_stock[['id', 'store_skuId']].drop_duplicates()

# Merge back with nrw_df to get the full stock history of these products
# (all stores of the area, not only the store where the product sold out).
stock_history = nrw_df.merge(out_of_stock_products, on=['id', 'store_skuId'])

# Plot the stock quantity changes over time, one figure per product/SKU.
for (product_id, sku_id), group in stock_history.groupby(['id', 'store_skuId']):
    # A fresh figure per product: plt.show() finishes the current figure, so a
    # single figure created before the loop would only size the first plot.
    fig, ax = plt.subplots(figsize=(14, 8))
    sns.lineplot(data=group, x='timestamp', y='store_quantity', hue='store_storeId', marker='o', ax=ax)
    ax.set_title(f'Stock Quantity Changes Over Time for Product ID: {product_id}, SKU: {sku_id}')
    ax.set_xlabel('Timestamp')
    ax.set_ylabel('Stock Quantity')
    ax.legend(title='Store ID', bbox_to_anchor=(1.05, 1), loc='upper left')
    ax.grid(True)
    fig.tight_layout()
    plt.show()
    # Release the figure so memory does not grow with the number of products.
    plt.close(fig)
