In [None]:
# Select the matplotlib backend for this notebook.
# NOTE(review): both magics run in order, so presumably the `widget` backend
# selected second is the one left active (it needs the ipympl package) — confirm
# which one is intended. The figures visible in this notebook are built with plotly.
# For basic inline static plots (most common)
%matplotlib inline

# For interactive matplotlib plots with zoom/pan capabilities
%matplotlib widget

# Exploratory data analysis

In [None]:
import pandas as pd
import numpy as np
import plotly.graph_objects as go
import plotly.express as px
from plotly.subplots import make_subplots
from datetime import datetime

In [None]:
# Load the three pipe-separated source tables.
orders = pd.read_csv("orders.csv/orders.csv", sep="|")
infos = pd.read_csv("infos.csv", sep="|")
items = pd.read_csv("items.csv", sep="|")

In [None]:
# Preview the first 10 rows of the raw order lines.
orders.head(10)

In [None]:
# Summary statistics of `orders` (pandas `describe` with its default settings).
orders.describe()

In [None]:
# Keep only order lines with a strictly positive sales price.
# `.copy()` detaches the filtered rows from the frame they were sliced from, so
# the column assignments made on `orders` in later cells do not raise
# SettingWithCopyWarning.
orders = orders[orders["salesPrice"] > 0].copy()

In [None]:
# Column dtypes and non-null counts of `orders`, listing every column.
orders.info(verbose=True)

In [None]:
# Preview the first rows of `infos`.
infos.head()

In [None]:
# Summary statistics of `infos` across all columns, not only the numeric ones.
infos.describe(include="all")

In [None]:
# Column dtypes and non-null counts of `infos`, listing every column.
infos.info(verbose=True)

In [None]:
# Preview the first 10 rows of `items`.
items.head(10)

In [None]:
# Summary statistics of `items` across all columns, not only the numeric ones.
items.describe(include="all")

In [None]:
# Column dtypes and non-null counts of `items`, listing every column.
items.info(verbose=True)

# Feature engineering

## Promotion

In [None]:
# Flag an order line as promotional when its price is below the highest price
# seen for the same item over the whole period.
item_max_price = orders.groupby('itemID', observed=False)['salesPrice'].transform('max')
orders['in_promotion'] = orders['salesPrice'].lt(item_max_price)

# Alternative: compare against the running (historical) maximum instead
#orders['in_promotion'] = orders['salesPrice'] < orders.groupby('itemID', observed=False)['salesPrice'].transform('cummax')

# Alternative: compare against the recommended retail price
#orders = orders.merge(items[['itemID', 'recommendedRetailPrice']], on='itemID', how='left')
#orders['in_promotion'] = orders['salesPrice'] < orders['recommendedRetailPrice']

In [None]:
# Parse the `time` column into pandas datetimes; the aggregation step further
# down reads it through the `.dt` accessor.
orders["time"] = pd.to_datetime(orders["time"])

In [None]:
# Revenue (quantity x price) per promotion category, divided by the number of
# order lines in that category
promo_groups = orders.groupby("in_promotion")
revenue_by_promo = promo_groups[["order", "salesPrice"]].apply(
    lambda grp: grp["order"].mul(grp["salesPrice"]).sum()
)
revenue_by_promo / promo_groups["order"].size()

In [None]:
# Mean order quantity per order line, by promotion category
# Summed quantity divided by the number of order lines, per promotion flag
quantity_by_promo = orders.groupby("in_promotion")["order"]
quantity_by_promo.sum() / quantity_by_promo.size()

## Aggregation

In [None]:
# Work on a copy of the orders whose timestamp column is renamed to `date`
# and truncated to the calendar day
agg_result = orders.rename(columns={"time": "date"})
agg_result["date"] = pd.to_datetime(agg_result["date"].dt.date)

In [None]:
# Aggregate the order lines to one row per (date, itemID).
#
# NOTE: this cell overwrites `agg_result` with a frame that no longer has a
# `salesPrice` column, so it can only be run once after the previous cell.
agg_result_temp = agg_result.copy()

# Revenue of each order line. `salesPrice` is treated as a per-unit price here,
# consistent with the revenue cells of this notebook (order * salesPrice) and
# with the later comparison against `recommendedRetailPrice`.
agg_result_temp['weighted_price_temp'] = agg_result_temp['salesPrice'] * agg_result_temp['order']

# Aggregate each column in its own way: total quantity, total revenue, and
# "in promotion" if any order line of that day was in promotion.
agg_result = agg_result_temp.groupby(["date", "itemID"]).agg({
    'order': 'sum',
    'weighted_price_temp': 'sum',
    'in_promotion': 'max'
}).rename(columns={'weighted_price_temp': 'weightedAveragePrice'}).reset_index()

# Quantity-weighted average unit price = total revenue / total quantity.
# (Previously this was mean(salesPrice / order), which divides a unit price by a
# quantity and is not an average price.)
agg_result['weightedAveragePrice'] = agg_result['weightedAveragePrice'] / agg_result['order']

In [None]:
# Cast the aggregated columns to their working dtypes in one pass
agg_result = agg_result.astype({
    "order": int,
    "itemID": "category",
    "in_promotion": bool,
})

## Customer rating indicator

In [None]:
# Add a column indicating whether the customer rating is missing
# (a rating of exactly 0 is treated as "missing")
items["customerRatingIndicator"] = items["customerRating"].eq(0)

In [None]:
# Attach the item attributes to the aggregated order table.
#
# `DataFrame.join(other, on="itemID")` matches the caller's `itemID` against the
# *index* of `other` (here the default row positions of `items`), not against its
# `itemID` column, so rows would receive the attributes of the wrong item.
# `merge` on the `itemID` column performs the intended key lookup.
item_key_dtype = items["itemID"].dtype
joined_tables = (
    agg_result
    # Align the key dtype with `items` (itemID may have been cast to category).
    .astype({"itemID": item_key_dtype})
    # many_to_one: fail loudly if `items` lists an itemID more than once, which
    # would silently duplicate order rows.
    .merge(items, how="left", on="itemID", validate="many_to_one")
)
result = joined_tables.copy()
result.head()

In [None]:
# Set the column dtypes of the merged table
result = result.astype({
    "itemID": "category",
    "brand": "category",
    "manufacturer": "category",
    # "customerRating": int,  (cast currently disabled)
    "category1": "category",
    "category2": "category",
    "category3": "category",
    "recommendedRetailPrice": float,
})

In [None]:
# Check the dtype of every column of `result`.
result.dtypes

In [None]:
# Preview the first rows of `result`.
result.head()

## Discount of recommended price

In [None]:
# Relative deviation of the average selling price from the recommended retail
# price: (price - recommended) / recommended
recommended_price = result["recommendedRetailPrice"]
result["recommendedRetailPriceDiscount"] = (
    result["weightedAveragePrice"].sub(recommended_price).div(recommended_price)
)

## Date and time features

In [None]:
# Add date features
result["weekDay"] = result["date"].dt.weekday + 1  # 1 = Monday ... 7 = Sunday
result["day"] = result["date"].dt.day
result["week"] = result["date"].dt.isocalendar().week

# Week of the month, with weeks starting on Monday and the week containing the
# 1st counted as week 1. With w = weekday of the 1st (Monday = 0), day d falls in
# week (d + w - 1) // 7 + 1. The "- 1" matters: without it, day 7 of a month
# that starts on a Monday would land in week 2 instead of week 1.
day_of_month = result["date"].dt.day
first_of_month = result["date"] - pd.to_timedelta(day_of_month - 1, unit="D")
result["weekOfMonth"] = (day_of_month + first_of_month.dt.weekday - 1) // 7 + 1

In [None]:
# Preview the first rows of `result` with the date features added.
result.head()

## Lagging features

## Rolling windows

## Filter

In [None]:
# Interquartile, z-score

## FFT

## Nonlinear transformations

## Zeros - data cleaning

# Normalization

In [None]:
# Revenue (quantity x average price) per promotion category, divided by the
# number of (date, item) rows in that category
result_promo_groups = result.groupby("in_promotion")
result_revenue_by_promo = result_promo_groups[["order", "weightedAveragePrice"]].apply(
    lambda grp: grp["order"].mul(grp["weightedAveragePrice"]).sum()
)
result_revenue_by_promo / result_promo_groups["order"].size()

In [None]:
# Summed quantity divided by the number of (date, item) rows, per promotion flag
result_quantity_by_promo = result.groupby("in_promotion")["order"]
result_quantity_by_promo.sum() / result_quantity_by_promo.size()

In [None]:
# Write the feature table to results.csv, leaving out the row index.
result.to_csv("results.csv", index=False)

In [None]:
# (rows, columns) of the feature table.
result.shape

In [None]:
# Sample of products: draw up to 100 distinct item IDs (fixed seed, so the same
# items are picked on every run) and keep all of their rows.
unique_item_ids = pd.Series(result["itemID"].astype(int).unique())
# `sample` raises ValueError when asked for more rows than exist, so cap the
# sample size at the number of distinct items.
sampled_product_ids = unique_item_ids.sample(
    n=min(100, len(unique_item_ids)), random_state=1
)
sampled_data = result[result["itemID"].isin(sampled_product_ids)].copy()

In [None]:
# 3D trajectories of the sampled items: order quantity (x) against days since
# the first date (y) and average price (z). Each item contributes up to two
# traces, one per promotion status.

# Calculate days from start (starting from day 1)
first_date = sampled_data['date'].min()
sampled_data['days_from_start'] = (sampled_data['date'] - first_date).dt.days + 1

# Sort data by itemID and days to ensure proper line connections
sampled_data = sampled_data.sort_values(['itemID', 'days_from_start']).reset_index(drop=True)

# One colour range shared by every trace. Without explicit cmin/cmax each trace
# scales its colours to its own values, so ratings would not be comparable
# between items and the single colorbar would describe one trace only.
rating_min = sampled_data['customerRating'].min()
rating_max = sampled_data['customerRating'].max()
shared_color_range = (
    dict(cmin=float(rating_min), cmax=float(rating_max))
    if pd.notna(rating_min) and pd.notna(rating_max)
    else {}  # no usable ratings: leave the colour range to plotly
)

# Create the 3D scatter plot
fig = go.Figure()

# Define marker symbols for promotion status
marker_symbols = {True: 'diamond', False: 'circle'}

# The colorbar is attached to the first trace that is actually drawn, and each
# promotion status gets its legend entry from the first trace with that status.
# Tying both to the first item would drop them whenever that item has no rows
# for a given status.
colorbar_attached = False
statuses_in_legend = set()

# Create separate traces for each itemID and promotion status combination
for item_id, item_data in sampled_data.groupby('itemID', observed=True):
    for promo_status in [True, False]:
        # Rows are already ordered by day within each item (sorted above).
        subset = item_data[item_data['in_promotion'] == promo_status]
        if subset.empty:
            continue

        show_colorbar = not colorbar_attached
        colorbar_attached = True
        show_in_legend = promo_status not in statuses_in_legend
        statuses_in_legend.add(promo_status)

        marker = dict(
            # Marker size grows with the relative price deviation (base size 5)
            size=5 + (subset['recommendedRetailPriceDiscount'] * 2),
            color=subset['customerRating'],
            colorscale='Viridis',
            symbol=marker_symbols[promo_status],
            opacity=0.8,
            showscale=show_colorbar,
            **shared_color_range,
        )
        if show_colorbar:
            marker['colorbar'] = dict(
                title="Customer Rating",
                x=1.1
            )

        fig.add_trace(go.Scatter3d(
            x=subset['order'],
            y=subset['days_from_start'].astype(int),
            z=subset['weightedAveragePrice'],
            mode='lines+markers',
            line=dict(
                color='red' if promo_status else 'blue',
                width=2
            ),
            marker=marker,
            name=f'Item {item_id} - Promo: {promo_status}',
            legendgroup=f'promo_{promo_status}',  # Group legend items
            legendgrouptitle_text=f'Promotion: {promo_status}',
            showlegend=show_in_legend,  # One legend entry per promotion status
            hovertemplate=(
                f'<b>Item ID:</b> {item_id}<br>' +
                '<b>Order:</b> %{x}<br>' +
                '<b>Days:</b> %{y}<br>' +
                '<b>Price:</b> %{z:.2f}<br>' +
                '<b>Rating:</b> %{marker.color}<br>' +
                '<b>Promotion:</b> ' + str(promo_status) + '<br>' +
                '<extra></extra>'
            )
        ))

# Update layout
fig.update_layout(
    title={
        'text': 'Product Trajectories Over Time<br><sub>Lines connect same items, Shape=Promotion, Color=Rating, Size=Discount</sub>',
        'x': 0.5,
        'xanchor': 'center'
    },
    scene=dict(
        xaxis_title='Order Quantity',
        yaxis_title='Days from Start',
        zaxis_title='Weighted Average Price',
        camera=dict(
            eye=dict(x=1.5, y=1.5, z=1.5)
        )
    ),
    width=1200,
    height=900,
    showlegend=True,
    legend=dict(
        yanchor="top",
        y=0.99,
        xanchor="left",
        x=0.01
    )
)

# Add annotation for explanation
fig.add_annotation(
    text="Lines: Connect same items over time<br>" +
         "Line Color: Red=Promotion, Blue=Regular<br>" +
         "Marker Shape: Diamond=Promotion, Circle=Regular<br>" +
         "Marker Color: Customer Rating<br>" +
         "Marker Size: Discount Percentage",
    xref="paper", yref="paper",
    x=0.02, y=0.02,
    showarrow=False,
    font=dict(size=10),
    bgcolor="rgba(255,255,255,0.9)",
    bordercolor="black",
    borderwidth=1
)

# Show the plot
fig.show()

# Data statistics

In [None]:
# Variance, mean, median, and distribution of order quantity, price, discount and customer rating

In [None]:
# Figure 1: histogram of the total order quantity per day
daily_order_totals = result.groupby("date")["order"].sum()
fig1 = px.histogram(
    daily_order_totals,
    x='order',
    title='Distribution of Order Quantities per Day',
    labels={'order': 'Order Quantity', 'count': 'Frequency'},
    nbins=30,
)
fig1.update_layout(
    dict(xaxis_title='Order Quantity', yaxis_title='Frequency', width=800, height=500)
)
fig1.show()

# Per-item totals, plus a flag telling whether the item was in promotion on at
# least one day
order_totals = result.groupby("itemID")["order"].sum()
item_stats = result.groupby("itemID").agg(
    order=("order", "sum"),
    in_promotion=("in_promotion", "any"),
).reset_index()
promo_orders = item_stats.loc[item_stats['in_promotion'].eq(True), 'order']
regular_orders = item_stats.loc[item_stats['in_promotion'].eq(False), 'order']

# Figure 2: total order quantity per item (left) and the same totals split by
# promotion status (right)
fig2 = make_subplots(
    rows=1, cols=2,
    subplot_titles=[
        'Distribution of Order Quantities per Item',
        'Distribution of Order Quantities by Promotion Status'
    ],
    horizontal_spacing=0.1
)

# (values, trace-specific options, subplot column), in drawing order
histogram_specs = [
    (order_totals, dict(name="Total Orders per Item", showlegend=False), 1),
    (regular_orders, dict(name="Regular Price", marker_color='blue'), 2),
    (promo_orders, dict(name="In Promotion", marker_color='red'), 2),
]
for values, trace_options, subplot_col in histogram_specs:
    fig2.add_trace(
        go.Histogram(x=values, opacity=0.7, nbinsx=30, **trace_options),
        row=1, col=subplot_col
    )

fig2.update_layout(
    title_text="Distribution of Order Quantities (Side-by-side)",
    height=600,
    width=1200,
    barmode='group'  # Side-by-side bars
)

# Both subplots share the same axis titles
for subplot_col in (1, 2):
    fig2.update_xaxes(title_text="Total Order Quantity", row=1, col=subplot_col)
    fig2.update_yaxes(title_text="Number of Items", row=1, col=subplot_col)

fig2.show()

# Figure 3: distribution of the average price over all (date, item) rows
fig3 = px.histogram(
    result,
    x='weightedAveragePrice',
    title='Distribution of Weighted Average Price',
    labels={'weightedAveragePrice': 'Weighted Average Price', 'count': 'Frequency'},
    nbins=30,
)
fig3.update_layout(
    dict(xaxis_title='Weighted Average Price', yaxis_title='Frequency', width=800, height=500)
)
fig3.show()

In [None]:
# Histogram of the log of the total order quantity per day.
#
# Only `date` and `order` are needed. The previous column selection included a
# non-existent column ("we"), which made this cell fail with a KeyError.
daily_orders = result.groupby("date", as_index=False)["order"].sum()

# Take the log of each daily total. Summing per-row logs per day would give the
# log of the *product* of that day's quantities, not the log of the daily total.
log_order = daily_orders.assign(order=np.log(daily_orders["order"]))

fig1 = px.histogram(
    log_order,
    x='order',
    title='Distribution of Log of Order Quantities per Day',
    labels={'order': 'Log of Order Quantity', 'count': 'Frequency'},
    nbins=30
)
fig1.update_layout(
    xaxis_title='Log of Order Quantity',  # the plotted values are logs
    yaxis_title='Frequency',
    width=800,
    height=500
)
fig1.show()

In [None]:
def _order_histogram(data, title, x_label):
    """Return a 30-bin histogram of the `order` column of `data`.

    `title` is the figure title; `x_label` is used for the x axis.
    """
    fig = px.histogram(
        data,
        x='order',
        title=title,
        labels={'order': x_label, 'count': 'Frequency'},
        nbins=30
    )
    fig.update_layout(
        xaxis_title=x_label,
        yaxis_title='Frequency',
        width=800,
        height=500
    )
    return fig


# Raw order quantities, one value per (date, item) row of `result`.
# The title and labels now say what is plotted: these values are neither
# log-transformed nor summed per day.
fig1 = _order_histogram(
    result,
    title='Distribution of Order Quantities per Item and Day',
    x_label='Order Quantity'
)
fig1.show()

# The same quantities on a log scale.
log_order = result[["date", "itemID", "order", "weekDay", "in_promotion"]].copy()
log_order["order"] = np.log(log_order["order"])
fig2 = _order_histogram(
    log_order,
    title='Distribution of Log of Order Quantities per Item and Day',
    x_label='Log of Order Quantity'
)
fig2.show()

In [None]:
# NOTE(review): this cell plots the same two histograms as the cell before it —
# consider deleting one of them.

# Raw order quantities, one value per (date, item) row of `result`.
# The title and labels say what is plotted: these values are neither
# log-transformed nor summed per day.
fig1 = px.histogram(
    result,
    x='order',
    title='Distribution of Order Quantities per Item and Day',
    labels={'order': 'Order Quantity', 'count': 'Frequency'},
    nbins=30
)
fig1.update_layout(
    xaxis_title='Order Quantity',
    yaxis_title='Frequency',
    width=800,
    height=500
)
fig1.show()

# The same quantities on a log scale.
log_order = result[["date", "itemID", "order", "weekDay", "in_promotion"]].copy()
log_order["order"] = np.log(log_order["order"])
fig2 = px.histogram(
    log_order,
    x='order',
    title='Distribution of Log of Order Quantities per Item and Day',
    labels={'order': 'Log of Order Quantity', 'count': 'Frequency'},
    nbins=30
)
fig2.update_layout(
    xaxis_title='Log of Order Quantity',  # the plotted values are logs
    yaxis_title='Frequency',
    width=800,
    height=500
)
fig2.show()

# Models

## XGBoost

## Transformers - TimeGPT

In [None]:
# moirai

## SARIMAX

## Random forest

## Ensemble of models

## SES