In [254]:
import numpy as np
import pandas as pd
import time

import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots

import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

pd.set_option("display.float_format", lambda x: "%.2f" % x)
alt.data_transformers.disable_max_rows()
pd.set_option("display.max_columns", None)

In [255]:
# Set location of the dataset
filepath = "../scripts/datasets/preprocessed_data_filtered_status.csv"

# Load datasets
main_df = pd.read_csv(filepath)
main_df.head(2)

Unnamed: 0,order_date,order_region,order_country,order_city,market,sales,order_item_quantity,shipping_date,days_for_shipping_real,days_for_shipment_scheduled,category_name,delivery_status,late_delivery_risk,shipping_mode,order_status,order_profit_per_order,sales_per_item,shipping_days_difference,order_year,order_month,order_year_month_date
0,2015-01-01 00:00:00,Central America,Mexico,Mexico City,Latin America,299.98,1,1/3/2015 0:00,2,4,Camping & Hiking,Advance shipping,0,Standard Class,CLOSED,88.79,299.98,2,2015,January,2015-01-01
1,2015-01-01 00:21:00,South America,Colombia,Dos Quebradas,Latin America,199.99,1,1/4/2015 0:21,3,4,Water Sports,Advance shipping,0,Standard Class,PENDING_PAYMENT,91.18,199.99,1,2015,January,2015-01-01


In [256]:
# Filter year anad put it into new variable
df_2017 = main_df[main_df["order_year"]==2017].reset_index(drop=True)
df_2017.head(2)

Unnamed: 0,order_date,order_region,order_country,order_city,market,sales,order_item_quantity,shipping_date,days_for_shipping_real,days_for_shipment_scheduled,category_name,delivery_status,late_delivery_risk,shipping_mode,order_status,order_profit_per_order,sales_per_item,shipping_days_difference,order_year,order_month,order_year_month_date
0,2017-01-01 00:33:00,Asiatic Russia/European Russia,Russia,Cheliábinsk,Europe,99.96,2,1/6/2017 0:33,5,2,Indoor/Outdoor Games,Late delivery,1,Second Class,COMPLETE,-58.78,49.98,-3,2017,January,2017-01-01
1,2017-01-01 00:33:00,Asiatic Russia/European Russia,Russia,Cheliábinsk,Europe,39.99,1,1/6/2017 0:33,5,2,Shop By Sport,Late delivery,1,Second Class,COMPLETE,15.98,39.99,-3,2017,January,2017-01-01


### 1. Sales and Order Analysis

In [257]:
grouped_sales = df_2017.groupby(["order_region", "market"])["sales"].sum().reset_index()
grouped_sales["sales"] = round(grouped_sales["sales"], 2)
grouped_sales.head()

Unnamed: 0,order_region,market,sales
0,Asiatic Russia/European Russia,Europe,39011.36
1,Australia/New Zealand,Pacific Asia,195812.4
2,Caribbean,Latin America,781414.93
3,Central America,Latin America,2794204.96
4,Central Asia,Pacific Asia,11906.64


In [258]:
chartSales = alt.Chart(grouped_sales).mark_bar(orient="horizontal").encode(
    y=alt.Y("order_region:N", sort="-x", title="Region"),
    x=alt.X("sales:Q", title="Sales"),
    color=alt.Color("market", title="Market"),
    tooltip=["order_region", "market", "sales"]
).properties(
    title="Sales Across Different Regions and Markets, 2017",
    width=600,
    height=450
)


chartSales

In [259]:
grouped_order = df_2017.groupby(["order_region", "market"])["order_item_quantity"].sum().reset_index()
grouped_order.head()

Unnamed: 0,order_region,market,order_item_quantity
0,Asiatic Russia/European Russia,Europe,412
1,Australia/New Zealand,Pacific Asia,841
2,Caribbean,Latin America,8395
3,Central America,Latin America,29936
4,Central Asia,Pacific Asia,159


In [260]:
chartOrder = alt.Chart(grouped_order).mark_bar(orient="horizontal").encode(
    y=alt.Y("order_region:N", sort="-x", title="Region"),
    x=alt.X("order_item_quantity:Q", title="Order Item"),
    color=alt.Color("market", title="Market"),
    tooltip=["order_region", "market", "order_item_quantity"]
).properties(
    title="Order Across Different Regions and Markets, 2017",
    width=600,
    height=450
)

chartOrder

In [261]:
# Convert "order_date" to datetime and set it as the index
main_df2 = main_df.copy()
main_df2["order_date"] = pd.to_datetime(main_df2["order_date"])
main_df2.set_index("order_date", inplace=True)

# Resample the DataFrame for every day and sum the item quantity for each region
daily_orders = main_df2.groupby(["market","order_region"]).resample("D")["order_item_quantity"].sum().reset_index()

# Calculate the average daily order for each region by year
avg_daily_orders = daily_orders.groupby(["market", daily_orders["order_date"].dt.year, "order_region"])["order_item_quantity"].mean().reset_index()
avg_daily_orders.rename(columns={"order_date": "order_year"}, inplace=True)

avg_df_2017_order = avg_daily_orders[avg_daily_orders["order_year"] == 2017].reset_index(drop=True)
avg_df_2017_order

Unnamed: 0,market,order_year,order_region,order_item_quantity
0,Africa,2017,Eastern Africa,25.82
1,Africa,2017,Middle Africa,22.27
2,Africa,2017,Northern Africa,39.67
3,Africa,2017,Southern Africa,15.19
4,Africa,2017,Western Africa,53.89
5,Europe,2017,Asiatic Russia/European Russia,22.89
6,Europe,2017,Eastern Europe,32.72
7,Europe,2017,Northern Europe,25.91
8,Europe,2017,Southern Europe,24.59
9,Europe,2017,Western Europe,72.42


In [262]:
# Resample the DataFrame for every day and sum the item quantity for each region
daily_sales = main_df2.groupby(["market","order_region"]).resample("D")["sales"].sum().reset_index()

# Calculate the average daily order for each region by year
avg_daily_sales = daily_sales.groupby(["market", daily_sales["order_date"].dt.year, "order_region"])["sales"].mean().reset_index()
avg_daily_sales.rename(columns={"order_date": "order_year"}, inplace=True)

avg_df_2017_sales = avg_daily_sales[avg_daily_sales["order_year"] == 2017].reset_index(drop=True)
avg_df_2017_sales

Unnamed: 0,market,order_year,order_region,sales
0,Africa,2017,Eastern Africa,2440.14
1,Africa,2017,Middle Africa,1921.33
2,Africa,2017,Northern Africa,3853.51
3,Africa,2017,Southern Africa,1148.04
4,Africa,2017,Western Africa,4499.06
5,Europe,2017,Asiatic Russia/European Russia,2167.3
6,Europe,2017,Eastern Europe,2894.95
7,Europe,2017,Northern Europe,3319.3
8,Europe,2017,Southern Europe,3120.02
9,Europe,2017,Western Europe,9067.96


In [263]:
chartAvgOrder = alt.Chart(avg_df_2017_order).mark_bar(orient="horizontal").encode(
    y=alt.Y("order_region:N", sort="-x", title="Region"),
    x=alt.X("order_item_quantity:Q", title="Order Item"),
    color=alt.Color("market", title="Market"),
    tooltip=["order_region", "market", "order_item_quantity"]
).properties(
    title="Average Daily Order Across Different Regions and Markets, 2017",
    width=600,
    height=450
)


chartAvgOrder

In [264]:
chartAvgSales = alt.Chart(avg_df_2017_sales).mark_bar(orient="horizontal").encode(
    y=alt.Y("order_region:N", sort="-x", title="Region"),
    x=alt.X("sales:Q", title="Sales"),
    color=alt.Color("market", title="Market"),
    tooltip=["order_region", "market", "sales"]
).properties(
    title="Average Daily Sales Across Different Regions and Markets, 2017",
    width=600,
    height=450
)


chartAvgSales

Sales and Order Volume Insights:
* Western Europe, Central America, and South America are the top three regions in terms of order volume and sales.
* Melanesia, Central Asia, and Southern Africa have the lowest order volume and sales.
* Western Asia has a high average daily sales value but a low average daily order volume, suggesting high-value orders.

Market Performance:
* Europe and Latin America markets have the highest sales and order volumes.
* Africa has the lowest sales and order volumes among all markets.

Recommendations:
* Focus on increasing sales and order volume in the low-performing regions such as Melanesia, Central Asia, and Southern Africa. This may involve implementing targeted marketing campaigns or improving the product offerings in these regions.
* Maintain and strengthen the position in high-performing regions like Western Europe, Central America, and South America. Keep a close eye on the competition and make sure to continue delivering a great customer experience.
* Investigate the high-value orders in Western Asia and identify opportunities to further capitalize on these sales. This may involve identifying the best-selling products and offering promotions or bundle deals to boost sales.
* Explore the reasons behind the low order volume in Northern America, despite being part of the USCA market. Consider targeted strategies to increase order volume in this regio*

### 2. Delivery Efficiency Analysis

In [265]:
# Calculate the On-Time Delivery Rate (OTD) using late_delivery_risk column
otd_market = df_2017.groupby(["market", "shipping_mode"])["late_delivery_risk"].apply(lambda x: (1 - x.mean()) * 100).reset_index()

# Rename the late_delivery_risk column to on_time_delivery
otd_market.rename(columns={"late_delivery_risk": "on_time_delivery"}, inplace=True)
otd_market["on_time_delivery"] = round(otd_market["on_time_delivery"], 2)

otd_market

Unnamed: 0,market,shipping_mode,on_time_delivery
0,Africa,First Class,0.0
1,Africa,Same Day,53.12
2,Africa,Second Class,22.73
3,Africa,Standard Class,62.96
4,Europe,First Class,0.0
5,Europe,Same Day,43.24
6,Europe,Second Class,19.66
7,Europe,Standard Class,60.91
8,Latin America,First Class,0.0
9,Latin America,Same Day,50.98


In [266]:
# Define the desired order for the x-axis values
shipping_mode_order = ["Same Day", "First Class", "Second Class", "Standard Class"]

otd_chart = alt.Chart(otd_market).mark_bar().encode(
    x=alt.X("shipping_mode:N", title="Shipping Mode", sort=shipping_mode_order),
    y=alt.Y("on_time_delivery:Q", title="On-Time Delivery Rate (%)", scale=alt.Scale(domain=(0, 100))),
    color=alt.Color("shipping_mode:N", title="Shipping Mode"),
    column=alt.Column("market:N", title="Market"),
    tooltip=["market", "shipping_mode", "on_time_delivery"]
).properties(
    title=alt.TitleParams(
        text="On time Delivery Rate Across Market and Shipping Mode, 2017",
        anchor="middle",
        offset=10
    ),
    width=150,
    height=300
)

otd_chart

In [267]:
# Calculate the average days for shipping (actual vs. scheduled)
avg_shipping_days = df_2017.groupby(["market", "shipping_mode"]).agg({
    "days_for_shipping_real": "mean",
    "days_for_shipment_scheduled": "mean"
}).reset_index()

avg_shipping_days

Unnamed: 0,market,shipping_mode,days_for_shipping_real,days_for_shipment_scheduled
0,Africa,First Class,2.0,1.0
1,Africa,Same Day,0.47,0.0
2,Africa,Second Class,3.72,2.0
3,Africa,Standard Class,4.02,4.0
4,Europe,First Class,2.0,1.0
5,Europe,Same Day,0.57,0.0
6,Europe,Second Class,3.99,2.0
7,Europe,Standard Class,3.97,4.0
8,Latin America,First Class,2.0,1.0
9,Latin America,Same Day,0.49,0.0


In [268]:
# Define the desired order for the x-axis values
shipping_mode_order = ["Same Day", "First Class", "Second Class", "Standard Class"]

# Average days for shipping (actual vs. scheduled) by Region, Market, and Shipping Mode
avg_shipping_days_chart = alt.Chart(avg_shipping_days).mark_bar().encode(
    x=alt.X("shipping_mode:N", title="Shipping Mode", sort=shipping_mode_order),
    y=alt.Y("days_for_shipping_real:Q", title="Average Days for Shipping (Real)"),
    y2=alt.Y2("days_for_shipment_scheduled:Q", title="Average Days for Shipping (Scheduled)"),
    color=alt.Color("shipping_mode:N", title="Shipping Mode"),
    column=alt.Column("market:N", title="Market"),
    tooltip=["market", "shipping_mode", "days_for_shipping_real", "days_for_shipment_scheduled"]
).properties(
    title=alt.TitleParams(
        text="Average Days for Shipping Real vs Scheduled Across Market and Shipping Mode, 2017",
        anchor="middle",
        offset=10
    ),
    width=150,
    height=300
)

avg_shipping_days_chart

On-Time Delivery (OTD) Insights:
* First Class shipping has a 0% on-time delivery rate across all markets, indicating significant inefficiencies.
* Same Day shipping has the highest on-time delivery rate in the USCA market (80%), but it"s lower in other markets.
* Standard Class shipping generally performs better than Second Class shipping in terms of on-time delivery, with the best performance in the Africa market (62.96%).

Average Shipping Days Insights:
* First Class shipping takes 2 days on average, while it is scheduled for 1 day, indicating a discrepancy between the actual and scheduled shipping time.
* Same Day shipping is faster than scheduled across all markets, with the best performance in the USCA market (0.20 days on average).
* Second Class shipping generally takes longer than scheduled, with the biggest discrepancy in the Pacific Asia market (4.07 days on average vs. 2 days scheduled).
* Standard Class shipping meets the scheduled shipping time in Africa, Latin America, and Pacific Asia markets, but it"s slightly faster in the Europe and USCA markets.

Recommendations:
* Investigate and address the issues causing the 0% on-time delivery rate for First Class shipping across all markets. This may involve analyzing the causes of delays and implementing process improvements or working closely with shipping partners.
* Monitor and improve the shipping performance for Second Class shipping, as it consistently takes longer than scheduled. Consider collaborating with shipping partners to identify bottlenecks and develop solutions to reduce shipping time.
* Maintain the good performance of Standard Class shipping in terms of meeting or exceeding the scheduled shipping time, and aim to replicate the faster-than-scheduled performance observed in Europe and USCA markets across all regions.

### 3. Products Performance Analysis

In [269]:
# 1. Top and bottom 5 products by sales in each region and market.
product_sales = df_2017.groupby(["market", "category_name"])["sales"].sum().reset_index()
top5_product = product_sales.groupby(["market"]).apply(lambda x: x.nlargest(5, "sales")).reset_index(drop=True)
top5_product["sales"] = round(top5_product["sales"], 2)

top5_product

Unnamed: 0,market,category_name,sales
0,Africa,Fishing,46397.68
1,Africa,Cleats,33654.39
2,Africa,Cardio Equipment,29567.1
3,Africa,Camping & Hiking,27598.16
4,Africa,Women's Apparel,22400.0
5,Europe,Fishing,751162.46
6,Europe,Computers,507000.0
7,Europe,Cleats,463004.27
8,Europe,Camping & Hiking,426871.56
9,Europe,Cardio Equipment,389511.06


In [270]:
# 1. Top and bottom 5 products by sales in each region and market.
product_sales = df_2017.groupby(["market", "category_name"])["sales"].sum().reset_index()
bottom5_product = product_sales.groupby(["market"]).apply(lambda x: x.nsmallest(5, "sales")).reset_index(drop=True)
bottom5_product["sales"] = round(bottom5_product["sales"], 2)

bottom5_product

Unnamed: 0,market,category_name,sales
0,Africa,Golf Apparel,119.94
1,Africa,Hunting & Shooting,119.96
2,Africa,Boxing & MMA,219.88
3,Africa,Lacrosse,274.89
4,Africa,Baseball & Softball,354.92
5,Europe,Girls' Apparel,179.99
6,Europe,Accessories,349.86
7,Europe,Golf Balls,374.8
8,Europe,Electronics,1185.72
9,Europe,CDs,1467.7


In [271]:
    def top_bottom_products(dataframe, market):
        if market == "All Markets":
            grouped_bar = dataframe.groupby(["category_name"]).agg(total_sales=("sales", "sum")).reset_index()
        else:
            dataframe = dataframe[dataframe["market"]==market]
            grouped_bar = dataframe.groupby(["category_name"]).agg(total_sales=("sales", "sum")).reset_index()

        grouped_bar["total_sales"] = round(grouped_bar["total_sales"], 2)

        # sort by total_sales and split into two groups
        grouped_bar = grouped_bar.sort_values(by="total_sales", ascending=False).reset_index()

        # format total_sales as a string with $ and thousand separator
        grouped_bar["total_sales_formated"] = grouped_bar["total_sales"].apply(lambda x: f"${x:,.2f}")

        top_5 = grouped_bar.head(5)
        bottom_5 = grouped_bar.tail(5)

        # create a dictionary to map each category to a unique color
        category_colors = {
            "Camping & Hiking": "#1f77b4",
            "Water Sports": "#ff7f0e",
            "Women's Apparel": "#2ca02c",
            "Men's Footwear": "#d62728",
            "Indoor/Outdoor Games": "#9467bd",
            "Accessories": "#8c564b",
            "Cleats": "#e377c2",
            "Trade-In": "#7f7f7f",
            "Cardio Equipment": "#bcbd22",
            "Shop By Sport": "#17becf",
            "Hockey": "#ff5733",
            "Electronics": "#e74c3c",
            "Fishing": "#3498db",
            "Golf Balls": "#9b59b6",
            "Lacrosse": "#e67e22",
            "Baseball & Softball": "#34495e",
            "Golf Gloves": "#f1c40f",
            "Girls' Apparel": "#2ecc71",
            "Fitness Accessories": "#1abc9c",
            "Hunting & Shooting": "#95a5a6",
            "Tennis & Racquet": "#2c3e50",
            "Golf Shoes": "#bdc3c7",
            "Golf Apparel": "#d35400",
            "Boxing & MMA": "#7f8c8d",
            "Men's Golf Clubs": "#2980b9",
            "Kids' Golf Clubs": "#16a085",
            "Soccer": "#c0392b",
            "Women's Golf Clubs": "#f39c12",
            "Golf Bags & Carts": "#27ae60",
            "Strength Training": "#e67e22",
            "As Seen on  TV!": "#8e44ad",
            "Basketball": "#f39c12",
            "Books ": "#1abc9c",
            "Baby ": "#95a5a6",
            "CDs ": "#d35400",
            "Cameras ": "#bdc3c7",
            "Children's Clothing": "#9b59b6",
            "Computers": "#7f8c8d",
            "Consumer Electronics": "#2c3e50",
            "Crafts": "#27ae60",
            "DVDs": "#f1c40f",
            "Garden": "#17becf",
            "Health and Beauty": "#bcbd22",
            "Men's Clothing": "#e74c3c",
            "Music": "#8c564b",
            "Pet Supplies": "#2ecc71",
            "Sporting Goods": "#7f7f7f",
            "Toys": "#d62728",
            "Video Games": "#9467bd",
            "Women's Clothing": "#3498db"
        }

        # map category colors to the top_5 and bottom_5 dataframes
        top_5["color"] = top_5["category_name"].map(category_colors)
        bottom_5["color"] = bottom_5["category_name"].map(category_colors)

        # -----------------------------------------------------------

        # create the subplots
        fig = make_subplots(rows=1, cols=2, horizontal_spacing=0.1, subplot_titles=(
            f"<b>Top 5 High-Performing Categories in {market}</b>", 
            f"<b>Bottom 5 Under-Performing Categories in {market}</b>"))

        # add the top 5 subplot
        fig.add_trace(go.Bar(x=top_5["total_sales"], y=top_5["category_name"], orientation="h",
                            text=top_5["total_sales_formated"], name="", marker=dict(color=top_5["color"]),
                            textfont=dict(color="white")),
                    row=1, col=1)
        fig.update_yaxes(title="", categoryorder="total ascending", row=1, col=1)
        fig.update_xaxes(title="Total Sales", row=1, col=1, showgrid=False)

        # add the bottom 5 subplot
        fig.add_trace(go.Bar(x=bottom_5["total_sales"], y=bottom_5["category_name"], orientation="h",
                            text=bottom_5["total_sales_formated"], name="", marker=dict(color=bottom_5["color"]),
                            textfont=dict(color="white")),
                    row=1, col=2)
        fig.update_yaxes(title="", categoryorder="total ascending", row=1, col=2)
        fig.update_xaxes(title="Total Sales", row=1, col=2, showgrid=False)

        # update the layout
        fig.update_layout(
            height=300,
            width=1200,
            template="plotly_dark",
            showlegend=False,
            margin=dict(l=0, r=0, t=20, b=0),
        )
        fig.update_layout({"plot_bgcolor": "rgba(0, 0, 0, 0)","paper_bgcolor": "rgba(0, 0, 0, 0)",})
        fig.update_annotations(font_size=12)

        return fig

In [272]:
display(top_bottom_products(df_2017, "All Markets"), top_bottom_products(df_2017, "Africa"), top_bottom_products(df_2017, "Europe"), 
        top_bottom_products(df_2017, "Latin America"), top_bottom_products(df_2017, "Pacific Asia"), top_bottom_products(df_2017, "USCA"))



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/

In [273]:
df_2016 = main_df[main_df["order_year"]==2016]
product_sales_2016 = df_2016.groupby(["category_name"])["sales"].sum().reset_index()
product_sales_2017 = df_2017.groupby(["category_name"])["sales"].sum().reset_index()
product_sales_2016

Unnamed: 0,category_name,sales
0,Accessories,56752.29
1,Baseball & Softball,35172.77
2,Boxing & MMA,21108.48
3,Camping & Hiking,1420105.37
4,Cardio Equipment,1251715.86
5,Cleats,1561359.77
6,Electronics,151029.34
7,Fishing,2423478.89
8,Fitness Accessories,11441.73
9,Girls' Apparel,60691.88


In [274]:
sales_growth = pd.merge(product_sales_2016, product_sales_2017, on=["category_name"], suffixes=("_2016", "_2017"))
sales_growth["growth_rate"] = (sales_growth["sales_2017"] - sales_growth["sales_2016"]) / sales_growth["sales_2016"] * 100
sales_growth

Unnamed: 0,category_name,sales_2016,sales_2017,growth_rate
0,Accessories,56752.29,16993.2,-70.06
1,Baseball & Softball,35172.77,21396.95,-39.17
2,Boxing & MMA,21108.48,43577.64,106.45
3,Camping & Hiking,1420105.37,1105126.36,-22.18
4,Cardio Equipment,1251715.86,969693.42,-22.53
5,Cleats,1561359.77,1147670.16,-26.5
6,Electronics,151029.34,45447.7,-69.91
7,Fishing,2423478.89,1818309.13,-24.97
8,Fitness Accessories,11441.73,9498.9,-16.98
9,Girls' Apparel,60691.88,17338.99,-71.43


In [275]:
chart = alt.Chart(sales_growth).mark_bar().encode(
    x=alt.X("category_name", sort="y", axis=alt.Axis(title="Category Name", labelAngle=-45)),
    y=alt.Y("growth_rate", axis=alt.Axis(title="Growth Rate (%)")),
    color=alt.condition(
        alt.datum.growth_rate > 0,
        alt.value("green"),  # The positive values will be green
        alt.value("red")  # The negative values will be red
    ),
    tooltip=["category_name", "growth_rate"]
).properties(
    title="Sales Growth by Product Category",
    width=600,
    height=300
)

chart

Top 5 Products by Sales:
* Africa: Fishing, Cleats, Cardio Equipment, Camping & Hiking, and Women"s Apparel are the top-selling products.
* Europe: Fishing, Computers, Cleats, Camping & Hiking, and Cardio Equipment are the top-selling products.
* Latin America: Fishing, Camping & Hiking, Cleats, Cardio Equipment, and Women"s Apparel are the top-selling products.
* Pacific Asia: Garden, Computers, Crafts, Music, and Health and Beauty are the top-selling products.
* USCA: Camping & Hiking, Cleats, Cardio Equipment, Men"s Footwear, and Women"s Apparel are the top-selling products.

Bottom 5 Products by Sales:
* Africa: Golf Apparel, Hunting & Shooting, Boxing & MMA, Lacrosse, and Baseball & Softball are the least-selling products.
* Europe: Girls" Apparel, Accessories, Golf Balls, Electronics, and CDs are the least-selling products.
* Latin America: Fitness Accessories, Strength Training, Hockey, Golf Bags & Carts, and Tennis & Racquet are the least-selling products.
* Pacific Asia: Hockey, Tennis & Racquet, Fitness Accessories, Golf Apparel, and Trade-In are the least-selling products.
* USCA: Golf Balls, Golf Gloves, Tennis & Racquet, Electronics, and Girls" Apparel are the least-selling products.

Sales Growth Analysis:
* The Boxing & MMA, Golf Apparel, Golf Gloves, Golf Shoes, Hunting & Shooting, Lacrosse, and Trade-In categories experienced positive sales growth between 2016 and 2017.
* The Accessories, Baseball & Softball, Camping & Hiking, Cardio Equipment, Cleats, Electronics, Fishing, Fitness Accessories, Girls" Apparel, Golf Balls, Hockey, Indoor/Outdoor Games, Men"s Footwear, Shop By Sport, Tennis & Racquet, Water Sports, and Women"s Apparel categories experienced negative sales growth between 2016 and 2017.

Recommendations:
* Focus on promoting and expanding the top-selling product categories in each market to capitalize on their popularity and maximize revenue.
* Investigate the reasons for low sales in the bottom 5 product categories in each market. Consider product improvements, better marketing strategies, or discontinuing underperforming products to optimize the product mix.
* Analyze the factors contributing to positive sales growth in the Boxing & MMA, Golf Apparel, Golf Gloves, Golf Shoes, Hunting & Shooting, Lacrosse, and Trade-In categories, and apply these strategies to other product categories to promote growth.
* Address the issues leading to negative sales growth in various categories, such as product quality, pricing, marketing, or distribution channels, to improve sales performance and ensure overall business growth.

### 4. Hypothesis Testing

To determine if there is a significant difference in delivery efficiency among different shipping modes, we need to perform hyptothesis testing. This can help us understand which shipping modes are more efficient than others in terms of on-time delivery. With this information, we can prioritize certain shipping modes or make adjustments to the delivery process to improve efficiency and customer satisfaction.

In [277]:
df_2017.head(2)

Unnamed: 0,order_date,order_region,order_country,order_city,market,sales,order_item_quantity,shipping_date,days_for_shipping_real,days_for_shipment_scheduled,category_name,delivery_status,late_delivery_risk,shipping_mode,order_status,order_profit_per_order,sales_per_item,shipping_days_difference,order_year,order_month,order_year_month_date
0,2017-01-01 00:33:00,Asiatic Russia/European Russia,Russia,Cheliábinsk,Europe,99.96,2,1/6/2017 0:33,5,2,Indoor/Outdoor Games,Late delivery,1,Second Class,COMPLETE,-58.78,49.98,-3,2017,January,2017-01-01
1,2017-01-01 00:33:00,Asiatic Russia/European Russia,Russia,Cheliábinsk,Europe,39.99,1,1/6/2017 0:33,5,2,Shop By Sport,Late delivery,1,Second Class,COMPLETE,15.98,39.99,-3,2017,January,2017-01-01


In [279]:
from scipy.stats import f_oneway

# Group by shipping_mode and calculate the on-time delivery rate for each group
shipping_mode_deliver = df_2017.copy()
shipping_mode_deliver['on_time_delivery'] = (shipping_mode_deliver['days_for_shipping_real'] <= shipping_mode_deliver['days_for_shipment_scheduled']).astype(int)
shipping_mode_delivery = shipping_mode_deliver.groupby("shipping_mode")["on_time_delivery"].agg(["sum", "count"]).reset_index()

# Calculate the on-time delivery rate for each shipping mode
shipping_mode_delivery["on_time_delivery_rate"] = shipping_mode_delivery["sum"] / shipping_mode_delivery["count"]

shipping_mode_delivery

Unnamed: 0,shipping_mode,sum,count,on_time_delivery_rate
0,First Class,0,7579,0.0
1,Same Day,1299,2753,0.47
2,Second Class,2053,9917,0.21
3,Standard Class,18532,30599,0.61


In [281]:
# Filter the data for each shipping mode
standard_class = shipping_mode_deliver[shipping_mode_deliver["shipping_mode"] == "Standard Class"]["on_time_delivery"]
second_class = shipping_mode_deliver[shipping_mode_deliver["shipping_mode"] == "Second Class"]["on_time_delivery"]
first_class = shipping_mode_deliver[shipping_mode_deliver["shipping_mode"] == "First Class"]["on_time_delivery"]
same_day = shipping_mode_deliver[shipping_mode_deliver["shipping_mode"] == "Same Day"]["on_time_delivery"]

In [282]:
f_statistic, p_value = f_oneway(standard_class, second_class, first_class, same_day)

In [283]:
alpha = 0.05  # Set significance level

if p_value < alpha:
    print("Reject the null hypothesis: There is a significant difference in on-time delivery rates among shipping modes.")
else:
    print("Fail to reject the null hypothesis: There is no significant difference in on-time delivery rates among shipping modes.")

Reject the null hypothesis: There is a significant difference in on-time delivery rates among shipping modes.


The delivery efficiency analysis revealed some important insights about on-time delivery rates and average shipping days across different shipping modes and markets. First Class shipping showed a concerning 0% on-time delivery rate, while Same Day shipping performed well, particularly in the USCA market. Standard Class shipping outperformed Second Class shipping in most markets, with the best on-time delivery rate observed in the Africa market. The hypothesis testing confirmed that there is a significant difference in on-time delivery rates among shipping modes.

Recommendations:

* Investigate First Class shipping inefficiencies: Conduct a thorough analysis of the factors causing the 0% on-time delivery rate for First Class shipping across all markets. This may involve working closely with shipping partners, identifying bottlenecks in the process, and implementing improvements to address these issues.

* Improve Second Class shipping performance: As Second Class shipping consistently takes longer than scheduled, consider collaborating with shipping partners to identify areas for improvement and reduce shipping time. This can lead to better on-time delivery rates and increased customer satisfaction.

* Leverage the success of Standard Class and Same Day shipping: Continue to maintain the good performance of Standard Class shipping in meeting or exceeding scheduled shipping times. Aim to replicate the faster-than-scheduled performance observed in Europe and USCA markets across all regions. Additionally, focus on promoting Same Day shipping in markets where it shows high on-time delivery rates, such as the USCA market.

By combining the insights from the delivery efficiency analysis and hypothesis testing, the company can focus on improving the overall delivery process, enhancing customer satisfaction, and building a more reliable and efficient supply chain.