In [118]:
import numpy as np
import pandas as pd
import geopandas as gpd
import time
import locale

import plotly.express as px
import plotly.graph_objects as go
import altair as alt
from vega_datasets import data

from urllib.request import urlopen
import json

pd.set_option("display.float_format", lambda x: "%.2f" % x)
alt.data_transformers.disable_max_rows()
pd.set_option('display.max_columns', None)


In [119]:
# Set location of the dataset
filepath = "../datasets/preprocessed_data_filtered_status.csv"

# Load datasets
main_df = pd.read_csv(filepath)

main_df.head()

Unnamed: 0,order_date,order_region,order_country,order_city,market,sales,order_item_quantity,shipping_date,days_for_shipping_real,days_for_shipment_scheduled,category_name,delivery_status,late_delivery_risk,shipping_mode,order_status,order_profit_per_order,sales_per_item,shipping_days_difference,order_year,order_month,order_year_month_date
0,2015-01-01 00:00:00,Central America,Mexico,Mexico City,Latin America,299.98,1,1/3/2015 0:00,2,4,Camping & Hiking,Advance shipping,0,Standard Class,CLOSED,88.79,299.98,2,2015,January,2015-01-01
1,2015-01-01 00:21:00,South America,Colombia,Dos Quebradas,Latin America,199.99,1,1/4/2015 0:21,3,4,Water Sports,Advance shipping,0,Standard Class,PENDING_PAYMENT,91.18,199.99,1,2015,January,2015-01-01
2,2015-01-01 00:21:00,South America,Colombia,Dos Quebradas,Latin America,250.0,5,1/4/2015 0:21,3,4,Women's Apparel,Advance shipping,0,Standard Class,PENDING_PAYMENT,68.25,50.0,1,2015,January,2015-01-01
3,2015-01-01 00:21:00,South America,Colombia,Dos Quebradas,Latin America,129.99,1,1/4/2015 0:21,3,4,Men's Footwear,Advance shipping,0,Standard Class,PENDING_PAYMENT,36.47,129.99,1,2015,January,2015-01-01
4,2015-01-01 01:03:00,South America,Colombia,Dos Quebradas,Latin America,199.92,4,1/6/2015 1:03,5,4,Indoor/Outdoor Games,Late delivery,1,Standard Class,CLOSED,33.59,49.98,-1,2015,January,2015-01-01


In [126]:
# Convert 'order_date' to datetime and set it as the index
main_df2 = main_df.copy()
main_df2['order_date'] = pd.to_datetime(main_df2['order_date'])
main_df2.set_index('order_date', inplace=True)

# Resample the DataFrame for every day and sum the item quantity for each region
daily_orders = main_df2.groupby(['market','order_region']).resample('D')['order_item_quantity'].sum().reset_index()

# Calculate the average daily order for each region by year
avg_daily_orders = daily_orders.groupby(['market', daily_orders['order_date'].dt.year, 'order_region'])['order_item_quantity'].mean().reset_index()
avg_daily_orders.rename(columns={'order_date': 'order_year'}, inplace=True)

# Filter the years (2015 to 2017)
avg_daily_orders = avg_daily_orders[(avg_daily_orders['order_year'] >= 2015) & (avg_daily_orders['order_year'] <= 2017)]

# Separate the dataframes for each year
avg_df_2015 = avg_daily_orders[avg_daily_orders['order_year'] == 2015].reset_index(drop=True)
avg_df_2016 = avg_daily_orders[avg_daily_orders['order_year'] == 2016].reset_index(drop=True)
avg_df_2017 = avg_daily_orders[avg_daily_orders['order_year'] == 2017].reset_index(drop=True)

avg_df_2015.to_csv(f"../datasets/daily_avg_order_2015.csv", index=False)
avg_df_2016.to_csv(f"../datasets/daily_avg_order_2016.csv", index=False)
avg_df_2017.to_csv(f"../datasets/daily_avg_order_2017.csv", index=False)

avg_df_2017

Unnamed: 0,market,order_year,order_region,order_item_quantity
0,Africa,2017,Eastern Africa,25.82
1,Africa,2017,Middle Africa,22.27
2,Africa,2017,Northern Africa,39.67
3,Africa,2017,Southern Africa,15.19
4,Africa,2017,Western Africa,53.89
5,Europe,2017,Asiatic Russia/European Russia,22.89
6,Europe,2017,Eastern Europe,32.72
7,Europe,2017,Northern Europe,25.91
8,Europe,2017,Southern Europe,24.59
9,Europe,2017,Western Europe,72.42


In [38]:
main_df["market"].unique()

array(['LATAM', 'Europe', 'Pacific Asia', 'USCA', 'Africa'], dtype=object)

In [39]:
order_group_2017 = main_df[main_df["order_year"]==2017].groupby(["market", "order_region"]).agg(
    total_order = ("order_item_quantity", "sum")
).reset_index()

order_group_2017

Unnamed: 0,market,order_region,total_order
0,Africa,Eastern Africa,439
1,Africa,Middle Africa,334
2,Africa,Northern Africa,714
3,Africa,Southern Africa,243
4,Africa,Western Africa,970
5,Europe,Asiatic Russia/European Russia,412
6,Europe,Eastern Europe,589
7,Europe,Northern Europe,8135
8,Europe,Southern Europe,7720
9,Europe,Western Europe,22741


In [40]:
df_2017 = main_df[main_df["order_year"]==2017].reset_index(drop=True)
print(df_2017.head(3))

           order_date                    order_region order_country  \
0 2017-01-01 00:33:00  Asiatic Russia/European Russia        Russia   
1 2017-01-01 00:33:00  Asiatic Russia/European Russia        Russia   
2 2017-01-01 00:33:00  Asiatic Russia/European Russia        Russia   

    order_city  market  sales  order_item_quantity  shipping_date  \
0  Cheliábinsk  Europe  99.96                    2  1/6/2017 0:33   
1  Cheliábinsk  Europe  39.99                    1  1/6/2017 0:33   
2  Cheliábinsk  Europe 239.96                    4  1/6/2017 0:33   

   days_for_shipping_real  days_for_shipment_scheduled         category_name  \
0                       5                            2  Indoor/Outdoor Games   
1                       5                            2         Shop By Sport   
2                       5                            2                Cleats   

  delivery_status  late_delivery_risk shipping_mode order_status  \
0   Late delivery                   1  Second Cla

In [41]:
px.colors.qualitative.G10

['#3366CC',
 '#DC3912',
 '#FF9900',
 '#109618',
 '#990099',
 '#0099C6',
 '#DD4477',
 '#66AA00',
 '#B82E2E',
 '#316395']

In [42]:
def create_bar_region(dataframe):
    grouped = dataframe.groupby(["market", "order_region"]).agg(
        total_sales = ("sales", "sum")
    ).reset_index()

    grouped = grouped.sort_values(by="total_sales", ascending=False).reset_index()
    grouped["total_sales"] = round(grouped["total_sales"], 2)
    grouped = grouped.head(5)

    # format values in millions
    grouped['total_sales_formatted'] = (grouped['total_sales'] / 1000000).round(2).astype(str) + 'M'

    # assign unique colors to each market
    color_map = {'LATAM': '#3366CC', 'Europe': '#DC3912', 'Pacific Asia': '#FF9900', 'USCA': '#109618', 'Africa': '#990099'}

    fig = px.bar(grouped, x="total_sales", y="order_region", orientation="h", text="total_sales_formatted", color="market",
                    labels={"order_region":"", "total_sales":"Total Sales"}, template="plotly_dark",
                     color_discrete_map=color_map)

    fig.update_layout(yaxis={'categoryorder':'total ascending'})

    fig.update_layout(autosize=True,width=400,height=300)
    fig.update_layout(title="<b>Top 5 High-Performing Regions by Total Sales</b>",title_font_size=13)
    fig.update_layout({'plot_bgcolor': 'rgba(0, 0, 0, 0)','paper_bgcolor': 'rgba(0, 0, 0, 0)',})
    fig.update_xaxes(title_font=dict(size=12))
    fig.update_layout(xaxis_showgrid=False, yaxis_showgrid=False)
    fig.update_layout(legend_title_text='Market')
    

    return fig

In [43]:
bar1 = create_bar_region(df_2017)
bar1

In [44]:
def create_bar_region2(dataframe):
    grouped = dataframe.groupby(["market", "order_region"]).agg(
        total_sales = ("sales", "sum")
    ).reset_index()

    grouped = grouped.sort_values(by="total_sales", ascending=False).reset_index()
    grouped["total_sales"] = round(grouped["total_sales"], 2)
    grouped = grouped.tail(5)

    # format values in millions
    grouped['total_sales_formatted'] = (grouped['total_sales'] / 1000).round(2).astype(str) + 'K'

    # assign unique colors to each market
    color_map = {'LATAM': '#3366CC', 'Europe': '#DC3912', 'Pacific Asia': '#FF9900', 'USCA': '#109618', 'Africa': '#990099'}

    fig = px.bar(grouped, x="total_sales", y="order_region", orientation="h", text="total_sales_formatted", color="market",
                    labels={"order_region":"", "total_sales":"Total Sales"}, template="plotly_dark",
                     color_discrete_map=color_map)

    fig.update_layout(yaxis={'categoryorder':'total ascending'})

    fig.update_layout(autosize=True,width=400,height=300)
    fig.update_layout(title="<b>Bottom 5 Underperforming Regions by Total Sales</b>",title_font_size=13)
    fig.update_layout({'plot_bgcolor': 'rgba(0, 0, 0, 0)','paper_bgcolor': 'rgba(0, 0, 0, 0)',})
    fig.update_xaxes(title_font=dict(size=12))
    fig.update_layout(xaxis_showgrid=False, yaxis_showgrid=False)
    fig.update_layout(legend_title_text='Market')
    

    return fig

In [45]:
bar2 = create_bar_region2(df_2017)
bar2

In [46]:
main_df["category_name"].unique()

array(['Camping & Hiking', 'Water Sports', "Women's Apparel",
       "Men's Footwear", 'Indoor/Outdoor Games', 'Accessories', 'Cleats',
       'Trade-In', 'Cardio Equipment', 'Shop By Sport', 'Hockey',
       'Electronics', 'Fishing', 'Golf Balls', 'Lacrosse',
       'Baseball & Softball', 'Golf Gloves', "Girls' Apparel",
       'Fitness Accessories', 'Hunting & Shooting', 'Tennis & Racquet',
       'Golf Shoes', 'Golf Apparel', 'Boxing & MMA', "Men's Golf Clubs",
       "Kids' Golf Clubs", 'Soccer', "Women's Golf Clubs",
       'Golf Bags & Carts', 'Strength Training', 'As Seen on  TV!',
       'Basketball', 'Books ', 'Baby ', 'CDs ', 'Cameras ',
       "Children's Clothing", 'Computers', 'Consumer Electronics',
       'Crafts', 'DVDs', 'Garden', 'Health and Beauty', "Men's Clothing",
       'Music', 'Pet Supplies', 'Sporting Goods', 'Toys', 'Video Games',
       "Women's Clothing"], dtype=object)

In [47]:
df_2017.head(2)

Unnamed: 0,order_date,order_region,order_country,order_city,market,sales,order_item_quantity,shipping_date,days_for_shipping_real,days_for_shipment_scheduled,category_name,delivery_status,late_delivery_risk,shipping_mode,order_status,order_profit_per_order,sales_per_item,shipping_days_difference,order_year,order_month,order_year_month_date
0,2017-01-01 00:33:00,Asiatic Russia/European Russia,Russia,Cheliábinsk,Europe,99.96,2,1/6/2017 0:33,5,2,Indoor/Outdoor Games,Late delivery,1,Second Class,COMPLETE,-58.78,49.98,-3,2017,January,2017-01-01
1,2017-01-01 00:33:00,Asiatic Russia/European Russia,Russia,Cheliábinsk,Europe,39.99,1,1/6/2017 0:33,5,2,Shop By Sport,Late delivery,1,Second Class,COMPLETE,15.98,39.99,-3,2017,January,2017-01-01


In [48]:
import plotly.graph_objs as go
from plotly.subplots import make_subplots

def create_bar_region_combined(dataframe):
    dataframe = dataframe[dataframe["order_region"]=="Northern America"]
    grouped = dataframe.groupby(["category_name"]).agg(
        total_sales=("sales", "sum")
    ).reset_index()

    grouped["total_sales"] = round(grouped["total_sales"], 2)

    # sort by total_sales and split into two groups
    grouped = grouped.sort_values(by="total_sales", ascending=False).reset_index()
    top_5 = grouped.head(5)
    bottom_5 = grouped.tail(5)

    # create a dictionary to map each category to a unique color
    category_colors = {
        'Camping & Hiking': '#1f77b4',
        'Water Sports': '#ff7f0e',
        "Women's Apparel": '#2ca02c',
        "Men's Footwear": '#d62728',
        'Indoor/Outdoor Games': '#9467bd',
        'Accessories': '#8c564b',
        'Cleats': '#e377c2',
        'Trade-In': '#7f7f7f',
        'Cardio Equipment': '#bcbd22',
        'Shop By Sport': '#17becf',
        'Hockey': '#ff5733',
        'Electronics': '#e74c3c',
        'Fishing': '#3498db',
        'Golf Balls': '#9b59b6',
        'Lacrosse': '#e67e22',
        'Baseball & Softball': '#34495e',
        'Golf Gloves': '#f1c40f',
        "Girls' Apparel": '#2ecc71',
        'Fitness Accessories': '#1abc9c',
        'Hunting & Shooting': '#95a5a6',
        'Tennis & Racquet': '#2c3e50',
        'Golf Shoes': '#bdc3c7',
        'Golf Apparel': '#d35400',
        'Boxing & MMA': '#7f8c8d',
        "Men's Golf Clubs": '#2980b9',
        "Kids' Golf Clubs": '#16a085',
        'Soccer': '#c0392b',
        "Women's Golf Clubs": '#f39c12',
        'Golf Bags & Carts': '#27ae60',
        'Strength Training': '#e67e22',
        'As Seen on  TV!': '#8e44ad',
        'Basketball': '#f39c12',
        'Books ': '#1abc9c',
        'Baby ': '#95a5a6',
        'CDs ': '#d35400',
        'Cameras ': '#bdc3c7',
        "Children's Clothing": '#9b59b6',
        'Computers': '#7f8c8d',
        'Consumer Electronics': '#2c3e50',
        'Crafts': '#27ae60',
        'DVDs': '#f1c40f',
        'Garden': '#17becf',
        'Health and Beauty': '#bcbd22',
        "Men's Clothing": '#e74c3c',
        'Music': '#8c564b',
        'Pet Supplies': '#2ecc71',
        'Sporting Goods': '#7f7f7f',
        'Toys': '#d62728',
        'Video Games': '#9467bd',
        "Women's Clothing": '#3498db'
    }

    # map category colors to the top_5 and bottom_5 dataframes
    top_5['color'] = top_5['category_name'].map(category_colors)
    bottom_5['color'] = bottom_5['category_name'].map(category_colors)

    # create the subplots
    fig = make_subplots(rows=2, cols=1, vertical_spacing=0.15, subplot_titles=(
        "Top 5 High-Performing Categories in Northern America", "Bottom 5 Underperforming Categories in Northern America"))

    # add the top 5 subplot
    fig.add_trace(go.Bar(x=top_5['total_sales'], y=top_5['category_name'], orientation='h',
                        text=top_5['total_sales'], name='', marker=dict(color=top_5['color'])),
                row=1, col=1)
    fig.update_yaxes(title='', categoryorder='total ascending', row=1, col=1)
    fig.update_xaxes(title='Total Sales', row=1, col=1, showgrid=False)

    # add the bottom 5 subplot
    fig.add_trace(go.Bar(x=bottom_5['total_sales'], y=bottom_5['category_name'], orientation='h',
                        text=bottom_5['total_sales'], name='', marker=dict(color=bottom_5['color'])),
                row=2, col=1)
    fig.update_yaxes(title='', categoryorder='total ascending', row=2, col=1)
    fig.update_xaxes(title='Total Sales', row=2, col=1, showgrid=False)

    # update the layout
    fig.update_layout(height=800, width=800, template='plotly_dark', showlegend=False)

    return fig


In [49]:
bar3 = create_bar_region_combined(df_2017)
bar3



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



In [50]:
dailySales = df_2017[df_2017["order_region"]=="Central America"].copy().reset_index()
dailySales = dailySales.groupby(pd.Grouper(key="order_date", freq="D")).agg({"sales": "sum"}).reset_index()
dailySales["sales"] = dailySales["sales"].round()
# dailySales.rename(columns={"order_date": "date"}, inplace=True)
dailySales["order_date"] = pd.to_datetime(dailySales["order_date"]).dt.strftime("%Y-%m-%d")

dailySalesChart = alt.Chart(dailySales).mark_line().encode(
    x=alt.X("order_date:T", title="date"),
    y=alt.Y("sales:Q", title="Total Sales"),
).properties(
    title="Sales Over Time",
    width=700,
    height=300
)

dailySalesChart

In [51]:


daily_sales_chart = go.Figure()

daily_sales_chart.add_trace(
    go.Scatter(
        x=dailySales["order_date"],
        y=dailySales["sales"],
        mode="lines",
        name="Sales Over Time",
    )
)

daily_sales_chart.update_layout(
    title="Sales Over Time",
    xaxis_title="Date",
    yaxis_title="Sales",
    width=700,
    height=300,
    template="plotly_dark",
)

daily_sales_chart.update_layout({'plot_bgcolor': 'rgba(0, 0, 0, 0)','paper_bgcolor': 'rgba(0, 0, 0, 0)',})

daily_sales_chart.show()


In [52]:
def daily_sales(dataframe, region):
    dataframe['order_date'] = pd.to_datetime(dataframe['order_date'])
    dataframe = dataframe.set_index('order_date')

    if region != "All Regions":
        dataframe = dataframe[dataframe["order_region"] == region]

    daily_sales = dataframe.resample('D')['sales'].sum().reset_index()

    daily_sales["sales"] = round(daily_sales["sales"], 2)
    daily_sales["order_date"] = daily_sales["order_date"].dt.strftime("%Y-%m-%d")

    fig = go.Figure()

    fig.add_trace(
        go.Scatter(
            x=daily_sales["order_date"],
            y=daily_sales["sales"],
            mode="lines",
            hovertemplate='<b>Date:</b> %{x}<br><b>Sales:</b> $%{y:.2f}<extra></extra>',
        )
    )

    fig.update_layout(
        title=f"<b>{region} Sales Over Time</b>",
        xaxis_title="",
        yaxis_title="Sales",
        width=600,
        height=300,
        template="plotly_dark",
        title_font_size=13
    )

    fig.update_layout({'plot_bgcolor': 'rgba(0, 0, 0, 0)','paper_bgcolor': 'rgba(0, 0, 0, 0)',})

    return fig

In [53]:
sales = daily_sales(df_2017, "All Regions")
sales

In [54]:
print(df_2017.head(2))

           order_date                    order_region order_country  \
0 2017-01-01 00:33:00  Asiatic Russia/European Russia        Russia   
1 2017-01-01 00:33:00  Asiatic Russia/European Russia        Russia   

    order_city  market  sales  order_item_quantity  shipping_date  \
0  Cheliábinsk  Europe  99.96                    2  1/6/2017 0:33   
1  Cheliábinsk  Europe  39.99                    1  1/6/2017 0:33   

   days_for_shipping_real  days_for_shipment_scheduled         category_name  \
0                       5                            2  Indoor/Outdoor Games   
1                       5                            2         Shop By Sport   

  delivery_status  late_delivery_risk shipping_mode order_status  \
0   Late delivery                   1  Second Class     COMPLETE   
1   Late delivery                   1  Second Class     COMPLETE   

   order_profit_per_order  sales_per_item  shipping_days_difference  \
0                  -58.78           49.98                    

In [55]:
df_2017.head(2)

Unnamed: 0,order_date,order_region,order_country,order_city,market,sales,order_item_quantity,shipping_date,days_for_shipping_real,days_for_shipment_scheduled,category_name,delivery_status,late_delivery_risk,shipping_mode,order_status,order_profit_per_order,sales_per_item,shipping_days_difference,order_year,order_month,order_year_month_date
0,2017-01-01 00:33:00,Asiatic Russia/European Russia,Russia,Cheliábinsk,Europe,99.96,2,1/6/2017 0:33,5,2,Indoor/Outdoor Games,Late delivery,1,Second Class,COMPLETE,-58.78,49.98,-3,2017,January,2017-01-01
1,2017-01-01 00:33:00,Asiatic Russia/European Russia,Russia,Cheliábinsk,Europe,39.99,1,1/6/2017 0:33,5,2,Shop By Sport,Late delivery,1,Second Class,COMPLETE,15.98,39.99,-3,2017,January,2017-01-01


In [56]:
# An order is considered on-time and in-full if the 'shipping_days_difference' is equal to or greater than 0.
# If 'shipping_days_difference' >= 0, the order is delivered on-time and in-full (value 1).
# Otherwise, the order is not delivered on-time and in-full (value 0).
df_2017['on_time_in_full'] = (df_2017['shipping_days_difference'] >= 0).astype(int)

# Group the data by 'order_region' and calculate the total number of orders and the total number of on-time, in-full orders.
region_otif_data = df_2017.groupby('order_region')['on_time_in_full'].agg(['sum', 'count']).reset_index()

# Calculate the OTIF rate for each region by dividing the total number of on-time, in-full orders by the total number of orders, then multiply by 100 to get the percentage.
region_otif_data['otif_rate'] = (region_otif_data['sum'] / region_otif_data['count']) * 100

# Rename the columns for better readability
region_otif_data.columns = ['order_region', 'on_time_in_full_orders', 'total_orders', 'otif_rate']

value = str(round(region_otif_data[region_otif_data["order_region"]=="Southeastern Asia"]["otif_rate"].values[0], 2)) + " %"

value

'41.82 %'

In [57]:
avg_scheduled_shipping_time = df_2017.groupby(['order_region'])['days_for_shipping_real'].mean().reset_index()
avg_scheduled_shipping_time.loc[len(avg_scheduled_shipping_time)] = ['All Region', avg_scheduled_shipping_time['days_for_shipping_real'].mean()]
avg_scheduled_shipping_time.tail()

Unnamed: 0,order_region,days_for_shipping_real
17,Southern Europe,3.44
18,Western Africa,3.59
19,Western Asia,3.68
20,Western Europe,3.48
21,All Region,3.55


In [58]:
total_order = df_2017.groupby(['order_region'])['order_item_quantity'].sum().reset_index()
total_order.loc[len(total_order)] = ['All Region', total_order['order_item_quantity'].sum()]
total_order.tail()

Unnamed: 0,order_region,order_item_quantity
17,Southern Europe,7720
18,Western Africa,970
19,Western Asia,1713
20,Western Europe,22741
21,All Region,101392


In [59]:
total_profit = df_2017.groupby(['order_region'])['order_profit_per_order'].sum().reset_index()
total_profit.loc[len(total_profit)] = ['All Region', total_profit['order_profit_per_order'].sum()]
total_profit.tail()

Unnamed: 0,order_region,order_profit_per_order
17,Southern Europe,100401.6
18,Western Africa,10751.88
19,Western Asia,16249.2
20,Western Europe,317781.5
21,All Region,1260204.06


In [60]:
total_sales = df_2017.groupby(['order_region'])['sales'].sum().reset_index()
total_sales.loc[len(total_sales)] = ['All Region', total_sales['sales'].sum()]
total_sales.tail()

Unnamed: 0,order_region,sales
17,Southern Europe,979686.24
18,Western Africa,80983.16
19,Western Asia,151246.63
20,Western Europe,2847340.92
21,All Region,11285497.46


In [61]:
total_profit = df_2017.groupby(['order_region'])['order_profit_per_order'].sum().reset_index()
total_profit.loc[len(total_profit)] = ['All Region', total_profit['order_profit_per_order'].sum()]
profit_value = total_profit[total_profit["order_region"]=='All Region']["order_profit_per_order"].values[0]
formatted_profit = "${:,.2f}".format(profit_value)
profit = formatted_profit

In [62]:
total_profit

Unnamed: 0,order_region,order_profit_per_order
0,Asiatic Russia/European Russia,4653.68
1,Australia/New Zealand,18188.67
2,Caribbean,86349.98
3,Central America,310165.05
4,Central Asia,431.9
5,Eastern Africa,4018.65
6,Eastern Asia,30713.65
7,Eastern Europe,5492.5
8,Melanesia,101.23
9,Middle Africa,2594.24


In [63]:
profit

'$1,260,204.06'

In [64]:
df_2017[df_2017["order_status"]=="CANCELED"]

Unnamed: 0,order_date,order_region,order_country,order_city,market,sales,order_item_quantity,shipping_date,days_for_shipping_real,days_for_shipment_scheduled,category_name,delivery_status,late_delivery_risk,shipping_mode,order_status,order_profit_per_order,sales_per_item,shipping_days_difference,order_year,order_month,order_year_month_date,on_time_in_full


In [65]:
def create_bar_region_market_combined(dataframe):

    grouped = dataframe.groupby(["market", "order_region"]).agg(
        total_sales = ("sales", "sum")
    ).reset_index()

    grouped["total_sales"] = round(grouped["total_sales"], 2)

    # sort by total_sales and split into two groups
    grouped = grouped.sort_values(by="total_sales", ascending=False).reset_index()

    # format total_sales as a string with $ and thousand separator
    locale.setlocale(locale.LC_ALL, '')  # set locale to default system locale
    grouped["total_sales_formated"] = grouped["total_sales"].apply(lambda x: locale.currency(x, grouping=True))

    top_5 = grouped.head(5)
    bottom_5 = grouped.tail(5)

    # assign unique colors to each market
    color_map = {'LATAM': '#3366CC', 'Europe': '#DC3912', 'Pacific Asia': '#FF9900', 'USCA': '#109618', 'Africa': '#990099'}

    # map category colors to the top_5 and bottom_5 dataframes
    top_5['color'] = top_5['market'].map(color_map)
    bottom_5['color'] = bottom_5['market'].map(color_map)

        # create the subplots
    fig = make_subplots(rows=1, cols=2, horizontal_spacing=0.2, subplot_titles=(
        "<b>Top 5 High-Performing Regions by Total Sales</b>", 
        "<b>Bottom 5 Underperforming Regions by Total Sales</b>"))

    # add the top 5 subplot
    fig.add_trace(go.Bar(x=top_5['total_sales'], y=top_5['order_region'], orientation='h',
                        text=top_5['total_sales_formated'], name='', marker=dict(color=top_5['color']),
                        textfont=dict(color='white'), showlegend=True),
                row=1, col=1)
    fig.update_yaxes(title='', categoryorder='total ascending', row=1, col=1)
    fig.update_xaxes(title='Total Sales', row=1, col=1, showgrid=False)

    # add the bottom 5 subplot
    fig.add_trace(go.Bar(x=bottom_5['total_sales'], y=bottom_5['order_region'], orientation='h',
                        text=bottom_5['total_sales_formated'], name='', marker=dict(color=bottom_5['color']),
                        textfont=dict(color='white'), showlegend=True),
                row=1, col=2)
    fig.update_yaxes(title='', categoryorder='total ascending', row=1, col=2)
    fig.update_xaxes(title='Total Sales', row=1, col=2, showgrid=False)


    # update the layout
    fig.update_layout(
        height=200,
        width=900,
        template="plotly_dark",
        showlegend=False,
        margin=dict(l=0, r=0, t=20, b=0),
    )
    fig.update_layout({'plot_bgcolor': 'rgba(0, 0, 0, 0)','paper_bgcolor': 'rgba(0, 0, 0, 0)',})
    fig.update_annotations(font_size=12)

    return fig


In [66]:
bar2 = create_bar_region_market_combined(df_2017)
bar2



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



In [67]:
order_status_distribution = df_2017.groupby('order_status').size().reset_index(name='count')
order_status_distribution

Unnamed: 0,order_status,count
0,CLOSED,5669
1,COMPLETE,17365
2,ON_HOLD,2758
3,PAYMENT_REVIEW,568
4,PENDING,6233
5,PENDING_PAYMENT,11725
6,PROCESSING,6530


In [68]:
order_volume_by_shipping_mode = df_2017.groupby('shipping_mode')['order_item_quantity'].sum().reset_index()
order_volume_by_shipping_mode

Unnamed: 0,shipping_mode,order_item_quantity
0,First Class,15056
1,Same Day,5511
2,Second Class,19659
3,Standard Class,61166


In [69]:
late_delivery_data = df_2017.groupby(['order_region', 'market', 'late_delivery_risk']).size().unstack().reset_index()
late_delivery_data['late_delivery_rate'] = (late_delivery_data[1] / (late_delivery_data[0] + late_delivery_data[1])) * 100
late_delivery_data

late_delivery_risk,order_region,market,0,1,late_delivery_rate
0,Asiatic Russia/European Russia,Europe,62.0,131.0,67.88
1,Australia/New Zealand,Pacific Asia,366.0,475.0,56.48
2,Caribbean,LATAM,1782.0,2131.0,54.46
3,Central America,LATAM,5936.0,7875.0,57.02
4,Central Asia,Pacific Asia,23.0,48.0,67.61
5,Eastern Africa,Africa,98.0,107.0,52.2
6,Eastern Asia,Pacific Asia,361.0,471.0,56.61
7,Eastern Europe,Europe,123.0,136.0,52.51
8,Melanesia,Pacific Asia,,2.0,
9,Middle Africa,Africa,66.0,84.0,56.0


In [70]:
# Calculate the count of orders for each shipping mode by region
shipping_mode_distribution = df_2017.groupby(['order_region', 'shipping_mode'])['order_status'].count().reset_index()
shipping_mode_distribution

Unnamed: 0,order_region,shipping_mode,order_status
0,Asiatic Russia/European Russia,First Class,37
1,Asiatic Russia/European Russia,Same Day,3
2,Asiatic Russia/European Russia,Second Class,79
3,Asiatic Russia/European Russia,Standard Class,74
4,Australia/New Zealand,First Class,123
...,...,...,...
77,Western Asia,Standard Class,453
78,Western Europe,First Class,1855
79,Western Europe,Same Day,658
80,Western Europe,Second Class,2352


In [71]:
# Calculate the average order profit by region
avg_order_profit = df_2017.groupby(['market', 'order_region'])['order_profit_per_order'].mean().reset_index()
avg_order_profit = avg_order_profit.sort_values('order_profit_per_order', ascending=False)
avg_order_profit

Unnamed: 0,market,order_region,order_profit_per_order
16,Pacific Asia,Melanesia,50.61
17,Pacific Asia,Southeastern Asia,40.24
15,Pacific Asia,Eastern Asia,36.92
20,USCA,Northern America,33.63
18,Pacific Asia,Southern Asia,31.04
2,Africa,Northern Africa,27.89
9,Europe,Western Europe,26.96
7,Europe,Northern Europe,26.58
4,Africa,Western Africa,25.66
8,Europe,Southern Europe,24.94


In [72]:
# # Calculate the average order profit by region
# avg_order_profit = df_2017.groupby('order_region')['order_profit_per_order'].mean().reset_index()
# avg_order_profit = avg_order_profit.sort_values('order_profit_per_order', ascending=False)

# Create the bar chart using Plotly
fig = go.Figure(go.Bar(x=avg_order_profit['order_region'], y=avg_order_profit['order_profit_per_order'],
                       text=avg_order_profit['order_profit_per_order'].round(2),
                       textposition='outside', marker_color='rgb(104, 204, 104)'))

# Customize the chart appearance
fig.update_layout(title='Average Order Profit by Region',
                  xaxis_title='Region',
                  yaxis_title='Average Order Profit',
                  xaxis_tickangle=-45)

fig.update_traces(texttemplate='%{text:.2f}', textfont=dict(size=10))

fig.update_yaxes(tickprefix="$")

# Display the chart
fig.show()

In [73]:
def get_shipping_relationship(dataframe):
        # Calculate the average days for shipping (actual vs. scheduled) and average sales by region
        avg_days_sales = df_2017.groupby('order_region').agg({'days_for_shipping_real': 'mean',
                                                        'days_for_shipment_scheduled': 'mean',
                                                        'sales': 'mean'}).reset_index()

        # Create the scatter plot using Plotly
        fig = px.scatter(avg_days_sales, x='days_for_shipping_real', y='days_for_shipment_scheduled',
                        size='sales', color='order_region', hover_name='order_region',
                        labels={'days_for_shipping_real': 'Average Days for Shipping (Actual)',
                                'days_for_shipment_scheduled': 'Average Days for Shipping (Scheduled)',
                                'sales': 'Average Sales'})

        # Customize the chart appearance
        fig.update_layout(title={'text': 'Relationship between Average Days for Shipping (Actual vs. Scheduled)<br>and<br>Average Sales by Region</br>',
                                'font': {'size': 13},
                                'x': 0.5,
                                'xanchor': 'center'},
                        height=400,
                        width=600,
                        template="plotly_dark",
                        showlegend=True,
                        margin=dict(l=0, r=0, t=80, b=0),
                        plot_bgcolor='rgba(0, 0, 0, 0)',
                        paper_bgcolor='rgba(0, 0, 0, 0)')

        fig.update_xaxes(title_font=dict(size=11))
        fig.update_yaxes(title_font=dict(size=11))

        fig.update_traces(hovertemplate='<b>%{hovertext}</b><br>Average Days for Shipping (Actual): %{x:.2f}<br>Average Days for Shipping (Scheduled): %{y:.2f}<br>Average Sales: $%{marker.size:.2f}')

        fig.update_layout(legend_title_text='Region', legend=dict(font=dict(size=11)))

        # Display the chart
        return fig

In [74]:
relationship = get_shipping_relationship(df_2017)
relationship

In [86]:
    def create_fig_region_combined(dataframe, region):
        if region == "All Regions":
            grouped_bar = dataframe.groupby(["category_name"]).agg(total_sales=("sales", "sum")).reset_index()
        else:
            dataframe = dataframe[dataframe["order_region"]==region]
            grouped_bar = dataframe.groupby(["category_name"]).agg(total_sales=("sales", "sum")).reset_index()

        grouped_bar["total_sales"] = round(grouped_bar["total_sales"], 2)

        # sort by total_sales and split into two groups
        grouped_bar = grouped_bar.sort_values(by="total_sales", ascending=False).reset_index()

        # format total_sales as a string with $ and thousand separator
        locale.setlocale(locale.LC_ALL, '')  # set locale to default system locale
        grouped_bar["total_sales_formated"] = grouped_bar["total_sales"].apply(lambda x: locale.currency(x, grouping=True))

        top_5 = grouped_bar.head(5)

        # create a dictionary to map each category to a unique color
        category_colors = {
            'Camping & Hiking': '#1f77b4',
            'Water Sports': '#ff7f0e',
            "Women's Apparel": '#2ca02c',
            "Men's Footwear": '#d62728',
            'Indoor/Outdoor Games': '#9467bd',
            'Accessories': '#8c564b',
            'Cleats': '#e377c2',
            'Trade-In': '#7f7f7f',
            'Cardio Equipment': '#bcbd22',
            'Shop By Sport': '#17becf',
            'Hockey': '#ff5733',
            'Electronics': '#e74c3c',
            'Fishing': '#3498db',
            'Golf Balls': '#9b59b6',
            'Lacrosse': '#e67e22',
            'Baseball & Softball': '#34495e',
            'Golf Gloves': '#f1c40f',
            "Girls' Apparel": '#2ecc71',
            'Fitness Accessories': '#1abc9c',
            'Hunting & Shooting': '#95a5a6',
            'Tennis & Racquet': '#2c3e50',
            'Golf Shoes': '#bdc3c7',
            'Golf Apparel': '#d35400',
            'Boxing & MMA': '#7f8c8d',
            "Men's Golf Clubs": '#2980b9',
            "Kids' Golf Clubs": '#16a085',
            'Soccer': '#c0392b',
            "Women's Golf Clubs": '#f39c12',
            'Golf Bags & Carts': '#27ae60',
            'Strength Training': '#e67e22',
            'As Seen on  TV!': '#8e44ad',
            'Basketball': '#f39c12',
            'Books ': '#1abc9c',
            'Baby ': '#95a5a6',
            'CDs ': '#d35400',
            'Cameras ': '#bdc3c7',
            "Children's Clothing": '#9b59b6',
            'Computers': '#7f8c8d',
            'Consumer Electronics': '#2c3e50',
            'Crafts': '#27ae60',
            'DVDs': '#f1c40f',
            'Garden': '#17becf',
            'Health and Beauty': '#bcbd22',
            "Men's Clothing": '#e74c3c',
            'Music': '#8c564b',
            'Pet Supplies': '#2ecc71',
            'Sporting Goods': '#7f7f7f',
            'Toys': '#d62728',
            'Video Games': '#9467bd',
            "Women's Clothing": '#3498db'
        }

        # map category colors to the top_5 and bottom_5 dataframes
        top_5['color'] = top_5['category_name'].map(category_colors)

        # -----------------------------------------------------------

        dailysales = dataframe.copy()
        dailysales['order_date'] = pd.to_datetime(dailysales['order_date'])
        dailysales = dailysales.set_index('order_date')

        if region != "All Regions":
            dailysales = dailysales[dailysales["order_region"] == region]

        daily_sales = dailysales.resample('D')['sales'].sum().reset_index()

        daily_sales["sales"] = round(daily_sales["sales"], 2)
        daily_sales["order_date"] = daily_sales["order_date"].dt.strftime("%Y-%m-%d")

        # -----------------------------------------------------------

        # create the subplots
        fig = make_subplots(rows=2, cols=1, vertical_spacing=0.25, subplot_titles=(
            f"<b>Top 5 High-Performing Categories in {region}</b>", 
            f"<b>{region} Sales Over Time</b>"))

        # add the top 5 subplot
        fig.add_trace(go.Bar(x=top_5['total_sales'], y=top_5['category_name'], orientation='h',
                            text=top_5['total_sales_formated'], name='', marker=dict(color=top_5['color']),
                            textfont=dict(color='white')),
                    row=1, col=1)
        fig.update_yaxes(title='', categoryorder='total ascending', row=1, col=1)
        fig.update_xaxes(title='Total Sales', row=1, col=1, showgrid=False)

        # add dailysales subplot
        fig.add_trace(go.Scatter(x=daily_sales["order_date"], y=daily_sales["sales"],
                                mode="lines", hovertemplate='<b>Date:</b> %{x}<br><b>Sales:</b> $%{y:,.2f}<extra></extra>', 
                                textfont=dict(color='white')), row=2, col=1)
        fig.update_yaxes(title='Sales', row=2, col=1)

        # update the layout
        fig.update_layout(
            height=400,
            width=600,
            template="plotly_dark",
            showlegend=False,
            margin=dict(l=0, r=0, t=20, b=0),
        )
        fig.update_layout({'plot_bgcolor': 'rgba(0, 0, 0, 0)','paper_bgcolor': 'rgba(0, 0, 0, 0)',})
        fig.update_annotations(font_size=12)

        return fig

In [87]:
combined = create_fig_region_combined(df_2017, "All Regions")
combined



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



In [79]:
def daily_sales(dailysales, region):
    dailysales['order_date'] = pd.to_datetime(dailysales['order_date'])
    dailysales = dailysales.set_index('order_date')

    if region != "All Regions":
        dailysales = dailysales[dailysales["order_region"] == region]

    daily_sales = dailysales.resample('D')['sales'].sum().reset_index()

    daily_sales["sales"] = round(daily_sales["sales"], 2)
    daily_sales["order_date"] = daily_sales["order_date"].dt.strftime("%Y-%m-%d")

    fig = go.Figure()

    fig.add_trace(go.Scatter(x=daily_sales["order_date"], y=daily_sales["sales"],
            mode="lines", hovertemplate='<b>Date:</b> %{x}<br><b>Sales:</b> $%{y:,.2f}<extra></extra>', 
            textfont=dict(color='white')), row=2, col=1)

    fig.update_layout(
        title=f"<b>{region} Sales Over Time</b>",
        xaxis_title="",
        yaxis_title="Sales",
        width=450,
        height=300,
        template="plotly_dark",
        title_font_size=13
    )

    fig.update_layout({'plot_bgcolor': 'rgba(0, 0, 0, 0)','paper_bgcolor': 'rgba(0, 0, 0, 0)',})

    return fig


In [109]:
    def create_bar_region_order(dataframe):
        # Group the dataframe by market and order_region, and sum the order_item_quantity
        grouped = dataframe.groupby(["market", "order_region"]).agg(
            total_order=("order_item_quantity", "sum")
        ).reset_index()

        # Sort the grouped dataframe by total_order in descending order, and round the values to 2 decimal places
        grouped = grouped.sort_values(by="total_order", ascending=False).reset_index()
        grouped["total_order"] = round(grouped["total_order"], 2)

        # Add a new column named total_order_formatted with values from total_order formatted with thousand separators
        grouped["total_order_formatted"] = grouped["total_order"].apply(lambda x: locale.format_string("%0.0f", x, grouping=True))

        # Select the top 5 high-performing regions by total order
        grouped = grouped.head(5)

        # Assign unique colors to each market
        color_map = {'LATAM': '#3366CC', 'Europe': '#DC3912', 'Pacific Asia': '#FF9900', 'USCA': '#109618', 'Africa': '#990099'}

        # Create the bar chart using plotly express
        fig = px.bar(
            grouped,
            x="total_order",
            y="order_region",
            orientation="h",
            text="total_order_formatted",
            color="market",
            labels={"order_region":"", "total_order":"Total Order"},
            template="plotly_dark",
            color_discrete_map=color_map
        )

        # Set the y-axis category order to ascending
        fig.update_layout(yaxis={"categoryorder":"total ascending"})

        # Set the size of the figure
        fig.update_layout(autosize=True, width=400, height=300)

        # Set the title of the figure
        fig.update_layout(
            title="<b>Top 5 High-Performing Regions by Total Order</b>",
            title_x=0.5,
            title_y=0.875,
            title_font_size=13
        )

        # Set the plot background and paper background color to transparent
        fig.update_layout({"plot_bgcolor": "rgba(0, 0, 0, 0)", "paper_bgcolor": "rgba(0, 0, 0, 0)"})

        # Set the font size of the x-axis title
        fig.update_xaxes(title_font=dict(size=12))

        # Hide the x-axis and y-axis grid lines
        fig.update_layout(xaxis_showgrid=False, yaxis_showgrid=False)

        # Set the legend title text to "Market"
        fig.update_layout(legend_title_text="Market")

        return fig


In [110]:
order = create_bar_region_order(df_2017)
order

In [None]:
    def create_bar_region(dataframe):
        grouped = dataframe.groupby(["market", "order_region"]).agg(
            total_sales = ("sales", "sum")
        ).reset_index()

        grouped = grouped.sort_values(by="total_sales", ascending=False).reset_index()
        grouped["total_sales"] = round(grouped["total_sales"], 2)
        
        grouped = grouped.head(5)

        # format values in millions
        grouped['total_sales_formated'] = "$" + (grouped['total_sales'] / 1000000).round(2).astype(str) + 'M'

        # assign unique colors to each market
        color_map = {'LATAM': '#3366CC', 'Europe': '#DC3912', 'Pacific Asia': '#FF9900', 'USCA': '#109618', 'Africa': '#990099'}

        fig = px.bar(grouped, x="total_sales", y="order_region", orientation="h", text="total_sales_formated", color="market",
                        labels={"order_region":"", "total_sales":"Total Sales"}, template="plotly_dark",
                        color_discrete_map=color_map)

        fig.update_layout(yaxis={'categoryorder':'total ascending'})

        fig.update_layout(autosize=True,width=400,height=300)

        # Set the title of the figure
        fig.update_layout(
            title="<b>Top 5 High-Performing Regions by Total Sales</b>",
            title_x=0.5,
            title_y=0.875,
            title_font_size=13
        )


        fig.update_layout({'plot_bgcolor': 'rgba(0, 0, 0, 0)','paper_bgcolor': 'rgba(0, 0, 0, 0)',})
        fig.update_xaxes(title_font=dict(size=12))
        fig.update_layout(xaxis_showgrid=False, yaxis_showgrid=False)
        fig.update_layout(legend_title_text='Market')
        
        return fig