In [2]:
import numpy as np
import pandas as pd
import geopandas as gpd
import time

import plotly.express as px
import plotly.graph_objects as go
import altair as alt
from vega_datasets import data

from urllib.request import urlopen
import json

pd.set_option("display.float_format", lambda x: "%.2f" % x)
alt.data_transformers.disable_max_rows()


DataTransformerRegistry.enable('default')

In [3]:
# Set location of the dataset
filepath = "../datasets/preprocessed_data.csv"

# Load datasets
main_df = pd.read_csv(filepath)

main_df.head()

Unnamed: 0,order_date,order_region,order_country,order_city,market,sales,order_item_quantity,shipping_date,days_for_shipping_real,days_for_shipment_scheduled,...,delivery_status,late_delivery_risk,shipping_mode,order_status,order_profit_per_order,sales_per_item,shipping_days_difference,order_year,order_month,order_year_month_date
0,2015-01-01 00:00:00,Central America,Mexico,Mexico City,LATAM,299.98,1,1/3/2015 0:00,2,4,...,Advance shipping,0,Standard Class,CLOSED,88.79,299.98,2,2015,January,2015-01-01
1,2015-01-01 00:21:00,South America,Colombia,Dos Quebradas,LATAM,199.99,1,1/4/2015 0:21,3,4,...,Advance shipping,0,Standard Class,PENDING_PAYMENT,91.18,199.99,1,2015,January,2015-01-01
2,2015-01-01 00:21:00,South America,Colombia,Dos Quebradas,LATAM,250.0,5,1/4/2015 0:21,3,4,...,Advance shipping,0,Standard Class,PENDING_PAYMENT,68.25,50.0,1,2015,January,2015-01-01
3,2015-01-01 00:21:00,South America,Colombia,Dos Quebradas,LATAM,129.99,1,1/4/2015 0:21,3,4,...,Advance shipping,0,Standard Class,PENDING_PAYMENT,36.47,129.99,1,2015,January,2015-01-01
4,2015-01-01 01:03:00,South America,Colombia,Dos Quebradas,LATAM,199.92,4,1/6/2015 1:03,5,4,...,Late delivery,1,Standard Class,CLOSED,33.59,49.98,-1,2015,January,2015-01-01


In [4]:
sales_dict = {}
main_df["order_date"] = pd.to_datetime(main_df["order_date"])

for year in range(2015, 2018):
    sales_year = main_df[main_df["order_year"] == year]
    
    # Create a DataFrame with MultiIndex containing 'order_date', 'market', and 'order_region'
    multiindex_df = sales_year.set_index(["order_date", "market", "order_region"])
    
    # Resample the data to daily frequency and compute daily sales
    daily_sales = multiindex_df["sales"].sum(level=["order_date", "market", "order_region"]).reset_index()
    
    # Calculate the average daily sales by region
    daily_sales["order_month"] = daily_sales["order_date"].dt.to_period("M")
    mean_daily_sales = daily_sales.groupby(["market", "order_region"]).agg(
        mean_daily_sales=("sales", "mean")).reset_index()
    
    mean_daily_sales["mean_daily_sales"] = np.round(mean_daily_sales["mean_daily_sales"], 2)
    
    sales_dict[year] = mean_daily_sales

for year, sales_df in sales_dict.items():
    sales_df.to_csv(f"../datasets/agg_sales_{year}.csv", index=False)

  daily_sales = multiindex_df["sales"].sum(level=["order_date", "market", "order_region"]).reset_index()
  daily_sales = multiindex_df["sales"].sum(level=["order_date", "market", "order_region"]).reset_index()
  daily_sales = multiindex_df["sales"].sum(level=["order_date", "market", "order_region"]).reset_index()


bar chart

In [5]:
main_df.head(2)

Unnamed: 0,order_date,order_region,order_country,order_city,market,sales,order_item_quantity,shipping_date,days_for_shipping_real,days_for_shipment_scheduled,...,delivery_status,late_delivery_risk,shipping_mode,order_status,order_profit_per_order,sales_per_item,shipping_days_difference,order_year,order_month,order_year_month_date
0,2015-01-01 00:00:00,Central America,Mexico,Mexico City,LATAM,299.98,1,1/3/2015 0:00,2,4,...,Advance shipping,0,Standard Class,CLOSED,88.79,299.98,2,2015,January,2015-01-01
1,2015-01-01 00:21:00,South America,Colombia,Dos Quebradas,LATAM,199.99,1,1/4/2015 0:21,3,4,...,Advance shipping,0,Standard Class,PENDING_PAYMENT,91.18,199.99,1,2015,January,2015-01-01


In [6]:
main_df["market"].unique()

array(['LATAM', 'Europe', 'Pacific Asia', 'USCA', 'Africa'], dtype=object)

In [7]:
order_group_2017 = main_df[main_df["order_year"]==2017].groupby(["market", "order_region"]).agg(
    total_order = ("order_item_quantity", "sum")
).reset_index()

order_group_2017

Unnamed: 0,market,order_region,total_order
0,Africa,Eastern Africa,439
1,Africa,Middle Africa,343
2,Africa,Northern Africa,735
3,Africa,Southern Africa,256
4,Africa,Western Africa,1022
5,Europe,Asiatic Russia/European Russia,451
6,Europe,Eastern Europe,600
7,Europe,Northern Europe,8541
8,Europe,Southern Europe,8121
9,Europe,Western Europe,23880


In [8]:
df_2017 = main_df[main_df["order_year"]==2017]

In [9]:
px.colors.qualitative.G10

['#3366CC',
 '#DC3912',
 '#FF9900',
 '#109618',
 '#990099',
 '#0099C6',
 '#DD4477',
 '#66AA00',
 '#B82E2E',
 '#316395']

In [10]:
def create_bar_region(dataframe):
    grouped = dataframe.groupby(["market", "order_region"]).agg(
        total_sales = ("sales", "sum")
    ).reset_index()

    grouped = grouped.sort_values(by="total_sales", ascending=False).reset_index()
    grouped["total_sales"] = round(grouped["total_sales"], 2)
    grouped = grouped.head(5)

    # format values in millions
    grouped['total_sales_formatted'] = (grouped['total_sales'] / 1000000).round(2).astype(str) + 'M'

    # assign unique colors to each market
    color_map = {'LATAM': '#3366CC', 'Europe': '#DC3912', 'Pacific Asia': '#FF9900', 'USCA': '#109618', 'Africa': '#990099'}

    fig = px.bar(grouped, x="total_sales", y="order_region", orientation="h", text="total_sales_formatted", color="market",
                    labels={"order_region":"", "total_sales":"Total Sales"}, template="plotly_dark",
                     color_discrete_map=color_map)

    fig.update_layout(yaxis={'categoryorder':'total ascending'})

    fig.update_layout(autosize=True,width=400,height=300)
    fig.update_layout(title="<b>Top 5 High-Performing Regions by Total Sales</b>",title_font_size=13)
    fig.update_layout({'plot_bgcolor': 'rgba(0, 0, 0, 0)','paper_bgcolor': 'rgba(0, 0, 0, 0)',})
    fig.update_xaxes(title_font=dict(size=12))
    fig.update_layout(xaxis_showgrid=False, yaxis_showgrid=False)
    fig.update_layout(legend_title_text='Market')
    

    return fig

In [11]:
bar1 = create_bar_region(df_2017)
bar1

In [12]:
def create_bar_region2(dataframe):
    grouped = dataframe.groupby(["market", "order_region"]).agg(
        total_sales = ("sales", "sum")
    ).reset_index()

    grouped = grouped.sort_values(by="total_sales", ascending=False).reset_index()
    grouped["total_sales"] = round(grouped["total_sales"], 2)
    grouped = grouped.tail(5)

    # format values in millions
    grouped['total_sales_formatted'] = (grouped['total_sales'] / 1000).round(2).astype(str) + 'K'

    # assign unique colors to each market
    color_map = {'LATAM': '#3366CC', 'Europe': '#DC3912', 'Pacific Asia': '#FF9900', 'USCA': '#109618', 'Africa': '#990099'}

    fig = px.bar(grouped, x="total_sales", y="order_region", orientation="h", text="total_sales_formatted", color="market",
                    labels={"order_region":"", "total_sales":"Total Sales"}, template="plotly_dark",
                     color_discrete_map=color_map)

    fig.update_layout(yaxis={'categoryorder':'total ascending'})

    fig.update_layout(autosize=True,width=400,height=300)
    fig.update_layout(title="<b>Bottom 5 Underperforming Regions by Total Sales</b>",title_font_size=13)
    fig.update_layout({'plot_bgcolor': 'rgba(0, 0, 0, 0)','paper_bgcolor': 'rgba(0, 0, 0, 0)',})
    fig.update_xaxes(title_font=dict(size=12))
    fig.update_layout(xaxis_showgrid=False, yaxis_showgrid=False)
    fig.update_layout(legend_title_text='Market')
    

    return fig

In [13]:
bar2 = create_bar_region2(df_2017)
bar2

In [14]:
main_df["category_name"].unique()

array(['Camping & Hiking', 'Water Sports', "Women's Apparel",
       "Men's Footwear", 'Indoor/Outdoor Games', 'Accessories', 'Cleats',
       'Trade-In', 'Cardio Equipment', 'Shop By Sport', 'Hockey',
       'Electronics', 'Fishing', 'Golf Balls', 'Lacrosse',
       'Baseball & Softball', 'Golf Gloves', "Girls' Apparel",
       'Fitness Accessories', 'Hunting & Shooting', 'Tennis & Racquet',
       'Golf Shoes', 'Golf Apparel', 'Boxing & MMA', "Men's Golf Clubs",
       "Kids' Golf Clubs", 'Soccer', "Women's Golf Clubs",
       'Golf Bags & Carts', 'Strength Training', 'As Seen on  TV!',
       'Basketball', 'Books ', 'Baby ', 'CDs ', 'Cameras ',
       "Children's Clothing", 'Computers', 'Consumer Electronics',
       'Crafts', 'DVDs', 'Garden', 'Health and Beauty', "Men's Clothing",
       'Music', 'Pet Supplies', 'Sporting Goods', 'Toys', 'Video Games',
       "Women's Clothing"], dtype=object)

In [15]:
df_2017.head(2)

Unnamed: 0,order_date,order_region,order_country,order_city,market,sales,order_item_quantity,shipping_date,days_for_shipping_real,days_for_shipment_scheduled,...,delivery_status,late_delivery_risk,shipping_mode,order_status,order_profit_per_order,sales_per_item,shipping_days_difference,order_year,order_month,order_year_month_date
125200,2017-01-01 00:33:00,Asiatic Russia/European Russia,Russia,Cheliábinsk,Europe,99.96,2,1/6/2017 0:33,5,2,...,Late delivery,1,Second Class,COMPLETE,-58.78,49.98,-3,2017,January,2017-01-01
125201,2017-01-01 00:33:00,Asiatic Russia/European Russia,Russia,Cheliábinsk,Europe,39.99,1,1/6/2017 0:33,5,2,...,Late delivery,1,Second Class,COMPLETE,15.98,39.99,-3,2017,January,2017-01-01


In [16]:
import plotly.graph_objs as go
from plotly.subplots import make_subplots

def create_bar_region_combined(dataframe):
    dataframe = dataframe[dataframe["order_region"]=="Northern America"]
    grouped = dataframe.groupby(["category_name"]).agg(
        total_sales=("sales", "sum")
    ).reset_index()

    grouped["total_sales"] = round(grouped["total_sales"], 2)

    # sort by total_sales and split into two groups
    grouped = grouped.sort_values(by="total_sales", ascending=False).reset_index()
    top_5 = grouped.head(5)
    bottom_5 = grouped.tail(5)

    # create a dictionary to map each category to a unique color
    category_colors = {
        'Camping & Hiking': '#1f77b4',
        'Water Sports': '#ff7f0e',
        "Women's Apparel": '#2ca02c',
        "Men's Footwear": '#d62728',
        'Indoor/Outdoor Games': '#9467bd',
        'Accessories': '#8c564b',
        'Cleats': '#e377c2',
        'Trade-In': '#7f7f7f',
        'Cardio Equipment': '#bcbd22',
        'Shop By Sport': '#17becf',
        'Hockey': '#ff5733',
        'Electronics': '#e74c3c',
        'Fishing': '#3498db',
        'Golf Balls': '#9b59b6',
        'Lacrosse': '#e67e22',
        'Baseball & Softball': '#34495e',
        'Golf Gloves': '#f1c40f',
        "Girls' Apparel": '#2ecc71',
        'Fitness Accessories': '#1abc9c',
        'Hunting & Shooting': '#95a5a6',
        'Tennis & Racquet': '#2c3e50',
        'Golf Shoes': '#bdc3c7',
        'Golf Apparel': '#d35400',
        'Boxing & MMA': '#7f8c8d',
        "Men's Golf Clubs": '#2980b9',
        "Kids' Golf Clubs": '#16a085',
        'Soccer': '#c0392b',
        "Women's Golf Clubs": '#f39c12',
        'Golf Bags & Carts': '#27ae60',
        'Strength Training': '#e67e22',
        'As Seen on  TV!': '#8e44ad',
        'Basketball': '#f39c12',
        'Books ': '#1abc9c',
        'Baby ': '#95a5a6',
        'CDs ': '#d35400',
        'Cameras ': '#bdc3c7',
        "Children's Clothing": '#9b59b6',
        'Computers': '#7f8c8d',
        'Consumer Electronics': '#2c3e50',
        'Crafts': '#27ae60',
        'DVDs': '#f1c40f',
        'Garden': '#17becf',
        'Health and Beauty': '#bcbd22',
        "Men's Clothing": '#e74c3c',
        'Music': '#8c564b',
        'Pet Supplies': '#2ecc71',
        'Sporting Goods': '#7f7f7f',
        'Toys': '#d62728',
        'Video Games': '#9467bd',
        "Women's Clothing": '#3498db'
    }

    # map category colors to the top_5 and bottom_5 dataframes
    top_5['color'] = top_5['category_name'].map(category_colors)
    bottom_5['color'] = bottom_5['category_name'].map(category_colors)

    # create the subplots
    fig = make_subplots(rows=2, cols=1, vertical_spacing=0.15, subplot_titles=(
        "Top 5 High-Performing Categories in Northern America", "Bottom 5 Underperforming Categories in Northern America"))

    # add the top 5 subplot
    fig.add_trace(go.Bar(x=top_5['total_sales'], y=top_5['category_name'], orientation='h',
                        text=top_5['total_sales'], name='', marker=dict(color=top_5['color'])),
                row=1, col=1)
    fig.update_yaxes(title='', categoryorder='total ascending', row=1, col=1)
    fig.update_xaxes(title='Total Sales', row=1, col=1, showgrid=False)

    # add the bottom 5 subplot
    fig.add_trace(go.Bar(x=bottom_5['total_sales'], y=bottom_5['category_name'], orientation='h',
                        text=bottom_5['total_sales'], name='', marker=dict(color=bottom_5['color'])),
                row=2, col=1)
    fig.update_yaxes(title='', categoryorder='total ascending', row=2, col=1)
    fig.update_xaxes(title='Total Sales', row=2, col=1, showgrid=False)

    # update the layout
    fig.update_layout(height=800, width=800, template='plotly_dark', showlegend=False)

    return fig


In [17]:
bar3 = create_bar_region_combined(df_2017)
bar3



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



In [18]:
dailySales = df_2017[df_2017["order_region"]=="Northern America"].copy().reset_index()
dailySales = dailySales.groupby(pd.Grouper(key="order_date", freq="D")).agg({"sales": "sum"}).reset_index()
dailySales["sales"] = dailySales["sales"].round()
# dailySales.rename(columns={"order_date": "date"}, inplace=True)
dailySales["order_date"] = pd.to_datetime(dailySales["order_date"]).dt.strftime("%Y-%m-%d")

dailySalesChart = alt.Chart(dailySales).mark_line().encode(
    x=alt.X("order_date:T", title="date"),
    y=alt.Y("sales:Q", title="Total Sales"),
).properties(
    title="Sales Over Time",
    width=700,
    height=300
)

dailySalesChart

In [19]:


daily_sales_chart = go.Figure()

daily_sales_chart.add_trace(
    go.Scatter(
        x=dailySales["order_date"],
        y=dailySales["sales"],
        mode="lines",
        name="Sales Over Time",
    )
)

daily_sales_chart.update_layout(
    title="Sales Over Time",
    xaxis_title="Date",
    yaxis_title="Total Sales",
    width=700,
    height=300,
    template="plotly_dark",
)

daily_sales_chart.show()


In [20]:
print(df_2017.head(2))

                order_date                    order_region order_country  \
125200 2017-01-01 00:33:00  Asiatic Russia/European Russia        Russia   
125201 2017-01-01 00:33:00  Asiatic Russia/European Russia        Russia   

         order_city  market  sales  order_item_quantity  shipping_date  \
125200  Cheliábinsk  Europe  99.96                    2  1/6/2017 0:33   
125201  Cheliábinsk  Europe  39.99                    1  1/6/2017 0:33   

        days_for_shipping_real  days_for_shipment_scheduled  ...  \
125200                       5                            2  ...   
125201                       5                            2  ...   

       delivery_status late_delivery_risk  shipping_mode order_status  \
125200   Late delivery                  1   Second Class     COMPLETE   
125201   Late delivery                  1   Second Class     COMPLETE   

       order_profit_per_order  sales_per_item  shipping_days_difference  \
125200                 -58.78           49.98   

In [21]:
df_2017.head(2)

Unnamed: 0,order_date,order_region,order_country,order_city,market,sales,order_item_quantity,shipping_date,days_for_shipping_real,days_for_shipment_scheduled,...,delivery_status,late_delivery_risk,shipping_mode,order_status,order_profit_per_order,sales_per_item,shipping_days_difference,order_year,order_month,order_year_month_date
125200,2017-01-01 00:33:00,Asiatic Russia/European Russia,Russia,Cheliábinsk,Europe,99.96,2,1/6/2017 0:33,5,2,...,Late delivery,1,Second Class,COMPLETE,-58.78,49.98,-3,2017,January,2017-01-01
125201,2017-01-01 00:33:00,Asiatic Russia/European Russia,Russia,Cheliábinsk,Europe,39.99,1,1/6/2017 0:33,5,2,...,Late delivery,1,Second Class,COMPLETE,15.98,39.99,-3,2017,January,2017-01-01


In [22]:
# An order is considered on-time and in-full if the 'shipping_days_difference' is equal to or greater than 0.
# If 'shipping_days_difference' >= 0, the order is delivered on-time and in-full (value 1).
# Otherwise, the order is not delivered on-time and in-full (value 0).
df_2017['on_time_in_full'] = (df_2017['shipping_days_difference'] >= 0).astype(int)

# Group the data by 'order_region' and calculate the total number of orders and the total number of on-time, in-full orders.
region_otif_data = df_2017.groupby('order_region')['on_time_in_full'].agg(['sum', 'count']).reset_index()

# Calculate the OTIF rate for each region by dividing the total number of on-time, in-full orders by the total number of orders, then multiply by 100 to get the percentage.
region_otif_data['otif_rate'] = (region_otif_data['sum'] / region_otif_data['count']) * 100

# Rename the columns for better readability
region_otif_data.columns = ['order_region', 'on_time_in_full_orders', 'total_orders', 'otif_rate']

value = str(round(region_otif_data[region_otif_data["order_region"]=="Southeastern Asia"]["otif_rate"].values[0], 2)) + " %"

value



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



'41.58 %'

In [37]:
avg_scheduled_shipping_time = df_2017.groupby(['order_region'])['days_for_shipping_real'].mean().reset_index()
avg_scheduled_shipping_time.loc[len(avg_scheduled_shipping_time)] = ['All Region', avg_scheduled_shipping_time['days_for_shipping_real'].mean()]
avg_scheduled_shipping_time.tail()

Unnamed: 0,order_region,days_for_shipping_real
17,Southern Europe,3.44
18,Western Africa,3.55
19,Western Asia,3.68
20,Western Europe,3.48
21,All Region,3.55


In [35]:
total_order = df_2017.groupby(['order_region'])['order_item_quantity'].sum().reset_index()
total_order.loc[len(total_order)] = ['All Region', total_order['order_item_quantity'].sum()]
total_order.tail()

Unnamed: 0,order_region,order_item_quantity
17,Southern Europe,8121
18,Western Africa,1022
19,Western Asia,1790
20,Western Europe,23880
21,All Region,106124


In [33]:
total_profit = df_2017.groupby(['order_region'])['order_profit_per_order'].sum().reset_index()
total_profit.loc[len(total_profit)] = ['All Region', total_profit['order_profit_per_order'].sum()]
total_profit.tail()

Unnamed: 0,order_region,order_profit_per_order
17,Southern Europe,105780.4
18,Western Africa,11529.25
19,Western Asia,15638.0
20,Western Europe,323060.82
21,All Region,1304085.11


In [34]:
total_sales = df_2017.groupby(['order_region'])['sales'].sum().reset_index()
total_sales.loc[len(total_sales)] = ['All Region', total_sales['sales'].sum()]
total_sales.tail()

Unnamed: 0,order_region,sales
17,Southern Europe,1032037.05
18,Western Africa,85422.6
19,Western Asia,158574.73
20,Western Europe,2984677.68
21,All Region,11808436.14


In [44]:
total_profit = df_2017.groupby(['order_region'])['order_profit_per_order'].sum().reset_index()
total_profit.loc[len(total_profit)] = ['All Region', total_profit['order_profit_per_order'].sum()]
profit_value = total_profit[total_profit["order_region"]=='All Region']["order_profit_per_order"].values[0]
formatted_profit = "${:,.2f}".format(profit_value)
profit = formatted_profit

In [45]:
total_profit

Unnamed: 0,order_region,order_profit_per_order
0,Asiatic Russia/European Russia,5232.24
1,Australia/New Zealand,17802.56
2,Caribbean,89061.07
3,Central America,325292.06
4,Central Asia,431.9
5,Eastern Africa,4018.65
6,Eastern Asia,30321.69
7,Eastern Europe,5618.26
8,Melanesia,101.23
9,Middle Africa,2384.41


In [46]:
profit

'$1,304,085.11'