In [43]:
import kagglehub
import shutil
import os
import pandas as pd

### Download

In [46]:
# Download the dataset; kagglehub returns the local directory holding the files.
path = kagglehub.dataset_download("olistbr/brazilian-ecommerce")

# Copy (instead of move) the files into ./data.
# shutil.move() places the source *inside* the destination when the destination
# is an existing directory, so re-running this cell would nest the download
# under ./data/<version>/ and the CSV reads below would no longer find the files.
# copytree(..., dirs_exist_ok=True) merges into ./data on every run and leaves
# the directory returned by kagglehub untouched.
shutil.copytree(path, "./data", dirs_exist_ok=True)

print("Files Downloaded to:", "./data")

Resuming download from 26214400 bytes (18503180 bytes left)...
Resuming download from https://www.kaggle.com/api/v1/datasets/download/olistbr/brazilian-ecommerce?dataset_version_number=2 (26214400/44717580) bytes left.


100%|██████████| 42.6M/42.6M [00:12<00:00, 1.43MB/s]

Extracting files...





Files Downloaded to: ./data


### ETL

In [47]:
# Directory that holds the downloaded CSV files; reused by the cells below.
data_path = "./data"

# Names of the entries found in the data directory.
files = os.listdir(path=data_path)

In [48]:
# Display the names returned by os.listdir(data_path).
files

['olist_customers_dataset.csv',
 'olist_geolocation_dataset.csv',
 'olist_orders_dataset.csv',
 'olist_order_items_dataset.csv',
 'olist_order_payments_dataset.csv',
 'olist_order_reviews_dataset.csv',
 'olist_products_dataset.csv',
 'olist_sellers_dataset.csv',
 'product_category_name_translation.csv']

In [49]:
def _read_olist_csv(filename):
    """Read one CSV file located under ``data_path`` into a DataFrame."""
    return pd.read_csv(f"{data_path}/{filename}")


# One DataFrame per Olist table.
customers_dataset = _read_olist_csv("olist_customers_dataset.csv")
geolocation_dataset = _read_olist_csv("olist_geolocation_dataset.csv")
orders_dataset = _read_olist_csv("olist_orders_dataset.csv")
order_items_dataset = _read_olist_csv("olist_order_items_dataset.csv")
order_payments_dataset = _read_olist_csv("olist_order_payments_dataset.csv")
order_reviews_dataset = _read_olist_csv("olist_order_reviews_dataset.csv")
products_dataset = _read_olist_csv("olist_products_dataset.csv")
sellers_dataset = _read_olist_csv("olist_sellers_dataset.csv")

##### Creating the column 'geolocation_state_brazil'

In [50]:
# Two-letter state code (UF) -> full state name.
uf_to_state = {
    'AC': 'Acre',
    'AL': 'Alagoas',
    'AP': 'Amapá',
    'AM': 'Amazonas',
    'BA': 'Bahia',
    'CE': 'Ceará',
    'DF': 'Distrito Federal',
    'ES': 'Espírito Santo',
    'GO': 'Goiás',
    'MA': 'Maranhão',
    'MT': 'Mato Grosso',
    'MS': 'Mato Grosso do Sul',
    'MG': 'Minas Gerais',
    'PA': 'Pará',
    'PB': 'Paraíba',
    'PR': 'Paraná',
    'PE': 'Pernambuco',
    'PI': 'Piauí',
    'RJ': 'Rio de Janeiro',
    'RN': 'Rio Grande do Norte',
    'RS': 'Rio Grande do Sul',
    'RO': 'Rondônia',
    'RR': 'Roraima',
    'SC': 'Santa Catarina',
    'SP': 'São Paulo',
    'SE': 'Sergipe',
    'TO': 'Tocantins',
}

# Translate each row's state code to its full name, then prefix it with the
# country so the value reads "BRAZIL-<state name>".
state_names = geolocation_dataset['geolocation_state'].map(uf_to_state)
geolocation_dataset['geolocation_state_brazil'] = 'BRAZIL-' + state_names

# Write the enriched table back over the CSV it was loaded from.
geolocation_dataset.to_csv(f"{data_path}/olist_geolocation_dataset.csv", index=False)

### Description
Olist follows a relational database model. Here is a basic map of the connections:

olist_orders_dataset.csv (Orders) is the central table.

olist_customers_dataset.csv (Customers) connects via customer_id.

olist_order_items_dataset.csv (Order Items) connects via order_id.

olist_order_payments_dataset.csv (Payments) connects via order_id.

olist_order_reviews_dataset.csv (Reviews) connects via order_id.

olist_products_dataset.csv (Products) connects to Order Items via product_id.

olist_sellers_dataset.csv (Sellers) connects to Order Items via seller_id.

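olist_geolocation_dataset.csv (Geolocation) can be linked to Customers and Sellers by zip code prefix (geolocation_zip_code_prefix matches customer_zip_code_prefix / seller_zip_code_prefix).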

## Dashboard

In [41]:
import dash
from dash import dcc, html
import pandas as pd
import plotly.express as px

Data processing

In [None]:
# Number of orders per order status, as a two-column frame (status, count).
order_status_counts = (
    orders_dataset["order_status"]
    .value_counts()
    .rename_axis("status")
    .reset_index(name="count")
)

# Number of payment records per payment type (payment_type, count).
payment_type_counts = (
    order_payments_dataset["payment_type"]
    .value_counts()
    .rename_axis("payment_type")
    .reset_index(name="count")
)

In [58]:
# Parse the purchase timestamp once and store it back on the orders table,
# together with its monthly period.
purchase_ts = pd.to_datetime(orders_dataset["order_purchase_timestamp"])
orders_dataset["order_purchase_timestamp"] = purchase_ts
orders_dataset["year_month"] = purchase_ts.dt.to_period("M")

# Orders per month in chronological order (year_month, count).
orders_by_month = (
    orders_dataset["year_month"]
    .value_counts()
    .sort_index()
    .rename_axis("year_month")
    .reset_index(name="count")
)

In [60]:
# Attach product attributes to every order item and add the item total
# (item price plus freight).
df_orders_items = (
    pd.merge(order_items_dataset, products_dataset, on="product_id")
    .assign(total_price=lambda items: items["price"] + items["freight_value"])
)

# Summed total per product category, then the ten largest.
revenue_by_category = (
    df_orders_items
    .groupby("product_category_name", as_index=False)["total_price"]
    .sum()
)
top_categories = revenue_by_category.nlargest(n=10, columns="total_price")

In [None]:
# Payment records joined with their order.
df_payments = pd.merge(order_payments_dataset, orders_dataset, on="order_id")

# Mean payment_value of the joined payment rows, per payment type.
avg_ticket = (
    df_payments
    .groupby("payment_type", as_index=False)["payment_value"]
    .mean()
)

In [67]:
# Orders joined with the customer that placed them.
df_orders_customers = pd.merge(orders_dataset, customers_dataset, on="customer_id")

# Whole days elapsed between purchase and delivery to the customer.
delivered_at = pd.to_datetime(df_orders_customers["order_delivered_customer_date"])
purchased_at = pd.to_datetime(df_orders_customers["order_purchase_timestamp"])
df_orders_customers["delivery_time"] = (delivered_at - purchased_at).dt.days

# Mean delivery time per customer state.
delivery_by_state = (
    df_orders_customers
    .groupby("customer_state", as_index=False)["delivery_time"]
    .mean()
)

Creating the charts

In [81]:
# Bar chart: number of orders per status.
fig_orders = px.bar(
    data_frame=order_status_counts,
    x="status",
    y="count",
    title="Status dos Pedidos",
)

# Pie chart: share of each payment type.
fig_payments = px.pie(
    data_frame=payment_type_counts,
    names="payment_type",
    values="count",
    title="Métodos de Pagamento",
)

# Bar chart: the ten categories with the highest summed total.
fig_revenue = px.bar(
    data_frame=top_categories,
    x="product_category_name",
    y="total_price",
    title="Top 10 Categorias por Receita",
    labels={"product_category_name": "Categoria"},
)

# Horizontal bar chart: mean payment value per payment type.
fig_ticket = px.bar(
    data_frame=avg_ticket,
    x="payment_value",
    y="payment_type",
    orientation="h",
    title="Ticket Médio por Método de Pagamento",
    labels={"payment_value": "Valor Médio (R$)"},
)
                    

Creating the Dash app

In [82]:
# Address of the development server. The browser URL below is built from the
# same two values, so the page that opens always matches where the server listens.
DASH_HOST = "127.0.0.1"
DASH_PORT = 8050

app = dash.Dash(__name__)

# Page layout: a title followed by the four figures, stacked vertically.
app.layout = html.Div([
    html.H1("Dashboard de E-Commerce"),
    dcc.Graph(figure=fig_orders),
    dcc.Graph(figure=fig_payments),
    dcc.Graph(figure=fig_revenue),
    dcc.Graph(figure=fig_ticket),
])

# Run the server and open the browser
if __name__ == "__main__":
    from threading import Timer
    import webbrowser

    def open_browser():
        """Open the dashboard URL in a new browser window."""
        webbrowser.open_new(f"http://{DASH_HOST}:{DASH_PORT}/")

    # The server call below blocks, so the browser is opened from a timer
    # thread one second after this point.
    Timer(1, open_browser).start()

    # Dash 3.0 removed `run_server` in favour of `run`; fall back to
    # `run_server` only on older Dash releases that lack `run`.
    run_dash = app.run if hasattr(app, "run") else app.run_server
    run_dash(host=DASH_HOST, port=DASH_PORT, debug=False)