In [5]:
from pathlib import Path

cwd = Path.cwd().resolve()
print("CWD:", cwd)

raw = Path("../data/raw").resolve()
print("RAW exists:", raw.exists())
print("RAW path:", raw)



CWD: /home/danyak/olist-ecommerce-analytics/notebooks
RAW exists: True
RAW path: /home/danyak/olist-ecommerce-analytics/data/raw


In [6]:
from pathlib import Path

raw = Path("../data/raw").resolve()
files = sorted([p.name for p in raw.glob("*.csv")])

print("CSV files count:", len(files))
files



CSV files count: 9


['olist_customers_dataset.csv',
 'olist_geolocation_dataset.csv',
 'olist_order_items_dataset.csv',
 'olist_order_payments_dataset.csv',
 'olist_order_reviews_dataset.csv',
 'olist_orders_dataset.csv',
 'olist_products_dataset.csv',
 'olist_sellers_dataset.csv',
 'product_category_name_translation.csv']

In [7]:
import pandas as pd
from pathlib import Path

raw = Path("../data/raw").resolve()
orders = pd.read_csv(raw / "olist_orders_dataset.csv")

print("orders shape:", orders.shape)
orders.head(3)


orders shape: (99441, 8)


Unnamed: 0,order_id,customer_id,order_status,order_purchase_timestamp,order_approved_at,order_delivered_carrier_date,order_delivered_customer_date,order_estimated_delivery_date
0,e481f51cbdc54678b7cc49136f2d6af7,9ef432eb6251297304e76186b10a928d,delivered,2017-10-02 10:56:33,2017-10-02 11:07:15,2017-10-04 19:55:00,2017-10-10 21:25:13,2017-10-18 00:00:00
1,53cdb2fc8bc7dce0b6741e2150273451,b0830fb4747a6c6d20dea0b8c802d7ef,delivered,2018-07-24 20:41:37,2018-07-26 03:24:27,2018-07-26 14:31:00,2018-08-07 15:27:45,2018-08-13 00:00:00
2,47770eb9100c2d0c44946d9cf07ec65d,41ce2a54c0b03bf3443c3d931a367089,delivered,2018-08-08 08:38:49,2018-08-08 08:55:23,2018-08-08 13:50:00,2018-08-17 18:06:29,2018-09-04 00:00:00


In [8]:
import pandas as pd
from pathlib import Path

raw = Path("../data/raw").resolve()

customers = pd.read_csv(raw / "olist_customers_dataset.csv")

orders = pd.read_csv(
    raw / "olist_orders_dataset.csv",
    parse_dates=[
        "order_purchase_timestamp",
        "order_approved_at",
        "order_delivered_carrier_date",
        "order_delivered_customer_date",
        "order_estimated_delivery_date",
    ],
)

items = pd.read_csv(raw / "olist_order_items_dataset.csv")
payments = pd.read_csv(raw / "olist_order_payments_dataset.csv")

reviews = pd.read_csv(
    raw / "olist_order_reviews_dataset.csv",
    parse_dates=["review_creation_date", "review_answer_timestamp"],
)

products = pd.read_csv(raw / "olist_products_dataset.csv")
cat_tr = pd.read_csv(raw / "product_category_name_translation.csv")


In [9]:
tables = {
    "customers": customers,
    "orders": orders,
    "items": items,
    "payments": payments,
    "reviews": reviews,
    "products": products,
    "cat_tr": cat_tr,
}

for name, df in tables.items():
    print(f"{name:10s} rows={len(df):>8} cols={df.shape[1]:>3}")


customers  rows=   99441 cols=  5
orders     rows=   99441 cols=  8
items      rows=  112650 cols=  7
payments   rows=  103886 cols=  5
reviews    rows=   99224 cols=  7
products   rows=   32951 cols=  9
cat_tr     rows=      71 cols=  2


In [10]:
for name in ["orders", "items", "payments", "reviews", "customers", "products"]:
    df = tables[name]
    print("\n", name.upper())
    print(list(df.columns))



 ORDERS
['order_id', 'customer_id', 'order_status', 'order_purchase_timestamp', 'order_approved_at', 'order_delivered_carrier_date', 'order_delivered_customer_date', 'order_estimated_delivery_date']

 ITEMS
['order_id', 'order_item_id', 'product_id', 'seller_id', 'shipping_limit_date', 'price', 'freight_value']

 PAYMENTS
['order_id', 'payment_sequential', 'payment_type', 'payment_installments', 'payment_value']

 REVIEWS
['review_id', 'order_id', 'review_score', 'review_comment_title', 'review_comment_message', 'review_creation_date', 'review_answer_timestamp']

 CUSTOMERS
['customer_id', 'customer_unique_id', 'customer_zip_code_prefix', 'customer_city', 'customer_state']

 PRODUCTS
['product_id', 'product_category_name', 'product_name_lenght', 'product_description_lenght', 'product_photos_qty', 'product_weight_g', 'product_length_cm', 'product_height_cm', 'product_width_cm']


In [11]:
print("orders: order_id unique? ->", orders["order_id"].is_unique)
print("orders: unique order_id ->", orders["order_id"].nunique(), "rows ->", len(orders))

print("items: unique order_id ->", items["order_id"].nunique(), "rows ->", len(items))
print("payments: unique order_id ->", payments["order_id"].nunique(), "rows ->", len(payments))
print("reviews: unique order_id ->", reviews["order_id"].nunique(), "rows ->", len(reviews))

print("customers: customer_id unique? ->", customers["customer_id"].is_unique)
print("customers: customer_unique_id unique? ->", customers["customer_unique_id"].nunique())


orders: order_id unique? -> True
orders: unique order_id -> 99441 rows -> 99441
items: unique order_id -> 98666 rows -> 112650
payments: unique order_id -> 99440 rows -> 103886
reviews: unique order_id -> 98673 rows -> 99224
customers: customer_id unique? -> True
customers: customer_unique_id unique? -> 96096


In [12]:
def na_share(df, cols):
    return df[cols].isna().mean().sort_values(ascending=False)

na_share(
    orders,
    [
        "order_approved_at",
        "order_delivered_carrier_date",
        "order_delivered_customer_date",
        "order_estimated_delivery_date",
    ],
)


order_delivered_customer_date    0.029817
order_delivered_carrier_date     0.017930
order_approved_at                0.001609
order_estimated_delivery_date    0.000000
dtype: float64

In [13]:
na_share(reviews, ["review_score", "review_creation_date", "review_answer_timestamp"])


review_score               0.0
review_creation_date       0.0
review_answer_timestamp    0.0
dtype: float64

In [15]:
orders["order_status"].value_counts(dropna=False)
print("min purchase:", orders["order_purchase_timestamp"].min())
print("max purchase:", orders["order_purchase_timestamp"].max())
orders["purchase_month"] = orders["order_purchase_timestamp"].dt.to_period("M").astype(str)
orders["purchase_month"].value_counts().sort_index()


min purchase: 2016-09-04 21:15:19
max purchase: 2018-10-17 17:30:18


purchase_month
2016-09       4
2016-10     324
2016-12       1
2017-01     800
2017-02    1780
2017-03    2682
2017-04    2404
2017-05    3700
2017-06    3245
2017-07    4026
2017-08    4331
2017-09    4285
2017-10    4631
2017-11    7544
2017-12    5673
2018-01    7269
2018-02    6728
2018-03    7211
2018-04    6939
2018-05    6873
2018-06    6167
2018-07    6292
2018-08    6512
2018-09      16
2018-10       4
Name: count, dtype: int64

In [16]:
s = orders["order_purchase_timestamp"]
print("count:", s.notna().sum())
print("min:", s.min())
print("median:", s.median())
print("max:", s.max())


count: 99441
min: 2016-09-04 21:15:19
median: 2018-01-18 23:04:36
max: 2018-10-17 17:30:18
