Cell 1: Imports & Setup

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')

pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', 100)

print("✅ Libraries imported successfully!")
print(f"Pandas version: {pd.__version__}")
print(f"NumPy version: {np.__version__}")
print(f"Matplotlib version: {plt.matplotlib.__version__}")
print(f"Seaborn version: {sns.__version__}")

✅ Libraries imported successfully!
Pandas version: 2.3.3
NumPy version: 2.4.0
Matplotlib version: 3.10.8
Seaborn version: 0.13.2


Cell 2: Load All 9 CSV Files

In [3]:
print('Loading Datasets...')

customers = pd.read_csv('../data/raw/olist_customers_dataset.csv')
geolocation = pd.read_csv('../data/raw/olist_geolocation_dataset.csv')
order_items = pd.read_csv('../data/raw/olist_order_items_dataset.csv')
order_payments = pd.read_csv('../data/raw/olist_order_payments_dataset.csv')
order_reviews = pd.read_csv('../data/raw/olist_order_reviews_dataset.csv')
orders = pd.read_csv('../data/raw/olist_orders_dataset.csv')
products = pd.read_csv('../data/raw/olist_products_dataset.csv')
sellers = pd.read_csv('../data/raw/olist_sellers_dataset.csv')
category_translations = pd.read_csv('../data/raw/product_category_name_translation.csv')

print("All datasets loaded sucessfully!!!")
print(f"n\ Dataset sizes:")
print(f"Customers: {customers.shape}")
print(f"Geolocation: {geolocation.shape}")
print(f"Order Items: {order_items.shape}")
print(f"Order Payments: {order_payments.shape}")
print(f"Order Reviews: {order_reviews.shape}")
print(f"Orders: {orders.shape}")
print(f"Products: {products.shape}")
print(f"Sellers: {sellers.shape}")
print(f"Category Translations: {category_translations.shape}")

Loading Datasets...
All datasets loaded sucessfully!!!
n\ Dataset sizes:
Customers: (99441, 5)
Geolocation: (1000163, 5)
Order Items: (112650, 7)
Order Payments: (103886, 5)
Order Reviews: (99224, 7)
Orders: (99441, 8)
Products: (32951, 9)
Sellers: (3095, 4)
Category Translations: (71, 2)


Cell 3: Explore ORDERS Table (Main Table)

In [4]:
print("=" * 70)
print("ORDERS DATASET EXPLORATION")
print("=" * 70)

print("\nBASIC INFO:")
print(orders.info())

print("\nFirst 5 Rows:")
print(orders.head(5))

print("\nColumn Names:")
print(orders.columns.tolist())

print("\nMissing Values:")
print(orders.isnull().sum())

print("\nOrder Status Distribution:")
print(orders['order_status'].value_counts())

print("\nDATA RANGE:")
print(f"Earliest Order: {orders['order_purchase_timestamp'].min()}")
print(f"Latest Order: {orders['order_purchase_timestamp'].max()}")

ORDERS DATASET EXPLORATION

BASIC INFO:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 99441 entries, 0 to 99440
Data columns (total 8 columns):
 #   Column                         Non-Null Count  Dtype 
---  ------                         --------------  ----- 
 0   order_id                       99441 non-null  object
 1   customer_id                    99441 non-null  object
 2   order_status                   99441 non-null  object
 3   order_purchase_timestamp       99441 non-null  object
 4   order_approved_at              99281 non-null  object
 5   order_delivered_carrier_date   97658 non-null  object
 6   order_delivered_customer_date  96476 non-null  object
 7   order_estimated_delivery_date  99441 non-null  object
dtypes: object(8)
memory usage: 6.1+ MB
None

First 5 Rows:
                           order_id                       customer_id  \
0  e481f51cbdc54678b7cc49136f2d6af7  9ef432eb6251297304e76186b10a928d   
1  53cdb2fc8bc7dce0b6741e2150273451  b0830fb4747a6c6d20d

Cell 4: Explore ORDER ITEMS Table 

In [11]:
print("=" * 70)
print("ORDER ITEMS DATASET EXPLORATION")
print("=" * 70)

print("\nBasic Info:")
print(order_items.info())

print("\nFirst 5 rows:")
print(order_items.head())

print("\nMissing values:")
print(order_items.isnull().sum())

print("\nPrice statistics:")
print(order_items['price'].describe())

print("\nFreight (shipping) statistics:")
print(order_items['freight_value'].describe())

ORDER ITEMS DATASET EXPLORATION

Basic Info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 112650 entries, 0 to 112649
Data columns (total 7 columns):
 #   Column               Non-Null Count   Dtype  
---  ------               --------------   -----  
 0   order_id             112650 non-null  object 
 1   order_item_id        112650 non-null  int64  
 2   product_id           112650 non-null  object 
 3   seller_id            112650 non-null  object 
 4   shipping_limit_date  112650 non-null  object 
 5   price                112650 non-null  float64
 6   freight_value        112650 non-null  float64
dtypes: float64(2), int64(1), object(4)
memory usage: 6.0+ MB
None

First 5 rows:
                           order_id  order_item_id  \
0  00010242fe8c5a6d1ba2dd792cb16214              1   
1  00018f77f2f0320c557190d7a144bdd3              1   
2  000229ec398224ef6ca0657da4fc703e              1   
3  00024acbcdf0a6daa1e931b038114c75              1   
4  00042b26cf59d7ce69dfabb4e55b4fd

CELL 5: Explore CUSTOMERS Table

In [12]:
print("=" * 70)
print("CUSTOMERS DATASET EXPLORATION")
print("=" * 70)

print("\nBasic Info:")
print(customers.info())

print("\nFirst 5 rows:")
print(customers.head())

print("\nMissing values:")
print(customers.isnull().sum())

print("\nTop 10 cities:")
print(customers['customer_city'].value_counts().head(10))

print("\nState distribution:")
print(customers['customer_state'].value_counts())

CUSTOMERS DATASET EXPLORATION

Basic Info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 99441 entries, 0 to 99440
Data columns (total 5 columns):
 #   Column                    Non-Null Count  Dtype 
---  ------                    --------------  ----- 
 0   customer_id               99441 non-null  object
 1   customer_unique_id        99441 non-null  object
 2   customer_zip_code_prefix  99441 non-null  int64 
 3   customer_city             99441 non-null  object
 4   customer_state            99441 non-null  object
dtypes: int64(1), object(4)
memory usage: 3.8+ MB
None

First 5 rows:
                        customer_id                customer_unique_id  \
0  06b8999e2fba1a1fbc88172c00ba8bc7  861eff4711a542e4b93843c6dd7febb0   
1  18955e83d337fd6b2def6b18a428ac77  290c77bc529b7ac935b93aa66c333dc3   
2  4e7b3e00288586ebd08712fdd0374a03  060e732b5b29e8181a18229c7b0b2b5e   
3  b2b6027bc5c5109e529d4dc6358b12c3  259dac757896d24d7702b9acbbff3f3c   
4  4f2d8ab171c80ec8364f7c12e35b23ad 

CELL 6: Explore PRODUCTS Table

In [13]:
print("=" * 70)
print("PRODUCTS DATASET EXPLORATION")
print("=" * 70)

print("\nBasic Info:")
print(products.info())

print("\nFirst 5 rows:")
print(products.head())

print("\nMissing values:")
print(products.isnull().sum())

print(f"\nMissing percentage:")
print((products.isnull().sum() / len(products) * 100).round(2))

print("\nTop 10 categories (Portuguese names):")
print(products['product_category_name'].value_counts().head(10))

PRODUCTS DATASET EXPLORATION

Basic Info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 32951 entries, 0 to 32950
Data columns (total 9 columns):
 #   Column                      Non-Null Count  Dtype  
---  ------                      --------------  -----  
 0   product_id                  32951 non-null  object 
 1   product_category_name       32341 non-null  object 
 2   product_name_lenght         32341 non-null  float64
 3   product_description_lenght  32341 non-null  float64
 4   product_photos_qty          32341 non-null  float64
 5   product_weight_g            32949 non-null  float64
 6   product_length_cm           32949 non-null  float64
 7   product_height_cm           32949 non-null  float64
 8   product_width_cm            32949 non-null  float64
dtypes: float64(7), object(2)
memory usage: 2.3+ MB
None

First 5 rows:
                         product_id  product_category_name  \
0  1e9e8ef04dbcff4541ed26657ea517e5             perfumaria   
1  3aa071139cb16b67ca9e5dea

CELL 7: Overall Data Quality Check

In [14]:
print("=" * 70)
print("DATA QUALITY SUMMARY - ALL TABLES")
print("=" * 70)

datasets = {
    'customers': customers,
    'orders': orders,
    'order_items': order_items,
    'order_payments': order_payments,
    'products': products,
    'sellers': sellers
}

for name, df in datasets.items():
    total_missing = df.isnull().sum().sum()
    total_cells = df.shape[0] * df.shape[1]
    missing_pct = (total_missing / total_cells * 100)
    
    print(f"\n{name.upper()}:")
    print(f"  Rows: {df.shape[0]:,}")
    print(f"  Columns: {df.shape[1]}")
    print(f"  Missing cells: {total_missing:,} ({missing_pct:.2f}%)")
    print(f"  Duplicates: {df.duplicated().sum():,}")

DATA QUALITY SUMMARY - ALL TABLES

CUSTOMERS:
  Rows: 99,441
  Columns: 5
  Missing cells: 0 (0.00%)
  Duplicates: 0

ORDERS:
  Rows: 99,441
  Columns: 8
  Missing cells: 4,908 (0.62%)
  Duplicates: 0

ORDER_ITEMS:
  Rows: 112,650
  Columns: 7
  Missing cells: 0 (0.00%)
  Duplicates: 0

ORDER_PAYMENTS:
  Rows: 103,886
  Columns: 5
  Missing cells: 0 (0.00%)
  Duplicates: 0

PRODUCTS:
  Rows: 32,951
  Columns: 9
  Missing cells: 2,448 (0.83%)
  Duplicates: 0

SELLERS:
  Rows: 3,095
  Columns: 4
  Missing cells: 0 (0.00%)
  Duplicates: 0
