PART 1: Clean Products Table

Load data

In [29]:
import pandas as pd
import numpy as np
import warnings
warnings.filterwarnings('ignore')

pd.set_option('display.max_columns', None)

products = pd.read_csv('../data/raw/olist_products_dataset.csv')
category_translation = pd.read_csv('../data/raw/product_category_name_translation.csv')

print("Products loaded:", products.shape)
print("Category translation loaded:", category_translation.shape)

Products loaded: (32951, 9)
Category translation loaded: (71, 2)


Check Missing Values

In [8]:
print("\n Missing values in products:")
print(products.isnull().sum())
print("\nMissing percentage:")
print((products.isnull().sum() / len(products) *100).round(2))


 Missing values in products:
product_id                      0
product_category_name         610
product_name_lenght           610
product_description_lenght    610
product_photos_qty            610
product_weight_g                2
product_length_cm               2
product_height_cm               2
product_width_cm                2
dtype: int64

Missing percentage:
product_id                    0.00
product_category_name         1.85
product_name_lenght           1.85
product_description_lenght    1.85
product_photos_qty            1.85
product_weight_g              0.01
product_length_cm             0.01
product_height_cm             0.01
product_width_cm              0.01
dtype: float64


Handle Missing Category Names

In [19]:
products_clean = products.copy()

products_clean['product_category_name'] = products_clean['product_category_name'].fillna('unknown')

print(f"Filled {products['product_category_name'].isnull().sum()} missing categories")

Filled 610 missing categories


Merge English Category Names

In [20]:
products_clean = products_clean.merge(
    category_translation,
    on='product_category_name',
    how='left'
)

products_clean['product_category_name_english'] = products_clean['product_category_name_english'].fillna('unkown')

print("\nTOP 10 Categories")
print(products_clean['product_category_name_english'].value_counts().head(10))


TOP 10 Categories
product_category_name_english
bed_bath_table           3029
sports_leisure           2867
furniture_decor          2657
health_beauty            2444
housewares               2335
auto                     1900
computers_accessories    1639
toys                     1411
watches_gifts            1329
telephony                1134
Name: count, dtype: int64


Calculate Product Volume

In [26]:
# Flag products with complete dimensions
products_clean['has_dimensions'] = (
    products_clean['product_weight_g'].notna() &
    products_clean['product_length_cm'].notna() &
    products_clean['product_height_cm'].notna() &
    products_clean['product_width_cm'].notna()
).astype(int)

# Calculate volume for products with dimensions
products_clean['product_volume_cm3'] = (
    products_clean['product_length_cm'] *
    products_clean['product_height_cm'] *
    products_clean['product_width_cm']
)

print(f"\nProducts with complete dimensions: {products_clean['has_dimensions'].sum():,}")
print(f"Percentage: {(products_clean['has_dimensions'].mean() * 100):.1f}%")


Products with complete dimensions: 32,949
Percentage: 100.0%


Save Products

In [28]:
products_clean.to_csv('../data/processed/products_clean.csv', index=False)
print("\nProducts saved: data/processed/products_clean.csv")


Products saved: data/processed/products_clean.csv


PART 2: CLEAN ORDER ITEMS TABLE

Load and Inspect

In [31]:
order_items = pd.read_csv('../data/raw/olist_order_items_dataset.csv')

print("\nOrder items shape:", order_items.shape)
print("\nMissing values:")
print(order_items.isnull().sum())


Order items shape: (112650, 7)

Missing values:
order_id               0
order_item_id          0
product_id             0
seller_id              0
shipping_limit_date    0
price                  0
freight_value          0
dtype: int64


Convert Datetime

In [32]:
order_items_clean = order_items.copy()

order_items_clean['shipping_limit_date'] = pd.to_datetime(
    order_items_clean['shipping_limit_date']
)

print("Datetime column converted")

Datetime column converted


Create Cost Metrics

In [37]:
# Total cost per item (price + fregight)
order_items_clean['total_item_cost'] = (
    order_items_clean['price'] +
    order_items_clean['freight_value']
)

# Freight as percentage of price
order_items_clean['freight_pct_of_price'] = (
    order_items_clean['freight_value'] /
    order_items_clean['price'] * 100
)

print("\nSample of calculated metrics:")
print(order_items_clean[['order_id', 'price', 'freight_value', 'total_item_cost', 'freight_pct_of_price']].head())

print("\nFreight statistics:")
print(f"Average freight as % of price: {order_items_clean['freight_pct_of_price'].mean():.1f}%")
print(f"Median: {order_items_clean['freight_pct_of_price'].median():.1f}%")


Sample of calculated metrics:
                           order_id   price  freight_value  total_item_cost  \
0  00010242fe8c5a6d1ba2dd792cb16214   58.90          13.29            72.19   
1  00018f77f2f0320c557190d7a144bdd3  239.90          19.93           259.83   
2  000229ec398224ef6ca0657da4fc703e  199.00          17.87           216.87   
3  00024acbcdf0a6daa1e931b038114c75   12.99          12.79            25.78   
4  00042b26cf59d7ce69dfabb4e55b4fd9  199.90          18.14           218.04   

   freight_pct_of_price  
0             22.563667  
1              8.307628  
2              8.979899  
3             98.460354  
4              9.074537  

Freight statistics:
Average freight as % of price: 32.1%
Median: 23.1%


Merge with Products

In [38]:
order_items_clean = order_items_clean.merge(
    products_clean,
    on='product_id',
    how='left'
)

print(f"\nOrder items with product details: {len(order_items_clean):,} rows")
print(f"Columns: {order_items_clean.shape[1]}")


Order items with product details: 112,650 rows
Columns: 20


Save Order Items

In [40]:
order_items_clean.to_csv('../data/processed/order_items_clean.csv', index=False)
print("Order items saved: ../data/processed/order_items_clean.csv")

Order items saved: ../data/processed/order_items_clean.csv


PART 3: CLEAN PAYMENTS TABLE 

Load and Inspect

In [42]:
order_payments = pd.read_csv('../data/raw/olist_order_payments_dataset.csv')

print("\nPayments shape:", order_payments.shape)
print("\nMissing values:")
print(order_payments.isnull().sum())

print("\nPayment type distribution:")
print(order_payments['payment_type'].value_counts())


Payments shape: (103886, 5)

Missing values:
order_id                0
payment_sequential      0
payment_type            0
payment_installments    0
payment_value           0
dtype: int64

Payment type distribution:
payment_type
credit_card    76795
boleto         19784
voucher         5775
debit_card      1529
not_defined        3
Name: count, dtype: int64


Aggregate to Order Level

Some orders have multiple payment methods. Aggregate to one row per order:

In [48]:
order_payments_agg = order_payments.groupby('order_id').agg({
    'payment_value': 'sum',
    'payment_installments': 'max',
    'payment_type': lambda x: ', '.join(x.unique())
}).reset_index()

order_payments_agg.columns = [
    'order_id', 
    'total_payment_value', 
    'max_installments', 
    'payment_methods'
]

print("\nAggregated payments:")
print(order_payments_agg.head())

multi_payment = (order_payments_agg['payment_methods'].str.contains(',')).sum()
print(f"\nOrders with multiple payment methods: {multi_payment:,}")


Aggregated payments:
                           order_id  total_payment_value  max_installments  \
0  00010242fe8c5a6d1ba2dd792cb16214                72.19                 2   
1  00018f77f2f0320c557190d7a144bdd3               259.83                 3   
2  000229ec398224ef6ca0657da4fc703e               216.87                 5   
3  00024acbcdf0a6daa1e931b038114c75                25.78                 2   
4  00042b26cf59d7ce69dfabb4e55b4fd9               218.04                 3   

  payment_methods  
0     credit_card  
1     credit_card  
2     credit_card  
3     credit_card  
4     credit_card  

Orders with multiple payment methods: 2,246


Save Payments

In [50]:
order_payments_agg.to_csv('../data/processed/order_payments_clean.csv', index=False)
print("Payments saved: ../data/processed/order_payments_clean.csv")

Payments saved: ../data/processed/order_payments_clean.csv


PART  4: BUSINESS INSIGHTS

Insight 1: Freoght Cost by Catgory

In [53]:
# Calculate average freight percentage by category
freight_by_category = order_items_clean.groupby('product_category_name_english').agg({
    'order_id': 'count',
    'price': 'mean',
    'freight_value': 'mean',
    'freight_pct_of_price': 'mean'
}).round(2)

freight_by_category.columns = [
    'total_orders',
    'avg_price',
    'avg_freight',
    'avg_freight_pct' 
]

freight_by_category = freight_by_category.sort_values(
    'avg_freight_pct',
    ascending=False 
)

print("\nTop 10 categories by freight percentage:")
print(freight_by_category.head(10))

# Find high freight categories
high_freight = freight_by_category[freight_by_category['avg_freight_pct'] > 30]
print(f"\nCategories with >30% freight cost: {len(high_freight)}")


Top 10 categories by freight percentage:
                                   total_orders  avg_price  avg_freight  \
product_category_name_english                                             
home_comfort_2                               30      25.34        13.68   
dvds_blu_ray                                 64      93.74        20.14   
electronics                                2767      57.91        16.83   
christmas_supplies                          153      57.52        21.11   
fashion_underwear_beach                     131      72.84        14.63   
flowers                                      33      33.64        14.81   
signaling_and_security                      199     108.09        32.70   
telephony                                  4545      71.21        15.67   
food_drink                                  278      54.60        16.22   
furniture_mattress_and_upholstery            38     114.95        42.91   

                                   avg_freight_pct  
prod

Insight 2: Products Without Dimensions

In [54]:
missing_dims = products_clean[products_clean['has_dimensions'] == 0]
missing_pct = (len(missing_dims) / len(products_clean)) * 100

print(f"\nProducts missing dimensions: {len(missing_dims):,} ({missing_pct:.1f}%)")

# Check if these products are frequently ordered
items_missing_dims = order_items_clean[
    order_items_clean['has_dimensions'] == 0
]

print(f"\nOrders for products with missing dimensions: {len(items_missing_dims):,}")
print(f"Percentage of all orders: {(len(items_missing_dims) / len(order_items_clean) * 100):.1f}%")


Products missing dimensions: 2 (0.0%)

Orders for products with missing dimensions: 18
Percentage of all orders: 0.0%


Insight 3: Payment Method Trends

In [56]:
# Payment type breakdown
payment_breakdown = order_payments.groupby('payment_type').agg({
    'order_id': 'count',
    'payment_value': ['sum', 'mean'] 
}).round(2)

payment_breakdown.columns = ['order_count', 'total_value', 'avg_value']
payment_breakdown = payment_breakdown.sort_values('order_count', ascending=False)

print(f"\nPayment method analysis:")
print(payment_breakdown)

# Installment analysis
installment_stats = order_payments[order_payments['payment_installments'] > 1]
print(f"\nOrders using installments: {len(installment_stats):,}")
print(f"Average installments: {installment_stats['payment_installments'].mean():.1f}")


Payment method analysis:
              order_count  total_value  avg_value
payment_type                                     
credit_card         76795  12542084.19     163.32
boleto              19784   2869361.27     145.03
voucher              5775    379436.87      65.70
debit_card           1529    217989.79     142.57
not_defined             3         0.00       0.00

Orders using installments: 51,338
Average installments: 4.8


SUMMARY and VALIDATION

Final Data Quality Check

In [57]:
import os

print("\nFinal cleaned files:")
for file in['products_clean.csv', 'order_items_clean.csv', 'order_payments_clean.csv']:
    path = f'../data/processed/{file}' 
    if os.path.exists(path):
        df = pd.read_csv(path)
        print("\n{file}:")
        print(f"    Rows: {len(df):,}")
        print(f"    Columns: {df.shape[1]}")
        print(f"    Missing cells: {df.isnull().sum().sum():,}")


Final cleaned files:

{file}:
    Rows: 32,951
    Columns: 12
    Missing cells: 1,840

{file}:
    Rows: 112,650
    Columns: 20
    Missing cells: 4,899

{file}:
    Rows: 99,440
    Columns: 4
    Missing cells: 0


Key Findings Summary

In [58]:
print("\nDAY 3 KEY FINDINGS:")
print("\n1. PRODUCTS:")
print(f"   - {products['product_category_name'].isnull().sum()} products had missing categories")
print(f"   - {(products_clean['has_dimensions'].mean() * 100):.1f}% have complete dimensions")
print(f"   - {products_clean['product_category_name_english'].nunique()} unique categories")

print("\n2. ORDER ITEMS:")
print(f"   - Average freight: {order_items_clean['freight_pct_of_price'].mean():.1f}% of price")
print(f"   - Highest freight category: {freight_by_category.index[0]}")

print("\n3. PAYMENTS:")
print(f"   - {multi_payment:,} orders used multiple payment methods")
print(f"   - Most common payment: {order_payments['payment_type'].value_counts().index[0]}")


DAY 3 KEY FINDINGS:

1. PRODUCTS:
   - 610 products had missing categories
   - 100.0% have complete dimensions
   - 72 unique categories

2. ORDER ITEMS:
   - Average freight: 32.1% of price
   - Highest freight category: home_comfort_2

3. PAYMENTS:
   - 2,246 orders used multiple payment methods
   - Most common payment: credit_card
