# Data Setup & Imports 

In [9]:
import pandas as pd 
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings

# ConfiguraciÃ³n de pandas
pd.set_option('display.max_columns', None)  # Mostrar todas las columnas
pd.set_option('display.max_rows', 100)      # MÃ¡ximo 100 filas
pd.set_option('display.float_format', '{:.2f}'.format)  # 2 decimales

# Suprimir warnings
warnings.filterwarnings('ignore')

## Load csvs

In [10]:
customerAddress = pd.read_csv("../data/raw/200K_CustomerAddress.csv")
generalOrder= pd.read_csv("../data/raw/200K_GeneralOrderDetail.csv")
individualCustomer = pd.read_csv("../data/raw/200K_IndividualCustomer.csv")
ordersList = pd.read_csv("../data/raw/200K_OrdersList.csv")
productOrderDetail = pd.read_csv("../data/raw/200K_ProductOrderDetail.csv")
productCatalog = pd.read_csv("../data/raw/Product_Catalog.csv")
print("Loaded 6 datasets successfully")


Loaded 6 datasets successfully


# 01. Baseline snapshot

In [11]:
datasets = {
    'Customer Address': customerAddress,
    'General Order': generalOrder,
    'Individual Customer': individualCustomer,
    'Orders List': ordersList,
    'Product Catalog': productCatalog,
    'Product Order Detail': productOrderDetail

}

# baselines stats 
baseline_stats={}

for name, df in datasets.items():
    total_values = df.shape[0] * df.shape[1]
    total_nulls = df.isnull().sum().sum()
    missing_pct= (total_nulls/total_values)*100
    baseline_stats[name] = {
        'rows_before': df.shape[0],
        'columns_before': df.shape[1],
        'memory_before': df.memory_usage(deep=True).sum() / 1024**2,
        'duplicates_before': df.duplicated().sum(),
        'missing_pct_before': missing_pct
    }

baseline_stats_df = pd.DataFrame.from_dict(baseline_stats, orient='index')
baseline_stats_df

Unnamed: 0,rows_before,columns_before,memory_before,duplicates_before,missing_pct_before
Customer Address,221470,25,356.58,33,15.98
General Order,67934,46,122.98,0,41.42
Individual Customer,178494,53,386.65,0,52.74
Orders List,67831,40,171.62,0,21.07
Product Catalog,7158,6,1.95,0,3.62
Product Order Detail,87610,108,249.21,1,57.46


# 02. Fixing column names 

In [12]:
# Fix column name typos in Customer Address
customerAddress.rename(columns={
    'Cretaed_Timestamp': 'Created_Timestamp',
    'Updaqted_Timestamp': 'Updated_Timestamp'
}, inplace=True)

# Fix column name typos in General Order
generalOrder.rename(columns={
    'Cretaed_Timestamp': 'Created_Timestamp',
    'Updaqted_Timestamp': 'Updated_Timestamp'
}, inplace=True)

print("âœ“ Column names fixed successfully")
print(f"  - Customer Address: {[col for col in customerAddress.columns if 'Timestamp' in col]}")
print(f"  - General Order: {[col for col in generalOrder.columns if 'Timestamp' in col]}")

âœ“ Column names fixed successfully
  - Customer Address: ['Created_Timestamp', 'Updated_Timestamp']
  - General Order: ['Created_Timestamp', 'Updated_Timestamp']


# 03. Handling Missing Values - Drop Empty Columns

In [13]:
# Identify and drop 100% empty columns to reduce noise

# Customer Address - drop completely empty columns
cols_to_drop_ca = ['countryfake', 'auto_filter']
customerAddress.drop(columns=[col for col in cols_to_drop_ca if col in customerAddress.columns], inplace=True)

# General Order - drop 100% empty columns
cols_to_drop_go = ['commercialConditionData', 'checkedInPickupPointId', 'giftRegistryData', 
                   'taxData', 'lastMessage', 'changesAttachment', 'subscriptionData']
generalOrder.drop(columns=[col for col in cols_to_drop_go if col in generalOrder.columns], inplace=True)

# Individual Customer - drop 100% NaN columns
cols_to_drop_ic = ['productPurchasedTag', 'productVisitedTag']
individualCustomer.drop(columns=[col for col in cols_to_drop_ic if col in individualCustomer.columns], inplace=True)

# Orders List - drop completely empty columns
cols_to_drop_ol = ['items', 'listId', 'listType']
ordersList.drop(columns=[col for col in cols_to_drop_ol if col in ordersList.columns], inplace=True)

print("âœ“ Empty columns dropped successfully\n")
print(f"Customer Address:      {customerAddress.shape[1]} columns (removed {len([c for c in cols_to_drop_ca if c in customerAddress.columns])})")
print(f"General Order:         {generalOrder.shape[1]} columns (removed {len([c for c in cols_to_drop_go if c in generalOrder.columns])})")
print(f"Individual Customer:   {individualCustomer.shape[1]} columns (removed {len([c for c in cols_to_drop_ic if c in individualCustomer.columns])})")
print(f"Orders List:           {ordersList.shape[1]} columns (removed {len([c for c in cols_to_drop_ol if c in ordersList.columns])})")

âœ“ Empty columns dropped successfully

Customer Address:      23 columns (removed 0)
General Order:         39 columns (removed 0)
Individual Customer:   51 columns (removed 0)
Orders List:           37 columns (removed 0)


# 04. Handling Duplicates

In [14]:
# Remove duplicate records to ensure data integrity

# Customer Address - 33 duplicates detected
before_ca = len(customerAddress)
customerAddress.drop_duplicates(inplace=True)
removed_ca = before_ca - len(customerAddress)

# Product Order Detail - 1 duplicate detected
before_pod = len(productOrderDetail)
productOrderDetail.drop_duplicates(inplace=True)
removed_pod = before_pod - len(productOrderDetail)

print("âœ“ Duplicates removed successfully\n")
print(f"Customer Address:      Removed {removed_ca} duplicate rows")
print(f"Product Order Detail:  Removed {removed_pod} duplicate row")
print(f"\nNew row counts:")
print(f"  - Customer Address: {len(customerAddress):,} rows")
print(f"  - Product Order Detail: {len(productOrderDetail):,} rows")

âœ“ Duplicates removed successfully

Customer Address:      Removed 33 duplicate rows
Product Order Detail:  Removed 1 duplicate row

New row counts:
  - Customer Address: 221,437 rows
  - Product Order Detail: 87,609 rows


# 05. Final Summary & Export Clean Data

In [15]:
# Create Before/After comparison report

# Update datasets dictionary with cleaned data
datasets_clean = {
    'Customer Address': customerAddress,
    'General Order': generalOrder,
    'Individual Customer': individualCustomer,
    'Orders List': ordersList,
    'Product Catalog': productCatalog,
    'Product Order Detail': productOrderDetail
}

# Build comparison table
comparison_data = []
for name in baseline_stats.keys():
    df_clean = datasets_clean[name]
    before = baseline_stats[name]
    
    comparison_data.append({
        'Dataset': name,
        'Rows Before': before['rows_before'],
        'Rows After': len(df_clean),
        'Rows Removed': before['rows_before'] - len(df_clean),
        'Cols Before': before['columns_before'],
        'Cols After': len(df_clean.columns),
        'Cols Removed': before['columns_before'] - len(df_clean.columns),
        'Missing % Before': f"{before['missing_pct_before']:.2f}%",
        'Missing % After': f"{(df_clean.isnull().sum().sum() / (len(df_clean) * len(df_clean.columns)) * 100):.2f}%"
    })

comparison_df = pd.DataFrame(comparison_data)
print("ðŸ“Š DATA CLEANING SUMMARY - BEFORE vs AFTER\n")
print(comparison_df.to_string(index=False))

ðŸ“Š DATA CLEANING SUMMARY - BEFORE vs AFTER

             Dataset  Rows Before  Rows After  Rows Removed  Cols Before  Cols After  Cols Removed Missing % Before Missing % After
    Customer Address       221470      221437            33           25          23             2           15.98%           8.67%
       General Order        67934       67934             0           46          39             7           41.42%          30.90%
 Individual Customer       178494      178494             0           53          51             2           52.74%          54.81%
         Orders List        67831       67831             0           40          37             3           21.07%          14.67%
     Product Catalog         7158        7158             0            6           6             0            3.62%           3.62%
Product Order Detail        87610       87609             1          108         108             0           57.46%          57.46%


In [16]:
# export csv 
for name, df in datasets_clean.items():
    df.to_csv(f"../data/processed/clean_{name}.csv")