<a href="https://colab.research.google.com/github/fikrifaizz/ecommerce-dashboard/blob/main/notebooks/data_cleaning.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import pandas as pd
import os

# Directory layout: raw CSVs are read from one folder, cleaned CSVs go to another.
# Both paths are relative to the notebook's working directory.
raw_path = "../data/raw/"
processed_path = "../data/processed/"

# Make sure the output folder is present; an already-existing folder is fine.
os.makedirs(processed_path, exist_ok=True)

print("Setup Selesai. Siap melakukan cleaning.")

Setup Selesai. Siap melakukan cleaning.


In [4]:
import csv # Pastikan import ini ada

def _strip_newlines(df, columns):
    """Replace CR/LF characters with spaces in text columns, keeping missing values missing."""
    for col in columns:
        present = df[col].notna()
        flattened = df[col].astype(str).str.replace(r'[\n\r]', ' ', regex=True)
        # Only keep the flattened text where a value existed; everything else
        # stays NaN so it is written as an empty field instead of the text 'nan'.
        df[col] = flattened.where(present)
    return df


def _coerce_numeric(df, columns):
    """Force the given columns to numeric dtype; unparseable values become NaN."""
    for col in columns:
        df[col] = pd.to_numeric(df[col], errors='coerce')
    return df


def _parse_datetime_columns(df):
    """Convert every column whose name contains 'date' or 'timestamp' to datetime."""
    date_cols = [col for col in df.columns if 'date' in col or 'timestamp' in col]
    for col in date_cols:
        # Values that cannot be parsed become NaT rather than raising.
        df[col] = pd.to_datetime(df[col], errors='coerce')
    return df


def clean_and_save(filename, raw_dir=None, processed_dir=None):
    """Clean one raw CSV file and write the result under the processed folder.

    File-specific steps are selected by substring match on ``filename``:

    * ``olist_order_reviews``: line breaks inside the two comment columns are
      replaced with spaces.
    * ``olist_geolocation``: fully duplicated rows are dropped.
    * ``olist_products``: missing ``product_category_name`` becomes
      ``'unknown'`` and the size/weight/count columns are coerced to numeric.

    For every file, columns whose name contains ``date`` or ``timestamp`` are
    parsed as datetimes (unparseable values become missing).

    Args:
        filename: Name of the CSV file; the same name is used for the output.
        raw_dir: Folder to read from. Defaults to the module-level ``raw_path``.
        processed_dir: Folder to write to (created if absent). Defaults to the
            module-level ``processed_path``.

    Returns:
        The path of the written file, or ``None`` when the input could not be
        read (the error is printed, not raised).
    """
    # Globals are only looked up when no explicit folder is given.
    source_dir = raw_path if raw_dir is None else raw_dir
    target_dir = processed_path if processed_dir is None else processed_dir

    print(f"\n--- Memproses: {filename} ---")

    # 1. Load the data. A read failure is reported and skips this file only.
    try:
        df = pd.read_csv(os.path.join(source_dir, filename))
    except Exception as e:
        print(f"Error saat membaca file: {e}")
        return None

    # 2. File-specific cleaning.

    # A. Reviews: embedded newlines in free-text comments break row-oriented imports.
    if 'olist_order_reviews' in filename:
        print("   Fixing: Menghapus karakter newline di komentar...")
        df = _strip_newlines(df, ['review_comment_title', 'review_comment_message'])

    # B. Geolocation: drop exact duplicate rows.
    if 'olist_geolocation' in filename:
        initial_rows = len(df)
        df = df.drop_duplicates()
        dropped = initial_rows - len(df)
        print(f"   Optimization: Menghapus {dropped:,} baris duplikat.")

    # C. Products: fill missing categories and make sure numeric columns are numeric.
    if 'olist_products' in filename:
        print("   Fixing: Mengisi kategori null dengan 'unknown'")
        df['product_category_name'] = df['product_category_name'].fillna('unknown')
        # Column names (including the 'lenght' spelling) match the dataset headers.
        df = _coerce_numeric(df, [
            'product_name_lenght', 'product_description_lenght', 'product_photos_qty',
            'product_weight_g', 'product_length_cm', 'product_height_cm', 'product_width_cm',
        ])

    # 3. General standardisation for every file.
    df = _parse_datetime_columns(df)

    # 4. Save. QUOTE_MINIMAL leaves missing values as empty fields (,,) rather
    #    than quoted empty strings (""), which the original author needs so that
    #    PostgreSQL reads them as NULL.
    os.makedirs(target_dir, exist_ok=True)
    save_file = os.path.join(target_dir, filename)
    df.to_csv(save_file, index=False, quoting=csv.QUOTE_MINIMAL, quotechar='"')

    print(f"   Tersimpan di: {save_file}")
    return save_file

In [5]:
# Every raw dataset file that goes through the cleaning pipeline, in processing order.
files_to_clean = [
    "olist_customers_dataset.csv",
    "olist_geolocation_dataset.csv",
    "olist_order_items_dataset.csv",
    "olist_order_payments_dataset.csv",
    "olist_order_reviews_dataset.csv",
    "olist_orders_dataset.csv",
    "olist_products_dataset.csv",
    "olist_sellers_dataset.csv",
    "product_category_name_translation.csv",
]

print("MEMULAI PROSES CLEANING & STANDARISASI...\n")

# Clean the files one by one; each call prints its own progress lines.
for f in files_to_clean:
    clean_and_save(f)

print("\nSEMUA FILE SELESAI DIPROSES! Folder 'data/processed' siap di-import.")

MEMULAI PROSES CLEANING & STANDARISASI...


--- Memproses: olist_customers_dataset.csv ---
   Tersimpan di: ../data/processed/olist_customers_dataset.csv

--- Memproses: olist_geolocation_dataset.csv ---
   Optimization: Menghapus 261,831 baris duplikat.
   Tersimpan di: ../data/processed/olist_geolocation_dataset.csv

--- Memproses: olist_order_items_dataset.csv ---
   Tersimpan di: ../data/processed/olist_order_items_dataset.csv

--- Memproses: olist_order_payments_dataset.csv ---
   Tersimpan di: ../data/processed/olist_order_payments_dataset.csv

--- Memproses: olist_order_reviews_dataset.csv ---
   Fixing: Menghapus karakter newline di komentar...
   Tersimpan di: ../data/processed/olist_order_reviews_dataset.csv

--- Memproses: olist_orders_dataset.csv ---
   Tersimpan di: ../data/processed/olist_orders_dataset.csv

--- Memproses: olist_products_dataset.csv ---
   Fixing: Mengisi kategori null dengan 'unknown'
   Tersimpan di: ../data/processed/olist_products_dataset.csv

--- Mem