May-2022 Data Cleaning With Analysis

In [4]:
import pandas as pd

# Columns that are converted to numbers during cleaning (names are the
# standardized, lower-case/underscore form produced by the cleaner).
numeric_cols = [
    "weight", "tp", "mrp_old", "final_mrp_old",
    "ajio_mrp", "amazon_mrp", "amazon_fba_mrp",
    "flipkart_mrp", "limeroad_mrp",
    "myntra_mrp", "paytm_mrp", "snapdeal_mrp"
]


def clean_price_table(raw, numeric_columns=None):
    """Return a cleaned copy of the raw May-2022 table.

    Steps, in order:
      1. Drop the ``index`` column.
      2. Standardize column names (strip, lower-case, spaces -> ``_``).
      3. Convert the numeric columns: thousands separators are removed and
         anything that still does not parse becomes NaN.
      4. Keep only the first row for every ``sku``.
      5. Replace missing values in the numeric columns with 0.
      6. Verify that no null value is left anywhere in the table.

    Args:
        raw: DataFrame as loaded from the CSV. It is not modified.
        numeric_columns: Standardized names of the columns to convert.
            Defaults to the module-level ``numeric_cols``.

    Returns:
        A new, cleaned DataFrame.

    Raises:
        KeyError: If ``index``, ``sku`` or a numeric column is missing.
        ValueError: If null values remain after cleaning (only possible in
            columns that are not part of ``numeric_columns``).
    """
    if numeric_columns is None:
        numeric_columns = numeric_cols
    numeric_columns = list(numeric_columns)

    # 1. Drop useless index column (drop() returns a new frame, so the
    #    caller's DataFrame is left untouched by the steps below).
    cleaned = raw.drop(columns=["index"])

    # 2. Standardize column names
    cleaned.columns = (
        cleaned.columns
        .str.strip()
        .str.lower()
        .str.replace(" ", "_", regex=False)
    )

    # 3. Convert numeric columns safely
    for col in numeric_columns:
        as_text = (
            cleaned[col]
            .astype(str)
            .str.replace(",", "", regex=False)
            .str.strip()
        )
        # errors="coerce": unparseable entries become NaN instead of raising.
        cleaned[col] = pd.to_numeric(as_text, errors="coerce")

    # 4. Remove duplicate SKUs if any (first occurrence wins)
    cleaned = cleaned.drop_duplicates(subset="sku")

    # 5. Handle missing numeric values
    # NOTE(review): after this step a missing or unparseable value is
    # indistinguishable from a genuine 0 -- confirm that is acceptable for
    # the downstream analysis.
    cleaned[numeric_columns] = cleaned[numeric_columns].fillna(0)

    # 6. Final validation. Raised explicitly rather than via `assert`,
    #    because assert statements are removed when Python runs with -O and
    #    the check would then be skipped silently.
    null_counts = cleaned.isnull().sum()
    remaining = null_counts[null_counts > 0]
    if not remaining.empty:
        raise ValueError(f"Null values still exist: {remaining.to_dict()}")

    return cleaned


# Run the load/clean/save pipeline only when executed as a script or notebook
# cell, not when the file is imported.
if __name__ == "__main__":
    # Load dataset
    df = pd.read_csv("C:/Users/Jitesh Jangam/Desktop/Future Int/Ecommerce/Dataset/May-2022.csv")

    df = clean_price_table(df)

    # 7. Save cleaned file
    df.to_csv("May-2022_CLEAN.csv", index=False)

    print("✅ Dataset cleaned and saved successfully")


✅ Dataset cleaned and saved successfully
