In [25]:
import pandas as pd
import numpy as np
from pathlib import Path
import re

# Load Dataset

In [26]:
# Read the raw CSV export into `df`.
# An explicit single-byte encoding is passed (presumably because the file is
# not valid UTF-8 -- TODO confirm against the data source).
# "latin-1" is used rather than "unicode_escape": the latter decodes the same
# bytes but additionally interprets backslash escape sequences found in the
# data, so a literal backslash in a text field would be rewritten or raise a
# decode error.
path = Path("../input/dataset.csv")
df = pd.read_csv(path, encoding="latin-1")

# Clean Dataset

In [27]:
def clean_df(df: pd.DataFrame):
    """Return a cleaned copy of the raw sales table.

    The input frame is not modified. Processing order:

    1. Column names are converted from CamelCase to snake_case
       (e.g. ``InvoiceNo`` -> ``invoice_no``, ``CustomerID`` -> ``customer_id``).
    2. ``invoice_no`` and ``stock_code`` are cast to ``str`` and stripped;
       ``description`` and ``country`` are title-cased and stripped;
       ``invoice_date`` is parsed with ``pd.to_datetime``; ``unit_price`` is
       cast to ``float``.
    3. ``total_price`` is added as ``quantity * unit_price``.
    4. Only rows with ``quantity > 0`` are kept.
    5. The result is restricted to a fixed set and order of columns.
    """
    def _snake_case(name):
        # Put a space in front of every capital, lower-case everything, glue
        # the split "i d" suffix back into "id", then join words with "_".
        spaced = re.sub(r"([A-Z])", r" \1", name).lower()
        return spaced.replace(" i d", " id").strip().replace(" ", "_")

    output_columns = [
        "invoice_no",
        "invoice_date",
        "description",
        "stock_code",
        "unit_price",
        "quantity",
        "total_price",
        "customer_id",
        "country",
    ]

    renamed = df.rename(columns=_snake_case)

    # Cast once so the stored price and the derived total use the same values.
    unit_price = renamed["unit_price"].astype(float)

    enriched = renamed.assign(
        invoice_no=renamed["invoice_no"].astype(str).str.strip(),
        stock_code=renamed["stock_code"].astype(str).str.strip(),
        description=renamed["description"].str.title().str.strip(),
        invoice_date=pd.to_datetime(renamed["invoice_date"]),
        unit_price=unit_price,
        country=renamed["country"].str.title().str.strip(),
        total_price=(renamed["quantity"] * unit_price).astype(float),
    )

    # Row filter and column selection in a single indexing step.
    has_positive_quantity = enriched["quantity"] > 0
    return enriched.loc[has_positive_quantity, output_columns]
# Build the cleaned frame from the raw one; `df` itself is left as loaded.
df_clean = df.pipe(clean_df)

# Save Clean Dataset

In [28]:
# Persist the cleaned frame as parquet, but only when no file exists at the
# target yet -- an existing file is never overwritten by this cell.
# Note: this rebinds `path`, which previously pointed at the raw CSV.
path = Path("../input/dataset-clean.parquet")
if not path.is_file():
    df_clean.to_parquet(path)

# EDA

In [29]:
# Summary statistics, transposed so each column of df_clean becomes one row.
df_clean.describe().T

Unnamed: 0,count,mean,min,25%,50%,75%,max,std
invoice_date,531285.0,2011-07-04 18:15:45.816539136,2010-12-01 08:26:00,2011-03-28 11:59:00,2011-07-20 12:01:00,2011-10-19 12:35:00,2011-12-09 12:50:00,
unit_price,531285.0,3.857296,-11062.06,1.25,2.08,4.13,13541.33,41.810047
quantity,531285.0,10.655262,1.0,1.0,3.0,10.0,80995.0,156.830323
total_price,531285.0,20.0355,-11062.06,3.75,9.9,17.7,168469.6,270.91237
customer_id,397924.0,15294.315171,12346.0,13969.0,15159.0,16795.0,18287.0,1713.169877


In [30]:
# Number of distinct values in each identifier-like column.
df_clean[["invoice_no", "stock_code", "customer_id", "country"]].nunique().to_frame(name="num_unique_value")

Unnamed: 0,num_unique_value
invoice_no,20728
stock_code,3941
customer_id,4339
country,38


In [31]:
# Count of missing values per column.
df_clean.isnull().sum().to_frame(name="num_null_rows")

Unnamed: 0,num_null_rows
invoice_no,0
invoice_date,0
description,592
stock_code,0
unit_price,0
quantity,0
total_price,0
customer_id,133361
country,0


Observations:
- The dataset comprises sales from **1 Dec 2010 to 9 Dec 2011**.
- There are more than 500k entries, consisting of 
    - about 21k invoices (20,728 unique invoice numbers)
    - 4k types of item sold
    - 4k customers
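    - a largest single-line quantity of about 81k items (roughly 5.7M units sold in total, from count × mean quantity)
    - a largest single-line total price of about 168k (roughly 10.6M revenue in total, from count × mean total price)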
- Customer ID and description have NULL rows (133,361 and 592 respectively).