In [1]:
# import pandas as pd
import polars as pl
# from polars import col

# Read Data
Let's start with the _E-commerce Business Transaction_ dataset from Kaggle Open Data.

In [2]:
# Load the CSV with schema inference turned off, so every column arrives as a
# string; dates and numbers are converted explicitly in the transformation step.
raw_ec_data = pl.read_csv(
    './kaggle_data/Sales_Transaction_v4a.csv',
    infer_schema=False,
)

# Print a compact per-column preview (name, dtype, leading values).
raw_ec_data.glimpse()

Rows: 536350
Columns: 8
$ TransactionNo <str> '581482', '581475', '581475', '581475', '581475', '581475', '581475', '581475', '581475', '581475'
$ Date          <str> '12/9/2019', '12/9/2019', '12/9/2019', '12/9/2019', '12/9/2019', '12/9/2019', '12/9/2019', '12/9/2019', '12/9/2019', '12/9/2019'
$ ProductNo     <str> '22485', '22596', '23235', '23272', '23239', '21705', '22118', '22119', '22217', '22216'
$ ProductName   <str> 'Set Of 2 Wooden Market Crates', 'Christmas Star Wish List Chalkboard', 'Storage Tin Vintage Leaf', 'Tree T-Light Holder Willie Winkie', 'Set Of 4 Knick Knack Tins Poppies', 'Bag 500g Swirly Marbles', 'Joy Wooden Block Letters', 'Peace Wooden Block Letters', 'T-Light Holder Hanging Lace', 'T-Light Holder White Lace'
$ Price         <str> '21.47', '10.65', '11.53', '10.65', '11.94', '10.65', '11.53', '12.25', '10.65', '10.55'
$ Quantity      <str> '12', '36', '12', '12', '6', '24', '18', '12', '12', '24'
$ CustomerNo    <str> '17490', '13069', '13069', '13069', '130

In [3]:
# Peek at 10 randomly chosen rows. The fixed seed makes the same rows come back
# on every Restart & Run All, so the displayed output stays reproducible.
raw_ec_data.sample(n=10, seed=42)

TransactionNo,Date,ProductNo,ProductName,Price,Quantity,CustomerNo,Country
str,str,str,str,str,str,str,str
"""559027""","""7/5/2019""","""22938""","""Cupcake Lace Paper Set 6""","""11.94""","""48""","""17046""","""United Kingdom"""
"""540093""","""1/4/2019""","""22072""","""Red Retrospot Tea Cup And Sauc…","""14.09""","""1""","""16725""","""United Kingdom"""
"""543629""","""2/10/2019""","""82599""","""Fanny's Rest Stopmetal Sign""","""14.48""","""1""","""15629""","""United Kingdom"""
"""579792""","""11/30/2019""","""22899""","""Children's Apron Dolly Girl""","""7.24""","""6""","""12714""","""France"""
"""560281""","""7/17/2019""","""23293""","""Set Of 12 Fairy Cake Baking Ca…","""11.10""","""1""","""17984""","""United Kingdom"""
"""575726""","""11/10/2019""","""21591""","""Cosy Hour Cigar Box Matches""","""11.53""","""6""","""16791""","""United Kingdom"""
"""C566741""","""9/14/2019""","""23404""","""Home Sweet Home Blackboard""","""15.32""","""-1""","""15618""","""United Kingdom"""
"""536845""","""12/2/2018""","""21528""","""Dairy Maid Traditional Teapot""","""17.37""","""1""","""17961""","""United Kingdom"""
"""575947""","""11/13/2019""","""22679""","""French Blue Metal Door Sign 4""","""12.77""","""1""","""13947""","""United Kingdom"""
"""578855""","""11/27/2019""","""22139""","""Retrospot Tea Set Ceramic 11 P…","""6.19""","""2""","""14878""","""United Kingdom"""


# Data Transformation

In [13]:
# Build the typed working table from the all-string raw frame:
#   - id / name / country columns: surrounding whitespace stripped
#   - Date: parsed as month/day/year into a date column
#   - Price, Quantity: cast to Float64 and Int64 respectively
# A `payment` column (price * quantity for each row) is appended at the end.
ec_data = (
    raw_ec_data
    .select(
        txn_id=pl.col('TransactionNo').str.strip_chars(),
        txndate=pl.col('Date').str.to_date(format="%m/%d/%Y"),
        pid=pl.col('ProductNo').str.strip_chars(),
        pname=pl.col('ProductName').str.strip_chars(),
        price=pl.col('Price').cast(pl.Float64),
        quantity=pl.col('Quantity').cast(pl.Int64),
        cust_id=pl.col('CustomerNo').str.strip_chars(),
        country=pl.col('Country').str.strip_chars(),
    )
    .with_columns(
        (pl.col('price') * pl.col('quantity')).alias('payment')
    )
)

ec_data

txn_id,txndate,pid,pname,price,quantity,cust_id,country,payment
str,date,str,str,f64,i64,str,str,f64
"""581482""",2019-12-09,"""22485""","""Set Of 2 Wooden Market Crates""",21.47,12,"""17490""","""United Kingdom""",257.64
"""581475""",2019-12-09,"""22596""","""Christmas Star Wish List Chalk…",10.65,36,"""13069""","""United Kingdom""",383.4
"""581475""",2019-12-09,"""23235""","""Storage Tin Vintage Leaf""",11.53,12,"""13069""","""United Kingdom""",138.36
"""581475""",2019-12-09,"""23272""","""Tree T-Light Holder Willie Win…",10.65,12,"""13069""","""United Kingdom""",127.8
"""581475""",2019-12-09,"""23239""","""Set Of 4 Knick Knack Tins Popp…",11.94,6,"""13069""","""United Kingdom""",71.64
…,…,…,…,…,…,…,…,…
"""C536548""",2018-12-01,"""22168""","""Organiser Wood Antique White""",18.96,-2,"""12472""","""Germany""",-37.92
"""C536548""",2018-12-01,"""21218""","""Red Spotty Biscuit Tin""",14.09,-3,"""12472""","""Germany""",-42.27
"""C536548""",2018-12-01,"""20957""","""Porcelain Hanging Bell Small""",11.74,-1,"""12472""","""Germany""",-11.74
"""C536548""",2018-12-01,"""22580""","""Advent Calendar Gingham Sack""",16.35,-4,"""12472""","""Germany""",-65.4


# Data Aggregation and Cleansing

In [5]:
# Count rows per customer id BEFORE filtering, so that any "NA" customer id
# still shows up in the summary. The result is kept in a variable and shown as
# the cell's last expression; a bare expression in the middle of a cell is
# evaluated but never displayed.
cust_row_counts = ec_data.group_by('cust_id').len().sort(by='cust_id')

# Drop the rows whose customer id is the literal string "NA".
ec_data = ec_data.filter(pl.col('cust_id') != "NA")

cust_row_counts

In [6]:
# Collapse line items into one row per (txn_id, cust_id, txndate) combination
# and total their payments, ordered by customer and then transaction date.
# No filter is applied here, so transactions whose total is zero or negative
# are still present in agg_data.
# NOTE(review): the earlier note on this cell said orders with a negative total
# should be removed - confirm whether that filter is still intended.
agg_data = (
    ec_data
    .group_by(['txn_id', 'cust_id', 'txndate'])
    .agg(pl.col('payment').sum().alias('sum_payment'))
    .sort(['cust_id', 'txndate'])
)




In [7]:
# Show the aggregated transactions whose total payment is zero or negative.
non_positive_total = pl.col('sum_payment') <= 0
agg_data.filter(non_positive_total)

txn_id,cust_id,txndate,sum_payment
str,str,date,f64
"""C541433""","""12346""",2019-01-18,-840113.8
"""C547388""","""12352""",2019-03-22,-769.08
"""C549955""","""12359""",2019-04-13,-77.68
"""C580165""","""12359""",2019-12-02,-37.14
"""C544902""","""12362""",2019-02-24,-25.52
…,…,…,…
"""C577832""","""18274""",2019-11-22,-1082.32
"""C577386""","""18276""",2019-11-18,-16.66
"""C577390""","""18276""",2019-11-18,-16.66
"""C542086""","""18277""",2019-01-25,-23.32


In [8]:
# Inspect the aggregated transactions of one customer, id "18282".
is_customer_18282 = pl.col('cust_id') == "18282"
agg_data.filter(is_customer_18282)

txn_id,cust_id,txndate,sum_payment
str,str,date,f64
"""562525""","""18282""",2019-08-05,871.54
"""C562808""","""18282""",2019-08-09,-52.75
"""580173""","""18282""",2019-12-02,173.32


In [9]:
# Un-aggregated line-item rows for customer "18282".
aa = ec_data.filter(
    pl.col('cust_id') == "18282"
)

In [10]:
# Display the line-item rows selected for customer "18282".
aa

txn_id,txndate,pid,pname,price,quantity,cust_id,country,payment
str,date,str,str,f64,i64,str,str,f64
"""580173""",2019-12-02,"""22423""","""Regency Cakestand 3 Tier""",6.19,2,"""18282""","""United Kingdom""",12.38
"""580173""",2019-12-02,"""22699""","""Roses Regency Teacup And Sauce…",6.19,6,"""18282""","""United Kingdom""",37.14
"""580173""",2019-12-02,"""22818""","""Card Christmas Village""",6.19,12,"""18282""","""United Kingdom""",74.28
"""580173""",2019-12-02,"""23174""","""Regency Sugar Bowl Green""",6.19,4,"""18282""","""United Kingdom""",24.76
"""580173""",2019-12-02,"""23175""","""Regency Milk Jug Pink""",6.19,4,"""18282""","""United Kingdom""",24.76
…,…,…,…,…,…,…,…,…
"""562525""",2019-08-05,"""23187""","""French Style Storage Jar Bonbo…",10.55,48,"""18282""","""United Kingdom""",506.4
"""562525""",2019-08-05,"""23295""","""Set Of 12 Mini Loaf Baking Cas…",11.1,8,"""18282""","""United Kingdom""",88.8
"""562525""",2019-08-05,"""22089""","""Paper Bunting Vintage Paisley""",13.27,6,"""18282""","""United Kingdom""",79.62
"""562525""",2019-08-05,"""21108""","""Fairy Cake Flannel Assorted Co…",12.86,9,"""18282""","""United Kingdom""",115.74
