In [405]:
import json
import pandas as pd
import duckdb
from datetime import datetime
import io

In [406]:
users_df = pd.read_json('users.json', lines = True)
users_df = users_df.fillna('')

users_df['createdDate'] = users_df['createdDate'].apply(lambda x: datetime.fromtimestamp(x['$date'] / 1000) if x != '' else '')
users_df['_id'] = users_df['_id'].apply(lambda x: json.loads(json.dumps(x))['$oid'] if x != '' else '')
users_df['lastLogin'] = users_df['lastLogin'].apply(lambda x: datetime.fromtimestamp(x['$date'] / 1000) if x != '' else '')

In [407]:
receipts_df = pd.read_json('receipts.json', lines = True)
receipts_df = receipts_df.fillna('')

receipts_df['modifyDate'] = receipts_df['modifyDate'].apply(lambda x: datetime.fromtimestamp(x['$date'] / 1000) if x != '' else '')
receipts_df['createDate'] = receipts_df['createDate'].apply(lambda x: datetime.fromtimestamp(x['$date'] / 1000) if x != '' else '')
receipts_df['finishedDate'] = receipts_df['finishedDate'].apply(lambda x: datetime.fromtimestamp(x['$date'] / 1000) if x != '' else '')
receipts_df['pointsAwardedDate'] = receipts_df['pointsAwardedDate'].apply(lambda x: datetime.fromtimestamp(x['$date'] / 1000) if x != '' else '')
receipts_df['dateScanned'] = receipts_df['dateScanned'].apply(lambda x: datetime.fromtimestamp(x['$date'] / 1000) if x != '' else '')
receipts_df['purchaseDate'] = receipts_df['purchaseDate'].apply(lambda x: datetime.fromtimestamp(x['$date'] / 1000) if x != '' else '')
receipts_df['rewardsReceiptItemList'] = receipts_df['rewardsReceiptItemList'].apply(lambda x: json.loads(json.dumps(x)) if x != '' else None)
receipts_df['_id'] = receipts_df['_id'].apply(lambda x: x['$oid'] if x != '' else None)

receipts_df = receipts_df.explode('rewardsReceiptItemList')
receipts_df = receipts_df.reset_index(drop = True)
receipts_df = receipts_df.join(pd.json_normalize(receipts_df['rewardsReceiptItemList']), rsuffix = '_dict')
receipts_df = receipts_df.fillna('')
receipts_df = receipts_df.drop(columns = ['rewardsReceiptItemList','pointsEarned_dict'])

In [408]:
brands_df = pd.read_json('brands.json', lines = True)
users_df = users_df.fillna('')

brands_df['_id'] = brands_df['_id'].apply(lambda x: json.loads(json.dumps(x))['$oid'] if x != '' else '')

brands_df = brands_df.join(pd.json_normalize(brands_df['cpg'], errors = 'ignore'), rsuffix = '_dict')
brands_df = brands_df.drop(columns = ['cpg'])
brands_df = brands_df.rename(columns = {'$ref': 'cpgRef', '$id.$oid': 'cpgId'})

In [409]:
brands_schema = {x: brands_df[x].dtypes for x in list(brands_df.columns)}
users_schema = {x: users_df[x].dtypes for x in list(users_df.columns)}
receipts_schema = {x: receipts_df[x].dtypes for x in list(receipts_df.columns)}

In [410]:
duckdb.sql("""
with total_receipts as (
    select
    receipts_df._id as receipt_id
    ,case when receipts_df.brandCode in ('','BRAND') then 'NO_BRAND_PROVIDED' else receipts_df.brandCode end as brand_code
    ,cast(receipts_df.dateScanned as varchar) as date_scanned
    ,cast(case when receipts_df.itemPrice = '' then 0.0 else cast(receipts_df.itemPrice as decimal) end as decimal) as item_price
    ,cast(case when receipts_df.purchasedItemCount = '' then 0 else cast(receipts_df.purchasedItemCount as integer) end as integer) as purchased_item_count
    ,cast(case when receipts_df.totalSpent = '' then 0.0 else cast(receipts_df.totalSpent as decimal) end as decimal) as total_spent
    ,receipts_df.rewardsReceiptStatus as rewards_receipt_status
    ,users_duped._id as user_id
    ,cast(users_duped.createdDate as varchar) as user_created_date
    from receipts_df
    left join 
    (
        select
        _id
        ,active
        ,createdDate
        ,lastLogin
        ,role
        ,signUpSource
        ,state
        from
        (
            select 
            *
            ,count(*) as duplicate_count
            from users_df
            group by 1,2,3,4,5,6,7
        )
    ) users_duped on receipts_df.userId = users_duped._id
)

select * from total_receipts
""").to_df().to_json('receipts_data_model.json', orient = 'records')

In [411]:
con = duckdb.connect()

In [412]:
con.execute("""
    drop table if exists receipts_detail;
    
    create table receipts_detail (
        receipt_id varchar(25)
        ,brand_code varchar(25)
        ,date_scanned timestamp
        ,item_price decimal(10,2)
        ,purchased_item_count integer
        ,total_spent decimal(10,2)
        ,rewards_receipt_status varchar(25)
        ,user_id varchar(25)
        ,user_created_date timestamp
        );
        
    truncate receipts_detail;
    
    copy receipts_detail from 'receipts_data_model.json' (format json, array True);
""")

<duckdb.duckdb.DuckDBPyConnection at 0x115cb33f0>

In [413]:
con.sql("""
    drop view if exists brand_scanned_fact;
    create view brand_scanned_fact as
        with receipt_count as (
            select 
            year(date_scanned) as date_scanned_year
            ,month(date_scanned) as date_scanned_month
            ,coalesce(brand_code, 'NO_BRAND_PROVIDED') as brand
            ,count(distinct receipt_id) as receipt_count_scanned
            from receipts_detail
            group by 1,2,3
        )
    
        ,most_recent_month as (
            select
            date_scanned_year
            ,date_scanned_month
            ,dense_rank() over (order by date_scanned_year desc, date_scanned_month desc) as month_rank
            from
            (
                select
                date_scanned_year
                ,date_scanned_month
                ,count(*)
                from receipt_count
                group by 1,2
            )
        )
    
        select
        receipt_count.date_scanned_year
        ,receipt_count.date_scanned_month
        ,month_rank
        ,brand
        ,receipt_count_scanned
        ,dense_rank() over (partition by receipt_count.date_scanned_year, receipt_count.date_scanned_month order by receipt_count desc) as receipt_scanned_rank
        from receipt_count
        inner join most_recent_month on receipt_count.date_scanned_year = most_recent_month.date_scanned_year 
            and receipt_count.date_scanned_month = most_recent_month.date_scanned_month
        order by 1 desc, 2 desc, 5 asc
""")

In [414]:
# What are the top 5 brands by receipts scanned for most recent month?
con.sql("""select * from brand_scanned_fact where month_rank = 1""")

┌───────────────────┬────────────────────┬────────────┬───────────────────┬───────────────────────┬──────────────────────┐
│ date_scanned_year │ date_scanned_month │ month_rank │       brand       │ receipt_count_scanned │ receipt_scanned_rank │
│       int64       │       int64        │   int64    │      varchar      │         int64         │        int64         │
├───────────────────┼────────────────────┼────────────┼───────────────────┼───────────────────────┼──────────────────────┤
│              2021 │                  3 │          1 │ NO_BRAND_PROVIDED │                    23 │                    1 │
└───────────────────┴────────────────────┴────────────┴───────────────────┴───────────────────────┴──────────────────────┘

In [415]:
# How does the ranking of the top 5 brands by receipts scanned for the recent month compare to the ranking for the previous month?
con.sql("""select * from brand_scanned_fact where month_rank <= 2""")

┌───────────────────┬────────────────────┬────────────┬───────────────────┬───────────────────────┬──────────────────────┐
│ date_scanned_year │ date_scanned_month │ month_rank │       brand       │ receipt_count_scanned │ receipt_scanned_rank │
│       int64       │       int64        │   int64    │      varchar      │         int64         │        int64         │
├───────────────────┼────────────────────┼────────────┼───────────────────┼───────────────────────┼──────────────────────┤
│              2021 │                  3 │          1 │ NO_BRAND_PROVIDED │                    23 │                    1 │
│              2021 │                  2 │          2 │ VIVA              │                     1 │                    1 │
│              2021 │                  2 │          2 │ MISSION           │                     2 │                    3 │
│              2021 │                  2 │          2 │ NO_BRAND_PROVIDED │                   446 │                    2 │
└───────────────

In [416]:
con.sql("""
    drop view if exists reward_status_fact;
    create view reward_status_fact as 
        select
        rewards_receipt_status
        ,avg(total_receipt) as average_item_price
        ,sum(total_items_receipt) as total_items
        from
        (
            select
            receipt_id
            ,rewards_receipt_status
            ,avg(purchased_item_count) as total_items_receipt
            ,sum(item_price) as total_receipt
            from receipts_detail
            group by 1,2
        ) a
        group by 1
""")

In [417]:
# When considering average spend from receipts with 'rewardsReceiptStatus’ of ‘Accepted’ or ‘Rejected’, which is greater?
# When considering total number of items purchased from receipts with 'rewardsReceiptStatus’ of ‘Accepted’ or ‘Rejected’, which is greater?
con.sql('select * from reward_status_fact')

┌────────────────────────┬────────────────────┬─────────────┐
│ rewards_receipt_status │ average_item_price │ total_items │
│        varchar         │       double       │   double    │
├────────────────────────┼────────────────────┼─────────────┤
│ FLAGGED                │  180.4517391304348 │      1014.0 │
│ REJECTED               │  23.80492957746479 │       173.0 │
│ FINISHED               │  80.90059845559846 │      8184.0 │
│ SUBMITTED              │                0.0 │         0.0 │
│ PENDING                │            27.4718 │         0.0 │
└────────────────────────┴────────────────────┴─────────────┘

In [418]:
con.sql("""
    drop view if exists brand_transaction_fact;
    create view brand_transaction_fact as
        with users_past_6_months as (
            select 
            user_id
            from receipts_detail
            group by user_id, user_created_date
            having datediff('month', user_created_date, max(date_scanned)) <= 6
        )
    
        select
        brand_code
        ,sum(item_price) as total_spend
        ,count(distinct receipt_id) as total_transactions
        from receipts_detail
        inner join users_past_6_months on users_past_6_months.user_id = receipts_detail.user_id
        group by 1
        order by 2 desc
""")

In [419]:
# Which brand has the most spend among users who were created within the past 6 months?
# Which brand has the most transactions among users who were created within the past 6 months?
con.sql("""
    select
    brand_code
    ,total_spend
    ,dense_rank() over (order by total_spend desc) as total_spend_rank
    ,total_transactions
    ,dense_rank() over (order by total_transactions desc) as total_transactions_rank
    from brand_transaction_fact
""")

┌─────────────────────────┬───────────────┬──────────────────┬────────────────────┬─────────────────────────┐
│       brand_code        │  total_spend  │ total_spend_rank │ total_transactions │ total_transactions_rank │
│         varchar         │ decimal(38,2) │      int64       │       int64        │          int64          │
├─────────────────────────┼───────────────┼──────────────────┼────────────────────┼─────────────────────────┤
│ NO_BRAND_PROVIDED       │      21294.58 │                1 │                842 │                       1 │
│ BEN AND JERRYS          │       1217.40 │                2 │                 17 │                       2 │
│ MISSION                 │         46.27 │               57 │                 17 │                       2 │
│ PEPSI                   │        250.39 │               14 │                 16 │                       3 │
│ KLEENEX                 │        356.07 │                8 │                 15 │                       4 │
│ FOLGERS 

In [420]:
###################### DATA QUALITY ######################

In [421]:
duckdb.sql("""
with barcode_exist as (
    select
    receipts_df.barcode as receipts_barcode
    ,case when receipts_df.brandCode = '' then 'NO_BRAND_PROVIDED' else receipts_df.brandCode end as receipts_brand_code
    ,brands_df.barcode as brands_barcode
    ,brands_df.brandCode as brands_brand_code
    ,case when receipts_brand_code = brands_brand_code then true else false end as is_brand_code_match
    from receipts_df
    left join brands_df on cast(brands_df.barcode as varchar) = cast(receipts_df.barcode as varchar)
)

select 
count(*) as total_items
,count(case when brands_barcode is not null then brands_barcode end) as items_with_matched_barcode 
,count(case when brands_barcode is not null then brands_barcode end) / (1.0 * count(*)) as barcodes_covered
,count(case when is_brand_code_match = true then brands_barcode end) / (1.0 * count(case when brands_barcode is not null then brands_barcode end)) as pct_brands_matched
from barcode_exist
""").to_df()
# in some case we may want to enrich the receipts with further brand information...
# issues - 
# - only ~1% of all barcodes are correctly collected from data source OR the barcode is of a different format / structure
# - of those barcodes that did match ~90% have the same brand name
# in order for brands to be a viable table for enrichment the collection / quality of barcodes values must be improved

Unnamed: 0,total_items,items_with_matched_barcode,barcodes_covered,pct_brands_matched
0,7388,89,0.012047,0.898876


In [422]:
duckdb.sql('select *, count(*) from users_df group by 1,2,3,4,5,6,7').to_df()
# users.json is not deduped
# if this was a table that collected timestamps per user log in - would be valid

Unnamed: 0,_id,active,createdDate,lastLogin,role,signUpSource,state,count_star()
0,5ff1e1b4cfcf6c399c274a54,True,2021-01-03 10:24:36.410,2021-01-03 10:24:36.452000,consumer,Email,WI,1
1,5ff7268c8f142f11dd189579,True,2021-01-07 10:19:40.196,2021-01-07 10:19:40.243000,consumer,Email,WI,1
2,5ff8c241b3348b11c93379fc,True,2021-01-08 15:36:17.626,2021-01-08 15:36:17.831000,consumer,Email,WI,1
3,5ff873ddb3348b11c9337733,True,2021-01-08 10:01:49.160,2021-01-08 10:01:49.202000,consumer,Email,WI,2
4,5ff8a11db3348b11c93378aa,True,2021-01-08 13:14:53.734,2021-01-08 13:14:53.928000,consumer,Email,WI,1
...,...,...,...,...,...,...,...,...
207,601442ce67804a1228b1dc41,True,2021-01-29 12:15:58.195,2021-01-29 12:18:54.023000,consumer,Email,WI,1
208,60182f4ac8b50e11d8454946,True,2021-02-01 11:41:46.573,2021-02-01 11:41:46.615000,consumer,Email,WI,1
209,60186237c8b50e11d8454d5f,True,2021-02-01 15:19:03.551,,consumer,Email,,5
210,60255883efa60114d20e5d4e,True,2021-02-11 11:17:07.502,2021-02-11 11:18:29.219000,consumer,Email,WI,1


In [423]:
duckdb.sql('select brandCode, count(*) from brands_df group by 1 order by 2 desc').to_df()
# brand is not captured or not captured correctly

Unnamed: 0,brandCode,count_star()
0,,234
1,,35
2,HUGGIES,2
3,GOODNITES,2
4,TEST BRANDCODE @1598813526777,1
...,...,...
893,TEST BRANDCODE @1598296704794,1
894,CAMPBELLS SPAGHETTIOS,1
895,511111005377,1
896,COORS EXTRA GOLD,1
