In [549]:
import json
import pandas as pd
import duckdb
from datetime import datetime

In [550]:
# Load the newline-delimited users export and flatten its MongoDB
# extended-JSON wrappers ({'$oid': ...}, {'$date': ...}) into plain values.
users_df = pd.read_json('users.json', lines = True)
# Missing values become '' so the lambdas below can detect them with `x != ''`.
users_df = users_df.fillna('')

# {'$oid': '<hex id>'} -> '<hex id>'
users_df['_id'] = users_df['_id'].apply(lambda x: x['$oid'] if x != '' else '')

# {'$date': <epoch milliseconds>} -> naive datetime. datetime.fromtimestamp()
# without a tz argument converts to the machine's local time, so the values
# depend on where the notebook runs.
# Both columns use '' for "missing" (lastLogin previously fell back to {},
# which left empty dicts mixed into a datetime column).
for date_col in ['createdDate', 'lastLogin']:
    users_df[date_col] = users_df[date_col].apply(
        lambda x: datetime.fromtimestamp(x['$date'] / 1000) if x != '' else ''
    )

In [551]:
# Load the newline-delimited receipts export, flatten its MongoDB
# extended-JSON wrappers, and reshape it to one row per purchased item.
receipts_df = pd.read_json('receipts.json', lines = True)
# Missing values become '' so the lambdas below can detect them with `x != ''`.
receipts_df = receipts_df.fillna('')

# Every receipt timestamp arrives as {'$date': <epoch milliseconds>}.
# datetime.fromtimestamp() without a tz argument yields naive local time.
receipt_date_cols = [
    'modifyDate',
    'createDate',
    'finishedDate',
    'pointsAwardedDate',
    'dateScanned',
    'purchaseDate',
]
for date_col in receipt_date_cols:
    receipts_df[date_col] = receipts_df[date_col].apply(
        lambda x: datetime.fromtimestamp(x['$date'] / 1000) if x != '' else ''
    )

# Receipts without an item list get None here; the later fillna('') turns any
# remaining missing values back into ''.
receipts_df['rewardsReceiptItemList'] = receipts_df['rewardsReceiptItemList'].apply(lambda x: x if x != '' else None)
# {'$oid': '<hex id>'} -> '<hex id>'
receipts_df['_id'] = receipts_df['_id'].apply(lambda x: x['$oid'] if x != '' else None)

# One row per item: explode the list, then reset the index so it is unique
# again and lines up positionally with the frame json_normalize() returns.
receipts_df = receipts_df.explode('rewardsReceiptItemList')
receipts_df = receipts_df.reset_index(drop = True)
# Item-level keys become columns; an item key that collides with a
# receipt-level column gets the '_dict' suffix.
receipts_df = receipts_df.join(pd.json_normalize(receipts_df['rewardsReceiptItemList'], errors = 'ignore'), rsuffix = '_dict')
receipts_df = receipts_df.fillna('')
# Drop the raw item dicts and the item-level duplicate of pointsEarned.
receipts_df = receipts_df.drop(columns = ['rewardsReceiptItemList','pointsEarned_dict'])

In [552]:
# Load the newline-delimited brands export and flatten its MongoDB
# extended-JSON wrappers into plain columns.
brands_df = pd.read_json('brands.json', lines = True)
# brands_df is not sentinel-filled: its missing values stay NaN/None.
# This cell also no longer touches users_df, which is cleaned in its own cell.
# NOTE(review): presumably those NaN/None values surface as NULL in DuckDB, so a
# missing brand code never compares equal to '' -- confirm.

# {'$oid': '<hex id>'} -> '<hex id>'; anything that is not a dict
# (e.g. a missing value) maps to '' instead of raising.
brands_df['_id'] = brands_df['_id'].apply(lambda x: x['$oid'] if isinstance(x, dict) else '')

# Expand the nested `cpg` reference into flat '$ref' / '$id.$oid' columns,
# drop the original nested column, and give the new ones SQL-friendly names.
brands_df = brands_df.join(pd.json_normalize(brands_df['cpg'], errors = 'ignore'), rsuffix = '_dict')
brands_df = brands_df.drop(columns = ['cpg'])
brands_df = brands_df.rename(columns = {'$ref': 'cpgRef', '$id.$oid': 'cpgId'})

In [553]:
# Check the brands schema after flattening: the nested `cpg` column should be
# gone, replaced by the renamed cpgRef / cpgId columns.
brands_df.columns

Index(['_id', 'barcode', 'category', 'categoryCode', 'name', 'topBrand',
       'brandCode', 'cpgRef', 'cpgId'],
      dtype='object')

In [554]:
# List the users columns available to the SQL queries further down.
users_df.columns

Index(['_id', 'active', 'createdDate', 'lastLogin', 'role', 'signUpSource',
       'state'],
      dtype='object')

In [555]:
# List the receipts columns after the item list was exploded and flattened:
# receipt-level fields followed by the per-item fields from rewardsReceiptItemList.
receipts_df.columns

Index(['_id', 'bonusPointsEarned', 'bonusPointsEarnedReason', 'createDate',
       'dateScanned', 'finishedDate', 'modifyDate', 'pointsAwardedDate',
       'pointsEarned', 'purchaseDate', 'purchasedItemCount',
       'rewardsReceiptStatus', 'totalSpent', 'userId', 'barcode',
       'description', 'finalPrice', 'itemPrice', 'needsFetchReview',
       'partnerItemId', 'preventTargetGapPoints', 'quantityPurchased',
       'userFlaggedBarcode', 'userFlaggedNewItem', 'userFlaggedPrice',
       'userFlaggedQuantity', 'needsFetchReviewReason',
       'pointsNotAwardedReason', 'pointsPayerId', 'rewardsGroup',
       'rewardsProductPartnerId', 'userFlaggedDescription',
       'originalMetaBriteBarcode', 'originalMetaBriteDescription', 'brandCode',
       'competitorRewardsGroup', 'discountedItemPrice',
       'originalReceiptItemText', 'itemNumber',
       'originalMetaBriteQuantityPurchased', 'targetPrice',
       'competitiveProduct', 'originalFinalPrice',
       'originalMetaBriteItemPrice',

In [556]:
# receipts_df.rewardsProductPartnerId <-> brands_df.cpgId
# Spot check of that candidate relationship: fetch the brands rows that carry
# one specific cpgId.
# NOTE(review): the literal id is presumably a value seen in
# receipts_df.rewardsProductPartnerId -- confirm where it came from.

duckdb.sql("select * from brands_df where cpgId = '5e7cf838f221c312e698a628'").to_df()

Unnamed: 0,_id,barcode,category,categoryCode,name,topBrand,brandCode,cpgRef,cpgId
0,5fd28f0cbe37ce6d53dfc692,511111518044,Dairy & Refrigerated,DAIRY_AND_REFRIGERATED,Sargento® Cheese,0.0,SARGENTO,Cogs,5e7cf838f221c312e698a628


In [562]:
# Data-quality check: when a receipt line item and a brands row share a barcode,
# do they also agree on brand code?
#   - Left-joins receipts_df to brands_df on barcode, casting both sides to
#     varchar before comparing.
#   - is_brand_code_match is true only when the two brand codes compare equal;
#     every other outcome of the comparison falls to the ELSE branch (false).
#   - The outer WHERE keeps only rows that found a brands match, so the left
#     join effectively behaves like an inner join in the final result.
# NOTE(review): a barcode present on several brands rows repeats the receipt line
# once per brand (the saved output appears to show 511111704140 matched to two
# brand names) -- confirm whether that fan-out is intended.
# NOTE(review): the saved output still lists rows with empty brands_* columns,
# which this WHERE clause should exclude -- the output may be stale; re-run to
# confirm.
duckdb.sql("""
with barcode_exist as (
    select
    receipts_df.userId as user_id
    ,receipts_df.dateScanned as date_scanned
    ,receipts_df.barcode as receipts_barcode
    ,receipts_df.brandCode as receipts_brand_code
    ,receipts_df.quantityPurchased as receipts_quantity_purchased
    ,brands_df.name as brands_name
    ,brands_df.barcode as brands_barcode
    ,brands_df.category as brands_category
    ,brands_df.brandCode as brands_brand_code
    ,case when receipts_brand_code = brands_brand_code then true else false end as is_brand_code_match
    from receipts_df
    left join brands_df on cast(brands_df.barcode as varchar) = cast(receipts_df.barcode as varchar)
)

select * from barcode_exist
where brands_barcode is not null
""").to_df()

Unnamed: 0,user_id,date_scanned,receipts_barcode,receipts_brand_code,receipts_quantity_purchased,brands_name,brands_barcode,brands_category,brands_brand_code,is_brand_code_match
0,5ff1e1eacfcf6c399c274ae6,2021-01-03 10:25:31,4011,,5.0,,,,,False
1,5ff1e194b6a9d73a3a9f1052,2021-01-03 10:24:43,4011,,1.0,,,,,False
2,5ff1e194b6a9d73a3a9f1052,2021-01-03 10:24:43,028400642255,,1.0,,,,,False
3,5ff1e1f1cfcf6c399c274b0b,2021-01-03 10:25:37,,,,,,,,False
4,5ff1e1eacfcf6c399c274ae6,2021-01-03 10:25:34,4011,,4.0,,,,,False
...,...,...,...,...,...,...,...,...,...,...
7383,600f29a64329897eac239049,2021-01-25 15:53:28,511111704140,PREGO,1.0,Diet Chris Cola,511111704140,,DIETCHRIS2,False
7384,600f29a64329897eac239049,2021-01-25 15:53:28,511111704140,PREGO,1.0,Diet Chris Cola,511111704140,,DIETCHRIS2,False
7385,600f29a64329897eac239049,2021-01-25 15:53:28,511111101451,QUAKER,1.0,Quaker,511111101451,Breakfast & Cereal,QUAKER,True
7386,600f29a64329897eac239049,2021-01-25 15:53:28,511111704140,PREGO,1.0,Prego,511111704140,Condiments & Sauces,PREGO,True
