# How you can use existing data parsing classes

In [None]:
# install necessary packages using  requirements.txt:
!pip install -r requirements.txt

# Generic imports

In [1]:
import json

# Import JSON parser classes for all retailers

In [5]:
# You can clone the repo and import the necessary classes wherever you want, e.g. in a Colab notebook.
from msd_data.parsers import Parsers

# Example 1: Woolworths JSON file

In [31]:
# Load the JSON file (you can replace the path with your own file).
# The encoding is passed explicitly so the read does not depend on the platform's default locale encoding.
with open("../../_examples/data_files/woolworths_receipts.json", encoding="utf-8") as f:
    data = json.load(f)

# Preview of the raw file structure: the first 1000 characters of the pretty-printed JSON.
print(json.dumps(data, sort_keys=True, indent=4)[:1000])

{
    "brand": "woolworths",
    "captureTime": "2025-01-05T00:25:08.213Z",
    "connector_ver": "2.0.0",
    "download": [
        {
            "actionURL": "com.woolworths.rewards://activityDetails?id==",
            "captureTime": "2025-01-05T00:24:39.013Z",
            "clientId": "8h41mMOiDULmlLT28xKRvH",
            "description": "$3.00 at Big W Warringah Mall",
            "displayDate": "Mon 30 Dec",
            "displayValue": "+ 8 pts",
            "displayValueHandling": "NORMAL",
            "ereceipt": {
                "activityDetails": {
                    "__typename": "ActivityDetailsPage",
                    "defaultTabSelection": 1,
                    "tabs": [
                        {
                            "__typename": "ActivityDetailsTab",
                            "label": "eReceipt",
                            "navigationTitle": "Tax Invoice",
                            "navigationTitleAltText": "Tax Invoice",
                            "page":

In [13]:
# Index 0 under the 'woolworths' key is the parser class for in-store receipts.
woolworths_parser_cls = Parsers['woolworths'][0]
woolworths_parser = woolworths_parser_cls()

# parse() only pulls the data fields that are present in the JSON.
# The enrich() method adds extra attributes that are not in the original file
# (e.g. departments, store locations), but it requires access to an external
# data source, which is available by authentication only.
woolworths_parsed_content = woolworths_parser.parse(data)

In [35]:
# The parsed content object provides two main attributes:
#   - transactions: each receipt is one transaction
#   - items
# Each of these attributes is a pandas DataFrame.
woolworths_parsed_content.transactions.head(n=5)

Unnamed: 0_level_0,Store Number,Segment,segment_id,_brand_cd,Brand,Store,Date,Card Number,Receipt Total,Total Points,Extra Bonus Points,Rewards Points,transactionId,metabrand
tn,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
6,1129,Groceries,1.0,woolworths,Woolworths,1129 Macquarie Ryde,2024-11-21 11:25:00+11:00,8h41mMOiDULcccccccSv5ITpp3XBRvH,103.47,254,0.0,0.0,2420675183,woolworths


In [36]:
# First 5 rows of the items DataFrame.
woolworths_parsed_content.items.head(n=5)

Unnamed: 0_level_0,Unnamed: 1_level_0,Product,Price Per Unit,Quantity,Unit,Price Total,Sku_o,Sku
tn,in,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
6,0,Apricot,6.9,0.597,kg,4.12,,
6,2,Kiwifruit Green Imported,6.9,0.727,kg,5.02,,
6,4,Cucumber Lebanese,3.9,0.387,kg,1.51,,
6,6,Tomato Truss,3.5,0.382,kg,1.34,,
6,8,Onion Shallot French,13.0,0.187,kg,2.43,,


In [38]:
# Transactions and items can be joined easily on the `tn` index value
# (pandas finds the match automatically in most cases).
# how="left" is the pandas default, written out here for clarity.
woolworths_parsed_content.transactions.join(
    woolworths_parsed_content.items,
    how="left",
).head(n=5)

Unnamed: 0_level_0,Unnamed: 1_level_0,Store Number,Segment,segment_id,_brand_cd,Brand,Store,Date,Card Number,Receipt Total,Total Points,...,Rewards Points,transactionId,metabrand,Product,Price Per Unit,Quantity,Unit,Price Total,Sku_o,Sku
tn,in,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
6,0,1129,Groceries,1.0,woolworths,Woolworths,1129 Macquarie Ryde,2024-11-21 11:25:00+11:00,8h41mMOiDULcccccccSv5ITpp3XBRvH,103.47,254,...,0.0,2420675183,woolworths,Apricot,6.9,0.597,kg,4.12,,
6,2,1129,Groceries,1.0,woolworths,Woolworths,1129 Macquarie Ryde,2024-11-21 11:25:00+11:00,8h41mMOiDULcccccccSv5ITpp3XBRvH,103.47,254,...,0.0,2420675183,woolworths,Kiwifruit Green Imported,6.9,0.727,kg,5.02,,
6,4,1129,Groceries,1.0,woolworths,Woolworths,1129 Macquarie Ryde,2024-11-21 11:25:00+11:00,8h41mMOiDULcccccccSv5ITpp3XBRvH,103.47,254,...,0.0,2420675183,woolworths,Cucumber Lebanese,3.9,0.387,kg,1.51,,
6,6,1129,Groceries,1.0,woolworths,Woolworths,1129 Macquarie Ryde,2024-11-21 11:25:00+11:00,8h41mMOiDULcccccccSv5ITpp3XBRvH,103.47,254,...,0.0,2420675183,woolworths,Tomato Truss,3.5,0.382,kg,1.34,,
6,8,1129,Groceries,1.0,woolworths,Woolworths,1129 Macquarie Ryde,2024-11-21 11:25:00+11:00,8h41mMOiDULcccccccSv5ITpp3XBRvH,103.47,254,...,0.0,2420675183,woolworths,Onion Shallot French,13.0,0.187,kg,2.43,,


In [None]:
# Example 2: Coles JSON file

In [39]:
# Load the Coles data file.
# The encoding is passed explicitly so the read does not depend on the platform's default locale encoding.
with open("../../_examples/data_files/coles_receipts.json", encoding="utf-8") as f:
    coles_data = json.load(f)

In [20]:
# Index 0 under the 'coles' key is the parser class for in-store receipts.
coles_parser_cls = Parsers['coles'][0]
coles_parser = coles_parser_cls()

# Note: simply applying the Coles parser to Woolworths data will not work,
# because the structure of the JSON file is different.
# The result is a ParsedContent object with two main attributes: transactions and items.
coles_content = coles_parser.parse(coles_data)

In [40]:
# First 5 rows of the transactions DataFrame.
coles_content.transactions.head(n=5)

Unnamed: 0_level_0,requestId,Segment,segment_id,Store,Store Number,_brand_cd,Brand,Date,Card Number,Receipt Total,Total Points,Extra Bonus Points,Rewards Points,metabrand
tn,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
0,,Groceries,1,Coles Crows Nest,7577,coles,Coles,2025-01-30 10:49:18+00:00,600894******7777,98.2,1104,0,0,coles
1,,Groceries,1,Coles Lachlans Square,7704,coles,Coles,2025-01-22 09:31:23+00:00,600894******1111,77.63,77,0,0,coles
2,X-p-hf8ddddddduxnKP,Groceries,1,Coles Top Ryde,5801,coles,Coles,2025-01-16 09:56:03+00:00,600894******1111,129.18,829,0,0,coles


In [41]:
# First 5 rows of the items DataFrame.
coles_content.items.head(n=5)

Unnamed: 0_level_0,Unnamed: 1_level_0,Product,Price Per Unit,Quantity,Unit,Price Total,Sku_o,Sku
tn,in,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
0,0,Sodastream Lipton Ice Tea Peach Soda Mix 440mL,3.5,1.0,pc,3.5,7162260,7162260
0,1,Coles Recycled Kitchen Tidy Bags Medium 30 pack,3.0,1.0,pc,3.0,6609173,6609173
0,2,Coles Greek Style Natural Yoghurt 1kg,4.2,1.0,pc,4.2,2273478,2273478
0,3,Coles Apple Juice 2L,3.0,1.0,pc,3.0,2531530,2531530
0,4,Jalna Pot Set Greek Style Sweet & Creamy Straw...,6.5,1.0,pc,4.81,2744701,2744701


In [None]:
# Join transactions and items into a single DataFrame.
# Note that the data from the transactions DataFrame is replicated (duplicated) for each
# item in the receipt, but in return you have more ways to filter and group the data.
# how="left" is the pandas default, written out here for clarity.
coles_content.transactions.join(
    coles_content.items,
    how="left",
).head(n=5)

Unnamed: 0_level_0,Unnamed: 1_level_0,requestId,Segment,segment_id,Store,Store Number,_brand_cd,Brand,Date,Card Number,Receipt Total,...,Extra Bonus Points,Rewards Points,metabrand,Product,Price Per Unit,Quantity,Unit,Price Total,Sku_o,Sku
tn,in,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
0,0,,Groceries,1,Coles Crows Nest,7577,coles,Coles,2025-01-30 10:49:18+00:00,600894******7777,98.2,...,0,0,coles,Sodastream Lipton Ice Tea Peach Soda Mix 440mL,3.5,1.0,pc,3.5,7162260,7162260
0,1,,Groceries,1,Coles Crows Nest,7577,coles,Coles,2025-01-30 10:49:18+00:00,600894******7777,98.2,...,0,0,coles,Coles Recycled Kitchen Tidy Bags Medium 30 pack,3.0,1.0,pc,3.0,6609173,6609173
0,2,,Groceries,1,Coles Crows Nest,7577,coles,Coles,2025-01-30 10:49:18+00:00,600894******7777,98.2,...,0,0,coles,Coles Greek Style Natural Yoghurt 1kg,4.2,1.0,pc,4.2,2273478,2273478
0,3,,Groceries,1,Coles Crows Nest,7577,coles,Coles,2025-01-30 10:49:18+00:00,600894******7777,98.2,...,0,0,coles,Coles Apple Juice 2L,3.0,1.0,pc,3.0,2531530,2531530
0,4,,Groceries,1,Coles Crows Nest,7577,coles,Coles,2025-01-30 10:49:18+00:00,600894******7777,98.2,...,0,0,coles,Jalna Pot Set Greek Style Sweet & Creamy Straw...,6.5,1.0,pc,4.81,2744701,2744701
