In [1]:
# This notebook just creates data for use in ./MergeVsJoinDataPrep.ipynb.

In [2]:
import os
import uuid
import pandas as pd
import math
import random

In [3]:
 # day 0
def product_names(i):
    """Return the display name for product number ``i``.

    The name is a prefix chosen by ``i % 3`` (0 -> ``widget``, 1 -> ``sproket``,
    anything else -> ``doodad``) followed by ``i`` itself.
    """
    prefix_by_remainder = {0: 'widget', 1: 'sproket'}
    prefix = prefix_by_remainder.get(i % 3, 'doodad')
    return f'{prefix}{i}'

def build_products(n, j, date):
    """Generate ``n`` product records numbered ``j``, ``j + 1``, ...

    Each record is a dict with, in order, the keys ``product_id``
    (``p_<number>``), ``product_name`` (from ``product_names``), ``price``
    (a random value scaled to 0-100 and rounded to two decimals),
    ``product_modified`` and ``product_created`` (both set to ``date``).
    """
    records = []
    for offset in range(n):
        number = offset + j
        record = {
            'product_id': f'p_{number}',
            'product_name': product_names(number),
            'price': round(random.random() * 100, 2),
            'product_modified': date,
            'product_created': date,
        }
        records.append(record)
    return records
    
# Day-0 product catalogue: 125 records with ids p_0 through p_124.
products = build_products(125, 0, 'day0')

def build_invoices(n, j, date):
    """Generate ``n`` invoice records numbered ``j``, ``j + 1``, ...

    Each record is a dict with, in order, the keys ``invoice_id``
    (``inv_<number>``), ``customer`` (a fresh UUID4 string), ``status``
    (``'draft'`` when a random draw is below 0.1, otherwise ``'sent'``),
    ``invoice_modified`` and ``invoice_created`` (both set to ``date``).
    """
    generated = []
    for offset in range(n):
        customer_id = str(uuid.uuid4())
        status = 'sent'
        if random.random() < 0.1:
            status = 'draft'
        generated.append({
            'invoice_id': f'inv_{offset+j}',
            'customer': customer_id,
            'status': status,
            'invoice_modified': date,
            'invoice_created': date,
        })
    return generated

def build_invoice_items(invoices, products, date):
    """Generate between one and five line items for every invoice.

    Items are returned as one flat list, grouped in the order of ``invoices``.
    Each item is a dict with, in order, the keys ``invoice_item_id`` (a fresh
    UUID4 string), ``invoice`` (the parent's ``invoice_id``), ``count``
    (random integer 1-4), ``invoice_item_modified`` and
    ``invoice_item_created`` (both set to ``date``) and ``product`` (the
    ``product_id`` of a randomly chosen entry of ``products``).
    """
    all_items = []
    for parent in invoices:
        item_total = random.randint(1, 5)
        for _ in range(item_total):
            item_id = str(uuid.uuid4())
            parent_id = parent['invoice_id']
            # The quantity is drawn before the product so the sequence of
            # random-number calls stays the same for a given seed.
            quantity = random.randint(1, 4)
            chosen_product = random.choice(products)['product_id']
            all_items.append({
                'invoice_item_id': item_id,
                'invoice': parent_id,
                'count': quantity,
                'invoice_item_modified': date,
                'invoice_item_created': date,
                'product': chosen_product,
            })
    return all_items
    
# Day-0 invoices (ids inv_0 through inv_999) and their line items; every item
# references a product drawn from `products`.
invoices = build_invoices(1000, 0, 'day0')
invoice_items = build_invoice_items(invoices, products, 'day0')
    

In [4]:
# day 1 - every invoice / invoice item that was created or changed on day 1
# ends up in the `changed_*` lists; deletions are recorded separately in the
# `deleted_*` lists.

# Index the day-0 data by invoice id so the update loop can look records up.
existing_invoices = {k['invoice_id']: k for k in invoices}
existing_items = {}
for item in invoice_items:
    existing_items.setdefault(item['invoice'], []).append(item)


# update products: re-price 25 existing products and add 25 brand-new ones
# (ids p_200 through p_224).
product_sample = random.sample(products, 25)
updated_products = []
for p in product_sample:
    cp_p = dict(p)  # copy so the day-0 record is left untouched
    cp_p['price'] = round(random.random() * 100, 2)
    cp_p['product_modified'] = 'day1'
    updated_products.append(cp_p)
updated_products.extend(build_products(25, 200, 'day1'))

# Pick the day-0 invoices to touch: 100 draws collapse to at most 100 distinct
# numbers, i.e. at most 10% of the 1000 day-0 invoices.  randrange excludes its
# upper bound, so every number maps to an invoice id that actually exists
# (randint(0, 1000) could yield 1000, and there is no inv_1000).
random_invoice_ids = {random.randrange(len(invoices)) for _ in range(100)}

# make some new invoices (ids start at inv_1001).
# NOTE(review): their items only reference products from `updated_products`,
# never the untouched day-0 products -- confirm that is intended.
changed_invoices = build_invoices(500, 1001, 'day1')
changed_invoice_items = build_invoice_items(changed_invoices, updated_products, 'day1')

deleted_invoice_items = []
deleted_invoices = []

# update some invoices [drop invoice, drop item(s), modify invoice, modify item]
invoices_to_update = [i for i in random_invoice_ids if f'inv_{i}' in existing_invoices]
for update_i in invoices_to_update:
    rand_action = random.random()
    key = f'inv_{update_i}'
    if rand_action < .05:
        # drop an invoice together with all of its items
        deleted_invoices.append(existing_invoices[key])
        deleted_invoice_items.extend(existing_items[key])
    elif rand_action < 0.1:
        # drop an invoice item or two (never more than the invoice has)
        num_to_drop = 1 if random.random() < 0.6 else 2
        candidates = existing_items[key]
        to_drop = random.sample(candidates, min(num_to_drop, len(candidates)))
        deleted_invoice_items.extend(to_drop)
        # The parent invoice changed too: emit a day-1 copy of *this* invoice.
        inv = dict(existing_invoices[key])
        inv['invoice_modified'] = 'day1'
        changed_invoices.append(inv)
    elif rand_action < 0.9:
        # (re)send the invoice: drafts become 'sent', the rest just get re-stamped
        inv = dict(existing_invoices[key])
        inv['status'] = 'sent'
        inv['invoice_modified'] = 'day1'
        changed_invoices.append(inv)
    else:
        # find an item and update the count
        inv_item = dict(random.choice(existing_items[key]))
        inv_item['count'] = random.randint(1, 6)
        inv_item['invoice_item_modified'] = 'day1'
        changed_invoice_items.append(inv_item)


In [5]:
# Persist the day-0 snapshot: for each table, make sure the target directory
# exists and then write the records out as CSV.
day0_outputs = (
    ('./data/products/updates/day=0', "./data/products/updates/day=0/products.csv", products),
    ('./data/invoice/updates/day=0', "./data/invoice/updates/day=0/invoices.csv", invoices),
    ('./data/invoiceitems/updates/day=0', "./data/invoiceitems/updates/day=0/invoiceitems.csv", invoice_items),
)
for day0_dir, day0_csv, day0_records in day0_outputs:
    os.makedirs(day0_dir, exist_ok=True)
    pd.DataFrame(day0_records).to_csv(day0_csv)



In [6]:
# Persist the day-1 changes: updates and deletes go to separate directories.
# For each output, make sure the directory exists and then write the CSV.
day1_outputs = (
    ('./data/products/updates/day=1', "./data/products/updates/day=1/products.csv", updated_products),
    ('./data/invoice/updates/day=1', "./data/invoice/updates/day=1/invoices.csv", changed_invoices),
    ('./data/invoice/deletes/day=1', "./data/invoice/deletes/day=1/invoices.csv", deleted_invoices),
    ('./data/invoiceitems/updates/day=1', "./data/invoiceitems/updates/day=1/invoiceitems.csv", changed_invoice_items),
    ('./data/invoiceitems/deletes/day=1', "./data/invoiceitems/deletes/day=1/invoiceitems.csv", deleted_invoice_items),
)
for day1_dir, day1_csv, day1_records in day1_outputs:
    os.makedirs(day1_dir, exist_ok=True)
    pd.DataFrame(day1_records).to_csv(day1_csv)



In [7]:
# Spot-check the first ten day-1 product records.
updated_products[:10]

[{'product_id': 'p_113',
  'product_name': 'doodad113',
  'price': 3.09,
  'product_modified': 'day1',
  'product_created': 'day0'},
 {'product_id': 'p_50',
  'product_name': 'doodad50',
  'price': 8.3,
  'product_modified': 'day1',
  'product_created': 'day0'},
 {'product_id': 'p_90',
  'product_name': 'widget90',
  'price': 41.63,
  'product_modified': 'day1',
  'product_created': 'day0'},
 {'product_id': 'p_12',
  'product_name': 'widget12',
  'price': 17.94,
  'product_modified': 'day1',
  'product_created': 'day0'},
 {'product_id': 'p_46',
  'product_name': 'sproket46',
  'price': 9.45,
  'product_modified': 'day1',
  'product_created': 'day0'},
 {'product_id': 'p_73',
  'product_name': 'sproket73',
  'price': 17.95,
  'product_modified': 'day1',
  'product_created': 'day0'},
 {'product_id': 'p_124',
  'product_name': 'sproket124',
  'price': 17.44,
  'product_modified': 'day1',
  'product_created': 'day0'},
 {'product_id': 'p_93',
  'product_name': 'widget93',
  'price': 69.5,
  

In [8]:
# Ids of every record in changed_invoices, sorted as strings (so 'inv_103'
# sorts after 'inv_1029').
sorted([c['invoice_id'] for c in changed_invoices])

['inv_1',
 'inv_1001',
 'inv_1002',
 'inv_1003',
 'inv_1004',
 'inv_1005',
 'inv_1006',
 'inv_1007',
 'inv_1008',
 'inv_1009',
 'inv_1010',
 'inv_1011',
 'inv_1012',
 'inv_1013',
 'inv_1014',
 'inv_1015',
 'inv_1016',
 'inv_1017',
 'inv_1018',
 'inv_1019',
 'inv_1020',
 'inv_1021',
 'inv_1022',
 'inv_1023',
 'inv_1024',
 'inv_1025',
 'inv_1026',
 'inv_1027',
 'inv_1028',
 'inv_1029',
 'inv_103',
 'inv_1030',
 'inv_1031',
 'inv_1032',
 'inv_1033',
 'inv_1034',
 'inv_1035',
 'inv_1036',
 'inv_1037',
 'inv_1038',
 'inv_1039',
 'inv_1040',
 'inv_1041',
 'inv_1042',
 'inv_1043',
 'inv_1044',
 'inv_1045',
 'inv_1046',
 'inv_1047',
 'inv_1048',
 'inv_1049',
 'inv_1050',
 'inv_1051',
 'inv_1052',
 'inv_1053',
 'inv_1054',
 'inv_1055',
 'inv_1056',
 'inv_1057',
 'inv_1058',
 'inv_1059',
 'inv_106',
 'inv_1060',
 'inv_1061',
 'inv_1062',
 'inv_1063',
 'inv_1064',
 'inv_1065',
 'inv_1066',
 'inv_1067',
 'inv_1068',
 'inv_1069',
 'inv_1070',
 'inv_1071',
 'inv_1072',
 'inv_1073',
 'inv_1074',
 'in