In [1]:
# This notebook just creates data for use in ./MergeVsJoinDataPrep.ipynb.

In [1]:
import os
import uuid
import pandas as pd
import math
import random

In [26]:
# day 0 - baseline products, invoices and invoice items
def product_names(i):
    """Return a product name for index ``i``.

    The name is a prefix chosen by ``i % 3`` (0 -> 'widget', 1 -> 'sproket',
    anything else -> 'doodad') followed by ``i`` itself.
    """
    prefix_by_remainder = {0: 'widget', 1: 'sproket'}
    prefix = prefix_by_remainder.get(i % 3, 'doodad')
    return f'{prefix}{i}'

def build_products(n, j, date):
    """Build ``n`` product records whose numbering starts at offset ``j``.

    Each record is a dict with a 'p_<number>' id, a name from
    ``product_names``, a random price (``random.random() * 100`` rounded to
    two decimals) and ``date`` as both its modified and created marker.
    """
    records = []
    for number in range(j, j + n):
        records.append({
            'product_id': f'p_{number}',
            'product_name': product_names(number),
            'price': round(random.random() * 100, 2),
            'product_modified': date,
            'product_created': date,
        })
    return records
    
# Day-0 product catalogue: 125 products with ids p_0 .. p_124.
products = build_products(n=125, j=0, date='day0')

def build_invoices(n, j, date):
    """Build ``n`` invoice records whose numbering starts at offset ``j``.

    Every invoice gets an 'inv_<number>' id, a fresh random UUID string as
    its customer, a status that is 'draft' when ``random.random() < 0.1``
    and 'sent' otherwise, and ``date`` as both its modified and created
    marker.
    """
    def make_invoice(number):
        # The UUID is drawn before the status so the order of random draws
        # per invoice stays the same.
        customer_id = str(uuid.uuid4())
        if random.random() < 0.1:
            status = 'draft'
        else:
            status = 'sent'
        return {
            'invoice_id': f'inv_{number}',
            'customer': customer_id,
            'status': status,
            'invoice_modified': date,
            'invoice_created': date,
        }

    return [make_invoice(i + j) for i in range(n)]

def build_invoice_items(invoices, products, date):
    """Create between one and five line items for every invoice.

    Each item is a dict with its own UUID string, the parent invoice id
    under 'invoice', a random count from 1 to 4, ``date`` as both its
    modified and created marker, and the id of a randomly chosen product.
    Items for all invoices are returned as one flat list, in invoice order.
    """
    all_items = []
    for parent in invoices:
        # One draw decides how many items this invoice gets (1..5).
        for _ in range(random.randint(1, 5)):
            all_items.append({
                'invoice_item_id': str(uuid.uuid4()),
                'invoice': parent['invoice_id'],
                'count': random.randint(1, 4),
                'invoice_item_modified': date,
                'invoice_item_created': date,
                'product': random.choice(products)['product_id'],
            })
    return all_items
    
# Day-0 baseline: 1000 invoices (inv_0 .. inv_999) plus their line items.
invoices = build_invoices(n=1000, j=0, date='day0')
invoice_items = build_invoice_items(invoices=invoices, products=products, date='day0')
    

In [27]:
# Day 1 - every invoice that changed ends up in the update lists built in
# this cell; deletions are recorded in separate lists.

# Index the day-0 data so day-1 changes can look records up by invoice id.
existing_invoices = {row['invoice_id']: row for row in invoices}

# Group the day-0 line items under the id of the invoice they belong to.
existing_items = {}
for item in invoice_items:
    existing_items.setdefault(item['invoice'], []).append(item)


# update products: re-price a random sample of 25 existing products and
# stamp them as modified on day1 (copies are made, the originals are untouched).
product_sample = random.sample(products, 25)
updated_products = [
    {**original,
     'price': round(random.random() * 100, 2),
     'product_modified': 'day1'}
    for original in product_sample
]
# Then add 25 brand-new products, numbered from 200, created on day1.
updated_products += build_products(25, 200, 'day1')

# Draw 500 random invoice numbers from 0..5000 (duplicates collapse in the
# set).  Day-0 invoices are inv_0 .. inv_999, so roughly 20% of the draws hit
# an existing invoice; those are handled by the update loop further down.
random_invoice_ids = {random.randint(0, 5000) for _ in range(500)}

# Invoice records for every drawn number above 1000, stamped as modified on
# day1.  The key is spelled 'invoice_modified' (it used to be the typo
# 'invoice_modifed', which produced a stray, mostly-empty CSV column next to
# the real one), and keys follow the same order as build_invoices so the
# day=0 and day=1 CSVs share a column layout.
changed_invoices = [
    {'invoice_id': f'inv_{i}',
     'customer': str(uuid.uuid4()),
     'status': 'draft' if random.random() < 0.1 else 'sent',
     'invoice_modified': 'day1',
     'invoice_created': 'day0'}
    for i in random_invoice_ids if i > 1000
]

# make some new invoices (inv_1001 .. inv_1500) and line items for them,
# drawing products only from the day-1 product list.
# NOTE(review): in the visible cells new_invoices itself is never added to
# changed_invoices, so only its items reach the day=1 CSVs - confirm that
# this is intended.
new_invoices = build_invoices(500, 1001, 'day1')
changed_invoice_items = build_invoice_items(new_invoices, updated_products, 'day1')

deleted_invoice_items = []
deleted_invoices = []

# update some invoices [drop invoice, drop item(s), modify invoice, modify item]
# Only ids that really exist in the day-0 data are eligible.  The previous
# `i <= 1000` filter let id 1000 through, which raised a KeyError because
# day-0 invoices stop at inv_999.
invoices_to_update = [
    i for i in random_invoice_ids if f'inv_{i}' in existing_invoices
]
for update_i in invoices_to_update:
    rand_action = random.random()
    key = f'inv_{update_i}'
    if rand_action < .05:
        # drop an invoice together with all of its items
        deleted_invoices.append(existing_invoices[key])
        deleted_invoice_items.extend(existing_items[key])
    elif rand_action < 0.1:
        # drop an invoice item or two (never more than the invoice has)
        num_to_drop = 1 if random.random() < 0.6 else 2
        candidates = existing_items[key]
        deleted_invoice_items.extend(
            random.sample(candidates, min(num_to_drop, len(candidates)))
        )
        # Losing an item changes the invoice, so record a day1-stamped copy.
        # This used to write to `inv` before it was assigned in this branch.
        inv = dict(existing_invoices[key])
        inv['invoice_modified'] = 'day1'
        changed_invoices.append(inv)
    elif rand_action < 0.9:
        # modify the invoice itself: mark it as sent (work on a copy so the
        # day-0 record stays as it was)
        inv = dict(existing_invoices[key])
        inv['status'] = 'sent'  # was `==`, a comparison with no effect
        inv['invoice_modified'] = 'day1'
        changed_invoices.append(inv)
    else:
        # find an item and update the count
        inv_item = dict(random.choice(existing_items[key]))
        inv_item['count'] = random.randint(1, 6)
        inv_item['invoice_item_modified'] = 'day1'
        changed_invoice_items.append(inv_item)


In [28]:
# Write the day-0 snapshot: one CSV per entity under its updates/day=0 folder.
day0_exports = (
    ('./data/products/updates/day=0',
     "./data/products/updates/day=0/products.csv",
     products),
    ('./data/invoice/updates/day=0',
     "./data/invoice/updates/day=0/invoices.csv",
     invoices),
    ('./data/invoiceitems/updates/day=0',
     "./data/invoiceitems/updates/day=0/invoiceitems.csv",
     invoice_items),
)
for target_dir, csv_path, records in day0_exports:
    os.makedirs(target_dir, exist_ok=True)
    pd.DataFrame(records).to_csv(csv_path)



In [29]:
# Write the day-1 change sets: updates and deletes go to separate folders.
day1_exports = (
    ('./data/products/updates/day=1',
     "./data/products/updates/day=1/products.csv",
     updated_products),
    ('./data/invoice/updates/day=1',
     "./data/invoice/updates/day=1/invoices.csv",
     changed_invoices),
    ('./data/invoice/deletes/day=1',
     "./data/invoice/deletes/day=1/invoices.csv",
     deleted_invoices),
    ('./data/invoiceitems/updates/day=1',
     "./data/invoiceitems/updates/day=1/invoiceitems.csv",
     changed_invoice_items),
    ('./data/invoiceitems/deletes/day=1',
     "./data/invoiceitems/deletes/day=1/invoiceitems.csv",
     deleted_invoice_items),
)
for target_dir, csv_path, records in day1_exports:
    os.makedirs(target_dir, exist_ok=True)
    pd.DataFrame(records).to_csv(csv_path)



In [30]:
# Show the first ten day-1 product records as a quick sanity check.
updated_products[:10]

[{'product_id': 'p_66',
  'product_name': 'widget66',
  'price': 9.31,
  'product_modified': 'day1',
  'product_created': 'day0'},
 {'product_id': 'p_2',
  'product_name': 'doodad2',
  'price': 23.57,
  'product_modified': 'day1',
  'product_created': 'day0'},
 {'product_id': 'p_88',
  'product_name': 'sproket88',
  'price': 45.29,
  'product_modified': 'day1',
  'product_created': 'day0'},
 {'product_id': 'p_50',
  'product_name': 'doodad50',
  'price': 92.52,
  'product_modified': 'day1',
  'product_created': 'day0'},
 {'product_id': 'p_109',
  'product_name': 'sproket109',
  'price': 65.04,
  'product_modified': 'day1',
  'product_created': 'day0'},
 {'product_id': 'p_89',
  'product_name': 'doodad89',
  'price': 20.66,
  'product_modified': 'day1',
  'product_created': 'day0'},
 {'product_id': 'p_10',
  'product_name': 'sproket10',
  'price': 28.37,
  'product_modified': 'day1',
  'product_created': 'day0'},
 {'product_id': 'p_97',
  'product_name': 'sproket97',
  'price': 12.27,
 