### SETUP & CONSTANTS

In [3]:
import re
import os
from pyspark.sql import SparkSession
from pyspark.sql import DataFrame
import pyspark.sql.functions as F
from pyspark.sql import Window
from functools import reduce

# CONSTANTS
INPUT_PATH = './notebooks/csv/'
TABLE_NAMES = [
    'Purchases',
    'Sales',
    'BegInv',
    'EndInv',
    'PurchasePrices',
    'InvoicePurchases',
]

In [4]:
def unique(df, group_by, select_cols,min=1):
    return df.groupBy(*group_by).agg(F.collect_set(F.struct(*select_cols)).alias('objs')).where(F.array_size('objs')>min)

### SUMMARY

In [5]:
spark=SparkSession.builder.appName("transform").getOrCreate()
existing_csv = os.listdir(INPUT_PATH)
_df_raw={}
for file in existing_csv:
    filename=rf'{INPUT_PATH}/{file}'
    table_name = re.sub(r'FINAL|Dec|\d+|\.csv', '', file)
    if table_name not in TABLE_NAMES:
        continue

    _df_raw[table_name]=spark.read.format("csv").option("header",True).load(filename)
    print(f'{table_name}:')
    _df_raw[table_name].limit(2).show()



PurchasePrices:
+-----+--------------------+-----+-----+------+--------------+-------------+------------+--------------------+
|Brand|         Description|Price| Size|Volume|Classification|PurchasePrice|VendorNumber|          VendorName|
+-----+--------------------+-----+-----+------+--------------+-------------+------------+--------------------+
|   58|Gekkeikan Black &...|12.99|750mL|   750|             1|         9.28|        8320|SHAW ROSS INT L I...|
|   62|Herradura Silver ...|36.99|750mL|   750|             1|        28.67|        1128|BROWN-FORMAN CORP...|
+-----+--------------------+-----+-----+------+--------------+-------------+------------+--------------------+

BegInv:
+-----------------+-----+------------+-----+--------------------+-----+------+-----+----------+
|      InventoryId|Store|        City|Brand|         Description| Size|onHand|Price| startDate|
+-----------------+-----+------------+-----+--------------------+-----+------+-----+----------+
|1_HARDERSFIELD_58|  

### SIMPLE TABLES

#### _df_product

In [91]:
product_cols = ['Brand', 'Description', 'Size']

_df_product_raw = (
        reduce(
        DataFrame.unionByName,
        [
            _df_raw['Purchases'].select(*product_cols),
            _df_raw['Sales'].select(*product_cols),
            _df_raw['BegInv'].select(*product_cols),
            _df_raw['EndInv'].select(*product_cols),
        ]
    )
    .select(F.col('Brand').cast('int').alias('Brand'),'Description','Size')
)
_df_product_raw.cache()

_df_product = (
    _df_product_raw
    .groupBy('Brand')
    .agg(
        F.collect_set('Description').alias('alt_names'),
        F.first('Size').alias('Size')
    )
    .withColumn('Description', F.expr("alt_names[0]"))
    .withColumn('alt_descriptions', F.expr("slice(alt_names, 2, size(alt_names) - 1)"))


    # .distinct()
    .select(
        F.row_number().over(Window.orderBy('Brand', 'Description', 'Size')).alias('id'),
        'Brand',
        'Description',
        'Size',
        'alt_descriptions',
    )
)
_df_product.orderBy('id','Brand','Description','Size').show(10)
_df_product.cache()
print(_df_product.count())

+---+-----+--------------------+-----+----------------+
| id|Brand|         Description| Size|alt_descriptions|
+---+-----+--------------------+-----+----------------+
|  1|   58|Gekkeikan Black &...|750mL|              []|
|  2|   60|Canadian Club 185...|750mL|              []|
|  3|   61|Margaritaville Si...|750mL|              []|
|  4|   62|Herradura Silver ...|750mL|              []|
|  5|   63|Herradura Reposad...|750mL|              []|
|  6|   70|Luxardo Amaretto ...|750mL|              []|
|  7|   72|No. 3 London Dry Gin|750mL|              []|
|  8|   75|Three Olives Toma...|750mL|              []|
|  9|   77|Three Olives Espr...|750mL|              []|
| 10|   79|Three Olives Loop...|750mL|              []|
+---+-----+--------------------+-----+----------------+
only showing top 10 rows

11503


In [7]:
unique(_df_product, ['Brand'], ['Description'],1).show(10000, truncate=False)
unique(_df_product, ['Brand'], ['Size'],1).show(10000, truncate=False)

+-----+----+
|Brand|objs|
+-----+----+
+-----+----+

+-----+----+
|Brand|objs|
+-----+----+
+-----+----+



#### _df_vendor

In [89]:
vendor_cols=['VendorNumber', 'VendorName']
_df_vendor = (
    reduce(
        DataFrame.unionByName,
        [
            _df_raw['Purchases'].select(*vendor_cols),
            _df_raw['InvoicePurchases'].select(*vendor_cols),
            _df_raw['PurchasePrices'].select(*vendor_cols),
        ]
    )
    .groupBy(F.col('VendorNumber').cast('int').alias('VendorNumber'))
    .agg(
        F.collect_set('VendorName').alias('alt_names')
    )
    .withColumn('VendorName', F.expr("alt_names[0]"))
    .withColumn('alt_vendor_names', F.expr("slice(alt_names, 2, size(alt_names) - 1)"))
    .select(
        F.row_number().over(Window.orderBy('VendorNumber')).alias('id'),
        'VendorNumber',
        'VendorName',
        'alt_vendor_names',
    )
)
_df_vendor.orderBy('id').show(20)

+---+------------+--------------------+----------------+
| id|VendorNumber|          VendorName|alt_vendor_names|
+---+------------+--------------------+----------------+
|  1|           2|IRA GOLDMAN AND W...|              []|
|  2|          54|AAPER ALCOHOL & C...|              []|
|  3|          60|ADAMBA IMPORTS IN...|              []|
|  4|         105|ALTAMAR BRANDS LL...|              []|
|  5|         200|AMERICAN SPIRITS ...|              []|
|  6|         287|APPOLO VINEYARDS ...|              []|
|  7|         388|ATLANTIC IMPORTIN...|              []|
|  8|         480|BACARDI USA INC  ...|              []|
|  9|         516|BANFI PRODUCTS CO...|              []|
| 10|         653|STATE WINE & SPIR...|              []|
| 11|         660|SAZERAC NORTH AME...|              []|
| 12|        1002|BERNIKO LLC      ...|              []|
| 13|        1003|BRONCO WINE COMPA...|              []|
| 14|        1128|BROWN-FORMAN CORP...|              []|
| 15|        1189|BULLY BOY DIS

#### _df_city

In [9]:
_df_inventory_union = _df_raw['BegInv'].unionByName(_df_raw['BegInv'],allowMissingColumns=True)

_df_city = (
    _df_inventory_union
    .select('City')
    .distinct()
    .select(
        F.row_number().over(Window.orderBy('City')).alias('id'),
        'City'
    )
)
_df_city.show(10)

+---+-------------+
| id|         City|
+---+-------------+
|  1|     ABERDEEN|
|  2|    AETHELNEY|
|  3|    ALNERWICK|
|  4|    ARBINGTON|
|  5|     ASHBORNE|
|  6|    AYLESBURY|
|  7|      BALERNO|
|  8|    BALLYMENA|
|  9|    BARNCOMBE|
| 10|BEGGAR'S HOLE|
+---+-------------+
only showing top 10 rows



#### _df_store

In [87]:
_df_store = (
    _df_inventory_union
    .join(_df_city.select('id','City').alias('c'), ['City'], 'left')
    .select(F.col('Store').cast('int').alias('Store'),F.col('c.id').alias('city_id'))
    .distinct()
    .select(
        F.row_number().over(Window.orderBy('Store','city_id')).alias('id'),
        F.col('Store').alias('Store'),
        'city_id'
    )
)
_df_store.show(10)

+---+-----+-------+
| id|Store|city_id|
+---+-----+-------+
|  1|    1|     33|
|  2|    2|      5|
|  3|    3|     36|
|  4|    4|     24|
|  5|    5|     59|
|  6|    6|     29|
|  7|    7|     58|
|  8|    8|      3|
|  9|    9|     12|
| 10|   10|     36|
+---+-----+-------+
only showing top 10 rows



#### _df_inventory

In [11]:
shared_inventory_cols = [
    'InventoryId',
    'Store',
    'City',
    'Brand',
]
_df_inventory = (
    _df_raw['BegInv'].alias('start')
    .join(
        _df_raw['EndInv']
        .alias('end'), 
        shared_inventory_cols,
        'outer'
    )
    .withColumn('Brand', F.col('Brand').cast('int'))
    .join(_df_city.select(F.col('id').alias('city_id'),'City').alias('c'), ['City'], 'left')
    .join(_df_store.select(F.col('id').alias('store_id'),'Store','city_id').alias('s'), ['Store','city_id'], 'left')
    .join(_df_product.select(F.col('id').alias('product_id'),'Brand').alias('p'), ['Brand'], 'left')
    .select(
        'InventoryId','store_id','product_id', # PK
        F.struct('start.price', F.col('start.startDate').alias('date'), 'start.onHand').alias('start'),
        F.struct('end.price', F.col('end.endDate').alias('date'), 'end.onHand').alias('end'),
    )
    .distinct()
    .select(
        F.row_number().over(Window.orderBy('InventoryId','store_id','product_id')).alias('id'),
        'InventoryId', # Format: store_city_brand
        'store_id',
        'product_id',
        

        'start',
        'end',
    )
)
_df_inventory.orderBy('product_id').show(30,truncate=False)



+------+-----------------+--------+----------+-----------------------+-----------------------+
|id    |InventoryId      |store_id|product_id|start                  |end                    |
+------+-----------------+--------+----------+-----------------------+-----------------------+
|29900 |1_HARDERSFIELD_58|1       |1         |{12.99, 2016-01-01, 8} |{12.99, 2016-12-31, 11}|
|3739  |10_HORNSEY_58    |10      |1         |{12.99, 2016-01-01, 6} |{12.99, 2016-12-31, 9} |
|13308 |14_BROMWICH_58   |14      |1         |{12.99, 2016-01-01, 3} |{12.99, 2016-12-31, 2} |
|18052 |15_WANBORNE_58   |15      |1         |{12.99, 2016-01-01, 9} |{12.99, 2016-12-31, 12}|
|19874 |16_LUNDY_58      |16      |1         |{12.99, 2016-01-01, 4} |{12.99, 2016-12-31, 7} |
|6845  |11_CARDEND_58    |11      |1         |{12.99, 2016-01-01, 9} |{12.99, 2016-12-31, 9} |
|32479 |20_BREDWARDINE_58|20      |1         |{NULL, NULL, NULL}     |{12.99, 2016-12-31, 21}|
|36092 |21_BALERNO_58    |21      |1         |{12.

#### _df_purchase_order

In [None]:
purchase_order_cols = ['PONumber', 'PODate', 'VendorNumber']
_df_purchase_order = (
    _df_raw['Purchases'].select(*purchase_order_cols)
    .join(
        _df_raw['InvoicePurchases'].select(*purchase_order_cols),
        purchase_order_cols,
        'left'
    )
    .select(
        F.col('PONumber').cast('int').alias('PONumber'),
        F.col('PODate').cast('date').alias('PODate'),
        F.col('VendorNumber').cast('int').alias('VendorNumber'),
    )
    .distinct()
    .select(
        F.row_number().over(Window.orderBy(purchase_order_cols)).alias('id'),
        *purchase_order_cols,
    )
)
_df_purchase_order.orderBy('id').show()

+---+--------+----------+------------+
| id|PONumber|    PODate|VendorNumber|
+---+--------+----------+------------+
|  1|    8106|2015-12-20|         480|
|  2|    8107|2015-12-20|       90046|
|  3|    8108|2015-12-20|        1392|
|  4|    8109|2015-12-20|        1590|
|  5|    8110|2015-12-20|        3252|
|  6|    8111|2015-12-20|       17032|
|  7|    8112|2015-12-20|        4692|
|  8|    8113|2015-12-20|        8352|
|  9|    8114|2015-12-20|        6785|
| 10|    8115|2015-12-20|       10754|
| 11|    8116|2015-12-20|        9206|
| 12|    8117|2015-12-20|       90027|
| 13|    8118|2015-12-20|       10050|
| 14|    8119|2015-12-20|        8663|
| 15|    8120|2015-12-20|        8920|
| 16|    8121|2015-12-20|       28776|
| 17|    8124|2015-12-21|         105|
| 18|    8125|2015-12-21|        2876|
| 19|    8126|2015-12-21|       90047|
| 20|    8128|2015-12-21|       10000|
+---+--------+----------+------------+
only showing top 20 rows



#### _df_invoice

In [None]:
_df_invoice = (
    _df_raw['InvoicePurchases']
    .join(_df_vendor.select(F.col('id').alias('vendor_id'),'VendorNumber'), ['VendorNumber'], 'left')
    .select(
        F.row_number().over(Window.orderBy('PONumber', 'PODate')).alias('id'),
        'PONumber',
        'PONumber',
        'PODate',
        'vendor_id',
        F.col('InvoiceDate').cast('date').alias('InvoiceDate'),
        F.col('PayDate').cast('date').alias('PayDate'),
        F.col('Quantity').cast('int').alias('Quantity'),
        F.col('Dollars').cast('float').alias('Dollars'),
        F.col('Freight').cast('float').alias('Freight'),
        # 'Approval', # ignore approval column for now
    )
)
_df_invoice.orderBy('id').show(10,truncate=False)

+---+--------+--------+----------+---------+-----------+----------+--------+---------+-------+
|id |PONumber|PONumber|PODate    |vendor_id|InvoiceDate|PayDate   |Quantity|Dollars  |Freight|
+---+--------+--------+----------+---------+-----------+----------+--------+---------+-------+
|1  |10000   |10000   |2016-04-29|29       |2016-05-16 |2016-06-23|12      |52.08    |0.26   |
|2  |10001   |10001   |2016-04-29|14       |2016-05-10 |2016-06-12|16679   |227776.44|1138.88|
|3  |10002   |10002   |2016-04-29|15       |2016-05-17 |2016-06-24|125     |2775.93  |14.43  |
|4  |10003   |10003   |2016-04-29|34       |2016-05-12 |2016-06-14|30      |258.36   |1.27   |
|5  |10004   |10004   |2016-04-29|31       |2016-05-12 |2016-06-19|175     |2873.57  |14.94  |
|6  |10005   |10005   |2016-04-29|33       |2016-05-17 |2016-06-20|1190    |31095.81 |161.7  |
|7  |10006   |10006   |2016-04-29|41       |2016-05-13 |2016-06-12|6782    |53202.67 |287.29 |
|8  |10007   |10007   |2016-04-29|101      |2016-0

#### _df_purchase

In [None]:
_df_purchase = (
    _df_raw['Purchases']
    .join(_df_vendor.select(F.col('id').alias('vendor_id'),'VendorNumber'), ['VendorNumber'], 'left')
    .join(
        _df_inventory.select(
            F.col('id').alias('inventory_id'),
            'InventoryId',
            'store_id',
            'product_id',
        ),
        ['InventoryId'],
        'left'
    )
    .join(_df_invoice.withColumnRenamed('id', 'invoice_id'), ['PONumber', 'PODate', 'vendor_id', 'InvoiceDate', 'PayDate', 'Quantity', 'Dollars'], 'left')
    # .join(_df_product.select(F.col('id').alias('product_id'),'Brand'), ['Brand'], 'left')
    
)
_df_purchase.orderBy('PONumber').show(10,truncate=False)

+--------+----------+---------+-----------+----------+--------+-------+----------------+------------+-----+-----+----------------------------+-----+---------------------------+-------------+-------------+--------------+------------+--------+----------+----------+-------+
|PONumber|PODate    |vendor_id|InvoiceDate|PayDate   |Quantity|Dollars|InventoryId     |VendorNumber|Store|Brand|Description                 |Size |VendorName                 |ReceivingDate|PurchasePrice|Classification|inventory_id|store_id|product_id|invoice_id|Freight|
+--------+----------+---------+-----------+----------+--------+-------+----------------+------------+-----+-----+----------------------------+-----+---------------------------+-------------+-------------+--------------+------------+--------+----------+----------+-------+
|10000   |2016-04-29|29       |2016-05-16 |2016-06-23|12      |52.08  |51_ABERDEEN_3508|2396        |51   |3508 |Llord's Orange Curacao      |Liter|BLACK PRINCE DISTILLERY INC|2016-05-

In [59]:
# _df_raw['Purchases'].groupBy('PONumber').count().where('count > 1').show(100, truncate=False)W
unique(_df_raw['Purchases'], ['PONumber', 'PODate', 'VendorName','InvoiceDate', 'PayDate'], ['Brand', 'Quantity','PurchasePrice','Dollars'], 1).orderBy('PONumber').show(50, truncate=False)

+--------+----------+---------------------------+-----------+----------+----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------

In [46]:
common_cols = ['PONumber', 'PODate', 'VendorNumber'] # , 'PODate', 'VendorNumber', 'InvoiceDate', 'PayDate', 'Quantity', 'Dollars'
_df_temp = (
    _df_raw['Purchases'].withColumn('purchase', F.lit(True)).alias('p')
    .join(
        _df_raw['InvoicePurchases'].withColumn('invoice', F.lit(True)).alias('i'),
        common_cols,
        'left'
    )
    .select('p.*', 'i.*')
)

# _df_temp.where('purchase is null or invoice is null').show(10, truncate=False)

print('matching:', _df_temp.where('purchase and invoice').count())
print('missing :', _df_temp.where('purchase is null or invoice is null').count())

# _df_temp.where('purchase is null or invoice is null').show(100, truncate=False)


matching: 1750484
missing : 621990


### EXTRA

In [None]:

# _df_raw['PurchasePrices'].printSchema()
_df_raw['PurchasePrices'].groupBy('brand','description').agg(F.count('*').alias('cnt')).where('cnt > 1').show()

(
    # _df_raw['Purchases']
    _df_raw['BegInv'].union(_df_raw['EndInv'])
    .groupBy('brand','description')
    .agg(
        F.count('*').alias('cnt'),
        # F.array_distinct(F.array_agg('purchasePrice')).alias('uniquePurchasePrices'),
        # F.array_distinct(F.array_agg('Store')).alias('uniqueStore'),
        # F.array_distinct(F.array_agg('InventoryId')).alias('uniqueInventoryId'),
        F.array_distinct(F.array_agg('Size')).alias('uniqueSize'),
        # F.array_distinct(F.array_agg('Volume')).alias('uniqueVolume'),
    )
    # .where(F.size('uniquePurchasePrices') > 1)
    # .where(F.size('uniqueInventoryId') > 1)
    # .where(F.size('uniqueStore') > 1)
    .where(F.size('uniqueSize') > 1)
    # .where(F.size('uniqueVolume') > 1)
    .orderBy('brand','description')
    .show(100,truncate=False)
)

+-----+-----------+---+
|brand|description|cnt|
+-----+-----------+---+
+-----+-----------+---+

+-----+------------------------+---+------------------+
|brand|description             |cnt|uniqueSize        |
+-----+------------------------+---+------------------+
|1136 |Malibu 50ml Sampler Pack|55 |[50mL 5 Pk, 250mL]|
|3085 |Absolut Mini Bar 5/50mls|150|[50mL 5 Pk, 250mL]|
+-----+------------------------+---+------------------+



In [None]:
_df_purchase_prices_dupes = _df_raw['PurchasePrices'].groupBy(['brand','description']).agg(F.count_distinct('price','purchasePrice').alias('count')).filter("count > 1")
_df_purchases_dupes = _df_raw['Purchases'].groupBy(['InventoryId','description']).agg(F.count_distinct('purchasePrice').alias('count')).filter("count > 1")
_df_sales_dupes = _df_raw['Sales'].groupBy(['InventoryId','description']).agg(F.count_distinct('salesPrice').alias('count')).filter("count > 1")


_df_purchase_prices_dupes.show()
_df_purchases_dupes.show()
_df_sales_dupes.show()

+-----+-----------+-----+
|brand|description|count|
+-----+-----------+-----+
+-----+-----------+-----+

+-----------+-----------+-----+
|InventoryId|description|count|
+-----------+-----------+-----+
+-----------+-----------+-----+

+-------------------+--------------------+-----+
|        InventoryId|         description|count|
+-------------------+--------------------+-----+
|   11_CARDEND_24310|        Hi! Prosecco|    4|
|  22_SHARNWICK_6590|Almaden B/B Wh Zn...|    2|
| 23_ARBINGTON_16440|Banshee Cab Svgn ...|    2|
|     36_LEWES_10957|Seaglass Svgn Bl ...|    2|
|   71_CLARCTON_4787|Appleton Special ...|    2|
|  9_BLACKPOOL_46830|Pacific Rim Sweet...|    3|
|59_CLAETHORPES_3763|Patron Anejo Tequila|    2|
|  61_AETHELNEY_1269|  Jeffersons Bourbon|    2|
|   30_CULCHETH_3938|         Stolichnaya|    3|
|  34_PITMERDEN_2682|The Glendronach 1...|    3|
|   41_LARNWICK_4330|Capt Morgan Spice...|    5|
|   53_HILLFAR_12227|   Korbel Sweet Rose|    3|
|  49_GARIGILL_32648|Beringer C

In [None]:
# _df_raw['Sales'].select('InventoryId','salesPrice').where('InventoryId == "11_CARDEND_24310"').distinct().show()
_df_raw['Sales'].where('InventoryId == "11_CARDEND_24310"').show()

+----------------+-----+-----+------------+-----+-------------+------------+----------+----------+------+--------------+---------+--------+--------------------+
|     InventoryId|Store|Brand| Description| Size|SalesQuantity|SalesDollars|SalesPrice| SalesDate|Volume|Classification|ExciseTax|VendorNo|          VendorName|
+----------------+-----+-----+------------+-----+-------------+------------+----------+----------+------+--------------+---------+--------+--------------------+
|11_CARDEND_24310|   11|24310|Hi! Prosecco|750mL|            1|       15.99|     15.99|2016-01-04|   750|             2|     0.11|   10754|PERFECTA WINES   ...|
|11_CARDEND_24310|   11|24310|Hi! Prosecco|750mL|            2|       31.98|     15.99|2016-01-08|   750|             2|     0.22|   10754|PERFECTA WINES   ...|
|11_CARDEND_24310|   11|24310|Hi! Prosecco|750mL|            1|       12.99|     12.99|2016-01-21|   750|             2|     0.11|   10754|PERFECTA WINES   ...|
|11_CARDEND_24310|   11|24310|Hi! 

### ANOMALIES

#### Vendors names aren't normalized

In [None]:
unique(_df_raw['Purchases'], ['VendorNumber'], ['VendorName']).show(10,truncate=False)

+------------+--------------------------------------------------------------+
|VendorNumber|objs                                                          |
+------------+--------------------------------------------------------------+
|2000        |[{SOUTHERN WINE & SPIRITS NE }, {SOUTHERN GLAZERS W&S OF NE }]|
|1587        |[{VINEYARD BRANDS LLC        }, {VINEYARD BRANDS INC        }]|
|4425        |[{MARTIGNETTI COMPANIES }, {MARTIGNETTI COMPANIES}]           |
+------------+--------------------------------------------------------------+



#### Brand=Product. Description and size colums aren't normalized though

In [69]:
unique(_df_product_raw, ['Brand'], ['Description'],1).show(10, truncate=False)
unique(_df_product_raw, ['Brand'], ['Size'],1).show(10, truncate=False)

+-----+--------------------------------------------------------------------------------+
|Brand|objs                                                                            |
+-----+--------------------------------------------------------------------------------+
|350  |[{Courvoisier VS Gift Pak}, {Courvoisier VS 2 Glass Pack}, {Courvoisier VS VAP}]|
|1064 |[{Jack Daniels Honey VAP}, {Jack Daniels Honey + Glass}]                        |
|1065 |[{DiSaronno 3/50mls Pack}, {DiSaronno Cavalli Collection}]                      |
|1084 |[{Cointreau Liqueur w/ Carafe}, {Cointreau Liqueur with Carafe}]                |
|1927 |[{Jack Daniels Barrel Proof}, {"Jack Daniels ""Barrel Proof"""}]                |
|1961 |[{Everclear 151 Vodka}, {Everclear 151}]                                        |
|2209 |[{Pendleton 1910}, {Pendleton 1910 Rye}]                                        |
|2531 |[{WhistlePig 10 Yr Old Rye}, {WhistlePig Straight Rye 10Yr}]                    |
|2549 |[{WhistlePig 1

In [70]:
purchase_order_cols = ['PONumber', 'PODate', 'VendorNumber']
_df_purchase_orders = (
    _df_raw['Purchases'].select(*purchase_order_cols,F.lit(True).alias('is_purchase'))
    .join(
        _df_raw['InvoicePurchases'].select(*purchase_order_cols,F.lit(True).alias('is_invoice')),
        purchase_order_cols,
        'outer'
    )
)
print('total:', _df_purchase_orders.count())
print('lost invoices :', _df_purchase_orders.where('is_purchase is null and is_invoice').count())
_df_purchase_orders.show()

total: 2373810
lost invoices : 1336
+--------+----------+------------+-----------+----------+
|PONumber|    PODate|VendorNumber|is_purchase|is_invoice|
+--------+----------+------------+-----------+----------+
|   10000|2016-04-29|        2396|       true|      true|
|   10001|2016-04-29|        1128|       true|      true|
|   10001|2016-04-29|        1128|       true|      true|
|   10001|2016-04-29|        1128|       true|      true|
|   10001|2016-04-29|        1128|       true|      true|
|   10001|2016-04-29|        1128|       true|      true|
|   10001|2016-04-29|        1128|       true|      true|
|   10001|2016-04-29|        1128|       true|      true|
|   10001|2016-04-29|        1128|       true|      true|
|   10001|2016-04-29|        1128|       true|      true|
|   10001|2016-04-29|        1128|       true|      true|
|   10001|2016-04-29|        1128|       true|      true|
|   10001|2016-04-29|        1128|       true|      true|
|   10001|2016-04-29|        1128|  