### SETUP & CONSTANTS

In [1]:
import re
import os
from pyspark.sql import SparkSession
from pyspark.sql import DataFrame
import pyspark.sql.functions as F
from pyspark.sql import Window
from functools import reduce

# CONSTANTS
INPUT_PATH = './notebooks/csv/'
TABLE_NAMES = [
    'Purchases',
    'Sales',
    'BegInv',
    'EndInv',
    'PurchasePrices',
    'InvoicePurchases',
]

In [2]:
def unique(df, group_by, select_cols,min=1):
    return df.groupBy(*group_by).agg(F.collect_set(F.struct(*select_cols)).alias('objs')).where(F.array_size('objs')>min)

### SUMMARY

In [3]:
spark=SparkSession.builder.appName("transform").getOrCreate()
existing_csv = os.listdir(INPUT_PATH)
_df_raw={}
for file in existing_csv:
    filename=rf'{INPUT_PATH}/{file}'
    table_name = re.sub(r'FINAL|Dec|\d+|\.csv', '', file)
    if table_name not in TABLE_NAMES:
        continue

    _df_raw[table_name]=spark.read.format("csv").option("header",True).load(filename)
    print(f'{table_name}:')
    _df_raw[table_name].limit(2).show()



PurchasePrices:
+-----+--------------------+-----+-----+------+--------------+-------------+------------+--------------------+
|Brand|         Description|Price| Size|Volume|Classification|PurchasePrice|VendorNumber|          VendorName|
+-----+--------------------+-----+-----+------+--------------+-------------+------------+--------------------+
|   58|Gekkeikan Black &...|12.99|750mL|   750|             1|         9.28|        8320|SHAW ROSS INT L I...|
|   62|Herradura Silver ...|36.99|750mL|   750|             1|        28.67|        1128|BROWN-FORMAN CORP...|
+-----+--------------------+-----+-----+------+--------------+-------------+------------+--------------------+

BegInv:
+-----------------+-----+------------+-----+--------------------+-----+------+-----+----------+
|      InventoryId|Store|        City|Brand|         Description| Size|onHand|Price| startDate|
+-----------------+-----+------------+-----+--------------------+-----+------+-----+----------+
|1_HARDERSFIELD_58|  

### SIMPLE TABLES

#### _df_product

In [None]:
product_cols = ['Brand', 'Description', 'Size']

_df_product_raw = (
        reduce(
        DataFrame.unionByName,
        [
            _df_raw['Purchases'].select(*product_cols),
            _df_raw['Sales'].select(*product_cols),
            _df_raw['BegInv'].select(*product_cols),
            _df_raw['EndInv'].select(*product_cols),
        ]
    )
    .select(F.col('Brand').cast('int').alias('Brand'),'Description','Size')
)
_df_product_raw.cache()

_df_product = (
    _df_product_raw
    .groupBy('Brand')
    .agg(
        F.collect_set('Description').alias('alt_names'),
        F.first('Size').alias('Size')
    )
    .withColumn('Description', F.expr("alt_names[0]")) # just take the first description available
    .withColumn('alt_descriptions', F.expr("slice(alt_names, 2, size(alt_names) - 1)")) # store alternate tames though
    .select(
        F.row_number().over(Window.orderBy('Brand')).alias('id'),
        'Brand',
        'Description',
        'Size',
        'alt_descriptions',
    )
)
_df_product.orderBy('id','Brand','Description','Size').show(10)
_df_product.cache()
print(_df_product.count())

+---+-----+--------------------+-----+----------------+
| id|Brand|         Description| Size|alt_descriptions|
+---+-----+--------------------+-----+----------------+
|  1|   58|Gekkeikan Black &...|750mL|              []|
|  2|   60|Canadian Club 185...|750mL|              []|
|  3|   61|Margaritaville Si...|750mL|              []|
|  4|   62|Herradura Silver ...|750mL|              []|
|  5|   63|Herradura Reposad...|750mL|              []|
|  6|   70|Luxardo Amaretto ...|750mL|              []|
|  7|   72|No. 3 London Dry Gin|750mL|              []|
|  8|   75|Three Olives Toma...|750mL|              []|
|  9|   77|Three Olives Espr...|750mL|              []|
| 10|   79|Three Olives Loop...|750mL|              []|
+---+-----+--------------------+-----+----------------+
only showing top 10 rows

11503


In [5]:
unique(_df_product, ['Brand'], ['Description'],1).show(10000, truncate=False)
unique(_df_product, ['Brand'], ['Size'],1).show(10000, truncate=False)

+-----+----+
|Brand|objs|
+-----+----+
+-----+----+

+-----+----+
|Brand|objs|
+-----+----+
+-----+----+



#### _df_vendor

In [141]:
vendor_cols=['VendorNumber', 'VendorName']
_df_vendor = (
    reduce(
        DataFrame.unionByName,
        [
            _df_raw['Purchases'].select(*vendor_cols),
            _df_raw['InvoicePurchases'].select(*vendor_cols),
            _df_raw['PurchasePrices'].select(*vendor_cols),
            _df_raw['Sales'].withColumnRenamed('VendorNo','VendorNumber').select(*vendor_cols), # may not be necessary
        ]
    )
    .groupBy(F.col('VendorNumber').cast('int').alias('VendorNumber'))
    .agg(
        F.collect_set('VendorName').alias('alt_names')
    )
    .withColumn('VendorName', F.expr("alt_names[0]"))
    .withColumn('alt_vendor_names', F.expr("slice(alt_names, 2, size(alt_names) - 1)"))
    .select(
        F.row_number().over(Window.orderBy('VendorNumber')).alias('id'),
        'VendorNumber',
        'VendorName',
        'alt_vendor_names',
    )
)
print(_df_vendor.count())
_df_vendor.orderBy('id').show(20)

132
+---+------------+--------------------+----------------+
| id|VendorNumber|          VendorName|alt_vendor_names|
+---+------------+--------------------+----------------+
|  1|           2|IRA GOLDMAN AND W...|              []|
|  2|          54|AAPER ALCOHOL & C...|              []|
|  3|          60|ADAMBA IMPORTS IN...|              []|
|  4|         105|ALTAMAR BRANDS LL...|              []|
|  5|         200|AMERICAN SPIRITS ...|              []|
|  6|         287|APPOLO VINEYARDS ...|              []|
|  7|         388|ATLANTIC IMPORTIN...|              []|
|  8|         480|BACARDI USA INC  ...|              []|
|  9|         516|BANFI PRODUCTS CO...|              []|
| 10|         653|STATE WINE & SPIR...|              []|
| 11|         660|SAZERAC NORTH AME...|              []|
| 12|        1002|BERNIKO LLC      ...|              []|
| 13|        1003|BRONCO WINE COMPA...|              []|
| 14|        1128|BROWN-FORMAN CORP...|              []|
| 15|        1189|BULLY BOY

In [60]:
unique(_df_vendor, ['VendorNumber'], ['VendorName'], 1).show(100, truncate=False)

+------------+----+
|VendorNumber|objs|
+------------+----+
+------------+----+



#### _df_city

In [107]:
_df_raw['BegInv'].printSchema()

root
 |-- InventoryId: string (nullable = true)
 |-- Store: string (nullable = true)
 |-- City: string (nullable = true)
 |-- Brand: string (nullable = true)
 |-- Description: string (nullable = true)
 |-- Size: string (nullable = true)
 |-- onHand: string (nullable = true)
 |-- Price: string (nullable = true)
 |-- startDate: string (nullable = true)



In [7]:
_df_inventory_union = _df_raw['BegInv'].unionByName(_df_raw['BegInv'],allowMissingColumns=True)

_df_city = (
    _df_inventory_union
    .select('City')
    .distinct()
    .select(
        F.row_number().over(Window.orderBy('City')).alias('id'),
        'City'
    )
)
_df_city.show(10)

+---+-------------+
| id|         City|
+---+-------------+
|  1|     ABERDEEN|
|  2|    AETHELNEY|
|  3|    ALNERWICK|
|  4|    ARBINGTON|
|  5|     ASHBORNE|
|  6|    AYLESBURY|
|  7|      BALERNO|
|  8|    BALLYMENA|
|  9|    BARNCOMBE|
| 10|BEGGAR'S HOLE|
+---+-------------+
only showing top 10 rows



#### _df_store

In [138]:
_df_store = (
    _df_inventory_union
    .join(_df_city.select('id','City').alias('c'), ['City'], 'left')
    .select(F.col('Store').cast('int').alias('Store'),F.col('c.id').alias('city_id'))
    .distinct()
    .select(
        F.row_number().over(Window.orderBy('Store','city_id')).alias('id'),
        F.col('Store').alias('Store'),
        'city_id'
    )
)
_df_store.count()
_df_store.show(100)

+---+-----+-------+
| id|Store|city_id|
+---+-----+-------+
|  1|    1|     33|
|  2|    2|      5|
|  3|    3|     36|
|  4|    4|     24|
|  5|    5|     59|
|  6|    6|     29|
|  7|    7|     58|
|  8|    8|      3|
|  9|    9|     12|
| 10|   10|     36|
| 11|   11|     17|
| 12|   12|     43|
| 13|   13|     62|
| 14|   14|     14|
| 15|   15|     65|
| 16|   16|     45|
| 17|   17|     49|
| 18|   18|     27|
| 19|   19|     66|
| 20|   20|     13|
| 21|   21|      7|
| 22|   22|     56|
| 23|   23|      4|
| 24|   24|     52|
| 25|   25|     16|
| 26|   26|     40|
| 27|   27|     47|
| 28|   28|     42|
| 29|   29|      6|
| 30|   30|     21|
| 31|   31|     36|
| 32|   32|     47|
| 33|   33|     36|
| 34|   34|     54|
| 35|   35|     32|
| 36|   36|     44|
| 37|   37|     51|
| 38|   38|     29|
| 39|   39|     25|
| 40|   40|     15|
| 41|   41|     42|
| 42|   42|     11|
| 43|   43|     67|
| 44|   44|     55|
| 45|   45|     64|
| 46|   46|     63|
| 47|   47|     53|


#### _df_inventory

In [125]:
inventory_cols = ['InventoryID', 'Store', 'Brand']
_df_inventory_raw = (
    reduce(
        DataFrame.unionByName,
        [
            _df_raw['Purchases'].select(*inventory_cols),
            _df_raw['Sales'].select(*inventory_cols),
            _df_raw['BegInv'].select(*inventory_cols),
            _df_raw['EndInv'].select(*inventory_cols),
        ]
    ).distinct()
)
_df_inventory_raw.cache()
_df_inventory_raw.orderBy(inventory_cols).show(100, truncate=False)

+----------------+-----+-----+
|InventoryID     |Store|Brand|
+----------------+-----+-----+
|10_HORNSEY_1000 |10   |1000 |
|10_HORNSEY_1001 |10   |1001 |
|10_HORNSEY_1003 |10   |1003 |
|10_HORNSEY_10030|10   |10030|
|10_HORNSEY_1004 |10   |1004 |
|10_HORNSEY_1005 |10   |1005 |
|10_HORNSEY_10057|10   |10057|
|10_HORNSEY_10058|10   |10058|
|10_HORNSEY_1006 |10   |1006 |
|10_HORNSEY_10062|10   |10062|
|10_HORNSEY_1009 |10   |1009 |
|10_HORNSEY_1012 |10   |1012 |
|10_HORNSEY_1013 |10   |1013 |
|10_HORNSEY_10164|10   |10164|
|10_HORNSEY_1019 |10   |1019 |
|10_HORNSEY_1021 |10   |1021 |
|10_HORNSEY_1022 |10   |1022 |
|10_HORNSEY_10227|10   |10227|
|10_HORNSEY_1023 |10   |1023 |
|10_HORNSEY_10236|10   |10236|
|10_HORNSEY_10238|10   |10238|
|10_HORNSEY_10239|10   |10239|
|10_HORNSEY_1024 |10   |1024 |
|10_HORNSEY_10242|10   |10242|
|10_HORNSEY_1025 |10   |1025 |
|10_HORNSEY_10254|10   |10254|
|10_HORNSEY_10266|10   |10266|
|10_HORNSEY_1028 |10   |1028 |
|10_HORNSEY_10283|10   |10283|
|10_HORN

In [126]:
unique(_df_inventory_raw, ['InventoryID'], ['Store', 'Brand'], 1).show(100, truncate=False)

+-----------+----+
|InventoryID|objs|
+-----------+----+
+-----------+----+



In [127]:
shared_inventory_cols = ['InventoryId','Store','Brand']
_df_inventory = (
    _df_inventory_raw
    .join(_df_raw['BegInv'].alias('start'), shared_inventory_cols,'left')
    .join(_df_raw['EndInv'].alias('end'), shared_inventory_cols,'left')
    .withColumn('Brand', F.col('Brand').cast('int'))
    # .join(_df_city.select(F.col('id').alias('city_id'),'City').alias('c'), ['City'], 'left')
    .join(_df_store.select(F.col('id').alias('store_id'),'Store').alias('s'), ['Store'], 'left')
    .join(_df_product.select(F.col('id').alias('product_id'),'Brand').alias('p'), ['Brand'], 'left')
    .select(
        F.row_number().over(Window.orderBy('InventoryId','store_id','product_id')).alias('id'),
        'InventoryId','store_id','product_id', # PK
        F.struct('start.price', F.col('start.startDate').alias('date'), 'start.onHand').alias('start'),
        F.struct('end.price', F.col('end.endDate').alias('date'), 'end.onHand').alias('end'),
    )
)
_df_inventory.orderBy('InventoryId').show(30,truncate=False)



+---+----------------+--------+----------+-----------------------+-----------------------+
|id |InventoryId     |store_id|product_id|start                  |end                    |
+---+----------------+--------+----------+-----------------------+-----------------------+
|1  |10_HORNSEY_1000 |10      |504       |{14.99, 2016-01-01, 1} |{NULL, NULL, NULL}     |
|2  |10_HORNSEY_1001 |10      |505       |{5.99, 2016-01-01, 11} |{5.99, 2016-12-31, 0}  |
|3  |10_HORNSEY_1003 |10      |506       |{NULL, NULL, NULL}     |{NULL, NULL, NULL}     |
|4  |10_HORNSEY_10030|10      |3753      |{NULL, NULL, NULL}     |{NULL, NULL, NULL}     |
|5  |10_HORNSEY_1004 |10      |507       |{NULL, NULL, NULL}     |{NULL, NULL, NULL}     |
|6  |10_HORNSEY_1005 |10      |508       |{34.99, 2016-01-01, 18}|{NULL, NULL, NULL}     |
|7  |10_HORNSEY_10057|10      |3757      |{NULL, NULL, NULL}     |{NULL, NULL, NULL}     |
|8  |10_HORNSEY_10058|10      |3758      |{NULL, NULL, NULL}     |{NULL, NULL, NULL}     |

In [9]:
# OLD
# shared_inventory_cols = [
#     'InventoryId',
#     'Store',
#     # 'City',
#     'Brand',
# ]
# _df_inventory = (
#     _df_raw['BegInv'].alias('start')
#     .join(
#         _df_raw['EndInv']
#         .alias('end'), 
#         shared_inventory_cols,
#         'outer'
#     )
#     .withColumn('Brand', F.col('Brand').cast('int'))
#     # .join(_df_city.select(F.col('id').alias('city_id'),'City').alias('c'), ['City'], 'left')
#     .join(_df_store.select(F.col('id').alias('store_id'),'Store').alias('s'), ['Store'], 'left')
#     .join(_df_product.select(F.col('id').alias('product_id'),'Brand').alias('p'), ['Brand'], 'left')
#     .select(
#         'InventoryId','store_id','product_id', # PK
#         F.struct('start.price', F.col('start.startDate').alias('date'), 'start.onHand').alias('start'),
#         F.struct('end.price', F.col('end.endDate').alias('date'), 'end.onHand').alias('end'),
#     )
#     .distinct()
#     .select(
#         F.row_number().over(Window.orderBy('InventoryId','store_id','product_id')).alias('id'),
#         'InventoryId', # Format: store_city_brand
#         'store_id',
#         'product_id',
        

#         'start',
#         'end',
#     )
# )
# _df_inventory.orderBy('product_id').show(30,truncate=False)



+------+-----------------+--------+----------+-----------------------+-----------------------+
|id    |InventoryId      |store_id|product_id|start                  |end                    |
+------+-----------------+--------+----------+-----------------------+-----------------------+
|29900 |1_HARDERSFIELD_58|1       |1         |{12.99, 2016-01-01, 8} |{12.99, 2016-12-31, 11}|
|3739  |10_HORNSEY_58    |10      |1         |{12.99, 2016-01-01, 6} |{12.99, 2016-12-31, 9} |
|13308 |14_BROMWICH_58   |14      |1         |{12.99, 2016-01-01, 3} |{12.99, 2016-12-31, 2} |
|18052 |15_WANBORNE_58   |15      |1         |{12.99, 2016-01-01, 9} |{12.99, 2016-12-31, 12}|
|19874 |16_LUNDY_58      |16      |1         |{12.99, 2016-01-01, 4} |{12.99, 2016-12-31, 7} |
|6845  |11_CARDEND_58    |11      |1         |{12.99, 2016-01-01, 9} |{12.99, 2016-12-31, 9} |
|32479 |20_BREDWARDINE_58|20      |1         |{NULL, NULL, NULL}     |{12.99, 2016-12-31, 21}|
|36092 |21_BALERNO_58    |21      |1         |{12.

In [114]:
unique(_df_inventory,['product_id'], ['InventoryId'],  1).show(100, truncate=False)

+----------+----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------

#### _df_purchase_order

In [115]:
purchase_order_cols = ['PONumber', 'PODate', 'VendorNumber', 'InvoiceDate', 'PayDate']
_df_purchase_order = (
    _df_raw['Purchases'].select(*purchase_order_cols)
    .join(
        _df_raw['InvoicePurchases'].select(*purchase_order_cols),
        purchase_order_cols,
        'outer'
    )
    .withColumn('VendorNumber', F.col('VendorNumber').cast('int'))
    .join(
        _df_vendor.select(F.col('id').alias('vendor_id'), 'VendorNumber'),
        ['VendorNumber'],
        'left'
    )
    .select(
        F.col('PONumber').cast('int').alias('PONumber'),
        F.col('PODate').cast('date').alias('PODate'),
        'vendor_id',
        F.col('InvoiceDate').cast('date').alias('InvoiceDate'),
        F.col('PayDate').cast('date').alias('PayDate'),
    )
    .distinct()
    .select(
        F.row_number().over(Window.orderBy('PONumber', 'PODate', 'vendor_id', 'InvoiceDate', 'PayDate')).alias('id'),
        'PONumber', 
        'PODate', 
        'vendor_id',
        'InvoiceDate',
        'PayDate',
    )
)
print('count=',_df_purchase_order.count())
_df_purchase_order.orderBy('id').show()

count= 6952
+---+--------+----------+---------+-----------+----------+
| id|PONumber|    PODate|vendor_id|InvoiceDate|   PayDate|
+---+--------+----------+---------+-----------+----------+
|  1|    8106|2015-12-20|        8| 2016-01-12|2016-02-05|
|  2|    8107|2015-12-20|      119| 2016-01-05|2016-02-10|
|  3|    8108|2015-12-20|       18| 2016-01-11|2016-02-10|
|  4|    8109|2015-12-20|       22| 2016-01-12|2016-02-11|
|  5|    8110|2015-12-20|       37| 2016-01-09|2016-02-19|
|  6|    8111|2015-12-20|      103| 2016-01-10|2016-02-04|
|  7|    8112|2015-12-20|       49| 2016-01-11|2016-02-06|
|  8|    8113|2015-12-20|       77| 2016-01-08|2016-02-07|
|  9|    8114|2015-12-20|       64| 2016-01-08|2016-02-14|
| 10|    8115|2015-12-20|       98| 2016-01-10|2016-02-10|
| 11|    8116|2015-12-20|       86| 2016-01-06|2016-02-07|
| 12|    8117|2015-12-20|      114| 2016-01-11|2016-02-09|
| 13|    8118|2015-12-20|       97| 2016-01-06|2016-02-20|
| 14|    8119|2015-12-20|       78| 2016-01-

In [22]:
print(_df_raw['Purchases'].select('PONumber','PODate','VendorNumber').distinct().count())
print(_df_raw['InvoicePurchases'].select('PONumber','PODate','VendorNumber').distinct().count())

5543
5543


#### _df_invoice

In [116]:
_df_invoice = (
    _df_raw['InvoicePurchases']
    .join(_df_vendor.select(F.col('id').alias('vendor_id'),'VendorNumber'), ['VendorNumber'], 'left')
    .join(_df_purchase_order.withColumnRenamed('id','purchase_order_id'),['PONumber', 'PODate', 'vendor_id', 'InvoiceDate', 'PayDate'])
    .select(
        F.row_number().over(Window.orderBy('purchase_order_id')).alias('id'),
        'purchase_order_id',
        F.col('Quantity').cast('int').alias('Quantity'),
        F.col('Dollars').cast('float').alias('Dollars'),
        F.col('Freight').cast('float').alias('Freight'),
    )
)
_df_invoice.orderBy('id').show(10,truncate=False)

+---+-----------------+--------+---------+-------+
|id |purchase_order_id|Quantity|Dollars  |Freight|
+---+-----------------+--------+---------+-------+
|1  |1                |10100   |137483.78|2935.2 |
|2  |2                |24      |348.72   |9.08   |
|3  |3                |8466    |60281.13 |1549.81|
|4  |4                |2246    |14298.09 |408.72 |
|5  |5                |8086    |56493.23 |1300.92|
|6  |6                |24      |185.4    |9.5    |
|7  |7                |427     |5336.48  |185.93 |
|8  |8                |819     |6139.47  |196.61 |
|9  |9                |715     |5533.18  |242.73 |
|10 |10               |2398    |20861.33 |633.9  |
+---+-----------------+--------+---------+-------+
only showing top 10 rows



#### _df_purchase

In [146]:
_df_purchase = (
    _df_raw['Purchases']
    .join(_df_vendor.select(F.col('id').alias('vendor_id'),'VendorNumber'), ['VendorNumber'], 'left')
    .join(
        _df_inventory.select(
            F.col('id').alias('inventory_id'),
            'InventoryId',
            # 'store_id',
            # 'product_id',
        ),
        ['InventoryId'],
        'left'
    )
    .join(_df_purchase_order.withColumnRenamed('id','purchase_order_id'),['PONumber', 'PODate', 'vendor_id', 'InvoiceDate', 'PayDate'])
    .select(
        'purchase_order_id',
        'inventory_id',
        F.col('ReceivingDate').cast('date').alias('ReceivingDate'),
        F.col('PurchasePrice').cast('float').alias('PurchasePrice'),
        F.col('Quantity').cast('int').alias('Quantity'),
        F.col('Dollars').cast('float').alias('Dollars'),
    )
    .select(
        F.row_number().over(Window.orderBy('purchase_order_id','inventory_id')).alias('id'),
        'purchase_order_id',
        'inventory_id',
        'ReceivingDate',
        'PurchasePrice',
        'Quantity',
        'Dollars',
    )
    
)
_df_purchase.cache()
_df_purchase.orderBy('purchase_order_id','inventory_id').show(10,truncate=False)

+---+-----------------+------------+-------------+-------------+--------+-------+
|id |purchase_order_id|inventory_id|ReceivingDate|PurchasePrice|Quantity|Dollars|
+---+-----------------+------------+-------------+-------------+--------+-------+
|1  |1                |17472       |2016-01-01   |21.42        |12      |257.04 |
|2  |1                |17917       |2016-01-01   |17.64        |12      |211.68 |
|3  |1                |17965       |2016-01-01   |18.31        |6       |109.86 |
|4  |1                |18097       |2016-01-01   |14.49        |58      |840.42 |
|5  |1                |18180       |2016-01-01   |18.79        |6       |112.74 |
|6  |1                |18537       |2016-01-01   |18.89        |18      |340.02 |
|7  |1                |18599       |2016-01-01   |18.45        |18      |332.1  |
|8  |1                |18621       |2016-01-01   |12.02        |11      |132.22 |
|9  |1                |18627       |2016-01-01   |38.27        |4       |153.08 |
|10 |1          

#### _df_sale

In [None]:
_df_sale = (
    _df_raw['Sales']
    .withColumn('VendorNumber', F.col('VendorNo').cast('int'))
    .join(_df_inventory.select(F.col('id').alias('inventory_id'),'InventoryId'), ['InventoryId'], 'left')
    .join(_df_vendor.select(F.col('id').alias('vendor_id'),'VendorNumber'), ['VendorNumber'], 'left')
    .select(
        'inventory_id',
        'vendor_id',
        F.col('SalesDate').cast('date').alias('SalesDate'),
        F.col('SalesQuantity').cast('int').alias('Quantity'),
        F.col('SalesDollars').cast('float').alias('Dollars'),
        F.col('SalesPrice').cast('float').alias('SalesPrice'),
        F.col('ExciseTax').cast('float').alias('ExciseTax'),
    )
    .select(
        F.row_number().over(Window.orderBy('inventory_id', 'vendor_id', 'SalesDate')).alias('id'),
        'inventory_id',
        'vendor_id',
        'SalesDate',
        'Quantity',
        'Dollars',
        'SalesPrice',
        'ExciseTax',
    )
)
_df_sale.show(30, truncate=False)

In [147]:
unique(_df_raw['Sales'],['VendorNo', 'InventoryId', 'SalesDate'], ['SalesPrice', 'SalesQuantity', 'SalesDollars']).show(5, truncate=False)

+--------+-----------+---------+----+
|VendorNo|InventoryId|SalesDate|objs|
+--------+-----------+---------+----+
+--------+-----------+---------+----+



### EXTRA

In [None]:

# _df_raw['PurchasePrices'].printSchema()
_df_raw['PurchasePrices'].groupBy('brand','description').agg(F.count('*').alias('cnt')).where('cnt > 1').show()

(
    # _df_raw['Purchases']
    _df_raw['BegInv'].union(_df_raw['EndInv'])
    .groupBy('brand','description')
    .agg(
        F.count('*').alias('cnt'),
        # F.array_distinct(F.array_agg('purchasePrice')).alias('uniquePurchasePrices'),
        # F.array_distinct(F.array_agg('Store')).alias('uniqueStore'),
        # F.array_distinct(F.array_agg('InventoryId')).alias('uniqueInventoryId'),
        F.array_distinct(F.array_agg('Size')).alias('uniqueSize'),
        # F.array_distinct(F.array_agg('Volume')).alias('uniqueVolume'),
    )
    # .where(F.size('uniquePurchasePrices') > 1)
    # .where(F.size('uniqueInventoryId') > 1)
    # .where(F.size('uniqueStore') > 1)
    .where(F.size('uniqueSize') > 1)
    # .where(F.size('uniqueVolume') > 1)
    .orderBy('brand','description')
    .show(100,truncate=False)
)

+-----+-----------+---+
|brand|description|cnt|
+-----+-----------+---+
+-----+-----------+---+

+-----+------------------------+---+------------------+
|brand|description             |cnt|uniqueSize        |
+-----+------------------------+---+------------------+
|1136 |Malibu 50ml Sampler Pack|55 |[50mL 5 Pk, 250mL]|
|3085 |Absolut Mini Bar 5/50mls|150|[50mL 5 Pk, 250mL]|
+-----+------------------------+---+------------------+



In [None]:
_df_purchase_prices_dupes = _df_raw['PurchasePrices'].groupBy(['brand','description']).agg(F.count_distinct('price','purchasePrice').alias('count')).filter("count > 1")
_df_purchases_dupes = _df_raw['Purchases'].groupBy(['InventoryId','description']).agg(F.count_distinct('purchasePrice').alias('count')).filter("count > 1")
_df_sales_dupes = _df_raw['Sales'].groupBy(['InventoryId','description']).agg(F.count_distinct('salesPrice').alias('count')).filter("count > 1")


_df_purchase_prices_dupes.show()
_df_purchases_dupes.show()
_df_sales_dupes.show()

+-----+-----------+-----+
|brand|description|count|
+-----+-----------+-----+
+-----+-----------+-----+

+-----------+-----------+-----+
|InventoryId|description|count|
+-----------+-----------+-----+
+-----------+-----------+-----+

+-------------------+--------------------+-----+
|        InventoryId|         description|count|
+-------------------+--------------------+-----+
|   11_CARDEND_24310|        Hi! Prosecco|    4|
|  22_SHARNWICK_6590|Almaden B/B Wh Zn...|    2|
| 23_ARBINGTON_16440|Banshee Cab Svgn ...|    2|
|     36_LEWES_10957|Seaglass Svgn Bl ...|    2|
|   71_CLARCTON_4787|Appleton Special ...|    2|
|  9_BLACKPOOL_46830|Pacific Rim Sweet...|    3|
|59_CLAETHORPES_3763|Patron Anejo Tequila|    2|
|  61_AETHELNEY_1269|  Jeffersons Bourbon|    2|
|   30_CULCHETH_3938|         Stolichnaya|    3|
|  34_PITMERDEN_2682|The Glendronach 1...|    3|
|   41_LARNWICK_4330|Capt Morgan Spice...|    5|
|   53_HILLFAR_12227|   Korbel Sweet Rose|    3|
|  49_GARIGILL_32648|Beringer C

In [None]:
# _df_raw['Sales'].select('InventoryId','salesPrice').where('InventoryId == "11_CARDEND_24310"').distinct().show()
_df_raw['Sales'].where('InventoryId == "11_CARDEND_24310"').show()

+----------------+-----+-----+------------+-----+-------------+------------+----------+----------+------+--------------+---------+--------+--------------------+
|     InventoryId|Store|Brand| Description| Size|SalesQuantity|SalesDollars|SalesPrice| SalesDate|Volume|Classification|ExciseTax|VendorNo|          VendorName|
+----------------+-----+-----+------------+-----+-------------+------------+----------+----------+------+--------------+---------+--------+--------------------+
|11_CARDEND_24310|   11|24310|Hi! Prosecco|750mL|            1|       15.99|     15.99|2016-01-04|   750|             2|     0.11|   10754|PERFECTA WINES   ...|
|11_CARDEND_24310|   11|24310|Hi! Prosecco|750mL|            2|       31.98|     15.99|2016-01-08|   750|             2|     0.22|   10754|PERFECTA WINES   ...|
|11_CARDEND_24310|   11|24310|Hi! Prosecco|750mL|            1|       12.99|     12.99|2016-01-21|   750|             2|     0.11|   10754|PERFECTA WINES   ...|
|11_CARDEND_24310|   11|24310|Hi! 

### ANOMALIES

#### Vendors names aren't normalized

In [None]:
unique(_df_raw['Purchases'], ['VendorNumber'], ['VendorName']).show(10,truncate=False)

+------------+--------------------------------------------------------------+
|VendorNumber|objs                                                          |
+------------+--------------------------------------------------------------+
|2000        |[{SOUTHERN WINE & SPIRITS NE }, {SOUTHERN GLAZERS W&S OF NE }]|
|1587        |[{VINEYARD BRANDS LLC        }, {VINEYARD BRANDS INC        }]|
|4425        |[{MARTIGNETTI COMPANIES }, {MARTIGNETTI COMPANIES}]           |
+------------+--------------------------------------------------------------+



#### Brand=Product. Description and size colums aren't normalized though

In [None]:
unique(_df_product_raw, ['Brand'], ['Description'],1).show(10, truncate=False)
unique(_df_product_raw, ['Brand'], ['Size'],1).show(10, truncate=False)

+-----+--------------------------------------------------------------------------------+
|Brand|objs                                                                            |
+-----+--------------------------------------------------------------------------------+
|350  |[{Courvoisier VS Gift Pak}, {Courvoisier VS 2 Glass Pack}, {Courvoisier VS VAP}]|
|1064 |[{Jack Daniels Honey VAP}, {Jack Daniels Honey + Glass}]                        |
|1065 |[{DiSaronno 3/50mls Pack}, {DiSaronno Cavalli Collection}]                      |
|1084 |[{Cointreau Liqueur w/ Carafe}, {Cointreau Liqueur with Carafe}]                |
|1927 |[{Jack Daniels Barrel Proof}, {"Jack Daniels ""Barrel Proof"""}]                |
|1961 |[{Everclear 151 Vodka}, {Everclear 151}]                                        |
|2209 |[{Pendleton 1910}, {Pendleton 1910 Rye}]                                        |
|2531 |[{WhistlePig 10 Yr Old Rye}, {WhistlePig Straight Rye 10Yr}]                    |
|2549 |[{WhistlePig 1

In [None]:
purchase_order_cols = ['PONumber', 'PODate', 'VendorNumber']
_df_purchase_orders = (
    _df_raw['Purchases'].select(*purchase_order_cols,F.lit(True).alias('is_purchase'))
    .join(
        _df_raw['InvoicePurchases'].select(*purchase_order_cols,F.lit(True).alias('is_invoice')),
        purchase_order_cols,
        'outer'
    )
)
print('total:', _df_purchase_orders.count())
print('lost invoices :', _df_purchase_orders.where('is_purchase is null and is_invoice').count())
_df_purchase_orders.show()

total: 2373810
lost invoices : 1336
+--------+----------+------------+-----------+----------+
|PONumber|    PODate|VendorNumber|is_purchase|is_invoice|
+--------+----------+------------+-----------+----------+
|   10000|2016-04-29|        2396|       true|      true|
|   10001|2016-04-29|        1128|       true|      true|
|   10001|2016-04-29|        1128|       true|      true|
|   10001|2016-04-29|        1128|       true|      true|
|   10001|2016-04-29|        1128|       true|      true|
|   10001|2016-04-29|        1128|       true|      true|
|   10001|2016-04-29|        1128|       true|      true|
|   10001|2016-04-29|        1128|       true|      true|
|   10001|2016-04-29|        1128|       true|      true|
|   10001|2016-04-29|        1128|       true|      true|
|   10001|2016-04-29|        1128|       true|      true|
|   10001|2016-04-29|        1128|       true|      true|
|   10001|2016-04-29|        1128|       true|      true|
|   10001|2016-04-29|        1128|  

In [33]:
# unique(_df_raw['Purchases'], ['PONumber'], ['PODate', 'VendorNumber'], 1).show(10, truncate=False)

_df_raw['Purchases'].orderBy(F.col('PONumber').desc()).show(300, truncate=False)

+---------------------+-----+-----+----------------------------+-----+------------+---------------------------+--------+----------+-------------+-----------+----------+-------------+--------+-------+--------------+
|InventoryId          |Store|Brand|Description                 |Size |VendorNumber|VendorName                 |PONumber|PODate    |ReceivingDate|InvoiceDate|PayDate   |PurchasePrice|Quantity|Dollars|Classification|
+---------------------+-----+-----+----------------------------+-----+------------+---------------------------+--------+----------+-------------+-----------+----------+-------------+--------+-------+--------------+
|79_BALLYMENA_5215    |79   |5215 |TGI Fridays Long Island Iced|1.75L|4466        |AMERICAN VINTAGE BEVERAGE  |9999    |2016-04-29|2016-05-05   |2016-05-14 |2016-06-26|9.41         |6       |56.46  |1             |
|67_EANVERNESS_3140   |67   |3140 |TGI Fridays Orange Dream    |1.75L|4466        |AMERICAN VINTAGE BEVERAGE  |9999    |2016-04-29|2016-05-0

In [119]:
print('BegInv InventoryID count=', _df_raw['BegInv'].select('InventoryID').distinct().count())
print('EndInv InventoryID count=', _df_raw['EndInv'].select('InventoryID').distinct().count())
print('Purchases InventoryID count=', _df_raw['Purchases'].select('InventoryID').distinct().count())
print('Sales InventoryID count=', _df_raw['Sales'].select('InventoryID').distinct().count())


BegInv InventoryID count= 206529
EndInv InventoryID count= 224489
Purchases InventoryID count= 245907
Sales InventoryID count= 267552


In [120]:
print('BegInv Store count=', _df_raw['BegInv'].select('Store').distinct().count())
print('EndInv Store count=', _df_raw['EndInv'].select('Store').distinct().count())
print('Purchases Store count=', _df_raw['Purchases'].select('Store').distinct().count())
print('Sales Store count=', _df_raw['Sales'].select('Store').distinct().count())

BegInv Store count= 79
EndInv Store count= 80
Purchases Store count= 80
Sales Store count= 80


#### Purchase Orders data is inconsistent. PONumber is not an identifier

In [136]:
common_cols = [
    'PONumber', 
]
_df_temp = (
    _df_raw['Purchases']
    .withColumn('purchase', F.lit(True))
    .groupBy('PONumber', 'PODate', 'VendorNumber','InvoiceDate','PayDate', 'purchase').agg(F.sum('Quantity').cast('int').alias('Quantity'), F.round(F.sum('Dollars'),2).alias('Dollars'))
    .alias('p')
    .join(
        _df_raw['InvoicePurchases']
        .select('PONumber', 'PODate', 'VendorNumber','Quantity','Dollars','Approval','InvoiceDate','PayDate').withColumn('invoice', F.lit(True)).alias('i'),
        common_cols,
        'outer'
    )
)
print('matching:', _df_temp.where('purchase and invoice').count())
print('missing :', _df_temp.where('purchase is null or invoice is null').count())
print('matching PODate:', _df_temp.where('p.PODate = i.PODate').count())
print('matching VendorNumber:', _df_temp.where('p.VendorNumber = i.VendorNumber').count())
print('matching InvoiceDate:', _df_temp.where('p.InvoiceDate = i.InvoiceDate').count())
print('matching PayDate:', _df_temp.where('p.PayDate = i.PayDate').count())
print('matching PODate AND VendorNumber:', _df_temp.where('p.PODate = i.PODate AND p.VendorNumber = i.VendorNumber').count())
print('matching Dollars:', _df_temp.where('p.Dollars = i.Dollars').count())
print('matching Quantity:', _df_temp.where('p.Quantity = i.Quantity').count())

print('non-related rows')
_df_temp.where('p.PODate != i.PODate or p.VendorNumber != i.VendorNumber').orderBy('p.PONumber').show(10,truncate=False)
print('matching rows')
_df_temp.where('p.PODate == i.PODate and p.VendorNumber == i.VendorNumber').orderBy('p.PONumber').show(10,truncate=False)


matching: 5543
missing : 0
matching PODate: 5192
matching VendorNumber: 4207
matching InvoiceDate: 4278
matching PayDate: 4218
matching PODate AND VendorNumber: 4207
matching Dollars: 4207
matching Quantity: 4210
non-related rows
+--------+----------+------------+-----------+----------+--------+--------+---------+----------+------------+--------+---------+--------------+-----------+----------+-------+
|PONumber|PODate    |VendorNumber|InvoiceDate|PayDate   |purchase|Quantity|Dollars  |PODate    |VendorNumber|Quantity|Dollars  |Approval      |InvoiceDate|PayDate   |invoice|
+--------+----------+------------+-----------+----------+--------+--------+---------+----------+------------+--------+---------+--------------+-----------+----------+-------+
|12311   |2016-09-25|60          |2016-10-12 |2016-11-14|true    |249     |4051.23  |2016-09-25|1128        |21757   |286441.4 |Frank Delahunt|2016-10-13 |2016-11-22|true   |
|12312   |2016-09-25|480         |2016-10-13 |2016-11-13|true    |2527