In [1]:
from pyspark.sql import SparkSession
from pandas import set_option

# Let pandas render every column of the wide previews produced by `.toPandas()`.
# Uses the canonical option key; the previous spelling 'display.max.columns'
# only resolved because pandas falls back to regex matching of option names.
set_option('display.max_columns', None)

# Build (or reuse) the Spark session used by every cell below.
# NOTE(review): the memory settings presumably only take effect when this call
# actually creates the session rather than reusing a running one — confirm.
spark = (
    SparkSession
    .builder
    .config('spark.driver.memory', '4g')
    .config('spark.executor.memory', '2g')
    .getOrCreate()
)

# Session-level SQL settings: adaptive query execution on, and a fixed UTC
# session time zone.
spark.conf.set('spark.sql.adaptive.enabled', True)
spark.conf.set('spark.sql.session.timeZone', 'UTC')

### Silver

In [2]:
import os

# Root directory of the dataset. The cells below read their inputs from the
# BRONZE sub-directory and write their results to the SILVER sub-directory.
PATH = '/tmp/requests'
BRONZE, SILVER = (os.path.join(PATH, layer) for layer in ('bronze', 'silver'))

### Employees

In [27]:
%%time
path = os.path.join(SILVER, 'employees')

(
    spark
    .read
    .parquet(os.path.join(BRONZE, 'Employees'))
    .where(col('FullName') != 'OnBase Processing Service')
    .selectExpr(
        'ID as EMPLOYEE_ID',
        'FullName as EMPLOYEE_NAME',
        'PrimaryRoleName as ROLE',
        'lower(Email) as EMAIL',
        'ManagerName as MANAGER_NAME',
        'PodName as TEAM'
    )
    .write
    .mode('overwrite')
    .parquet(path)
)

employees = spark.read.parquet(path)
employees.limit(5).toPandas()

CPU times: user 18.3 ms, sys: 3.68 ms, total: 21.9 ms
Wall time: 853 ms


Unnamed: 0,EMPLOYEE_ID,EMPLOYEE_NAME,ROLE,EMAIL,MANAGER_NAME,TEAM
0,194728,Andi Prins,Audit Supervisor,aprins@spendmend.com,Travis Wheeler,Gold
1,194729,Bob VanGoor,Audit Supervisor,bvangoor@spendmend.com,Dan Hutchins,Red
2,194730,Colleen Kretowicz,Audit Supervisor,ckretowicz@spendmend.com,Travis Wheeler,Gold
3,194731,Cindy Allen,WNC Auditor,callen@spendmend.com,,
4,194732,Dan Hutchins,Audit Manager,dhutchins@spendmend.com,Dan Hutchins,Red


### Requests
```sql
    NeedLeadVendor --
```

In [55]:
%%time
from pyspark.sql.functions import col, lit, lower

column = when(lower('Contact').contains('@'), lower('Contact'))

path = os.path.join(SILVER, 'requests')


(
    spark
    .read
    .parquet(os.path.join(BRONZE, 'StatementRequests'))
    .withColumn('Contact', column)
    .withColumnRenamed('RequesterFullName', 'EMPLOYEE_NAME')
    .join(
        employees
            .select('EMPLOYEE_ID', 'EMPLOYEE_NAME'),
        on='EMPLOYEE_NAME',
        how='left'
    )
    .selectExpr(
        'ObjectID as REQUEST_ID',
        'ReferenceNumber as REFERENCE_ID',
        'EMPLOYEE_ID',
        'JobNo as JOB_ID',
        'ReferenceNumber as REFERENCE_NUMBER',
        'cast(RequestDate as date) as REQUEST_DATE',
        'Status as REQUEST_STATUS',
        'RequestMethod as REQUEST_METHOD',
        'RequestType as REQUEST_TYPE',
        'cast(LastActivityDate as date) as LAST_ACTIVITY_DATE',
        'cast(LastStatementReceivedDate as date) as LAST_RECEIVED_DATE',
        'CustomerName as CUSTOMER_NAME',
        'VendorNo as VENDOR_NUMBER',
        'cast(WNC as boolean) as WILL_NOT_COMPLY',
        'cast(StatementWNC as boolean) as STATEMENT_WILL_NOT_COMPLY',
        'cast(WNCSpecialHandling as boolean) as HAS_SPECIAL_HANDLING',
        'VendorGroupName as VENDOR_GROUP_NAME',
        'cast(Volume as double) as VOLUME',
        'VolumeTier as VOLUME_LEVEL',
        'cast(VolumeLast12 as double) as VOLUME_PREVIOUS_YEAR',
        'CallsheetNo as CALL_SHEET_NUMBER',
        'CallerStatus as CALL_STATUS',
        'cast(WebsiteVendor as boolean) as VENDOR_HAS_WEBSITE',
        'cast(NeedLeadVendor as boolean) as NEEDS_LEAD_VENDOR'
        
    )
    .sample(False, 1e-3, 42)
    .limit(5)
    .toPandas()
#     .write
#     .mode('overwrite')
#     .parquet(path)
)


# requests = spark.read.parquet(path)
# # requests.createOrReplaceTempView('requests')
# # display(requests)
# requests.limit(5).toPandas()

CPU times: user 42.8 ms, sys: 5.74 ms, total: 48.5 ms
Wall time: 543 ms


Unnamed: 0,REQUEST_ID,REFERENCE_ID,EMPLOYEE_ID,JOB_ID,REFERENCE_NUMBER,REQUEST_DATE,REQUEST_STATUS,REQUEST_METHOD,REQUEST_TYPE,LAST_ACTIVITY_DATE,LAST_RECEIVED_DATE,CUSTOMER_NAME,VENDOR_NUMBER,WILL_NOT_COMPLY,STATEMENT_WILL_NOT_COMPLY,HAS_SPECIAL_HANDLING,VENDOR_GROUP_NAME,VOLUME,VOLUME_LEVEL,VOLUME_PREVIOUS_YEAR,CALL_SHEET_NUMBER,CALL_STATUS,VENDOR_HAS_WEBSITE,NEEDS_LEAD_VENDOR
0,26681254,1285483,8398642,3427,1285483,2020-07-24,Superceded,MassEmail,,2020-07-24,,New Hanover Regional Medical Center,15740,,,,ACCUVEIN INC,61733.69,B,38929.68,C-338765,,,
1,32068618,1285643,20493525,3427,1285643,2021-03-09,Superceded,MassEmail,Caller,2021-02-03,,New Hanover Regional Medical Center,13884,,,,CAROLINA FURNISHING + DESIGN,87698.87,B,0.0,C-338899,Sent Authorization Letter,,
2,26681195,1285464,8398642,3427,1285464,2020-07-24,Superceded,MassEmail,,2020-07-24,,New Hanover Regional Medical Center,14292,,,,CMS IMAGING INC,114648.62,A,20833.37,C-338750,,,
3,26681051,1285420,8398642,3427,1285420,2020-07-24,Superceded,MassEmail,,2020-07-24,,New Hanover Regional Medical Center,7456,,,,HEALTHMARK INDUSTRIES CO INC,141068.22,A,97782.3,C-338715,,,
4,47587303,1285473,8404677,3427,1285473,2022-05-05,No Receipt,MassEmail,Caller,2022-05-05,,New Hanover Regional Medical Center,14476,,,,OFFICE CLEANERS OF CAROLINA,99998.99,BS,33714.0,C-430367,Needs Research,,


### Processes
- workitemlc
- rmobject

In [26]:
%%time
from pyspark.sql import Window
from pyspark.sql.functions import date_trunc, dense_rank, desc

window = Window.partitionBy('contentnum').orderBy(desc('transdate'))

path = os.path.join(SILVER, 'processes')

# primary key (contentnum, lcnum)
(
    spark.read.parquet(os.path.join(BRONZE, 'WorkItems'))
    .withColumn('transdate', date_trunc('second', 'transdate'))
    .select('contentnum', 'lcnum', 'statenum', 'transdate')
    .dropDuplicates()
    .withColumn('_dense_rank', dense_rank().over(window))
    .join(
        spark.read.parquet(os.path.join(BRONZE, 'Objects'))
        .selectExpr('objectid as contentnum', 'activestatus'),
        on='contentnum',
        how='inner'
    )
    .selectExpr(
        'contentnum as PROCESS_ID',
        'lcnum as PROCESS_CODE',
        'statenum as STATE_CODE',
        'activestatus as STATUS_CODE',
        'transdate as TRANSACTION_DATE',
        '_dense_rank as PROCESS_RANK'
    )
    .write
    .mode('overwrite')
    .parquet(path)
)

processes = spark.read.parquet(path)
processes.limit(5).toPandas()

CPU times: user 31.9 ms, sys: 7.65 ms, total: 39.6 ms
Wall time: 27.8 s


Unnamed: 0,PROCESS_ID,PROCESS_CODE,STATE_CODE,STATUS_CODE,TRANSACTION_DATE,PROCESS_RANK
0,194780,126,208,0,2018-08-06 09:40:25,1
1,194784,126,208,0,2016-07-26 00:29:57,1
2,291093,134,252,0,2021-05-08 01:35:53,1
3,315261,126,208,0,2016-07-21 18:29:16,1
4,404196,124,222,1,2016-07-19 14:49:20,1


### Jobs

In [15]:
%%time
path = os.path.join(SILVER, 'jobs')

(
    spark.read.parquet(os.path.join(BRONZE, 'Jobs'))
    .selectExpr(
        'JobNo as JOB_ID',
        'ManagerID as MANAGER_ID',
        'SupervisorID as SUPERVISOR_ID',
        'JobNo as JOB_NUMBER',
        'ManagerPodName as TEAM'
    )
    .write
    .mode('overwrite')
    .parquet(path)
)

jobs = spark.read.parquet(path)
jobs.limit(5).toPandas()

CPU times: user 16 ms, sys: 3.44 ms, total: 19.4 ms
Wall time: 532 ms


Unnamed: 0,JOB_ID,MANAGER_ID,SUPERVISOR_ID,JOB_NUMBER,TEAM
0,2600,,,2600,
1,3570,,,3570,
2,3178,,,3178,
3,3447,,,3447,
4,3467,,,3467,


### States
- lcstate

In [23]:
%%time
path = os.path.join(SILVER, 'states')

(
    spark.read.parquet(os.path.join(BRONZE, 'States'))
    .selectExpr(
        'statenum as STATE_CODE',
        'statename as STATE_NAME'
    )
    .write
    .mode('overwrite')
    .parquet(path)
)

states = spark.read.parquet(path)
states.limit(5).toPandas()

CPU times: user 13.2 ms, sys: 3.1 ms, total: 16.3 ms
Wall time: 434 ms


Unnamed: 0,STATE_CODE,STATE_NAME
0,185,SYS - Initial
1,186,SYS - Done
2,187,Claim Image Errors
3,188,SYS - Initial
4,189,Assignment


### Notes

In [24]:
# Load the bronze Notes table; the following cell filters and reshapes it.
path = os.path.join(BRONZE, 'Notes')
notes = (
    spark
    .read
    .parquet(path)
)

In [None]:
%%time
from pyspark.sql import Window
from pyspark.sql.functions import col, date_trunc, desc, dense_rank, length, lower, when

column = when(length('STNDescription') < 4096, col('STNDescription'))

condition = lower('STNDescription').contains('statement re-released')
window = dense_rank().over(Window.partitionBy(condition, 'STID').orderBy(desc('STNAdded')))
case = when(condition, window)


path = os.path.join(SILVER, 'descriptions')

(
    notes
    .join(
        requests,
        on=requests['ReferenceNumber'] == notes['STID'],
        how='left_semi'
    )
    .withColumn('_length', column)
    .withColumn('_row_number', case)
    .withColumn('STNAdded', date_trunc('second', 'STNAdded'))
    .selectExpr(
        'STID as REFERENCE_ID',
        'STNAdded as DESCRIPTION_DATE',
        '_length as STATEMENT_DESCRIPTION',
        '_row_number as RELEASE_RANK'
    )
    .write
    .mode('overwrite')
    .parquet(path)
)


descriptions = spark.read.parquet(path)
descriptions.limit(5).toPandas()

In [None]:
# Stop the Spark session created in the first cell; run this last.
spark.stop()