In [1]:
from pyspark.sql import SparkSession
from pandas import set_option

# Render every column when a frame is displayed through toPandas().
# 'display.max_columns' is the canonical option key; the previous dotted
# spelling ('display.max.columns') only resolved because pandas matches
# option names as regular expressions.
set_option('display.max_columns', None)

# Create (or reuse) the Spark session with explicit driver/executor memory.
spark = (
    SparkSession
    .builder
    .config('spark.driver.memory', '4g')
    .config('spark.executor.memory', '2g')
    .getOrCreate()
)

# Runtime SQL settings: adaptive query execution on, timestamps handled in UTC.
spark.conf.set('spark.sql.adaptive.enabled', True)
spark.conf.set('spark.sql.session.timeZone', 'UTC')

In [2]:
import os

# Root folder of the data set and its two layers (bronze = raw extracts,
# silver = cleaned tables written by this notebook).
PATH = '/tmp/requests'
BRONZE, SILVER = (os.path.join(PATH, layer) for layer in ('bronze', 'silver'))

## Bronze

### Work Items

In [3]:
# Work items (life cycle) from the bronze layer.
# Cleanup noted for later:
#   - truncate(transdate)
#   - to_date('ownedsince', 'lastupdated')
path = os.path.join(BRONZE, 'WorkItems')
workItems = spark.read.parquet(path)

print(f'workItems contains {workItems.count():,} records')

# Five-row preview rendered through pandas.
(
    workItems
    .limit(5)
    .toPandas()
)

workItems contains 2,652,986 records


Unnamed: 0,lcnum,statenum,contentnum,wfcontenttype,transdate,priority,versionid,ownernum,ownedstatus,ownedsince,lastupdated,flags,contentclassnum
0,125,298,11190595,3,2018-02-11 07:07:08.553,0,0,0,0,1964-01-01,1964-01-01,0,1184
1,125,298,11190596,3,2018-02-11 07:07:08.630,0,0,0,0,1964-01-01,1964-01-01,0,1184
2,125,298,11190597,3,2018-02-11 07:07:08.710,0,0,0,0,1964-01-01,1964-01-01,0,1184
3,125,298,11190598,3,2018-02-11 07:07:08.787,0,0,0,0,1964-01-01,1964-01-01,0,1184
4,125,298,11190599,3,2018-02-11 07:07:08.880,0,0,0,0,1964-01-01,1964-01-01,0,1184


```sql
SELECT
    lcnum,
    statenum,
    contentnum,
    transdate
FROM
    workitemlc
WHERE
    contentnum = 40252908
ORDER BY
    transdate
```

In [5]:
# Disabled exploration: for each contentnum, the difference between the latest
# and earliest transdate (both cast to long), largest difference first.
# NOTE: re-enabling this cell also requires importing `desc`; importing
# `min`/`max` from pyspark shadows the Python built-ins of the same name.
# from pyspark.sql.functions import min, max

# (
#     workItems
#     .groupBy('contentnum')
#     .agg(
#         min('transdate').cast('long').alias('minimum'),
#         max('transdate').cast('long').alias('maximum')
#     )
#     .selectExpr(
#         'contentnum',
#         'maximum - minimum as seconds'
#     )
#     .orderBy(desc('seconds'))
#     .show(5)
# )

In [6]:
# Disabled exploration: rank each work item's rows by transdate (newest first)
# and list rows with lcnum = 160 that are not the newest row (_rank > 1).
# from pyspark.sql import Window
# from pyspark.sql.functions import desc, rank

# window = Window.partitionBy('contentnum').orderBy(desc('transdate'))

# (
#     workItems
#     .withColumn('_rank', rank().over(window))
#     .where('lcnum = 160')
#     .where('_rank > 1')
#     .limit(5)
#     .toPandas()
# )

In [7]:
# Disabled exploration: how often lcnum = 160 appears at each rank position.
# It uses `window` and `rank`, which are only defined in commented-out code,
# so those must be restored first if this cell is re-enabled.
# # 160 always last or second to last
# (
#     workItems
#     .withColumn('_rank', rank().over(window))
#     .where('lcnum = 160')
#     .groupBy('_rank')
#     .count()
#     .show()
# )

### Objects

In [4]:
# Objects from the bronze layer.
# Cleanup noted for later: truncate(createddate)
path = os.path.join(BRONZE, 'Objects')
objects = spark.read.parquet(path)

print(f'objects contains {objects.count():,} records')

# Preview the key and its status flag (status 0..2; = 0 is the value of interest).
objects.select('objectid', 'activestatus').limit(5).toPandas()

objects contains 47,827,882 records


Unnamed: 0,objectid,activestatus
0,193512,1
1,193513,0
2,193514,0
3,193515,0
4,193516,0


In [126]:
# Count the requests whose object id also exists in the objects table.
# roughly 1,000 records missing from objects table
#
# The bronze requests are read here explicitly. The `requests` variable is
# only assigned further down the notebook and is later rebound to the silver
# table, so relying on it made this cell depend on execution order.
requests_bronze = spark.read.parquet(os.path.join(BRONZE, 'StatementRequests'))

(
    requests_bronze
    .join(
        objects,
        on='objectid',
        how='left_semi'
    )
    .count()
)

539270

In [128]:
# Active-status distribution of the objects that back a request.
# all requests status = 0
#
# Bronze requests are read explicitly so the cell does not depend on whichever
# frame the `requests` variable happens to hold when it is executed.
requests_bronze = spark.read.parquet(os.path.join(BRONZE, 'StatementRequests'))

(
    objects
    .join(
        requests_bronze,
        on='objectid',
        how='left_semi'
    )
    .groupBy('activestatus')
    .count()
    .show()
)

+------------+------+
|activestatus| count|
+------------+------+
|           0|539270|
+------------+------+



In [131]:
# Active-status distribution of objects that have a work item in life cycle 160.
# Finding: very few status = 1
(
    objects
    .join(
        workItems.filter('lcnum = 160'),
        on=objects['objectid'] == workItems['contentnum'],
        how='left_semi',
    )
    .groupby('activestatus')
    .count()
    .show()
)

+------------+-----+
|activestatus|count|
+------------+-----+
|           1|   32|
|           0|52589|
+------------+-----+



In [132]:
# Number of requests whose object has activestatus = 1 and a work item in
# life cycle 160.
#
# Bronze requests are read explicitly so the cell does not depend on whichever
# frame the `requests` variable happens to hold when it is executed.
requests_bronze = spark.read.parquet(os.path.join(BRONZE, 'StatementRequests'))

(
    requests_bronze
    .join(
        objects.where('activestatus = 1')
        .join(
            workItems.where('lcnum = 160'),
            on=workItems['contentnum'] == objects['objectid'],
            how='left_semi'
        ),
        on='objectid',
        how='inner'
    )
    .count()
)

0

### Jobs

In [5]:
# Jobs from the bronze layer. Columns of interest:
# JobNo, ManagerPodName, ManagerID, SupervisorID
path = os.path.join(BRONZE, 'Jobs')
jobs = spark.read.parquet(path)

print(f'jobs contains {jobs.count():,} records')

# Preview only the jobs that have a manager pod.
(
    jobs
    .select('JobNo', 'ManagerPodName', 'ManagerID', 'SupervisorID')
    .filter('ManagerPodName is not null')
    .limit(5)
    .toPandas()
)

jobs contains 1,400 records


Unnamed: 0,JobNo,ManagerPodName,ManagerID,SupervisorID
0,2929,Blue,194740,194740
1,3067,Gold,194747,194730
2,2761,Blue,194740,194749
3,2916,Blue,194740,194749
4,2927,Blue,194740,194749


In [6]:
# Number of jobs per manager pod, ordered by pod name.
(
    jobs
    .groupby('ManagerPodName')
    .count()
    .sort('ManagerPodName')
    .show()
)

+--------------+-----+
|ManagerPodName|count|
+--------------+-----+
|          null| 1059|
|         Black|   10|
|          Blue|   59|
|          Gold|   76|
|          Grey|   53|
|        Orange|   28|
|        Purple|   18|
|           Red|   97|
+--------------+-----+



### States

In [9]:
# States from the bronze layer.
path = os.path.join(BRONZE, 'States')
states = spark.read.parquet(path)

print(f'states contains {states.count():,} records')

# Preview with statenum/statename aliased to queue/state.
states.selectExpr('statenum as queue', 'statename as state').limit(5).toPandas()

states contains 310 records


Unnamed: 0,queue,state
0,185,SYS - Initial
1,186,SYS - Done
2,187,Claim Image Errors
3,188,SYS - Initial
4,189,Assignment


### Requests

In [11]:
# Statement requests from the bronze layer.
path = os.path.join(BRONZE, 'StatementRequests')
requests = spark.read.parquet(path)

print(f'requests contains {requests.count():,} records')

# TODO: convert decimal to double
# Preview of the columns of interest (column order is kept as-is).
(
    requests
    .select(
        'JobNo', 'CustomerName', 'VendorNo',
        'WNC', 'StatementWNC', 'VendorGroupName',
        'Volume', 'VolumeTier', 'VolumeLast12',
        'RequestDate', 'ReferenceNumber', 'Status',
        'RequestMethod', 'RequestType', 'Contact',
        'RequesterFullName', 'LastActivityDate', 'LastStatementReceivedDate',
        'CallsheetNo', 'ObjectID', 'CallerStatus',
        'WebsiteVendor', 'WNCSpecialHandling', 'NeedLeadVendor',
    )
    .limit(5)
    .toPandas()
)

requests contains 540,462 records


Unnamed: 0,JobNo,CustomerName,VendorNo,WNC,StatementWNC,VendorGroupName,Volume,VolumeTier,VolumeLast12,RequestDate,ReferenceNumber,Status,RequestMethod,RequestType,Contact,RequesterFullName,LastActivityDate,LastStatementReceivedDate,CallsheetNo,ObjectID,CallerStatus,WebsiteVendor,WNCSpecialHandling,NeedLeadVendor
0,,,,,,,,,,NaT,,New,,Caller,,,NaT,NaT,,47597380,,,,
1,3427.0,New Hanover Regional Medical Center,16845.0,,,A M COFFEE DISTRIBUTORS LLC,0.0,B,0.0,2020-07-24,1285660.0,Superceded,MassEmail,,amcoffeedist@gmail.com,Jessica Rinehart,2020-07-24,NaT,C-338914,26681846,,,,
2,3427.0,New Hanover Regional Medical Center,16845.0,,,A M COFFEE DISTRIBUTORS LLC,0.0,B,0.0,2020-07-31,1285660.0,Superceded,MassEmail,Mass 1,amcoffeedist@gmail.com,Jessica Rinehart,2020-07-31,NaT,C-338914,26819776,,,,
3,3427.0,New Hanover Regional Medical Center,16845.0,,,A M COFFEE DISTRIBUTORS LLC,0.0,B,0.0,2021-01-19,1285660.0,Superceded,MassEmail,Caller,amcoffeedist@gmail.com,Jessica Rinehart,2020-07-31,NaT,C-338914,30690711,Sent Authorization Letter,,,
4,3427.0,New Hanover Regional Medical Center,16845.0,,,A M COFFEE DISTRIBUTORS LLC,0.0,B,0.0,2021-08-19,1285660.0,Superceded,MassEmail,Mass 1,amcoffeedist@gmail.com,Cindy Allen,2021-08-23,NaT,C-338914,36365526,Sent Authorization Letter,,,


In [20]:
# Earliest and latest LastActivityDate among requests that have a work item.
# many requests have no key to work items
# no date range
#   left_anti: 487,758
#   left_semi:  52,589
(
    requests
    .join(
        workItems,
        on=workItems['contentnum'] == requests['ObjectID'],
        how='left_semi',
    )
    .selectExpr('min(LastActivityDate) as minimum', 'max(LastActivityDate) as maximum')
    .show()
)

+-------------------+-------------------+
|            minimum|            maximum|
+-------------------+-------------------+
|2020-06-19 00:00:00|2022-06-16 00:00:00|
+-------------------+-------------------+



In [21]:
# Employees from the bronze layer, also registered for SQL as EMPLOYEES.
path = os.path.join(BRONZE, 'Employees')
employees = spark.read.parquet(path)
employees.createOrReplaceTempView('EMPLOYEES')

print(f'employees contains {employees.count():,} records')

# TODO: drop ManagerObjectID
employees.limit(5).toPandas()

employees contains 468 records


Unnamed: 0,ID,Email,FullName,Phone,PrimaryRoleName,PodName,ManagerName,ManagerObjectID
0,194728,aprins@spendmend.com,Andi Prins,616-257-6362,Audit Supervisor,Gold,Travis Wheeler,194747
1,194729,bvangoor@spendmend.com,Bob VanGoor,616-257-6306,Audit Supervisor,Red,Dan Hutchins,194732
2,194730,ckretowicz@spendmend.com,Colleen Kretowicz,616-257-6398,Audit Supervisor,Gold,Travis Wheeler,194747
3,194731,callen@spendmend.com,Cindy Allen,616-257-6377,WNC Auditor,,,33876443
4,194732,dhutchins@spendmend.com,Dan Hutchins,616-257-6317,Audit Manager,Red,Dan Hutchins,194739


In [184]:
# Requests enriched with the job's pod / manager / supervisor and with the
# employee record whose FullName equals the request's RequesterFullName.
(
    requests
    .join(
        jobs.select('JobNo', 'ManagerPodName', 'ManagerID', 'SupervisorID'),
        on='JobNo',
        how='inner',
    )
    .join(
        employees,
        on=employees['FullName'] == requests['RequesterFullName'],
        how='inner',
    )
    .limit(5)
    .toPandas()
)

Unnamed: 0,JobNo,CustomerName,VendorNo,WNC,StatementWNC,VendorGroupName,Volume,VolumeTier,VolumeLast12,RequestDate,ReferenceNumber,Status,RequestMethod,RequestType,Contact,RequesterFullName,LastActivityDate,LastStatementReceivedDate,CallsheetNo,ObjectID,CallerStatus,WebsiteVendor,WNCSpecialHandling,NeedLeadVendor,ManagerPodName,ManagerID,SupervisorID,ID,Email,FullName,Phone,PrimaryRoleName,PodName,ManagerName,ManagerObjectID
0,3427,New Hanover Regional Medical Center,16845,,,A M COFFEE DISTRIBUTORS LLC,0.0,B,0.0,2020-07-24,1285660,Superceded,MassEmail,,amcoffeedist@gmail.com,Jessica Rinehart,2020-07-24,NaT,C-338914,26681846,,,,,Gold,194747,194728,8398642,jrinehart@spendmend.com,Jessica Rinehart,(616) 257-6373,Statement Admin,,,194731
1,3427,New Hanover Regional Medical Center,16845,,,A M COFFEE DISTRIBUTORS LLC,0.0,B,0.0,2020-07-31,1285660,Superceded,MassEmail,Mass 1,amcoffeedist@gmail.com,Jessica Rinehart,2020-07-31,NaT,C-338914,26819776,,,,,Gold,194747,194728,8398642,jrinehart@spendmend.com,Jessica Rinehart,(616) 257-6373,Statement Admin,,,194731
2,3427,New Hanover Regional Medical Center,16845,,,A M COFFEE DISTRIBUTORS LLC,0.0,B,0.0,2021-01-19,1285660,Superceded,MassEmail,Caller,amcoffeedist@gmail.com,Jessica Rinehart,2020-07-31,NaT,C-338914,30690711,Sent Authorization Letter,,,,Gold,194747,194728,8398642,jrinehart@spendmend.com,Jessica Rinehart,(616) 257-6373,Statement Admin,,,194731
3,3427,New Hanover Regional Medical Center,16845,,,A M COFFEE DISTRIBUTORS LLC,0.0,B,0.0,2021-08-19,1285660,Superceded,MassEmail,Mass 1,amcoffeedist@gmail.com,Cindy Allen,2021-08-23,NaT,C-338914,36365526,Sent Authorization Letter,,,,Gold,194747,194728,194731,callen@spendmend.com,Cindy Allen,616-257-6377,WNC Auditor,,,33876443
4,3427,New Hanover Regional Medical Center,16845,,,A M COFFEE DISTRIBUTORS LLC,0.0,B,0.0,2021-09-03,1285660,No Receipt,MassEmail,Caller,amcoffeedist@gmail.com,Cindy Allen,2021-08-23,NaT,C-338914,36846273,Sent Authorization Letter,,,,Gold,194747,194728,194731,callen@spendmend.com,Cindy Allen,616-257-6377,WNC Auditor,,,33876443


### Activities

In [22]:
path = os.path.join(BRONZE, 'StatementRequestActivityRecords')

# Statement-request activity records from the bronze layer.
activities = spark.read.parquet(path)
# The label now names the table being counted (it previously said 'jobs').
print(f'activities contains {activities.count():,} records')
activities.limit(5).toPandas()

jobs contains 2,363,580 records


Unnamed: 0,ObjectID,CreatedDate,ReferenceNumber,CustomerVendorName,JobNumber,JobName,ContactType,ActivityUser,Notes,Outcome,ActivityType,VendorContactObjectID,StatementRequestObjectID
0,28785475,2020-11-02 11:41:33.170,1279250,ULINE,3412,Carolinas Shared Service - 3412,Call,ASIBLEY,This is a test - Updated,,,,25422522
1,28785637,2020-11-02 11:47:27.003,1279250,ULINE,3412,Carolinas Shared Service - 3412,Email,ASIBLEY,Test 2 - Updated,,,,25422522
2,28786094,2020-11-02 11:52:58.307,1279250,ULINE,3412,Carolinas Shared Service - 3412,Client Email,ASIBLEY,Test 3,,,,25422522
3,28838136,2020-11-04 17:47:58.533,1287154,BIOCOMPOSITES INC.,3435,The Christ Hospital - 3435,,bwilliams1,Statement Request creation skipped because thi...,,,,27781594
4,28838138,2020-11-04 17:48:02.617,1225130,BONA FIDE COMMERCIAL SERVICES,3177,UC Health - 3177,,jdagher,Statement Request creation skipped because thi...,,,,25860124


### Statements

In [23]:
# Statements from the bronze layer (column of interest: StatementDate).
path = os.path.join(BRONZE, 'Statements')
statements = spark.read.parquet(path)

print(f'statements contains {statements.count():,} records')
(
    statements
    .limit(5)
    .toPandas()
)

statements contains 490,410 records


Unnamed: 0,ReferenceNumber,StatementDate,ObjectID,EmailMessageID,CreatedDate,SRARObjectId
0,1224005,2019-08-14,20513258,00000000F643DA057DBD124B829A30CFCF400C1507002B...,2019-08-14 11:18:01.353,
1,1229524,2019-08-14,20513265,00000000F643DA057DBD124B829A30CFCF400C1507002B...,2019-08-14 11:18:26.637,
2,1247505,2019-08-14,20513268,00000000F643DA057DBD124B829A30CFCF400C1507002B...,2019-08-14 11:19:08.073,
3,1248087,2019-08-14,20513273,00000000F643DA057DBD124B829A30CFCF400C1507002B...,2019-08-14 11:19:43.393,
4,1224005,2019-08-14,20513254,00000000F643DA057DBD124B829A30CFCF400C1507002B...,2019-08-14 11:17:48.653,


### Notes

In [24]:
# Notes from the bronze layer. Columns of interest:
#   STID (= ReferenceNumber), STNDescription, STNAdded
path = os.path.join(BRONZE, 'Notes')
notes = spark.read.parquet(path)

print(f'notes contains {notes.count():,} records')
(
    notes
    .limit(5)
    .toPandas()
)

notes contains 7,800,558 records


Unnamed: 0,STNID,STID,STLID,STNDescription,STNAdded,STNAddedby,STNUpdated,STNUpdatedby,PTNoteID
0,48,1694,,Follow up date changed from 9/12/2002 to 09/05...,2002-09-23 11:48:34,Frederick Clingen,2002-09-23 11:48:34,,
1,50,1726,,TEst,2002-09-23 14:24:04,Frederick Clingen,2002-09-23 14:24:04,,
2,51,1726,,Status changed from Need Senior to Call to Sen...,2002-09-23 14:24:08,Frederick Clingen,2002-09-23 14:24:08,,
3,52,1726,,Status changed from Sent Authorization Letter ...,2002-09-23 14:24:24,Frederick Clingen,2002-09-23 14:24:24,,
4,53,1726,,Follow up date changed from 9/11/2002 to 09/06...,2002-09-23 14:39:16,Frederick Clingen,2002-09-23 14:39:16,,


In [51]:
# `desc` is imported here because no earlier live cell imports it; without it
# the orderBy below raises NameError on an in-order run.
from pyspark.sql.functions import desc, lower, regexp_extract

# Top words that directly precede "re-release" in note descriptions.
# Presumably sample note text seen while exploring (kept verbatim):
# Non Rolling audit statement un-re-released
# with out there superior visior permission
#
# NOTE: the filter is case-insensitive, but the extraction pattern is
# case-sensitive and requires whitespace before "re-release", so notes such as
# "un-re-released" produce an empty word.
(
    notes
    .where(lower('STNDescription').contains('re-release'))
    .withColumn('word', regexp_extract('STNDescription', r'([A-Za-z]+)\s+re-release', 1))
    .groupBy('word')
    .count()
    .orderBy(desc('count'))
    .show(5)
)

+---------+------+
|     word| count|
+---------+------+
|statement|366566|
|         |   846|
|    audit|    49|
|  Special|    19|
| manually|    17|
+---------+------+
only showing top 5 rows



### Projects

In [26]:
# Projects from the bronze layer. Columns of interest:
#   JobNo, ProjectType, Status
path = os.path.join(BRONZE, 'Projects')
projects = spark.read.parquet(path)

print(f'projects contains {projects.count():,} records')
(
    projects
    .limit(5)
    .toPandas()
)

projects contains 3,937 records


Unnamed: 0,ProjectNo,Name,JobNo,Status,PrimaryAuditorID,SecondaryAuditorID,StartDate,EndDate,ProjectType,VendorPotentialThreshold,WorkingItemThreshold,ImagingAccess,PercentComplete,AvailableInPortal,PortalFriendlyName,RollingOOSInterval,UsesModernWorkingItems,AgedOpenCreditDate,AgedOpenCreditMinimum,DebitsProject,KillOnClosed,ServiceType,ServiceLine
0,P-1311,3011 - Data,3011,Closed,,,2018-04-26,NaT,Data,1000.0,250.0,1,0,1,,,,NaT,,,,Recovery Audit,Profit Recovery
1,P-1313,3011 - Returns,3011,Closed,,,2018-04-26,NaT,Returns,1000.0,250.0,1,0,1,,,,NaT,,,,Recovery Audit,Profit Recovery
2,P-1314,3010 - Data,3010,Closed,,,2018-04-26,NaT,Data,1000.0,250.0,1,0,1,,,,NaT,,,,Recovery Audit,Profit Recovery
3,P-1315,3010 - Dupes,3010,Closed,,,2018-04-26,2018-12-06,Dupes,1000.0,250.0,1,0,1,,,,NaT,,,,Recovery Audit,Profit Recovery
4,P-1317,3008 - Data,3008,Closed,,,2018-04-26,2018-05-31,Data,1000.0,250.0,1,0,1,,,,NaT,,,,Recovery Audit,Profit Recovery


In [38]:
# Disabled exploration: (JobNo, ProjectType) pairs that occur more than once,
# most frequent first. Requires `desc` to be imported if re-enabled.
# (
#     projects
#     .groupBy('JobNo', 'ProjectType')
#     .count()
#     .where('count > 1')
#     .orderBy(desc('count'))
#     .show(truncate=False)
# )

In [37]:
# Disabled exploration: number of projects per ProjectType, most frequent
# first (up to 35 rows). Requires `desc` to be imported if re-enabled.
# (
#     projects
#     .groupBy('ProjectType')
#     .count()
#     .orderBy(desc('count'))
#     .show(35, truncate=False)
# )

## Silver

### Requests

In [19]:
%%time
from pyspark.sql.functions import col, lit, lower

column = when(lower('Contact').contains('@'), lower('Contact'))

path = os.path.join(SILVER, 'requests')


(
    spark
    .read
    .parquet(os.path.join(BRONZE, 'StatementRequests'))
    .withColumn('Contact', column)
    .withColumnRenamed('RequesterFullName', 'EMPLOYEE_NAME')
#     .join(
#         employees
#             .select('EMPLOYEE_ID', 'EMPLOYEE_NAME'),
#         on='EMPLOYEE_NAME',
#         how='inner'
#     )
    .selectExpr(
        'ObjectID as REQUEST_ID',
        # 'EMPLOYEE_ID',
        'cast(RequestDate as date) as REQUEST_DATE',
        'Status as REQUEST_STATUS',
        'RequestMethod as REQUEST_METHOD',
        'RequestType as REQUEST_TYPE',
        'cast(LastActivityDate as date) as LAST_ACTIVITY_DATE',
        'cast(LastStatementReceivedDate as date) as LAST_RECEIVED_DATE',
        'cast(WNC as boolean) as WILL_NOT_COMPLY'
    )
    .write
    .mode('overwrite')
    .parquet(path)
)


requests = spark.read.parquet(path)
requests.createOrReplaceTempView('requests')
# display(requests)
# requests.limit(5).toPandas()

CPU times: user 8.25 ms, sys: 2.97 ms, total: 11.2 ms
Wall time: 1.61 s


In [20]:
# Reload the silver requests table and register it for SQL.
path = os.path.join(SILVER, 'requests')
requests = spark.read.parquet(path)
requests.createOrReplaceTempView('requests')
# Count after the reload, so the printed figure describes the frame loaded
# above rather than whatever `requests` held before this cell ran.
print(f'requests contains {requests.count():,} records')
# display(requests)
requests.limit(5).toPandas()

requests contains 539,270 records


Unnamed: 0,REQUEST_ID,REQUEST_DATE,REQUEST_STATUS,REQUEST_METHOD,REQUEST_TYPE,LAST_ACTIVITY_DATE,LAST_RECEIVED_DATE,WILL_NOT_COMPLY
0,25422373,2020-04-30,Fully Received,MassEmail,,2020-04-30,2020-05-03,False
1,25422392,2020-04-30,Fully Received,MassEmail,,2020-04-30,,
2,25422433,2020-04-30,Fully Received,MassEmail,,2020-04-30,2020-05-03,
3,25422445,2020-04-30,Fully Received,MassEmail,,2020-04-30,2020-05-03,
4,25422454,2020-04-30,Partial Receipt,MassEmail,,2020-04-30,2020-05-03,


### Activities

In [61]:
from pyspark.sql import Window
from pyspark.sql.functions import col, date_trunc, desc, dense_rank, length, lower, when

# Notes that are excluded from ranking: two phrases, plus missing notes.
conditions = [
    col('Notes').contains(phrase)
    for phrase in (
        'contact information updated',
        'statement request in project tracker',
    )
] + [col('Notes').isNull()]

# True only for activities whose note matches none of the exclusions.
condition = ~(conditions[0] | conditions[1] | conditions[2])

# Dense rank per (condition, ReferenceNumber), newest CreatedDate first.
window = dense_rank().over(
    Window
    .partitionBy(condition, 'ReferenceNumber')
    .orderBy(desc('CreatedDate'))
)

# The rank is only exposed for rows where the condition holds; null otherwise.
case = when(condition, window)

activities.withColumn('ActivityRank', case).limit(5).toPandas()

Unnamed: 0,ObjectID,CreatedDate,ReferenceNumber,CustomerVendorName,JobNumber,JobName,ContactType,ActivityUser,Notes,Outcome,ActivityType,VendorContactObjectID,StatementRequestObjectID,ActivityRank
0,33665339,2021-05-13 09:15:45.773,1048708,,,,,Bill Wilhoit,,,,,,
1,31251151,2021-02-10 11:05:00.157,1048708,LOGIQUIPLLC,2775.0,BJC Healthcare - 2775,Email,Bill Wilhoit,,Sent Authorization Letter,,,31251147.0,
2,35198830,2021-07-14 08:07:20.880,1048711,,,,,Bill Wilhoit,,,,,,
3,32792121,2021-04-09 08:32:26.753,1048711,LSI SOLUTIONS INC,2775.0,BJC Healthcare - 2775,Email,Bill Wilhoit,,Sent Authorization Letter,,32792113.0,32792114.0,
4,32792115,2021-04-09 08:32:25.307,1048711,,,,,Bill Wilhoit,,,,,,


In [11]:
from pyspark.sql import Window
from pyspark.sql.functions import col, create_map, date_trunc, dense_rank, desc, lit, when

path = os.path.join(SILVER, 'activities')

# ContactType, kept only when it differs from the 'N/A' placeholder.
column = when(col('ContactType') != 'N/A', col('ContactType'))

# Raw activity type -> short code. create_map takes alternating key / value
# columns, so the pairs are flattened in order.
mapping = create_map(*[
    lit(text)
    for pair in (
        ('Called Vendor', 'CALL'),
        ('Note Only', 'NOTE'),
        ('Emailed Vendor', 'EMAIL'),
        ('Received Call / Email', 'RESPONSE'),
    )
    for text in pair
])

# Disabled for now: the employee lookup (EMPLOYEE_ID via EMPLOYEE_NAME) and the
# per-employee ranking that depended on it.
# window = Window.partitionBy('EMPLOYEE_ID', 'REFERENCE_ID').orderBy(desc('ACTIVITY_DATE'))
#     .join(
#         employees
#             .select('EMPLOYEE_ID', 'EMPLOYEE_NAME'),
#         on='EMPLOYEE_NAME',
#         # how='inner'
#         how='left'
#     )
#     .withColumn('EMPLOYEE_REFERENCE_RANK', row_number().over(window))

# Build the silver activities table from bronze and overwrite it on disk.
(
    spark.read.parquet(os.path.join(BRONZE, 'StatementRequestActivityRecords'))
    .withColumn('ActivityType', mapping[col('ActivityType')])
    .withColumn('ContactType', column)
    .withColumn('CreatedDate', date_trunc('second', 'CreatedDate'))
    .withColumnRenamed('ActivityUser', 'EMPLOYEE_NAME')
    .selectExpr(
        'ObjectID as ACTIVITY_ID',
        'ReferenceNumber as REFERENCE_ID',
        'VendorContactObjectID as CONTACT_ID',
        'StatementRequestObjectID as REQUEST_ID',
        'CreatedDate as ACTIVITY_DATE',
        'ActivityType as ACTIVITY',
        'JobNumber as JOB_NUMBER',
        'JobName as JOB_NAME',
        'CustomerVendorName as VENDOR_NAME',
        'cast(ReferenceNumber as string) as REFERENCE_NUMBER',
        'Outcome as OUTCOME',
    )
    .write
    .mode('overwrite')
    .parquet(path)
)

# From here on `activities` refers to the silver table, also registered for SQL.
activities = spark.read.parquet(path)
activities.createOrReplaceTempView('activities')
# display(activities)
# activities.limit(5).toPandas()

DataFrame[ACTIVITY_ID: int, REFERENCE_ID: int, CONTACT_ID: int, REQUEST_ID: int, ACTIVITY_DATE: timestamp, ACTIVITY: string, JOB_NUMBER: int, JOB_NAME: string, VENDOR_NAME: string, REFERENCE_NUMBER: string, OUTCOME: string]

In [16]:
# Reload the silver activities table, register it for SQL, then report its size.
path = os.path.join(SILVER, 'activities')
activities = spark.read.parquet(path)
activities.createOrReplaceTempView('activities')

print(f'activities contains {activities.count():,} records')

# display(activities)
(
    activities
    .limit(5)
    .toPandas()
)

activities contains 2,358,664 records


Unnamed: 0,ACTIVITY_ID,REFERENCE_ID,CONTACT_ID,REQUEST_ID,ACTIVITY_DATE,ACTIVITY,JOB_NUMBER,JOB_NAME,VENDOR_NAME,REFERENCE_NUMBER,OUTCOME
0,28785475,1279250,,25422522,2020-11-02 06:41:33,,3412,Carolinas Shared Service - 3412,ULINE,1279250,
1,28785637,1279250,,25422522,2020-11-02 06:47:27,,3412,Carolinas Shared Service - 3412,ULINE,1279250,
2,28786094,1279250,,25422522,2020-11-02 06:52:58,,3412,Carolinas Shared Service - 3412,ULINE,1279250,
3,28838136,1287154,,27781594,2020-11-04 12:47:58,,3435,The Christ Hospital - 3435,BIOCOMPOSITES INC.,1287154,
4,28838138,1225130,,25860124,2020-11-04 12:48:02,,3177,UC Health - 3177,BONA FIDE COMMERCIAL SERVICES,1225130,


### Statements

In [21]:
path = os.path.join(SILVER, 'statements')

# Build the silver statements table from bronze and overwrite it on disk:
#   - CreatedDate is truncated to whole seconds and becomes STATEMENT_DATE
#   - EmailMessageID is replaced by the absolute value of its xxhash64
# Disabled for now: restricting statements to references present in activities.
#     .join(
#         activities,
#         on='REFERENCE_ID',
#         how='left_semi'
#     )
(
    spark.read.parquet(os.path.join(BRONZE, 'Statements'))
    .withColumn('CreatedDate', date_trunc('second', 'CreatedDate'))
    .selectExpr(
        'ReferenceNumber as REFERENCE_ID',
        'abs(xxhash64(EmailMessageID)) as EMAIL_ID',
        'CreatedDate as STATEMENT_DATE',
    )
    .write
    .mode('overwrite')
    .parquet(path)
)

# From here on `statements` refers to the silver table, also registered for SQL.
statements = spark.read.parquet(path)
statements.createOrReplaceTempView('statements')
# display(statements)
# statements.limit(5).toPandas()

In [22]:
# Reload the silver statements table, report its size, and register it for SQL.
path = os.path.join(SILVER, 'statements')
statements = spark.read.parquet(path)

print(f'statements contains {statements.count():,} records')
statements.createOrReplaceTempView('statements')

# display(statements)
(
    statements
    .limit(5)
    .toPandas()
)

statements contains 490,410 records


Unnamed: 0,REFERENCE_ID,EMAIL_ID,STATEMENT_DATE
0,1224005,8479961916226615153,2019-08-14 07:18:01
1,1229524,3064791696795579927,2019-08-14 07:18:26
2,1247505,4632731259035230330,2019-08-14 07:19:08
3,1248087,5187333399925037547,2019-08-14 07:19:43
4,1224005,8479961916226615153,2019-08-14 07:17:48


### Life Cycle

In [30]:
# %%time
# The imports and the window specification are restored from the lines that
# were commented out here. Without them `row_number` is undefined, and `window`
# resolves to a ranking Column left over from the Activities cell instead of a
# WindowSpec. `row_number` replaces the `rank` named in the old import because
# it is the function the pipeline actually calls.
from pyspark.sql import Window
from pyspark.sql.functions import date_trunc, desc, row_number

# Newest transition first within each content item.
window = Window.partitionBy('contentnum').orderBy(desc('transdate'))

path = os.path.join(SILVER, 'processes')

# life cycle
# primary key (contentnum, lcnum)
# Steps: truncate transdate to whole seconds, keep the four key columns,
# drop exact duplicates, number the rows per contentnum, then attach each
# object's activestatus (inner join, so unmatched work items are dropped).
(
    workItems
    .withColumn('transdate', date_trunc('second', 'transdate'))
    .select('contentnum', 'lcnum', 'statenum', 'transdate')
    .dropDuplicates()
    .withColumn('_rank', row_number().over(window))
    .join(
        objects
        .selectExpr('objectid as contentnum', 'activestatus'),
        on='contentnum',
        how='inner'
    )
    .selectExpr(
        'contentnum as LIFE_CYCLE_ID',
        'lcnum as LIFE_CYCLE_CODE',
        'statenum as STATE_CODE',
        'activestatus as ACTIVE_STATUS_CODE',
        'transdate as TRANSACTION_DATE',
        '_rank as LIFE_CYCLE_RANK'
    )
    .write
    .mode('overwrite')
    .parquet(path)
)

processes = spark.read.parquet(path)
processes.limit(5).toPandas()

Unnamed: 0,LIFE_CYCLE_ID,LIFE_CYCLE_CODE,STATE_CODE,ACTIVE_STATUS_CODE,TRANSACTION_DATE,LIFE_CYCLE_RANK
0,194780,126,208,0,2018-08-06 09:40:25,1
1,194784,126,208,0,2016-07-26 00:29:57,1
2,291093,134,252,0,2021-05-08 01:35:53,1
3,315261,126,208,0,2016-07-21 18:29:16,1
4,404196,124,222,1,2016-07-19 14:49:20,1


### Jobs

In [32]:
%%time

path = os.path.join(SILVER, 'jobs')

(
    jobs
    .selectExpr(
        'JobNo as JOB_ID',
        'ManagerID as MANAGER_ID',
        'SupervisorID as SUPERVISOR_ID',
        'JobNo as JOB_NUMBER',
        'ManagerPodName as TEAM'
    )
    .write
    .mode('overwrite')
    .parquet(path)
)

jobs = spark.read.parquet(path)
jobs.limit(5).toPandas()

CPU times: user 19.8 ms, sys: 2.99 ms, total: 22.8 ms
Wall time: 482 ms


Unnamed: 0,JOB_ID,MANAGER_ID,SUPERVISOR_ID,JOB_NUMBER,TEAM
0,2600,,,2600,
1,3570,,,3570,
2,3178,,,3178,
3,3447,,,3447,
4,3467,,,3467,


### Notes

In [177]:
%%time
from pyspark.sql import Window
from pyspark.sql.functions import col, date_trunc, desc, dense_rank, length, lower, when

column = when(length('STNDescription') < 4096, col('STNDescription'))

condition = lower('STNDescription').contains('statement re-released')
window = dense_rank().over(Window.partitionBy(condition, 'STID').orderBy(desc('STNAdded')))
case = when(condition, window)


path = os.path.join(SILVER, 'descriptions')

(
    notes
    .join(
        requests,
        on=requests['ReferenceNumber'] == notes['STID'],
        how='left_semi'
    )
    .withColumn('_length', column)
    .withColumn('_row_number', case)
    .withColumn('STNAdded', date_trunc('second', 'STNAdded'))
    .selectExpr(
        'STID as REFERENCE_ID',
        'STNAdded as DESCRIPTION_DATE',
        '_length as STATEMENT_DESCRIPTION',
        '_row_number as RELEASE_RANK'
    )
    .write
    .mode('overwrite')
    .parquet(path)
)


descriptions = spark.read.parquet(path)
descriptions.limit(5).toPandas()

CPU times: user 26.3 ms, sys: 10.4 ms, total: 36.7 ms
Wall time: 13.4 s


Unnamed: 0,REFERENCE_ID,DESCRIPTION_DATE,STATEMENT_DESCRIPTION,RELEASE_RANK
0,1048708,2021-02-10,,
1,1048722,2021-01-22,,
2,1048722,2020-12-24,,
3,1048725,2021-02-16,,
4,1048732,2021-03-23,,


In [181]:
# Preview of the descriptions that received a release rank.
(
    descriptions
    .filter('RELEASE_RANK is not null')
    .limit(5)
    .toPandas()
)

Unnamed: 0,REFERENCE_ID,DESCRIPTION_DATE,STATEMENT_DESCRIPTION,RELEASE_RANK
0,1048712,2019-05-09 03:00:01,Rolling audit statement re-released on 05/09/2019,1
1,1048712,2017-07-14 03:00:01,Rolling audit statement re-released on 07/14/2017,2
2,1048712,2017-04-12 03:00:01,Rolling audit statement re-released on 04/12/2017,3
3,1048712,2016-12-21 03:00:01,Rolling audit statement re-released on 12/21/2016,4
4,1048712,2016-08-05 03:00:01,Rolling audit statement re-released on 08/05/2016,5


### Projects

In [None]:
from pyspark.sql import Window
from pyspark.sql.functions import col, create_map, dense_rank, desc, lit

# Raw status -> reported status. create_map takes alternating key / value
# columns, so the pairs are flattened in order.
mapping = create_map(*[
    lit(text)
    for pair in (
        ('Closed', 'Closed'),
        ('Open', 'Open'),
        ('Prep', 'Prepared'),
        ('Pull', 'Pull'),
        ('Review', 'Review'),
    )
    for text in pair
])


# Observations: Closed usually has an EndDate; Name is JobNo + ProjectType.
# Dense rank of each project within its (JobNo, ProjectType), by StartDate.
column = dense_rank().over(
    Window
    .partitionBy('JobNo', 'ProjectType')
    .orderBy('StartDate')
)

# Show only the projects that are not first for their job and project type.
(
    projects
    .withColumn('Status', mapping[col('Status')])
    .withColumn('_dense_rank', column)
    .selectExpr(
        'JobNo as JOB_ID',
        'ProjectType as PROJECT_TYPE',
        'Status as STATUS',
        'StartDate as START_DATE',
        'EndDate as END_DATE',
        '_dense_rank as PROJECT_RANK',
    )
    .filter('PROJECT_RANK > 1')
    .toPandas()
)

In [39]:
# Stop the Spark session created in the first cell.
spark.stop()