In [100]:
from pyspark.sql import SparkSession
from pandas import set_option

# Render every column when a pandas frame is displayed (the tables below are wide).
# The canonical option key is used rather than a pattern that merely matches it.
set_option('display.max_columns', None)

# Build (or reuse) the Spark session used by every cell in this notebook.
# `Builder.config(key, value)` is the method that sets a Spark option; the
# previous `.confit(...)` spelling does not exist and raised AttributeError.
spark = (
    SparkSession
    .builder
    .config('spark.driver.memory', '4g')
    .config('spark.executor.memory', '2g')
    .getOrCreate()
)

# Runtime SQL settings applied to the live session.
spark.conf.set('spark.sql.adaptive.enabled', True)
spark.conf.set('spark.sql.session.timeZone', 'UTC')

In [2]:
import os

# Root directory of the data set; the bronze and silver layers are its
# immediate sub-directories.
PATH = '/tmp/requests'
BRONZE, SILVER = (os.path.join(PATH, layer) for layer in ('bronze', 'silver'))

## Bronze

### Work Items

In [27]:
# Load the WorkItems table from the bronze layer.
# Cleaning ideas noted for later:
#   truncate(transdate)
#   to_date('ownedsince', 'lastupdated')
path = os.path.join(BRONZE, 'WorkItems')
workItems = spark.read.parquet(path)

print(f'workItems contains {workItems.count():,} records')

# Preview the first five rows as a pandas frame.
work_items_preview = workItems.limit(5)
work_items_preview.toPandas()

workItems contains 2,652,986 records


Unnamed: 0,lcnum,statenum,contentnum,wfcontenttype,transdate,priority,versionid,ownernum,ownedstatus,ownedsince,lastupdated,flags,contentclassnum
0,125,298,11190595,3,2018-02-11 07:07:08.553,0,0,0,0,1964-01-01,1964-01-01,0,1184
1,125,298,11190596,3,2018-02-11 07:07:08.630,0,0,0,0,1964-01-01,1964-01-01,0,1184
2,125,298,11190597,3,2018-02-11 07:07:08.710,0,0,0,0,1964-01-01,1964-01-01,0,1184
3,125,298,11190598,3,2018-02-11 07:07:08.787,0,0,0,0,1964-01-01,1964-01-01,0,1184
4,125,298,11190599,3,2018-02-11 07:07:08.880,0,0,0,0,1964-01-01,1964-01-01,0,1184


In [72]:
from pyspark.sql.functions import desc

# How many work-item rows exist per contentnum? Show the five largest groups.
per_content = workItems.groupBy('contentnum').count()
per_content.sort(desc('count')).show(5)

+----------+-----+
|contentnum|count|
+----------+-----+
|  38042788|    3|
|  48645558|    3|
|  43469386|    3|
|  43148707|    3|
|  48538073|    3|
+----------+-----+
only showing top 5 rows



In [124]:
# Every row for one contentnum, ordered by transdate.
# Observation: the columns priority ... flags hold the same value on each row.
# Optional extras that were tried:
#   .select('lcnum', 'statenum', 'contentnum', 'transdate')   before toPandas()
#   .to_csv('/tmp/workitemlc', index=False)                   after toPandas()
history = workItems.filter('contentnum = 40252908').sort('transdate')
history.toPandas()

Unnamed: 0,lcnum,statenum,contentnum,wfcontenttype,transdate,priority,versionid,ownernum,ownedstatus,ownedsince,lastupdated,flags,contentclassnum
0,160,446,40252908,3,2022-03-25 09:27:27.057,0,0,0,0,1964-01-01,1964-01-01,0,1225
1,152,419,40252908,3,2022-06-09 09:24:47.243,0,0,0,0,1964-01-01,1964-01-01,0,1225


```sql
SELECT lcnum, statenum, contentnum, transdate FROM workitemlc WHERE contentnum = 40252908 ORDER BY transdate
```

In [78]:
from pyspark.sql import functions as F

# Span, in seconds, between the first and last transdate of each contentnum
# (the timestamps are cast to long before subtracting); show the five largest.
# The Spark aggregates are referenced through the `F` namespace so that the
# Python built-ins `min` and `max` are not shadowed for the rest of the notebook.
(
    workItems
    .groupBy('contentnum')
    .agg(
        F.min('transdate').cast('long').alias('minimum'),
        F.max('transdate').cast('long').alias('maximum')
    )
    .selectExpr(
        'contentnum',
        'maximum - minimum as seconds'
    )
    .orderBy(F.desc('seconds'))
    .show(5)
)

+----------+---------+
|contentnum|  seconds|
+----------+---------+
|   1137442|172830170|
|   1094891|172829720|
|   1137484|172824362|
|   3417175|172816578|
|   1137469|147937637|
+----------+---------+
only showing top 5 rows



In [97]:
from pyspark.sql import Window
from pyspark.sql.functions import desc, rank

# Per contentnum, order rows from the newest transdate to the oldest;
# rank 1 is therefore the most recent row of each contentnum.
window = Window.partitionBy('contentnum').orderBy(desc('transdate'))

# Sample of lcnum 160 rows that are not the most recent row of their contentnum.
ranked_items = workItems.withColumn('_rank', rank().over(window))
(
    ranked_items
    .filter('lcnum = 160')
    .filter('_rank > 1')
    .limit(5)
    .toPandas()
)

Unnamed: 0,lcnum,statenum,contentnum,wfcontenttype,transdate,priority,versionid,ownernum,ownedstatus,ownedsince,lastupdated,flags,contentclassnum,_rank
0,160,446,40141049,3,2022-02-03 11:51:05.843,0,0,0,0,1964-01-01,1964-01-01,0,1225,2
1,160,446,40252908,3,2022-03-25 09:27:27.057,0,0,0,0,1964-01-01,1964-01-01,0,1225,2
2,160,446,42021849,3,2022-05-18 09:20:25.170,0,0,0,0,1964-01-01,1964-01-01,0,1225,2
3,160,446,46422327,3,2022-04-26 09:11:31.643,0,0,0,0,1964-01-01,1964-01-01,0,1225,2
4,160,446,47308726,3,2022-06-10 08:33:59.490,0,0,0,0,1964-01-01,1964-01-01,0,1225,2


In [91]:
# Distribution of the recency rank among lcnum 160 rows.
# Observation: 160 is always the last or second-to-last row of its contentnum.
lc160_ranked = (
    workItems
    .withColumn('_rank', rank().over(window))
    .filter('lcnum = 160')
)
lc160_ranked.groupBy('_rank').count().show()

+-----+-----+
|_rank|count|
+-----+-----+
|    1|51428|
|    2| 1193|
+-----+-----+



In [98]:
# `row_number` is imported here because no earlier cell brings it into scope;
# without this the cell fails with NameError on a fresh kernel.
from pyspark.sql.functions import row_number

# statenum breakdown of lcnum 160 rows per recency position
# (`window` comes from the cell that defines the rank window above).
# Observation: when the position is 2, statenum is almost always 446.
(
    workItems
    .withColumn('_rank', row_number().over(window))
    .where('lcnum = 160')
    .groupBy('_rank', 'statenum')
    .count()
    .orderBy(desc('_rank'), 'statenum')
    .show()
)

+-----+--------+-----+
|_rank|statenum|count|
+-----+--------+-----+
|    2|     444|    1|
|    2|     446| 1192|
|    1|     441|   33|
|    1|     442| 3694|
|    1|     444|33984|
|    1|     445| 4760|
|    1|     446|    2|
|    1|     447| 2933|
|    1|     448|   15|
|    1|     449| 2549|
|    1|     450|  150|
|    1|     451| 3067|
|    1|     464|  209|
|    1|     507|   32|
+-----+--------+-----+



In [102]:
%%time

(
    workItems
    .groupBy('contentnum', 'lcnum')
    .count()
    .where('count > 1')
    .show()
)

+----------+-----+-----+
|contentnum|lcnum|count|
+----------+-----+-----+
|  19782533|  126|    2|
|  38784550|  126|    2|
+----------+-----+-----+

CPU times: user 3.66 ms, sys: 2.12 ms, total: 5.78 ms
Wall time: 2.74 s


In [104]:
# Inspect every row of one contentnum that has a repeated (contentnum, lcnum) pair.
(
    workItems
    .filter('contentnum = 38784550')
    .toPandas()
)

Unnamed: 0,lcnum,statenum,contentnum,wfcontenttype,transdate,priority,versionid,ownernum,ownedstatus,ownedsince,lastupdated,flags,contentclassnum
0,126,211,38784550,3,2021-11-02 18:01:35.710,0,0,0,0,1964-01-01,1964-01-01,0,1138
1,126,211,38784550,3,2021-11-02 18:01:35.713,0,0,0,0,1964-01-01,1964-01-01,0,1138


In [109]:
from pyspark.sql.functions import date_trunc

# Effect of truncating transdate to whole seconds: number of distinct
# (contentnum, truncated transdate) pairs minus the total row count.
# A negative result means some rows share both values after truncation.
truncated_pairs = (
    workItems
    .withColumn('transdate', date_trunc('second', 'transdate'))
    .select('contentnum', 'transdate')
    .distinct()
)
truncated_pairs.count() - workItems.count()

-2

### Objects

In [28]:
# Load the Objects table from the bronze layer.
# Cleaning idea noted for later: truncate(createddate)
path = os.path.join(BRONZE, 'Objects')
objects = spark.read.parquet(path)

print(f'objects contains {objects.count():,} records')

# Preview the key and the status column (value of interest: activestatus = 0).
objects_preview = objects.select('objectid', 'activestatus').limit(5)
objects_preview.toPandas()

objects contains 47,827,882 records


Unnamed: 0,objectid,rmobjectname,parentobjectid,classid,rmcreatedby,createddate,writestatus,statusid,activestatus
0,193512,,0,1094,MANAGER,2016-02-29 14:11:27.440,0,0,1
1,193513,,0,1094,ASMITTER,2016-03-01 07:58:00.210,0,0,0
2,193514,,0,1093,ASMITTER,2016-03-01 08:01:14.907,0,0,0
3,193515,,0,1093,ASMITTER,2016-03-01 08:02:20.740,0,0,0
4,193516,,0,1093,ASMITTER,2016-03-01 08:03:01.927,0,0,0


In [45]:
# Row count per activestatus value.
status_counts = objects.groupBy('activestatus').count()
status_counts.toPandas()

Unnamed: 0,activestatus,count
0,1,2775623
1,2,4090960
2,0,40961299


In [125]:
%%time
# object only has one state
(
    objects
    .groupBy('objectid')
    .count()
    .where('count > 1')
    .count()
)

CPU times: user 8.18 ms, sys: 5 ms, total: 13.2 ms
Wall time: 45.2 s


0

In [126]:
# Count the requests whose objectid also exists in objects.
# Observation: roughly 1,000 request records are missing from the objects table.
# NOTE(review): `requests` is first assigned in the Requests section further
# down this notebook, so on a fresh kernel that cell must run before this one.
matched_requests = requests.join(objects, on='objectid', how='left_semi')
matched_requests.count()

539270

In [128]:
# requests.count()

# activestatus breakdown of the objects that are referenced by a request.
# NOTE(review): `requests` is first assigned in the Requests section further
# down this notebook, so on a fresh kernel that cell must run before this one.
requested_objects = objects.join(requests, on='objectid', how='left_semi')
requested_objects.groupBy('activestatus').count().show()

+------------+------+
|activestatus| count|
+------------+------+
|           0|539270|
+------------+------+



In [131]:
# activestatus breakdown of objects that have at least one lcnum 160 work item
# (work items are matched to objects via contentnum == objectid).
lc160_items = workItems.filter('lcnum = 160')
objects_with_lc160 = objects.join(
    lc160_items,
    on=workItems['contentnum'] == objects['objectid'],
    how='left_semi'
)
objects_with_lc160.groupBy('activestatus').count().show()

+------------+-----+
|activestatus|count|
+------------+-----+
|           1|   32|
|           0|52589|
+------------+-----+



In [132]:
# Number of requests that point at an object with activestatus = 1 which also
# has an lcnum 160 work item.
# NOTE(review): `requests` is first assigned in the Requests section further
# down this notebook, so on a fresh kernel that cell must run before this one.
status1_with_lc160 = (
    objects
    .filter('activestatus = 1')
    .join(
        workItems.filter('lcnum = 160'),
        on=workItems['contentnum'] == objects['objectid'],
        how='left_semi'
    )
)
requests.join(status1_with_lc160, on='objectid', how='inner').count()

0

### Jobs

In [189]:
# Load the Jobs table from the bronze layer.
# Columns of interest: JobNo, ManagerPodName, ManagerID, SupervisorID
path = os.path.join(BRONZE, 'Jobs')
jobs = spark.read.parquet(path)

print(f'jobs contains {jobs.count():,} records')

# Preview those columns for jobs whose ManagerPodName is not null.
job_columns = ['JobNo', 'ManagerPodName', 'ManagerID', 'SupervisorID']
(
    jobs
    .select(*job_columns)
    .filter('ManagerPodName is not null')
    .limit(5)
    .toPandas()
)

jobs contains 1,400 records


Unnamed: 0,JobNo,ManagerPodName,ManagerID,SupervisorID
0,2929,Blue,194740,194740
1,3067,Gold,194747,194730
2,2761,Blue,194740,194749
3,2916,Blue,194740,194749
4,2927,Blue,194740,194749


In [30]:
# Number of jobs per ManagerPodName value.
(
    jobs
    .groupBy('ManagerPodName')
    .count()
    .show()
)

+--------------+-----+
|ManagerPodName|count|
+--------------+-----+
|        Orange|   28|
|          null| 1059|
|        Purple|   18|
|          Blue|   59|
|          Gold|   76|
|         Black|   10|
|           Red|   97|
|          Grey|   53|
+--------------+-----+



### States

In [34]:
# Load the States table from the bronze layer.
path = os.path.join(BRONZE, 'States')
states = spark.read.parquet(path)

print(f'states contains {states.count():,} records')

# Preview statenum under the column name "queue".
queue_preview = states.select(states['statenum'].alias('queue')).limit(5)
queue_preview.toPandas()

states contains 310 records


Unnamed: 0,queue
0,185
1,186
2,187
3,188
4,189


### Requests

In [63]:
# Load the StatementRequests table from the bronze layer.
# The location is derived from BRONZE (the Silver section reads the same table
# from there) instead of a user-specific absolute path, so the notebook is not
# tied to one machine's home directory.
path = os.path.join(BRONZE, 'StatementRequests')

requests = spark.read.parquet(path)

print(f'requests contains {requests.count():,} records')

# TODO: convert decimal to double
# All columns are previewed; insert a .select(...) before .limit(5) to narrow
# the output to specific columns.
(
    requests
    .limit(5)
    .toPandas()
)

requests contains 540,462 records


Unnamed: 0,JobNo,CustomerName,VendorNo,WNC,StatementWNC,VendorGroupName,Volume,VolumeTier,VolumeLast12,RequestDate,ReferenceNumber,Status,RequestMethod,RequestType,Contact,RequesterFullName,LastActivityDate,LastStatementReceivedDate,CallsheetNo,ObjectID,CallerStatus,WebsiteVendor,WNCSpecialHandling,NeedLeadVendor
0,,,,,,,,,,NaT,,New,,Caller,,,NaT,NaT,,47597380,,,,
1,3427.0,New Hanover Regional Medical Center,16845.0,,,A M COFFEE DISTRIBUTORS LLC,0.0,B,0.0,2020-07-24,1285660.0,Superceded,MassEmail,,amcoffeedist@gmail.com,Jessica Rinehart,2020-07-24,NaT,C-338914,26681846,,,,
2,3427.0,New Hanover Regional Medical Center,16845.0,,,A M COFFEE DISTRIBUTORS LLC,0.0,B,0.0,2020-07-31,1285660.0,Superceded,MassEmail,Mass 1,amcoffeedist@gmail.com,Jessica Rinehart,2020-07-31,NaT,C-338914,26819776,,,,
3,3427.0,New Hanover Regional Medical Center,16845.0,,,A M COFFEE DISTRIBUTORS LLC,0.0,B,0.0,2021-01-19,1285660.0,Superceded,MassEmail,Caller,amcoffeedist@gmail.com,Jessica Rinehart,2020-07-31,NaT,C-338914,30690711,Sent Authorization Letter,,,
4,3427.0,New Hanover Regional Medical Center,16845.0,,,A M COFFEE DISTRIBUTORS LLC,0.0,B,0.0,2021-08-19,1285660.0,Superceded,MassEmail,Mass 1,amcoffeedist@gmail.com,Cindy Allen,2021-08-23,NaT,C-338914,36365526,Sent Authorization Letter,,,


In [49]:
# Number of requests that have at least one work item
# (matched via ObjectID == contentnum).
# Recorded counts -- anti: 487,758 / semi: 52,589
request_has_item = requests['ObjectID'] == workItems['contentnum']
requests.join(workItems, on=request_has_item, how='left_semi').count()

52589

In [66]:
# Number of objects that have no work item at all.
# (A `.where('status = 1')` filter on objects was tried and left disabled.)
objects_without_items = objects.join(
    workItems,
    on=objects['objectid'] == workItems['contentnum'],
    how='left_anti'
)
objects_without_items.count()

45195528

In [187]:
# Number of work items whose contentnum exists in objects.
# Observation: only four workItems records are not found in objects,
# which still leaves 2,652,982 items.
items_with_object = workItems.join(
    objects,
    on=objects['objectid'] == workItems['contentnum'],
    how='left_semi'
)
items_with_object.count()

2652982

In [185]:
# (
#     workItems
#     .groupBy('lcnum')
#     .count()
#     .show()
# )

In [157]:
# Load the Employees table from the bronze layer and register it as the
# temp view EMPLOYEES so it can be queried with Spark SQL.
path = os.path.join(BRONZE, 'Employees')
employees = spark.read.parquet(path)
employees.createOrReplaceTempView('EMPLOYEES')

print(f'employees contains {employees.count():,} records')

# Preview the first five rows with all columns.
employees_preview = employees.limit(5)
employees_preview.toPandas()

employees contains 468 records


Unnamed: 0,ID,Email,FullName,Phone,PrimaryRoleName,PodName,ManagerName,ManagerObjectID
0,194728,aprins@spendmend.com,Andi Prins,616-257-6362,Audit Supervisor,Gold,Travis Wheeler,194747
1,194729,bvangoor@spendmend.com,Bob VanGoor,616-257-6306,Audit Supervisor,Red,Dan Hutchins,194732
2,194730,ckretowicz@spendmend.com,Colleen Kretowicz,616-257-6398,Audit Supervisor,Gold,Travis Wheeler,194747
3,194731,callen@spendmend.com,Cindy Allen,616-257-6377,WNC Auditor,,,33876443
4,194732,dhutchins@spendmend.com,Dan Hutchins,616-257-6317,Audit Manager,Red,Dan Hutchins,194739


In [151]:
# employees.where('ManagerObjectID is null').limit(20).toPandas()
# jobs.where('ManagerID = SupervisorID').count() # 18
# jobs.where('ManagerID != SupervisorID').count() # 381; almost all different


# jobs.where('ManagerID is not null').count() # 412
# jobs.where('SupervisorID is not null').count() # 400

# jobs.where('ManagerID is not null and SupervisorID is not null').count() # 399

# employees.where('ManagerName is not null').count() # 95
# employees.where('ManagerObjectID is not null').count() # 349

349

In [168]:
# query = """
# WITH
# organization (ID, FullName, ManagerObjectID) AS (
# SELECT
#     ID,
#     FullName,
#     ManagerObjectID
# FROM
#     EMPLOYEES
# WHERE
#     ManagerObjectID IS NULL
# UNION ALL
# SELECT
#     EMPLOYEES.ID,
#     EMPLOYEES.FullName,
#     EMPLOYEES.ManagerObjectID
# FROM
#     EMPLOYEES
#         INNER JOIN
#             organization ON
#                 organization.ID = EMPLOYEES.ManagerObjectID
# )
# SELECT * FROM organization
# """

# (
#     spark
#     .sql(query)
#     .limit(5)
#     .toPandas()
# )

In [184]:
# Attach to each request its job's pod/manager/supervisor columns and the
# employee row whose FullName equals the request's RequesterFullName.
# Both joins are inner joins, so requests without a match are dropped.
job_managers = jobs.select('JobNo', 'ManagerPodName', 'ManagerID', 'SupervisorID')
requester_is_employee = requests['RequesterFullName'] == employees['FullName']
(
    requests
    .join(job_managers, on='JobNo', how='inner')
    .join(employees, on=requester_is_employee, how='inner')
    .limit(5)
    .toPandas()
)

Unnamed: 0,JobNo,CustomerName,VendorNo,WNC,StatementWNC,VendorGroupName,Volume,VolumeTier,VolumeLast12,RequestDate,ReferenceNumber,Status,RequestMethod,RequestType,Contact,RequesterFullName,LastActivityDate,LastStatementReceivedDate,CallsheetNo,ObjectID,CallerStatus,WebsiteVendor,WNCSpecialHandling,NeedLeadVendor,ManagerPodName,ManagerID,SupervisorID,ID,Email,FullName,Phone,PrimaryRoleName,PodName,ManagerName,ManagerObjectID
0,3427,New Hanover Regional Medical Center,16845,,,A M COFFEE DISTRIBUTORS LLC,0.0,B,0.0,2020-07-24,1285660,Superceded,MassEmail,,amcoffeedist@gmail.com,Jessica Rinehart,2020-07-24,NaT,C-338914,26681846,,,,,Gold,194747,194728,8398642,jrinehart@spendmend.com,Jessica Rinehart,(616) 257-6373,Statement Admin,,,194731
1,3427,New Hanover Regional Medical Center,16845,,,A M COFFEE DISTRIBUTORS LLC,0.0,B,0.0,2020-07-31,1285660,Superceded,MassEmail,Mass 1,amcoffeedist@gmail.com,Jessica Rinehart,2020-07-31,NaT,C-338914,26819776,,,,,Gold,194747,194728,8398642,jrinehart@spendmend.com,Jessica Rinehart,(616) 257-6373,Statement Admin,,,194731
2,3427,New Hanover Regional Medical Center,16845,,,A M COFFEE DISTRIBUTORS LLC,0.0,B,0.0,2021-01-19,1285660,Superceded,MassEmail,Caller,amcoffeedist@gmail.com,Jessica Rinehart,2020-07-31,NaT,C-338914,30690711,Sent Authorization Letter,,,,Gold,194747,194728,8398642,jrinehart@spendmend.com,Jessica Rinehart,(616) 257-6373,Statement Admin,,,194731
3,3427,New Hanover Regional Medical Center,16845,,,A M COFFEE DISTRIBUTORS LLC,0.0,B,0.0,2021-08-19,1285660,Superceded,MassEmail,Mass 1,amcoffeedist@gmail.com,Cindy Allen,2021-08-23,NaT,C-338914,36365526,Sent Authorization Letter,,,,Gold,194747,194728,194731,callen@spendmend.com,Cindy Allen,616-257-6377,WNC Auditor,,,33876443
4,3427,New Hanover Regional Medical Center,16845,,,A M COFFEE DISTRIBUTORS LLC,0.0,B,0.0,2021-09-03,1285660,No Receipt,MassEmail,Caller,amcoffeedist@gmail.com,Cindy Allen,2021-08-23,NaT,C-338914,36846273,Sent Authorization Letter,,,,Gold,194747,194728,194731,callen@spendmend.com,Cindy Allen,616-257-6377,WNC Auditor,,,33876443


### Activities

In [193]:
# Load the StatementRequestActivityRecords table from the bronze layer.
path = os.path.join(BRONZE, 'StatementRequestActivityRecords')

activities = spark.read.parquet(path)
# The message names the table that is actually counted (it previously said "jobs").
print(f'activities contains {activities.count():,} records')
activities.limit(5).toPandas()

jobs contains 2,358,664 records


Unnamed: 0,ObjectID,CreatedDate,ReferenceNumber,CustomerVendorName,JobNumber,JobName,ContactType,ActivityUser,Outcome,ActivityType,VendorContactObjectID,StatementRequestObjectID
0,28785475,2020-11-02 11:41:33.170,1279250,ULINE,3412,Carolinas Shared Service - 3412,Call,ASIBLEY,,,,25422522
1,28785637,2020-11-02 11:47:27.003,1279250,ULINE,3412,Carolinas Shared Service - 3412,Email,ASIBLEY,,,,25422522
2,28786094,2020-11-02 11:52:58.307,1279250,ULINE,3412,Carolinas Shared Service - 3412,Client Email,ASIBLEY,,,,25422522
3,28838136,2020-11-04 17:47:58.533,1287154,BIOCOMPOSITES INC.,3435,The Christ Hospital - 3435,,bwilliams1,,,,27781594
4,28838138,2020-11-04 17:48:02.617,1225130,BONA FIDE COMMERCIAL SERVICES,3177,UC Health - 3177,,jdagher,,,,25860124


### Statements

In [196]:
# Load the Statements table from the bronze layer.
# Column of interest: StatementDate
path = os.path.join(BRONZE, 'Statements')
statements = spark.read.parquet(path)

print(f'statements contains {statements.count():,} records')

# Preview the first five rows.
statements_preview = statements.limit(5)
statements_preview.toPandas()

statements contains 490,410 records


Unnamed: 0,ReferenceNumber,StatementDate,ObjectID,EmailMessageID,CreatedDate,SRARObjectId
0,1224005,2019-08-14,20513258,00000000F643DA057DBD124B829A30CFCF400C1507002B...,2019-08-14 11:18:01.353,
1,1229524,2019-08-14,20513265,00000000F643DA057DBD124B829A30CFCF400C1507002B...,2019-08-14 11:18:26.637,
2,1247505,2019-08-14,20513268,00000000F643DA057DBD124B829A30CFCF400C1507002B...,2019-08-14 11:19:08.073,
3,1248087,2019-08-14,20513273,00000000F643DA057DBD124B829A30CFCF400C1507002B...,2019-08-14 11:19:43.393,
4,1224005,2019-08-14,20513254,00000000F643DA057DBD124B829A30CFCF400C1507002B...,2019-08-14 11:17:48.653,


### Notes

In [195]:
# Load the Notes table from the bronze layer.
# Columns of interest:
#   STID = ReferenceNumber
#   STNDescription
#   STNAdded
path = os.path.join(BRONZE, 'Notes')
notes = spark.read.parquet(path)

print(f'notes contains {notes.count():,} records')

# Preview the first five rows.
notes_preview = notes.limit(5)
notes_preview.toPandas()

notes contains 7,800,558 records


Unnamed: 0,STNID,STID,STLID,STNDescription,STNAdded,STNAddedby,STNUpdated,STNUpdatedby,PTNoteID
0,48,1694,,Follow up date changed from 9/12/2002 to 09/05...,2002-09-23 11:48:34,Frederick Clingen,2002-09-23 11:48:34,,
1,50,1726,,TEst,2002-09-23 14:24:04,Frederick Clingen,2002-09-23 14:24:04,,
2,51,1726,,Status changed from Need Senior to Call to Sen...,2002-09-23 14:24:08,Frederick Clingen,2002-09-23 14:24:08,,
3,52,1726,,Status changed from Sent Authorization Letter ...,2002-09-23 14:24:24,Frederick Clingen,2002-09-23 14:24:24,,
4,53,1726,,Follow up date changed from 9/11/2002 to 09/06...,2002-09-23 14:39:16,Frederick Clingen,2002-09-23 14:39:16,,


In [203]:
from pyspark.sql.functions import lower

# with out there superior visior permission
# (
#     notes
#     .where(lower('STNDescription').contains('release'))
#     .select('STNDescription')
#     .where(~lower('STNDescription').contains('re-release'))
#     .show(truncate=False)
#     # .count()
# )

### Projects

In [204]:
# Load the Projects table from the bronze layer.
# Columns of interest: JobNo, ProjectType, Status
path = os.path.join(BRONZE, 'Projects')
projects = spark.read.parquet(path)

print(f'projects contains {projects.count():,} records')

# Preview the first five rows.
projects_preview = projects.limit(5)
projects_preview.toPandas()

projects contains 3,937 records


Unnamed: 0,ProjectNo,Name,JobNo,Status,PrimaryAuditorID,SecondaryAuditorID,StartDate,EndDate,ProjectType,VendorPotentialThreshold,WorkingItemThreshold,ImagingAccess,PercentComplete,AvailableInPortal,PortalFriendlyName,RollingOOSInterval,UsesModernWorkingItems,AgedOpenCreditDate,AgedOpenCreditMinimum,DebitsProject,KillOnClosed,ServiceType,ServiceLine
0,P-1311,3011 - Data,3011,Closed,,,2018-04-26,NaT,Data,1000.0,250.0,1,0,1,,,,NaT,,,,Recovery Audit,Profit Recovery
1,P-1313,3011 - Returns,3011,Closed,,,2018-04-26,NaT,Returns,1000.0,250.0,1,0,1,,,,NaT,,,,Recovery Audit,Profit Recovery
2,P-1314,3010 - Data,3010,Closed,,,2018-04-26,NaT,Data,1000.0,250.0,1,0,1,,,,NaT,,,,Recovery Audit,Profit Recovery
3,P-1315,3010 - Dupes,3010,Closed,,,2018-04-26,2018-12-06,Dupes,1000.0,250.0,1,0,1,,,,NaT,,,,Recovery Audit,Profit Recovery
4,P-1317,3008 - Data,3008,Closed,,,2018-04-26,2018-05-31,Data,1000.0,250.0,1,0,1,,,,NaT,,,,Recovery Audit,Profit Recovery


In [206]:
# (
#     projects
#     .groupBy('JobNo')
#     .count()
#     .orderBy(desc('count'))
#     .show()
# )

In [209]:
# Number of projects per Status, largest group first.
status_tally = projects.groupBy('Status').count()
status_tally.sort(desc('count')).show()

+------+-----+
|Status|count|
+------+-----+
|Closed| 2533|
|  Open| 1086|
|  Prep|  288|
|  Pull|   19|
|Review|   11|
+------+-----+



In [228]:
from pyspark.sql.functions import approx_count_distinct

# (JobNo, ProjectType) pairs that occur on more than one project row.
# Alternative groupings that were explored and left disabled:
#   .groupBy('JobNo', 'StartDate').agg(approx_count_distinct('Status').alias('count'))
#   .groupBy('JobNo').agg(approx_count_distinct('StartDate').alias('count'))
repeated_types = (
    projects
    .groupBy('JobNo', 'ProjectType')
    .count()
    .filter('count > 1')
)
repeated_types.sort(desc('count')).show(25, truncate=False)

+-----+-----------------------------+-----+
|JobNo|ProjectType                  |count|
+-----+-----------------------------+-----+
|2873 |GPO Pricing                  |2    |
|2879 |GPO Pricing                  |2    |
|3278 |Local Contracts              |2    |
|3427 |In Scope Credits Loaded      |2    |
|2939 |Spike                        |2    |
|3292 |Dupes                        |2    |
|3179 |Sales and Use Tax Review     |2    |
|2868 |Pricing Top 100              |2    |
|3010 |Multi Acct Recon             |2    |
|3415 |Missing Invoices             |2    |
|3406 |Local Contracts              |2    |
|2938 |Uncashed Checks              |2    |
|3175 |Returns                      |2    |
|2896 |Statement Phone and Account #|2    |
|2929 |OOS                          |2    |
+-----+-----------------------------+-----+



In [230]:
# All projects of JobNo 2896, ordered by StartDate and then EndDate.
# Observations: Closed projects usually have an EndDate, and Name is
# JobNo + ProjectType.
job_2896 = projects.filter('JobNo = 2896').sort('StartDate', 'EndDate')
job_2896.toPandas()

Unnamed: 0,ProjectNo,Name,JobNo,Status,PrimaryAuditorID,SecondaryAuditorID,StartDate,EndDate,ProjectType,VendorPotentialThreshold,WorkingItemThreshold,ImagingAccess,PercentComplete,AvailableInPortal,PortalFriendlyName,RollingOOSInterval,UsesModernWorkingItems,AgedOpenCreditDate,AgedOpenCreditMinimum,DebitsProject,KillOnClosed,ServiceType,ServiceLine
0,P-36,2896 - Data,2896,Closed,194734,,NaT,NaT,Data,,,,,1.0,Data,,,NaT,,,,Recovery Audit,Profit Recovery
1,P-43,2896 - Pricing Top 100,2896,Closed,194730,194730.0,NaT,NaT,Pricing Top 100,,,,,,,,,NaT,,,,Recovery Audit,Profit Recovery
2,P-42,2896 - Cash Discount,2896,Closed,194731,194731.0,2016-08-04,2016-10-14,Cash Discount,,,,,,,,,NaT,,,,Recovery Audit,Profit Recovery
3,P-40,2896 - Spike,2896,Closed,194731,194731.0,2016-08-04,2016-10-19,Spike,,,,,,,,,NaT,,,,Recovery Audit,Profit Recovery
4,P-41,2896 - GPO Pricing,2896,Closed,194730,194730.0,2016-08-04,2017-01-27,GPO Pricing,,,,,,,,,NaT,,,,Recovery Audit,Profit Recovery
5,P-37,2896 - Dupes,2896,Closed,194730,194730.0,2016-10-07,2018-02-23,Dupes,,,1.0,75.0,1.0,2896 - Dupes,,,NaT,,,,Recovery Audit,Profit Recovery
6,P-38,2896 - Returns,2896,Closed,194730,194730.0,2016-10-10,2017-01-24,Returns,,,,,,,,,NaT,,,,Recovery Audit,Profit Recovery
7,P-395,2896 - Statement Phone and Account #,2896,Closed,194728,194728.0,2016-10-31,2016-11-01,Statement Phone and Account #,,,,,,,,,NaT,,,,Recovery Audit,Profit Recovery
8,P-416,2896 - Statement Phone and Account #,2896,Closed,194728,1137359.0,2016-11-29,2017-01-23,Statement Phone and Account #,,,,,,,,,NaT,,,,Recovery Audit,Profit Recovery
9,P-39,2896 - AP vs PO,2896,Closed,194730,194730.0,2017-01-24,2017-03-19,AP vs PO,,,,,,,,,NaT,,,,Recovery Audit,Profit Recovery


## Silver

### Requests

In [19]:
%%time
from pyspark.sql.functions import col, lit, lower

column = when(lower('Contact').contains('@'), lower('Contact'))

path = os.path.join(SILVER, 'requests')


(
    spark
    .read
    .parquet(os.path.join(BRONZE, 'StatementRequests'))
    .withColumn('Contact', column)
    .withColumnRenamed('RequesterFullName', 'EMPLOYEE_NAME')
#     .join(
#         employees
#             .select('EMPLOYEE_ID', 'EMPLOYEE_NAME'),
#         on='EMPLOYEE_NAME',
#         how='inner'
#     )
    .selectExpr(
        'ObjectID as REQUEST_ID',
        # 'EMPLOYEE_ID',
        'cast(RequestDate as date) as REQUEST_DATE',
        'Status as REQUEST_STATUS',
        'RequestMethod as REQUEST_METHOD',
        'RequestType as REQUEST_TYPE',
        'cast(LastActivityDate as date) as LAST_ACTIVITY_DATE',
        'cast(LastStatementReceivedDate as date) as LAST_RECEIVED_DATE',
        'cast(WNC as boolean) as WILL_NOT_COMPLY'
    )
    .write
    .mode('overwrite')
    .parquet(path)
)


requests = spark.read.parquet(path)
requests.createOrReplaceTempView('requests')
# display(requests)
# requests.limit(5).toPandas()

CPU times: user 8.25 ms, sys: 2.97 ms, total: 11.2 ms
Wall time: 1.61 s


In [20]:
# Re-open the silver requests data set and register it as the temp view `requests`.
# The data is read BEFORE it is counted, so the printed number always describes
# the silver data at `path` rather than whatever `requests` held previously.
path = os.path.join(SILVER, 'requests')
requests = spark.read.parquet(path)
requests.createOrReplaceTempView('requests')
print(f'requests contains {requests.count():,} records')
# display(requests)
requests.limit(5).toPandas()

requests contains 539,270 records


Unnamed: 0,REQUEST_ID,REQUEST_DATE,REQUEST_STATUS,REQUEST_METHOD,REQUEST_TYPE,LAST_ACTIVITY_DATE,LAST_RECEIVED_DATE,WILL_NOT_COMPLY
0,25422373,2020-04-30,Fully Received,MassEmail,,2020-04-30,2020-05-03,False
1,25422392,2020-04-30,Fully Received,MassEmail,,2020-04-30,,
2,25422433,2020-04-30,Fully Received,MassEmail,,2020-04-30,2020-05-03,
3,25422445,2020-04-30,Fully Received,MassEmail,,2020-04-30,2020-05-03,
4,25422454,2020-04-30,Partial Receipt,MassEmail,,2020-04-30,2020-05-03,


### Activities

In [11]:
from pyspark.sql import Window
from pyspark.sql.functions import col, create_map, date_trunc, desc, lit, row_number, when

# Destination of the silver activities data set.
path = os.path.join(SILVER, 'activities')

# ContactType, with the literal 'N/A' turned into null (`when` without `otherwise`).
column = when(col('ContactType') != 'N/A', col('ContactType'))

# Short codes for the ActivityType labels listed here, built as a Spark map
# column from alternating key/value literals.
activity_codes = {
    'Called Vendor': 'CALL',
    'Note Only': 'NOTE',
    'Emailed Vendor': 'EMAIL',
    'Received Call / Email': 'RESPONSE',
}
mapping = create_map(
    *[lit(token) for pair in activity_codes.items() for token in pair]
)

# Disabled idea: rank activities per employee and reference, newest first.
# window = Window.partitionBy('EMPLOYEE_ID', 'REFERENCE_ID').orderBy(desc('ACTIVITY_DATE'))

bronze_activities = spark.read.parquet(
    os.path.join(BRONZE, 'StatementRequestActivityRecords')
)

silver_activities = (
    bronze_activities
    .withColumn('ActivityType', mapping[col('ActivityType')])
    # NOTE(review): ContactType and EMPLOYEE_NAME are prepared here but are not
    # part of the selectExpr below, so they do not reach the silver output.
    .withColumn('ContactType', column)
    .withColumn('CreatedDate', date_trunc('second', 'CreatedDate'))
    .withColumnRenamed('ActivityUser', 'EMPLOYEE_NAME')
    # Disabled idea: left join to employees on EMPLOYEE_NAME to obtain EMPLOYEE_ID
    # (employees.select('EMPLOYEE_ID', 'EMPLOYEE_NAME'); an inner join was also tried).
    .selectExpr(
        'ObjectID as ACTIVITY_ID',
        'ReferenceNumber as REFERENCE_ID',
        'VendorContactObjectID as CONTACT_ID',
        'StatementRequestObjectID as REQUEST_ID',
        # 'EMPLOYEE_ID',
        'CreatedDate as ACTIVITY_DATE',
        'ActivityType as ACTIVITY',
        'JobNumber as JOB_NUMBER',
        'JobName as JOB_NAME',
        'CustomerVendorName as VENDOR_NAME',
        'cast(ReferenceNumber as string) as REFERENCE_NUMBER',
        'Outcome as OUTCOME'
    )
    # .withColumn('EMPLOYEE_REFERENCE_RANK', row_number().over(window))
)

# Overwrite the silver data set with the transformed rows.
silver_activities.write.mode('overwrite').parquet(path)


# Re-open what was just written and register it as the temp view `activities`.
activities = spark.read.parquet(path)
activities.createOrReplaceTempView('activities')
# display(activities)
# activities.limit(5).toPandas()

DataFrame[ACTIVITY_ID: int, REFERENCE_ID: int, CONTACT_ID: int, REQUEST_ID: int, ACTIVITY_DATE: timestamp, ACTIVITY: string, JOB_NUMBER: int, JOB_NAME: string, VENDOR_NAME: string, REFERENCE_NUMBER: string, OUTCOME: string]

In [16]:
# Re-open the silver activities data set and register it as the temp view
# `activities`, then report its size.
path = os.path.join(SILVER, 'activities')
activities = spark.read.parquet(path)
activities.createOrReplaceTempView('activities')

print(f'activities contains {activities.count():,} records')

# display(activities)
activities_preview = activities.limit(5)
activities_preview.toPandas()

activities contains 2,358,664 records


Unnamed: 0,ACTIVITY_ID,REFERENCE_ID,CONTACT_ID,REQUEST_ID,ACTIVITY_DATE,ACTIVITY,JOB_NUMBER,JOB_NAME,VENDOR_NAME,REFERENCE_NUMBER,OUTCOME
0,28785475,1279250,,25422522,2020-11-02 06:41:33,,3412,Carolinas Shared Service - 3412,ULINE,1279250,
1,28785637,1279250,,25422522,2020-11-02 06:47:27,,3412,Carolinas Shared Service - 3412,ULINE,1279250,
2,28786094,1279250,,25422522,2020-11-02 06:52:58,,3412,Carolinas Shared Service - 3412,ULINE,1279250,
3,28838136,1287154,,27781594,2020-11-04 12:47:58,,3435,The Christ Hospital - 3435,BIOCOMPOSITES INC.,1287154,
4,28838138,1225130,,25860124,2020-11-04 12:48:02,,3177,UC Health - 3177,BONA FIDE COMMERCIAL SERVICES,1225130,


### Statements

In [21]:
# Destination of the silver statements data set.
path = os.path.join(SILVER, 'statements')

bronze_statements = spark.read.parquet(os.path.join(BRONZE, 'Statements'))

silver_statements = (
    bronze_statements
    # Drop sub-second precision from the creation timestamp.
    .withColumn('CreatedDate', date_trunc('second', 'CreatedDate'))
    .selectExpr(
        'ReferenceNumber as REFERENCE_ID',
        # Numeric surrogate for the e-mail message id: absolute value of its 64-bit hash.
        'abs(xxhash64(EmailMessageID)) as EMAIL_ID',
        # NOTE(review): STATEMENT_DATE is taken from CreatedDate, not from the
        # StatementDate column -- confirm this is intended.
        'CreatedDate as STATEMENT_DATE',
    )
    # Disabled idea: keep only statements whose REFERENCE_ID appears in
    # activities (left_semi join on 'REFERENCE_ID').
)

# Overwrite the silver data set with the transformed rows.
silver_statements.write.mode('overwrite').parquet(path)


# Re-open what was just written and register it as the temp view `statements`.
statements = spark.read.parquet(path)
statements.createOrReplaceTempView('statements')
# display(statements)
# statements.limit(5).toPandas()

In [22]:
# Re-open the silver statements data set, report its size and register it as
# the temp view `statements`.
path = os.path.join(SILVER, 'statements')
statements = spark.read.parquet(path)

print(f'statements contains {statements.count():,} records')

statements.createOrReplaceTempView('statements')
# display(statements)
statements_preview = statements.limit(5)
statements_preview.toPandas()

statements contains 490,410 records


Unnamed: 0,REFERENCE_ID,EMAIL_ID,STATEMENT_DATE
0,1224005,8479961916226615153,2019-08-14 07:18:01
1,1229524,3064791696795579927,2019-08-14 07:18:26
2,1247505,4632731259035230330,2019-08-14 07:19:08
3,1248087,5187333399925037547,2019-08-14 07:19:43
4,1224005,8479961916226615153,2019-08-14 07:17:48


In [None]:
# Stop the Spark session created at the top of the notebook; cells that use
# `spark` cannot be run again until the session is re-created.
spark.stop()