In [0]:
from pyspark.sql import DataFrame
from pyspark.sql.readwriter import DataFrameReader
from pyspark.sql.functions import col, lit, trim, when
from typing import Dict, List

def databaseReader(readFormat: str, readOptions: Dict[str, str]) -> DataFrameReader:
    """Return a DataFrameReader configured for ``readFormat`` with ``readOptions`` applied.

    Uses the notebook-global ``spark`` session. Nothing is loaded here; the
    caller still has to supply the table/query option and call ``.load()``.

    Args:
        readFormat: data source format name passed to ``.format``.
        readOptions: option name/value pairs passed to ``.options``.
    """
    formattedReader = spark.read.format(readFormat)
    return formattedReader.options(**readOptions)
    

def truncateStrings(dataFrame: DataFrame) -> DataFrame:
    """Trim every string column and turn blank values into NULL.

    Despite the name, values are not shortened: each column whose dtype is
    ``'string'`` is trimmed, and a value that is empty after trimming is
    replaced by a NULL string. Columns of any other type pass through
    unchanged. Column order and names are preserved.

    The result is built with one ``select`` rather than one ``withColumn``
    per string column, so the query plan gains a single projection no matter
    how many string columns the frame has.

    Args:
        dataFrame: frame to clean.

    Returns:
        A new DataFrame with the same columns, string columns cleaned.
    """
    def quoted(name: str) -> str:
        # Back-tick quoting makes names containing dots or spaces resolve to
        # the top-level column instead of being parsed as a nested field.
        return '`' + name.replace('`', '``') + '`'

    projection = []
    for colName, dtype in dataFrame.dtypes:
        column = col(quoted(colName))
        if dtype == 'string':
            trimmed = trim(column)
            column = when(trimmed == '', lit(None).cast('string')).otherwise(trimmed)
        projection.append(column.alias(colName))

    return dataFrame.select(*projection)
  

# def decimalToDouble(dataFrame: DataFrame) -> DataFrame:
#     for colName, dtype in dataFrame.dtypes:
#         if 'decimal' in dtype:
#             dataFrame = dataFrame.withColumn(colName, col(colName).cast('double'))
    
#     return dataFrame
  
  
# def concatColumnNames(dataFrame: DataFrame, sep: str='') -> DataFrame:
#     for colName in dataFrame.columns:
#         dataFrame = dataFrame.withColumnRenamed(colName, colName.replace(' ', sep))
    
#     return dataFrame


# from pyspark.sql import DataFrame
# from pyspark.sql.functions import min, max
# from typing import Dict

# # sc.defaultParallelism
# def partitionColumn(dataFrame: DataFrame, column: str) -> Dict[str, str]:
#     bounds = (
#         dataFrame
#         .select(
#             min(column).cast('string'),
#             max(column).cast('string')
#         )
#         .na
#         .drop(how='any')
#         .first()
#     )
    
#     if bounds:
#         return {
#             'partitionColumn': column,
#             'lowerBound': bounds[0],
#             'upperBound': bounds[1],            
#         }
#     else:
#         return dict()

### Bronze

In [0]:
import os

# Root folder for this pipeline's parquet output, with one sub-folder per layer.
PATH = '/tmp/statements'
BRONZE = os.path.join(PATH, 'bronze')
SILVER = os.path.join(PATH, 'silver')

# JDBC settings for the source database; user and password are read from the
# 'silo-key-vault-scope' secret scope rather than written in the notebook.
options = {
    'url': 'jdbc:sqlserver://',
    'server': '10.0.2.16',
    'database': 'OnBase',
    'user': dbutils.secrets.get('silo-key-vault-scope', 'databricks-base-user'),
    'password': dbutils.secrets.get('silo-key-vault-scope', 'databricks-base-password')
}

# Shared reader reused by every bronze extract below; each extract only sets 'dbtable'.
reader = databaseReader('com.microsoft.sqlserver.jdbc.spark', options)

In [0]:
from pyspark.sql.functions import date_trunc

# Bronze extract of hsi.rm_DVEmployees.
# The SELECT is wrapped in parentheses and aliased because it is passed as the
# 'dbtable' option. Columns behind `--` are SQL comments, i.e. not extracted.
query = """
(SELECT
    ID,
    -- OnBaseUserName,
    -- BSAPUserName,
    Email,
    -- FirstName,
    -- LastName,
    FullName,
    -- Title,
    Phone,
    PrimaryRoleName,
    -- PodId,
    -- [Named Client],
    -- [Named WF/WV],
    -- Active,
    PodName,
    ManagerName
    -- [Employee ID],
    -- [Hire Date],
    -- HRManagerFullName,
    -- [HR - Manager Object ID],
    -- [HR - Manager OnBase User Name],
    -- [HR - Manager Last Name],
    -- [HR - Manager First Name],
    -- BSAPSID,
    -- CreatedBy,
    -- CreatedDate
FROM
    hsi.rm_DVEmployees) as query
"""


table = (
    reader
    .option('dbtable', query)
    .load()
)

# Trim string columns; values that are blank after trimming become NULL.
table = truncateStrings(table)

path = os.path.join(BRONZE, 'Employees')

# Replace any previous extract at this path.
(
    table
    .write
    .mode('overwrite')
    .parquet(path)
)

# Read back what was just written to preview the stored result.
display(spark.read.parquet(path))

ID,Email,FullName,Phone,PrimaryRoleName,PodName,ManagerName
194728,aprins@spendmend.com,Andi Prins,616-257-6362,Audit Supervisor,Gold,Travis Wheeler
194729,bvangoor@spendmend.com,Bob VanGoor,616-257-6306,Audit Supervisor,Red,Dan Hutchins
194730,ckretowicz@spendmend.com,Colleen Kretowicz,616-257-6398,Audit Supervisor,Gold,Travis Wheeler
194731,callen@spendmend.com,Cindy Allen,616-257-6377,WNC Auditor,,
194732,dhutchins@spendmend.com,Dan Hutchins,616-257-6317,Audit Manager,Red,Dan Hutchins
194733,dottenwess@spendmend.com,David Ottenwess,616-257-6314,Audit Supervisor,Grey,Eric Florance
194734,dlindquist@spendmend.com,Dirk Lindquist,616-257-2004,Scrub,,
194735,dkuester@spendmend.com,Dorthy Kuester,616-257-6405,Project Audit,Red,Dan Hutchins
194736,eflorance@spendmend.com,Eric Florance,616-257-6412,Audit Manager,Grey,Eric Florance
4675815,mgalla@bsihealthcare.com,Mychal Galla,,Project Audit,,


In [0]:
# Bronze extract of hsi.rm_DVStatements.
# Bracketed source names containing spaces are aliased to space-free names.
# Columns behind `--` are SQL comments, i.e. not extracted.
query = """
(SELECT
    [Reference Number] as ReferenceNumber,
    -- JobNo,
    -- JobTier,
    [Statement Date] as StatementDate,
    -- [Document Handle],
    ObjectID,
    -- CustVendorObjID,
    -- CustVendGroupObjID,
    -- CustVendGroupName,
    -- CustomerName,
    -- CID,
    -- CustVendName,
    -- CustVendNo,
    -- Volume,
    -- [Accounts Identified],
    -- Recon,
    EmailMessageID,
    CreatedDate,
    SRARObjectId
    -- ZeroBalance
FROM
    hsi.rm_DVStatements) as query
"""

table = (
    reader
    .option('dbtable', query)
    .load()
)

# Trim string columns; values that are blank after trimming become NULL.
table = truncateStrings(table)

path = os.path.join(BRONZE, 'Statements')

# Replace any previous extract at this path.
(
    table
    .write
    .mode('overwrite')
    .parquet(path)
)

# Read back what was just written to preview the stored result.
display(spark.read.parquet(path))

ReferenceNumber,StatementDate,ObjectID,EmailMessageID,CreatedDate,SRARObjectId
1224005,2019-08-14T00:00:00.000+0000,20513258,00000000F643DA057DBD124B829A30CFCF400C1507002B871AC579DA7041B55C292E8DC8BBBA000000CBE8AA0000AC80609CDFFC214CBE54978FCA484FD6000231DA71920000,2019-08-14T11:18:01.353+0000,
1229524,2019-08-14T00:00:00.000+0000,20513265,00000000F643DA057DBD124B829A30CFCF400C1507002B871AC579DA7041B55C292E8DC8BBBA000000CBE8AA0000AC80609CDFFC214CBE54978FCA484FD6000231DC72A50000,2019-08-14T11:18:26.637+0000,
1247505,2019-08-14T00:00:00.000+0000,20513268,00000000F643DA057DBD124B829A30CFCF400C1507002B871AC579DA7041B55C292E8DC8BBBA000000CBE8AA0000AC80609CDFFC214CBE54978FCA484FD6000231DC72790000,2019-08-14T11:19:08.073+0000,
1248087,2019-08-14T00:00:00.000+0000,20513273,00000000F643DA057DBD124B829A30CFCF400C1507002B871AC579DA7041B55C292E8DC8BBBA000000CBE8AA0000AC80609CDFFC214CBE54978FCA484FD6000231DC72780000,2019-08-14T11:19:43.393+0000,
1224005,2019-08-14T00:00:00.000+0000,20513254,00000000F643DA057DBD124B829A30CFCF400C1507002B871AC579DA7041B55C292E8DC8BBBA000000CBE8AA0000AC80609CDFFC214CBE54978FCA484FD6000231DA71920000,2019-08-14T11:17:48.653+0000,
1223925,2019-08-14T00:00:00.000+0000,20513249,00000000F643DA057DBD124B829A30CFCF400C1507002B871AC579DA7041B55C292E8DC8BBBA000000CBE8AA0000AC80609CDFFC214CBE54978FCA484FD6000231DC72AB0000,2019-08-14T11:17:15.840+0000,
1223925,2019-08-14T00:00:00.000+0000,20513238,00000000F643DA057DBD124B829A30CFCF400C1507002B871AC579DA7041B55C292E8DC8BBBA000000CBE8AA0000AC80609CDFFC214CBE54978FCA484FD6000231DC72AB0000,2019-08-14T11:16:58.657+0000,
1223925,2019-08-14T00:00:00.000+0000,20513233,00000000F643DA057DBD124B829A30CFCF400C1507002B871AC579DA7041B55C292E8DC8BBBA000000CBE8AA0000AC80609CDFFC214CBE54978FCA484FD6000231DC72AB0000,2019-08-14T11:16:44.880+0000,
1223925,2019-08-14T00:00:00.000+0000,20513228,00000000F643DA057DBD124B829A30CFCF400C1507002B871AC579DA7041B55C292E8DC8BBBA000000CBE8AA0000AC80609CDFFC214CBE54978FCA484FD6000231DC72AB0000,2019-08-14T11:16:26.217+0000,
1223925,2019-08-14T00:00:00.000+0000,20513227,00000000F643DA057DBD124B829A30CFCF400C1507002B871AC579DA7041B55C292E8DC8BBBA000000CBE8AA0000AC80609CDFFC214CBE54978FCA484FD6000231DC72AB0000,2019-08-14T11:16:12.893+0000,


In [0]:
# Bronze extract of hsi.rm_DVStatementRequests.
# Columns behind `--` are SQL comments, i.e. not extracted.
query = """
(SELECT
    -- JobNo,
    -- JobTier,
    -- CustomerName,
    -- CustVendorID,
    -- VendorNo,
    -- CustVendorGroupID,
    WNC,
    -- StatementWNC,
    -- VendorGroupName,
    -- Volume,
    -- VolumeTier,
    -- VolumeLast12,
    RequestDate,
    -- ReferenceNumber,
    Status,
    RequestMethod,
    RequestType,
    Contact,
    RequesterFullName,
    -- RequestText,
    LastActivityDate,
    LastStatementReceivedDate,
    -- CreatedDate,
    -- CallsheetNo,
    ObjectID
    -- CallerStatus,
    -- ReconStatus,
    -- CurrentAssigneeID,
    -- CurrentAssigneeName,
    -- EnteredReconDate,
    -- LastReconQueueName,
    -- LastReconQueueEntryDate,
    -- AccountsReceived,
    -- AccountsRequested,
    -- MessageID,
    -- VendorContactObjectID,
    -- WebsiteVendor,
    -- WNCSpecialHandling,
    -- NeedLeadVendor,
    -- VendorGroupPrimaryAccountType
FROM
    hsi.rm_DVStatementRequests) as query
"""

table = (
    reader
    .option('dbtable', query)
    .load()
)

# Trim string columns; values that are blank after trimming become NULL.
table = truncateStrings(table)

path = os.path.join(BRONZE, 'StatementRequests')

# Replace any previous extract at this path.
(
    table
    .write
    .mode('overwrite')
    .parquet(path)
)

# Read back what was just written to preview the stored result.
display(spark.read.parquet(path))

WNC,RequestDate,Status,RequestMethod,RequestType,Contact,RequesterFullName,LastActivityDate,LastStatementReceivedDate,ObjectID
0.0,2020-05-01T00:00:00.000+0000,Fully Received,MassEmail,,ar@system1us.com,Jessica Rinehart,2020-05-01T00:00:00.000+0000,2020-05-04T00:00:00.000+0000,25422373
,2020-05-01T00:00:00.000+0000,Fully Received,MassEmail,,courtney.groden@omadahealth.com,Jessica Rinehart,2020-05-01T00:00:00.000+0000,,25422392
,2020-05-01T00:00:00.000+0000,Fully Received,MassEmail,,ar@adreima.com,Jessica Rinehart,2020-05-01T00:00:00.000+0000,2020-05-04T00:00:00.000+0000,25422433
,2020-05-01T00:00:00.000+0000,Fully Received,MassEmail,,rebecca@dashcourier.com,Jessica Rinehart,2020-05-01T00:00:00.000+0000,2020-05-04T00:00:00.000+0000,25422445
,2020-05-01T00:00:00.000+0000,Partial Receipt,MassEmail,,CBaker@cbccts.org,Jessica Rinehart,2020-05-01T00:00:00.000+0000,2020-05-04T00:00:00.000+0000,25422454
0.0,2020-05-01T00:00:00.000+0000,Fully Received,MassEmail,,lgilmore@mcomc.com,Jessica Rinehart,2020-05-01T00:00:00.000+0000,2020-05-01T00:00:00.000+0000,25422500
0.0,2020-05-01T00:00:00.000+0000,Fully Received,MassEmail,,Accounts.Receivable@ULINE.COM,Jessica Rinehart,2020-11-02T00:00:00.000+0000,,25422522
0.0,2020-05-01T00:00:00.000+0000,Fully Received,MassEmail,,accountsreceivable@heartflow.com,Jessica Rinehart,2020-05-01T00:00:00.000+0000,,25422526
,2020-05-01T00:00:00.000+0000,Fully Received,MassEmail,,US.AR@BRAINLAB.COM,Jessica Rinehart,2020-05-01T00:00:00.000+0000,,25422532
0.0,2020-05-01T00:00:00.000+0000,Fully Received,MassEmail,,ar@biofiredx.com,Jessica Rinehart,2020-05-01T00:00:00.000+0000,,25422539


In [0]:
# Bronze extract of hsi.rm_DVVendorContacts.
# [Full Name] is aliased to FullName; columns behind `--` are SQL comments,
# i.e. not extracted.
query = """
(SELECT
    -- CustVendorGroupNo,
    -- CustVendorNo,
    -- ContactType,
    -- TypePreferredContact,
    -- [Last Name],
    -- [First Name],
    [Full Name] as FullName,
    Email,
    Phone,
    -- Fax,
    -- Title,
    -- Note,
    -- ExternalID,
    -- BSAP VCID,
    -- CID,
    CustomerName,
    -- CVObjectID,
    -- CustObjectID,
    ObjectID
    -- CreatedDate,
    -- CreatedBy,
    -- BSAPVendorObjectID
FROM
    hsi.rm_DVVendorContacts) as query
"""

table = (
    reader
    .option('dbtable', query)
    .load()
)

# Trim string columns; values that are blank after trimming become NULL.
table = truncateStrings(table)

path = os.path.join(BRONZE, 'VendorContacts')

# Replace any previous extract at this path.
(
    table
    .write
    .mode('overwrite')
    .parquet(path)
)

# Read back what was just written to preview the stored result.
display(spark.read.parquet(path))

FullName,Email,Phone,CustomerName,ObjectID
Rita Morris,rita.morris@navigant.com,7049044725,New Hanover Regional Medical Center,556858
,nci_apinvoices@navigant.com,,New Hanover Regional Medical Center,559542
,ar@pharmedium.com,8005237749,New Hanover Regional Medical Center,610390
Lee Ohlson,lohlson@pharmedium.com,8474572340,New Hanover Regional Medical Center,611761
Colleen Beeghly,Cbeeghly@fffenterprises.com,(800) 843-7477 x1280,New Hanover Regional Medical Center,953252
,uscare@avaya.com,8003287833,New Hanover Regional Medical Center,1016790
,info@pyapc.com,(800) 270-9629,New Hanover Regional Medical Center,1016792
,askcustomerservice@ironmountain.com,(800) 934-3453,New Hanover Regional Medical Center,1016794
,invoice@mmodal.com,(615) 261-1542,New Hanover Regional Medical Center,1016800
,info@rightpointmedia.com,(910) 264-6263,New Hanover Regional Medical Center,1016802


In [0]:
# Bronze extract of hsi.rm_DVStatementRequestActivityRecords.
# Columns behind `--` are SQL comments, i.e. not extracted.
query = """
(SELECT
    ObjectID,
    CreatedDate,
    ReferenceNumber,
    CustomerVendorName,
    JobNumber,
    JobName,
    ContactType,
    ActivityUser,
    -- ActivityDate,
    -- FollowUpDate,
    -- STNID,
    -- Notes,
    Outcome,
    ActivityType,
    -- CustVendorObjectID,
    VendorContactObjectID,
    StatementRequestObjectID
    -- CreatedBy,
    -- MessageID
FROM
    hsi.rm_DVStatementRequestActivityRecords) as query
"""

table = (
    reader
    .option('dbtable', query)
    .load()
)

# Trim string columns; values that are blank after trimming become NULL.
table = truncateStrings(table)

path = os.path.join(BRONZE, 'StatementRequestActivityRecords')

# Replace any previous extract at this path.
(
    table
    .write
    .mode('overwrite')
    .parquet(path)
)

# Read back what was just written to preview the stored result.
display(spark.read.parquet(path))

ObjectID,CreatedDate,ReferenceNumber,CustomerVendorName,JobNumber,JobName,ContactType,ActivityUser,Outcome,ActivityType,VendorContactObjectID,StatementRequestObjectID
28785475,2020-11-02T11:41:33.170+0000,1279250,ULINE,3412,Carolinas Shared Service - 3412,Call,ASIBLEY,,,,25422522
28785637,2020-11-02T11:47:27.003+0000,1279250,ULINE,3412,Carolinas Shared Service - 3412,Email,ASIBLEY,,,,25422522
28786094,2020-11-02T11:52:58.307+0000,1279250,ULINE,3412,Carolinas Shared Service - 3412,Client Email,ASIBLEY,,,,25422522
28838136,2020-11-04T17:47:58.533+0000,1287154,BIOCOMPOSITES INC.,3435,The Christ Hospital - 3435,,bwilliams1,,,,27781594
28838138,2020-11-04T17:48:02.617+0000,1225130,BONA FIDE COMMERCIAL SERVICES,3177,UC Health - 3177,,jdagher,,,,25860124
28838140,2020-11-04T17:48:03.550+0000,1278415,THERACOM LLC,3408,University Hospitals Health System - 3408,,knorman,,,,25561324
28838142,2020-11-04T17:48:04.307+0000,1255665,CMX CORPORATION,3339,Adventist Health - 3339,,mpetroelje,,,,26527142
28838144,2020-11-04T17:48:05.100+0000,1239124,VIRTUAL MEDICAL STAFF LLC,3285,Community Health Systems - 3285,,ykahssai,,,,26489835
28838146,2020-11-04T17:48:06.007+0000,1287350,TEKSYSTEMS,3435,The Christ Hospital - 3435,,bwilliams1,,,,27781943
28838148,2020-11-04T17:48:06.863+0000,1273483,SCALES INDUSTRIAL TECHNOLOGIES,3398,Health Quest - 3398,,cbrooks,,,,27434567


In [0]:
# Bronze extract of dbo.cc_STStatementEmailDocs.
# Bracketed source names containing spaces are aliased to space-free names.
# Columns behind `--` are SQL comments, i.e. not extracted.
query = """
(SELECT
    -- [Document Handle],
    [Date Created] as DateCreated,
    [MAIL Date Time] as MAILDateTime,
    -- [MAIL From Address],
    -- [MAIL To Address],
    -- [MAIL Cc Address],
    -- [MAIL Subject],
    [MAIL MessageID] as MAILMessageID,
    [MAIL Attachment Count] as MAILAttachmentCount
    -- [S - Ref #],
    -- [S - Customer Name],
    -- [S - Job #],
    -- [S - Vendor Name],
    -- [S - Created By User],
    -- [S - Skip AutoReceive],
    -- [S - Recon],
    -- [S - Large Credits],
    -- [Ingestion Source],
    -- [S - Statement Exists In Batch]
FROM
    dbo.cc_STStatementEmailDocs) as query
"""

table = (
    reader
    .option('dbtable', query)
    .load()
)

# Trim string columns; values that are blank after trimming become NULL.
table = truncateStrings(table)

path = os.path.join(BRONZE, 'StatementEmailDocs')

# Replace any previous extract at this path.
(
    table
    .write
    .mode('overwrite')
    .parquet(path)
)

# Read back what was just written to preview the stored result.
display(spark.read.parquet(path))

DateCreated,MAILDateTime,MAILMessageID,MAILAttachmentCount
2019-06-12T09:22:11.947+0000,2019-06-07T19:14:29.000+0000,00000000F643DA057DBD124B829A30CFCF400C1507002B871AC579DA7041B55C292E8DC8BBBA000000CBE8AA0000AC80609CDFFC214CBE54978FCA484FD6000206DA776C0000,11.0
2019-06-12T09:28:05.423+0000,2019-06-07T19:14:17.000+0000,00000000F643DA057DBD124B829A30CFCF400C1507002B871AC579DA7041B55C292E8DC8BBBA000000CBE8AA0000AC80609CDFFC214CBE54978FCA484FD6000206DA776B0000,7.0
2019-06-12T09:40:03.420+0000,2019-06-07T19:12:18.000+0000,00000000F643DA057DBD124B829A30CFCF400C1507002B871AC579DA7041B55C292E8DC8BBBA000000CBE8AA0000AC80609CDFFC214CBE54978FCA484FD600020A89DCAB0000,2.0
2019-06-12T09:42:47.153+0000,2019-06-07T19:15:58.000+0000,00000000F643DA057DBD124B829A30CFCF400C1507002B871AC579DA7041B55C292E8DC8BBBA000000CBE8AA0000AC80609CDFFC214CBE54978FCA484FD6000206DA776D0000,2.0
2019-06-12T09:49:49.783+0000,2019-06-07T19:15:37.000+0000,00000000F643DA057DBD124B829A30CFCF400C1507002B871AC579DA7041B55C292E8DC8BBBA000000CBE8AA0000AC80609CDFFC214CBE54978FCA484FD600020A89DCBC0000,2.0
2019-06-12T10:34:52.297+0000,2019-06-07T19:17:51.000+0000,00000000F643DA057DBD124B829A30CFCF400C1507002B871AC579DA7041B55C292E8DC8BBBA000000CBE8AA0000AC80609CDFFC214CBE54978FCA484FD6000206DA77700000,2.0
2019-06-12T10:35:06.010+0000,2019-06-07T19:24:07.000+0000,00000000F643DA057DBD124B829A30CFCF400C1507002B871AC579DA7041B55C292E8DC8BBBA000000CBE8AA0000AC80609CDFFC214CBE54978FCA484FD6000206DA77740000,3.0
2019-06-12T10:49:44.383+0000,2019-06-07T19:28:29.000+0000,00000000F643DA057DBD124B829A30CFCF400C1507002B871AC579DA7041B55C292E8DC8BBBA000000CBE8AA0000AC80609CDFFC214CBE54978FCA484FD6000206DA77780000,3.0
2019-06-12T10:50:02.900+0000,2019-06-07T19:42:59.000+0000,00000000F643DA057DBD124B829A30CFCF400C1507002B871AC579DA7041B55C292E8DC8BBBA000000CBE8AA0000AC80609CDFFC214CBE54978FCA484FD6000206DA77870000,1.0
2019-06-12T10:51:06.190+0000,2019-06-07T19:46:08.000+0000,00000000F643DA057DBD124B829A30CFCF400C1507002B871AC579DA7041B55C292E8DC8BBBA000000CBE8AA0000AC80609CDFFC214CBE54978FCA484FD6000206DA778A0000,7.0


### Silver

In [0]:
from pyspark.sql import Column
from pyspark.sql.functions import col, udf, when
from typing import Optional

import phonenumbers


# returns phone number in format (987) 654-3210
@udf(returnType='string')
def parsePhoneNumber(number: str, region: str='US') -> Optional[str]:
    """Parse a raw phone string and re-render it in national format.

    Args:
        number: raw phone text; may be None when the source cell is NULL.
        region: default region handed to ``phonenumbers.parse`` for numbers
            written without a country code.

    Returns:
        The number formatted with ``PhoneNumberFormat.NATIONAL``, or None
        when the input is None or cannot be parsed/formatted.
    """
    # NULL cells reach the UDF as None; there is nothing to parse.
    if number is None:
        return None

    try:
        phoneNumber = phonenumbers.parse(number, region)
        return phonenumbers.format_number(phoneNumber, phonenumbers.PhoneNumberFormat.NATIONAL)
    except Exception:
        # Best effort: an unparseable value becomes NULL instead of failing
        # the Spark job. Catching Exception (not a bare except) lets
        # KeyboardInterrupt and SystemExit propagate.
        return None


# retains value when value matches pattern
def matchPattern(colName: str, pattern: str) -> Column:
    """Build a column expression that keeps ``colName`` only where it matches ``pattern``.

    The ``when`` has no ``otherwise`` branch, so rows whose value does not
    satisfy ``rlike(pattern)`` evaluate to NULL.
    """
    return when(col(colName).rlike(pattern), col(colName))


# limit characters for writing to database
def varchar(dataFrame: DataFrame, width: int=1024) -> Optional[str]:
    """Build a column-type list declaring every string column as VARCHAR (width).

    Args:
        dataFrame: frame whose ``dtypes`` are inspected.
        width: character width placed in each VARCHAR declaration.

    Returns:
        A comma-separated string such as ``'A VARCHAR (128), B VARCHAR (128)'``
        covering the string columns in frame order, or None when the frame
        has no string columns.
    """
    declarations = []
    for name, dtype in dataFrame.dtypes:
        if dtype == 'string':
            declarations.append(f'{name} VARCHAR ({width})')

    if not declarations:
        return None
    return ', '.join(declarations)

#### Employees

In [0]:
# Silver employees: rename the bronze columns to the warehouse names and
# lower-case the email address. Phone is not carried forward.
path = os.path.join(SILVER, 'employees')

(
    spark
    .read
    .parquet(os.path.join(BRONZE, 'Employees'))
    .selectExpr(
        'ID as EMPLOYEE_ID',
        'FullName as EMPLOYEE_NAME',
        'PrimaryRoleName as ROLE',
        'lower(Email) as EMAIL',
        'ManagerName as MANAGER_NAME',
        'PodName as TEAM'
    )
    .write
    .mode('overwrite')
    .parquet(path)
)

# Reload from parquet and register a temp view; later cells join against
# both the `employees` DataFrame and the 'employees' view.
employees = spark.read.parquet(path)
employees.createOrReplaceTempView('employees')
display(employees)

EMPLOYEE_ID,EMPLOYEE_NAME,ROLE,EMAIL,MANAGER_NAME,TEAM
194728,Andi Prins,Audit Supervisor,aprins@spendmend.com,Travis Wheeler,Gold
194729,Bob VanGoor,Audit Supervisor,bvangoor@spendmend.com,Dan Hutchins,Red
194730,Colleen Kretowicz,Audit Supervisor,ckretowicz@spendmend.com,Travis Wheeler,Gold
194731,Cindy Allen,WNC Auditor,callen@spendmend.com,,
194732,Dan Hutchins,Audit Manager,dhutchins@spendmend.com,Dan Hutchins,Red
194733,David Ottenwess,Audit Supervisor,dottenwess@spendmend.com,Eric Florance,Grey
194734,Dirk Lindquist,Scrub,dlindquist@spendmend.com,,
194735,Dorthy Kuester,Project Audit,dkuester@spendmend.com,Dan Hutchins,Red
194736,Eric Florance,Audit Manager,eflorance@spendmend.com,Eric Florance,Grey
4675815,Mychal Galla,Project Audit,mgalla@bsihealthcare.com,,


#### Contacts

In [0]:
from pyspark.sql.functions import lower, regexp_replace

# Shape a phone number must have after formatting to be kept.
pattern = r'\(\d+\) \d+-\d+' # (987) 654-3210

column = when(lower('Email').contains('@'), lower('Email')) # email must have @

path = os.path.join(SILVER, 'contacts')

# Silver contacts: clean email and phone, then rename to the warehouse names.
# Phone clean-up runs in order:
#   1. strip leading non-digit characters,
#   2. cut everything from the first whitespace run that is followed by a
#      non-digit character,
#   3. parse and re-format with parsePhoneNumber (NULL when unparseable),
#   4. keep the result only where it matches `pattern`.
# The step-2 regex is a raw string so `\s` reaches the regex engine as written
# instead of relying on Python's deprecated handling of unknown escapes.
(
    spark
    .read
    .parquet(os.path.join(BRONZE, 'VendorContacts'))
    .withColumn('Email', column)
    .withColumn('Phone', regexp_replace('Phone', '^([^0-9]+)', ''))
    .withColumn('Phone', regexp_replace('Phone', r'\s+(?=[^0-9])(.*)$', ''))
    .withColumn('Phone', parsePhoneNumber('Phone'))
    .withColumn('Phone', matchPattern('Phone', pattern))
    .selectExpr(
        'ObjectID as CONTACT_ID',
        'CustomerName as CUSTOMER_NAME',
        'FullName as CONTACT_NAME',
        'Phone as PHONE',
        'Email as EMAIL'
    )
    .write
    .mode('overwrite')
    .parquet(path)
)

# Reload from parquet and register the 'contacts' temp view used by the bridge queries.
contacts = spark.read.parquet(path)
contacts.createOrReplaceTempView('contacts')
display(contacts)

CONTACT_ID,CUSTOMER_NAME,CONTACT_NAME,PHONE,EMAIL
556858,New Hanover Regional Medical Center,Rita Morris,(704) 904-4725,rita.morris@navigant.com
559542,New Hanover Regional Medical Center,,,nci_apinvoices@navigant.com
610390,New Hanover Regional Medical Center,,(800) 523-7749,ar@pharmedium.com
611761,New Hanover Regional Medical Center,Lee Ohlson,(847) 457-2340,lohlson@pharmedium.com
953252,New Hanover Regional Medical Center,Colleen Beeghly,(800) 843-7477,cbeeghly@fffenterprises.com
1016790,New Hanover Regional Medical Center,,(800) 328-7833,uscare@avaya.com
1016792,New Hanover Regional Medical Center,,(800) 270-9629,info@pyapc.com
1016794,New Hanover Regional Medical Center,,(800) 934-3453,askcustomerservice@ironmountain.com
1016800,New Hanover Regional Medical Center,,(615) 261-1542,invoice@mmodal.com
1016802,New Hanover Regional Medical Center,,(910) 264-6263,info@rightpointmedia.com


#### Activities

In [0]:
from pyspark.sql.functions import create_map, date_trunc

path = os.path.join(SILVER, 'activities')

# ContactType with the literal 'N/A' turned into NULL.
column = when(col('ContactType') != 'N/A', col('ContactType'))

# Activity type -> direction flag: True for 'Called Vendor', False for 'Received Call / Email'.
mapping = create_map(
    lit('Called Vendor'), lit(True),
    lit('Received Call / Email'), lit(False)
)

# Silver activities: keep only the two mapped activity types, truncate the
# timestamp to whole seconds, and resolve the acting user to an EMPLOYEE_ID.
# The inner join drops activities whose ActivityUser equals no EMPLOYEE_NAME.
# NOTE(review): ContactType is cleaned here but is not part of the selectExpr
# below, so it is not written to the silver table - confirm that is intended.
(
    spark
    .read
    .parquet(os.path.join(BRONZE, 'StatementRequestActivityRecords'))
    .where(col('ActivityType').isin('Called Vendor', 'Received Call / Email'))
    .withColumn('ActivityType', mapping[col('ActivityType')])
    .withColumn('ContactType', column)
    .withColumn('CreatedDate', date_trunc('second', 'CreatedDate'))
    .withColumnRenamed('ActivityUser', 'EMPLOYEE_NAME')
    .join(
        employees
            .select('EMPLOYEE_ID', 'EMPLOYEE_NAME'),
        on='EMPLOYEE_NAME',
        how='inner'
    )
    .selectExpr(
        'ObjectID as ACTIVITY_ID',
        'ReferenceNumber as REFERENCE_ID',
        'VendorContactObjectID as CONTACT_ID',
        'StatementRequestObjectID as REQUEST_ID',
        'EMPLOYEE_ID',
        'CreatedDate as ACTIVITY_DATE',
        'ActivityType as IS_OUTGOING',
        'JobNumber as JOB_NUMBER',
        'JobName as JOB_NAME',
        'CustomerVendorName as VENDOR_NAME',
        'cast(ReferenceNumber as string) as REFERENCE_NUMBER',
        'Outcome as OUTCOME'
    )
    .write
    .mode('overwrite')
    .parquet(path)
)


# Reload from parquet and register the 'activities' temp view used by later cells.
activities = spark.read.parquet(path)
activities.createOrReplaceTempView('activities')
display(activities)

ACTIVITY_ID,REFERENCE_ID,CONTACT_ID,REQUEST_ID,EMPLOYEE_ID,ACTIVITY_DATE,IS_OUTGOING,JOB_NUMBER,JOB_NAME,VENDOR_NAME,REFERENCE_NUMBER,OUTCOME
30157033,1257066,,,22196321,2020-12-22T04:35:11.000+0000,False,,,,1257066,Needs Supervisor
30611509,1155663,29556597.0,29267808.0,12190448,2021-01-15T11:07:06.000+0000,True,2936.0,Providence - 2936,ALASKA SURGICAL SERVICES LLC,1155663,Will Email or Fax
30615531,1273791,,28883490.0,8406307,2021-01-15T14:39:42.000+0000,True,3354.0,Advocate Aurora Health - 3354,TELECOURIER COMMUNICATIONS COR,1273791,Will Email or Fax
30630015,1273806,,29286619.0,8406307,2021-01-18T10:01:00.000+0000,False,3354.0,Advocate Aurora Health - 3354,RR DONNELLEY & SONS COMPANY,1273806,Left Voicemail
30630143,1274694,,29286620.0,8406307,2021-01-18T10:11:20.000+0000,True,3354.0,Advocate Aurora Health - 3354,SECURE EXCHANGE SOLUTIONS INC,1274694,Left Voicemail
30630160,1274694,,29286620.0,8406307,2021-01-18T10:14:40.000+0000,False,3354.0,Advocate Aurora Health - 3354,SECURE EXCHANGE SOLUTIONS INC,1274694,Left Voicemail
30630187,1273953,,29286637.0,8406307,2021-01-18T10:20:24.000+0000,True,3354.0,Advocate Aurora Health - 3354,TRACT MANAGER INC,1273953,Left Voicemail
30630327,1273788,,29286629.0,8406307,2021-01-18T10:29:06.000+0000,False,3354.0,Advocate Aurora Health - 3354,STAFF CARE INC,1273788,Will Email or Fax
30630436,1274228,,29286616.0,8406307,2021-01-18T10:34:25.000+0000,True,3354.0,Advocate Aurora Health - 3354,REPUBLIC SERVICES INC,1274228,Left Voicemail
30630595,1274621,,29286625.0,8406307,2021-01-18T10:39:33.000+0000,True,3354.0,Advocate Aurora Health - 3354,SIVANTOS INC,1274621,Left Voicemail


#### Calls

In [0]:
# duration kept only when positive; zero or negative durations become NULL.
column = when(col('duration') > 0, col('duration'))

# Register the raw call log as the temp view 'calls' for the two SQL cells below.
# NOTE(review): bronze/AccountCallLogs is not written anywhere in the visible
# part of this notebook - presumably another job produces it; confirm it
# exists before running this cell.
(
    spark
    .read
    .parquet(os.path.join(BRONZE, 'AccountCallLogs'))
    .withColumn('duration', column)
    .createOrReplaceTempView('calls')
)

In [0]:
# Outbound calls: rows of 'calls' with direction 'outbound' and a non-NULL
# callee number, matched to employees on caller_name = EMPLOYEE_NAME.
# CALL_ID is derived from the source id as abs(xxhash64(id)); PHONE is the callee's number.
query = """
SELECT
    abs(xxhash64(id)) as CALL_ID,
    EMPLOYEE_ID,
    date_time as CALL_DATE,
    true as IS_OUTGOING,
    call_type as CALL_TYPE,
    result as CALL_RESULT,
    duration as CALL_DURATION,
    callee_number as PHONE
FROM
    calls
        INNER JOIN
            employees ON
                calls.caller_name = employees.EMPLOYEE_NAME
                AND direction = 'outbound'
                AND callee_number IS NOT NULL
"""

outbound = spark.sql(query)

In [0]:
# Inbound calls: rows of 'calls' with direction 'inbound' and a non-NULL
# caller number, matched to employees on callee_name = EMPLOYEE_NAME.
# Same output columns as the outbound query so the two can be unioned;
# PHONE is the caller's number.
query = """
SELECT
    abs(xxhash64(id)) as CALL_ID,
    EMPLOYEE_ID,
    date_time as CALL_DATE,
    false as IS_OUTGOING,
    call_type as CALL_TYPE,
    result as CALL_RESULT,
    duration as CALL_DURATION,
    caller_number as PHONE
FROM
    calls
        INNER JOIN
            employees ON
                calls.callee_name = employees.EMPLOYEE_NAME
                AND direction = 'inbound'
                AND caller_number IS NOT NULL
"""

inbound = spark.sql(query)

In [0]:
path = os.path.join(SILVER, 'calls')

# Stack outbound and inbound calls (union is by column position) and persist them.
(
    outbound
    .union(inbound)
    .write
    .mode('overwrite')
    .parquet(path)
)

# From here on the 'calls' temp view points at the silver table, replacing the
# raw call-log view registered earlier under the same name.
calls = spark.read.parquet(path)
calls.createOrReplaceTempView('calls')
display(calls)

CALL_ID,EMPLOYEE_ID,CALL_DATE,IS_OUTGOING,CALL_TYPE,CALL_RESULT,CALL_DURATION,PHONE
5020654500538580841,32302927,2022-05-24T18:11:21.000+0000,True,pstn,Call connected,62.0,(530) 582-6656
2556925878001901551,32302927,2022-05-24T18:05:46.000+0000,True,pstn,Call connected,327.0,(530) 587-6011
6344296643072098619,32302927,2022-05-24T17:59:55.000+0000,True,pstn,Call connected,225.0,(909) 580-1000
6379539968163155573,33107513,2022-05-24T17:47:19.000+0000,True,pstn,Call connected,39.0,(800) 835-3832
2338195452411416248,33107513,2022-05-24T17:44:54.000+0000,True,pstn,Call connected,69.0,(800) 934-4477
2087893152334336565,32302927,2022-05-24T17:44:10.000+0000,True,pstn,Call connected,114.0,(208) 226-3200
6600915434226016173,33107513,2022-05-24T17:42:33.000+0000,True,pstn,Call connected,87.0,(888) 320-2220
1476935122761580958,32302927,2022-05-24T17:42:33.000+0000,True,pstn,Call connected,80.0,(208) 226-3200
603516813618359597,32302927,2022-05-24T17:39:49.000+0000,True,pstn,Call connected,151.0,(208) 983-1700
5415465277715417609,32302927,2022-05-24T17:37:57.000+0000,True,pstn,Call connected,79.0,(208) 983-8516


#### Activity Phone Call Bridge
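- connects activities to calls: each call is linked to the nearest-in-time activity logged by the same employee on the same day, in the same direction, for a contact whose phone number matches, and only when the two are less than 10 minutes apart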

In [0]:
# Candidate (activity, call) pairs for outgoing traffic.
# A pair qualifies when the employee is the same, both fall on the same
# calendar date, both are flagged outgoing, and the first two space-separated
# tokens of the contact's phone equal those of the call's phone.
# _seconds is the absolute gap between the two timestamps in seconds.
# Note: this rebinds `outbound`, which earlier held the outbound calls frame.
query = """
SELECT
    activities.ACTIVITY_ID,
    calls.CALL_ID,
    abs(
        cast(ACTIVITY_DATE as long)
        - cast(CALL_DATE as long)
    ) as _seconds
FROM
    activities
        INNER JOIN
            contacts ON
                activities.CONTACT_ID = contacts.CONTACT_ID
        INNER JOIN
            calls ON
                activities.EMPLOYEE_ID = calls.EMPLOYEE_ID
                AND cast(ACTIVITY_DATE as date) = cast(CALL_DATE as date)
                AND split(contacts.PHONE, ' ')[0] = split(calls.PHONE, ' ')[0]
                AND split(contacts.PHONE, ' ')[1] = split(calls.PHONE, ' ')[1]
                AND activities.IS_OUTGOING
                AND calls.IS_OUTGOING
"""

outbound = spark.sql(query)

In [0]:
# Candidate (activity, call) pairs for incoming traffic.
# Same matching rules as the outgoing query (same employee, same calendar
# date, first two phone tokens equal) but both sides must be NOT outgoing.
# _seconds is the absolute gap between the two timestamps in seconds.
# Note: this rebinds `inbound`, which earlier held the inbound calls frame.
query = """
SELECT
    activities.ACTIVITY_ID,
    calls.CALL_ID,
    abs(
        cast(ACTIVITY_DATE as long)
        - cast(CALL_DATE as long)
    ) as _seconds
FROM
    activities
        INNER JOIN
            contacts ON
                activities.CONTACT_ID = contacts.CONTACT_ID
        INNER JOIN
            calls ON
                activities.EMPLOYEE_ID = calls.EMPLOYEE_ID
                AND cast(ACTIVITY_DATE as date) = cast(CALL_DATE as date)
                AND split(contacts.PHONE, ' ')[0] = split(calls.PHONE, ' ')[0]
                AND split(contacts.PHONE, ' ')[1] = split(calls.PHONE, ' ')[1]
                AND NOT activities.IS_OUTGOING
                AND NOT calls.IS_OUTGOING
"""

inbound = spark.sql(query)

In [0]:
%%time
from pyspark.sql import Window
from pyspark.sql.functions import row_number

# Rank the candidate activities of each call by time gap, smallest first.
window = Window.partitionBy('CALL_ID').orderBy('_seconds')

path = os.path.join(SILVER, 'bridge')

# Keep, per call, only the closest activity, and only if the gap is under
# 600 seconds. The rank filter runs first, so a call whose closest activity
# is 600 s or more away gets no bridge row at all.
(
    outbound
    .union(inbound)
    .withColumn('_row_number', row_number().over(window))
    .where('_row_number = 1')
    .where(col('_seconds') < 600)
    .drop('_seconds', '_row_number')
    .write
    .mode('overwrite')
    .parquet(path)
)


# Reload from parquet and register the 'bridge' temp view.
bridge = spark.read.parquet(path)
bridge.createOrReplaceTempView('bridge')
display(bridge)

ACTIVITY_ID,CALL_ID
48157733,1651837011805286
48153527,15623514248147357
48186981,35791583843690623
48186545,50295988505165788
48155017,63512227272563030
48163645,102690514255742327
48175182,107182879942185866
48157488,128755243679438378
48181001,129989605406627935
48162117,138695552127906543


#### Statements

In [0]:
from pyspark.sql.functions import coalesce, lag

# Rows sharing reference number, statement date and email message id
# (NULL ids grouped together under '0') are ordered by creation time.
window = (
    Window
    .partitionBy('ReferenceNumber', 'StatementDate', coalesce('EmailMessageID', lit('0')))
    .orderBy('CreatedDate')
)

path = os.path.join(SILVER, 'statements')

# Silver statements: within each window group, keep the first row and any row
# created more than 600 seconds after the row before it, then keep only
# statements whose REFERENCE_ID also appears in `activities` (left semi join).
# NOTE(review): StatementDate is overwritten with the date part of CreatedDate
# before it is used as a partition key, so the extracted [Statement Date]
# value is never used - confirm that is intended.
# NOTE(review): the output column is spelled STATMENT_DATE; it is left as is
# because renaming it would change the schema written downstream.
( 
    spark
    .read
    .parquet(os.path.join(BRONZE, 'Statements'))
    .withColumn('CreatedDate', date_trunc('second', 'CreatedDate'))
    .withColumn('StatementDate', col('CreatedDate').cast('date'))
    .withColumn('_timestamp', col('CreatedDate').cast('long'))
    .withColumn('seconds', col('_timestamp') - lag('_timestamp', 1).over(window))
    .where('seconds is null or seconds > 600')
    .selectExpr(
        'ReferenceNumber as REFERENCE_ID',
        'abs(xxhash64(EmailMessageID)) as EMAIL_ID',
        'CreatedDate as STATMENT_DATE',
    )
    .join(
        activities,
        on='REFERENCE_ID',
        how='left_semi'
    )
    .write
    .mode('overwrite')
    .parquet(path)
)


# Reload from parquet and register the 'statements' temp view.
statements = spark.read.parquet(path)
statements.createOrReplaceTempView('statements')
display(statements)

REFERENCE_ID,EMAIL_ID,STATMENT_DATE
1048716,447962200369716352,2019-12-13T14:22:56.000+0000
1048716,9167804064408159425,2020-03-16T09:40:58.000+0000
1048716,7374627470664143829,2020-09-16T12:05:22.000+0000
1048725,4744079002938424233,2020-05-18T10:45:45.000+0000
1048725,8440982277561841050,2020-06-16T15:57:29.000+0000
1048725,4126291024429273849,2020-11-17T09:43:56.000+0000
1048729,2838536166921413567,2020-04-23T15:34:49.000+0000
1048729,5604598790801992715,2020-11-04T11:12:02.000+0000
1048729,5624728569561827676,2021-02-03T22:04:32.000+0000
1048729,6708631841172652304,2021-05-14T08:08:58.000+0000


#### Requests

In [0]:
%%time
from pyspark.sql.functions import col, create_map, lit, lower

# WNC flag -> COMPLIED: 0 maps to True, 1 maps to False; any other value
# (including NULL) has no entry in the map and yields NULL.
mapping = create_map(
    lit(0), lit(True),
    lit(1), lit(False)
)

# Contact lower-cased and kept only when it contains '@'.
column = when(lower('Contact').contains('@'), lower('Contact'))

path = os.path.join(SILVER, 'requests')

# Silver requests: resolve the requester to an EMPLOYEE_ID (the inner join
# drops requests whose RequesterFullName equals no EMPLOYEE_NAME), cast the
# three timestamps to dates, and rename to the warehouse names.
# NOTE(review): the cleaned Contact column is not part of the selectExpr
# below, so it is not written to the silver table - confirm that is intended.
(
    spark
    .read
    .parquet(os.path.join(BRONZE, 'StatementRequests'))
    .withColumn('WNC', mapping[col('WNC')])
    .withColumn('Contact', column)
    .withColumnRenamed('RequesterFullName', 'EMPLOYEE_NAME')
    .join(
        employees
            .select('EMPLOYEE_ID', 'EMPLOYEE_NAME'),
        on='EMPLOYEE_NAME',
        how='inner'
    )
    .selectExpr(
        'ObjectID as REQUEST_ID',
        'EMPLOYEE_ID',
        'cast(RequestDate as date) as REQUEST_DATE',
        'Status as REQUEST_STATUS',
        'RequestMethod as REQUEST_METHOD',
        'RequestType as REQUEST_TYPE',
        'cast(LastActivityDate as date) as LAST_ACTIVITY_DATE',
        'cast(LastStatementReceivedDate as date) as LAST_RECEIVED_DATE',
        'WNC as COMPLIED'
    )
    .write
    .mode('overwrite')
    .parquet(path)
)


# Reload from parquet and register the 'requests' temp view.
requests = spark.read.parquet(path)
requests.createOrReplaceTempView('requests')
display(requests)

REQUEST_ID,EMPLOYEE_ID,REQUEST_DATE,REQUEST_STATUS,REQUEST_METHOD,REQUEST_TYPE,LAST_ACTIVITY_DATE,LAST_RECEIVED_DATE,COMPLIED
25422373,8398642,2020-05-01,Fully Received,MassEmail,,2020-05-01,2020-05-04,True
25422392,8398642,2020-05-01,Fully Received,MassEmail,,2020-05-01,,
25422433,8398642,2020-05-01,Fully Received,MassEmail,,2020-05-01,2020-05-04,
25422445,8398642,2020-05-01,Fully Received,MassEmail,,2020-05-01,2020-05-04,
25422454,8398642,2020-05-01,Partial Receipt,MassEmail,,2020-05-01,2020-05-04,
25422500,8398642,2020-05-01,Fully Received,MassEmail,,2020-05-01,2020-05-01,True
25422522,8398642,2020-05-01,Fully Received,MassEmail,,2020-11-02,,True
25422526,8398642,2020-05-01,Fully Received,MassEmail,,2020-05-01,,True
25422532,8398642,2020-05-01,Fully Received,MassEmail,,2020-05-01,,
25422539,8398642,2020-05-01,Fully Received,MassEmail,,2020-05-01,,True


#### Emails

In [0]:
# Rows sharing a message id are ordered by DateCreated, earliest first.
window = Window.partitionBy('MAILMessageID').orderBy('DateCreated')

path = os.path.join(SILVER, 'emails')

# Silver emails: drop rows without a message id and keep one row per message
# id (the earliest DateCreated). EMAIL_ID uses the same abs(xxhash64(...))
# expression that the statements cell applies to EmailMessageID.
(
    spark
    .read
    .parquet(os.path.join(BRONZE, 'StatementEmailDocs'))
    .where('MAILMessageID is not null')
    .withColumn('MAILDateTime', date_trunc('second', 'MAILDateTime'))
    .withColumn('_row_number', row_number().over(window))
    .where('_row_number = 1')
    .selectExpr(
        'abs(xxhash64(MAILMessageID)) as EMAIL_ID',
        'MAILDateTime as EMAIL_DATE',
        'MAILAttachmentCount as ATTACHMENTS'
    )
    .write
    .mode('overwrite')
    .parquet(path)
)


# Reload from parquet and register the 'emails' temp view.
emails = spark.read.parquet(path)
emails.createOrReplaceTempView('emails')
display(emails)

EMAIL_ID,EMAIL_DATE,ATTACHMENTS
3434900459225551788,2020-04-27T17:13:11.000+0000,3
47513485390769022,2020-04-27T17:13:22.000+0000,2
7691150148485878859,2019-08-14T17:40:02.000+0000,7
6524703005561816072,2019-08-14T19:22:55.000+0000,3
2428594531453498976,2019-08-15T19:30:07.000+0000,6
1588462854887033223,2019-08-21T15:22:36.000+0000,4
8100874607689828866,2019-08-27T12:04:59.000+0000,3
6392962833598207899,2019-08-27T15:32:50.000+0000,4
8329442739920661668,2019-08-29T17:56:53.000+0000,9
7350330946354199811,2019-08-29T19:46:37.000+0000,6


In [0]:
# JDBC settings for the destination database; user and password are read from
# the 'silo-key-vault-scope' secret scope rather than written in the notebook.
writeOptions = {
    'url': 'jdbc:sqlserver://',
    'server': 'sm-dataproc-01.database.windows.net',
    'database': 'SM-DW',
    'user': dbutils.secrets.get('silo-key-vault-scope', 'spendmend-dbo-user'),
    'password': dbutils.secrets.get('silo-key-vault-scope', 'spendmend-dbo-password'),
}


In [0]:
# Temp views registered by the silver cells above; each one is exported to a
# destination table of the same name.
names = [
    'activities',
    'bridge',
    'calls',
    'contacts',
    'emails',
    'employees',
    'requests',
    'statements'
]

for name in names:
    table = spark.table(name)
    # Row count is printed for the run log only; it is not used by the write.
    records = table.count()
    print(f'table {name} contains {records:,} records')

    # Declare string columns as VARCHAR (128) in the created table; views with
    # no string columns get the plain write options.
    # Note: this rebinds `options`, the name used for the source connection
    # settings near the top of the notebook.
    columnTypes = varchar(table, 128)
    options = {**writeOptions, 'createTableColumnTypes': columnTypes} if columnTypes else writeOptions

    # 'overwrite' replaces the destination table's existing contents on every run.
    (
        table
        .write
        .format('com.microsoft.sqlserver.jdbc.spark')
        .options(**options)
        .option('dbtable', name)
        .mode('overwrite')
        .save()
    )