In [1]:
from pyspark.sql import SparkSession
from pandas import set_option

# Remove the cap on how many columns pandas renders for a DataFrame preview.
# The canonical option key is used: pandas resolves option names by regex
# match, so near-miss spellings only work by accident and can become ambiguous.
set_option('display.max_columns', None)

# Reuse the active Spark session if there is one, otherwise create it.
spark = SparkSession.builder.getOrCreate()

In [2]:
from pyspark.sql import Column
from pyspark.sql.functions import col, udf, when
from typing import Optional

import phonenumbers


@udf(returnType='string')
def parsePhoneNumber(number: str, region: str='US') -> Optional[str]:
    """Return the phone number in national format, e.g. (987) 654-3210.

    Args:
        number: Raw phone number text; may be None for null column values.
        region: Region code handed to ``phonenumbers.parse``.

    Returns:
        The formatted number, or None when the input is missing or
        cannot be parsed/formatted.
    """
    # Null column values cannot be parsed; skip the exception round-trip.
    if number is None:
        return None

    try:
        phoneNumber = phonenumbers.parse(number, region)
        return phonenumbers.format_number(phoneNumber, phonenumbers.PhoneNumberFormat.NATIONAL)
    except Exception:
        # Best-effort cleaning: unparseable values become null. Catching
        # Exception (not a bare except) lets KeyboardInterrupt/SystemExit through.
        return None


def matchPattern(colName: str, pattern: str) -> Column:
    """Build a column expression that keeps a value only if it matches `pattern`.

    Args:
        colName: Name of the column to test.
        pattern: Regular expression applied with ``Column.rlike``.

    Returns:
        A ``when`` expression yielding the column value where it matches;
        no ``otherwise`` branch is supplied for non-matching rows.
    """
    source = col(colName)
    isMatch = source.rlike(pattern)
    return when(isMatch, source)

In [4]:
import os

# Base directory for the cleaned parquet datasets written by the cells below.
SILVER = '/tmp/phone-calls/silver'

In [7]:
%%time

# Destination of the cleaned employee dataset.
path = os.path.join(SILVER, 'employees')

# Source columns renamed to the upper-case naming used downstream;
# the e-mail address is lower-cased on the way through.
employee_columns = [
    'ID as EMPLOYEE_ID',
    'FullName as EMPLOYEE_NAME',
    'PrimaryRoleName as ROLE',
    'lower(Email) as EMAIL',
    'ManagerName as MANAGER_NAME',
    'PodName as TEAM',
]

employees_raw = spark.read.parquet('/tmp/phone-calls/Employees')
employees_raw.selectExpr(*employee_columns).write.mode('overwrite').parquet(path)

# Read the written dataset back, expose it to Spark SQL, and preview it.
employees = spark.read.parquet(path)
employees.createOrReplaceTempView('employees')
employees.limit(5).toPandas()

CPU times: user 16.5 ms, sys: 3.09 ms, total: 19.6 ms
Wall time: 393 ms


Unnamed: 0,EMPLOYEE_ID,EMPLOYEE_NAME,ROLE,EMAIL,MANAGER_NAME,TEAM
0,194728,Andi Prins,Audit Supervisor,aprins@spendmend.com,Travis Wheeler,Gold
1,194729,Bob VanGoor,Audit Supervisor,bvangoor@spendmend.com,Dan Hutchins,Red
2,194730,Colleen Kretowicz,Audit Supervisor,ckretowicz@spendmend.com,Travis Wheeler,Gold
3,194731,Cindy Allen,WNC Auditor,callen@spendmend.com,,
4,194732,Dan Hutchins,Audit Manager,dhutchins@spendmend.com,Dan Hutchins,Red


In [31]:
%%time
from pyspark.sql.functions import col, lower, regexp_replace, when

# Shape of an accepted phone number, e.g. (987) 654-3210
pattern = r'\(\d+\) \d+-\d+'

# Lower-cased e-mail, kept only when it contains an '@'.
column = when(lower('Email').contains('@'), lower('Email'))

# Destination of the cleaned vendor-contact dataset.
path = os.path.join(SILVER, 'contacts')

(
    spark
    .read
    .parquet('/tmp/phone-calls/VendorContacts')
    .withColumn('Email', column)
    # Strip any leading run of non-digit characters.
    .withColumn('Phone', regexp_replace('Phone', '^([^0-9]+)', ''))
    # Strip everything from the first whitespace that is followed by a
    # non-digit. Raw string: '\s' is not a valid Python escape sequence.
    .withColumn('Phone', regexp_replace('Phone', r'\s+(?=[^0-9])(.*)$', ''))
    # Normalise with the phonenumbers-based UDF, then keep only values
    # that match the expected shape.
    .withColumn('Phone', parsePhoneNumber('Phone'))
    .withColumn('Phone', matchPattern('Phone', pattern))
    .selectExpr(
        'ObjectID as CONTACT_ID',
        'CustVendorNo as VENDOR_NUMBER',
        'CustomerName as CUSTOMER_NAME',
        'FullName as CONTACT_NAME',
        'Phone as PHONE',
        'Email as EMAIL'
    )
    .write
    .mode('overwrite')
    .parquet(path)
)

# Read the written dataset back, expose it to Spark SQL, and preview it.
contacts = spark.read.parquet(path)
contacts.createOrReplaceTempView('contacts')
contacts.limit(5).toPandas()

CPU times: user 24.1 ms, sys: 5.09 ms, total: 29.2 ms
Wall time: 1min 3s


Unnamed: 0,CONTACT_ID,VENDOR_NUMBER,CUSTOMER_NAME,CONTACT_NAME,PHONE,EMAIL
0,1016896,1192,Stanly Regional,,,invoiceinquiries@premierinc.com
1,1016899,650,Stanly Regional,,,tabbie.alvarado@henryschein.com
2,1016843,1521,Stanly Regional,,(888) 882-9942,theracomar@icsconnect.com
3,1017021,782,Sentara Healthcare,Lynne Hanrahan,(757) 217-1381,lynne.hanrahan@esi.net
4,1017024,103762,Sentara Healthcare,,(804) 347-8839,scoleman@ajccpas.com


In [23]:
%%time
from pyspark.sql.functions import col, create_map, date_trunc, lit, when

# Destination of the cleaned activity dataset.
path = os.path.join(SILVER, 'activities')

# Activity type -> IS_OUTGOING flag: True for 'Called Vendor',
# a null boolean for 'Received Call / Email'.
mapping = create_map(
    lit('Called Vendor'), lit(True),
    lit('Received Call / Email'), lit(None).cast('boolean')
)

# Contact type, kept only when it is not the 'N/A' placeholder.
column = when(col('ContactType') != 'N/A', col('ContactType'))

# Only the id and the name (the join key) are needed from employees.
employee_keys = employees.select('EMPLOYEE_ID', 'EMPLOYEE_NAME')

# Output columns, renamed to the upper-case naming used downstream.
activity_columns = [
    'ObjectID as ACTIVITY_ID',
    'ReferenceNumber as REFERENCE_ID',
    'VendorContactObjectID as CONTACT_ID',
    'StatementRequestObjectID as REQUEST_ID',
    'EMPLOYEE_ID',
    'CreatedDate as ACTIVITY_DATE',
    'ActivityType as IS_OUTGOING',
    'JobNumber as JOB_NUMBER',
    'JobName as JOB_NAME',
    'CustomerVendorName as VENDOR_NAME',
    'cast(ReferenceNumber as string) as REFERENCE_NUMBER',
    'Outcome as OUTCOME',
]

# Keep the two call-related activity types and normalise their columns.
activity_records = (
    spark.read.parquet('/tmp/phone-calls/StatementRequestActivityRecords')
    .where(col('ActivityType').isin('Called Vendor', 'Received Call / Email'))
    .withColumn('ActivityType', mapping[col('ActivityType')])
    .withColumn('ContactType', column)
    .withColumn('CreatedDate', date_trunc('second', 'CreatedDate'))
    .withColumnRenamed('ActivityUser', 'EMPLOYEE_NAME')
)

# Resolve the acting user to an employee id; unmatched names are dropped
# by the inner join.
activities_joined = activity_records.join(employee_keys, on='EMPLOYEE_NAME', how='inner')
activities_joined.selectExpr(*activity_columns).write.mode('overwrite').parquet(path)


# Read the written dataset back and preview it.
activities = spark.read.parquet(path)
activities.limit(5).toPandas()

CPU times: user 36.7 ms, sys: 5.85 ms, total: 42.5 ms
Wall time: 1.79 s


Unnamed: 0,ACTIVITY_ID,REFERENCE_ID,CONTACT_ID,REQUEST_ID,EMPLOYEE_ID,ACTIVITY_DATE,IS_OUTGOING,JOB_NUMBER,JOB_NAME,VENDOR_NAME,REFERENCE_NUMBER,OUTCOME
0,36228229,1306574,33991530,34427094,33107601,2021-08-20 07:52:35,,3766,Cooper University Health Care - 3766,STERLING INFOSYSTEMS INC,1306574,Sent Authorization Letter
1,36229304,1305099,33740838,34061321,33107601,2021-08-20 08:11:01,True,3815,SwedishAmerican Hospital - 3815,MARINA MEDICAL INSTRUMENTS,1305099,Sent Authorization Letter
2,36229442,1281479,29551031,30714692,35385742,2021-08-20 08:13:25,True,3319,Catholic Health Initiatives - 3319,TIERPOINT LLC,1281479,Sent Authorization Letter
3,36229553,1242989,31401519,30686048,35385742,2021-08-20 08:15:04,True,3319,Catholic Health Initiatives - 3319,WCP SOLUTIONS,1242989,Left Voicemail
4,36229838,1243349,29553618,29126627,35385742,2021-08-20 08:19:41,True,3319,Catholic Health Initiatives - 3319,BIOPTICS INC,1243349,Left Voicemail


In [60]:
%%time
from pyspark.sql.functions import col

# Duration, kept only when it is positive.
column = when(col('duration') > 0, col('duration'))

# Register the raw call log (with cleaned durations) as the 'calls' view
# that the query below reads from.
(
    spark
    .read
    .parquet('/tmp/phone-calls/AccountCallLogs')
    .withColumn('duration', column)
    .createOrReplaceTempView('calls')
)


# First branch: outbound calls matched to the employee who placed them
# (PHONE is the callee number). Second branch: inbound calls matched to the
# employee who received them (PHONE is the caller number, IS_OUTGOING null).
query = """
SELECT
    abs(xxhash64(id)) as CALL_ID,
    date_time as CALL_DATE,
    true as IS_OUTGOING,
    call_type as CALL_TYPE,
    result as CALL_RESULT,
    duration as CALL_DURATION,
    EMPLOYEE_ID,
    callee_number as PHONE
FROM
    calls
        INNER JOIN
            employees ON
                calls.caller_name = employees.EMPLOYEE_NAME
                AND direction = 'outbound'
                AND callee_number IS NOT NULL

UNION ALL

SELECT
    abs(xxhash64(id)) as CALL_ID,
    date_time as CALL_DATE,
    cast(null as boolean) as IS_OUTGOING,
    call_type as CALL_TYPE,
    result as CALL_RESULT,
    duration as CALL_DURATION,
    EMPLOYEE_ID,
    caller_number as PHONE
FROM
    calls
        INNER JOIN
            employees ON
                calls.callee_name = employees.EMPLOYEE_NAME
                AND direction = 'inbound'
                AND caller_number IS NOT NULL
"""

# Destination of the cleaned call dataset.
path = os.path.join(SILVER, 'calls')

(
    spark
    .sql(query)
    .write
    .mode('overwrite')
    .parquet(path)
)

# Read the written dataset back; from here on the 'calls' view refers to
# the cleaned dataset rather than the raw call log.
calls = spark.read.parquet(path)
calls.createOrReplaceTempView('calls')

# Preview the result, as the other cells do, so the cell output is
# reproducible on a fresh run.
calls.limit(5).toPandas()


CPU times: user 22.4 ms, sys: 2.85 ms, total: 25.3 ms
Wall time: 445 ms


Unnamed: 0,CALL_ID,CALL_DATE,IS_OUTGOING,CALL_TYPE,CALL_RESULT,CALL_DURATION,EMPLOYEE_ID,PHONE
0,7362212912651137723,2022-05-11 19:36:35,True,pstn,Call Cancel,,23561006,(828) 737-7552
1,591728025720393858,2022-05-11 18:09:01,True,pstn,Call connected,12.0,8406307,(800) 417-3747
2,1445124709734510472,2022-05-11 18:05:58,True,pstn,Call connected,102.0,8406307,(602) 288-0031
3,701486195797491244,2022-05-11 18:05:20,True,pstn,Call connected,8.0,8406307,(602) 288-0060
4,8326658045653164631,2022-05-11 18:04:48,True,pstn,Call connected,8.0,34745413,(800) 431-1055


In [None]:
# Stop the Spark session obtained in the first cell.
spark.stop()