In [1]:
from pyspark.sql import SparkSession
from pandas import set_option

# Show every column when a DataFrame preview is rendered. Use the canonical
# option key: 'display.max.columns' only worked through pandas' regex fallback.
set_option('display.max_columns', None)

# Obtain the SparkSession used by every cell below (reuses an existing one if present).
spark = SparkSession.builder.getOrCreate()

In [2]:
from pyspark.sql import Column
from pyspark.sql.functions import col, udf, when
from typing import Optional
import phonenumbers


@udf(returnType='string')
def parsePhoneNumber(number: str, region: str='US') -> Optional[str]:
    """Normalise a raw phone string to the national display format.

    Args:
        number: Raw phone number text; may be null or empty.
        region: Region passed to ``phonenumbers.parse`` as the default when
            ``number`` carries no country code.

    Returns:
        The number formatted with ``PhoneNumberFormat.NATIONAL``, or ``None``
        when the input is missing or cannot be parsed.
    """
    # Null / empty input can never be parsed; short-circuit to NULL.
    if not number:
        return None
    try:
        phoneNumber = phonenumbers.parse(number, region)
    except phonenumbers.NumberParseException:
        # Only parsing failures are turned into NULL; a bare `except:` would also
        # swallow KeyboardInterrupt/SystemExit and hide unrelated bugs.
        return None
    return phonenumbers.format_number(phoneNumber, phonenumbers.PhoneNumberFormat.NATIONAL)


def matchPattern(colName: str, pattern: str) -> Column:
    """Return an expression that keeps ``colName`` only where it matches ``pattern``.

    The match uses ``rlike``; because the ``when`` has no ``otherwise`` branch,
    non-matching rows evaluate to NULL.
    """
    return when(col(colName).rlike(pattern), col(colName))

In [3]:
# Load the statement-request activity records and expose them to Spark SQL
# under the view name `activities`.
activities = spark.read.parquet('/tmp/phone-calls/StatementRequestActivityRecords')
activities.createOrReplaceTempView('activities')
# Preview the first rows (collected to pandas for rich display).
activities.limit(5).toPandas()

Unnamed: 0,ObjectID,CreatedDate,ReferenceNumber,CustomerVendorName,JobNumber,JobName,ContactType,ActivityUser,ActivityDate,FollowUpDate,STNID,Notes,Outcome,ActivityType,CustVendorObjectID,VendorContactObjectID,StatementRequestObjectID,CreatedBy,MessageID
0,42373858,2022-02-14 10:29:14.410,1204872,ONTARGETJOBS INC,3056,Sutter Health - 3056,,Workflow Service,2022-02-14,NaT,,,Sent Authorization Letter,,15863298,31446300,42372122,SVC_WORKFLOW,lo0kpojVRpK63Q0-tZt9FQ
1,42373853,2022-02-14 10:29:08.107,1281223,CARDIVA MEDICAL INC,3320,Dignity Health - 3320,,Deborah Murphy,2022-02-14,NaT,,,,,26215461,29600706,36587007,DMURPHY,
2,42373905,2022-02-14 10:30:21.083,1331383,"EDGE INFORMATION MANAGEMENT, INC.",4056,"Health First, Inc. - 4056",,Jessica Rinehart,2022-02-14,NaT,,,,,42352516,42355958,42357889,JRINEHART,
3,42374642,2022-02-14 10:43:59.057,1242775,FAVORITE NURSES FAVORITE TEMPS,3319,Catholic Health Initiatives - 3319,,Deborah Murphy,2022-02-14,NaT,,,,,3490272,31401710,37795162,DMURPHY,
4,42374788,2022-02-14 10:45:59.810,1324919,ARTHREX,4053,Hospital Sisters Health System - 4053,,Dulce Reyes-Loredo,2022-02-14,NaT,,,,,41967124,41967126,41967127,DRLOREDO,


In [4]:
# Load the raw vendor contact records (no SQL view is registered in this cell).
vendors = spark.read.parquet('/tmp/phone-calls/VendorContacts')
vendors.limit(5).toPandas()

Unnamed: 0,CustVendorGroupNo,CustVendorNo,ContactType,TypePreferredContact,LastName,FirstName,FullName,Email,Phone,Fax,Title,Note,ExternalID,BSAPVCID,CID,CustomerName,CVObjectID,CustObjectID,ObjectID,CreatedDate,CreatedBy,BSAPVendorObjectID
0,52891,1192,Accounts Receivable,,,,,InvoiceInquiries@PremierInc.com,,,,,,,15249,Stanly Regional,412893,194771,1016896,2016-08-15 08:27:25.537,DKUESTER,1016896
1,50771,650,Accounts Receivable,,,,,Tabbie.Alvarado@henryschein.com,,,,,,,15249,Stanly Regional,408624,194771,1016899,2016-08-15 08:33:35.630,DKUESTER,1016899
2,53889,1521,Accounts Receivable,,,,,theracomar@icsconnect.com,(888) 882-9942,,,,,,15249,Stanly Regional,414897,194771,1016843,2016-08-10 13:30:07.430,DKUESTER,1016843
3,80931,782,Accounts Receivable,,Hanrahan,Lynne,Lynne Hanrahan,Lynne.hanrahan@esi.net,(757) 217-1381,,Assistant Controller,,,,2406,Sentara Healthcare,522962,515060,1017021,2016-08-24 11:01:55.053,DKUESTER,1017021
4,77228,103762,Accounts Receivable,,,,,scoleman@ajccpas.com,(804) 347-8839,,,,,,2406,Sentara Healthcare,515472,515060,1017024,2016-08-24 11:04:54.023,DKUESTER,1017024


In [185]:
# # vendors.groupBy('ContactType').count().show()
# (
#     vendors
#     .groupBy('ContactType')
#     .count()
#     .orderBy(desc('count'))
#     .show(truncate=False)
# )

In [6]:
%%time
from pyspark.sql.functions import col, lower, regexp_replace, when

# Shape of a nationally formatted number, e.g. (987) 654-3210.
pattern = r'\(\d+\) \d+-\d+'

# Keep only values that contain '@' (lower-cased); anything else becomes NULL.
column = when(lower('Email').contains('@'), lower('Email'))

# TODO: do this clean-up while importing from the database?
(
    vendors
    .withColumn('Email', column)
    # Strip any leading run of non-digit characters.
    .withColumn('Phone', regexp_replace('Phone', r'^([^0-9]+)', ''))
    # Drop everything from the first whitespace that is followed by a non-digit.
    # Raw string: '\s' in a normal string literal is an invalid escape sequence.
    .withColumn('Phone', regexp_replace('Phone', r'\s+(?=[^0-9])(.*)', ''))
    # Normalise to national format, then NULL out anything not shaped like `pattern`.
    .withColumn('Phone', parsePhoneNumber('Phone'))
    .withColumn('Phone', matchPattern('Phone', pattern))
    .selectExpr(
        'ObjectID as CONTACT_ID',
        'CustVendorNo as VENDOR_NUMBER',
        'CustomerName as CUSTOMER_NAME',
        'FullName as CONTACT_NAME',
        'Phone as PHONE',
        'Email as EMAIL'
    )
    .write
    .mode('overwrite')
    .parquet('/tmp/phone-calls/_vendors')
)

CPU times: user 19.3 ms, sys: 6.18 ms, total: 25.5 ms
Wall time: 1min 3s


In [7]:
# Read back the cleaned vendor contacts and register them as the `vendors`
# SQL view; the view therefore exposes the renamed columns
# (CONTACT_ID, VENDOR_NUMBER, CUSTOMER_NAME, CONTACT_NAME, PHONE, EMAIL).
_vendors = spark.read.parquet('/tmp/phone-calls/_vendors')
_vendors.createOrReplaceTempView('vendors')
_vendors.limit(5).toPandas()

Unnamed: 0,CONTACT_ID,VENDOR_NUMBER,CUSTOMER_NAME,CONTACT_NAME,PHONE,EMAIL
0,1016896,1192,Stanly Regional,,,invoiceinquiries@premierinc.com
1,1016899,650,Stanly Regional,,,tabbie.alvarado@henryschein.com
2,1016843,1521,Stanly Regional,,(888) 882-9942,theracomar@icsconnect.com
3,1017021,782,Sentara Healthcare,Lynne Hanrahan,(757) 217-1381,lynne.hanrahan@esi.net
4,1017024,103762,Sentara Healthcare,,(804) 347-8839,scoleman@ajccpas.com


In [8]:
# Replace non-positive durations with NULL (the `when` has no `otherwise`)
# so that they do not distort aggregations.
column = when(col('duration') > 0, col('duration'))

# Load the call logs, apply the duration clean-up and register the `calls` SQL view.
# NOTE(review): the preview shows caller/callee numbers as bare digits, while
# later SQL splits them on ' ' like formatted numbers — confirm whether a
# phone-normalisation step is missing here.
calls = spark.read.parquet('/tmp/phone-calls/AccountCallLogs')
calls = calls.withColumn('duration', column)
calls.createOrReplaceTempView('calls')
calls.limit(5).toPandas()

Unnamed: 0,next_page_token,page_size,total_records,from,to,id,user_id,call_type,caller_number,caller_number_type,caller_name,caller_number_source,callee_number,caller_location,callee_number_type,callee_number_source,callee_location,callee_name,direction,duration,result,waiting_time,date_time,path,has_recording,charge,voice_mail_id,has_voicemail,rate,call_id,owner_type,owner_id,owner_name,owner_extension_number,caller_did_number,caller_country_code,caller_country_iso_code,callee_did_number,callee_country_code,callee_country_iso_code,answer_start_time,call_end_time
0,MNI3JNNEPEyFrnxeuneeLaoRqOrnm1XNOQ2,300,2530,2022-05-09,2022-05-12,34bfc3eb-7180-4c83-b61b-345d084f72b1,d9JiQ8qqTcCh6bfVo2YIYw,pstn,255,1,Nicole Thompson,,18287377552,,2,internal,"Newland, NC",,outbound,,Call Cancel,,2022-05-11 15:36:35,pstn,False,,,False,,7096626840318811841,user,d9JiQ8qqTcCh6bfVo2YIYw,Nicole Thompson,255,16167290255.0,1,US,18287377552,1,US,NaT,2022-05-11 15:38:41
1,MNI3JNNEPEyFrnxeuneeLaoRqOrnm1XNOQ2,300,2530,2022-05-09,2022-05-12,d3d79bc4-22cd-4936-9889-e10f815f45ba,,voip,16162081843,2,Amerisave,internal,300,,1,,,Main Auto Receptionist,inbound,,No Answer,,2022-05-11 15:28:25,autoReceptionist,False,,,False,,7096624731490955678,autoReceptionist,fK99-uaJRnG8yW-b9B4-9g,Main Auto Receptionist,300,,1,US,16162576300,1,US,NaT,NaT
2,MNI3JNNEPEyFrnxeuneeLaoRqOrnm1XNOQ2,300,2530,2022-05-09,2022-05-12,2b8ca4c8-7adc-436a-a2b5-17b0aea69a20,,voip,15165658083,2,,internal,300,Garden City NY,1,,,Main Auto Receptionist,inbound,,No Answer,,2022-05-11 15:21:14,autoReceptionist,False,,,False,,7096622880360005894,autoReceptionist,fK99-uaJRnG8yW-b9B4-9g,Main Auto Receptionist,300,,1,US,16162576300,1,US,NaT,NaT
3,MNI3JNNEPEyFrnxeuneeLaoRqOrnm1XNOQ2,300,2530,2022-05-09,2022-05-12,fbc302fc-4082-4cb5-bb06-f99b603f3392,1avZmyzLRzu2AInJp75T1Q,voip,12312656202,2,2312656202,internal,226,,1,,,Jessica Osborn,inbound,,No Answer,,2022-05-11 15:00:02,pstn,False,,,False,,7096617421447184140,user,1avZmyzLRzu2AInJp75T1Q,Jessica Osborn,226,,1,US,16162572026,1,US,NaT,NaT
4,MNI3JNNEPEyFrnxeuneeLaoRqOrnm1XNOQ2,300,2530,2022-05-09,2022-05-12,a0d06ee4-4472-40f6-a793-2edadca307f6,bkasEu6zRz2YKurlHXnIqw,voip,17022280222,2,RR Partners,internal,208,,1,,,Jessica Caggiano,inbound,,No Answer,,2022-05-11 14:47:33,pstn,False,,,False,,7096614200270916926,user,bkasEu6zRz2YKurlHXnIqw,Jessica Caggiano,208,,1,US,16162572008,1,US,NaT,NaT


In [280]:
# Project inbound calls into an employee/customer shape: for inbound calls the
# callee is treated as the employee and the caller as the customer.
# The outbound half of the UNION is currently disabled inside the SQL comment.
# CALL_ID is a non-negative integer derived from hashing the call's string id.
query = """
/*
SELECT
    id as CALL_ID,
    date_time as CALL_DATE,
    true as IS_OUTGOING,
    call_type as CALL_TYPE,
    result as CALL_RESULT,
    duration as CALL_DURATION,
    caller_name as EMPLOYEE_NAME,
    caller_number as EMPLOYEE_NUMBER,
    callee_name as CUSTOMER_NAME,
    callee_number as CUSTOMER_NUMBER
FROM
    calls
WHERE
    direction = 'outbound'

UNION ALL
*/

SELECT
    -- id as CALL_ID,
    abs(hash(id)) as CALL_ID,
    date_time as CALL_DATE,
    cast(null as boolean) as IS_OUTGOING,
    call_type as CALL_TYPE,
    result as CALL_RESULT,
    duration as CALL_DURATION,
    callee_name as EMPLOYEE_NAME,
    callee_number as EMPLOYEE_NUMBER,
    caller_name as CUSTOMER_NAME,
    caller_number as CUSTOMER_NUMBER
FROM
    calls
WHERE
    direction = 'inbound'
"""

spark.sql(query).limit(5).toPandas()

Unnamed: 0,CALL_ID,CALL_DATE,IS_OUTGOING,CALL_TYPE,CALL_RESULT,CALL_DURATION,EMPLOYEE_NAME,EMPLOYEE_NUMBER,CUSTOMER_NAME,CUSTOMER_NUMBER
0,1491618881,2022-05-11 19:28:25,,voip,No Answer,,Main Auto Receptionist,,Amerisave,(616) 208-1843
1,404196517,2022-05-11 19:21:14,,voip,No Answer,,Main Auto Receptionist,,,(516) 565-8083
2,382186848,2022-05-11 19:00:02,,voip,No Answer,,Jessica Osborn,,2312656202,(231) 265-6202
3,722380940,2022-05-11 18:47:33,,voip,No Answer,,Jessica Caggiano,,RR Partners,(702) 228-0222
4,1073102257,2022-05-11 18:47:14,,voip,No Answer,,Jessica Caggiano,,RR Partners,(702) 228-0222


In [113]:
# Distribution of call outcomes.
# TODO: aggregate only 'Call connected' calls, or average over all attempts?
(
    calls
    .groupBy('result')
    .count()
    .show()
)

+--------------+-----+
|        result|count|
+--------------+-----+
|   Call Cancel|  120|
|     No Answer|  199|
|Call connected| 2199|
|      Rejected|   10|
|   Call failed|    2|
+--------------+-----+



In [51]:
# TODO: should zero durations be stored as NULL?
calls.where(col('duration') == 0).count() # number of zero-duration calls (a significant amount)

343

In [124]:
%%time

# Pair each outbound call with 'Called Vendor' activities logged by the same
# user on the same day for a vendor contact whose phone matches the callee.
# The `vendors` view is built from the cleaned contacts, so its columns are
# CONTACT_ID / CUSTOMER_NAME / PHONE (not ObjectID / CustomerName / Phone);
# the original names would not resolve on a fresh top-to-bottom run.
# `difference` is the absolute gap, in seconds, between activity and call.
query = """
SELECT
    activities.ObjectID,
    activities.StatementRequestObjectID,
    calls.id,
    date_trunc('second', activities.CreatedDate) as CreatedDate,
    calls.date_time,
    activities.ActivityUser,
    calls.caller_name,
    activities.ActivityType,
    calls.direction,
    vendors.PHONE as Phone,
    -- calls.callee_number,
    vendors.CUSTOMER_NAME as callee_name,
    calls.duration,
    abs(
        cast(activities.CreatedDate as long)
        - cast(calls.date_time as long)
    ) as difference
FROM
    activities
        INNER JOIN
            vendors ON
                activities.VendorContactObjectID = vendors.CONTACT_ID
        INNER JOIN
            calls ON
                activities.ActivityUser = calls.caller_name
                AND cast(activities.CreatedDate as date) = cast(calls.date_time as date)
                AND split(vendors.PHONE, ' ')[0] = split(calls.callee_number, ' ')[0]
                AND split(vendors.PHONE, ' ')[1] = split(calls.callee_number, ' ')[1]

                AND activities.ActivityType = 'Called Vendor'
                AND calls.direction = 'outbound'
"""

from pyspark.sql import Window
from pyspark.sql.functions import row_number

# Rank the candidate activities of each call by time distance (1 = closest).
window = Window.partitionBy('id').orderBy('difference')

# NOTE(review): `_row_number` is written out but never filtered on — confirm
# whether only the closest match (`_row_number = 1`) should be kept.
(
    spark
    .sql(query)
    .withColumn('_row_number', row_number().over(window))
    # Keep only pairs logged within 600 seconds (10 minutes) of each other.
    .where(col('difference') < 600)
    .write
    .mode('overwrite')
    .parquet('/tmp/phone-calls/outbound')
)


# Read the persisted matches back for inspection.
outbound = spark.read.parquet('/tmp/phone-calls/outbound')
outbound.limit(5).toPandas()

CPU times: user 30.6 ms, sys: 2.53 ms, total: 33.1 ms
Wall time: 6.71 s


Unnamed: 0,ObjectID,StatementRequestObjectID,id,CreatedDate,date_time,ActivityUser,caller_name,ActivityType,direction,Phone,callee_name,duration,difference,_row_number
0,47836897,46365544,09a20e02-8bb2-424d-aa7e-33bd24c42856,2022-05-10 13:24:44,2022-05-10 13:25:14,Josephine Dagher,Josephine Dagher,Called Vendor,outbound,(901) 730-5718,Martin Memorial Health Systems,68,30,1
1,47839093,46000154,151d7c0b-541e-4748-8e6c-704c0001a079,2022-05-10 15:18:48,2022-05-10 15:18:36,Deborah Murphy,Deborah Murphy,Called Vendor,outbound,(415) 923-9376,Dignity Health,64,12,1
2,47855622,47851855,1ea9f7a7-10da-4b71-a649-7b1c149f9b6e,2022-05-11 10:35:37,2022-05-11 10:31:57,Yodit Kahssai,Yodit Kahssai,Called Vendor,outbound,(800) 438-4810,Community Health Systems,214,220,1
3,47858975,47011657,1ff4388b-3184-440e-aa6d-6ee1ee88cdf4,2022-05-11 12:07:15,2022-05-11 12:07:35,Jessica Caggiano,Jessica Caggiano,Called Vendor,outbound,(727) 545-2800,Envision Healthcare Corporation,111,20,1
4,47863495,38708447,22ea29fa-b7ee-4ea6-a2c1-50482c66e0a8,2022-05-11 15:53:18,2022-05-11 15:51:35,Yodit Kahssai,Yodit Kahssai,Called Vendor,outbound,(907) 279-2500,Community Health Systems,125,103,1


In [70]:
%%time

# Pair each inbound call with 'Received Call / Email' activities logged by the
# callee on the same day for a vendor contact whose phone matches the caller.
# The `vendors` view is built from the cleaned contacts, so its columns are
# CONTACT_ID / CUSTOMER_NAME / PHONE (not ObjectID / CustomerName / Phone);
# the original names would not resolve on a fresh top-to-bottom run.
# `difference` is the absolute gap, in seconds, between activity and call.
query = """
SELECT
    activities.ObjectID,
    activities.StatementRequestObjectID,
    calls.id,
    date_trunc('second', activities.CreatedDate) as CreatedDate,
    calls.date_time,
    activities.ActivityUser,
    -- calls.callee_name,
    vendors.CUSTOMER_NAME as caller_name,
    activities.ActivityType,
    calls.direction,
    vendors.PHONE as Phone,
    -- calls.caller_number,
    -- vendors.CUSTOMER_NAME,
    calls.callee_name,
    calls.duration,
    abs(
        cast(activities.CreatedDate as long)
        - cast(calls.date_time as long)
    ) as difference
FROM
    activities
        INNER JOIN
            vendors ON
                activities.VendorContactObjectID = vendors.CONTACT_ID
        INNER JOIN
            calls ON
                activities.ActivityUser = calls.callee_name
                AND cast(activities.CreatedDate as date) = cast(calls.date_time as date)
                AND split(vendors.PHONE, ' ')[0] = split(calls.caller_number, ' ')[0]
                AND split(vendors.PHONE, ' ')[1] = split(calls.caller_number, ' ')[1]

                AND activities.ActivityType = 'Received Call / Email'
                AND calls.direction = 'inbound'
"""

from pyspark.sql import Window
from pyspark.sql.functions import row_number

# Rank the candidate activities of each call by time distance (1 = closest).
window = Window.partitionBy('id').orderBy('difference')

# NOTE(review): `_row_number` is written out but never filtered on — confirm
# whether only the closest match (`_row_number = 1`) should be kept.
(
    spark
    .sql(query)
    .withColumn('_row_number', row_number().over(window))
    # Keep only pairs logged within 600 seconds (10 minutes) of each other.
    .where(col('difference') < 600)
    .write
    .mode('overwrite')
    .parquet('/tmp/phone-calls/inbound')
)


# Read the persisted matches back for inspection.
inbound = spark.read.parquet('/tmp/phone-calls/inbound')
inbound.limit(5).toPandas()

CPU times: user 38.6 ms, sys: 4.51 ms, total: 43.1 ms
Wall time: 4.27 s


Unnamed: 0,ObjectID,StatementRequestObjectID,id,CreatedDate,date_time,ActivityUser,caller_name,ActivityType,direction,Phone,callee_name,duration,difference,_row_number
0,47773225,46028922,1d13c825-7bb1-4e0d-bd5a-449dec5bbd9b,2022-05-09 11:07:12,2022-05-09 11:04:10,Yodit Kahssai,The Johns Hopkins Health System Corporation,Received Call / Email,inbound,(410) 494-9200,Yodit Kahssai,170,182,1
1,47771759,46888478,0f97fba5-3603-4bf8-8b50-a0c997932a01,2022-05-09 10:20:42,2022-05-09 10:17:22,Xavier Baron,Duke University Health System,Received Call / Email,inbound,(919) 209-2404,Xavier Baron,0,200,1
2,47781354,46887063,d1479e47-a39c-4ddb-9fe5-7d3404eeded0,2022-05-09 16:31:50,2022-05-09 16:27:39,Xavier Baron,Duke University Health System,Received Call / Email,inbound,(919) 609-9493,Xavier Baron,0,251,1
3,47839484,45441553,f9fadf3e-d118-4755-b1f1-7f27d5b9b354,2022-05-10 15:47:29,2022-05-10 15:45:31,Briana Louck,Cambridge Health Alliance,Received Call / Email,inbound,(617) 349-4753,Briana Louck,83,118,1
4,47838906,45441553,7f8e11be-d6f5-404d-84f4-6018cc3580d2,2022-05-10 15:03:37,2022-05-10 15:02:42,Briana Louck,Cambridge Health Alliance,Received Call / Email,inbound,(617) 349-4753,Briana Louck,239,55,1


In [89]:
# from pyspark.sql.functions import desc

# (
#     outbound
#     .groupBy('StatementRequestObjectID')
#     .count()
#     .orderBy(desc('count'))
#     .show()
# )

In [234]:
# from pyspark.sql.functions import date_format

# # activity.ObjectID
# (
#     outbound
#     .withColumn('_hour', date_format('CreatedDate', 'a hh'))
#     .select('CreatedDate', '_hour')
#     .groupBy('_hour')
#     .count()
#     .orderBy('_hour')
#     .show()
# )

In [23]:
# Allow-list of column names; later cells keep only the columns of a table
# whose (case-sensitive) name appears here.
# 'ReferenceNumber' is deliberately not listed.
columns = {
    'StatementRequestObjectID', 'ObjectID', 'CreatedDate',
    'SRARObjectID', 'SRARReferenceNumber',
    'CustomerVendorName', 'JobNumber', 'JobName',
    'ContactType', 'ActivityUser', 'ActivityDate', 'FollowUpDate',
    'STNID', 'Notes', 'Outcome', 'ActivityType',
    'CustVendorObjectID', 'VendorContactObjectID',
    'CreatedBy', 'MessageID', 'SRARMessageID',
    'NumberOfStatementsReceived', 'CustomerName', 'VendorNo',
    'WNC',  # WNC = "will not comply"
    'StatementWNC',
    'RequestDate', 'StatementRequestReferenceNumber', 'Status',
    'RequestMethod', 'RequestType', 'Contact', 'RequesterFullName',
    'LastActivityDate', 'LastStatementReceivedDate',
    'CurrentAssigneeID', 'CurrentAssigneeName',
    'EmailMessageID', 'MAILDateTime',
}

#### Requests

In [293]:
# Load the statement requests and preview the first rows.
requests = spark.read.parquet('/tmp/phone-calls/StatementRequests')
requests.limit(5).toPandas()

Unnamed: 0,JobNo,JobTier,CustomerName,CustVendorID,VendorNo,CustVendorGroupID,WNC,StatementWNC,VendorGroupName,Volume,VolumeTier,VolumeLast12,RequestDate,ReferenceNumber,Status,RequestMethod,RequestType,Contact,RequesterFullName,RequestText,LastActivityDate,LastStatementReceivedDate,CreatedDate,CallsheetNo,ObjectID,CallerStatus,ReconStatus,CurrentAssigneeID,CurrentAssigneeName,EnteredReconDate,LastReconQueueName,LastReconQueueEntryDate,AccountsReceived,AccountsRequested,MessageID,VendorContactObjectID,WebsiteVendor,WNCSpecialHandling,NeedLeadVendor,VendorGroupPrimaryAccountType
0,,,,,,,,,,,,,NaT,,New,,Caller,,,,NaT,NaT,2022-05-05 15:41:16.783,,47597380,,,,,NaT,,NaT,,,,,,,,
1,3427.0,,New Hanover Regional Medical Center,415960.0,16845.0,415959.0,,,A M COFFEE DISTRIBUTORS LLC,0.0,B,0.0,2020-07-24,1285660.0,Superceded,MassEmail,,amcoffeedist@gmail.com,Jessica Rinehart,,2020-07-24,NaT,2020-07-24 16:29:52.840,C-338914,26681846,,,19272454.0,Yodit Kahssai,NaT,,NaT,,,,,,,,
2,3427.0,,New Hanover Regional Medical Center,415960.0,16845.0,415959.0,,,A M COFFEE DISTRIBUTORS LLC,0.0,B,0.0,2020-07-31,1285660.0,Superceded,MassEmail,Mass 1,amcoffeedist@gmail.com,Jessica Rinehart,,2020-07-31,NaT,2020-07-31 08:05:15.817,C-338914,26819776,,,19272454.0,Yodit Kahssai,NaT,,NaT,,,,,,,,
3,3427.0,,New Hanover Regional Medical Center,415960.0,16845.0,415959.0,,,A M COFFEE DISTRIBUTORS LLC,0.0,B,0.0,2021-01-19,1285660.0,Superceded,MassEmail,Caller,amcoffeedist@gmail.com,Jessica Rinehart,,2020-07-31,NaT,2021-01-19 12:01:21.287,C-338914,30690711,Sent Authorization Letter,,8398263.0,Rebekah Dykema,NaT,,NaT,,,,29510277.0,,,,
4,3427.0,,New Hanover Regional Medical Center,415960.0,16845.0,415959.0,,,A M COFFEE DISTRIBUTORS LLC,0.0,B,0.0,2021-08-19,1285660.0,Superceded,MassEmail,Mass 1,amcoffeedist@gmail.com,Cindy Allen,,2021-08-23,NaT,2021-08-23 00:03:52.047,C-338914,36365526,Sent Authorization Letter,,8398263.0,Rebekah Dykema,NaT,,NaT,,,hONrG5xpTUyiQB4Rs7gb0g,29510277.0,,,,


In [205]:
# requests.groupBy('WNC').count().show()
# requests.groupBy('StatementWNC').count().show()
# requests.groupBy('WNC', 'StatementWNC').count().show()
# requests.select('WNC').printSchema()

In [211]:
# drop(
#     'WNC',
#     'StatementWNC',
#     'CreatedDate',
#     'CurrentAssigneeID',
#     'MessageID'
# )
# Keep only the request columns named in the `columns` allow-list
# (table order is preserved; the membership test is case-sensitive).
requestColumns = [column for column in requests.columns if column in columns]

requests.select(*requestColumns).limit(5).toPandas()

Unnamed: 0,CustomerName,VendorNo,WNC,StatementWNC,RequestDate,Status,RequestMethod,RequestType,Contact,RequesterFullName,LastActivityDate,LastStatementReceivedDate,CreatedDate,ObjectID,CurrentAssigneeID,CurrentAssigneeName,MessageID,VendorContactObjectID
0,,,,,NaT,New,,Caller,,,NaT,NaT,2022-05-05 15:41:16.783,47597380,,,,
1,New Hanover Regional Medical Center,16845.0,,,2020-07-24,Superceded,MassEmail,,amcoffeedist@gmail.com,Jessica Rinehart,2020-07-24,NaT,2020-07-24 16:29:52.840,26681846,19272454.0,Yodit Kahssai,,
2,New Hanover Regional Medical Center,16845.0,,,2020-07-31,Superceded,MassEmail,Mass 1,amcoffeedist@gmail.com,Jessica Rinehart,2020-07-31,NaT,2020-07-31 08:05:15.817,26819776,19272454.0,Yodit Kahssai,,
3,New Hanover Regional Medical Center,16845.0,,,2021-01-19,Superceded,MassEmail,Caller,amcoffeedist@gmail.com,Jessica Rinehart,2020-07-31,NaT,2021-01-19 12:01:21.287,30690711,8398263.0,Rebekah Dykema,,29510277.0
4,New Hanover Regional Medical Center,16845.0,,,2021-08-19,Superceded,MassEmail,Mass 1,amcoffeedist@gmail.com,Cindy Allen,2021-08-23,NaT,2021-08-23 00:03:52.047,36365526,8398263.0,Rebekah Dykema,hONrG5xpTUyiQB4Rs7gb0g,29510277.0


In [294]:
from pyspark.sql.functions import col, create_map, lit, lower

# Lookup used to turn the WNC flag into a boolean: 0 -> True, 1 -> False.
# NOTE(review): the mapped `WNC` column is not part of the selectExpr below,
# so this mapping currently has no effect on the output — confirm intent
# (and the 0 -> True direction) before relying on it.
mapping = create_map(
    lit(0), lit(True),
    lit(1), lit(False)
)

# Keep `Contact` only when it contains '@' (lower-cased); otherwise NULL.
column = when(lower('Contact').contains('@'), lower('Contact'))

# Preview of the cleaned, renamed request columns.
(
    requests
    .withColumn('WNC', mapping[col('WNC')])
    .withColumn('Contact', column)
    .selectExpr(
        'ObjectID as REQUEST_ID',
        # 'VendorContactObjectID as CONTACT_ID',
        # 'CustomerName as CUSTOMER_NAME',
        # 'VendorNo as VENDOR_NUMBER',
        'cast(RequestDate as date) as REQUEST_DATE',
        'Status as REQUEST_STATUS',
        'RequestMethod as REQUEST_METHOD',
        'RequestType as REQUEST_TYPE',
        'Contact as REQUEST_CONTACT',
        'RequesterFullName as EMPLOYEE_NAME',
        'cast(LastActivityDate as date) as LAST_ACTIVITY_DATE',
        'cast(LastStatementReceivedDate as date) as LAST_RECEIVED_DATE',
        'CurrentAssigneeName as CURRENT_ASSIGNED_NAME'
    )
    .limit(5)
    .toPandas()
)

Unnamed: 0,REQUEST_ID,REQUEST_DATE,REQUEST_STATUS,REQUEST_METHOD,REQUEST_TYPE,REQUEST_CONTACT,EMPLOYEE_NAME,LAST_ACTIVITY_DATE,LAST_RECEIVED_DATE,CURRENT_ASSIGNED_NAME
0,47597380,,New,,Caller,,,,,
1,26681846,2020-07-24,Superceded,MassEmail,,amcoffeedist@gmail.com,Jessica Rinehart,2020-07-24,,Yodit Kahssai
2,26819776,2020-07-31,Superceded,MassEmail,Mass 1,amcoffeedist@gmail.com,Jessica Rinehart,2020-07-31,,Yodit Kahssai
3,30690711,2021-01-19,Superceded,MassEmail,Caller,amcoffeedist@gmail.com,Jessica Rinehart,2020-07-31,,Rebekah Dykema
4,36365526,2021-08-19,Superceded,MassEmail,Mass 1,amcoffeedist@gmail.com,Cindy Allen,2021-08-23,,Rebekah Dykema


In [218]:
# (
#     requests
#         .where('VendorContactObjectID is not null')
#     .join(
#         vendors,
#         on=requests['VendorContactObjectID'] == vendors['ObjectID'],
#         how='inner'
#     )
#     .where('Contact != Email')
#     .select('Contact', 'Email')
#     .show()
# )

In [131]:
# Keep only the activity columns named in the `columns` allow-list and preview them.
activityColumns = [column for column in activities.columns if column in columns]
activities.select(*activityColumns).limit(5).toPandas()

Unnamed: 0,ObjectID,CreatedDate,CustomerVendorName,JobNumber,JobName,ContactType,ActivityUser,ActivityDate,FollowUpDate,STNID,Notes,Outcome,ActivityType,CustVendorObjectID,VendorContactObjectID,StatementRequestObjectID,CreatedBy,MessageID
0,42373858,2022-02-14 10:29:14.410,ONTARGETJOBS INC,3056,Sutter Health - 3056,,Workflow Service,2022-02-14,NaT,,,Sent Authorization Letter,,15863298,31446300,42372122,SVC_WORKFLOW,lo0kpojVRpK63Q0-tZt9FQ
1,42373853,2022-02-14 10:29:08.107,CARDIVA MEDICAL INC,3320,Dignity Health - 3320,,Deborah Murphy,2022-02-14,NaT,,,,,26215461,29600706,36587007,DMURPHY,
2,42373905,2022-02-14 10:30:21.083,"EDGE INFORMATION MANAGEMENT, INC.",4056,"Health First, Inc. - 4056",,Jessica Rinehart,2022-02-14,NaT,,,,,42352516,42355958,42357889,JRINEHART,
3,42374642,2022-02-14 10:43:59.057,FAVORITE NURSES FAVORITE TEMPS,3319,Catholic Health Initiatives - 3319,,Deborah Murphy,2022-02-14,NaT,,,,,3490272,31401710,37795162,DMURPHY,
4,42374788,2022-02-14 10:45:59.810,ARTHREX,4053,Hospital Sisters Health System - 4053,,Dulce Reyes-Loredo,2022-02-14,NaT,,,,,41967124,41967126,41967127,DRLOREDO,


In [295]:
from pyspark.sql.functions import date_trunc

# Treat the 'N/A' placeholder as missing: values other than 'N/A' are kept,
# while 'N/A' (and NULL) evaluate to NULL because there is no `otherwise`.
column = when(col('ContactType') != 'N/A', col('ContactType'))

# Preview of the cleaned, renamed activity columns; CreatedDate is truncated
# to whole seconds. ReferenceNumber is emitted twice: as-is (REFERENCE_ID)
# and cast to string (REFERENCE_NUMBER).
(
    activities
    .withColumn('ContactType', column)
    .withColumn('CreatedDate', date_trunc('second', 'CreatedDate'))
    .selectExpr(
        'ObjectID as ACTIVITY_ID',
        'ReferenceNumber as REFERENCE_ID',
        'VendorContactObjectID as CONTACT_ID',
        'StatementRequestObjectID as REQUEST_ID',
        'CreatedDate as ACTIVITY_DATE',
        'ActivityType as ACTIVITY_TYPE',
        'JobNumber as JOB_NUMBER',
        'JobName as JOB_NAME',
        'CustomerVendorName as VENDOR_NAME',
        'cast(ReferenceNumber as string) as REFERENCE_NUMBER',
        'ContactType as CONTACT_METHOD',
        'ActivityUser as EMPLOYEE_NAME',
        'Outcome as OUTCOME'
    )
    .limit(5)
    .toPandas()
)

Unnamed: 0,ACTIVITY_ID,REFERENCE_ID,CONTACT_ID,REQUEST_ID,ACTIVITY_DATE,ACTIVITY_TYPE,JOB_NUMBER,JOB_NAME,VENDOR_NAME,REFERENCE_NUMBER,CONTACT_METHOD,EMPLOYEE_NAME,OUTCOME
0,42373858,1204872,31446300,42372122,2022-02-14 10:29:14,,3056,Sutter Health - 3056,ONTARGETJOBS INC,1204872,,Workflow Service,Sent Authorization Letter
1,42373853,1281223,29600706,36587007,2022-02-14 10:29:08,,3320,Dignity Health - 3320,CARDIVA MEDICAL INC,1281223,,Deborah Murphy,
2,42373905,1331383,42355958,42357889,2022-02-14 10:30:21,,4056,"Health First, Inc. - 4056","EDGE INFORMATION MANAGEMENT, INC.",1331383,,Jessica Rinehart,
3,42374642,1242775,31401710,37795162,2022-02-14 10:43:59,,3319,Catholic Health Initiatives - 3319,FAVORITE NURSES FAVORITE TEMPS,1242775,,Deborah Murphy,
4,42374788,1324919,41967126,41967127,2022-02-14 10:45:59,,4053,Hospital Sisters Health System - 4053,ARTHREX,1324919,,Dulce Reyes-Loredo,


In [284]:
# `desc` is imported here because no earlier cell imports it; without this the
# cell raises NameError on a fresh top-to-bottom run.
from pyspark.sql.functions import desc

# Number of activities per ActivityType, most frequent first.
(
    activities
    .groupBy('ActivityType')
    .count()
    .orderBy(desc('count'))
    .show(truncate=False)
)

+---------------------+-------+
|ActivityType         |count  |
+---------------------+-------+
|null                 |1968056|
|Called Vendor        |191725 |
|Note Only            |73795  |
|Received Call / Email|11746  |
|Emailed Vendor       |9917   |
+---------------------+-------+



In [25]:
# Load the received statements and preview the first rows.
statements = spark.read.parquet('/tmp/phone-calls/Statements')
statements.limit(5).toPandas()

Unnamed: 0,ReferenceNumber,JobNo,JobTier,StatementDate,DocumentHandle,ObjectID,CustVendorObjID,CustVendGroupObjID,CustVendGroupName,CustomerName,CID,CustVendName,CustVendNo,Volume,AccountsIdentified,Recon,EmailMessageID,CreatedDate,SRARObjectId,ZeroBalance
0,1224005,3188,1.0,2019-08-14,,20513258,20513253,20513252,OHIO MEDICAL CORPORATION,OhioHealth Corporation,6261,OHIO MEDICAL CORPORATION,40003215,100003.0,,,00000000F643DA057DBD124B829A30CFCF400C1507002B...,2019-08-14 11:18:01.353,,
1,1229524,3244,,2019-08-14,,20513265,20513264,20513263,SCRIPTPRO USA INC,Jackson Health System-CMS,4910,SCRIPTPRO USA INC,108370,738284.0,,,00000000F643DA057DBD124B829A30CFCF400C1507002B...,2019-08-14 11:18:26.637,,
2,1247505,3317,,2019-08-14,,20513268,20513267,20513266,CHROM TECH INC,Mayo Clinic,281,CHROM TECH INC,2846,9108143.52,,,00000000F643DA057DBD124B829A30CFCF400C1507002B...,2019-08-14 11:19:08.073,,
3,1248087,3317,,2019-08-14,,20513273,20513272,20513271,PERFICIENT INC,Mayo Clinic,281,PERFICIENT INC,742853258,949683.0,,,00000000F643DA057DBD124B829A30CFCF400C1507002B...,2019-08-14 11:19:43.393,,
4,1224005,3188,1.0,2019-08-14,,20513254,20513253,20513252,OHIO MEDICAL CORPORATION,OhioHealth Corporation,6261,OHIO MEDICAL CORPORATION,40003215,100003.0,,,00000000F643DA057DBD124B829A30CFCF400C1507002B...,2019-08-14 11:17:48.653,,


In [26]:
# Keep only the statement columns named in the `columns` allow-list and preview them.
# NOTE(review): `columns` lists 'SRARObjectID' while this table's column is
# spelled 'SRARObjectId'; the case-sensitive `in` test therefore drops it —
# confirm whether that is intended.
statementColumns = [column for column in statements.columns if column in columns]
statements.select(*statementColumns).limit(5).toPandas()

Unnamed: 0,ObjectID,CustomerName,EmailMessageID,CreatedDate
0,20513258,OhioHealth Corporation,00000000F643DA057DBD124B829A30CFCF400C1507002B...,2019-08-14 11:18:01.353
1,20513265,Jackson Health System-CMS,00000000F643DA057DBD124B829A30CFCF400C1507002B...,2019-08-14 11:18:26.637
2,20513268,Mayo Clinic,00000000F643DA057DBD124B829A30CFCF400C1507002B...,2019-08-14 11:19:08.073
3,20513273,Mayo Clinic,00000000F643DA057DBD124B829A30CFCF400C1507002B...,2019-08-14 11:19:43.393
4,20513254,OhioHealth Corporation,00000000F643DA057DBD124B829A30CFCF400C1507002B...,2019-08-14 11:17:48.653


In [153]:
# Count the activities that have at least one statement referring to them
# (left-semi join: activity rows are kept once, no statement columns are added).
(
    activities
    .join(
        statements,
        on=activities['ObjectId'] == statements['SRARObjectId'],
        how='left_semi'
    )
    .count()
)

15188

In [157]:
# `desc` is imported here because no earlier cell imports it; without this the
# cell raises NameError on a fresh top-to-bottom run.
from pyspark.sql.functions import desc

# Statements per activity, restricted to statements whose SRARObjectId matches
# an activity and to groups with fewer than 6 statements; largest groups first.
(
    statements
    .join(
        activities,
        on=activities['ObjectId'] == statements['SRARObjectId'],
        how='left_semi'
    )
    .groupBy('SRARObjectId')
    .count()
    .where('count < 6')
    .orderBy(desc('count'))
    .show(5)
)

+------------+-----+
|SRARObjectId|count|
+------------+-----+
|    36261070|    5|
|    37812810|    5|
|    37207422|    5|
|    38552676|    5|
|    38316534|    5|
+------------+-----+
only showing top 5 rows



In [206]:
# number of statements 
# statements.where(col('SRARObjectId') == 37812810).toPandas()

In [38]:
from pyspark.sql.functions import approxCountDistinct, desc

# `approxCountDistinct` is deprecated; use its replacement `approx_count_distinct`.
from pyspark.sql.functions import approx_count_distinct

# Activities (SRARObjectId) whose statements arrived in more than one distinct
# e-mail message, most messages first. The distinct count is approximate.
(
    statements
    .where('SRARObjectId is not null')
    .groupBy('SRARObjectId')
    .agg(approx_count_distinct('EmailMessageID').alias('count'))
    .where('count > 1')
    .orderBy(desc('count'))
    .show(10)
)

+------------+-----+
|SRARObjectId|count|
+------------+-----+
|    36876948|   19|
|    38342177|   10|
|    38111176|    4|
|    37842272|    4|
|    37211746|    4|
|    38342291|    4|
|    38119985|    4|
|    37831114|    4|
|    36150575|    4|
|    37831408|    3|
+------------+-----+
only showing top 10 rows



In [44]:
# `approxCountDistinct` is deprecated; use its replacement `approx_count_distinct`.
from pyspark.sql.functions import approx_count_distinct, desc

# Same as the per-activity count, but additionally split by StatementDate:
# (SRARObjectId, StatementDate) pairs with more than one distinct e-mail message.
(
    statements
    .where('SRARObjectId is not null')
    .groupBy('SRARObjectId', 'StatementDate')
    .agg(approx_count_distinct('EmailMessageID').alias('count'))
    .where('count > 1')
    .orderBy(desc('count'))
    .show(10)
)

+------------+-------------------+-----+
|SRARObjectId|      StatementDate|count|
+------------+-------------------+-----+
|    36876948|2021-09-09 00:00:00|   19|
|    38342177|2021-10-18 00:00:00|   10|
|    38342291|2021-10-18 00:00:00|    4|
|    38119985|2021-10-14 00:00:00|    4|
|    38111176|2021-10-11 00:00:00|    4|
|    36150575|2021-08-30 00:00:00|    4|
|    38709236|2021-11-02 00:00:00|    3|
|    36510659|2021-08-31 00:00:00|    3|
|    36228293|2021-09-02 00:00:00|    3|
|    38370697|2021-10-19 00:00:00|    3|
+------------+-------------------+-----+
only showing top 10 rows



In [84]:
# Share of statements that carry no SRARObjectId (two separate count jobs).
statements.where('SRARObjectId is null').count() / statements.count()

0.9490284056692593

In [85]:
# Count ReferenceNumber values that start with '0' — presumably to check that
# `cast(ReferenceNumber as ...)` to a numeric type would not drop leading zeros.
(
    statements
    .where(col('ReferenceNumber').startswith('0'))
    .count()
)

0

In [90]:
# Largest ReferenceNumber when read as an integer.
(
    statements
    .selectExpr('max(cast(ReferenceNumber as int))')
    .first()
)

# 12578737 < 2**31 - 1 (2147483647), so the values fit in a 32-bit int

Row(max(CAST(ReferenceNumber AS INT))=12578737)

In [201]:
# %%time
# from pyspark.sql.functions import min, max

# (
#     statements
#     # .where('ReferenceNumber = 1253166')
#     # .withColumn('StatementDate', col('StatementDate').cast('date'))
#     .withColumn('StatementDate', col('CreatedDate').cast('date'))
#     .withColumn('CreatedDate', date_trunc('second', 'CreatedDate'))
#     .groupBy(
#         'ReferenceNumber',
#         'StatementDate',
#         'EmailMessageID'
#     )
#     .agg(
#         (
#             max(col('CreatedDate').cast('long'))
#             - min(col('CreatedDate').cast('long'))
#         ).alias('seconds')
#     )
#     .orderBy(desc('seconds'))
#     .show()
#     # .selectExpr('log(seconds) as seconds')
#     # .toPandas()
#     # .plot
#     # .hist()
#     # .density()
# )

In [202]:
# # statements.where(col('SRARObjectId') == 36876948).orderBy('CreatedDate').toPandas()

# (
#     statements
#     .groupBy('ReferenceNumber', 'StatementDate')
#     .count()
#     .where('count < 6')
#     .orderBy(desc('count'))
#     .show(5)
# )

In [203]:
# statements.where('ReferenceNumber = 1048716').orderBy('CreatedDate').toPandas()

In [204]:
# from pyspark.sql.functions import abs, hash

# statement = (
#     statements
#     .withColumn('EmailMessageID', abs(hash('EmailMessageID')))
#     .withColumn('StatementDate', col('StatementDate').cast('date'))
#     .withColumn('CreatedDate', date_trunc('second', 'CreatedDate'))
#     .where('ReferenceNumber = 1048716')
#     .selectExpr(
#         'ReferenceNumber',
#         'StatementDate',
#         'EmailMessageID',
#         'CreatedDate'
#     )
# )

# statement.cache().toPandas()

In [205]:
# %%time
# from pyspark.sql import Window
# from pyspark.sql.functions import coalesce, lag, lit

# # TODO: change 0 to '0'
# window = (
#     Window
#     .partitionBy('ReferenceNumber', 'StatementDate', coalesce('EmailMessageID', lit('0')))
#     .orderBy('CreatedDate')
# )


# (
#     # statement
#     statements
#     .withColumn('StatementDate', col('StatementDate').cast('date'))
#     .withColumn('_timestamp', col('CreatedDate').cast('long'))
#     .withColumn('seconds', col('_timestamp') - lag('_timestamp', 1).over(window))
#     # .withColumn('_equal', col('EmailMessageID') == lag('EmailMessageID', 1).over(window))
#     # .where('not (_lag is not null and _lag < 600 and _equal = true)')
#     # .where('seconds is null or seconds > 300')
#     # .limit(5)
#     .where('seconds is not null')
#     # .selectExpr('log(seconds) as seconds')
#     .selectExpr('cast(floor(seconds / 60) as int) as minutes')
#     .groupBy('minutes')
#     .count()
#     .orderBy(desc('count'))
#     .show()
#     # .where('minutes < 5')
#     # .toPandas()
#     # .plot
#     # .hist()
# )

In [291]:
%%time
from pyspark.sql import Window
from pyspark.sql.functions import coalesce, date_trunc, lag, lit

window = (
    Window
    .partitionBy('ReferenceNumber', 'StatementDate', coalesce('EmailMessageID', lit('0')))
    .orderBy('CreatedDate')
)


( 
    statements
    .withColumn('CreatedDate', date_trunc('second', 'CreatedDate'))
    .withColumn('StatementDate', col('CreatedDate').cast('date'))
    .withColumn('_timestamp', col('CreatedDate').cast('long'))
    .withColumn('seconds', col('_timestamp') - lag('_timestamp', 1).over(window))
    .where('seconds is null or seconds > 600')
    .selectExpr(
        'ObjectID as STATEMENT_ID',
        'ReferenceNumber as REFERENCE_ID',
        'abs(xxhash64(EmailMessageID)) as EMAIL_ID',
        'CreatedDate as STATMENT_DATE',
        # 'ReferenceNumber as REFERENCE_NUMBER',
        # 'JobNo as JOB_NUMBER',
        # 'CustomerName as CUSTOMER_NAME',
        # 'CustVendNo as VENDOR_NUMBER',
        # 'CustVendName as VENDOR_NAME'
    )   
    .limit(5)
    .toPandas()
)

CPU times: user 22.6 ms, sys: 5.01 ms, total: 27.6 ms
Wall time: 910 ms


Unnamed: 0,STATEMENT_ID,REFERENCE_ID,EMAIL_ID,STATMENT_DATE
0,33612749,1048754,2723467234927794471,2021-05-12 09:35:50
1,33613882,1048754,2723467234927794471,2021-05-12 09:55:59
2,33623983,1048754,2723467234927794471,2021-05-12 13:11:38
3,34482383,1048912,7192993192948554295,2021-06-16 20:41:42
4,33073550,1048934,5434065228407253023,2021-04-20 10:19:54


In [214]:
# from pyspark.sql.functions import approxCountDistinct

# (
#     statements
#     .where('SRARObjectId is not null')
#     .groupBy('SRARObjectId')
#     .agg(approxCountDistinct('StatementDate').alias('count'))
#     .where('count > 1')
#     .orderBy(desc('count'))
#     .show(10)
# )

In [215]:
# statementColumns = [column for column in statements.columns if column in columns]
# statements.select(*statementColumns).limit(5).toPandas()

# (
#     statements
#     .where('SRARObjectId is not null')
#     .groupBy('SRARObjectId')
#     .count()
#     .orderBy(desc('count'))
#     .show(5)
# )

In [80]:
# (
#     outbound
#     .join(
#         requests.where(col('RequestType') != 'Caller'),
#         on=outbound['StatementRequestObjectID'] == requests['ObjectID'],
#         how='left_semi'
#     )
#     .toPandas()
# )

In [216]:
# (
#     requests
#     .join(
#         outbound,
#         on=outbound['StatementRequestObjectID'] == requests['ObjectID'],
#         how='left_semi'
#     )
#     # .groupBy('RequestType')
#     # .count()
#     # .show()
# )

#### Emails

In [51]:
# Read the StatementEmailDocs parquet dataset and preview its first five rows.
emails = spark.read.format('parquet').load('/tmp/phone-calls/StatementEmailDocs')
emails.limit(5).toPandas()

Unnamed: 0,DocumentHandle,DateCreated,MAILDateTime,MAILFromAddress,MAILToAddress,MAILCcAddress,MAILSubject,MAILMessageID,MAILAttachmentCount,S-Ref#,S-CustomerName,S-Job#,S-VendorName,S-CreatedByUser,S-SkipAutoReceive,S-Recon,S-LargeCredits,IngestionSource,S-StatementExistsInBatch
0,15594200,2019-06-12 09:22:11.947,2019-06-07 19:14:29,,,,RE: STATEMENT REQUEST - FAIRVIEW HEALTH SERVIC...,00000000F643DA057DBD124B829A30CFCF400C1507002B...,11,1230333,,,,,,,,,
1,15595567,2019-06-12 09:28:05.423,2019-06-07 19:14:17,,,,RE: STATEMENT REQUEST (REF # 1190860),00000000F643DA057DBD124B829A30CFCF400C1507002B...,7,1190860,,,,,,,,,
2,15595581,2019-06-12 09:40:03.420,2019-06-07 19:12:18,,,,RE: INFORMATION REQUEST #1237232,00000000F643DA057DBD124B829A30CFCF400C1507002B...,2,1237232,,,,,,,,,
3,15595588,2019-06-12 09:42:47.153,2019-06-07 19:15:58,,,,REF#1157275 STATEMENT FROM PROVATION MEDICAL F...,00000000F643DA057DBD124B829A30CFCF400C1507002B...,2,1157275,,,,,,,,,
4,15595595,2019-06-12 09:49:49.783,2019-06-07 19:15:37,,,,RE: INFORMATION REQUEST #1217277,00000000F643DA057DBD124B829A30CFCF400C1507002B...,2,1217277,,,,,,,,,


In [255]:
from pyspark.sql import Window
from pyspark.sql.functions import regexp_replace, row_number

# One partition per message ID, rows ranked by ascending DateCreated.
window = (
    Window
    .partitionBy('MAILMessageID')
    .orderBy('DateCreated')
)

(
    emails
    .filter('MAILMessageID is not null')
    # Collapse every run of whitespace in the subject into a single space.
    .withColumn('MAILSubject', regexp_replace('MAILSubject', r'\s+', ' '))
    .withColumn('MAILDateTime', date_trunc('second', 'MAILDateTime'))
    .withColumn('_row_number', row_number().over(window))
    # Row 1 is the first row of each message ID in DateCreated order.
    .filter('_row_number = 1')
    .selectExpr([
        'abs(xxhash64(MAILMessageID)) as EMAIL_ID',
        'MAILDateTime as EMAIL_DATE',
        # 'MAILSubject as SUBJECT',
        'MAILAttachmentCount as ATTACHMENTS',
    ])
    # .show(truncate=False)
    .limit(5)
    .toPandas()
)

Unnamed: 0,EMAIL_ID,EMAIL_DATE,ATTACHMENTS
0,7676645025288761150,2020-02-05 15:08:12,5
1,7778416853004093289,2020-07-23 17:49:00,2
2,8489479255620551112,2020-11-18 15:57:11,2
3,6689225018682692358,2020-12-08 21:50:38,5
4,3561445009217631212,2019-08-13 16:07:44,2


In [257]:
# Read the Employees parquet dataset and preview its first five rows.
employees = spark.read.format('parquet').load('/tmp/phone-calls/Employees')
employees.limit(5).toPandas()

Unnamed: 0,ID,OnBaseUserName,BSAPUserName,Email,FirstName,LastName,FullName,Title,Phone,PrimaryRoleName,PodId,NamedClient,NamedWF/WV,Active,PodName,ManagerName,EmployeeID,HireDate,HRManagerFullName,HR-ManagerObjectID,HR-ManagerOnBaseUserName,HR-ManagerLastName,HR-ManagerFirstName,BSAPSID,CreatedBy,CreatedDate
0,194728,APRINS,aprins,aprins@spendmend.com,Andi,Prins,Andi Prins,,616-257-6362,Audit Supervisor,14499753.0,1,1,1,Gold,Travis Wheeler,459,1998-04-26 20:00:00,Travis Wheeler,194747,TWHEELER,Wheeler,Travis,89,MANAGER,2016-07-06 04:32:19.777
1,194729,BVANGOOR,BVangoor,bvangoor@spendmend.com,Bob,VanGoor,Bob VanGoor,,616-257-6306,Audit Supervisor,14497464.0,1,1,1,Red,Dan Hutchins,202,1993-12-31 19:00:00,Dan Hutchins,194732,DHUTCHINS,Hutchins,Dan,91,MANAGER,2016-07-06 04:32:20.107
2,194730,CKRETOWICZ,ckretowicz,ckretowicz@spendmend.com,Colleen,Kretowicz,Colleen Kretowicz,,616-257-6398,Audit Supervisor,14499753.0,1,1,1,Gold,Travis Wheeler,1390,2015-07-05 20:00:00,Travis Wheeler,194747,TWHEELER,Wheeler,Travis,596,MANAGER,2016-07-06 04:32:20.153
3,194731,CALLEN,callen,callen@spendmend.com,Cindy,Allen,Cindy Allen,,616-257-6377,WNC Auditor,,1,1,1,,,1432,2007-01-07 19:00:00,Rob Heminger,12611480,RHEMINGER,Heminger,Rob,402,MANAGER,2016-07-06 04:32:20.200
4,194732,DHUTCHINS,dhutchins,dhutchins@spendmend.com,Dan,Hutchins,Dan Hutchins,,616-257-6317,Audit Manager,14497464.0,1,1,1,Red,Dan Hutchins,206,1995-09-17 20:00:00,Joshua Kiel,194739,JKIEL1,Kiel,Joshua,100,MANAGER,2016-07-06 04:32:20.230


In [297]:
# Project the employee columns under upper-case names (e-mail lower-cased)
# and preview five rows.
(
    employees
    .selectExpr([
        'ID as EMPLOYEE_ID',
        'FullName as EMPLOYEE_NAME',
        'PrimaryRoleName as ROLE',
        'lower(Email) as EMAIL',
        'ManagerName as MANAGER_NAME',
        'PodName as TEAM',
    ])
    .limit(5)
    .toPandas()
)

Unnamed: 0,EMPLOYEE_ID,EMPLOYEE_NAME,ROLE,EMAIL,MANAGER_NAME,TEAM
0,194728,Andi Prins,Audit Supervisor,aprins@spendmend.com,Travis Wheeler,Gold
1,194729,Bob VanGoor,Audit Supervisor,bvangoor@spendmend.com,Dan Hutchins,Red
2,194730,Colleen Kretowicz,Audit Supervisor,ckretowicz@spendmend.com,Travis Wheeler,Gold
3,194731,Cindy Allen,WNC Auditor,callen@spendmend.com,,
4,194732,Dan Hutchins,Audit Manager,dhutchins@spendmend.com,Dan Hutchins,Red


In [276]:
# Activity users whose name equals no employee FullName, most frequent first.
(
    activities
    .filter('ActivityUser is not null')
    .join(
        employees,
        how='left_anti',
        on=activities['ActivityUser'] == employees['FullName']
    )
    .groupBy('ActivityUser')
    .count()
    .sort(col('count').desc())
    .show(truncate=False)
)

+----------------------+------+
|ActivityUser          |count |
+----------------------+------+
|OnBase                |370056|
|System                |231924|
|Workflow Service      |149561|
|ExcelUpdate           |36347 |
|fbishop               |30904 |
|callen                |28245 |
|bwilhoit              |21622 |
|croark                |15059 |
|Cynthia Allen         |14206 |
|cbrooks               |9179  |
|Jeanette Conway       |8179  |
|jrinehart             |6995  |
|bwilliams1            |6953  |
|mcullen               |6545  |
|Jessica Johnston Macro|6432  |
|William Wilhoit       |4820  |
|ctaylor               |4811  |
|Melissa Cullen        |4598  |
|jdagher               |4283  |
|ykahssai              |4237  |
+----------------------+------+
only showing top 20 rows



In [281]:
# Alternative lookup by name, kept for reference:
# employees.where(col('FullName').contains('Jessica')).toPandas()

# Employees whose Email contains the substring 'bwilliams1'.
employees.filter(col('Email').contains('bwilliams1')).toPandas()

Unnamed: 0,ID,OnBaseUserName,BSAPUserName,Email,FirstName,LastName,FullName,Title,Phone,PrimaryRoleName,PodId,NamedClient,NamedWF/WV,Active,PodName,ManagerName,EmployeeID,HireDate,HRManagerFullName,HR-ManagerObjectID,HR-ManagerOnBaseUserName,HR-ManagerLastName,HR-ManagerFirstName,BSAPSID,CreatedBy,CreatedDate
0,26528878,bwilliams1,bwilliams1,bwilliams1@spendmend.com,Becky,Williams,Becky Williams,,(616)257-6324,WNC Auditor,,1,1,1,,,2441,2020-07-19 20:00:00,Cindy Allen,194731,CALLEN,Allen,Cindy,887,ZAUBUCHON,2020-07-16 05:30:08.130


In [258]:
# Employee head-count per PodName (null included), largest group first.
(
    employees
    .groupBy('PodName')
    .count()
    .sort(col('count').desc())
    .show()
)

+-------+-----+
|PodName|count|
+-------+-----+
|   null|  368|
| Orange|   16|
|   Blue|   16|
|    Red|   15|
|   Gold|   15|
|   Grey|   14|
| Purple|   10|
|  Black|    7|
+-------+-----+



In [262]:
# Role distribution among employees that have a PodName; ties on the count
# are ordered by role name.
(
    employees
    .filter('PodName is not null')
    .groupBy('PrimaryRoleName')
    .count()
    .sort(col('count').desc(), 'PrimaryRoleName')
    .show()
)

+--------------------+-----+
|     PrimaryRoleName|count|
+--------------------+-----+
|    Audit Supervisor|   21|
|       Project Audit|   21|
|    Statement Caller|   16|
|     Statement Audit|   10|
|       Audit Manager|    8|
|     Project Audit 2|    8|
|       Credit Caller|    3|
|Intern/Project Audit|    3|
| Sales and Marketing|    1|
|     Service Account|    1|
|     Statement Admin|    1|
+--------------------+-----+



In [249]:
# (
#     emails
#     .groupBy('MAILMessageID')
#     .agg(approxCountDistinct('MAILAttachmentCount').alias('count'))
#     .where('count > 1')
#     .show(truncate=False)
#     # .selectExpr('max(length(MAILSubject))').first()[0]
#     # .where('MAILMessageID is not null')
#     # .count()
# )

In [253]:
# (
#     statements
#         .where('EmailMessageID is not null')
#     .join(
#         emails,
#         on=emails['MAILMessageID'] == statements['EmailMessageID'],
#         how='left_anti'
#     )
#     .count()
# )

In [217]:
# (
#     emails
#     .join(
#         statements,
#         on=emails['MAILMessageID'] == statements['EmailMessageID'],
#         how='left_semi'
#     )
#     .groupBy('MAILMessageID')
#     .count()
#     .orderBy(desc('count'))
#     .show(5, truncate=False)
# )

In [76]:
# `approxCountDistinct` is deprecated in favour of `approx_count_distinct`;
# importing it here keeps the cell runnable on its own.
from pyspark.sql.functions import approx_count_distinct

# List the message IDs that appear with more than one distinct MAILDateTime
# (the count is approximate). Rows without a message ID are ignored.
(
    emails
    .where('MAILMessageID is not null')
    .groupBy('MAILMessageID')
    .agg(approx_count_distinct('MAILDateTime').alias('count'))
    .where('count > 1')
    .show(truncate=False)
)

141

In [77]:
%%time
from pyspark.sql import Window
from pyspark.sql.functions import date_trunc, desc, row_number

window = Window.partitionBy('MAILMessageID').orderBy(desc('DateCreated'))


(
    emails
    .withColumn('_row_number', row_number().over(window))
    .where('_row_number = 1')
    .select(
        'MAILMessageID',
        date_trunc('second', 'MAILDateTime').alias('MAILDateTime')
    )
    .write
    .mode('overwrite')
    .parquet('/tmp/phone-calls/_email')
)

_emails = spark.read.parquet('/tmp/phone-calls/_email')
_emails.limit(5).toPandas()

CPU times: user 20.5 ms, sys: 4.96 ms, total: 25.5 ms
Wall time: 6.07 s


Unnamed: 0,MAILMessageID,MAILDateTime
0,00000000C477DA00223F534CAC4DB644DFC12E48070057...,2019-08-27 11:24:54
1,00000000C477DA00223F534CAC4DB644DFC12E48070057...,2019-09-05 15:04:31
2,00000000C477DA00223F534CAC4DB644DFC12E48070057...,2020-01-23 15:31:47
3,00000000C477DA00223F534CAC4DB644DFC12E48070057...,2020-05-27 14:53:33
4,00000000C477DA00223F534CAC4DB644DFC12E48070057...,2020-09-16 13:28:10


#### Number of Matches

In [336]:
# Count the outbound calls for which a 'Called Vendor' activity exists with
# the same caller name, calendar day and callee phone number.
(
    calls
    .filter(col('direction') == 'outbound')
    .withColumn('caller_date', col('date_time').cast('date'))
    .join(
        (
            activities
            .filter(col('ActivityType') == 'Called Vendor')
            # Attach the vendor contact's phone number as callee_number.
            .join(
                (
                    _vendors
                    .withColumnRenamed('Phone', 'callee_number')
                    .select('ObjectID', 'callee_number')
                ),
                how='left',
                on=activities['VendorContactObjectID'] == _vendors['ObjectID']
            )
            .withColumnRenamed('ActivityUser', 'caller_name')
            .withColumn('caller_date', col('CreatedDate').cast('date'))
        ),
        how='left_semi',
        on=['caller_name', 'caller_date', 'callee_number']
    )
    .count()
    # .groupBy('caller_name', 'caller_date')
    # .count()
    # .orderBy(desc('count'))
    # .show()
)

1367

In [None]:
# Stop the Spark session created at the top of the notebook; cells that use
# `spark` cannot be re-run afterwards without creating a new session.
spark.stop()