In [1]:
from pyspark.sql import SparkSession
from pandas import set_option

set_option('display.max.columns', None)

spark = SparkSession.builder.getOrCreate()

In [186]:
from pyspark.sql import Column
from pyspark.sql.functions import col, udf, when
from typing import Optional
import phonenumbers


@udf(returnType='string')
def parsePhoneNumber(number: str, region: str='US') -> Optional[str]:
    try:
        phoneNumber = phonenumbers.parse(number, region)
        return phonenumbers.format_number(phoneNumber, phonenumbers.PhoneNumberFormat.NATIONAL)
    except:
        return None


def matchPattern(colName: str, pattern: str) -> Column:
    column = col(colName) 
    return when(column.rlike(pattern), column)

In [16]:
activities = spark.read.parquet('/tmp/phone-calls/StatementRequestActivityRecords')
activities.createOrReplaceTempView('activities')
activities.limit(5).toPandas()

Unnamed: 0,ObjectID,CreatedDate,ReferenceNumber,CustomerVendorName,JobNumber,JobName,ContactType,ActivityUser,ActivityDate,FollowUpDate,STNID,Notes,Outcome,ActivityType,CustVendorObjectID,VendorContactObjectID,StatementRequestObjectID,CreatedBy,MessageID
0,42373858,2022-02-14 10:29:14.410,1204872,ONTARGETJOBS INC,3056,Sutter Health - 3056,,Workflow Service,2022-02-14,NaT,,,Sent Authorization Letter,,15863298,31446300,42372122,SVC_WORKFLOW,lo0kpojVRpK63Q0-tZt9FQ
1,42373853,2022-02-14 10:29:08.107,1281223,CARDIVA MEDICAL INC,3320,Dignity Health - 3320,,Deborah Murphy,2022-02-14,NaT,,,,,26215461,29600706,36587007,DMURPHY,
2,42373905,2022-02-14 10:30:21.083,1331383,"EDGE INFORMATION MANAGEMENT, INC.",4056,"Health First, Inc. - 4056",,Jessica Rinehart,2022-02-14,NaT,,,,,42352516,42355958,42357889,JRINEHART,
3,42374642,2022-02-14 10:43:59.057,1242775,FAVORITE NURSES FAVORITE TEMPS,3319,Catholic Health Initiatives - 3319,,Deborah Murphy,2022-02-14,NaT,,,,,3490272,31401710,37795162,DMURPHY,
4,42374788,2022-02-14 10:45:59.810,1324919,ARTHREX,4053,Hospital Sisters Health System - 4053,,Dulce Reyes-Loredo,2022-02-14,NaT,,,,,41967124,41967126,41967127,DRLOREDO,


In [4]:
vendors = spark.read.parquet('/tmp/phone-calls/VendorContacts')
vendors.limit(5).toPandas()

Unnamed: 0,CustVendorGroupNo,CustVendorNo,ContactType,TypePreferredContact,LastName,FirstName,FullName,Email,Phone,Fax,Title,Note,ExternalID,BSAPVCID,CID,CustomerName,CVObjectID,CustObjectID,ObjectID,CreatedDate,CreatedBy,BSAPVendorObjectID
0,52891,1192,Accounts Receivable,,,,,InvoiceInquiries@PremierInc.com,,,,,,,15249,Stanly Regional,412893,194771,1016896,2016-08-15 08:27:25.537,DKUESTER,1016896
1,50771,650,Accounts Receivable,,,,,Tabbie.Alvarado@henryschein.com,,,,,,,15249,Stanly Regional,408624,194771,1016899,2016-08-15 08:33:35.630,DKUESTER,1016899
2,53889,1521,Accounts Receivable,,,,,theracomar@icsconnect.com,(888) 882-9942,,,,,,15249,Stanly Regional,414897,194771,1016843,2016-08-10 13:30:07.430,DKUESTER,1016843
3,80931,782,Accounts Receivable,,Hanrahan,Lynne,Lynne Hanrahan,Lynne.hanrahan@esi.net,(757) 217-1381,,Assistant Controller,,,,2406,Sentara Healthcare,522962,515060,1017021,2016-08-24 11:01:55.053,DKUESTER,1017021
4,77228,103762,Accounts Receivable,,,,,scoleman@ajccpas.com,(804) 347-8839,,,,,,2406,Sentara Healthcare,515472,515060,1017024,2016-08-24 11:04:54.023,DKUESTER,1017024


In [187]:
# (
#     vendors
#     .selectExpr(
#         'ObjectID as CONTACT_ID',
#         'CustVendorNo as VENDOR_NUMBER',
#         'CustomerName as COMPANY_NAME',
#         'FullName as CONTACT_NAME',
#         # 'ContactType as CATEGORY',
#         'Phone as PHONE',
#         'Email as EMAIL'
#     )
#     .limit(10)
#     .toPandas()
# )

In [185]:
# # vendors.groupBy('ContactType').count().show()
# (
#     vendors
#     .groupBy('ContactType')
#     .count()
#     .orderBy(desc('count'))
#     .show(truncate=False)
# )

In [188]:
%%time
from pyspark.sql.functions import col, regexp_replace

pattern = r'\(\d+\) \d+-\d+' # (987) 654-3210

# TODO: while import from database?
(
    vendors
    .withColumn('Phone', regexp_replace('Phone', '^([^0-9]+)', ''))
    .withColumn('Phone', regexp_replace('Phone', '\s+(?=[^0-9])(.*)', ''))
    .withColumn('Phone', parsePhoneNumber('Phone'))
    .withColumn('Phone', matchPattern('Phone', pattern))
    # .where(col('Phone').rlike(pattern))
    .selectExpr(
        'ObjectID as CONTACT_ID',
        'CustVendorNo as VENDOR_NUMBER',
        'CustomerName as COMPANY_NAME',
        'FullName as CONTACT_NAME',
        # 'ContactType as CATEGORY',
        'Phone as PHONE',
        'Email as EMAIL'
    )
    .write
    .mode('overwrite')
    .parquet('/tmp/phone-calls/_vendors')
)

CPU times: user 12.5 ms, sys: 3.38 ms, total: 15.8 ms
Wall time: 1min


In [189]:
_vendors = spark.read.parquet('/tmp/phone-calls/_vendors')
_vendors.createOrReplaceTempView('vendors')
_vendors.limit(5).toPandas()

Unnamed: 0,CONTACT_ID,VENDOR_NUMBER,COMPANY_NAME,CONTACT_NAME,PHONE,EMAIL
0,1016896,1192,Stanly Regional,,,InvoiceInquiries@PremierInc.com
1,1016899,650,Stanly Regional,,,Tabbie.Alvarado@henryschein.com
2,1016843,1521,Stanly Regional,,(888) 882-9942,theracomar@icsconnect.com
3,1017021,782,Sentara Healthcare,Lynne Hanrahan,(757) 217-1381,Lynne.hanrahan@esi.net
4,1017024,103762,Sentara Healthcare,,(804) 347-8839,scoleman@ajccpas.com


In [199]:
# calls = spark.read.parquet('/tmp/phone-calls/AccountCallLogs')
# calls.createOrReplaceTempView('calls')
# calls.limit(5).toPandas()

# remove 0 as will affect aggregation
column = when(col('duration') > 0, col('duration'))

columns = [
    'id',
    'call_type',
    'caller_number',
    'caller_name',
    'callee_number',
    'callee_name',
    'direction',
    'duration',
    'result',
    'date_time'
]

(
    calls
    .withColumn('duration', column)
    .select(*columns)
    .limit(5)
    .toPandas()
)

Unnamed: 0,id,call_type,caller_number,caller_name,callee_number,callee_name,direction,duration,result,date_time
0,34bfc3eb-7180-4c83-b61b-345d084f72b1,pstn,,Nicole Thompson,(828) 737-7552,,outbound,,Call Cancel,2022-05-11 19:36:35
1,d3d79bc4-22cd-4936-9889-e10f815f45ba,voip,(616) 208-1843,Amerisave,,Main Auto Receptionist,inbound,,No Answer,2022-05-11 19:28:25
2,2b8ca4c8-7adc-436a-a2b5-17b0aea69a20,voip,(516) 565-8083,,,Main Auto Receptionist,inbound,,No Answer,2022-05-11 19:21:14
3,fbc302fc-4082-4cb5-bb06-f99b603f3392,voip,(231) 265-6202,2312656202,,Jessica Osborn,inbound,,No Answer,2022-05-11 19:00:02
4,a0d06ee4-4472-40f6-a793-2edadca307f6,voip,(702) 228-0222,RR Partners,,Jessica Caggiano,inbound,,No Answer,2022-05-11 18:47:33


In [191]:
# (
#     calls
#     .where(col('direction') == 'outbound')
#     .withColumn('difference', col('call_end_time').cast('long') - col('date_time').cast('long'))
#     .select('date_time', 'call_end_time', 'duration', 'difference')
#     .limit(5)
#     .toPandas()
# )

In [200]:
# (
#     calls
#     .where(col('direction') == 'inbound')
#     .withColumn('difference', col('call_end_time').cast('long') - col('date_time').cast('long'))
#     .select('date_time', 'call_end_time', 'duration', 'difference')
#     .limit(5)
#     .toPandas()
# )

In [143]:
# (
#     calls
#     .where(col('direction') == 'inbound')
#     .where(col('duration') == 0)
#     # .where('call_end_time is null')
#     # .count()
#     .limit(5)
#     .toPandas()
# )

In [142]:
# (
#     calls
#     .where(col('direction') == 'outbound')
#     .groupBy('result', 'duration')
#     .count()
#     .orderBy(desc('count'))
#     .show()
# )

In [141]:
# (
#     calls
#     .where(col('direction') == 'outbound')
#     .groupBy('result', 'duration')
#     .count()
#     .orderBy(desc('count'))
#     .show()
# )

In [196]:
# (
#     calls
#     .where(col('result') == 'Call connected')
#     .where(col('duration') == 0)
#     .withColumn('difference', col('call_end_time').cast('long') - col('date_time').cast('long'))
#     .select('direction', 'date_time', 'call_end_time', 'duration', 'difference')
#     .limit(10)
#     .toPandas()
# )

In [113]:
# TODO: want only Call connected or average all attempts?
(
    calls
    .groupBy('result')
    .count()
    .show()
)

+--------------+-----+
|        result|count|
+--------------+-----+
|   Call Cancel|  120|
|     No Answer|  199|
|Call connected| 2199|
|      Rejected|   10|
|   Call failed|    2|
+--------------+-----+



In [51]:
# TODO: make NULL?
calls.where(col('duration') == 0).count() # significant amount

343

In [124]:
%%time

query = """
SELECT
    activities.ObjectID,
    activities.StatementRequestObjectID,
    calls.id,
    date_trunc('second', CreatedDate) as CreatedDate,
    calls.date_time,
    ActivityUser,
    calls.caller_name,
    ActivityType,
    calls.direction,
    vendors.Phone,
    -- calls.callee_number,
    vendors.CustomerName as callee_name,
    calls.duration,
    abs(
        cast(CreatedDate as long)
        - cast(date_time as long)
    ) as difference
FROM
    activities
        INNER JOIN
            vendors ON
                activities.VendorContactObjectID = vendors.ObjectID
        INNER JOIN
            calls ON
                activities.ActivityUser = calls.caller_name
                AND cast(activities.CreatedDate as date) = cast(calls.date_time as date)
                AND split(vendors.Phone, ' ')[0] = split(calls.callee_number, ' ')[0]
                AND split(vendors.Phone, ' ')[1] = split(calls.callee_number, ' ')[1]
                
                AND activities.ActivityType = 'Called Vendor'
                AND calls.direction = 'outbound'
"""

from pyspark.sql import Window
from pyspark.sql.functions import row_number

window = Window.partitionBy('id').orderBy('difference')

(
    spark
    .sql(query)
    .withColumn('_row_number', row_number().over(window))
    .where(col('difference') < 600)
    .write
    .mode('overwrite')
    .parquet('/tmp/phone-calls/outbound')
)


outbound = spark.read.parquet('/tmp/phone-calls/outbound')
outbound.limit(5).toPandas()

CPU times: user 30.6 ms, sys: 2.53 ms, total: 33.1 ms
Wall time: 6.71 s


Unnamed: 0,ObjectID,StatementRequestObjectID,id,CreatedDate,date_time,ActivityUser,caller_name,ActivityType,direction,Phone,callee_name,duration,difference,_row_number
0,47836897,46365544,09a20e02-8bb2-424d-aa7e-33bd24c42856,2022-05-10 13:24:44,2022-05-10 13:25:14,Josephine Dagher,Josephine Dagher,Called Vendor,outbound,(901) 730-5718,Martin Memorial Health Systems,68,30,1
1,47839093,46000154,151d7c0b-541e-4748-8e6c-704c0001a079,2022-05-10 15:18:48,2022-05-10 15:18:36,Deborah Murphy,Deborah Murphy,Called Vendor,outbound,(415) 923-9376,Dignity Health,64,12,1
2,47855622,47851855,1ea9f7a7-10da-4b71-a649-7b1c149f9b6e,2022-05-11 10:35:37,2022-05-11 10:31:57,Yodit Kahssai,Yodit Kahssai,Called Vendor,outbound,(800) 438-4810,Community Health Systems,214,220,1
3,47858975,47011657,1ff4388b-3184-440e-aa6d-6ee1ee88cdf4,2022-05-11 12:07:15,2022-05-11 12:07:35,Jessica Caggiano,Jessica Caggiano,Called Vendor,outbound,(727) 545-2800,Envision Healthcare Corporation,111,20,1
4,47863495,38708447,22ea29fa-b7ee-4ea6-a2c1-50482c66e0a8,2022-05-11 15:53:18,2022-05-11 15:51:35,Yodit Kahssai,Yodit Kahssai,Called Vendor,outbound,(907) 279-2500,Community Health Systems,125,103,1


In [70]:
%%time

query = """
SELECT
    activities.ObjectID,
    activities.StatementRequestObjectID,
    calls.id,
    date_trunc('second', activities.CreatedDate) as CreatedDate,
    calls.date_time,
    ActivityUser,
    -- calls.callee_name,
    vendors.CustomerName as caller_name,
    ActivityType,
    calls.direction,
    vendors.Phone,
    -- calls.caller_number,
    -- vendors.CustomerName,
    calls.callee_name,
    calls.duration,
    abs(
        cast(CreatedDate as long)
        - cast(date_time as long)
    ) as difference
FROM
    activities
        INNER JOIN
            vendors ON
                activities.VendorContactObjectID = vendors.ObjectID
        INNER JOIN
            calls ON
                activities.ActivityUser = calls.callee_name
                AND cast(activities.CreatedDate as date) = cast(calls.date_time as date)
                AND split(vendors.Phone, ' ')[0] = split(calls.caller_number, ' ')[0]
                AND split(vendors.Phone, ' ')[1] = split(calls.caller_number, ' ')[1]

                AND activities.ActivityType = 'Received Call / Email'
                AND calls.direction = 'inbound'
"""

from pyspark.sql import Window
from pyspark.sql.functions import row_number

window = Window.partitionBy('id').orderBy('difference')

(
    spark
    .sql(query)
    .withColumn('_row_number', row_number().over(window))
    .where(col('difference') < 600)
    .write
    .mode('overwrite')
    .parquet('/tmp/phone-calls/inbound')
)


inbound = spark.read.parquet('/tmp/phone-calls/inbound')
inbound.limit(5).toPandas()

CPU times: user 38.6 ms, sys: 4.51 ms, total: 43.1 ms
Wall time: 4.27 s


Unnamed: 0,ObjectID,StatementRequestObjectID,id,CreatedDate,date_time,ActivityUser,caller_name,ActivityType,direction,Phone,callee_name,duration,difference,_row_number
0,47773225,46028922,1d13c825-7bb1-4e0d-bd5a-449dec5bbd9b,2022-05-09 11:07:12,2022-05-09 11:04:10,Yodit Kahssai,The Johns Hopkins Health System Corporation,Received Call / Email,inbound,(410) 494-9200,Yodit Kahssai,170,182,1
1,47771759,46888478,0f97fba5-3603-4bf8-8b50-a0c997932a01,2022-05-09 10:20:42,2022-05-09 10:17:22,Xavier Baron,Duke University Health System,Received Call / Email,inbound,(919) 209-2404,Xavier Baron,0,200,1
2,47781354,46887063,d1479e47-a39c-4ddb-9fe5-7d3404eeded0,2022-05-09 16:31:50,2022-05-09 16:27:39,Xavier Baron,Duke University Health System,Received Call / Email,inbound,(919) 609-9493,Xavier Baron,0,251,1
3,47839484,45441553,f9fadf3e-d118-4755-b1f1-7f27d5b9b354,2022-05-10 15:47:29,2022-05-10 15:45:31,Briana Louck,Cambridge Health Alliance,Received Call / Email,inbound,(617) 349-4753,Briana Louck,83,118,1
4,47838906,45441553,7f8e11be-d6f5-404d-84f4-6018cc3580d2,2022-05-10 15:03:37,2022-05-10 15:02:42,Briana Louck,Cambridge Health Alliance,Received Call / Email,inbound,(617) 349-4753,Briana Louck,239,55,1


In [89]:
# from pyspark.sql.functions import desc

# (
#     outbound
#     .groupBy('StatementRequestObjectID')
#     .count()
#     .orderBy(desc('count'))
#     .show()
# )

In [177]:
from pyspark.sql.functions import date_format

# activity.ObjectID
(
    outbound
    .withColumn('_hour', date_format('CreatedDate', 'a hh'))
    .select('CreatedDate', '_hour')
    .groupBy('_hour')
    .count()
    .orderBy('_hour')
    .show()
)

+-----+-----+
|_hour|count|
+-----+-----+
|AM 08|   59|
|AM 09|  117|
|AM 10|  185|
|AM 11|  190|
|PM 01|  171|
|PM 02|  184|
|PM 03|  150|
|PM 04|  100|
|PM 05|   35|
|PM 06|    4|
|PM 12|  135|
+-----+-----+



In [116]:
columns = {
    'StatementRequestObjectID',
    'ObjectID',
    'CreatedDate',
    'SRARObjectID',
    'SRARReferenceNumber',
    'CustomerVendorName',
    'JobNumber',
    'JobName',
    'ContactType',
    'ActivityUser',
    'ActivityDate',
    'FollowUpDate',
    'STNID',
    'Notes',
    'Outcome',
    'ActivityType',
    'CustVendorObjectID',
    'VendorContactObjectID',
    'CreatedBy',
    'MessageID',
    'SRARMessageID',
    #   'ReferenceNumber',
    'NumberOfStatementsReceived',
    'CustomerName',
    'VendorNo',
    'WNC', # will not comply
    'StatementWNC',
    'RequestDate',
    'StatementRequestReferenceNumber',
    'Status',
    'RequestMethod',
    'RequestType',
    'Contact',
    'RequesterFullName',
    'LastActivityDate',
    'LastStatementReceivedDate',
    'CurrentAssigneeID',
    'CurrentAssigneeName',
    'EmailMessageID',
    'MAILDateTime'
}

#### Requests

In [67]:
requests = spark.read.parquet('/tmp/phone-calls/StatementRequests')
requests.limit(5).toPandas()

Unnamed: 0,JobNo,JobTier,CustomerName,CustVendorID,VendorNo,CustVendorGroupID,WNC,StatementWNC,VendorGroupName,Volume,VolumeTier,VolumeLast12,RequestDate,ReferenceNumber,Status,RequestMethod,RequestType,Contact,RequesterFullName,RequestText,LastActivityDate,LastStatementReceivedDate,CreatedDate,CallsheetNo,ObjectID,CallerStatus,ReconStatus,CurrentAssigneeID,CurrentAssigneeName,EnteredReconDate,LastReconQueueName,LastReconQueueEntryDate,AccountsReceived,AccountsRequested,MessageID,VendorContactObjectID,WebsiteVendor,WNCSpecialHandling,NeedLeadVendor,VendorGroupPrimaryAccountType
0,,,,,,,,,,,,,NaT,,New,,Caller,,,,NaT,NaT,2022-05-05 15:41:16.783,,47597380,,,,,NaT,,NaT,,,,,,,,
1,3427.0,,New Hanover Regional Medical Center,415960.0,16845.0,415959.0,,,A M COFFEE DISTRIBUTORS LLC,0.0,B,0.0,2020-07-24,1285660.0,Superceded,MassEmail,,amcoffeedist@gmail.com,Jessica Rinehart,,2020-07-24,NaT,2020-07-24 16:29:52.840,C-338914,26681846,,,19272454.0,Yodit Kahssai,NaT,,NaT,,,,,,,,
2,3427.0,,New Hanover Regional Medical Center,415960.0,16845.0,415959.0,,,A M COFFEE DISTRIBUTORS LLC,0.0,B,0.0,2020-07-31,1285660.0,Superceded,MassEmail,Mass 1,amcoffeedist@gmail.com,Jessica Rinehart,,2020-07-31,NaT,2020-07-31 08:05:15.817,C-338914,26819776,,,19272454.0,Yodit Kahssai,NaT,,NaT,,,,,,,,
3,3427.0,,New Hanover Regional Medical Center,415960.0,16845.0,415959.0,,,A M COFFEE DISTRIBUTORS LLC,0.0,B,0.0,2021-01-19,1285660.0,Superceded,MassEmail,Caller,amcoffeedist@gmail.com,Jessica Rinehart,,2020-07-31,NaT,2021-01-19 12:01:21.287,C-338914,30690711,Sent Authorization Letter,,8398263.0,Rebekah Dykema,NaT,,NaT,,,,29510277.0,,,,
4,3427.0,,New Hanover Regional Medical Center,415960.0,16845.0,415959.0,,,A M COFFEE DISTRIBUTORS LLC,0.0,B,0.0,2021-08-19,1285660.0,Superceded,MassEmail,Mass 1,amcoffeedist@gmail.com,Cindy Allen,,2021-08-23,NaT,2021-08-23 00:03:52.047,C-338914,36365526,Sent Authorization Letter,,8398263.0,Rebekah Dykema,NaT,,NaT,,,hONrG5xpTUyiQB4Rs7gb0g,29510277.0,,,,


In [120]:
# drop(
#     'WNC',
#     'StatementWNC',
#     'CreatedDate',
#     'CurrentAssigneeID',
#     'MessageID'
# )
requestColumns = [column for column in requests.columns if column in columns]

requests.select(*requestColumns).limit(5).toPandas()

Unnamed: 0,CustomerName,VendorNo,WNC,StatementWNC,RequestDate,Status,RequestMethod,RequestType,Contact,RequesterFullName,LastActivityDate,LastStatementReceivedDate,CreatedDate,ObjectID,CurrentAssigneeID,CurrentAssigneeName,MessageID,VendorContactObjectID
0,,,,,NaT,New,,Caller,,,NaT,NaT,2022-05-05 15:41:16.783,47597380,,,,
1,New Hanover Regional Medical Center,16845.0,,,2020-07-24,Superceded,MassEmail,,amcoffeedist@gmail.com,Jessica Rinehart,2020-07-24,NaT,2020-07-24 16:29:52.840,26681846,19272454.0,Yodit Kahssai,,
2,New Hanover Regional Medical Center,16845.0,,,2020-07-31,Superceded,MassEmail,Mass 1,amcoffeedist@gmail.com,Jessica Rinehart,2020-07-31,NaT,2020-07-31 08:05:15.817,26819776,19272454.0,Yodit Kahssai,,
3,New Hanover Regional Medical Center,16845.0,,,2021-01-19,Superceded,MassEmail,Caller,amcoffeedist@gmail.com,Jessica Rinehart,2020-07-31,NaT,2021-01-19 12:01:21.287,30690711,8398263.0,Rebekah Dykema,,29510277.0
4,New Hanover Regional Medical Center,16845.0,,,2021-08-19,Superceded,MassEmail,Mass 1,amcoffeedist@gmail.com,Cindy Allen,2021-08-23,NaT,2021-08-23 00:03:52.047,36365526,8398263.0,Rebekah Dykema,hONrG5xpTUyiQB4Rs7gb0g,29510277.0


In [140]:
(
    requests
    .selectExpr(
        'ObjectID as RequestID',
        'CustomerName',
        'VendorNo as VendorNumber',
        'RequestDate',
        'Status as RequestStatus',
        'RequestMethod',
        'RequestType',
        'Contact as RequestContact',
        'RequesterFullName',
        'LastActivityDate',
        'LastStatementReceivedDate',
        'CurrentAssigneeName'
    )
    .limit(5)
    .toPandas()
)

Unnamed: 0,RequestID,CustomerName,VendorNumber,RequestDate,RequestStatus,RequestMethod,RequestType,RequestContact,RequesterFullName,LastActivityDate,LastStatementReceivedDate,CurrentAssigneeName,VendorContactID
0,47597380,,,NaT,New,,Caller,,,NaT,NaT,,
1,26681846,New Hanover Regional Medical Center,16845.0,2020-07-24,Superceded,MassEmail,,amcoffeedist@gmail.com,Jessica Rinehart,2020-07-24,NaT,Yodit Kahssai,
2,26819776,New Hanover Regional Medical Center,16845.0,2020-07-31,Superceded,MassEmail,Mass 1,amcoffeedist@gmail.com,Jessica Rinehart,2020-07-31,NaT,Yodit Kahssai,
3,30690711,New Hanover Regional Medical Center,16845.0,2021-01-19,Superceded,MassEmail,Caller,amcoffeedist@gmail.com,Jessica Rinehart,2020-07-31,NaT,Rebekah Dykema,29510277.0
4,36365526,New Hanover Regional Medical Center,16845.0,2021-08-19,Superceded,MassEmail,Mass 1,amcoffeedist@gmail.com,Cindy Allen,2021-08-23,NaT,Rebekah Dykema,29510277.0


In [131]:
activityColumns = [column for column in activities.columns if column in columns]
activities.select(*activityColumns).limit(5).toPandas()

Unnamed: 0,ObjectID,CreatedDate,CustomerVendorName,JobNumber,JobName,ContactType,ActivityUser,ActivityDate,FollowUpDate,STNID,Notes,Outcome,ActivityType,CustVendorObjectID,VendorContactObjectID,StatementRequestObjectID,CreatedBy,MessageID
0,42373858,2022-02-14 10:29:14.410,ONTARGETJOBS INC,3056,Sutter Health - 3056,,Workflow Service,2022-02-14,NaT,,,Sent Authorization Letter,,15863298,31446300,42372122,SVC_WORKFLOW,lo0kpojVRpK63Q0-tZt9FQ
1,42373853,2022-02-14 10:29:08.107,CARDIVA MEDICAL INC,3320,Dignity Health - 3320,,Deborah Murphy,2022-02-14,NaT,,,,,26215461,29600706,36587007,DMURPHY,
2,42373905,2022-02-14 10:30:21.083,"EDGE INFORMATION MANAGEMENT, INC.",4056,"Health First, Inc. - 4056",,Jessica Rinehart,2022-02-14,NaT,,,,,42352516,42355958,42357889,JRINEHART,
3,42374642,2022-02-14 10:43:59.057,FAVORITE NURSES FAVORITE TEMPS,3319,Catholic Health Initiatives - 3319,,Deborah Murphy,2022-02-14,NaT,,,,,3490272,31401710,37795162,DMURPHY,
4,42374788,2022-02-14 10:45:59.810,ARTHREX,4053,Hospital Sisters Health System - 4053,,Dulce Reyes-Loredo,2022-02-14,NaT,,,,,41967124,41967126,41967127,DRLOREDO,


In [None]:
(
    activities
    .selectExpr(
        'ObjectID as ActivityID',
        'CreatedDate as ActivityCreatedDate',
        'CustomerVendorName',
        'JobNumber',
        'JobName',
        'ContactType',
        'ActivityUser',
        'ActvityDate',
        
    )
)

In [144]:
statements = spark.read.parquet('/tmp/phone-calls/Statements')
statements.limit(5).toPandas()

Unnamed: 0,ReferenceNumber,JobNo,JobTier,StatementDate,DocumentHandle,ObjectID,CustVendorObjID,CustVendGroupObjID,CustVendGroupName,CustomerName,CID,CustVendName,CustVendNo,Volume,AccountsIdentified,Recon,EmailMessageID,CreatedDate,SRARObjectId,ZeroBalance
0,1224005,3188,1.0,2019-08-14,,20513258,20513253,20513252,OHIO MEDICAL CORPORATION,OhioHealth Corporation,6261,OHIO MEDICAL CORPORATION,40003215,100003.0,,,00000000F643DA057DBD124B829A30CFCF400C1507002B...,2019-08-14 11:18:01.353,,
1,1229524,3244,,2019-08-14,,20513265,20513264,20513263,SCRIPTPRO USA INC,Jackson Health System-CMS,4910,SCRIPTPRO USA INC,108370,738284.0,,,00000000F643DA057DBD124B829A30CFCF400C1507002B...,2019-08-14 11:18:26.637,,
2,1247505,3317,,2019-08-14,,20513268,20513267,20513266,CHROM TECH INC,Mayo Clinic,281,CHROM TECH INC,2846,9108143.52,,,00000000F643DA057DBD124B829A30CFCF400C1507002B...,2019-08-14 11:19:08.073,,
3,1248087,3317,,2019-08-14,,20513273,20513272,20513271,PERFICIENT INC,Mayo Clinic,281,PERFICIENT INC,742853258,949683.0,,,00000000F643DA057DBD124B829A30CFCF400C1507002B...,2019-08-14 11:19:43.393,,
4,1224005,3188,1.0,2019-08-14,,20513254,20513253,20513252,OHIO MEDICAL CORPORATION,OhioHealth Corporation,6261,OHIO MEDICAL CORPORATION,40003215,100003.0,,,00000000F643DA057DBD124B829A30CFCF400C1507002B...,2019-08-14 11:17:48.653,,


In [153]:
(
    activities
    .join(
        statements,
        on=activities['ObjectId'] == statements['SRARObjectId'],
        how='left_semi'
    )
    .count()
)

15188

In [157]:
(
    statements
    .join(
        activities,
        on=activities['ObjectId'] == statements['SRARObjectId'],
        how='left_semi'
    )
    .groupBy('SRARObjectId')
    .count()
    .where('count < 6')
    .orderBy(desc('count'))
    .show(5)
)

+------------+-----+
|SRARObjectId|count|
+------------+-----+
|    36261070|    5|
|    37812810|    5|
|    37207422|    5|
|    38552676|    5|
|    38316534|    5|
+------------+-----+
only showing top 5 rows



In [159]:
# number of statements 
statements.where(col('SRARObjectId') == 37812810).toPandas()

Unnamed: 0,ReferenceNumber,JobNo,JobTier,StatementDate,DocumentHandle,ObjectID,CustVendorObjID,CustVendGroupObjID,CustVendGroupName,CustomerName,CID,CustVendName,CustVendNo,Volume,AccountsIdentified,Recon,EmailMessageID,CreatedDate,SRARObjectId,ZeroBalance
0,1298493,3676,,2021-10-06,17804394,38028732,14671260,14671259,PETNET SOLUTIONS INC,Cleveland Clinic Foundation,6187,PETNET SOLUTIONS INC,122954,2243619.0,1,0,25A4329A-AA9E-4EA6-B5FE-40F5E218CE2A,2021-10-06 09:30:43.637,37812810,
1,1298493,3676,,2021-10-06,17804395,38028735,14671260,14671259,PETNET SOLUTIONS INC,Cleveland Clinic Foundation,6187,PETNET SOLUTIONS INC,122954,2243619.0,1,0,25A4329A-AA9E-4EA6-B5FE-40F5E218CE2A,2021-10-06 09:30:46.387,37812810,
2,1298493,3676,,2021-10-06,17804396,38028738,14671260,14671259,PETNET SOLUTIONS INC,Cleveland Clinic Foundation,6187,PETNET SOLUTIONS INC,122954,2243619.0,1,0,25A4329A-AA9E-4EA6-B5FE-40F5E218CE2A,2021-10-06 09:30:49.967,37812810,
3,1298493,3676,,2021-10-06,17804397,38028740,14671260,14671259,PETNET SOLUTIONS INC,Cleveland Clinic Foundation,6187,PETNET SOLUTIONS INC,122954,2243619.0,1,0,25A4329A-AA9E-4EA6-B5FE-40F5E218CE2A,2021-10-06 09:30:52.553,37812810,
4,1298493,3676,,2021-10-06,17804398,38028743,14671260,14671259,PETNET SOLUTIONS INC,Cleveland Clinic Foundation,6187,PETNET SOLUTIONS INC,122954,2243619.0,1,0,25A4329A-AA9E-4EA6-B5FE-40F5E218CE2A,2021-10-06 09:30:55.800,37812810,


In [171]:
from pyspark.sql.functions import approxCountDistinct

(
    statements
    .where('SRARObjectId is not null')
    .groupBy('SRARObjectId')
    .agg(approxCountDistinct('StatementDate').alias('count'))
    .where('count > 1')
    .orderBy(desc('count'))
    .show(10)
)

+------------+-----+
|SRARObjectId|count|
+------------+-----+
|    37211746|    4|
|    37831408|    3|
|    36329295|    3|
|    36586631|    2|
|    37213781|    2|
|    36577771|    2|
|    37615710|    2|
|    37284906|    2|
|    37326684|    2|
|    37716608|    2|
+------------+-----+
only showing top 10 rows



In [169]:
statements.where(col('SRARObjectId') == 37284906).toPandas()

Unnamed: 0,ReferenceNumber,JobNo,JobTier,StatementDate,DocumentHandle,ObjectID,CustVendorObjID,CustVendGroupObjID,CustVendGroupName,CustomerName,CID,CustVendName,CustVendNo,Volume,AccountsIdentified,Recon,EmailMessageID,CreatedDate,SRARObjectId,ZeroBalance
0,1083044,2820,,2021-09-14,17737120,37380925,2944485,2944484,UNIVERSAL HOSPITAL SERVICES,Spectrum Health,6257,UNIVERSAL HOSPITAL SERVICES,12787,100003.0,0,0,FE1959A4-DC70-434B-9D9F-7FB370443C7A,2021-09-14 14:50:02.920,37284906,
1,1083044,2820,,2021-09-17,17747926,37504289,2944485,2944484,UNIVERSAL HOSPITAL SERVICES,Spectrum Health,6257,UNIVERSAL HOSPITAL SERVICES,12787,100003.0,0,0,6A3DA2FE-E1CB-4257-87EB-EBB7579A0F48,2021-09-17 09:46:20.933,37284906,


In [170]:
activities.where(col('ObjectId') == 37284906).toPandas()

Unnamed: 0,ObjectID,CreatedDate,ReferenceNumber,CustomerVendorName,JobNumber,JobName,ContactType,ActivityUser,ActivityDate,FollowUpDate,STNID,Notes,Outcome,ActivityType,CustVendorObjectID,VendorContactObjectID,StatementRequestObjectID,CreatedBy,MessageID
0,37284906,2021-09-10 15:33:29.383,1083044,UNIVERSAL HOSPITAL SERVICES,2820,Spectrum Health - 2820,,Xavier Baron,2021-09-10,NaT,,"Called Deborah, made a mistake with the email ...",Sent Authorization Letter,Called Vendor,2944485,34518312,36598874,XBARON,YcCAUMzqSyqhlpUAw0VqLw


In [176]:
# statements.where(col('ReferenceNumber') == 1083044).toPandas()

In [175]:
# activities.where(col('ReferenceNumber') == 1083044).limit(10).toPandas()

In [146]:
activities.limit(5).toPandas()

Unnamed: 0,ObjectID,CreatedDate,ReferenceNumber,CustomerVendorName,JobNumber,JobName,ContactType,ActivityUser,ActivityDate,FollowUpDate,STNID,Notes,Outcome,ActivityType,CustVendorObjectID,VendorContactObjectID,StatementRequestObjectID,CreatedBy,MessageID
0,42373858,2022-02-14 10:29:14.410,1204872,ONTARGETJOBS INC,3056,Sutter Health - 3056,,Workflow Service,2022-02-14,NaT,,,Sent Authorization Letter,,15863298,31446300,42372122,SVC_WORKFLOW,lo0kpojVRpK63Q0-tZt9FQ
1,42373853,2022-02-14 10:29:08.107,1281223,CARDIVA MEDICAL INC,3320,Dignity Health - 3320,,Deborah Murphy,2022-02-14,NaT,,,,,26215461,29600706,36587007,DMURPHY,
2,42373905,2022-02-14 10:30:21.083,1331383,"EDGE INFORMATION MANAGEMENT, INC.",4056,"Health First, Inc. - 4056",,Jessica Rinehart,2022-02-14,NaT,,,,,42352516,42355958,42357889,JRINEHART,
3,42374642,2022-02-14 10:43:59.057,1242775,FAVORITE NURSES FAVORITE TEMPS,3319,Catholic Health Initiatives - 3319,,Deborah Murphy,2022-02-14,NaT,,,,,3490272,31401710,37795162,DMURPHY,
4,42374788,2022-02-14 10:45:59.810,1324919,ARTHREX,4053,Hospital Sisters Health System - 4053,,Dulce Reyes-Loredo,2022-02-14,NaT,,,,,41967124,41967126,41967127,DRLOREDO,


In [145]:
# statementColumns = [column for column in statements.columns if column in columns]
# statements.select(*statementColumns).limit(5).toPandas()

(
    statements
    .groupBy('SRARObjectId')
    .count()
    .orderBy(desc('count'))
    .show()
)

+------------+------+
|SRARObjectId| count|
+------------+------+
|        null|447558|
|    36795268|   103|
|    38342291|    92|
|    36444256|    73|
|    37626498|    62|
|    38046700|    51|
|    36776160|    51|
|    38030668|    44|
|    38374579|    44|
|    36544681|    42|
|    36238216|    41|
|    38030582|    40|
|    37375100|    39|
|    36320582|    38|
|    36181648|    37|
|    37909364|    37|
|    38601423|    35|
|    36316433|    31|
|    37520585|    29|
|    37725679|    29|
+------------+------+
only showing top 20 rows



In [129]:
(
    outbound
    .join(
        requests.where(col('RequestType') != 'Caller'),
        on=outbound['StatementRequestObjectID'] == requests['ObjectID'],
        how='left_semi'
    )
    .toPandas()
)

Unnamed: 0,ObjectID,StatementRequestObjectID,id,CreatedDate,date_time,ActivityUser,caller_name,ActivityType,direction,Phone,callee_name,duration,difference,_row_number
0,47779377,43057146,45aba46b-0211-4ad9-acd5-d83ce0af891c,2022-05-09 15:08:25,2022-05-09 15:09:11,Josephine Dagher,Josephine Dagher,Called Vendor,outbound,(800) 800-2726,OhioHealth Corporation,132,46,1
1,47769850,36285346,9e000dee-adde-47da-baf3-72a2906edde3,2022-05-09 09:44:50,2022-05-09 09:45:49,Toni Engle,Toni Engle,Called Vendor,outbound,(757) 321-4469,Sentara Healthcare,98,59,1
2,47836855,39364089,f06fc272-2f30-4d85-873d-3bb10c862c48,2022-05-10 13:19:42,2022-05-10 13:20:29,Toni Engle,Toni Engle,Called Vendor,outbound,(800) 456-9756,Detroit Medical Center,214,47,1
3,47776828,43051117,fe34e470-a3b0-4871-b5a1-237b841fd37a,2022-05-09 12:42:28,2022-05-09 12:42:51,Josephine Dagher,Josephine Dagher,Called Vendor,outbound,(877) 804-0141,OhioHealth Corporation,238,23,1
4,47768795,39779780,d6f761bb-497f-4dd8-90fa-74cf221e2d9d,2022-05-09 09:21:34,2022-05-09 09:23:45,Josephine Dagher,Josephine Dagher,Called Vendor,outbound,(614) 645-3400,OhioHealth Corporation,610,131,1


In [126]:
(
    requests
    .join(
        outbound,
        on=outbound['StatementRequestObjectID'] == requests['ObjectID'],
        how='left_semi'
    )
    # .groupBy('RequestType')
    # .count()
    # .show()
)

+-----------+-----+
|RequestType|count|
+-----------+-----+
|     Caller| 1223|
|     Mass 2|    1|
|     Mass 1|    4|
+-----------+-----+



#### Emails

In [80]:
emails = spark.read.parquet('/tmp/phone-calls/StatementEmailDocs')
emails.limit(5).toPandas()

Unnamed: 0,DocumentHandle,DateCreated,MAILDateTime,MAILFromAddress,MAILToAddress,MAILCcAddress,MAILSubject,MAILMessageID,MAILAttachmentCount,S-Ref#,S-CustomerName,S-Job#,S-VendorName,S-CreatedByUser,S-SkipAutoReceive,S-Recon,S-LargeCredits,IngestionSource,S-StatementExistsInBatch
0,15594200,2019-06-12 09:22:11.947,2019-06-07 19:14:29,,,,RE: STATEMENT REQUEST - FAIRVIEW HEALTH SERVIC...,00000000F643DA057DBD124B829A30CFCF400C1507002B...,11,1230333,,,,,,,,,
1,15595567,2019-06-12 09:28:05.423,2019-06-07 19:14:17,,,,RE: STATEMENT REQUEST (REF # 1190860),00000000F643DA057DBD124B829A30CFCF400C1507002B...,7,1190860,,,,,,,,,
2,15595581,2019-06-12 09:40:03.420,2019-06-07 19:12:18,,,,RE: INFORMATION REQUEST #1237232,00000000F643DA057DBD124B829A30CFCF400C1507002B...,2,1237232,,,,,,,,,
3,15595588,2019-06-12 09:42:47.153,2019-06-07 19:15:58,,,,REF#1157275 STATEMENT FROM PROVATION MEDICAL F...,00000000F643DA057DBD124B829A30CFCF400C1507002B...,2,1157275,,,,,,,,,
4,15595595,2019-06-12 09:49:49.783,2019-06-07 19:15:37,,,,RE: INFORMATION REQUEST #1217277,00000000F643DA057DBD124B829A30CFCF400C1507002B...,2,1217277,,,,,,,,,


In [90]:
%%time
from pyspark.sql import Window
from pyspark.sql.functions import date_trunc, desc, row_number

window = Window.partitionBy('MAILMessageID').orderBy(desc('MAILDateTime'))


(
    emails
    .withColumn('_row_number', row_number().over(window))
    .where('_row_number = 1')
    .select(
        'MAILMessageID',
        date_trunc('second', 'MAILDateTime').alias('MAILDateTime')
    )
    .write
    .mode('overwrite')
    .parquet('/tmp/phone-calls/_email')
)

_emails = spark.read.parquet('/tmp/phone-calls/_email')
_emails.limit(5).toPandas()

CPU times: user 16.2 ms, sys: 4.16 ms, total: 20.4 ms
Wall time: 5.7 s


Unnamed: 0,MAILMessageID,MAILDateTime
0,00000000C477DA00223F534CAC4DB644DFC12E48070057...,2019-08-27 11:24:54
1,00000000C477DA00223F534CAC4DB644DFC12E48070057...,2019-09-05 15:04:31
2,00000000C477DA00223F534CAC4DB644DFC12E48070057...,2020-01-23 15:31:47
3,00000000C477DA00223F534CAC4DB644DFC12E48070057...,2020-05-27 14:53:33
4,00000000C477DA00223F534CAC4DB644DFC12E48070057...,2020-09-16 13:28:10


#### Number of Matches

In [336]:
(
    calls
    .where(col('direction') == 'outbound')
    .withColumn('caller_date', col('date_time').cast('date'))
    .join(
        activities
            .where(col('ActivityType') == 'Called Vendor')
            .join(
                _vendors
                    .withColumnRenamed('Phone', 'callee_number')
                    .select('ObjectID', 'callee_number'),
                on=activities['VendorContactObjectID'] == _vendors['ObjectID'],
                how='left'
            )
            .withColumnRenamed('ActivityUser', 'caller_name')
            .withColumn('caller_date', col('CreatedDate').cast('date')),
        on=['caller_name', 'caller_date', 'callee_number'],
        how='left_semi'
    )
    .count()
    # .groupBy('caller_name', 'caller_date')
    # .count()
    # .orderBy(desc('count'))
    # .show()
)

1367

In [None]:
spark.stop()