### Notes
- what driving receipt of statement
- only need a human when someone picks up the phone


#### Random Forest Requests on target Status
- CallerStatus: 1
- CustomerName: 2
- RequestType: 3
- RequesterFullName: 4
- CurrentAssigneeName: 5
- Contact: 6
- RequestMethod: 7
- Volume: 8

In [4]:
from pyspark.sql import SparkSession

spark = (
    SparkSession
    .builder
    .master('local[*]')
    .appName('pyspark-shell')
    .getOrCreate()
)

sc = spark.sparkContext
# sc.defaultParallelism

# # spark.conf.set('spark.sql.session.timeZone', 'UTC')
# # spark.conf.set('spark.sql.session.timeZone', 'America/New_York')

In [6]:
import sys
import os

sys.path.append('/dbfs/cdr/library')

from pandas import set_option
set_option('display.max.columns', None)

# TODO: remove for Databricks
from dbutils import DBUtils, FileInfo
dbutils = DBUtils()

  return f(*args, **kwds)


In [7]:
from sparkFunctions import create_tmp_parquet

# adds file system prefix /dbfs for local and dbfs: for databricks
def resolve(path: str) -> str:
    prefix = dbutils.fs.ls('/')[0].path[:5]
    return prefix + path

In [116]:
from pyspark.sql import Column, DataFrame, Window
from pyspark.sql.functions import approx_count_distinct, collect_set, date_trunc, desc, expr
from pyspark.sql.functions import lit, row_number, size, sum, trim, when
# from pyspark.storagelevel import StorageLevel

from typing import Callable, Dict, List


def truncateTimestamp(dataFrame: DataFrame, unit: str='second') -> DataFrame:
    for column, dtype in dataFrame.dtypes:
        if dtype == 'timestamp':
            dataFrame = dataFrame.withColumn(column, date_trunc(unit, column))

    return dataFrame


def trimString(dataFrame: DataFrame) -> DataFrame:
    for column, dtype in dataFrame.dtypes:
        if dtype == 'string':
            dataFrame = dataFrame.withColumn(column, trim(column))
        
    return dataFrame


# number distinct values for each column not in composite key
def countDistinctValuesMap(
    dataFrame: DataFrame,
    keys: List[str]
) -> Dict[str, int]:
    
    valuesMap = dict()
    
    for column in dataFrame.columns:
        if column not in keys:
            values = (
                dataFrame
                .groupBy(keys)
                .agg(approx_count_distinct(column).alias('count'))
                .where('count > 1')
                .count()
            )
            
            if values > 1:
                valuesMap[column] = values
    
    return valuesMap


# number of distinct values for single column
def columnFactorCounts(
    dataFrame: DataFrame,
    keys: List[str],
    column: str
) -> DataFrame:
    
    return (
        dataFrame
        .groupBy(keys)
        .agg(approx_count_distinct(column).alias(column))
        .sort(desc(column))
    )


# lists unique values in column along with number of values
def columnFactorArray(
    dataFrame: DataFrame,
    keys: List[str],
    column: str
) -> DataFrame:
    
    return (
        dataFrame
        .groupBy(keys)
        .agg(collect_set(column).alias(column))
        .withColumn('size', size(column))
        .sort(desc('size'))
    )


# return single record for each key
def firstRecords(
    dataFrame: DataFrame,
    keys: List[str],
    orders: List[Column]
) -> DataFrame:
    window = Window.partitionBy(*keys).orderBy(*orders)

    return(
        dataFrame
        .withColumn('_row_number', row_number().over(window))
        .where('_row_number = 1')
        .drop('_row_number')
    )



# create column with optional conditions, window and function
def create_column(
    value: Column,
    conditions: List[str]=None,
    groups: List[str]=None,
    function: Callable[[Column], Column]=None,
    default: Column=None,
    logical: str=' AND '
) -> Column:

    column = value

    if isinstance(conditions, list):
        condition = expr(logical.join(conditions))
        column = when(condition, column)

    if isinstance(default, Column):
        column = column.otherwise(default)

    if callable(function):
        column = function(column)

    if isinstance(groups, list):
        window = Window.partitionBy(groups)
        column = column.over(window)

    return column

In [9]:
FOLDER = '/automation'

ZOOM_FOLDER = os.path.join(FOLDER, 'ZoomData')
BASE_FOLDER = os.path.join(FOLDER, 'OnBase')

In [10]:
dbutils.fs.ls(BASE_FOLDER)

[FileInfo(path='/dbfs/automation/OnBase/cc_STStatementEmailDocs.parquet', name='cc_STStatementEmailDocs.parquet', size=25563408),
 FileInfo(path='/dbfs/automation/OnBase/rm_DVStatementRequestActivityRecords.parquet', name='rm_DVStatementRequestActivityRecords.parquet', size=97480388),
 FileInfo(path='/dbfs/automation/OnBase/rm_DVStatementRequests.parquet', name='rm_DVStatementRequests.parquet', size=51427694),
 FileInfo(path='/dbfs/automation/OnBase/rm_DVStatements.parquet', name='rm_DVStatements.parquet', size=40973701),
 FileInfo(path='/dbfs/automation/OnBase/rm_DVVendorContacts.parquet', name='rm_DVVendorContacts.parquet', size=24426383)]

In [11]:
from pyspark.sql.functions import col 

path = os.path.join(ZOOM_FOLDER, 'formattedAccountCallLogs.parquet')
calls = spark.read.parquet(resolve(path))


# Eastern Standard Time
calls = calls.withColumn('date_time', col('date_time_est').cast('timestamp')).drop('date_time_est')

integers = [
    'duration',
    'extension_number',
    'caller_number_type',
    'callee_number_type'
]

for column in integers:
    calls = calls.withColumn(column, col(column).cast('integer'))

calls = calls.withColumn('call_id', col('call_id').cast('long'))

drops = [
    'id',
    'path',
    'has_recording'
]

calls = calls.drop(*drops)

calls = truncateTimestamp(calls)
calls = trimString(calls)

In [12]:
%%time
from pyspark.sql.functions import regexp_extract

# outgoing calls begin with +1
# also use 1 for 1-800 numbers
# ones not found in VENDOR table

# ^ is anchor at start
# \+ escapes literal + sign in regular expression
# same with *, which is generally a wild card
pattern = r'^(\+1|\*)*([0-9]+)'

calls = (
    calls
    # extract second group using index=2
    .withColumn('CALLED_NUMBER', regexp_extract('callee_number', pattern, 2))
    .withColumn('RECEIVED_NUMBER', regexp_extract('caller_number', pattern, 2))
)

calls = create_tmp_parquet(spark, calls, '/tmp/CALLS')
calls.limit(5).toPandas()

CPU times: user 79.3 ms, sys: 35.3 ms, total: 115 ms
Wall time: 6.63 s


Unnamed: 0,caller_name,caller_number,callee_name,callee_number,direction,duration,date_time,call_end_time,result,call_id,extension_number,name,type,call_type,caller_number_type,callee_number_type,has_voicemail,date_time_end_est,CALLED_NUMBER,RECEIVED_NUMBER
0,,16169702069,Mitchell Malling,442,inbound,86,2021-09-17 11:05:47,,Call connected,7008919107000321976,442,Mitchell Malling,user,voip,2,1,False,,442,6169702069
1,Yodit Kahssai,474,,18552675551,outbound,139,2021-09-17 11:06:06,2021-09-17T15:08:27Z,Call connected,7008919188613195707,474,Yodit Kahssai,user,pstn,1,2,False,2021-09-17 11:08:27.000,8552675551,474
2,,16169702069,Nathan Gregorio,270,inbound,14,2021-09-17 11:07:00,,Call connected,7008919416247491134,270,Nathan Gregorio,user,voip,2,1,False,,270,6169702069
3,Dornier Medtech,17705146253,Main Auto Receptionist,901,inbound,0,2021-09-17 11:07:03,,No Answer,7008919433427273168,901,Main Auto Receptionist,autoReceptionist,voip,2,1,False,,901,7705146253
4,Xavier Baron,852,Vssp,16147195221,outbound,31,2021-09-17 11:07:07,2021-09-17T15:07:49Z,Call connected,7008919450606123974,852,Xavier Baron,user,pstn,1,2,False,2021-09-17 11:07:49.000,6147195221,852


In [144]:
%%time
# from pyspark.sql.functions import col

path = os.path.join(BASE_FOLDER, 'rm_DVStatementRequestActivityRecords.parquet')
activities = spark.read.parquet(resolve(path))

dates = [
    'ActivityDate',
    'FollowUpDate'
]

for column in dates:
    activities = activities.withColumn(column, col(column).cast('date'))

# drops = [
#     'STNID',
#     'CustVendorObjectID'
# ]

# activities = activities.drop(*drops)
    
activities = truncateTimestamp(activities)
activities = trimString(activities)
    
activities = create_tmp_parquet(spark, activities, '/tmp/ACTIVITIES')
activities.limit(5).toPandas()

CPU times: user 42.3 ms, sys: 4.78 ms, total: 47.1 ms
Wall time: 8.72 s


Unnamed: 0,ObjectID,CreatedDate,ReferenceNumber,CustomerVendorName,JobNumber,JobName,ContactType,ActivityUser,ActivityDate,FollowUpDate,STNID,Notes,Outcome,ActivityType,CustVendorObjectID,VendorContactObjectID,StatementRequestObjectID,CreatedBy
0,28785475,2020-11-02 06:41:33,1279250,ULINE,3412,Carolinas Shared Service - 3412,Call,ASIBLEY,2020-11-01,2020-11-03,,This is a test - Updated,,,21259327,,25422522,ASIBLEY
1,28785637,2020-11-02 06:47:27,1279250,ULINE,3412,Carolinas Shared Service - 3412,Email,ASIBLEY,2020-11-01,2020-11-04,,Test 2 - Updated,,,21259327,,25422522,ASIBLEY
2,28786094,2020-11-02 06:52:58,1279250,ULINE,3412,Carolinas Shared Service - 3412,Client Email,ASIBLEY,2020-11-01,2020-11-03,,Test 3,,,21259327,,25422522,ASIBLEY
3,28838136,2020-11-04 12:47:58,1287154,BIOCOMPOSITES INC.,3435,The Christ Hospital - 3435,,bwilliams1,2020-11-03,,6192802.0,Statement Request creation skipped because thi...,,,27542411,,27781594,ASIBLEY
4,28838138,2020-11-04 12:48:02,1225130,BONA FIDE COMMERCIAL SERVICES,3177,UC Health - 3177,,jdagher,2020-11-03,,6192806.0,Statement Request creation skipped because thi...,,,24317597,,25860124,ASIBLEY


### Activities Synopsis

- Records without a Job Number have no Statement Request Identity
- Groups can be Job Number and Reference Number
- Each Statement Request Identity has only one Reference Number
- Each Reference Number can have multiple Statement Request Identities
    - 1290366

In [70]:
statementIdentity = (
    activities
    .where('ReferenceNumber = 1290366')
    .where('JobNumber IS NOT NULL')
    .select('StatementRequestObjectID')
    .rdd
    .map(lambda x: x[0])
    .collect()
)

In [74]:
requests.where(col('ObjectID').isin(statementIdentity)).limit(5).toPandas()

Unnamed: 0,JobNo,JobTier,CustomerName,CustVendorID,VendorNo,CustVendorGroupID,WNC,StatementWNC,VendorGroupName,Volume,RequestDate,ReferenceNumber,Status,RequestMethod,RequestType,Contact,RequesterFullName,RequestText,LastActivityDate,LastStatementReceivedDate,CreatedDate,CallsheetNo,ObjectID,CallerStatus,ReconStatus,CurrentAssigneeID,CurrentAssigneeName,EnteredReconDate,LastReconQueueName,LastReconQueueEntryDate,AccountsReceived,AccountsRequested,MessageID,VendorContactObjectID,WebsiteVendor,WNCSpecialHandling,NeedLeadVendor
0,3432,,Centra Health,27888560,216,27888558,0,1,ALLEN MEDICAL SYSTEMS INC,100001.0,2021-05-23,1290366,Superceded,WNC,Caller,globalar@hill-rom.com,Tim Kornoelje,,2021-05-23,NaT,2021-05-24 06:25:33,C-359911,33912409,Will Not Comply,,8404677,Tim Kornoelje,NaT,,NaT,0,2,,31402936.0,0,,
1,3432,,Centra Health,27888560,216,27888558,0,1,ALLEN MEDICAL SYSTEMS INC,100001.0,2021-06-06,1290366,No Receipt,WNC,Caller,globalar@hill-rom.com,Tim Kornoelje,,2021-06-07,NaT,2021-06-08 11:16:28,C-359911,34198735,Needs Supervisor,,8404677,Tim Kornoelje,NaT,,NaT,0,2,,,0,,
2,3432,,Centra Health,27888560,216,27888558,0,1,ALLEN MEDICAL SYSTEMS INC,100001.0,2021-05-04,1290366,Superceded,WNC,Caller,globalar@hill-rom.com,Tim Kornoelje,,2021-05-04,NaT,2021-05-05 08:18:07,C-359911,33451862,Will Not Comply,,8404677,Tim Kornoelje,NaT,,NaT,0,2,,31402936.0,0,,
3,3432,,Centra Health,27888560,216,27888558,0,1,ALLEN MEDICAL SYSTEMS INC,100001.0,2021-04-19,1290366,Superceded,WNC,Caller,globalar@hill-rom.com,Tim Kornoelje,,2021-04-19,NaT,2021-04-20 05:17:23,C-359911,33069733,Will Not Comply,,8404677,Tim Kornoelje,NaT,,NaT,0,2,,31402936.0,0,,
4,3432,,Centra Health,27888560,216,27888558,0,1,ALLEN MEDICAL SYSTEMS INC,100001.0,2021-03-23,1290366,Superceded,WNC,Caller,globalar@hill-rom.com,Tim Kornoelje,,2021-03-23,NaT,2021-03-24 04:35:14,C-359911,32366023,Will Not Comply,,8404677,Tim Kornoelje,NaT,,NaT,0,2,,31402936.0,0,,


In [145]:
%%time
# 614,227
activities = activities.where('StatementRequestObjectID IS NOT NULL')

keys = ['StatementRequestObjectID', 'CreatedDate']
orders = [desc('ObjectID')]

activities = firstRecords(activities, keys, orders)

# 612,124
path = '/tmp/RDD1'
activities = create_tmp_parquet(spark, activities, path)

CPU times: user 7.06 ms, sys: 3.01 ms, total: 10.1 ms
Wall time: 9.98 s


In [146]:
%%time
from pyspark.sql.functions import coalesce, lit, min, when

conditions = ["Outcome = 'Statement Received'"]

groups = ['StatementRequestObjectID']

column = create_column(col('CreatedDate'), conditions, groups, min)

activities = (
    activities
    .withColumn('ReceivedDate', coalesce(column, col('CreatedDate')))
    .where('NOT CreatedDate > ReceivedDate')
    .drop('ReceivedDate')
)

# 597,440
path = '/tmp/RDD2'
activities = create_tmp_parquet(spark, activities, path)

CPU times: user 7.61 ms, sys: 3.08 ms, total: 10.7 ms
Wall time: 10.4 s


In [148]:
activities.where('JobNumber IS NULL').count()

0

In [149]:
(
    activities
    .where("Outcome = 'Statement Received'")
    .groupBy('StatementRequestObjectID')
    .count()
    .where('`count` > 1')
    .count()
)

0

In [164]:
conditions = ["Outcome = 'Statement Received'"]

groups = ['StatementRequestObjectID']

case = create_column(lit(1), conditions, groups, sum)

received = (
    activities
    .withColumn('Received', when(case > 0, lit(True)))
    .where('Received')
    .drop('Received')
)

path = '/tmp/RECEIVED'
received = create_tmp_parquet(spark, received, path)
received.limit(5).toPandas()

Unnamed: 0,ObjectID,CreatedDate,ReferenceNumber,CustomerVendorName,JobNumber,JobName,ContactType,ActivityUser,ActivityDate,FollowUpDate,STNID,Notes,Outcome,ActivityType,CustVendorObjectID,VendorContactObjectID,StatementRequestObjectID,CreatedBy
0,32046465,2021-03-09 12:00:53,1083257,DIAGNOSTICA STAGO INC,2820,Spectrum Health - 2820,,OnBase Processing Service,2021-03-08,,6537782,Statement received in the office. Status chang...,Statement Received,,6295035,,26376809,SVC_WORKFLOW
1,31255490,2021-02-10 07:08:42,1083257,DIAGNOSTICA STAGO INC,2820,Spectrum Health - 2820,Client Email,Rebekah Dykema,2021-02-09,,6468590,emailed 1st req,,,6295035,,26376809,SVC_WORKFLOW
2,31982565,2021-03-05 06:21:00,1083257,DIAGNOSTICA STAGO INC,2820,Spectrum Health - 2820,Client Email,Rebekah Dykema,2021-03-04,,6528463,emailed 2nd req,,,6295035,,26376809,SVC_WORKFLOW
3,31256222,2021-02-10 07:40:24,1083257,DIAGNOSTICA STAGO INC,2820,Spectrum Health - 2820,,Rebekah Dykema,2021-02-09,,6468799,Jody handles this acct,,,6295035,,26376809,SVC_WORKFLOW
4,30281127,2020-12-22 13:03:58,1289623,BRAIN LAB,3426,FMOL-St Dominic Hospital - 3426,Call,fbishop,2020-12-21,,6309980,laura.ramos@brainlab.com,Statement Received,,27709614,,27782711,SVC_WORKFLOW


In [218]:
received = spark.read.parquet('/tmp/RECEIVED')

In [232]:
%%time
window = Window.partitionBy('StatementRequestObjectID').orderBy('CreatedDate')
column = min(col('CreatedDate')).over(window)

survival = received.withColumn('StartTime', col('CreatedDate').cast('long') - column.cast('long'))

column = lead(col('StartTime')).over(window)
column = coalesce(column, col('StartTime'))

survival = survival.withColumn('StopTime', column)

column = when(col('Outcome') == 'Statement Received', lit(1)).otherwise(0)
survival = survival.withColumn('Event', column)

column = when(col('CreatedBy') == 'SVC_WORKFLOW', lit('WORKFLOW')).otherwise('N/A')
survival = survival.withColumn('Created', column)

column = when(col('ActivityUser') == 'OnBase Processing Service', lit('OnBase')).otherwise('N/A')
survival = survival.withColumn('User', column)

survival = survival.na.fill('N/A', subset=['ContactType', 'Outcome', 'ActivityType'])

drops = [
    'ActivityDate',
    'FollowUpDate',
    'STNID',
    'Notes',
    'CustVendorObjectID',
    'VendorContactObjectID'
]

survival = survival.drop(*drops)

path = '/tmp/SURVIVAL'
survival = create_tmp_parquet(spark, survival, path)
survival.limit(5).toPandas()

CPU times: user 45.2 ms, sys: 7.61 ms, total: 52.8 ms
Wall time: 7.81 s


Unnamed: 0,ObjectID,CreatedDate,ReferenceNumber,CustomerVendorName,JobNumber,JobName,ContactType,ActivityUser,Outcome,ActivityType,StatementRequestObjectID,CreatedBy,StartTime,StopTime,Event,Created,User
0,30281656,2020-12-22 15:33:09,1269179,LANGUAGE LINE LLC,3395,Lafayette General Health - 3395,Call,fbishop,Statement Received,,26130325,SVC_WORKFLOW,0,0,1,WORKFLOW,
1,30206893,2020-12-22 02:27:37,1289773,BAYER HEALTHCARE LLC,3426,FMOL-St Dominic Hospital - 3426,Call,fbishop,Statement Received,,27870466,SVC_WORKFLOW,0,0,1,WORKFLOW,
2,29244692,2020-11-20 10:04:09,1223715,FISHER HEALTHCARE,3188,OhioHealth Corporation - 3188,Call,jdagher,,,28038798,SVC_WORKFLOW,0,9415608,0,WORKFLOW,
3,32043102,2021-03-09 09:30:57,1223715,FISHER HEALTHCARE,3188,OhioHealth Corporation - 3188,,OnBase Processing Service,Statement Received,,28038798,SVC_WORKFLOW,9415608,9415608,1,WORKFLOW,OnBase
4,31897864,2021-03-01 08:04:45,1287487,WW GRAINGER,3431,San Antonio Regional Hospital - 3431,,OnBase Processing Service,Statement Received,,28678925,SVC_WORKFLOW,0,0,1,WORKFLOW,OnBase


In [238]:
from pandas import read_parquet

table = read_parquet('/tmp/SURVIVAL')

table.to_csv('../csv/ACTIVITY.csv', index=False)

In [239]:
# ContactType N/A
# Outcome N/A
# ActivityType N/A

# ['ContactType', 'Outcome', 'ActivityType']

# SVC_WORKFLOW highest CreatedBy
# ActivityUser highest OnBase Processing Service

(
    survival
    .groupBy('Outcome')
    .count()
    .sort(desc('count'))
    .toPandas()
)

# survival.where('ActivityUser IS NULL').count()

Unnamed: 0,Outcome,count
0,Sent Authorization Letter,69338
1,Statement Received,65242
2,Will Not Comply,15178
3,Left Voicemail,14375
4,,12757
5,Will Email or Fax,4882
6,Needs Research,1067
7,Need to Receive,831
8,Needs Supervisor,634
9,Needs Account Number,309


In [176]:
(
    received
    .groupBy('StatementRequestObjectID')
    .count()
    .sort(desc('count'))
    .where('count < 6')
    .limit(5)
    .show()
)

+------------------------+-----+
|StatementRequestObjectID|count|
+------------------------+-----+
|                30462412|    5|
|                36581820|    5|
|                30711137|    5|
|                26876481|    5|
|                33464074|    5|
+------------------------+-----+



In [207]:
activity = activities.where('StatementRequestObjectID = 30462412')
activity.unpersist()
activity.cache().count()
activity.sort('CreatedDate').toPandas()

Unnamed: 0,ObjectID,CreatedDate,ReferenceNumber,CustomerVendorName,JobNumber,JobName,ContactType,ActivityUser,ActivityDate,FollowUpDate,STNID,Notes,Outcome,ActivityType,CustVendorObjectID,VendorContactObjectID,StatementRequestObjectID,CreatedBy
0,30462427,2021-01-08 04:13:35,1268701,"AIRGAS, INC.",3366,Cornerstone Healthcare Group - 3366,Email,bwilhoit,2021-01-07,,,,Sent Authorization Letter,,23974550,,30462412,SVC_WORKFLOW
1,32377345,2021-03-24 10:07:45,1268701,"AIRGAS, INC.",3366,Cornerstone Healthcare Group - 3366,,Felicia Bishop,2021-03-23,,,9107052931--Needs Research,Needs Research,Note Only,23974550,,30462412,FBISHOP
2,32377354,2021-03-24 10:09:31,1268701,"AIRGAS, INC.",3366,Cornerstone Healthcare Group - 3366,,Felicia Bishop,2021-03-23,,,9107052931--Needs Research,Needs Research,Note Only,23974550,,30462412,FBISHOP
3,32761556,2021-04-08 04:42:36,1268701,"AIRGAS, INC.",3366,Cornerstone Healthcare Group - 3366,,Bill Wilhoit,2021-04-07,,,Fixed contact info to match BSAP. Sent email,Sent Authorization Letter,Note Only,23974550,31406725.0,30462412,BWILHOIT
4,32991930,2021-04-15 10:34:01,1268701,"AIRGAS, INC.",3366,Cornerstone Healthcare Group - 3366,,OnBase Processing Service,2021-04-14,,6628825.0,Statement received in the office. Status chang...,Statement Received,,23974550,,30462412,SVC_WORKFLOW


In [213]:
window = Window.partitionBy('StatementRequestObjectID').orderBy('CreatedDate')
column = min(col('CreatedDate')).over(window)

activity = activity.withColumn('StartTime', col('CreatedDate').cast('long') - column.cast('long'))

column = lead(col('StartTime')).over(window)
column = coalesce(column, col('StartTime'))

activity = activity.withColumn('StopTime', column)
activity.limit(5).toPandas()

Unnamed: 0,ObjectID,CreatedDate,ReferenceNumber,CustomerVendorName,JobNumber,JobName,ContactType,ActivityUser,ActivityDate,FollowUpDate,STNID,Notes,Outcome,ActivityType,CustVendorObjectID,VendorContactObjectID,StatementRequestObjectID,CreatedBy,StartTime,StopTime
0,30462427,2021-01-08 04:13:35,1268701,"AIRGAS, INC.",3366,Cornerstone Healthcare Group - 3366,Email,bwilhoit,2021-01-07,,,,Sent Authorization Letter,,23974550,,30462412,SVC_WORKFLOW,0,6497650
1,32377345,2021-03-24 10:07:45,1268701,"AIRGAS, INC.",3366,Cornerstone Healthcare Group - 3366,,Felicia Bishop,2021-03-23,,,9107052931--Needs Research,Needs Research,Note Only,23974550,,30462412,FBISHOP,6497650,6497756
2,32377354,2021-03-24 10:09:31,1268701,"AIRGAS, INC.",3366,Cornerstone Healthcare Group - 3366,,Felicia Bishop,2021-03-23,,,9107052931--Needs Research,Needs Research,Note Only,23974550,,30462412,FBISHOP,6497756,7774141
3,32761556,2021-04-08 04:42:36,1268701,"AIRGAS, INC.",3366,Cornerstone Healthcare Group - 3366,,Bill Wilhoit,2021-04-07,,,Fixed contact info to match BSAP. Sent email,Sent Authorization Letter,Note Only,23974550,31406725.0,30462412,BWILHOIT,7774141,8400026
4,32991930,2021-04-15 10:34:01,1268701,"AIRGAS, INC.",3366,Cornerstone Healthcare Group - 3366,,OnBase Processing Service,2021-04-14,,6628825.0,Statement received in the office. Status chang...,Statement Received,,23974550,,30462412,SVC_WORKFLOW,8400026,8400026


In [214]:
column = when(col('Outcome') == 'Statement Received', lit(1)).otherwise(0)

activity.withColumn('Event', column).limit(5).toPandas()

Unnamed: 0,ObjectID,CreatedDate,ReferenceNumber,CustomerVendorName,JobNumber,JobName,ContactType,ActivityUser,ActivityDate,FollowUpDate,STNID,Notes,Outcome,ActivityType,CustVendorObjectID,VendorContactObjectID,StatementRequestObjectID,CreatedBy,StartTime,StopTime,Event
0,30462427,2021-01-08 04:13:35,1268701,"AIRGAS, INC.",3366,Cornerstone Healthcare Group - 3366,Email,bwilhoit,2021-01-07,,,,Sent Authorization Letter,,23974550,,30462412,SVC_WORKFLOW,0,6497650,0
1,32377345,2021-03-24 10:07:45,1268701,"AIRGAS, INC.",3366,Cornerstone Healthcare Group - 3366,,Felicia Bishop,2021-03-23,,,9107052931--Needs Research,Needs Research,Note Only,23974550,,30462412,FBISHOP,6497650,6497756,0
2,32377354,2021-03-24 10:09:31,1268701,"AIRGAS, INC.",3366,Cornerstone Healthcare Group - 3366,,Felicia Bishop,2021-03-23,,,9107052931--Needs Research,Needs Research,Note Only,23974550,,30462412,FBISHOP,6497756,7774141,0
3,32761556,2021-04-08 04:42:36,1268701,"AIRGAS, INC.",3366,Cornerstone Healthcare Group - 3366,,Bill Wilhoit,2021-04-07,,,Fixed contact info to match BSAP. Sent email,Sent Authorization Letter,Note Only,23974550,31406725.0,30462412,BWILHOIT,7774141,8400026,0
4,32991930,2021-04-15 10:34:01,1268701,"AIRGAS, INC.",3366,Cornerstone Healthcare Group - 3366,,OnBase Processing Service,2021-04-14,,6628825.0,Statement received in the office. Status chang...,Statement Received,,23974550,,30462412,SVC_WORKFLOW,8400026,8400026,1


In [196]:
from pyspark.sql.functions import lead

window = Window.partitionBy('StatementRequestObjectID').orderBy('CreatedDate')
column = lead(col('CreatedDate')).over(window)

activity = activity.withColumn('StopDate', column)

column = min(col('CreatedDate')).over(window)
activity = activity.withColumn('StartDate', column)

path = '/tmp/ACTIVITY'
activity = create_tmp_parquet(spark, activity, path)
activity.orderBy('CreatedDate').toPandas()

Unnamed: 0,ObjectID,CreatedDate,ReferenceNumber,CustomerVendorName,JobNumber,JobName,ContactType,ActivityUser,ActivityDate,FollowUpDate,STNID,Notes,Outcome,ActivityType,CustVendorObjectID,VendorContactObjectID,StatementRequestObjectID,CreatedBy,StopDate,StartDate
0,30462427,2021-01-08 04:13:35,1268701,"AIRGAS, INC.",3366,Cornerstone Healthcare Group - 3366,Email,bwilhoit,2021-01-07,,,,Sent Authorization Letter,,23974550,,30462412,SVC_WORKFLOW,2021-03-24 10:07:45,2021-01-08 04:13:35
1,32377345,2021-03-24 10:07:45,1268701,"AIRGAS, INC.",3366,Cornerstone Healthcare Group - 3366,,Felicia Bishop,2021-03-23,,,9107052931--Needs Research,Needs Research,Note Only,23974550,,30462412,FBISHOP,2021-03-24 10:09:31,2021-01-08 04:13:35
2,32377354,2021-03-24 10:09:31,1268701,"AIRGAS, INC.",3366,Cornerstone Healthcare Group - 3366,,Felicia Bishop,2021-03-23,,,9107052931--Needs Research,Needs Research,Note Only,23974550,,30462412,FBISHOP,2021-04-08 04:42:36,2021-01-08 04:13:35
3,32761556,2021-04-08 04:42:36,1268701,"AIRGAS, INC.",3366,Cornerstone Healthcare Group - 3366,,Bill Wilhoit,2021-04-07,,,Fixed contact info to match BSAP. Sent email,Sent Authorization Letter,Note Only,23974550,31406725.0,30462412,BWILHOIT,2021-04-15 10:34:01,2021-01-08 04:13:35
4,32991930,2021-04-15 10:34:01,1268701,"AIRGAS, INC.",3366,Cornerstone Healthcare Group - 3366,,OnBase Processing Service,2021-04-14,,6628825.0,Statement received in the office. Status chang...,Statement Received,,23974550,,30462412,SVC_WORKFLOW,NaT,2021-01-08 04:13:35


In [205]:
# activity.selectExpr('StopDate - CreatedDate AS Difference')

(
    activity
    # .select(col('CreatedDate').cast('long') - col('StartDate').cast('long'))
    # .select(col('StopDate').cast('long') - col('CreatedDate').cast('long'))
    .withColumn('StartTime', col('CreatedDate').cast('long') - col('StartDate').cast('long'))
    .withColumn('StopTime', col('StopDate').cast('long') - col('CreatedDate').cast('long'))
    .toPandas()
)

Unnamed: 0,ObjectID,CreatedDate,ReferenceNumber,CustomerVendorName,JobNumber,JobName,ContactType,ActivityUser,ActivityDate,FollowUpDate,STNID,Notes,Outcome,ActivityType,CustVendorObjectID,VendorContactObjectID,StatementRequestObjectID,CreatedBy,StopDate,StartDate,StartTime,StopTime
0,30462427,2021-01-08 04:13:35,1268701,"AIRGAS, INC.",3366,Cornerstone Healthcare Group - 3366,Email,bwilhoit,2021-01-07,,,,Sent Authorization Letter,,23974550,,30462412,SVC_WORKFLOW,2021-03-24 10:07:45,2021-01-08 04:13:35,0,6497650.0
1,32377345,2021-03-24 10:07:45,1268701,"AIRGAS, INC.",3366,Cornerstone Healthcare Group - 3366,,Felicia Bishop,2021-03-23,,,9107052931--Needs Research,Needs Research,Note Only,23974550,,30462412,FBISHOP,2021-03-24 10:09:31,2021-01-08 04:13:35,6497650,106.0
2,32377354,2021-03-24 10:09:31,1268701,"AIRGAS, INC.",3366,Cornerstone Healthcare Group - 3366,,Felicia Bishop,2021-03-23,,,9107052931--Needs Research,Needs Research,Note Only,23974550,,30462412,FBISHOP,2021-04-08 04:42:36,2021-01-08 04:13:35,6497756,1276385.0
3,32761556,2021-04-08 04:42:36,1268701,"AIRGAS, INC.",3366,Cornerstone Healthcare Group - 3366,,Bill Wilhoit,2021-04-07,,,Fixed contact info to match BSAP. Sent email,Sent Authorization Letter,Note Only,23974550,31406725.0,30462412,BWILHOIT,2021-04-15 10:34:01,2021-01-08 04:13:35,7774141,625885.0
4,32991930,2021-04-15 10:34:01,1268701,"AIRGAS, INC.",3366,Cornerstone Healthcare Group - 3366,,OnBase Processing Service,2021-04-14,,6628825.0,Statement received in the office. Status chang...,Statement Received,,23974550,,30462412,SVC_WORKFLOW,NaT,2021-01-08 04:13:35,8400026,


In [206]:
7774141 + 625885

8400026

In [136]:
# Duplicate
# activity = activities.where('StatementRequestObjectID = 27444129')

# activity.cache().count()

# activity.sort('CreatedDate').toPandas()

In [150]:
# keys = ['ReferenceNumber', 'JobNumber', 'CreatedDate']
# orders = [desc('ObjectID')]

# firstRecords(activity, keys, orders).orderBy('ObjectID').toPandas()

In [89]:
(
    activities
    .where('JobNumber IS NOT NULL')
    .where('ReferenceNumber IS NOT NULL')
    .groupBy('Outcome')
    .count()
    .sort(desc('count'))
    .toPandas()
)

Unnamed: 0,Outcome,count
0,Sent Authorization Letter,238394
1,,154617
2,Statement Received,69513
3,Left Voicemail,55348
4,Will Not Comply,54706
5,Will Email or Fax,9866
6,Need to Receive,8447
7,Needs Supervisor,7319
8,Needs Research,6027
9,Remove From Scope,5665


In [135]:
# (
#     activities
#     .where(col('Outcome') == 'Statement Received')
#     .where('JobNumber IS NOT NULL')
#     .groupBy('ReferenceNumber')
#     .count()
#     .sort(desc('count'))
#     .show()
# )

In [134]:
# received = (
#     activities
#     .where('JobNumber IS NOT NULL')
#     .where('ReferenceNumber = 1049797')
#     # .where('StatementRequestObjectID = 33556705')
# )

# received.cache().count()

# received.orderBy('CreatedDate').toPandas()

In [151]:
# from pyspark.sql.functions import coalesce, lit, min, when

# conditions = ["Outcome = 'Statement Received'"]

# groups = ['StatementRequestObjectID']

# column = create_column(col('CreatedDate'), conditions, groups, min)

# (
#     received
#     .withColumn('ReceivedDate', coalesce(column, col('CreatedDate')))
#     .where('NOT CreatedDate > ReceivedDate')
#     .sort('StatementRequestObjectID', 'CreatedDate')
#     .toPandas()
# )

In [121]:
# conditions = ["Outcome = 'Statement Received'"]

# groups = ['StatementRequestObjectID']


# case = when(col('Outcome') == 'Statement Received', lit(1)).otherwise(0)
# # received = received.withColumn('Received', case)

# window = (
#     Window
#     .partitionBy('StatementRequestObjectID')
#     .orderBy('CreatedDate')
#     .rowsBetween(Window.unboundedPreceding, Window.currentRow)
# )

# (
#     received
#     .withColumn('Received', sum(case).over(window))
#     # .orderBy('CreatedDate')
#     # .toPandas()
# )

# column = sum(case).over(window)

In [16]:
%%time
path = os.path.join(BASE_FOLDER, 'rm_DVStatementRequests.parquet')
requests = spark.read.parquet(resolve(path))

dates = [
    'RequestDate',
    'LastActivityDate'
]

for column in dates:
    requests = requests.withColumn(column, col(column).cast('date'))

requests = requests.withColumn('Volume', col('Volume').cast('double'))

# drops = [
#     'MessageID',
#     'VendorContactObjectID'
# ]

# requests = requests.drop(*drops)

requests = truncateTimestamp(requests)    
requests = trimString(requests)

requests = create_tmp_parquet(spark, requests, '/tmp/REQUESTS')
requests.limit(5).toPandas()

CPU times: user 88.5 ms, sys: 9.45 ms, total: 98 ms
Wall time: 10.5 s


Unnamed: 0,JobNo,JobTier,CustomerName,CustVendorID,VendorNo,CustVendorGroupID,WNC,StatementWNC,VendorGroupName,Volume,RequestDate,ReferenceNumber,Status,RequestMethod,RequestType,Contact,RequesterFullName,RequestText,LastActivityDate,LastStatementReceivedDate,CreatedDate,CallsheetNo,ObjectID,CallerStatus,ReconStatus,CurrentAssigneeID,CurrentAssigneeName,EnteredReconDate,LastReconQueueName,LastReconQueueEntryDate,AccountsReceived,AccountsRequested,MessageID,VendorContactObjectID,WebsiteVendor,WNCSpecialHandling,NeedLeadVendor
0,3427,,New Hanover Regional Medical Center,415960,16845,415959,,,A M COFFEE DISTRIBUTORS LLC,0.0,2021-09-02,1285660,No Receipt,MassEmail,Caller,amcoffeedist@gmail.com,Cindy Allen,,2021-08-22,NaT,2021-09-02 22:50:28,C-338914,36846273,Sent Authorization Letter,,19272454,Yodit Kahssai,NaT,,NaT,,,hONrG5xpTUyiQB4Rs7gb0g,29510277.0,,,
1,3427,,New Hanover Regional Medical Center,415960,16845,415959,,,A M COFFEE DISTRIBUTORS LLC,0.0,2020-07-23,1285660,Superceded,MassEmail,,amcoffeedist@gmail.com,Jessica Rinehart,,2020-07-23,NaT,2020-07-24 12:29:52,C-338914,26681846,,,19272454,Yodit Kahssai,NaT,,NaT,,,,,,,
2,3427,,New Hanover Regional Medical Center,415960,16845,415959,,,A M COFFEE DISTRIBUTORS LLC,0.0,2020-07-30,1285660,Superceded,MassEmail,Mass 1,amcoffeedist@gmail.com,Jessica Rinehart,,2020-07-30,NaT,2020-07-31 04:05:15,C-338914,26819776,,,19272454,Yodit Kahssai,NaT,,NaT,,,,,,,
3,3427,,New Hanover Regional Medical Center,415960,16845,415959,,,A M COFFEE DISTRIBUTORS LLC,0.0,2021-01-18,1285660,Superceded,MassEmail,Caller,amcoffeedist@gmail.com,Jessica Rinehart,,2020-07-30,NaT,2021-01-19 07:01:21,C-338914,30690711,Sent Authorization Letter,,8398263,Rebekah Dykema,NaT,,NaT,,,,29510277.0,,,
4,3427,,New Hanover Regional Medical Center,415960,16845,415959,,,A M COFFEE DISTRIBUTORS LLC,0.0,2021-08-18,1285660,Superceded,MassEmail,Mass 1,amcoffeedist@gmail.com,Cindy Allen,,2021-08-22,NaT,2021-08-22 20:03:52,C-338914,36365526,Sent Authorization Letter,,8398263,Rebekah Dykema,NaT,,NaT,,,hONrG5xpTUyiQB4Rs7gb0g,29510277.0,,,


In [16]:
%%time
from pandas import read_parquet

# requests.count() # 346,199
# requests.where('MessageID IS NOT NULL').count() # 84,709
# requests.where('MessageID IS NOT NULL').select('MessageID').dropDuplicates().count() # 42,473

# requests.write.parquet('/tmp/REQUESTS', mode='overwrite')

# table = read_parquet('/tmp/REQUESTS')

# table.head()

# table.to_parquet('~/Desktop/REQUESTS.parquet')

# table = read_parquet('~/Desktop/REQUESTS.parquet')

# columns = [
#     'CallerStatus',
#     'CustomerName',
#     'RequestType',
#     'RequesterFullName',
#     'CurrentAssigneeName',
#     'RequestMethod',
#     'Volume',
#     'Status'
# ]

# table[columns].fillna('NA').to_csv('~/Desktop/REQUESTS.csv', index=False)

CPU times: user 2.54 s, sys: 135 ms, total: 2.68 s
Wall time: 3.07 s


In [11]:
%%time
path = os.path.join(BASE_FOLDER, 'rm_DVStatements.parquet')

statements = spark.read.parquet(resolve(path))

statements = truncateTimestamp(statements)
statements = trimString(statements)

statements.persist(StorageLevel.DISK_ONLY)

statements.limit(5).toPandas()

CPU times: user 50.5 ms, sys: 6.35 ms, total: 56.9 ms
Wall time: 13.4 s


Unnamed: 0,ReferenceNumber,JobNo,JobTier,StatementDate,DocumentHandle,ObjectID,CustVendorObjID,CustVendGroupObjID,CustVendGroupName,CustomerName,CID,CustVendName,CustVendNo,Volume,AccountsIdentified,Recon,EmailMessageID,CreatedDate,SRARObjectId
0,1074630,2600,,2019-08-08 20:00:00,,20482171,19080046,19080045,TEST - HILL-ROM COMPANY INC,"BSI Healthcare Audit Services, LLC",15121,TEST - HILL-ROM COMPANY INC,2340,2014673.0,,,0000000085C36CE5BB5D9244B44E28C87A7C112B070049...,2019-08-09 09:38:05,
1,1074630,2600,,2019-08-08 20:00:00,,20482173,19080046,19080045,TEST - HILL-ROM COMPANY INC,"BSI Healthcare Audit Services, LLC",15121,TEST - HILL-ROM COMPANY INC,2340,2014673.0,,,0000000085C36CE5BB5D9244B44E28C87A7C112B070049...,2019-08-09 09:45:07,
2,1074630,2600,,2019-08-08 20:00:00,,20482385,19080046,19080045,TEST - HILL-ROM COMPANY INC,"BSI Healthcare Audit Services, LLC",15121,TEST - HILL-ROM COMPANY INC,2340,2014673.0,,,0000000085C36CE5BB5D9244B44E28C87A7C112B070049...,2019-08-09 12:06:44,
3,1074630,2600,,2019-08-08 20:00:00,,20482388,19080046,19080045,TEST - HILL-ROM COMPANY INC,"BSI Healthcare Audit Services, LLC",15121,TEST - HILL-ROM COMPANY INC,2340,2014673.0,,,0000000085C36CE5BB5D9244B44E28C87A7C112B070049...,2019-08-09 12:09:29,
4,1157916,2936,1.0,2019-08-13 20:00:00,,20512575,6523144,6523143,AMBU INC,Providence,12146,AMBU INC,25171,713229.48,,,00000000F643DA057DBD124B829A30CFCF400C1507002B...,2019-08-14 05:37:44,


In [12]:
%%time
path = os.path.join(BASE_FOLDER, 'cc_STStatementEmailDocs.parquet')

emails = spark.read.parquet(resolve(path))

emails = truncateTimestamp(emails)
emails = trimString(emails)

emails.persist(StorageLevel.DISK_ONLY)

emails.limit(5).toPandas()

CPU times: user 65.3 ms, sys: 7.72 ms, total: 73 ms
Wall time: 7.88 s


Unnamed: 0,DocumentHandle,DateCreated,MAILDateTime,MAILFromAddress,MAILToAddress,MAILCcAddress,MAILSubject,MAILMessageID,MAILAttachmentCount,S-Ref#,S-CustomerName,S-Job#,S-VendorName,S-CreatedByUser,S-SkipAutoReceive,S-Recon,S-LargeCredits,IngestionSource,S-StatementExistsInBatch
0,15594200,2019-06-12 05:22:11,2019-06-07 15:14:29,,,,RE: STATEMENT REQUEST - FAIRVIEW HEALTH SERVIC...,00000000F643DA057DBD124B829A30CFCF400C1507002B...,11,1230333,,,,,,,,,
1,15595567,2019-06-12 05:28:05,2019-06-07 15:14:17,,,,RE: STATEMENT REQUEST (REF # 1190860),00000000F643DA057DBD124B829A30CFCF400C1507002B...,7,1190860,,,,,,,,,
2,15595581,2019-06-12 05:40:03,2019-06-07 15:12:18,,,,RE: INFORMATION REQUEST #1237232,00000000F643DA057DBD124B829A30CFCF400C1507002B...,2,1237232,,,,,,,,,
3,15595588,2019-06-12 05:42:47,2019-06-07 15:15:58,,,,REF#1157275 STATEMENT FROM PROVATION MEDICAL F...,00000000F643DA057DBD124B829A30CFCF400C1507002B...,2,1157275,,,,,,,,,
4,15595595,2019-06-12 05:49:49,2019-06-07 15:15:37,,,,RE: INFORMATION REQUEST #1217277,00000000F643DA057DBD124B829A30CFCF400C1507002B...,2,1217277,,,,,,,,,


In [19]:
%%time
activity = (
    activities
    .join(
        requests,
        on=activities['StatementRequestObjectID'] == requests['ObjectID'],
        how='left_semi'
    )
)

activity.limit(5).toPandas()

CPU times: user 39.1 ms, sys: 11.7 ms, total: 50.8 ms
Wall time: 12.6 s


Unnamed: 0,ObjectID,CreatedDate,ReferenceNumber,CustomerVendorName,JobNumber,JobName,ContactType,ActivityUser,ActivityDate,FollowUpDate,STNID,Notes,Outcome,ActivityType,CustVendorObjectID,VendorContactObjectID,StatementRequestObjectID,CreatedBy
0,28785475,2020-11-02 06:41:33,1279250,ULINE,3412,Carolinas Shared Service - 3412,Call,ASIBLEY,2020-11-01,2020-11-03,,This is a test - Updated,,,21259327,,25422522,ASIBLEY
1,28785637,2020-11-02 06:47:27,1279250,ULINE,3412,Carolinas Shared Service - 3412,Email,ASIBLEY,2020-11-01,2020-11-04,,Test 2 - Updated,,,21259327,,25422522,ASIBLEY
2,28786094,2020-11-02 06:52:58,1279250,ULINE,3412,Carolinas Shared Service - 3412,Client Email,ASIBLEY,2020-11-01,2020-11-03,,Test 3,,,21259327,,25422522,ASIBLEY
3,28838136,2020-11-04 12:47:58,1287154,BIOCOMPOSITES INC.,3435,The Christ Hospital - 3435,,bwilliams1,2020-11-03,,6192802.0,Statement Request creation skipped because thi...,,,27542411,,27781594,ASIBLEY
4,28838138,2020-11-04 12:48:02,1225130,BONA FIDE COMMERCIAL SERVICES,3177,UC Health - 3177,,jdagher,2020-11-03,,6192806.0,Statement Request creation skipped because thi...,,,24317597,,25860124,ASIBLEY


In [35]:
%%time
# each StatementRequestObjectID maps to one Customer/Vendor, Job, and ReferenceNumber
# FollowUpDate mostly null
valuesMap = distinctValuesMap(activity, ['StatementRequestObjectID'], debug=True)

ObjectID 142537
CreatedDate 142529
ReferenceNumber 0
CustomerVendorName 0
JobNumber 0
JobName 0
ContactType 62554
ActivityUser 122683
ActivityDate 109358
FollowUpDate 1
STNID 37376
Notes 123734
Outcome 79639
ActivityType 8986
CustVendorObjectID 10
VendorContactObjectID 3953
CreatedBy 42491
CPU times: user 91.9 ms, sys: 25 ms, total: 117 ms
Wall time: 1min 2s


In [138]:
request = (
    requests
    .join(
        activity,
        on=[
            activity['StatementRequestObjectID'] == requests['ObjectID'],
            # activity['Outcome'] == requests['CallerStatus']
        ],
        how='left_semi'
    )
)

request.persist(StorageLevel.DISK_ONLY).count()

260289

In [146]:
# not all activity has Outcome filled with value to update the request caller status
(
    request
    .join(
        activity,
        on=[
            activity['StatementRequestObjectID'] == request['ObjectID'],
            activity['Outcome'].eqNullSafe(request['CallerStatus'])
        ],
        how='left_anti'
    )
    .toPandas()
)

Unnamed: 0,JobNo,JobTier,CustomerName,CustVendorID,VendorNo,CustVendorGroupID,WNC,StatementWNC,VendorGroupName,Volume,RequestDate,ReferenceNumber,Status,RequestMethod,RequestType,Contact,RequesterFullName,RequestText,LastActivityDate,LastStatementReceivedDate,CreatedDate,CallsheetNo,ObjectID,CallerStatus,ReconStatus,CurrentAssigneeID,CurrentAssigneeName,EnteredReconDate,LastReconQueueName,LastReconQueueEntryDate,AccountsReceived,AccountsRequested,MessageID,VendorContactObjectID,WebsiteVendor,WNCSpecialHandling,NeedLeadVendor
0,3177,2.0,UC Health,18169231,100063533,18169230,0,,HENRY SCHEIN INC - DENTAL,262562.01,2020-08-13,1225093,Fully Received,MassEmail,Mass 1,Megan.Albinowski@henryschein.com,Cindy Allen,,2020-11-15,2020-11-16 19:00:00,2020-08-14 06:05:32,C-295396,27434056,Statement Received,,23668664,Josephine Dagher,NaT,,NaT,,,,,,,
1,3306,,Montefiore Health System,21988911,117635,21988910,0,,SIRTEX MEDICAL INC.,0.0,2020-08-16,1259437,Fully Received,MassEmail,Mass 1,bryant.gamez@sirtex.com,Bill Wilhoit,,2020-11-18,2020-11-19 19:00:00,2020-08-17 06:44:48,C-303935,27442628,Statement Received,,8406307,Carla Roark,NaT,,NaT,,,,,,,
2,3431,,San Antonio Regional Hospital,27645383,1501,27645381,0,,PASADENA BAKING CO,66456.58,2020-09-02,1287721,Fully Received,MassFax,Mass 1,16267960081@efaxsend.com,Jessica Rinehart,"<!DOCTYPE html><html><BASE href=""http://efax.c...",2020-12-02,2020-12-02 19:00:00,2020-09-03 04:08:58,C-348725,27757104,Statement Received,,8405658,Felicia Bishop,NaT,,NaT,,,,29605093.0,0.0,,
3,2775,1.0,BJC Healthcare,22523106,5028,22523105,0,,ARTEC ENVIRONMENTAL MONITORING,248662.57,2020-09-10,1049623,Fully Received,MassEmail,Mass 1,deanna@artecenvironmental.com,Bill Wilhoit,,2020-11-12,2020-11-17 19:00:00,2020-09-11 04:48:14,C-305127,27852339,Statement Received,,20420366,Christine Brooks,NaT,,NaT,,,,,,,
4,3212,,Fairview Health Services,15316433,165185,15316432,0,,DELL SOFTWARE INC,11812.32,2020-10-11,1230227,Fully Received,MassEmail,Mass 1,accountsreceivable@quest.com,Bill Wilhoit,,2020-11-15,2020-11-30 19:00:00,2020-10-12 05:12:30,C-315977,28462804,Statement Received,,19272454,Yodit Kahssai,NaT,,NaT,,,,31410727.0,,,


In [150]:
# activities.where('StatementRequestObjectID = 27442628').toPandas()

In [52]:
# columnFactorCounts(activity, ['StatementRequestObjectID'], 'ContactType').show()

# activity.groupBy('FollowUpDate').count().show()

# activity.where("ContactType = 'N/A'").count()

# activity.groupBy('ContactType').count().show()

columnFactorArray(activity, ['StatementRequestObjectID'], 'ContactType').limit(10).toPandas()

Unnamed: 0,StatementRequestObjectID,ContactType,size
0,30633982,"[N/A, Email, Call, Client Email]",4
1,33476087,"[N/A, Email, Call, Client Email]",4
2,29447834,"[N/A, Call, Fax, Client Email]",4
3,31947117,"[N/A, Email, Call, Client Email]",4
4,28887815,"[N/A, Email, Call, Client Email]",4
5,30784634,"[N/A, Email, Call, Fax]",4
6,34050529,"[N/A, Email, Call, Client Email]",4
7,28887824,"[N/A, Email, Call, Client Email]",4
8,30634501,"[N/A, Email, Call, Client Email]",4
9,31195395,"[N/A, Email, Call, Client Email]",4


In [57]:
keys = [
    'JobNumber',
    'CustomerVendorName',
    'ReferenceNumber'
]

columnFactorArray(activity, keys, 'ContactType').limit(10).toPandas()

Unnamed: 0,JobNumber,CustomerVendorName,ReferenceNumber,ContactType,size
0,3100,COSMAN MEDICAL INC,1222378,"[N/A, Email, Call, Fax, Client Email]",5
1,3065,PHONAK LLC,1218252,"[N/A, Email, Call, Fax, Client Email]",5
2,2775,FISHER HEALTHCARE,1049287,"[N/A, Email, Call, Fax, Client Email]",5
3,3435,GE PRECISION HEALTHCARE LLC,1286907,"[N/A, Email, Call, Fax, Client Email]",5
4,3354,CHANGE HEALTHCARE,1274698,"[N/A, Email, Call, Fax, Client Email]",5
5,3444,PENOBSCOT RESPIRATORY P A,1293399,"[N/A, Email, Call, Fax, Client Email]",5
6,3100,AWI FIXTURES AND INTERIORS INC,1222741,"[N/A, Email, Call, Fax, Client Email]",5
7,3022,GE MEDICAL SYSTEMS INFORMATION TECHNOLOGIES,1191589,"[N/A, Email, Call, Fax, Client Email]",5
8,3285,SANOFI PASTEUR INCORPORATED,1239311,"[N/A, Email, Call, Fax, Client Email]",5
9,3063,LABORIE MED TECHNOLOGIES CORP,1277182,"[N/A, Email, Call, Fax, Client Email]",5


In [27]:
conditions = [
    "JobNumber = 3100",
    "CustomerVendorName = 'COSMAN MEDICAL INC'",
    "ReferenceNumber = 1222378"
]

condition = ' AND '.join(conditions)

# activity.where(condition).sort('CreatedDate').toPandas()

In [74]:
from pyspark.sql.functions import col

key = 'StatementRequestObjectID'
value = 30633982

activity.where(col(key) == value).sort('CreatedDate').toPandas()

Unnamed: 0,ObjectID,CreatedDate,ReferenceNumber,CustomerVendorName,JobNumber,JobName,ContactType,ActivityUser,ActivityDate,FollowUpDate,STNID,Notes,Outcome,ActivityType,CustVendorObjectID,VendorContactObjectID,StatementRequestObjectID,CreatedBy
0,30936765,2021-01-25 09:18:09,1238291,CERAPEDICS INC,3285,Community Health Systems - 3285,Call,Yodit Kahssai,2021-01-24,,6427431.0,sent an email,Sent Authorization Letter,,21993184,,30633982,SVC_WORKFLOW
1,31090582,2021-02-01 09:10:57,1238291,CERAPEDICS INC,3285,Community Health Systems - 3285,Call,Yodit Kahssai,2021-01-31,,6446658.0,lvm for AR,Left Voicemail,,21993184,,30633982,SVC_WORKFLOW
2,31122654,2021-02-03 03:44:41,1238291,CERAPEDICS INC,3285,Community Health Systems - 3285,Client Email,Dorthy Kuester,2021-02-02,,6451230.0,emailed CerapedicsAccounts@cerapedics.com remo...,Left Voicemail,,21993184,,30633982,SVC_WORKFLOW
3,31122656,2021-02-03 03:44:43,1238291,CERAPEDICS INC,3285,Community Health Systems - 3285,Client Email,Dorthy Kuester,2021-02-02,,6451232.0,emailed CerapedicsAccounts@cerapedics.com remo...,Left Voicemail,,21993184,,30633982,SVC_WORKFLOW
4,31341318,2021-02-15 05:48:11,1238291,CERAPEDICS INC,3285,Community Health Systems - 3285,,Tammie Miller,2021-02-14,,6479513.0,"removed ph#(303) 974-6275, fax#(888) 671-4277,...",Statement Received,,21993184,,30633982,SVC_WORKFLOW
5,32303854,2021-03-22 04:17:44,1238291,CERAPEDICS INC,3285,Community Health Systems - 3285,,Avreil Markham,2021-03-21,,,,,,21993184,31405165.0,30633982,AMARKHAM
6,32303904,2021-03-22 04:26:08,1238291,CERAPEDICS INC,3285,Community Health Systems - 3285,,Avreil Markham,2021-03-21,,,,,,21993184,31405165.0,30633982,AMARKHAM
7,32304067,2021-03-22 04:38:54,1238291,CERAPEDICS INC,3285,Community Health Systems - 3285,Email,Avreil Markham,2021-03-21,,,"Emailed Jason, jbell@cerapedics.com regarding ...",,,21993184,31405165.0,30633982,AMARKHAM


In [75]:
key = 'ObjectID'
value = 30633982

requests.where(col(key) == value).sort('CreatedDate').toPandas()

Unnamed: 0,JobNo,JobTier,CustomerName,CustVendorID,VendorNo,CustVendorGroupID,WNC,StatementWNC,VendorGroupName,Volume,RequestDate,ReferenceNumber,Status,RequestMethod,RequestType,Contact,RequesterFullName,RequestText,LastActivityDate,LastStatementReceivedDate,CreatedDate,CallsheetNo,ObjectID,CallerStatus,ReconStatus,CurrentAssigneeID,CurrentAssigneeName,EnteredReconDate,LastReconQueueName,LastReconQueueEntryDate,AccountsReceived,AccountsRequested,MessageID,VendorContactObjectID,WebsiteVendor,WNCSpecialHandling,NeedLeadVendor
0,3285,,Community Health Systems,21993184,1011142,21993183,0,,CERAPEDICS INC,1302620.0,2021-01-17,1238291,Fully Received,MassEmail,Caller,CerapedicsAccounts@cerapedics.com,Bill Wilhoit,,2021-03-21,2021-03-22 20:00:00,2021-01-18 08:22:47,C-296573,30633982,Statement Received,First Round Sent,19272454,Yodit Kahssai,2021-02-10 06:46:17,Done,2021-03-23 06:25:52,5,5,,31405165,,,


In [80]:
# `Failed Receipt Follow Up` only level in requests.CallerStatus not in activities.Outcome
(
    requests
    .groupBy('CallerStatus')
    .count()
    # .sort(desc('count'))
    .sort('CallerStatus')
    .show(truncate=False)
)

+-------------------------+------+
|CallerStatus             |count |
+-------------------------+------+
|null                     |77624 |
|Failed Receipt Follow Up |194   |
|Left Voicemail           |8026  |
|Need to Receive          |7170  |
|Needs Account Number     |878   |
|Needs Lead               |407   |
|Needs Research           |1874  |
|Needs Supervisor         |4747  |
|Remove From Scope        |17218 |
|Sent Authorization Letter|111548|
|Statement Received       |81877 |
|Verbal Zero Balance      |2498  |
|Will Email or Fax        |1320  |
|Will Not Comply          |30818 |
+-------------------------+------+



In [112]:
# `Partial Receipt` and `Fully Received` of interest
(
    requests
    .groupBy('Status')
    .count()
    # .sort(desc('count'))
    .sort('Status')
    .show(truncate=False)
)

+----------------------+------+
|Status                |count |
+----------------------+------+
|null                  |8     |
|Account Identification|14267 |
|Fully Received        |87644 |
|No Receipt            |52884 |
|Partial Receipt       |3772  |
|Superceded            |187624|
+----------------------+------+



In [116]:
# top value is `Statement Received`
(
    requests
    .where(col('Status').rlike('Recei'))
    .groupBy('CallerStatus')
    .count()
    .sort(desc('count'))
    .show(truncate=False)
)

+-------------------------+-----+
|CallerStatus             |count|
+-------------------------+-----+
|Statement Received       |59392|
|null                     |27524|
|Sent Authorization Letter|20081|
|Remove From Scope        |17032|
|Will Not Comply          |6159 |
|Left Voicemail           |4800 |
|Needs Supervisor         |2432 |
|Need to Receive          |1982 |
|Verbal Zero Balance      |1764 |
|Needs Research           |1569 |
|Will Email or Fax        |645  |
|Needs Account Number     |608  |
|Needs Lead               |221  |
|Failed Receipt Follow Up |91   |
+-------------------------+-----+



In [81]:
(
    activities
    .groupBy('Outcome')
    .count()
    # .sort(desc('count'))
    .sort('Outcome')
    .show(truncate=False)
)

+-------------------------+-------+
|Outcome                  |count  |
+-------------------------+-------+
|null                     |1198265|
|Left Voicemail           |55354  |
|Need to Receive          |8447   |
|Needs Account Number     |1562   |
|Needs Lead               |720    |
|Needs Research           |6035   |
|Needs Supervisor         |7322   |
|Remove From Scope        |5665   |
|Sent Authorization Letter|238424 |
|Statement Received       |69515  |
|Verbal Zero Balance      |2044   |
|Will Email or Fax        |9869   |
|Will Not Comply          |54736  |
+-------------------------+-------+



In [118]:
(
    requests
    .groupBy('RequestMethod')
    .count()
    .sort(desc('count'))
    .show()
)

+-------------+------+
|RequestMethod| count|
+-------------+------+
|    MassEmail|214024|
|          WNC| 56293|
|      MassFax| 30238|
|        AdHoc| 28715|
|        Macro| 12096|
|         Call|  4187|
|    NeedsLead|   391|
|  ClientEmail|   247|
|         null|     8|
+-------------+------+



In [119]:
(
    requests
    .groupBy('RequestType')
    .count()
    .sort(desc('count'))
    .show()
)

+-----------+------+
|RequestType| count|
+-----------+------+
|     Caller|144206|
|     Mass 1|126803|
|       null| 49193|
|     Mass 2| 19845|
|      Macro|  6152|
+-----------+------+



In [130]:
(
    requests
    .where('RequestMethod IS NOT NULL AND RequestType IS NOT NULL')
    .groupBy('RequestMethod', 'RequestType')
    .count()
    # .sort(desc('count'))
    .sort('RequestMethod', 'RequestType')
    .show(42, truncate=False)
)

+-------------+-----------+------+
|RequestMethod|RequestType|count |
+-------------+-----------+------+
|AdHoc        |Caller     |28535 |
|AdHoc        |Macro      |19    |
|Call         |Caller     |3992  |
|Call         |Macro      |129   |
|ClientEmail  |Caller     |244   |
|ClientEmail  |Macro      |1     |
|Macro        |Caller     |4486  |
|Macro        |Macro      |5731  |
|MassEmail    |Caller     |53556 |
|MassEmail    |Mass 1     |113665|
|MassEmail    |Mass 2     |16430 |
|MassFax      |Caller     |9758  |
|MassFax      |Mass 1     |13138 |
|MassFax      |Mass 2     |3415  |
|NeedsLead    |Caller     |391   |
|WNC          |Caller     |43236 |
|WNC          |Macro      |272   |
+-------------+-----------+------+



In [126]:
(
    requests
    # .where(col('Status').rlike('Recei'))
    .groupBy('RequestMethod', 'Status')
    .count()
    .sort(desc('count'))
    # .show(24, truncate=False)
    .count()
)

42

In [None]:
(
    requests
    .where(col('Status').rlike('Recei'))
    .groupBy('RequestType', 'Status')
    .count()
    .sort(desc('count'))
    .show(24, truncate=False)
)

In [99]:
# RequestMethod, RequestType

columns = [
    'CustomerName',
    'ReferenceNumber',
    'VendorContactObjectID',
    'CreatedDate',
    'ObjectID'
]

request = requests

for column in columns:
    request = request.withColumnRenamed(column, 'Request' + column)

In [117]:
# activities.ActivityDate, requests.LastActivityDate should match? False
# (
#     activities
#     .join(
#         request,
#         on=[
#             activities['StatementRequestObjectID'] == request['RequestObjectID'],
#             activities['Outcome'] != request['CallerStatus'], # 288,382
#             activities['ActivityDate'] == request['LastActivityDate'] # 315,044
#         ],
#         # how='left_semi'
#         how='inner'
#     )
#     # .count()
#     # .select('ActivityDate', 'LastActivityDate')
#     # .show()
#     .limit(10)
#     .toPandas()
# )

In [26]:
keys = [
    'JobNumber',
    'CustomerVendorName',
    'ReferenceNumber'
]

# columnFactorCounts(activity, keys, 'StatementRequestObjectID').show(truncate=False)

In [25]:
conditions = [
    "JobNumber = 3432",
    "CustomerVendorName = 'ELSEVIER HEALTH SCIENCE'",
    "ReferenceNumber = 1290060"
]

condition = ' AND '.join(conditions)

# activity.where(condition).sort('CreatedDate').toPandas()

In [68]:
keys = activity.where(condition).select('StatementRequestObjectID').rdd.map(lambda x: x[0]).collect()

In [24]:
# requests.where(col('ObjectID').isin(keys)).sort('CreatedDate').toPandas()

In [15]:
requests.createOrReplaceTempView(name='requests')
activities.createOrReplaceTempView(name='activities')

In [20]:
%%time
query = """
SELECT
    activities.ContactType,
    activities.ActivityUser,
    activities.Outcome,
    activities.ActivityType,
    requests.CallerStatus,
    requests.CustomerName,
    requests.RequestType,
    requests.RequesterFullName,
    requests.CurrentAssigneeName,
    requests.RequestMethod,
    requests.Status
FROM
    requests
        INNER JOIN
            activities ON
                requests.ObjectID = activities.StatementRequestObjectID
"""


table = spark.sql(query)

path = '/tmp/REQUEST-ACTIVITY'

# table.na.fill(value='N/A').write.parquet(path, mode='overwrite')
table.where('Status IS NOT NULL').write.parquet(path, mode='overwrite')

CPU times: user 3.72 ms, sys: 2.52 ms, total: 6.25 ms
Wall time: 11.7 s


In [17]:
from pandas import read_parquet

table = read_parquet(path)

# table.to_parquet('~/Desktop/REQUEST-ACTIVITY.parquet')

In [18]:
table.to_csv('/Users/curtispassorelli/Desktop/REQUEST-ACTIVITY.csv', index=False)

In [None]:
table.na.fill()

In [None]:
spark.stop()